diff --git a/src/ao_basis/aos_in_r.irp.f b/src/ao_basis/aos_in_r.irp.f
index 7fcb980a..1b1595a3 100644
--- a/src/ao_basis/aos_in_r.irp.f
+++ b/src/ao_basis/aos_in_r.irp.f
@@ -65,46 +65,60 @@ double precision function primitive_value(i,j,r)
 
 end
 
+! ---
 
-subroutine give_all_aos_at_r(r,aos_array)
- implicit none
- BEGIN_dOC
-! input  : r == r(1) = x and so on
-!
-! output : aos_array(i) = aos(i) evaluated in $\textbf{r}$
- END_DOC
- double precision, intent(in) :: r(3)
- double precision, intent(out):: aos_array(ao_num)
+subroutine give_all_aos_at_r(r, tmp_array)
 
- integer :: power_ao(3)
- integer :: i,j,k,l,m
- double precision :: dx,dy,dz,r2
- double precision ::      dx2,dy2,dz2
- double precision :: center_ao(3)
- double precision :: beta
- do i = 1, nucl_num
-  center_ao(1:3) = nucl_coord(i,1:3)
-  dx = (r(1) - center_ao(1))
-  dy = (r(2) - center_ao(2))
-  dz = (r(3) - center_ao(3))
-  r2 = dx*dx + dy*dy + dz*dz
-  do j = 1,Nucl_N_Aos(i)
-   k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format
-   aos_array(k) = 0.d0
-   power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i)
-   dx2 = dx**power_ao(1)
-   dy2 = dy**power_ao(2)
-   dz2 = dz**power_ao(3)
-   do l = 1,ao_prim_num(k)
-    beta = ao_expo_ordered_transp_per_nucl(l,j,i)
-    if(dabs(beta*r2).gt.40.d0)cycle
-    aos_array(k)+= ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
-   enddo
-   aos_array(k) = aos_array(k) * dx2 * dy2 * dz2
+  BEGIN_dOC
+  !
+  ! input  : r == r(1) = x and so on
+  !
+  ! output : tmp_array(i) = aos(i) evaluated in $\textbf{r}$
+  !
+  END_DOC
+
+  implicit none
+  double precision, intent(in)  :: r(3)
+  double precision, intent(out) :: tmp_array(ao_num)
+  integer                       :: p_ao(3)
+  integer                       :: i, j, k, l, m
+  double precision              :: dx, dy, dz, r2
+  double precision              :: dx2, dy2, dz2
+  double precision              :: c_ao(3)
+  double precision              :: beta
+
+  do i = 1, nucl_num
+
+    c_ao(1:3) = nucl_coord(i,1:3)
+    dx = r(1) - c_ao(1)
+    dy = r(2) - c_ao(2)
+    dz = r(3) - c_ao(3)
+    r2 = dx*dx + dy*dy + dz*dz
+
+    do j = 1, Nucl_N_Aos(i)
+
+      k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format
+      p_ao(1:3) = ao_power_ordered_transp_per_nucl(1:3,j,i)
+      dx2 = dx**p_ao(1)
+      dy2 = dy**p_ao(2)
+      dz2 = dz**p_ao(3)
+
+      tmp_array(k) = 0.d0
+      do l = 1,ao_prim_num(k)
+        beta = ao_expo_ordered_transp_per_nucl(l,j,i)
+        if(dabs(beta*r2).gt.40.d0) cycle
+
+        tmp_array(k) += ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
+      enddo
+
+      tmp_array(k) = tmp_array(k) * dx2 * dy2 * dz2
+    enddo
   enddo
- enddo
+
+  return
 end
 
+! ---
 
 subroutine give_all_aos_and_grad_at_r(r,aos_array,aos_grad_array)
  implicit none
diff --git a/src/ao_basis/aos_transp.irp.f b/src/ao_basis/aos_transp.irp.f
index ae6193bf..4e44a9f6 100644
--- a/src/ao_basis/aos_transp.irp.f
+++ b/src/ao_basis/aos_transp.irp.f
@@ -1,20 +1,28 @@
- BEGIN_PROVIDER [ integer, Nucl_Aos_transposed, (N_AOs_max,nucl_num)]
- implicit none
- BEGIN_DOC
- ! List of AOs attached on each atom
- END_DOC
- integer :: i
- integer, allocatable :: nucl_tmp(:)
- allocate(nucl_tmp(nucl_num))
- nucl_tmp = 0
- Nucl_Aos = 0
- do i = 1, ao_num
-  nucl_tmp(ao_nucl(i))+=1
-  Nucl_Aos_transposed(nucl_tmp(ao_nucl(i)),ao_nucl(i)) = i
- enddo
- deallocate(nucl_tmp)
+
+! ---
+
+BEGIN_PROVIDER [ integer, Nucl_Aos_transposed, (N_AOs_max,nucl_num)]
+
+  BEGIN_DOC
+  ! List of AOs attached on each atom
+  END_DOC
+ 
+  implicit none
+  integer              :: i
+  integer, allocatable :: nucl_tmp(:)
+
+  allocate(nucl_tmp(nucl_num))
+  nucl_tmp = 0
+  do i = 1, ao_num
+    nucl_tmp(ao_nucl(i)) += 1
+    Nucl_Aos_transposed(nucl_tmp(ao_nucl(i)),ao_nucl(i)) = i
+  enddo
+  deallocate(nucl_tmp)
+
 END_PROVIDER
 
+! ---
+
 BEGIN_PROVIDER [double precision, ao_expo_ordered_transp_per_nucl, (ao_prim_num_max,N_AOs_max,nucl_num) ]
  implicit none
  integer :: i,j,k,l
diff --git a/src/ao_many_one_e_ints/ao_erf_gauss.irp.f b/src/ao_many_one_e_ints/ao_erf_gauss.irp.f
index 3d7fbe50..b1077161 100644
--- a/src/ao_many_one_e_ints/ao_erf_gauss.irp.f
+++ b/src/ao_many_one_e_ints/ao_erf_gauss.irp.f
@@ -212,9 +212,7 @@ subroutine NAI_pol_x_mult_erf_ao(i_ao, j_ao, mu_in, C_center, ints)
   ! Computes the following integral :
   !
   ! $\int_{-\infty}^{infty} dr x * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
-  !
   ! $\int_{-\infty}^{infty} dr y * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
-  !
   ! $\int_{-\infty}^{infty} dr z * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
   !
   END_DOC
@@ -279,9 +277,7 @@ subroutine NAI_pol_x_mult_erf_ao_v0(i_ao, j_ao, mu_in, C_center, LD_C, ints, LD_
   ! Computes the following integral :
   !
   ! $\int_{-\infty}^{infty} dr x * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
-  !
   ! $\int_{-\infty}^{infty} dr y * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
-  !
   ! $\int_{-\infty}^{infty} dr z * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
   !
   END_DOC
@@ -1111,3 +1107,141 @@ end
 
 ! ---
 
+subroutine NAI_pol_x2_mult_erf_ao_with1s(i_ao, j_ao, beta, B_center, mu_in, C_center, ints)
+
+  BEGIN_DOC
+  !
+  ! Computes the following integral :
+  !
+  ! $\int_{-\infty}^{infty} dr x^2 * \chi_i(r) \chi_j(r) e^{-\beta (r - B_center)^2} \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
+  ! $\int_{-\infty}^{infty} dr y^2 * \chi_i(r) \chi_j(r) e^{-\beta (r - B_center)^2} \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
+  ! $\int_{-\infty}^{infty} dr z^2 * \chi_i(r) \chi_j(r) e^{-\beta (r - B_center)^2} \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
+  !
+  END_DOC
+
+  include 'utils/constants.include.F'
+
+  implicit none
+
+  integer,          intent(in)  :: i_ao, j_ao
+  double precision, intent(in)  :: beta, B_center(3), mu_in, C_center(3)
+  double precision, intent(out) :: ints(3)
+
+  integer                       :: i, j, power_Ai(3), power_Aj(3), n_pt_in, m
+  integer                       :: power_A1(3), power_A2(3)
+  double precision              :: Ai_center(3), Aj_center(3), alphai, alphaj, coef, coefi
+  double precision              :: integral0, integral1, integral2
+
+  double precision, external    :: NAI_pol_mult_erf_with1s
+
+  ASSERT(beta .ge. 0.d0)
+  if(beta .lt. 1d-10) then
+    call NAI_pol_x2_mult_erf_ao(i_ao, j_ao, mu_in, C_center, ints)
+    return
+  endif
+
+  ints = 0.d0
+
+  power_Ai(1:3) = ao_power(i_ao,1:3)
+  power_Aj(1:3) = ao_power(j_ao,1:3)
+
+  Ai_center(1:3) = nucl_coord(ao_nucl(i_ao),1:3)
+  Aj_center(1:3) = nucl_coord(ao_nucl(j_ao),1:3)
+
+  n_pt_in = n_pt_max_integrals
+
+  do i = 1, ao_prim_num(i_ao)
+    alphai = ao_expo_ordered_transp           (i,i_ao)
+    coefi  = ao_coef_normalized_ordered_transp(i,i_ao)
+
+    do m = 1, 3
+
+      power_A1     = power_Ai
+      power_A1(m) += 1
+
+      power_A2     = power_Ai
+      power_A2(m) += 2
+
+      do j = 1, ao_prim_num(j_ao)
+        alphaj = ao_expo_ordered_transp                   (j,j_ao)
+        coef   = coefi * ao_coef_normalized_ordered_transp(j,j_ao)
+
+        integral0 = NAI_pol_mult_erf_with1s(Ai_center, Aj_center, power_Ai, power_Aj, alphai, alphaj, beta, B_center, C_center, n_pt_in, mu_in)
+        integral1 = NAI_pol_mult_erf_with1s(Ai_center, Aj_center, power_A1, power_Aj, alphai, alphaj, beta, B_center, C_center, n_pt_in, mu_in)
+        integral2 = NAI_pol_mult_erf_with1s(Ai_center, Aj_center, power_A2, power_Aj, alphai, alphaj, beta, B_center, C_center, n_pt_in, mu_in)
+
+        ints(m) += coef * (integral2 + Ai_center(m) * (2.d0*integral1 + Ai_center(m)*integral0))
+      enddo
+    enddo
+  enddo
+
+end subroutine NAI_pol_x2_mult_erf_ao_with1s
+
+! ---
+
+subroutine NAI_pol_x2_mult_erf_ao(i_ao, j_ao, mu_in, C_center, ints)
+
+  BEGIN_DOC
+  !
+  ! Computes the following integral :
+  !
+  ! $\int_{-\infty}^{infty} dr x^2 * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
+  ! $\int_{-\infty}^{infty} dr y^2 * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
+  ! $\int_{-\infty}^{infty} dr z^2 * \chi_i(r) \chi_j(r) \frac{\erf(\mu | r - R_C | )}{ | r - R_C | }$.
+  !
+  END_DOC
+
+  include 'utils/constants.include.F'
+
+  implicit none
+
+  integer,          intent(in)  :: i_ao, j_ao
+  double precision, intent(in)  :: mu_in, C_center(3)
+  double precision, intent(out) :: ints(3)
+
+  integer                       :: i, j, num_A, num_B, power_A(3), power_B(3), n_pt_in, m
+  integer                       :: power_A1(3), power_A2(3)
+  double precision              :: A_center(3), B_center(3), alpha, beta, coef
+  double precision              :: integral0, integral1, integral2
+
+  double precision              :: NAI_pol_mult_erf
+
+  ints = 0.d0
+
+  num_A         = ao_nucl(i_ao)
+  power_A(1:3)  = ao_power(i_ao,1:3)
+  A_center(1:3) = nucl_coord(num_A,1:3)
+  num_B         = ao_nucl(j_ao)
+  power_B(1:3)  = ao_power(j_ao,1:3)
+  B_center(1:3) = nucl_coord(num_B,1:3)
+
+  n_pt_in = n_pt_max_integrals
+
+  do i = 1, ao_prim_num(i_ao)
+    alpha = ao_expo_ordered_transp(i,i_ao)
+
+    do m = 1, 3
+
+      power_A1 = power_A
+      power_A1(m) += 1
+
+      power_A2 = power_A
+      power_A2(m) += 2
+
+      do j = 1, ao_prim_num(j_ao)
+        beta = ao_expo_ordered_transp(j,j_ao)
+        coef = ao_coef_normalized_ordered_transp(j,j_ao) * ao_coef_normalized_ordered_transp(i,i_ao)
+
+        integral0 = NAI_pol_mult_erf(A_center, B_center, power_A , power_B, alpha, beta, C_center, n_pt_in, mu_in)
+        integral1 = NAI_pol_mult_erf(A_center, B_center, power_A1, power_B, alpha, beta, C_center, n_pt_in, mu_in)
+        integral2 = NAI_pol_mult_erf(A_center, B_center, power_A2, power_B, alpha, beta, C_center, n_pt_in, mu_in)
+
+        ints(m) += coef * (integral2 + A_center(m) * (2.d0*integral1 + A_center(m)*integral0))
+      enddo
+    enddo
+  enddo
+
+end subroutine NAI_pol_x2_mult_erf_ao
+
+! ---
+
diff --git a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
index 7c68de75..fda2db82 100644
--- a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
@@ -128,6 +128,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n
           do i_1s = 2, List_all_comb_b3_size
 
             coef        = List_all_comb_b3_coef  (i_1s)
+            if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
             beta        = List_all_comb_b3_expo  (i_1s)
             B_center(1) = List_all_comb_b3_cent(1,i_1s)
             B_center(2) = List_all_comb_b3_cent(2,i_1s)
@@ -222,6 +223,7 @@ BEGIN_PROVIDER [double precision, int2_u2_j1b2, (ao_num, ao_num, n_points_final_
           do i_1s = 2, List_all_comb_b3_size
          
             coef        = List_all_comb_b3_coef  (i_1s)
+            if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
             beta        = List_all_comb_b3_expo  (i_1s)
             B_center(1) = List_all_comb_b3_cent(1,i_1s)
             B_center(2) = List_all_comb_b3_cent(2,i_1s)
@@ -322,6 +324,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (ao_num, ao_num, n_poin
           do i_1s = 2, List_all_comb_b3_size
           
             coef        = List_all_comb_b3_coef  (i_1s)
+            if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
             beta        = List_all_comb_b3_expo  (i_1s)
             B_center(1) = List_all_comb_b3_cent(1,i_1s)
             B_center(2) = List_all_comb_b3_cent(2,i_1s)
@@ -436,6 +439,7 @@ BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points
           do i_1s = 2, List_all_comb_b3_size
 
             coef        = List_all_comb_b3_coef  (i_1s)
+            if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
             beta        = List_all_comb_b3_expo  (i_1s)
             B_center(1) = List_all_comb_b3_cent(1,i_1s)
             B_center(2) = List_all_comb_b3_cent(2,i_1s)
diff --git a/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f b/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
index 25bb2f8b..9af3f9a9 100644
--- a/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
@@ -60,6 +60,7 @@ BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_po
         do i_1s = 2, List_all_comb_b2_size
 
           coef        = List_all_comb_b2_coef  (i_1s)
+          if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
           beta        = List_all_comb_b2_expo  (i_1s)
           B_center(1) = List_all_comb_b2_cent(1,i_1s)
           B_center(2) = List_all_comb_b2_cent(2,i_1s)
@@ -154,6 +155,7 @@ BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_
         do i_1s = 2, List_all_comb_b2_size
 
           coef        = List_all_comb_b2_coef  (i_1s)
+          if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
           beta        = List_all_comb_b2_expo  (i_1s)
           B_center(1) = List_all_comb_b2_cent(1,i_1s)
           B_center(2) = List_all_comb_b2_cent(2,i_1s)
@@ -195,8 +197,7 @@ END_PROVIDER
 
 ! ---
 
-! TODO analytically
-BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_fit, (ao_num, ao_num, n_points_final_grid)]
 
   BEGIN_DOC
   !
@@ -213,12 +214,14 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
 
   double precision, external :: overlap_gauss_r12_ao_with1s
 
-  print*, ' providing v_ij_u_cst_mu_j1b ...'
+  print*, ' providing v_ij_u_cst_mu_j1b_fit ...'
   call wall_time(wall0)
 
   provide mu_erf final_grid_points j1b_pen
+  PROVIDE ng_fit_jast expo_gauss_j_mu_x coef_gauss_j_mu_x
+  PROVIDE List_all_comb_b2_size List_all_comb_b2_coef List_all_comb_b2_expo List_all_comb_b2_cent
 
-  v_ij_u_cst_mu_j1b = 0.d0
+  v_ij_u_cst_mu_j1b_fit = 0.d0
 
   !$OMP PARALLEL DEFAULT (NONE)                                      &
   !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
@@ -227,9 +230,8 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
   !$OMP          final_grid_points, ng_fit_jast,                     &
   !$OMP          expo_gauss_j_mu_x, coef_gauss_j_mu_x,               &
   !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo,       & 
-  !$OMP          List_all_comb_b2_cent, v_ij_u_cst_mu_j1b)
+  !$OMP          List_all_comb_b2_cent, v_ij_u_cst_mu_j1b_fit)
   !$OMP DO
-  !do ipoint = 1, 10
   do ipoint = 1, n_points_final_grid
     r(1) = final_grid_points(1,ipoint)
     r(2) = final_grid_points(2,ipoint)
@@ -240,7 +242,6 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
 
         tmp = 0.d0
         do i_fit = 1, ng_fit_jast
-
           expo_fit = expo_gauss_j_mu_x(i_fit)
           coef_fit = coef_gauss_j_mu_x(i_fit)
 
@@ -253,7 +254,6 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
           B_center(3) = List_all_comb_b2_cent(3,1)
 
           int_fit = overlap_gauss_r12_ao_with1s(B_center, beta, r, expo_fit, i, j)
-!         if(dabs(int_fit*coef) .lt. 1d-12) cycle
 
           tmp += coef * coef_fit * int_fit
 
@@ -262,6 +262,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
           do i_1s = 2, List_all_comb_b2_size
 
             coef        = List_all_comb_b2_coef  (i_1s)
+            if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
             beta        = List_all_comb_b2_expo  (i_1s)
             B_center(1) = List_all_comb_b2_cent(1,i_1s)
             B_center(2) = List_all_comb_b2_cent(2,i_1s)
@@ -276,7 +277,7 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
 
         enddo
 
-        v_ij_u_cst_mu_j1b(j,i,ipoint) = tmp
+        v_ij_u_cst_mu_j1b_fit(j,i,ipoint) = tmp
       enddo
     enddo
   enddo
@@ -286,13 +287,149 @@ BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_
   do ipoint = 1, n_points_final_grid
     do i = 2, ao_num
       do j = 1, i-1
-        v_ij_u_cst_mu_j1b(j,i,ipoint) = v_ij_u_cst_mu_j1b(i,j,ipoint)
+        v_ij_u_cst_mu_j1b_fit(j,i,ipoint) = v_ij_u_cst_mu_j1b_fit(i,j,ipoint)
       enddo
     enddo
   enddo
  
   call wall_time(wall1)
-  print*, ' wall time for v_ij_u_cst_mu_j1b', wall1 - wall0
+  print*, ' wall time for v_ij_u_cst_mu_j1b_fit', wall1 - wall0
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b_an, (ao_num, ao_num, n_points_final_grid)]
+
+  BEGIN_DOC
+  !
+  ! int dr2 phi_i(r2) phi_j(r2) 1s_j1b(r2) u(mu, r12)
+  !
+  ! TODO
+  ! one subroutine for all integrals
+  !
+  END_DOC
+
+  include 'constants.include.F'
+
+  implicit none
+  integer                    :: i, j, ipoint, i_1s
+  double precision           :: r(3), r1_2
+  double precision           :: int_c1, int_e1, int_o
+  double precision           :: int_c2(3), int_e2(3)
+  double precision           :: int_c3(3), int_e3(3)
+  double precision           :: coef, beta, B_center(3)
+  double precision           :: tmp, ct
+  double precision           :: wall0, wall1
+
+  double precision, external :: overlap_gauss_r12_ao_with1s
+  double precision, external :: NAI_pol_mult_erf_ao_with1s
+
+  print*, ' providing v_ij_u_cst_mu_j1b_an ...'
+  call wall_time(wall0)
+
+  provide mu_erf final_grid_points j1b_pen
+  PROVIDE List_all_comb_b2_size List_all_comb_b2_coef List_all_comb_b2_expo List_all_comb_b2_cent
+
+  ct = inv_sq_pi_2 / mu_erf
+
+  v_ij_u_cst_mu_j1b_an = 0.d0
+
+  !$OMP PARALLEL DEFAULT (NONE)                                      &
+  !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center,        &
+  !$OMP          r1_2, tmp, int_c1, int_e1, int_o, int_c2,           &
+  !$OMP          int_e2, int_c3, int_e3)                             &
+  !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b2_size, & 
+  !$OMP          final_grid_points, mu_erf, ct,                      &
+  !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo,       & 
+  !$OMP          List_all_comb_b2_cent, v_ij_u_cst_mu_j1b_an)
+  !$OMP DO
+  do ipoint = 1, n_points_final_grid
+
+    r(1) = final_grid_points(1,ipoint)
+    r(2) = final_grid_points(2,ipoint)
+    r(3) = final_grid_points(3,ipoint)
+    r1_2 = 0.5d0 * (r(1)*r(1) + r(2)*r(2) + r(3)*r(3))
+
+    do i = 1, ao_num
+      do j = i, ao_num
+
+        ! ---
+
+        coef        = List_all_comb_b2_coef  (1)
+        beta        = List_all_comb_b2_expo  (1)
+        B_center(1) = List_all_comb_b2_cent(1,1)
+        B_center(2) = List_all_comb_b2_cent(2,1)
+        B_center(3) = List_all_comb_b2_cent(3,1)
+
+        int_c1 = NAI_pol_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r)
+        int_e1 = NAI_pol_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r)
+
+        call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, int_c2)
+        call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, int_e2)
+
+        call NAI_pol_x2_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, int_c3)
+        call NAI_pol_x2_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, int_e3)
+
+        int_o = overlap_gauss_r12_ao_with1s(B_center, beta, r, mu_erf*mu_erf, i, j)
+
+        tmp = coef                                                                                               &
+            * ( r1_2 * (int_c1 - int_e1)                                                                         &
+              - r(1) * (int_c2(1) - int_e2(1)) - r(2) * (int_c2(2) - int_e2(2)) - r(3) * (int_c2(3) - int_e2(3)) &
+              + 0.5d0 * (int_c3(1) + int_c3(2) + int_c3(3) - int_e3(1) - int_e3(2) - int_e3(3))                  &
+              - ct * int_o                                                                                       &
+              )
+
+        ! ---
+
+        do i_1s = 2, List_all_comb_b2_size
+
+          coef        = List_all_comb_b2_coef  (i_1s)
+          if(dabs(coef) .lt. 1d-15) cycle ! beta = 0.0
+          beta        = List_all_comb_b2_expo  (i_1s)
+          B_center(1) = List_all_comb_b2_cent(1,i_1s)
+          B_center(2) = List_all_comb_b2_cent(2,i_1s)
+          B_center(3) = List_all_comb_b2_cent(3,i_1s)
+
+          int_c1 = NAI_pol_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r)
+          int_e1 = NAI_pol_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r)
+
+          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, int_c2)
+          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, int_e2)
+
+          call NAI_pol_x2_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, int_c3)
+          call NAI_pol_x2_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, int_e3)
+
+          int_o = overlap_gauss_r12_ao_with1s(B_center, beta, r, mu_erf*mu_erf, i, j)
+
+          tmp = tmp + coef                                                                                         &
+              * ( r1_2 * (int_c1 - int_e1)                                                                         &
+                - r(1) * (int_c2(1) - int_e2(1)) - r(2) * (int_c2(2) - int_e2(2)) - r(3) * (int_c2(3) - int_e2(3)) &
+                + 0.5d0 * (int_c3(1) + int_c3(2) + int_c3(3) - int_e3(1) - int_e3(2) - int_e3(3))                  &
+                - ct * int_o                                                                                       &
+                )
+
+        enddo
+
+        ! ---
+
+        v_ij_u_cst_mu_j1b_an(j,i,ipoint) = tmp
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  do ipoint = 1, n_points_final_grid
+    do i = 2, ao_num
+      do j = 1, i-1
+        v_ij_u_cst_mu_j1b_an(j,i,ipoint) = v_ij_u_cst_mu_j1b_an(i,j,ipoint)
+      enddo
+    enddo
+  enddo
+ 
+  call wall_time(wall1)
+  print*, ' wall time for v_ij_u_cst_mu_j1b_an', wall1 - wall0
 
 END_PROVIDER 
 
diff --git a/src/ao_many_one_e_ints/listj1b.irp.f b/src/ao_many_one_e_ints/listj1b.irp.f
index 02963605..33ca8085 100644
--- a/src/ao_many_one_e_ints/listj1b.irp.f
+++ b/src/ao_many_one_e_ints/listj1b.irp.f
@@ -62,6 +62,7 @@ END_PROVIDER
   double precision :: tmp_cent_x, tmp_cent_y, tmp_cent_z
 
   provide j1b_pen
+  provide j1b_pen_coef
 
   List_all_comb_b2_coef = 0.d0
   List_all_comb_b2_expo = 0.d0
@@ -127,8 +128,8 @@ END_PROVIDER
     List_all_comb_b2_expo(    1) = 0.d0
     List_all_comb_b2_cent(1:3,1) = 0.d0
     do i = 1, nucl_num
-      List_all_comb_b2_coef(  i+1) = -1.d0
-      List_all_comb_b2_expo(  i+1) = j1b_pen(   i)
+      List_all_comb_b2_coef(  i+1) = -1.d0 * j1b_pen_coef(i)
+      List_all_comb_b2_expo(  i+1) = j1b_pen(i)
       List_all_comb_b2_cent(1,i+1) = nucl_coord(i,1)
       List_all_comb_b2_cent(2,i+1) = nucl_coord(i,2)
       List_all_comb_b2_cent(3,i+1) = nucl_coord(i,3)
@@ -225,6 +226,7 @@ END_PROVIDER
   double precision :: dx, dy, dz, r2
 
   provide j1b_pen
+  provide j1b_pen_coef
 
   List_all_comb_b3_coef = 0.d0
   List_all_comb_b3_expo = 0.d0
@@ -296,8 +298,8 @@ END_PROVIDER
 
     do i = 1, nucl_num
       ii = ii + 1
-      List_all_comb_b3_coef(  ii) = -2.d0
-      List_all_comb_b3_expo(  ii) = j1b_pen(   i)
+      List_all_comb_b3_coef(  ii) = -2.d0 * j1b_pen_coef(i)
+      List_all_comb_b3_expo(  ii) = j1b_pen(i)
       List_all_comb_b3_cent(1,ii) = nucl_coord(i,1)
       List_all_comb_b3_cent(2,ii) = nucl_coord(i,2)
       List_all_comb_b3_cent(3,ii) = nucl_coord(i,3)
@@ -305,7 +307,7 @@ END_PROVIDER
 
     do i = 1, nucl_num
       ii = ii + 1
-      List_all_comb_b3_coef(  ii) = 1.d0
+      List_all_comb_b3_coef(  ii) = 1.d0 * j1b_pen_coef(i) * j1b_pen_coef(i)
       List_all_comb_b3_expo(  ii) = 2.d0 * j1b_pen(i)
       List_all_comb_b3_cent(1,ii) = nucl_coord(i,1)
       List_all_comb_b3_cent(2,ii) = nucl_coord(i,2)
@@ -337,7 +339,7 @@ END_PROVIDER
         
         ii = ii + 1
         ! x 2 to avoid doing integrals twice
-        List_all_comb_b3_coef(  ii) = 2.d0 * dexp(-tmp1*tmp2*tmp4*r2)
+        List_all_comb_b3_coef(  ii) = 2.d0 * dexp(-tmp1*tmp2*tmp4*r2) * j1b_pen_coef(i) * j1b_pen_coef(j)
         List_all_comb_b3_expo(  ii) = tmp3
         List_all_comb_b3_cent(1,ii) = tmp4 * (tmp1 * xi + tmp2 * xj)
         List_all_comb_b3_cent(2,ii) = tmp4 * (tmp1 * yi + tmp2 * yj)
diff --git a/src/ao_tc_eff_map/fit_j.irp.f b/src/ao_tc_eff_map/fit_j.irp.f
index 4730d003..0fc3da2f 100644
--- a/src/ao_tc_eff_map/fit_j.irp.f
+++ b/src/ao_tc_eff_map/fit_j.irp.f
@@ -36,16 +36,25 @@ END_PROVIDER
  END_PROVIDER 
 
 BEGIN_PROVIDER [ double precision, expo_j_xmu, (n_fit_1_erf_x) ]
- implicit none
- BEGIN_DOC
- ! F(x) = x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2) is fitted with a gaussian and a Slater
- !
- !      \approx - 1/sqrt(pi) * exp(-alpha * x ) exp(-beta * x**2)
- !
- ! where alpha = expo_j_xmu(1) and beta = expo_j_xmu(2)
- END_DOC
- expo_j_xmu(1) = 1.7477d0
- expo_j_xmu(2) = 0.668662d0
+
+  BEGIN_DOC
+  ! F(x) = x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2) is fitted with a gaussian and a Slater
+  !
+  !      \approx - 1/sqrt(pi) * exp(-alpha * x ) exp(-beta * x**2)
+  !
+  ! where alpha = expo_j_xmu(1) and beta = expo_j_xmu(2)
+  END_DOC
+
+  implicit none
+
+  !expo_j_xmu(1) = 1.7477d0
+  !expo_j_xmu(2) = 0.668662d0
+
+  !expo_j_xmu(1) = 1.74766377595541d0 
+  !expo_j_xmu(2) = 0.668719925486403d0
+
+  expo_j_xmu(1) = 1.74770446934522d0 
+  expo_j_xmu(2) = 0.668659706559979d0
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 9ca624b9..cac46b18 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -9,19 +9,19 @@ program bi_ort_ints
   implicit none
 
   my_grid_becke = .True.
-  !my_n_pt_r_grid = 10
-  !my_n_pt_a_grid = 14
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
 ! call test_3e
 ! call test_5idx
 ! call test_5idx2
- !call test_4idx
-  call test_4idx2()
-  call test_5idx2
- call test_5idx
+  call test_4idx()
+  call test_4idx_n4()
+  !call test_4idx2()
+  !call test_5idx2
+  !call test_5idx
 end
 
 subroutine test_5idx2
@@ -211,13 +211,138 @@ end
 
 ! ---
 
+subroutine test_4idx_n4()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  thr = 1d-10
+
+  PROVIDE three_e_4_idx_direct_bi_ort_old
+  PROVIDE three_e_4_idx_direct_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_direct_bi_ort_n4 (l,k,j,i)
+          ref = three_e_4_idx_direct_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_direct_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_direct_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  PROVIDE three_e_4_idx_exch13_bi_ort_old
+  PROVIDE three_e_4_idx_exch13_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch13_bi_ort_n4 (l,k,j,i)
+          ref = three_e_4_idx_exch13_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch13_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch13_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  PROVIDE three_e_4_idx_cycle_1_bi_ort_old
+  PROVIDE three_e_4_idx_cycle_1_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_cycle_1_bi_ort_n4 (l,k,j,i)
+          ref = three_e_4_idx_cycle_1_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_cycle_1_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_cycle_1_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  PROVIDE three_e_4_idx_exch23_bi_ort_old
+  PROVIDE three_e_4_idx_exch23_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch23_bi_ort_n4  (l,k,j,i)
+          ref = three_e_4_idx_exch23_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch23_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch23_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  return
+end
+
+! ---
+
 subroutine test_4idx()
 
   implicit none
   integer          :: i, j, k, l
   double precision :: accu, contrib, new, ref, thr
 
-  thr = 1d-5
+  thr = 1d-10
 
   PROVIDE three_e_4_idx_direct_bi_ort_old
   PROVIDE three_e_4_idx_direct_bi_ort 
@@ -275,34 +400,6 @@ subroutine test_4idx()
 
   ! ---
 
-!  PROVIDE three_e_4_idx_exch12_bi_ort_old
-!  PROVIDE three_e_4_idx_exch12_bi_ort 
-!
-!  accu = 0.d0
-!  do i = 1, mo_num
-!    do j = 1, mo_num
-!      do k = 1, mo_num
-!        do l = 1, mo_num
-!
-!          new = three_e_4_idx_exch12_bi_ort    (l,k,j,i)
-!          ref = three_e_4_idx_exch12_bi_ort_old(l,k,j,i)
-!          contrib = dabs(new - ref)
-!          accu += contrib
-!          if(contrib .gt. thr) then
-!            print*, ' problem in three_e_4_idx_exch12_bi_ort'
-!            print*, l, k, j, i
-!            print*, ref, new, contrib
-!            stop
-!          endif
-!
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  print*, ' accu on three_e_4_idx_exch12_bi_ort = ', accu / dble(mo_num)**4
-
-  ! ---
-
   PROVIDE three_e_4_idx_cycle_1_bi_ort_old
   PROVIDE three_e_4_idx_cycle_1_bi_ort
 
@@ -331,34 +428,6 @@ subroutine test_4idx()
 
   ! ---
 
-!  PROVIDE three_e_4_idx_cycle_2_bi_ort_old
-!  PROVIDE three_e_4_idx_cycle_2_bi_ort
-!
-!  accu = 0.d0
-!  do i = 1, mo_num
-!    do j = 1, mo_num
-!      do k = 1, mo_num
-!        do l = 1, mo_num
-!
-!          new = three_e_4_idx_cycle_2_bi_ort    (l,k,j,i)
-!          ref = three_e_4_idx_cycle_2_bi_ort_old(l,k,j,i)
-!          contrib = dabs(new - ref)
-!          accu += contrib
-!          if(contrib .gt. thr) then
-!            print*, ' problem in three_e_4_idx_cycle_2_bi_ort'
-!            print*, l, k, j, i
-!            print*, ref, new, contrib
-!            stop
-!          endif
-!
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  print*, ' accu on three_e_4_idx_cycle_2_bi_ort = ', accu / dble(mo_num)**4
-
-  ! ---
-
   PROVIDE three_e_4_idx_exch23_bi_ort_old
   PROVIDE three_e_4_idx_exch23_bi_ort
 
diff --git a/src/bi_ort_ints/one_e_bi_ort.irp.f b/src/bi_ort_ints/one_e_bi_ort.irp.f
index 5f2795f1..49181182 100644
--- a/src/bi_ort_ints/one_e_bi_ort.irp.f
+++ b/src/bi_ort_ints/one_e_bi_ort.irp.f
@@ -6,7 +6,7 @@ BEGIN_PROVIDER [double precision, ao_one_e_integrals_tc_tot, (ao_num,ao_num)]
   implicit none
   integer :: i, j
 
-  ao_one_e_integrals_tc_tot = ao_one_e_integrals      
+  ao_one_e_integrals_tc_tot = ao_one_e_integrals
 
   !provide j1b_type
 
diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 355fa38f..51f0cba4 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -140,8 +140,6 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
       enddo
     enddo
 
-    FREE int2_grad1_u12_ao
-
   endif
 
   call wall_time(wall1)
@@ -225,6 +223,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3,
   implicit none
   integer :: i, j, ipoint
 
+  PROVIDE int2_grad1_u12_ao
+
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       do j = 1, ao_num
diff --git a/src/bi_ort_ints/three_body_ijm.irp.f b/src/bi_ort_ints/three_body_ijm.irp.f
index ae100fb5..5de33a76 100644
--- a/src/bi_ort_ints/three_body_ijm.irp.f
+++ b/src/bi_ort_ints/three_body_ijm.irp.f
@@ -17,6 +17,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
   integer          :: i, j, m
   double precision :: integral, wall1, wall0
 
+  PROVIDE mo_l_coef mo_r_coef
+
   three_e_3_idx_direct_bi_ort = 0.d0
   print *, ' Providing the three_e_3_idx_direct_bi_ort ...'
   call wall_time(wall0)
@@ -125,6 +127,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_cycle_2_bi_ort, (mo_num, mo_num
   integer          :: i, j, m
   double precision :: integral, wall1, wall0
 
+  PROVIDE mo_l_coef mo_r_coef
+
   three_e_3_idx_cycle_2_bi_ort = 0.d0
   print *, ' Providing the three_e_3_idx_cycle_2_bi_ort ...'
   call wall_time(wall0)
@@ -179,6 +183,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch23_bi_ort, (mo_num, mo_num,
   integer          :: i, j, m
   double precision :: integral, wall1, wall0
 
+  PROVIDE mo_l_coef mo_r_coef
+
   three_e_3_idx_exch23_bi_ort = 0.d0
   print*,'Providing the three_e_3_idx_exch23_bi_ort ...'
   call wall_time(wall0)
@@ -233,6 +239,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch13_bi_ort, (mo_num, mo_num,
   integer :: i,j,m
   double precision :: integral, wall1, wall0
 
+  PROVIDE mo_l_coef mo_r_coef
+
   three_e_3_idx_exch13_bi_ort = 0.d0
   print *, ' Providing the three_e_3_idx_exch13_bi_ort ...'
   call wall_time(wall0)
@@ -287,6 +295,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch12_bi_ort, (mo_num, mo_num,
   integer          :: i, j, m
   double precision :: integral, wall1, wall0
 
+  PROVIDE mo_l_coef mo_r_coef
+
   three_e_3_idx_exch12_bi_ort = 0.d0
   print *, ' Providing the three_e_3_idx_exch12_bi_ort ...'
   call wall_time(wall0)
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index ee7e88ef..669861b7 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -3,9 +3,8 @@
 
  BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
-!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -13,28 +12,25 @@
   !
   ! three_e_4_idx_direct_bi_ort (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
   ! three_e_4_idx_exch13_bi_ort (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  ! three_e_4_idx_exch12_bi_ort (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !                                       = three_e_4_idx_exch13_bi_ort (j,m,k,i) 
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
   ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !                                       = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
   !
   ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
   ! three_e_4_idx_direct_bi_ort (m,j,k,i) : Lk Ri Imm Ijj + Lj Rj Imm Iki + Lm Rm Ijj Iki 
   ! three_e_4_idx_exch13_bi_ort (m,j,k,i) : Lk Rm Imi Ijj + Lj Rj Imi Ikm + Lm Ri Ijj Ikm 
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
   ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) : Lk Rm Imj Iji + Lj Ri Imj Ikm + Lm Rj Iji Ikm 
   !
   END_DOC
 
   implicit none
-  integer                       :: ipoint, i, j, k, l, m
+  integer                       :: ipoint, i, j, k, m, n
   double precision              :: wall1, wall0
-  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:)
-  double precision, allocatable :: tmp_4d(:,:,:,:)
-  double precision, allocatable :: tmp4(:,:,:)
-  double precision, allocatable :: tmp5(:,:)
-  double precision, allocatable :: tmp_3d(:,:,:)
+  double precision              :: tmp_loc_1, tmp_loc_2
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)
+  double precision, allocatable :: tmp_2d(:,:)
+  double precision, allocatable :: tmp_aux_1(:,:,:), tmp_aux_2(:,:)
 
   print *, ' Providing the three_e_4_idx_bi_ort ...'
   call wall_time(wall0)
@@ -42,324 +38,188 @@
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
 
-  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
-
-  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
+  ! to reduce the number of operations
+  allocate(tmp_aux_1(n_points_final_grid,4,mo_num))
+  allocate(tmp_aux_2(n_points_final_grid,mo_num))
 
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP PRIVATE (n, ipoint)                                       &
   !$OMP SHARED (mo_num, n_points_final_grid,                      &
   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1, tmp2, tmp3)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-
-        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
-
-        tmp2(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_r_in_r_array_transp(ipoint,i)
-        tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
-        tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
-
-        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp3, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp1)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_4d(m,i,j,k)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp2)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_4d(m,k,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp1, 3*n_points_final_grid, tmp3, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp3)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
-            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp1)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_direct_bi_ort(m,j,k,i) = three_e_4_idx_direct_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  deallocate(tmp_4d)
-
-
-  allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp5(n_points_final_grid,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, ipoint)                                       &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector,                         &
-  !$OMP         tmp5)
+  !$OMP         tmp_aux_1, tmp_aux_2)
   !$OMP DO
-  do i = 1, mo_num
+  do n = 1, mo_num
     do ipoint = 1, n_points_final_grid
-      tmp5(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+        tmp_aux_1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * final_weight_at_r_vector(ipoint)
+        tmp_aux_1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,n) * final_weight_at_r_vector(ipoint)
+        tmp_aux_1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,n) * final_weight_at_r_vector(ipoint)
+        tmp_aux_1(ipoint,4,n) = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,n) * final_weight_at_r_vector(ipoint)
+
+        tmp_aux_2(ipoint,n) = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,n)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  allocate(tmp_2d(mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,4,mo_num))
+  allocate(tmp2(n_points_final_grid,4,mo_num))
 
-  allocate(tmp4(n_points_final_grid,mo_num,mo_num))
-
-  do m = 1, mo_num
-
-    !$OMP PARALLEL                                                 &
-    !$OMP DEFAULT (NONE)                                           &
-    !$OMP PRIVATE (i, k, ipoint)                                   &
-    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
-    !$OMP         int2_grad1_u12_bimo_t,                           &
-    !$OMP         tmp4)
-    !$OMP DO COLLAPSE(2)
+  ! loops approach to break the O(N^4) scaling in memory
+  do k = 1, mo_num
     do i = 1, mo_num
-      do k = 1, mo_num
-        do ipoint = 1, n_points_final_grid
 
-          tmp4(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
-                           + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
-                           + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
-        enddo
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2)                 &
+    !$OMP SHARED (mo_num, n_points_final_grid, i, k,                &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp_aux_2, tmp1)
+    !$OMP DO
+    do n = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i)
+        tmp_loc_2 = tmp_aux_2(ipoint,n)
+
+        tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,i) * tmp_loc_2
+        tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,i) * tmp_loc_2
+        tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,i) * tmp_loc_2
+        tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * int2_grad1_u12_bimo_t(ipoint,1,k,i) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,n,n) * int2_grad1_u12_bimo_t(ipoint,2,k,i) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,n,n) * int2_grad1_u12_bimo_t(ipoint,3,k,i)
+
       enddo
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0 &
-              , tmp5, n_points_final_grid, tmp4, n_points_final_grid       &
-              , 0.d0, tmp_3d, mo_num)
+    call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                       &
+              , tmp_aux_1(1,1,1), 4*n_points_final_grid, tmp1(1,1,1), 4*n_points_final_grid &
+              , 0.d0, tmp_2d(1,1), mo_num)
 
-    !$OMP PARALLEL DO PRIVATE(i,j,k)
-    do i = 1, mo_num
-      do k = 1, mo_num
-        do j = 1, mo_num
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
-        enddo
-      enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-
-
-    !$OMP PARALLEL                                                 &
-    !$OMP DEFAULT (NONE)                                           &
-    !$OMP PRIVATE (j, k, ipoint)                                   &
-    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
-    !$OMP         mos_l_in_r_array_transp,                         &
-    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp4)
-    !$OMP DO COLLAPSE(2)
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-
-          tmp4(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
-                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
-                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
-                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
-        enddo
-      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
-              , tmp4, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
-              , 0.d0, tmp_3d, mo_num*mo_num)
-
-    !$OMP PARALLEL DO PRIVATE(i,j,k)
-    do i = 1, mo_num
-      do k = 1, mo_num
-        do j = 1, mo_num
-          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
-        enddo
-      enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-  enddo
-
-  deallocate(tmp5)
-  deallocate(tmp_3d)
-
-
-
-  do i = 1, mo_num
-
-    !$OMP PARALLEL                                                 &
-    !$OMP DEFAULT (NONE)                                           &
-    !$OMP PRIVATE (m, j, ipoint)                                   &
-    !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
-    !$OMP         mos_r_in_r_array_transp,                         &
-    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp4)
-    !$OMP DO COLLAPSE(2)
+    !$OMP PARALLEL DO PRIVATE(j,m)
     do j = 1, mo_num
       do m = 1, mo_num
-        do ipoint = 1, n_points_final_grid
+        three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_2d(m,j)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
 
-          tmp4(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
-                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
-        enddo
+
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2)                 &
+    !$OMP SHARED (mo_num, n_points_final_grid, i, k,                &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp1, tmp2)
+    !$OMP DO
+    do n = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n)
+        tmp_loc_2 = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,i)
+
+        tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,n) * tmp_loc_2
+        tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,n) * tmp_loc_2
+        tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,n) * tmp_loc_2
+        tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * int2_grad1_u12_bimo_t(ipoint,1,k,n) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,n,i) * int2_grad1_u12_bimo_t(ipoint,2,k,n) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,n,i) * int2_grad1_u12_bimo_t(ipoint,3,k,n)
+
+        tmp2(ipoint,1,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,n)
+        tmp2(ipoint,2,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,n)
+        tmp2(ipoint,3,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,n)
+        tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,n)
       enddo
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
-              , tmp4, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
-              , 1.d0, three_e_4_idx_cycle_1_bi_ort(1,1,1,i), mo_num*mo_num)
+    call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                       &
+              , tmp1(1,1,1), 4*n_points_final_grid, tmp_aux_1(1,1,1), 4*n_points_final_grid &
+              , 0.d0, tmp_2d(1,1), mo_num)
 
-  enddo
+    !$OMP PARALLEL DO PRIVATE(j,m)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_2d(m,j)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
 
-  deallocate(tmp4)
+    call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                  &
+              , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid &
+              , 0.d0, tmp_2d(1,1), mo_num)
 
+    !$OMP PARALLEL DO PRIVATE(j,m)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        three_e_4_idx_cycle_1_bi_ort(m,i,k,j) = -tmp_2d(m,j)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
 
-!  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-!  do i = 1, mo_num
-!    do k = 1, mo_num
-!      do j = 1, mo_num
-!        do m = 1, mo_num
-!          three_e_4_idx_exch12_bi_ort (m,j,k,i) = three_e_4_idx_exch13_bi_ort (j,m,k,i)
-!          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END PARALLEL DO
+  enddo ! i
+
+    do j = 1, mo_num
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2)                 &
+      !$OMP SHARED (mo_num, n_points_final_grid, j, k,                &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp1, tmp2)
+      !$OMP DO
+      do n = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp_loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,n)
+          tmp_loc_2 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,j)
+
+          tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,j) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,j,n) * tmp_loc_2
+          tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,j) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,j,n) * tmp_loc_2
+          tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,j) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,j,n) * tmp_loc_2
+          tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,j) * int2_grad1_u12_bimo_t(ipoint,1,j,n) &
+                           + int2_grad1_u12_bimo_t(ipoint,2,n,j) * int2_grad1_u12_bimo_t(ipoint,2,j,n) &
+                           + int2_grad1_u12_bimo_t(ipoint,3,n,j) * int2_grad1_u12_bimo_t(ipoint,3,j,n)
+
+          tmp2(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,k,n)
+          tmp2(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,k,n)
+          tmp2(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,k,n)
+          tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                  &
+                , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid &
+                , 0.d0, tmp_2d(1,1), mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(i,m)
+      do i = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch23_bi_ort(m,j,k,i) = -tmp_2d(m,i)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+    enddo ! j
+  enddo !k
+
+  deallocate(tmp_2d)
+  deallocate(tmp1)
+  deallocate(tmp2)
+  deallocate(tmp_aux_1)
+  deallocate(tmp_aux_2)
 
 
   call wall_time(wall1)
@@ -370,115 +230,3 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: i, j, k, l, m, ipoint
-  double precision              :: wall1, wall0
-  double precision, allocatable :: tmp1(:,:,:,:), tmp_4d(:,:,:,:)
-  double precision, allocatable :: tmp5(:,:,:), tmp6(:,:,:)
-
-  print *, ' Providing the three_e_4_idx_exch23_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
-
-  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
-  allocate(tmp6(n_points_final_grid,mo_num,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp5, tmp6)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-
-        tmp5(ipoint,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * int2_grad1_u12_bimo_t(ipoint,1,i,l) &
-                         + int2_grad1_u12_bimo_t(ipoint,2,l,i) * int2_grad1_u12_bimo_t(ipoint,2,i,l) &
-                         + int2_grad1_u12_bimo_t(ipoint,3,l,i) * int2_grad1_u12_bimo_t(ipoint,3,i,l) 
-
-        tmp6(ipoint,l,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, -1.d0 &
-            , tmp5, n_points_final_grid, tmp6, n_points_final_grid               &
-            , 0.d0, three_e_4_idx_exch23_bi_ort, mo_num*mo_num)
-
-  deallocate(tmp5)
-  deallocate(tmp6)
-
-
-  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
-            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp1)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_exch23_bi_ort(m,j,k,i) = three_e_4_idx_exch23_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  deallocate(tmp_4d)
-
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
diff --git a/src/bi_ort_ints/three_body_ijmk_n4.irp.f b/src/bi_ort_ints/three_body_ijmk_n4.irp.f
new file mode 100644
index 00000000..e3faeff0
--- /dev/null
+++ b/src/bi_ort_ints/three_body_ijmk_n4.irp.f
@@ -0,0 +1,486 @@
+
+! ---
+
+ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort_n4 ,  (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort_n4 ,  (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort_n4,  (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort_n4,  (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort_n4, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_direct_bi_ort_n4 (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch13_bi_ort_n4 (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch12_bi_ort_n4 (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                          = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i) 
+  ! three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_cycle_2_bi_ort_n4(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                          = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
+  !
+  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort_n4 can be directly used to compute Slater rules with a + sign
+  !
+  ! three_e_4_idx_direct_bi_ort_n4 (m,j,k,i) : Lk Ri Imm Ijj + Lj Rj Imm Iki + Lm Rm Ijj Iki 
+  ! three_e_4_idx_exch13_bi_ort_n4 (m,j,k,i) : Lk Rm Imi Ijj + Lj Rj Imi Ikm + Lm Ri Ijj Ikm 
+  ! three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) : Lk Rm Imj Iji + Lj Ri Imj Ikm + Lm Rj Iji Ikm 
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: ipoint, i, j, k, l, m
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:)
+  double precision, allocatable :: tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp4(:,:,:)
+  double precision, allocatable :: tmp5(:,:)
+  double precision, allocatable :: tmp_3d(:,:,:)
+
+  print *, ' Providing the O(N^4) three_e_4_idx_bi_ort ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1, tmp2, tmp3)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+
+        tmp2(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
+
+        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, tmp2(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort_n4(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp3(1,1,1,1), 3*n_points_final_grid, tmp1(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) = -tmp_4d(m,i,j,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, tmp2(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+
+  deallocate(tmp2)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) - tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, tmp3(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp3)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0                         &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort_n4(m,j,k,i) = three_e_4_idx_direct_bi_ort_n4(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp5(n_points_final_grid,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, ipoint)                                       &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector,                         &
+  !$OMP         tmp5)
+  !$OMP DO
+  do i = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+      tmp5(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  allocate(tmp4(n_points_final_grid,mo_num,mo_num))
+
+  do m = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (i, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         int2_grad1_u12_bimo_t,                           &
+    !$OMP         tmp4)
+    !$OMP DO COLLAPSE(2)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp4(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0       &
+              , tmp5(1,1), n_points_final_grid, tmp4(1,1,1), n_points_final_grid &
+              , 0.d0, tmp_3d(1,1,1), mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (j, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         mos_l_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp4)
+    !$OMP DO COLLAPSE(2)
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp4(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0                          &
+              , tmp4(1,1,1), n_points_final_grid, mos_r_in_r_array_transp(1,1), n_points_final_grid &
+              , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+  enddo
+
+  deallocate(tmp5)
+  deallocate(tmp_3d)
+
+
+
+  do i = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (m, j, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
+    !$OMP         mos_r_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp4)
+    !$OMP DO COLLAPSE(2)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp4(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0                         &
+              , tmp4(1,1,1), n_points_final_grid, mos_l_in_r_array_transp(1,1), n_points_final_grid &
+              , 1.d0, three_e_4_idx_cycle_1_bi_ort_n4(1,1,1,i), mo_num*mo_num)
+
+  enddo
+
+  deallocate(tmp4)
+
+
+!  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+!  do i = 1, mo_num
+!    do k = 1, mo_num
+!      do j = 1, mo_num
+!        do m = 1, mo_num
+!          three_e_4_idx_exch12_bi_ort_n4 (m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i)
+!          three_e_4_idx_cycle_2_bi_ort_n4(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END PARALLEL DO
+
+
+  call wall_time(wall1)
+  print *, ' wall time for O(N^4) three_e_4_idx_bi_ort', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort_n4 , (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch23_bi_ort_n4 (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort_n4 can be directly used to compute Slater rules with a + sign
+  !
+  ! three_e_4_idx_exch23_bi_ort_n4 (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: i, j, k, l, m, ipoint
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp5(:,:,:), tmp6(:,:,:)
+
+  print *, ' Providing the O(N^4) three_e_4_idx_exch23_bi_ort_n4 ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+
+  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp6(n_points_final_grid,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp5, tmp6)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp5(ipoint,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * int2_grad1_u12_bimo_t(ipoint,1,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,l,i) * int2_grad1_u12_bimo_t(ipoint,2,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,l,i) * int2_grad1_u12_bimo_t(ipoint,3,i,l) 
+
+        tmp6(ipoint,l,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, -1.d0 &
+            , tmp5(1,1,1), n_points_final_grid, tmp6(1,1,1), n_points_final_grid &
+            , 0.d0, three_e_4_idx_exch23_bi_ort_n4(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp5)
+  deallocate(tmp6)
+
+
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0                         &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch23_bi_ort_n4(m,j,k,i) = three_e_4_idx_exch23_bi_ort_n4(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  call wall_time(wall1)
+  print *, ' wall time for O(N^4) three_e_4_idx_exch23_bi_ort_n4', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
diff --git a/src/bi_ort_ints/total_twoe_pot.irp.f b/src/bi_ort_ints/total_twoe_pot.irp.f
index 721ea0c8..f03e8a34 100644
--- a/src/bi_ort_ints/total_twoe_pot.irp.f
+++ b/src/bi_ort_ints/total_twoe_pot.irp.f
@@ -80,6 +80,8 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       enddo
     enddo
 
+    FREE ao_tc_int_chemist
+
   endif
 
 END_PROVIDER 
@@ -128,69 +130,99 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
 
   implicit none
   integer                       :: i, j, k, l, m, n, p, q
-  double precision, allocatable :: mo_tmp_1(:,:,:,:), mo_tmp_2(:,:,:,:)
+  double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:)
 
-  allocate(mo_tmp_1(mo_num,ao_num,ao_num,ao_num))
-  mo_tmp_1 = 0.d0
+  PROVIDE mo_r_coef mo_l_coef
 
-  do m = 1, ao_num
-    do p = 1, ao_num
-      do n = 1, ao_num
-        do q = 1, ao_num
-          do k = 1, mo_num
-            !       (k n|p m)    = sum_q c_qk * (q n|p m)
-            mo_tmp_1(k,n,p,m) += mo_l_coef_transp(k,q) * ao_two_e_tc_tot(q,n,p,m)
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
+  allocate(a2(ao_num,ao_num,ao_num,mo_num))
 
-  allocate(mo_tmp_2(mo_num,mo_num,ao_num,ao_num))
-  mo_tmp_2 = 0.d0
+  call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0     &
+            , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
+            , 0.d0 , a2(1,1,1,1), ao_num*ao_num*ao_num)
 
-  do m = 1, ao_num
-    do p = 1, ao_num
-      do n = 1, ao_num
-        do i = 1, mo_num
-          do k = 1, mo_num
-            !       (k i|p m) = sum_n c_ni * (k n|p m)
-            mo_tmp_2(k,i,p,m) += mo_r_coef_transp(i,n) * mo_tmp_1(k,n,p,m)
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
-  deallocate(mo_tmp_1)
+  allocate(a1(ao_num,ao_num,mo_num,mo_num))
 
-  allocate(mo_tmp_1(mo_num,mo_num,mo_num,ao_num))
-  mo_tmp_1 = 0.d0
-  do m = 1, ao_num
-    do p = 1, ao_num
-      do l = 1, mo_num
-        do i = 1, mo_num
-          do k = 1, mo_num
-            mo_tmp_1(k,i,l,m) += mo_l_coef_transp(l,p) * mo_tmp_2(k,i,p,m)
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
-  deallocate(mo_tmp_2)
+  call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
+            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+            , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num)
 
-  mo_bi_ortho_tc_two_e_chemist = 0.d0 
-  do m = 1, ao_num
-    do j = 1, mo_num
-      do l = 1, mo_num
-        do i = 1, mo_num
-          do k = 1, mo_num
-            mo_bi_ortho_tc_two_e_chemist(k,i,l,j) += mo_r_coef_transp(j,m) * mo_tmp_1(k,i,l,m)
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
-  deallocate(mo_tmp_1)
+  deallocate(a2)
+  allocate(a2(ao_num,mo_num,mo_num,mo_num))
+
+  call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+            , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num          &
+            , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num)
+
+  deallocate(a1)
+
+  call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+            , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num)
+
+  deallocate(a2)
+
+
+  !allocate(a1(mo_num,ao_num,ao_num,ao_num))
+  !a1 = 0.d0
+
+  !do m = 1, ao_num
+  !  do p = 1, ao_num
+  !    do n = 1, ao_num
+  !      do q = 1, ao_num
+  !        do k = 1, mo_num
+  !          !       (k n|p m)    = sum_q c_qk * (q n|p m)
+  !          a1(k,n,p,m) += mo_l_coef_transp(k,q) * ao_two_e_tc_tot(q,n,p,m)
+  !        enddo
+  !      enddo
+  !    enddo
+  !  enddo
+  !enddo
+
+  !allocate(a2(mo_num,mo_num,ao_num,ao_num))
+  !a2 = 0.d0
+
+  !do m = 1, ao_num
+  !  do p = 1, ao_num
+  !    do n = 1, ao_num
+  !      do i = 1, mo_num
+  !        do k = 1, mo_num
+  !          !       (k i|p m) = sum_n c_ni * (k n|p m)
+  !          a2(k,i,p,m) += mo_r_coef_transp(i,n) * a1(k,n,p,m)
+  !        enddo
+  !      enddo
+  !    enddo
+  !  enddo
+  !enddo
+  !deallocate(a1)
+
+  !allocate(a1(mo_num,mo_num,mo_num,ao_num))
+  !a1 = 0.d0
+  !do m = 1, ao_num
+  !  do p = 1, ao_num
+  !    do l = 1, mo_num
+  !      do i = 1, mo_num
+  !        do k = 1, mo_num
+  !          a1(k,i,l,m) += mo_l_coef_transp(l,p) * a2(k,i,p,m)
+  !        enddo
+  !      enddo
+  !    enddo
+  !  enddo
+  !enddo
+  !deallocate(a2)
+
+  !mo_bi_ortho_tc_two_e_chemist = 0.d0 
+  !do m = 1, ao_num
+  !  do j = 1, mo_num
+  !    do l = 1, mo_num
+  !      do i = 1, mo_num
+  !        do k = 1, mo_num
+  !          mo_bi_ortho_tc_two_e_chemist(k,i,l,j) += mo_r_coef_transp(j,m) * a1(k,i,l,m)
+  !        enddo
+  !      enddo
+  !    enddo
+  !  enddo
+  !enddo
+  !deallocate(a1)
 
 END_PROVIDER 
 
@@ -209,6 +241,8 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e, (mo_num, mo_num, mo_num,
   implicit none
   integer :: i, j, k, l
 
+  PROVIDE mo_bi_ortho_tc_two_e_chemist
+
   do j = 1, mo_num
     do i = 1, mo_num
       do l = 1, mo_num
@@ -220,56 +254,62 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e, (mo_num, mo_num, mo_num,
     enddo
   enddo
 
+  FREE mo_bi_ortho_tc_two_e_chemist
+
 END_PROVIDER 
 
 ! ---
 
 
- BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj_exchange, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj_anti, (mo_num,mo_num) ]
-  implicit none
+ BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj,          (mo_num,mo_num)]
+&BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj_exchange, (mo_num,mo_num)]
+&BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj_anti,     (mo_num,mo_num)]
+
   BEGIN_DOC
   ! mo_bi_ortho_tc_two_e_jj(i,j) = J_ij = <ji|W-K|ji>
   ! mo_bi_ortho_tc_two_e_jj_exchange(i,j) = K_ij = <ij|W-K|ji>
   ! mo_bi_ortho_tc_two_e_jj_anti(i,j) = J_ij - K_ij
   END_DOC
 
-  integer                        :: i,j
-  double precision               :: get_two_e_integral
+  implicit none
+  integer :: i, j
 
-  mo_bi_ortho_tc_two_e_jj = 0.d0
+  mo_bi_ortho_tc_two_e_jj          = 0.d0
   mo_bi_ortho_tc_two_e_jj_exchange = 0.d0
 
-  do i=1,mo_num
-    do j=1,mo_num
-      mo_bi_ortho_tc_two_e_jj(i,j) = mo_bi_ortho_tc_two_e(j,i,j,i)
+  do i = 1, mo_num
+    do j = 1, mo_num
+      mo_bi_ortho_tc_two_e_jj(i,j)          = mo_bi_ortho_tc_two_e(j,i,j,i)
       mo_bi_ortho_tc_two_e_jj_exchange(i,j) = mo_bi_ortho_tc_two_e(i,j,j,i)
-      mo_bi_ortho_tc_two_e_jj_anti(i,j) = mo_bi_ortho_tc_two_e_jj(i,j) - mo_bi_ortho_tc_two_e_jj_exchange(i,j)
+      mo_bi_ortho_tc_two_e_jj_anti(i,j)     = mo_bi_ortho_tc_two_e_jj(i,j) - mo_bi_ortho_tc_two_e_jj_exchange(i,j)
     enddo
   enddo
 
 END_PROVIDER
 
- BEGIN_PROVIDER [double precision, tc_2e_3idx_coulomb_integrals, (mo_num,mo_num, mo_num)]
-&BEGIN_PROVIDER [double precision, tc_2e_3idx_exchange_integrals,(mo_num,mo_num, mo_num)]
- implicit none
- BEGIN_DOC
- ! tc_2e_3idx_coulomb_integrals(j,k,i)  = <jk|ji> 
- !
- ! tc_2e_3idx_exchange_integrals(j,k,i) = <kj|ji> 
- END_DOC
- integer :: i,j,k,l
- double precision :: get_two_e_integral
- double precision :: integral
+! ---
 
- do i = 1, mo_num
-  do k = 1, mo_num
-   do j = 1, mo_num
-     tc_2e_3idx_coulomb_integrals(j, k,i) = mo_bi_ortho_tc_two_e(j ,k ,j ,i ) 
-     tc_2e_3idx_exchange_integrals(j,k,i) = mo_bi_ortho_tc_two_e(k ,j ,j ,i ) 
-   enddo
+ BEGIN_PROVIDER [double precision, tc_2e_3idx_coulomb_integrals , (mo_num,mo_num,mo_num)]
+&BEGIN_PROVIDER [double precision, tc_2e_3idx_exchange_integrals, (mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  ! tc_2e_3idx_coulomb_integrals (j,k,i) = <jk|ji> 
+  ! tc_2e_3idx_exchange_integrals(j,k,i) = <kj|ji> 
+  END_DOC
+
+  implicit none
+  integer :: i, j, k
+
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        tc_2e_3idx_coulomb_integrals(j, k,i) = mo_bi_ortho_tc_two_e(j ,k ,j ,i ) 
+        tc_2e_3idx_exchange_integrals(j,k,i) = mo_bi_ortho_tc_two_e(k ,j ,j ,i ) 
+      enddo
+    enddo
   enddo
- enddo
 
 END_PROVIDER
+
+! ---
+
diff --git a/src/bi_ortho_mos/mos_rl.irp.f b/src/bi_ortho_mos/mos_rl.irp.f
index c69309d1..13eedfb7 100644
--- a/src/bi_ortho_mos/mos_rl.irp.f
+++ b/src/bi_ortho_mos/mos_rl.irp.f
@@ -136,6 +136,7 @@ BEGIN_PROVIDER [ double precision, mo_r_coef, (ao_num, mo_num) ]
         mo_r_coef(j,i) = mo_coef(j,i)
       enddo
     enddo
+    call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
   endif
 
 END_PROVIDER
@@ -191,6 +192,7 @@ BEGIN_PROVIDER [ double precision, mo_l_coef, (ao_num, mo_num) ]
         mo_l_coef(j,i) = mo_coef(j,i)
       enddo
     enddo
+    call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
   endif
 
 END_PROVIDER
diff --git a/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
index a06f28e9..66d82964 100644
--- a/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@@ -1,47 +1,54 @@
+
+! ---
+
 subroutine run_stochastic_cipsi
+
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+
   use selection_types
   implicit none
-  BEGIN_DOC
-! Selected Full Configuration Interaction with Stochastic selection and PT2.
-  END_DOC
-  integer                        :: i,j,k,ndet
-  double precision, allocatable  :: zeros(:)
-  integer                        :: to_select
-  type(pt2_type)                 :: pt2_data, pt2_data_err
-  logical, external              :: qp_stop
-  logical                        :: print_pt2
+  integer                       :: i, j, k, ndet
+  integer                       :: to_select
+  logical                       :: print_pt2
+  logical                       :: has
+  type(pt2_type)                :: pt2_data, pt2_data_err
+  double precision              :: rss
+  double precision              :: correlation_energy_ratio, E_denom, E_tc, norm
+  double precision              :: hf_energy_ref
+  double precision              :: relative_error
+  double precision, allocatable :: ept2(:), pt1(:), extrap_energy(:)
+  double precision, allocatable :: zeros(:)
 
-  double precision :: rss
-  double precision, external :: memory_of_double
-  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
-  double precision, allocatable :: ept2(:), pt1(:),extrap_energy(:)
+  logical,          external    :: qp_stop
+  double precision, external    :: memory_of_double
+
+  PROVIDE mo_l_coef mo_r_coef
   PROVIDE H_apply_buffer_allocated distributed_davidson 
 
-  print*,'Diagonal elements of the Fock matrix '
+  print*, ' Diagonal elements of the Fock matrix '
   do i = 1, mo_num
-   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
   enddo
+
   N_iter = 1
   threshold_generators = 1.d0
   SOFT_TOUCH threshold_generators
 
   rss = memory_of_double(N_states)*4.d0
-  call check_mem(rss,irp_here)
+  call check_mem(rss, irp_here)
 
-  allocate (zeros(N_states))
+  allocate(zeros(N_states))
   call pt2_alloc(pt2_data, N_states)
   call pt2_alloc(pt2_data_err, N_states)
 
-  double precision               :: hf_energy_ref
-  logical                        :: has
-  double precision               :: relative_error
+  relative_error = PT2_relative_error
 
-  relative_error=PT2_relative_error
-
-  zeros = 0.d0
-  pt2_data % pt2   = -huge(1.e0)
-  pt2_data % rpt2  = -huge(1.e0)
-  pt2_data % overlap= 0.d0
+  zeros               = 0.d0
+  pt2_data % pt2      = -huge(1.e0)
+  pt2_data % rpt2     = -huge(1.e0)
+  pt2_data % overlap  = 0.d0
   pt2_data % variance = huge(1.e0)
 
   !!!! WARNING  !!!! SEEMS TO BE PROBLEM WTH make_s2_eigenfunction !!!! THE DETERMINANTS CAN APPEAR TWICE IN THE WFT DURING SELECTION
@@ -49,7 +56,7 @@ subroutine run_stochastic_cipsi
 !    call make_s2_eigenfunction
 !  endif
   print_pt2 = .False.
-  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc, norm, pt2_data, print_pt2)
 !  call routine_save_right
 
 
@@ -74,14 +81,12 @@ subroutine run_stochastic_cipsi
 ! soft_touch thresh_it_dav
 
   print_pt2 = .True.
-  do while (                                                         &
-        (N_det < N_det_max) .and.                                    &
-        (maxval(abs(pt2_data % pt2(1:N_states))) > pt2_max)          &
-        )
-      print*,'maxval(abs(pt2_data % pt2(1:N_states)))',maxval(abs(pt2_data % pt2(1:N_states)))
-      print*,pt2_max
-      write(*,'(A)')  '--------------------------------------------------------------------------------'
+  do while( (N_det < N_det_max) .and. &
+            (maxval(abs(pt2_data % pt2(1:N_states))) > pt2_max))
 
+    print*,'maxval(abs(pt2_data % pt2(1:N_states)))',maxval(abs(pt2_data % pt2(1:N_states)))
+    print*,pt2_max
+    write(*,'(A)')  '--------------------------------------------------------------------------------'
 
     to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
     to_select = max(N_states_diag, to_select)
@@ -94,8 +99,7 @@ subroutine run_stochastic_cipsi
     call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
 !    stop
 
-    call print_summary(psi_energy_with_nucl_rep, &
-       pt2_data, pt2_data_err, N_det,N_configuration,N_states,psi_s2)
+    call print_summary(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2)
 
     call save_energy(psi_energy_with_nucl_rep, pt2_data % pt2)
 
@@ -109,13 +113,13 @@ subroutine run_stochastic_cipsi
     ! Add selected determinants
     call copy_H_apply_buffer_to_wf_tc()
 
-    PROVIDE  psi_l_coef_bi_ortho psi_r_coef_bi_ortho
-    PROVIDE  psi_det
-    PROVIDE  psi_det_sorted_tc
+    PROVIDE psi_l_coef_bi_ortho psi_r_coef_bi_ortho
+    PROVIDE psi_det
+    PROVIDE psi_det_sorted_tc
 
     ept2(N_iter-1) = E_tc + nuclear_repulsion + (pt2_data % pt2(1))/norm
-    pt1(N_iter-1) = dsqrt(pt2_data % overlap(1,1))
-    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+    pt1(N_iter-1)  = dsqrt(pt2_data % overlap(1,1))
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc, norm, pt2_data, print_pt2)
 !    stop
     if (qp_stop()) exit
   enddo
diff --git a/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f b/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f
index 670b2395..1a8269f4 100644
--- a/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f
+++ b/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f
@@ -343,7 +343,7 @@ subroutine davidson_general_diag_dressed_ext_rout_nonsym_b1space(u_in, H_jj, Dre
         if(lambda_tmp .lt. 0.7d0) then
           print *, ' very small overlap ...', l, i_omax(l)
           print *, ' max overlap = ', lambda_tmp
-          stop
+          !stop
         endif
 
         if(i_omax(l) .ne. l) then
diff --git a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
index 1bed60fe..4b7b9cc9 100644
--- a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
+++ b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
@@ -342,7 +342,7 @@ subroutine davidson_general_ext_rout_nonsym_b1space(u_in, H_jj, energies, sze, N
         if(lambda_tmp .lt. 0.7d0) then
           print *, ' very small overlap ...', l, i_omax(l)
           print *, ' max overlap = ', lambda_tmp
-          stop
+          !stop
         endif
 
         if(i_omax(l) .ne. l) then
diff --git a/src/dft_utils_in_r/ao_in_r.irp.f b/src/dft_utils_in_r/ao_in_r.irp.f
index b8beea76..16414f39 100644
--- a/src/dft_utils_in_r/ao_in_r.irp.f
+++ b/src/dft_utils_in_r/ao_in_r.irp.f
@@ -1,53 +1,64 @@
 
- BEGIN_PROVIDER[double precision, aos_in_r_array, (ao_num,n_points_final_grid)]
- implicit none
- BEGIN_DOC
- ! aos_in_r_array(i,j)        = value of the ith ao on the jth grid point
- END_DOC
- integer :: i,j
- double precision :: aos_array(ao_num), r(3)
- !$OMP PARALLEL DO &
- !$OMP DEFAULT (NONE)  &
- !$OMP PRIVATE (i,r,aos_array,j) & 
- !$OMP SHARED(aos_in_r_array,n_points_final_grid,ao_num,final_grid_points)
- do i = 1, n_points_final_grid
-  r(1) = final_grid_points(1,i)
-  r(2) = final_grid_points(2,i)
-  r(3) = final_grid_points(3,i)
-  call give_all_aos_at_r(r,aos_array)
-  do j = 1, ao_num
-   aos_in_r_array(j,i) = aos_array(j)
+! ---
+
+BEGIN_PROVIDER[double precision, aos_in_r_array, (ao_num,n_points_final_grid)]
+
+  BEGIN_DOC
+  ! aos_in_r_array(i,j) = value of the ith ao on the jth grid point
+  END_DOC
+
+  implicit none
+  integer          :: i, j
+  double precision :: tmp_array(ao_num), r(3)
+
+  !$OMP PARALLEL DO               &
+  !$OMP DEFAULT (NONE)            &
+  !$OMP PRIVATE (i,r,tmp_array,j) & 
+  !$OMP SHARED(aos_in_r_array,n_points_final_grid,ao_num,final_grid_points)
+  do i = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    call give_all_aos_at_r(r, tmp_array)
+    do j = 1, ao_num
+      aos_in_r_array(j,i) = tmp_array(j)
+    enddo
   enddo
- enddo
- !$OMP END PARALLEL DO
+  !$OMP END PARALLEL DO
 
- END_PROVIDER
+END_PROVIDER
 
+! ---
 
- BEGIN_PROVIDER[double precision, aos_in_r_array_transp, (n_points_final_grid,ao_num)]
- implicit none
- BEGIN_DOC
- ! aos_in_r_array_transp(i,j) = value of the jth ao on the ith grid point
- END_DOC
- integer :: i,j
- double precision :: aos_array(ao_num), r(3)
- do i = 1, n_points_final_grid
-  do j = 1, ao_num
-   aos_in_r_array_transp(i,j) = aos_in_r_array(j,i) 
+BEGIN_PROVIDER[double precision, aos_in_r_array_transp, (n_points_final_grid,ao_num)]
+
+  BEGIN_DOC
+  ! aos_in_r_array_transp(i,j) = value of the jth ao on the ith grid point
+  END_DOC
+
+  implicit none
+  integer          :: i, j
+  double precision :: aos_array(ao_num), r(3)
+
+  do i = 1, n_points_final_grid
+    do j = 1, ao_num
+      aos_in_r_array_transp(i,j) = aos_in_r_array(j,i) 
+    enddo
   enddo
- enddo
 
- END_PROVIDER
+END_PROVIDER
 
+! ---
 
+BEGIN_PROVIDER[double precision, aos_grad_in_r_array, (ao_num,n_points_final_grid,3)]
 
- BEGIN_PROVIDER[double precision, aos_grad_in_r_array, (ao_num,n_points_final_grid,3)]
- implicit none
  BEGIN_DOC
  ! aos_grad_in_r_array(i,j,k)          = value of the kth component of the gradient of ith ao on the jth grid point
  !
  ! k = 1 : x, k= 2, y, k  3, z
  END_DOC
+
+ implicit none
  integer :: i,j,m
  double precision :: aos_array(ao_num), r(3)
  double precision :: aos_grad_array(3,ao_num)
diff --git a/src/fci_tc_bi/diagonalize_ci.irp.f b/src/fci_tc_bi/diagonalize_ci.irp.f
index df753449..6c8f3431 100644
--- a/src/fci_tc_bi/diagonalize_ci.irp.f
+++ b/src/fci_tc_bi/diagonalize_ci.irp.f
@@ -1,21 +1,29 @@
 
-subroutine diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+! ---
+
+subroutine diagonalize_CI_tc_bi_ortho(ndet, E_tc, norm, pt2_data, print_pt2)
+
+  BEGIN_DOC
+  !  Replace the coefficients of the CI states by the coefficients of the
+  !  eigenstates of the CI matrix
+  END_DOC
+
   use selection_types
   implicit none
-  integer, intent(inout)              :: ndet      ! number of determinants from before 
-  double precision, intent(inout)     :: E_tc,norm ! E and norm from previous wave function 
-  type(pt2_type)  , intent(in)        :: pt2_data  ! PT2 from previous wave function 
-  logical, intent(in) :: print_pt2
-  BEGIN_DOC
-!  Replace the coefficients of the CI states by the coefficients of the
-!  eigenstates of the CI matrix
-  END_DOC
-  integer :: i,j
- double precision :: pt2_tmp,pt1_norm,rpt2_tmp,abs_pt2
- pt2_tmp = pt2_data % pt2(1)
- abs_pt2 = pt2_data % variance(1)
- pt1_norm = pt2_data % overlap(1,1)
- rpt2_tmp = pt2_tmp/(1.d0 + pt1_norm)
+  integer,          intent(inout) :: ndet       ! number of determinants from before 
+  double precision, intent(inout) :: E_tc, norm ! E and norm from previous wave function 
+  type(pt2_type)  , intent(in)    :: pt2_data   ! PT2 from previous wave function 
+  logical,          intent(in)    :: print_pt2
+  integer                         :: i, j
+  double precision                :: pt2_tmp, pt1_norm, rpt2_tmp, abs_pt2
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  pt2_tmp  = pt2_data % pt2(1)
+  abs_pt2  = pt2_data % variance(1)
+  pt1_norm = pt2_data % overlap(1,1)
+  rpt2_tmp = pt2_tmp/(1.d0 + pt1_norm)
+
   print*,'*****'
   print*,'New wave function information'
   print*,'N_det tc               = ',N_det
@@ -23,53 +31,61 @@ subroutine diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
   print*,'eigval_right_tc = ',eigval_right_tc_bi_orth(1)
   print*,'Ndet, E_tc = ',N_det,eigval_right_tc_bi_orth(1)
   print*,'*****'
-  if(print_pt2)then
-   print*,'*****'
-   print*,'previous wave function info'
-   print*,'norm(before)      = ',norm
-   print*,'E(before)         = ',E_tc 
-   print*,'PT1 norm          = ',dsqrt(pt1_norm)
-   print*,'PT2               = ',pt2_tmp
-   print*,'rPT2              = ',rpt2_tmp
-   print*,'|PT2|             = ',abs_pt2
-   print*,'Positive PT2      = ',(pt2_tmp + abs_pt2)*0.5d0
-   print*,'Negative PT2      = ',(pt2_tmp - abs_pt2)*0.5d0
-   print*,'E(before) + PT2   = ',E_tc + pt2_tmp/norm
-   print*,'E(before) +rPT2   = ',E_tc + rpt2_tmp/norm
-   write(*,'(A28,X,I10,X,100(F16.8,X))')'Ndet,E,E+PT2,E+RPT2,|PT2|=',ndet,E_tc ,E_tc  + pt2_tmp/norm,E_tc  + rpt2_tmp/norm,abs_pt2
-   print*,'*****'
+
+  if(print_pt2) then
+    print*,'*****'
+    print*,'previous wave function info'
+    print*,'norm(before)      = ',norm
+    print*,'E(before)         = ',E_tc 
+    print*,'PT1 norm          = ',dsqrt(pt1_norm)
+    print*,'PT2               = ',pt2_tmp
+    print*,'rPT2              = ',rpt2_tmp
+    print*,'|PT2|             = ',abs_pt2
+    print*,'Positive PT2      = ',(pt2_tmp + abs_pt2)*0.5d0
+    print*,'Negative PT2      = ',(pt2_tmp - abs_pt2)*0.5d0
+    print*,'E(before) + PT2   = ',E_tc + pt2_tmp/norm
+    print*,'E(before) +rPT2   = ',E_tc + rpt2_tmp/norm
+    write(*,'(A28,X,I10,X,100(F16.8,X))')'Ndet,E,E+PT2,E+RPT2,|PT2|=',ndet,E_tc ,E_tc  + pt2_tmp/norm,E_tc  + rpt2_tmp/norm,abs_pt2
+    print*,'*****'
   endif
+
   psi_energy(1:N_states) = eigval_right_tc_bi_orth(1:N_states) - nuclear_repulsion
   psi_s2(1:N_states) = s2_eigvec_tc_bi_orth(1:N_states)
 
-  E_tc  = eigval_right_tc_bi_orth(1)
-  norm  = norm_ground_left_right_bi_orth
-  ndet  = N_det
-  do j=1,N_states
-    do i=1,N_det
+  E_tc = eigval_right_tc_bi_orth(1)
+  norm = norm_ground_left_right_bi_orth
+  ndet = N_det
+  do j = 1, N_states
+    do i = 1, N_det
       psi_l_coef_bi_ortho(i,j) = leigvec_tc_bi_orth(i,j)
       psi_r_coef_bi_ortho(i,j) = reigvec_tc_bi_orth(i,j)
-      psi_coef(i,j) = dabs(psi_l_coef_bi_ortho(i,j) * psi_r_coef_bi_ortho(i,j))   
+      psi_coef(i,j)            = dabs(psi_l_coef_bi_ortho(i,j) * psi_r_coef_bi_ortho(i,j))   
     enddo
   enddo
-  SOFT_TOUCH eigval_left_tc_bi_orth  eigval_right_tc_bi_orth leigvec_tc_bi_orth reigvec_tc_bi_orth norm_ground_left_right_bi_orth 
+  SOFT_TOUCH eigval_left_tc_bi_orth eigval_right_tc_bi_orth leigvec_tc_bi_orth reigvec_tc_bi_orth norm_ground_left_right_bi_orth 
   SOFT_TOUCH psi_l_coef_bi_ortho psi_r_coef_bi_ortho psi_coef psi_energy psi_s2
 
-  call save_tc_bi_ortho_wavefunction
+  call save_tc_bi_ortho_wavefunction()
+
 end
 
-subroutine print_CI_dressed(ndet, E_tc,norm,pt2_data,print_pt2)
+! ---
+
+subroutine print_CI_dressed(ndet, E_tc, norm, pt2_data, print_pt2)
+
+  BEGIN_DOC
+  !  Replace the coefficients of the CI states by the coefficients of the
+  !  eigenstates of the CI matrix
+  END_DOC
+
   use selection_types
   implicit none
-  integer, intent(inout)              :: ndet      ! number of determinants from before 
-  double precision, intent(inout)     :: E_tc,norm ! E and norm from previous wave function 
-  type(pt2_type)  , intent(in)        :: pt2_data  ! PT2 from previous wave function 
-  logical, intent(in) :: print_pt2
-  BEGIN_DOC
-!  Replace the coefficients of the CI states by the coefficients of the
-!  eigenstates of the CI matrix
-  END_DOC
-  integer :: i,j
+  integer,          intent(inout) :: ndet      ! number of determinants from before 
+  double precision, intent(inout) :: E_tc,norm ! E and norm from previous wave function 
+  type(pt2_type)  , intent(in)    :: pt2_data  ! PT2 from previous wave function 
+  logical,          intent(in)    :: print_pt2
+  integer                         :: i, j
+
   print*,'*****'
   print*,'New wave function information'
   print*,'N_det tc               = ',N_det
@@ -77,22 +93,25 @@ subroutine print_CI_dressed(ndet, E_tc,norm,pt2_data,print_pt2)
   print*,'eigval_right_tc = ',eigval_right_tc_bi_orth(1)
   print*,'Ndet, E_tc = ',N_det,eigval_right_tc_bi_orth(1)
   print*,'*****'
-  if(print_pt2)then
-   print*,'*****'
-   print*,'previous wave function info'
-   print*,'norm(before)      = ',norm
-   print*,'E(before)         = ',E_tc
-   print*,'PT1 norm          = ',dsqrt(pt2_data % overlap(1,1))
-   print*,'E(before) + PT2   = ',E_tc + (pt2_data % pt2(1))/norm
-   print*,'PT2               = ',pt2_data % pt2(1)
-   print*,'Ndet, E_tc, E+PT2 = ',ndet,E_tc,E_tc + (pt2_data % pt2(1))/norm,dsqrt(pt2_data % overlap(1,1))
-   print*,'*****'
+
+  if(print_pt2) then
+    print*,'*****'
+    print*,'previous wave function info'
+    print*,'norm(before)      = ',norm
+    print*,'E(before)         = ',E_tc
+    print*,'PT1 norm          = ',dsqrt(pt2_data % overlap(1,1))
+    print*,'E(before) + PT2   = ',E_tc + (pt2_data % pt2(1))/norm
+    print*,'PT2               = ',pt2_data % pt2(1)
+    print*,'Ndet, E_tc, E+PT2 = ',ndet,E_tc,E_tc + (pt2_data % pt2(1))/norm,dsqrt(pt2_data % overlap(1,1))
+    print*,'*****'
   endif
+
   E_tc  = eigval_right_tc_bi_orth(1)
   norm  = norm_ground_left_right_bi_orth
   ndet  = N_det
-  do j=1,N_states
-    do i=1,N_det
+
+  do j = 1, N_states
+    do i = 1, N_det
       psi_coef(i,j) = reigvec_tc_bi_orth(i,j)
     enddo
   enddo
@@ -100,3 +119,5 @@ subroutine print_CI_dressed(ndet, E_tc,norm,pt2_data,print_pt2)
 
 end
 
+! ---
+
diff --git a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
index 3e6f229b..1c1c0411 100644
--- a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -1,5 +1,8 @@
-program fci
-  implicit none
+
+! ---
+
+program fci_tc_bi
+
   BEGIN_DOC
   ! Selected Full Configuration Interaction with stochastic selection
   ! and PT2.
@@ -36,21 +39,27 @@ program fci
   !
   END_DOC
 
+  implicit none
 
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid 
+
   pruning = -1.d0
   touch pruning
+
 !  pt2_relative_error = 0.01d0
 !  touch pt2_relative_error
-  call run_cipsi_tc
+
+  call run_cipsi_tc()
 
 end
 
+! ---
 
-subroutine run_cipsi_tc
+subroutine run_cipsi_tc()
 
   implicit none
 
@@ -58,20 +67,21 @@ subroutine run_cipsi_tc
 
     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
 
-    if(elec_alpha_num+elec_beta_num .ge. 3) then
-      if(three_body_h_tc)then
+    if((elec_alpha_num+elec_beta_num) .ge. 3) then
+      if(three_body_h_tc) then
         call provide_all_three_ints_bi_ortho()
       endif
     endif
 
-    FREE int2_grad1_u12_bimo_transp int2_grad1_u12_ao_transp
+    FREE int2_grad1_u12_ao int2_grad1_u12_ao_t int2_grad1_u12_ao_transp
+    FREE int2_grad1_u12_bimo_transp 
 
     write(json_unit,json_array_open_fmt) 'fci_tc'
 
-    if (do_pt2) then
-      call run_stochastic_cipsi
+    if(do_pt2) then
+      call run_stochastic_cipsi()
     else
-      call run_cipsi
+      call run_cipsi()
     endif
 
     write(json_unit,json_dict_uopen_fmt)
@@ -83,13 +93,14 @@ subroutine run_cipsi_tc
 
     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
 
-    if(elec_alpha_num+elec_beta_num.ge.3)then
-      if(three_body_h_tc)then
-        call provide_all_three_ints_bi_ortho
+    if((elec_alpha_num+elec_beta_num) .ge. 3) then
+      if(three_body_h_tc) then
+        call provide_all_three_ints_bi_ortho()
       endif
     endif
 
-    FREE int2_grad1_u12_bimo_transp int2_grad1_u12_ao_transp
+    FREE int2_grad1_u12_ao int2_grad1_u12_ao_t int2_grad1_u12_ao_transp
+    FREE int2_grad1_u12_bimo_transp 
 
     call run_slave_cipsi
 
diff --git a/src/fci_tc_bi/pt2_tc.irp.f b/src/fci_tc_bi/pt2_tc.irp.f
index 96a54825..390042bf 100644
--- a/src/fci_tc_bi/pt2_tc.irp.f
+++ b/src/fci_tc_bi/pt2_tc.irp.f
@@ -1,31 +1,42 @@
+
+! ---
+
 program tc_pt2_prog
+
   implicit none
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid 
+
   pruning = -1.d0
   touch pruning
+
 !  pt2_relative_error = 0.01d0
 !  touch pt2_relative_error
-  call run_pt2_tc
+  call run_pt2_tc()
 
 end
 
+! ---
 
-subroutine run_pt2_tc
+subroutine run_pt2_tc()
 
- implicit none
+  implicit none
 
   PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
-  if(elec_alpha_num+elec_beta_num.ge.3)then
+
+  if(elec_alpha_num+elec_beta_num.ge.3) then
     if(three_body_h_tc)then
-      call provide_all_three_ints_bi_ortho
+      call provide_all_three_ints_bi_ortho()
     endif
   endif
-  ! ---
-
- call tc_pt2
 
+  call tc_pt2()
 
 end
+
+! ---
+
diff --git a/src/non_h_ints_mu/debug_fit.irp.f b/src/non_h_ints_mu/debug_fit.irp.f
index 146028d5..05d2db68 100644
--- a/src/non_h_ints_mu/debug_fit.irp.f
+++ b/src/non_h_ints_mu/debug_fit.irp.f
@@ -6,13 +6,9 @@ program debug_fit
   implicit none
 
   my_grid_becke  = .True.
-
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-  !my_n_pt_r_grid = 100
-  !my_n_pt_a_grid = 170
-  !my_n_pt_r_grid = 150
-  !my_n_pt_a_grid = 194
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   PROVIDE mu_erf j1b_pen
diff --git a/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f b/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
index 5e7ef7e9..b9e8df25 100644
--- a/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
+++ b/src/non_h_ints_mu/debug_integ_jmu_modif.irp.f
@@ -6,13 +6,9 @@ program debug_integ_jmu_modif
   implicit none
 
   my_grid_becke  = .True.
-
-  !my_n_pt_r_grid = 30
-  !my_n_pt_a_grid = 50
-  !my_n_pt_r_grid = 100
-  !my_n_pt_a_grid = 170
-  my_n_pt_r_grid = 150
-  my_n_pt_a_grid = 194
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   PROVIDE mu_erf j1b_pen
@@ -27,12 +23,13 @@ program debug_integ_jmu_modif
 !  call test_int2_grad1_u12_ao()
 !
 !  call test_grad12_j12()
+  call test_tchint_rsdft()
 !  call test_u12sq_j1bsq()
 !  call test_u12_grad1_u12_j1b_grad1_j1b()
 !  !call test_gradu_squared_u_ij_mu()
 
   !call test_vect_overlap_gauss_r12_ao()
-  call test_vect_overlap_gauss_r12_ao_with1s()
+  !call test_vect_overlap_gauss_r12_ao_with1s()
 
 end
 
@@ -47,22 +44,21 @@ subroutine test_v_ij_u_cst_mu_j1b()
 
   print*, ' test_v_ij_u_cst_mu_j1b ...'
 
-  PROVIDE v_ij_u_cst_mu_j1b
+  PROVIDE v_ij_u_cst_mu_j1b_fit
 
   eps_ij  = 1d-3
   acc_tot = 0.d0
   normalz = 0.d0
 
-  !do ipoint = 1, 10
   do ipoint = 1, n_points_final_grid
     do j = 1, ao_num
       do i = 1, ao_num
 
-        i_exc  =     v_ij_u_cst_mu_j1b(i,j,ipoint) 
-        i_num  = num_v_ij_u_cst_mu_j1b(i,j,ipoint)
+        i_exc  =     v_ij_u_cst_mu_j1b_fit(i,j,ipoint) 
+        i_num  = num_v_ij_u_cst_mu_j1b    (i,j,ipoint)
         acc_ij = dabs(i_exc - i_num)
         if(acc_ij .gt. eps_ij) then
-          print *, ' problem in v_ij_u_cst_mu_j1b on', i, j, ipoint
+          print *, ' problem in v_ij_u_cst_mu_j1b_fit on', i, j, ipoint
           print *, ' analyt integ = ', i_exc
           print *, ' numeri integ = ', i_num
           print *, ' diff         = ', acc_ij
@@ -473,6 +469,65 @@ end subroutine test_gradu_squared_u_ij_mu
 
 ! ---
 
+subroutine test_tchint_rsdft()
+
+  implicit none
+  integer          :: i, j, m, ipoint, jpoint
+  double precision :: acc_ij, acc_tot, eps_ij, i_exc, i_num, normalz
+  double precision :: x(3), y(3), dj_1(3), dj_2(3), dj_3(3)
+
+  print*, ' test rsdft_jastrow ...'
+
+  PROVIDE grad1_u12_num
+
+  eps_ij  = 1d-4
+  acc_tot = 0.d0
+  normalz = 0.d0
+
+  do ipoint = 1, n_points_final_grid
+    x(1) = final_grid_points(1,ipoint)
+    x(2) = final_grid_points(2,ipoint)
+    x(3) = final_grid_points(3,ipoint)
+
+    do jpoint = 1, n_points_extra_final_grid
+      y(1) = final_grid_points_extra(1,jpoint)
+      y(2) = final_grid_points_extra(2,jpoint)
+      y(3) = final_grid_points_extra(3,jpoint)
+
+      dj_1(1) = grad1_u12_num(jpoint,ipoint,1)
+      dj_1(2) = grad1_u12_num(jpoint,ipoint,2)
+      dj_1(3) = grad1_u12_num(jpoint,ipoint,3)
+
+      call get_tchint_rsdft_jastrow(x, y, dj_2)
+
+      do m = 1, 3
+        i_exc = dj_1(m)
+        i_num = dj_2(m)
+        acc_ij = dabs(i_exc - i_num)
+        if(acc_ij .gt. eps_ij) then
+          print *, ' problem on', ipoint, jpoint, m
+          print *, ' x = ', x
+          print *, ' y = ', y
+          print *, ' exc, num, diff = ', i_exc, i_num, acc_ij
+          call grad1_jmu_modif_num(x, y, dj_3)
+          print *, ' check = ', dj_3(m)
+          stop
+        endif
+
+        acc_tot += acc_ij
+        normalz += dabs(i_exc)
+      enddo
+    enddo
+  enddo
+
+  print*, ' acc_tot = ', acc_tot
+  print*, ' normalz = ', normalz
+
+  return
+end subroutine test_tchint_rsdft
+
+! ---
+
 subroutine test_grad12_j12()
 
   implicit none
@@ -484,7 +539,7 @@ subroutine test_grad12_j12()
 
   PROVIDE grad12_j12
 
-  eps_ij  = 1d-3
+  eps_ij  = 1d-6
   acc_tot = 0.d0
   normalz = 0.d0
 
diff --git a/src/non_h_ints_mu/j12_nucl_utils.irp.f b/src/non_h_ints_mu/j12_nucl_utils.irp.f
index 9b91a8ed..7dd13f14 100644
--- a/src/non_h_ints_mu/j12_nucl_utils.irp.f
+++ b/src/non_h_ints_mu/j12_nucl_utils.irp.f
@@ -35,7 +35,7 @@ BEGIN_PROVIDER [ double precision, v_1b, (n_points_final_grid)]
 
   elseif(j1b_type .eq. 4) then
 
-    ! v(r) = 1 - \sum_{a} \exp(-\alpha_a (r - r_a)^2)
+    ! v(r) = 1 - \sum_{a} \beta_a \exp(-\alpha_a (r - r_a)^2)
 
     do ipoint = 1, n_points_final_grid
 
@@ -51,7 +51,7 @@ BEGIN_PROVIDER [ double precision, v_1b, (n_points_final_grid)]
         dz = z - nucl_coord(j,3)
         d  = dx*dx + dy*dy + dz*dz
 
-        fact_r = fact_r - dexp(-a*d)
+        fact_r = fact_r - j1b_pen_coef(j) * dexp(-a*d)
       enddo
 
       v_1b(ipoint) = fact_r
@@ -125,7 +125,7 @@ BEGIN_PROVIDER [double precision, v_1b_grad, (3, n_points_final_grid)]
 
   elseif(j1b_type .eq. 4) then
 
-    ! v(r) = 1 - \sum_{a} \exp(-\alpha_a (r - r_a)^2)
+    ! v(r) = 1 - \sum_{a} \beta_a \exp(-\alpha_a (r - r_a)^2)
 
     do ipoint = 1, n_points_final_grid
 
@@ -144,7 +144,7 @@ BEGIN_PROVIDER [double precision, v_1b_grad, (3, n_points_final_grid)]
         r2 = dx*dx + dy*dy + dz*dz
 
         a = j1b_pen(j)
-        e = a * dexp(-a * r2)
+        e = a * j1b_pen_coef(j) * dexp(-a * r2)
 
         ax_der += e * dx
         ay_der += e * dy
@@ -668,7 +668,7 @@ subroutine grad1_jmu_modif_num(r1, r2, grad)
   double precision, intent(in)  :: r1(3), r2(3)
   double precision, intent(out) :: grad(3)
 
-  double precision              :: tmp0, tmp1, tmp2, tmp3, tmp4, grad_u12(3)
+  double precision              :: tmp0, tmp1, tmp2, grad_u12(3)
 
   double precision, external    :: j12_mu
   double precision, external    :: j1b_nucl
@@ -681,18 +681,93 @@ subroutine grad1_jmu_modif_num(r1, r2, grad)
   tmp0 = j1b_nucl(r1) 
   tmp1 = j1b_nucl(r2)
   tmp2 = j12_mu(r1, r2)
-  tmp3 = tmp0 * tmp1
-  tmp4 = tmp2 * tmp1
 
-  grad(1) = tmp3 * grad_u12(1) + tmp4 * grad_x_j1b_nucl_num(r1)
-  grad(2) = tmp3 * grad_u12(2) + tmp4 * grad_y_j1b_nucl_num(r1)
-  grad(3) = tmp3 * grad_u12(3) + tmp4 * grad_z_j1b_nucl_num(r1)
+  grad(1) = (tmp0 * grad_u12(1) + tmp2 * grad_x_j1b_nucl_num(r1)) * tmp1
+  grad(2) = (tmp0 * grad_u12(2) + tmp2 * grad_y_j1b_nucl_num(r1)) * tmp1
+  grad(3) = (tmp0 * grad_u12(3) + tmp2 * grad_z_j1b_nucl_num(r1)) * tmp1
 
   return
 end subroutine grad1_jmu_modif_num
 
 ! ---
 
-
+subroutine get_tchint_rsdft_jastrow(x, y, dj)
+
+  implicit none
+  double precision, intent(in)  :: x(3), y(3)
+  double precision, intent(out) :: dj(3)
+  integer                       :: at
+  double precision              :: a, mu_tmp, inv_sq_pi_2
+  double precision              :: tmp_x, tmp_y, tmp_z, tmp
+  double precision              :: dx2, dy2, pos(3), dxy, dxy2
+  double precision              :: v1b_x, v1b_y
+  double precision              :: u2b, grad1_u2b(3), grad1_v1b(3)
+
+  PROVIDE mu_erf
+
+  inv_sq_pi_2 = 0.5d0 / dsqrt(dacos(-1.d0))
+
+  dj = 0.d0
+
+!  double precision, external :: j12_mu, j1b_nucl
+!  v1b_x = j1b_nucl(x)
+!  v1b_y = j1b_nucl(y)
+!  call grad1_j1b_nucl(x, grad1_v1b)
+!  u2b = j12_mu(x, y)
+!  call grad1_j12_mu(x, y, grad1_u2b)
+
+  ! 1b terms
+  v1b_x = 1.d0
+  v1b_y = 1.d0
+  tmp_x = 0.d0
+  tmp_y = 0.d0
+  tmp_z = 0.d0
+  do at = 1, nucl_num
+
+    a = j1b_pen(at)
+    pos(1) = nucl_coord(at,1)
+    pos(2) = nucl_coord(at,2)
+    pos(3) = nucl_coord(at,3)
+
+    dx2 = sum((x-pos)**2)
+    dy2 = sum((y-pos)**2)
+    tmp = dexp(-a*dx2) * a
+
+    v1b_x = v1b_x - dexp(-a*dx2)
+    v1b_y = v1b_y - dexp(-a*dy2)
+
+    tmp_x = tmp_x + tmp * (x(1) - pos(1))
+    tmp_y = tmp_y + tmp * (x(2) - pos(2))
+    tmp_z = tmp_z + tmp * (x(3) - pos(3))
+  end do
+  grad1_v1b(1) = 2.d0 * tmp_x
+  grad1_v1b(2) = 2.d0 * tmp_y
+  grad1_v1b(3) = 2.d0 * tmp_z
+
+  ! 2b terms
+  dxy2   = sum((x-y)**2)
+  dxy    = dsqrt(dxy2)
+  mu_tmp = mu_erf * dxy
+  u2b    = 0.5d0 * dxy * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_erf
+
+  if(dxy .lt. 1d-8) then
+    grad1_u2b(1) = 0.d0
+    grad1_u2b(2) = 0.d0
+    grad1_u2b(3) = 0.d0
+  else
+    tmp = 0.5d0 * (1.d0 - derf(mu_tmp)) / dxy
+    grad1_u2b(1) = tmp * (x(1) - y(1))
+    grad1_u2b(2) = tmp * (x(2) - y(2))
+    grad1_u2b(3) = tmp * (x(3) - y(3))
+  endif
+
+  dj(1) = (grad1_u2b(1) * v1b_x + u2b * grad1_v1b(1)) * v1b_y
+  dj(2) = (grad1_u2b(2) * v1b_x + u2b * grad1_v1b(2)) * v1b_y
+  dj(3) = (grad1_u2b(3) * v1b_x + u2b * grad1_v1b(3)) * v1b_y
+
+  return
+end subroutine get_tchint_rsdft_jastrow
+
+! ---
 
 
diff --git a/src/non_h_ints_mu/jast_deriv.irp.f b/src/non_h_ints_mu/jast_deriv.irp.f
index 5e99600e..859f2aa5 100644
--- a/src/non_h_ints_mu/jast_deriv.irp.f
+++ b/src/non_h_ints_mu/jast_deriv.irp.f
@@ -70,6 +70,8 @@
 
   elseif((j1b_type .gt. 100) .and. (j1b_type .lt. 200)) then
 
+    PROVIDE final_grid_points
+
     !$OMP PARALLEL                                                                                    &
     !$OMP DEFAULT (NONE)                                                                              &
     !$OMP PRIVATE (ipoint, jpoint, r1, r2, v1b_r1, v1b_r2, u2b_r12, grad1_v1b, grad1_u2b, dx, dy, dz) &
@@ -296,7 +298,7 @@ double precision function j1b_nucl(r)
       d = ( (r(1) - nucl_coord(i,1)) * (r(1) - nucl_coord(i,1)) &
           + (r(2) - nucl_coord(i,2)) * (r(2) - nucl_coord(i,2)) &
           + (r(3) - nucl_coord(i,3)) * (r(3) - nucl_coord(i,3)) )
-      j1b_nucl = j1b_nucl - dexp(-a*d)
+      j1b_nucl = j1b_nucl - j1b_pen_coef(i) * dexp(-a*d)
     enddo
 
   elseif((j1b_type .eq. 5) .or. (j1b_type .eq. 105)) then
@@ -363,7 +365,7 @@ double precision function j1b_nucl_square(r)
       d = ( (r(1) - nucl_coord(i,1)) * (r(1) - nucl_coord(i,1)) &
           + (r(2) - nucl_coord(i,2)) * (r(2) - nucl_coord(i,2)) &
           + (r(3) - nucl_coord(i,3)) * (r(3) - nucl_coord(i,3)) )
-      j1b_nucl_square = j1b_nucl_square - dexp(-a*d)
+      j1b_nucl_square = j1b_nucl_square - j1b_pen_coef(i) * dexp(-a*d)
     enddo
     j1b_nucl_square = j1b_nucl_square * j1b_nucl_square
 
@@ -475,7 +477,7 @@ subroutine grad1_j1b_nucl(r, grad)
       y = r(2) - nucl_coord(i,2)
       z = r(3) - nucl_coord(i,3)
       d = x*x + y*y + z*z
-      e = a * dexp(-a*d)
+      e = a * j1b_pen_coef(i) * dexp(-a*d)
 
       fact_x += e * x
       fact_y += e * y
diff --git a/src/non_h_ints_mu/new_grad_tc.irp.f b/src/non_h_ints_mu/new_grad_tc.irp.f
index 499ffe9d..dc76431d 100644
--- a/src/non_h_ints_mu/new_grad_tc.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc.irp.f
@@ -1,68 +1,3 @@
-! ---
-
-!BEGIN_PROVIDER [ double precision, int1_grad2_u12_ao, (3, ao_num, ao_num, n_points_final_grid)]
-!
-!  BEGIN_DOC
-!  !
-!  ! int1_grad2_u12_ao(:,i,j,ipoint) = \int dr1 [-1 * \grad_r2 J(r1,r2)] \phi_i(r1) \phi_j(r1) 
-!  !
-!  ! where r1 = r(ipoint)
-!  !
-!  ! if J(r1,r2) = u12:
-!  !
-!  ! int1_grad2_u12_ao(:,i,j,ipoint) = +0.5 x \int dr1 [-(r1 - r2) (erf(mu * r12)-1)r_12] \phi_i(r1) \phi_j(r1)
-!  !                                 = -0.5 * [ v_ij_erf_rk_cst_mu(i,j,ipoint) * r(:) - x_v_ij_erf_rk_cst_mu(i,j,ipoint,:) ]
-!  !                                 = -int2_grad1_u12_ao(i,j,ipoint,:)
-!  !
-!  ! if J(r1,r2) = u12 x v1 x v2
-!  !
-!  ! int1_grad2_u12_ao(:,i,j,ipoint) =      v2    x [ 0.5 x \int dr1 [-(r1 - r2) (erf(mu * r12)-1)r_12] v1 \phi_i(r1) \phi_j(r1) ]
-!  !                                 - \grad_2 v2 x [       \int dr1                   u12              v1 \phi_i(r1) \phi_j(r1) ] 
-!  !                                 =   -0.5 v_1b(ipoint) * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint) * r(:) 
-!  !                                 +    0.5 v_1b(ipoint) * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,:) 
-!  !                                 - v_1b_grad[:,ipoint] * v_ij_u_cst_mu_j1b(i,j,ipoint)
-!  !
-!  !
-!  END_DOC
-!
-!  implicit none
-!  integer          :: ipoint, i, j
-!  double precision :: x, y, z, tmp_x, tmp_y, tmp_z, tmp0, tmp1, tmp2
-!
-!  PROVIDE j1b_type
-!  
-!  if(j1b_type .eq. 3) then
-!
-!    do ipoint = 1, n_points_final_grid
-!      x = final_grid_points(1,ipoint)
-!      y = final_grid_points(2,ipoint)
-!      z = final_grid_points(3,ipoint)
-!
-!      tmp0  = 0.5d0 * v_1b(ipoint)
-!      tmp_x =  v_1b_grad(1,ipoint)
-!      tmp_y =  v_1b_grad(2,ipoint)
-!      tmp_z =  v_1b_grad(3,ipoint)
-!  
-!      do j = 1, ao_num
-!        do i = 1, ao_num
-!
-!          tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
-!          tmp2 = v_ij_u_cst_mu_j1b(i,j,ipoint)
-!
-!          int1_grad2_u12_ao(1,i,j,ipoint) = -tmp1 * x + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1) - tmp2 * tmp_x
-!          int1_grad2_u12_ao(2,i,j,ipoint) = -tmp1 * y + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2) - tmp2 * tmp_y
-!          int1_grad2_u12_ao(3,i,j,ipoint) = -tmp1 * z + tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3) - tmp2 * tmp_z
-!        enddo
-!      enddo
-!    enddo
-!
-!  else
-!
-!    int1_grad2_u12_ao = -1.d0 * int2_grad1_u12_ao
-!
-!  endif
-!
-!END_PROVIDER 
 
 ! ---
 
@@ -98,22 +33,14 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_loop, (ao_num, ao_num, ao_
     weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
 
     do i = 1, ao_num
-      !ao_i_r  = weight1 * aos_in_r_array_transp         (ipoint,i)
-      !ao_i_dx = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,1)
-      !ao_i_dy = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,2)
-      !ao_i_dz = weight1 * aos_grad_in_r_array_transp_bis(ipoint,i,3)
       ao_i_r  = weight1 * aos_in_r_array     (i,ipoint)
       ao_i_dx = weight1 * aos_grad_in_r_array(i,ipoint,1)
       ao_i_dy = weight1 * aos_grad_in_r_array(i,ipoint,2)
       ao_i_dz = weight1 * aos_grad_in_r_array(i,ipoint,3)
 
       do k = 1, ao_num
-        !ao_k_r = aos_in_r_array_transp(ipoint,k)
         ao_k_r = aos_in_r_array(k,ipoint)
 
-        !tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1) 
-        !tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2) 
-        !tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3) 
         tmp_x = ao_k_r * ao_i_dx - ao_i_r * aos_grad_in_r_array(k,ipoint,1) 
         tmp_y = ao_k_r * ao_i_dy - ao_i_r * aos_grad_in_r_array(k,ipoint,2) 
         tmp_z = ao_k_r * ao_i_dz - ao_i_r * aos_grad_in_r_array(k,ipoint,3) 
@@ -134,44 +61,11 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao_loop, (ao_num, ao_num, ao_
 
   ! ---
 
-  !do ipoint = 1, n_points_final_grid
-  !  weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
-
-  !  do l = 1, ao_num
-  !    ao_l_r  = weight1 * aos_in_r_array_transp         (ipoint,l)
-  !    ao_l_dx = weight1 * aos_grad_in_r_array_transp_bis(ipoint,l,1)
-  !    ao_l_dy = weight1 * aos_grad_in_r_array_transp_bis(ipoint,l,2)
-  !    ao_l_dz = weight1 * aos_grad_in_r_array_transp_bis(ipoint,l,3)
-
-  !    do j = 1, ao_num
-  !      ao_j_r = aos_in_r_array_transp(ipoint,j)
-
-  !      tmp_x = ao_j_r * ao_l_dx - ao_l_r * aos_grad_in_r_array_transp_bis(ipoint,j,1) 
-  !      tmp_y = ao_j_r * ao_l_dy - ao_l_r * aos_grad_in_r_array_transp_bis(ipoint,j,2) 
-  !      tmp_z = ao_j_r * ao_l_dz - ao_l_r * aos_grad_in_r_array_transp_bis(ipoint,j,3) 
-
-  !      do i = 1, ao_num
-  !        do k = 1, ao_num
-
-  !          contrib_x = int2_grad1_u12_ao(k,i,ipoint,1) * tmp_x 
-  !          contrib_y = int2_grad1_u12_ao(k,i,ipoint,2) * tmp_y 
-  !          contrib_z = int2_grad1_u12_ao(k,i,ipoint,3) * tmp_z 
-
-  !          ac_mat(k,i,l,j) += contrib_x + contrib_y + contrib_z
-  !        enddo
-  !      enddo
-  !    enddo
-  !  enddo
-  !enddo
-
-  ! ---
- 
   do j = 1, ao_num
     do l = 1, ao_num
       do i = 1, ao_num
         do k = 1, ao_num
           tc_grad_and_lapl_ao_loop(k,i,l,j) = ac_mat(k,i,l,j) + ac_mat(l,j,k,i)
-          !tc_grad_and_lapl_ao_loop(k,i,l,j) = ac_mat(k,i,l,j)
         enddo
       enddo
     enddo
diff --git a/src/non_h_ints_mu/tc_integ.irp.f b/src/non_h_ints_mu/tc_integ.irp.f
index b2c0df31..d569b25c 100644
--- a/src/non_h_ints_mu/tc_integ.irp.f
+++ b/src/non_h_ints_mu/tc_integ.irp.f
@@ -70,14 +70,15 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
 
     elseif((j1b_type .eq. 3) .or. (j1b_type .eq. 4)) then
 
-      PROVIDE v_1b_grad v_ij_erf_rk_cst_mu_j1b v_ij_u_cst_mu_j1b x_v_ij_erf_rk_cst_mu_j1b
+      PROVIDE v_1b_grad
+      PROVIDE v_ij_erf_rk_cst_mu_j1b v_ij_u_cst_mu_j1b_an x_v_ij_erf_rk_cst_mu_j1b
 
       int2_grad1_u12_ao = 0.d0
       !$OMP PARALLEL                                                                 &
       !$OMP DEFAULT (NONE)                                                           &
       !$OMP PRIVATE (ipoint, i, j, x, y, z, tmp0, tmp1, tmp2, tmp_x, tmp_y, tmp_z)   &
       !$OMP SHARED ( ao_num, n_points_final_grid, final_grid_points, v_1b, v_1b_grad &
-      !$OMP        , v_ij_erf_rk_cst_mu_j1b, v_ij_u_cst_mu_j1b, x_v_ij_erf_rk_cst_mu_j1b, int2_grad1_u12_ao)
+      !$OMP        , v_ij_erf_rk_cst_mu_j1b, v_ij_u_cst_mu_j1b_an, x_v_ij_erf_rk_cst_mu_j1b, int2_grad1_u12_ao)
       !$OMP DO SCHEDULE (static)
       do ipoint = 1, n_points_final_grid
         x     = final_grid_points(1,ipoint)
@@ -90,7 +91,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
         do j = 1, ao_num
           do i = 1, ao_num
             tmp1 = tmp0 * v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
-            tmp2 = v_ij_u_cst_mu_j1b(i,j,ipoint)
+            tmp2 = v_ij_u_cst_mu_j1b_an(i,j,ipoint)
             int2_grad1_u12_ao(i,j,ipoint,1) = tmp1 * x - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,1) - tmp2 * tmp_x
             int2_grad1_u12_ao(i,j,ipoint,2) = tmp1 * y - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,2) - tmp2 * tmp_y
             int2_grad1_u12_ao(i,j,ipoint,3) = tmp1 * z - tmp0 * x_v_ij_erf_rk_cst_mu_j1b(i,j,ipoint,3) - tmp2 * tmp_z
@@ -100,7 +101,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
       !$OMP END DO
       !$OMP END PARALLEL
 
-      FREE v_ij_erf_rk_cst_mu_j1b v_ij_u_cst_mu_j1b x_v_ij_erf_rk_cst_mu_j1b
+      FREE v_ij_erf_rk_cst_mu_j1b v_ij_u_cst_mu_j1b_an x_v_ij_erf_rk_cst_mu_j1b
 
     elseif(j1b_type .ge. 100) then
 
diff --git a/src/non_h_ints_mu/test_non_h_ints.irp.f b/src/non_h_ints_mu/test_non_h_ints.irp.f
index a6e0a311..aff53c2d 100644
--- a/src/non_h_ints_mu/test_non_h_ints.irp.f
+++ b/src/non_h_ints_mu/test_non_h_ints.irp.f
@@ -1,19 +1,18 @@
+
+! ---
+
 program test_non_h
 
   implicit none
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 50
-  my_n_pt_a_grid = 74
-  !my_n_pt_r_grid = 400
-  !my_n_pt_a_grid = 974
-
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
-  !call routine_grad_squared
-  !call routine_fit
+  !call routine_grad_squared()
+  !call routine_fit()
   
   call test_ipp()
 end
diff --git a/src/non_h_ints_mu/total_tc_int.irp.f b/src/non_h_ints_mu/total_tc_int.irp.f
index afa10305..158ee2fb 100644
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@@ -1,4 +1,7 @@
 
+! TODO
+! remove ao_two_e_coul and use map directly
+
 ! ---
 
 BEGIN_PROVIDER [double precision, ao_vartc_int_chemist, (ao_num, ao_num, ao_num, ao_num)]
@@ -116,6 +119,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist_no_cycle, (ao_num, ao_num, a
 
   call wall_time(wall1)
   print *, ' wall time for ao_tc_int_chemist_no_cycle ', wall1 - wall0
+
 END_PROVIDER 
 
 ! ---
diff --git a/src/tc_bi_ortho/compute_deltamu_right.irp.f b/src/tc_bi_ortho/compute_deltamu_right.irp.f
index 7ca2c890..ab9dc093 100644
--- a/src/tc_bi_ortho/compute_deltamu_right.irp.f
+++ b/src/tc_bi_ortho/compute_deltamu_right.irp.f
@@ -3,8 +3,9 @@ program compute_deltamu_right
   implicit none
 
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   read_wf = .True.
diff --git a/src/tc_bi_ortho/dav_h_tc_s2.irp.f b/src/tc_bi_ortho/dav_h_tc_s2.irp.f
index 3e89bbe2..19df847a 100644
--- a/src/tc_bi_ortho/dav_h_tc_s2.irp.f
+++ b/src/tc_bi_ortho/dav_h_tc_s2.irp.f
@@ -425,7 +425,7 @@ subroutine davidson_hs2_nonsym_b1space(u_in, H_jj, s2_out,energies, sze, N_st, N
         if(lambda_tmp .lt. 0.7d0) then
           print *, ' very small overlap ...', l, i_omax(l)
           print *, ' max overlap = ', lambda_tmp
-          stop
+          !stop
         endif
 
         if(i_omax(l) .ne. l) then
diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 7259c270..ca5875c9 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -1,7 +1,7 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_v0, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC 
   ! Normal ordering of the three body interaction on the HF density
@@ -18,13 +18,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
 
-  print*,' Providing normal_two_body_bi_orth ...'
+  print*,' Providing normal_two_body_bi_orth_v0 ...'
   call wall_time(walli)
  
   if(read_tc_norm_ord) then
 
-    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
-      read(11) normal_two_body_bi_orth
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_v0', action="read")
+      read(11) normal_two_body_bi_orth_v0
     close(11)
 
   else
@@ -318,7 +318,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
     call wall_time(wall1)
     print*,' Wall time for aba_contraction', wall1-wall0
 
-    normal_two_body_bi_orth = tmp
+    normal_two_body_bi_orth_v0 = tmp
 
     ! ---
     ! aab contraction
@@ -491,12 +491,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
         enddo
       enddo
     enddo
+    !$OMP END DO
     !$OMP END PARALLEL
 
     call wall_time(wall1)
     print*,' Wall time for aab_contraction', wall1-wall0
 
-    normal_two_body_bi_orth += tmp
+    normal_two_body_bi_orth_v0 += tmp
 
     ! ---
     ! aaa contraction
@@ -1002,9 +1003,948 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
           enddo
         enddo
       enddo
+      !$OMP END DO
       !$OMP END PARALLEL
 
-      call wall_time(wallf)
+      call wall_time(wall1)
+      print*,' Wall time for aaa_contraction', wall1-wall0
+
+      normal_two_body_bi_orth_v0 += tmp
+    endif ! Ne(2) .ge. 3
+
+    deallocate(tmp)
+
+  endif ! read_tc_norm_ord
+
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_v0', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth_v0
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read')
+  endif
+
+  call wall_time(wallf)
+  print*,' Wall time for normal_two_body_bi_orth_v0 ', wallf-walli
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC 
+  ! Normal ordering of the three body interaction on the HF density
+  END_DOC 
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: hh1, hh2, pp1, pp2
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1, walli, wallf
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+
+  print*,' Providing normal_two_body_bi_orth ...'
+  call wall_time(walli)
+ 
+  if(read_tc_norm_ord) then
+
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
+      read(11) normal_two_body_bi_orth
+    close(11)
+
+  else
+
+    double precision, allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+    double precision, allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+    double precision, allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+    double precision, allocatable :: tmp(:,:,:,:)
+
+    PROVIDE N_int
+
+    allocate( occ(N_int*bit_kind_size,2) )
+    allocate( key_i_core(N_int,2) )
+
+    if(core_tc_op) then
+      do i = 1, N_int
+        key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+      enddo
+      call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+    else
+      call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+    endif
+
+    allocate(tmp(mo_num,mo_num,mo_num,mo_num))
+
+    ! ---
+    ! aba contraction
+
+    print*,' Providing aba_contraction ...'
+    call wall_time(wall0)
+
+    tmp = 0.d0
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+    !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+    !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+    !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp)
+
+    allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+    allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+    tmp_3d   = 0.d0
+    tmp_2d   = 0.d0
+    tmp1     = 0.d0
+    tmp2     = 0.d0
+    tmpval_1 = 0.d0
+    tmpval_2 = 0.d0
+    tmpvec_1 = 0.d0
+    tmpvec_2 = 0.d0 
+
+    !$OMP DO
+
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                             ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+    !$OMP END DO
+
+    deallocate(tmp_3d, tmp_2d)
+    deallocate(tmp1, tmp2)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2)
+
+    !$OMP END PARALLEL
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+      !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+      !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+      !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp)
+
+      Allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+      Allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+      Allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+      Allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+      Tmp_3d   = 0.d0
+      Tmp_2d   = 0.d0
+      Tmp1     = 0.d0
+      Tmp2     = 0.d0
+      Tmpval_1 = 0.d0
+      Tmpval_2 = 0.d0
+      Tmpvec_1 = 0.d0
+      Tmpvec_2 = 0.d0 
+
+      !$OMP DO
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+        do h1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+            tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          enddo
+
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                               ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                               - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                               - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                               - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+            enddo
+
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+      !$OMP END DO
+
+      deallocate(tmp_3d, tmp_2d)
+      deallocate(tmp1, tmp2)
+      deallocate(tmpval_1, tmpval_2)
+      deallocate(tmpvec_1, tmpvec_2)
+
+      !$OMP END PARALLEL
+    endif
+
+    tmp = -0.5d0 * tmp
+    call sum_A_At(tmp(1,1,1,1), mo_num*mo_num)
+
+    call wall_time(wall1)
+    print*,' Wall time for aba_contraction', wall1-wall0
+
+    normal_two_body_bi_orth = tmp
+
+    ! ---
+    ! aab contraction
+
+    print*,' Providing aab_contraction ...'
+    call wall_time(wall0)
+
+    tmp = 0.d0
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, ii, i, h1, p1, h2, p2,                   &
+    !$OMP          tmp_2d, tmp_3d, tmp1, tmp2,                      &
+    !$OMP          tmpval_1, tmpvec_1)                              &
+    !$OMP SHARED (n_points_final_grid, mo_num, Ne, occ,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp)
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+
+    tmp_2d   = 0.d0
+    tmp_3d   = 0.d0
+    tmp1     = 0.d0
+    tmp2     = 0.d0
+    tmpval_1 = 0.d0
+    tmpvec_1 = 0.d0
+
+    !$OMP DO
+
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+    !$OMP END DO
+
+    deallocate(tmp_3d)
+    deallocate(tmp1, tmp2)
+    deallocate(tmpval_1)
+    deallocate(tmpvec_1)
+
+    !$OMP END PARALLEL
+
+    tmp = -0.5d0 * tmp
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (tmp, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call wall_time(wall1)
+    print*,' Wall time for aab_contraction', wall1-wall0
+
+    normal_two_body_bi_orth += tmp
+
+    ! ---
+    ! aaa contraction
+
+    if(Ne(2) .ge. 3) then
+
+      print*,' Providing aaa_contraction ...'
+      call wall_time(wall0)
+
+      tmp = 0.d0
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+      !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+      !$OMP          tmpval_1, tmpval_2,                              &
+      !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+      !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp)
+
+      allocate(tmp_2d(mo_num,mo_num))
+      allocate(tmp_3d(mo_num,mo_num,mo_num))
+      allocate(tmp1(n_points_final_grid,3,mo_num))
+      allocate(tmp2(n_points_final_grid,mo_num))
+      allocate(tmp3(n_points_final_grid,3,mo_num))
+      allocate(tmpval_1(n_points_final_grid))
+      allocate(tmpval_2(n_points_final_grid))
+      allocate(tmpvec_1(n_points_final_grid,3))
+      allocate(tmpvec_2(n_points_final_grid,3))
+      allocate(tmpvec_3(n_points_final_grid,3))
+
+      tmp_2d   = 0.d0
+      tmp_3d   = 0.d0
+      tmp1     = 0.d0
+      tmp2     = 0.d0
+      tmp3     = 0.d0
+      tmpval_1 = 0.d0
+      tmpval_2 = 0.d0
+      tmpvec_1 = 0.d0
+      tmpvec_2 = 0.d0
+      tmpvec_3 = 0.d0
+
+      !$OMP DO
+      do ii = 1, Ne(2)
+        i = occ(ii,2)
+
+        do h1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                      , tmp2(1,1), n_points_final_grid                      &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , tmp2(1,1), n_points_final_grid                      &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                    &
+                      , tmp1(1,1,1), 3*n_points_final_grid                    &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+          enddo ! p1
+        enddo ! h1
+      enddo ! i
+      !$OMP END DO
+
+      deallocate(tmp_2d)
+      deallocate(tmp_3d)
+      deallocate(tmp1)
+      deallocate(tmp2)
+      deallocate(tmp3)
+      deallocate(tmpval_1)
+      deallocate(tmpval_2)
+      deallocate(tmpvec_1)
+      deallocate(tmpvec_2)
+      deallocate(tmpvec_3)
+
+      !$OMP END PARALLEL
+
+      ! purely open-shell part 
+      if(Ne(2) < Ne(1)) then
+
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+        !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+        !$OMP          tmpval_1, tmpval_2,                              &
+        !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+        !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmp)
+
+        allocate(tmp_2d(mo_num,mo_num))
+        allocate(tmp_3d(mo_num,mo_num,mo_num))
+        allocate(tmp1(n_points_final_grid,3,mo_num))
+        allocate(tmp2(n_points_final_grid,mo_num))
+        allocate(tmp3(n_points_final_grid,3,mo_num))
+        allocate(tmpval_1(n_points_final_grid))
+        allocate(tmpval_2(n_points_final_grid))
+        allocate(tmpvec_1(n_points_final_grid,3))
+        allocate(tmpvec_2(n_points_final_grid,3))
+        allocate(tmpvec_3(n_points_final_grid,3))
+
+        tmp_2d   = 0.d0
+        tmp_3d   = 0.d0
+        tmp1     = 0.d0
+        tmp2     = 0.d0
+        tmp3     = 0.d0
+        tmpval_1 = 0.d0
+        tmpval_2 = 0.d0
+        tmpvec_1 = 0.d0
+        tmpvec_2 = 0.d0
+        tmpvec_3 = 0.d0
+
+        !$OMP DO
+
+        do ii = Ne(2) + 1, Ne(1)
+          i = occ(ii,1)
+
+          do h1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            enddo
+
+            do p1 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+                tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+                tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+            do p1 = 1, mo_num
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  !$OMP CRITICAL
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                  !$OMP END CRITICAL
+                enddo
+              enddo
+            enddo
+
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+                tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+                tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , 0.d0, tmp_3d(1,1,1), mo_num)
+
+            do p1 = 1, mo_num
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  !$OMP CRITICAL
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                  !$OMP END CRITICAL
+                enddo
+              enddo
+            enddo
+
+            do p1 = 1, mo_num
+
+              do ipoint = 1, n_points_final_grid
+
+                tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                                 ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+                tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+                tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+                tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+                tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              enddo
+
+              do h2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                  + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                  tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                  tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                  tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+                enddo
+              enddo
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  !$OMP CRITICAL
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                  !$OMP END CRITICAL
+                enddo
+              enddo
+
+              do p2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                  tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                  tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                  tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+                enddo
+              enddo
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                        , tmp3(1,1,1), 3*n_points_final_grid                     &
+                        , tmp1(1,1,1), 3*n_points_final_grid                     &
+                        , 1.d0, tmp_2d(1,1), mo_num)
+
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  !$OMP CRITICAL
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                  !$OMP END CRITICAL
+                enddo
+              enddo
+
+            enddo ! p1
+          enddo ! h1
+        enddo !i
+        !$OMP END DO
+
+        deallocate(tmp_2d)
+        deallocate(tmp_3d)
+        deallocate(tmp1)
+        deallocate(tmp2)
+        deallocate(tmp3)
+        deallocate(tmpval_1)
+        deallocate(tmpval_2)
+        deallocate(tmpvec_1)
+        deallocate(tmpvec_2)
+        deallocate(tmpvec_3)
+
+        !$OMP END PARALLEL
+      endif
+
+      tmp = -0.5d0 * tmp
+
+      !$OMP PARALLEL                 &
+      !$OMP DEFAULT (NONE)           &
+      !$OMP PRIVATE (h1, h2, p1, p2) & 
+      !$OMP SHARED (tmp, mo_num)
+
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 1, mo_num
+            do p2 = p1, mo_num
+              tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP DO 
+      do h1 = 1, mo_num-1
+        do h2 = h1+1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) *= -1.d0
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call wall_time(wall1)
       print*,' Wall time for aaa_contraction', wall1-wall0
 
       normal_two_body_bi_orth += tmp
diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
index 855cfd17..d11c6727 100644
--- a/src/tc_bi_ortho/normal_ordered_contractions.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -1,7 +1,7 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+BEGIN_PROVIDER [ double precision, no_aba_contraction_v0, (mo_num,mo_num,mo_num,mo_num)]
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
 
@@ -16,7 +16,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
   double precision,  allocatable :: tmp_2d(:,:)
 
-  print*,' Providing no_aba_contraction ...'
+  print*,' Providing no_aba_contraction_v0 ...'
   call wall_time(wall0)
 
   PROVIDE N_int
@@ -102,7 +102,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
           enddo
         enddo
       enddo
@@ -153,7 +153,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP PARALLEL DO PRIVATE(h2,p2)
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
           enddo
         enddo
         !$OMP END PARALLEL DO
@@ -220,7 +220,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
@@ -270,7 +270,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -285,17 +285,17 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpval_1, tmpval_2)
   deallocate(tmpvec_1, tmpvec_2)
 
-  no_aba_contraction = -0.5d0 * no_aba_contraction
-  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+  no_aba_contraction_v0 = -0.5d0 * no_aba_contraction_v0
+  call sum_A_At(no_aba_contraction_v0(1,1,1,1), mo_num*mo_num)
 
   call wall_time(wall1)
-  print*,' Wall time for no_aba_contraction', wall1-wall0
+  print*,' Wall time for no_aba_contraction_v0', wall1-wall0
 
 END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
+BEGIN_PROVIDER [ double precision, no_aab_contraction_v0, (mo_num,mo_num,mo_num,mo_num)]
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
 
@@ -310,7 +310,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
   double precision,  allocatable :: tmp_2d(:,:)
 
-  print*,' Providing no_aab_contraction ...'
+  print*,' Providing no_aab_contraction_v0 ...'
   call wall_time(wall0)
 
   PROVIDE N_int
@@ -387,7 +387,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            no_aab_contraction_v0(p2,h2,p1,h1) = no_aab_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
           enddo
         enddo
       enddo
@@ -435,7 +435,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP PARALLEL DO PRIVATE(h2,p2)
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            no_aab_contraction_v0(p2,h2,p1,h1) = no_aab_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
           enddo
         enddo
         !$OMP END PARALLEL DO
@@ -449,19 +449,19 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpval_1)
   deallocate(tmpvec_1)
 
-  no_aab_contraction = -0.5d0 * no_aab_contraction
+  no_aab_contraction_v0 = -0.5d0 * no_aab_contraction_v0
 
   !$OMP PARALLEL                 &
   !$OMP DEFAULT (NONE)           &
   !$OMP PRIVATE (h1, h2, p1, p2) & 
-  !$OMP SHARED (no_aab_contraction, mo_num)
+  !$OMP SHARED (no_aab_contraction_v0, mo_num)
 
   !$OMP DO 
   do h1 = 1, mo_num
     do h2 = 1, mo_num
       do p1 = 1, mo_num
         do p2 = p1, mo_num
-          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+          no_aab_contraction_v0(p2,h2,p1,h1) -= no_aab_contraction_v0(p1,h2,p2,h1)
         enddo
       enddo
     enddo
@@ -473,7 +473,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
     do h2 = 1, mo_num
       do p1 = 2, mo_num
         do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+          no_aab_contraction_v0(p2,h2,p1,h1) = -no_aab_contraction_v0(p1,h2,p2,h1)
         enddo
       enddo
     enddo
@@ -485,21 +485,22 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
     do h2 = h1+1, mo_num
       do p1 = 2, mo_num
         do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+          no_aab_contraction_v0(p2,h2,p1,h1) *= -1.d0
         enddo
       enddo
     enddo
   enddo
+  !$OMP END DO
   !$OMP END PARALLEL
 
   call wall_time(wall1)
-  print*,' Wall time for no_aab_contraction', wall1-wall0
+  print*,' Wall time for no_aab_contraction_v0', wall1-wall0
 
 END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+BEGIN_PROVIDER [ double precision, no_aaa_contraction_v0, (mo_num,mo_num,mo_num,mo_num)]
 
   BEGIN_DOC
   !
@@ -507,12 +508,12 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
   !    h1 < h2
   !    p1 > p2
   !
-  !   no_aaa_contraction(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
+  !   no_aaa_contraction_v0(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
   !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
   !
   ! else:
   !
-  !   no_aaa_contraction(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !   no_aaa_contraction_v0(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
   !
   ! 
   ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
@@ -535,7 +536,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
   double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
 
-  print*,' Providing no_aaa_contraction ...'
+  print*,' Providing no_aaa_contraction_v0 ...'
   call wall_time(wall0)
 
   PROVIDE N_int
@@ -555,7 +556,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
 
   if(Ne(2) .lt. 3) then
 
-    no_aaa_contraction = 0.d0
+    no_aaa_contraction_v0 = 0.d0
 
   else
 
@@ -629,7 +630,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
@@ -661,7 +662,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
             enddo
           enddo
         enddo
@@ -735,7 +736,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -775,7 +776,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -848,7 +849,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
               enddo
             enddo
           enddo
@@ -880,7 +881,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
               enddo
             enddo
           enddo
@@ -954,7 +955,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
@@ -994,7 +995,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
@@ -1009,6 +1010,964 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
     deallocate(tmpval_1, tmpval_2)
     deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
 
+    no_aaa_contraction_v0 = -0.5d0 * no_aaa_contraction_v0
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (no_aaa_contraction_v0, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            no_aaa_contraction_v0(p2,h2,p1,h1) -= no_aaa_contraction_v0(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction_v0(p2,h2,p1,h1) = -no_aaa_contraction_v0(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction_v0(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aaa_contraction_v0', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aba_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+  !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+  !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+  !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         no_aba_contraction)
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+  tmp_3d   = 0.d0
+  tmp_2d   = 0.d0
+  tmp1     = 0.d0
+  tmp2     = 0.d0
+  tmpval_1 = 0.d0
+  tmpval_2 = 0.d0
+  tmpvec_1 = 0.d0
+  tmpvec_2 = 0.d0 
+
+  !$OMP DO
+
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    do h1 = 1, mo_num
+
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+      enddo
+
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+        enddo
+      enddo
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            !$OMP CRITICAL
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            !$OMP END CRITICAL
+          enddo
+        enddo
+      enddo
+
+      do p1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                           ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                           + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                           + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                           - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                           - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                           - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+        enddo
+
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            !$OMP CRITICAL
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            !$OMP END CRITICAL
+          enddo
+        enddo
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  !$OMP END DO
+
+  deallocate(tmp_3d, tmp_2d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+
+  !$OMP END PARALLEL
+
+
+  ! purely open-shell part 
+  if(Ne(2) < Ne(1)) then
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+    !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+    !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+    !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         no_aba_contraction)
+
+    Allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+    Allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+    Allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+    Allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+    Tmp_3d   = 0.d0
+    Tmp_2d   = 0.d0
+    Tmp1     = 0.d0
+    Tmp2     = 0.d0
+    Tmpval_1 = 0.d0
+    Tmpval_2 = 0.d0
+    Tmpvec_1 = 0.d0
+    Tmpvec_2 = 0.d0 
+
+    !$OMP DO
+
+    do ii = Ne(2) + 1, Ne(1)
+      i = occ(ii,1)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                             ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                    , tmp2(1,1), n_points_final_grid                       &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+        enddo ! p1
+      enddo ! h1
+    enddo !i
+    !$OMP END DO
+
+    deallocate(tmp_3d, tmp_2d)
+    deallocate(tmp1, tmp2)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2)
+
+    !$OMP END PARALLEL
+  endif
+
+  no_aba_contraction = -0.5d0 * no_aba_contraction
+  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aba_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aab_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, ii, i, h1, p1, h2, p2,                   &
+  !$OMP          tmp_2d, tmp_3d, tmp1, tmp2,                      &
+  !$OMP          tmpval_1, tmpvec_1)                              &
+  !$OMP SHARED (n_points_final_grid, mo_num, Ne, occ,             &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         no_aab_contraction)
+
+
+  allocate(tmp_2d(mo_num,mo_num))
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+
+  tmp_2d   = 0.d0
+  tmp_3d   = 0.d0
+  tmp1     = 0.d0
+  tmp2     = 0.d0
+  tmpval_1 = 0.d0
+  tmpvec_1 = 0.d0
+
+  !$OMP DO
+
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    do h1 = 1, mo_num
+
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+      enddo
+
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+        enddo
+      enddo
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            !$OMP CRITICAL
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            !$OMP END CRITICAL
+          enddo
+        enddo
+      enddo
+
+      do p1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+        enddo
+
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            !$OMP CRITICAL
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            !$OMP END CRITICAL
+          enddo
+        enddo
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  !$OMP END DO
+
+  deallocate(tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1)
+  deallocate(tmpvec_1)
+
+  !$OMP END PARALLEL
+
+  no_aab_contraction = -0.5d0 * no_aab_contraction
+
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (h1, h2, p1, p2) & 
+  !$OMP SHARED (no_aab_contraction, mo_num)
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 1, mo_num
+        do p2 = p1, mo_num
+          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
+  do h1 = 1, mo_num-1
+    do h2 = h1+1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aab_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+
+  print*,' Providing no_aaa_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  if(Ne(2) .lt. 3) then
+
+    no_aaa_contraction = 0.d0
+
+  else
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+    !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+    !$OMP          tmpval_1, tmpval_2,                              &
+    !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+    !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         no_aaa_contraction)
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmp3(n_points_final_grid,3,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmpvec_3(n_points_final_grid,3))
+
+    tmp_2d   = 0.d0
+    tmp_3d   = 0.d0
+    tmp1     = 0.d0
+    tmp2     = 0.d0
+    tmp3     = 0.d0
+    tmpval_1 = 0.d0
+    tmpval_2 = 0.d0
+    tmpvec_1 = 0.d0
+    tmpvec_2 = 0.d0
+    tmpvec_3 = 0.d0
+
+    !$OMP DO
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , 0.d0, tmp_3d(1,1,1), mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp3(1,1,1), 3*n_points_final_grid                    &
+                    , tmp1(1,1,1), 3*n_points_final_grid                    &
+                    , 1.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+    !$OMP END DO
+
+    deallocate(tmp_2d)
+    deallocate(tmp_3d)
+    deallocate(tmp1)
+    deallocate(tmp2)
+    deallocate(tmp3)
+    deallocate(tmpval_1)
+    deallocate(tmpval_2)
+    deallocate(tmpvec_1)
+    deallocate(tmpvec_2)
+    deallocate(tmpvec_3)
+
+    !$OMP END PARALLEL
+
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+      !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+      !$OMP          tmpval_1, tmpval_2,                              &
+      !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+      !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         no_aaa_contraction)
+
+      allocate(tmp_2d(mo_num,mo_num))
+      allocate(tmp_3d(mo_num,mo_num,mo_num))
+      allocate(tmp1(n_points_final_grid,3,mo_num))
+      allocate(tmp2(n_points_final_grid,mo_num))
+      allocate(tmp3(n_points_final_grid,3,mo_num))
+      allocate(tmpval_1(n_points_final_grid))
+      allocate(tmpval_2(n_points_final_grid))
+      allocate(tmpvec_1(n_points_final_grid,3))
+      allocate(tmpvec_2(n_points_final_grid,3))
+      allocate(tmpvec_3(n_points_final_grid,3))
+
+      tmp_2d   = 0.d0
+      tmp_3d   = 0.d0
+      tmp1     = 0.d0
+      tmp2     = 0.d0
+      tmp3     = 0.d0
+      tmpval_1 = 0.d0
+      tmpval_2 = 0.d0
+      tmpvec_1 = 0.d0
+      tmpvec_2 = 0.d0
+      tmpvec_3 = 0.d0
+
+      !$OMP DO
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+        do h1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                     &
+                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+      !$OMP END DO
+
+      deallocate(tmp_2d)
+      deallocate(tmp_3d)
+      deallocate(tmp1)
+      deallocate(tmp2)
+      deallocate(tmp3)
+      deallocate(tmpval_1)
+      deallocate(tmpval_2)
+      deallocate(tmpvec_1)
+      deallocate(tmpvec_2)
+      deallocate(tmpvec_3)
+
+      !$OMP END PARALLEL
+    endif
+
     no_aaa_contraction = -0.5d0 * no_aaa_contraction
 
     !$OMP PARALLEL                 &
@@ -1050,6 +2009,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         enddo
       enddo
     enddo
+    !$OMP END DO
     !$OMP END PARALLEL
 
   endif
diff --git a/src/tc_bi_ortho/print_tc_dump.irp.f b/src/tc_bi_ortho/print_tc_dump.irp.f
new file mode 100644
index 00000000..868de444
--- /dev/null
+++ b/src/tc_bi_ortho/print_tc_dump.irp.f
@@ -0,0 +1,187 @@
+program tc_bi_ortho
+
+  BEGIN_DOC
+  ! TODO
+  END_DOC
+  implicit none
+
+  my_grid_becke = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  call ERI_dump()
+  call KMat_tilde_dump()
+  call LMat_tilde_dump()
+
+end
+
+! ---
+
+subroutine KMat_tilde_dump()
+
+  implicit none
+  integer              :: i, j, k, l
+  integer              :: isym, ms2, st, iii
+  character(16)        :: corb
+  double precision     :: t1, t2
+  integer, allocatable :: orbsym(:)
+
+  print *, ' generating FCIDUMP'
+  call wall_time(t1)
+
+  PROVIDE mo_bi_ortho_tc_two_e_chemist
+  PROVIDE mo_bi_ortho_tc_one_e
+
+  isym = 1
+  ms2  = elec_alpha_num - elec_beta_num
+  st   = 0
+  iii  = 0
+
+  allocate(orbsym(mo_num))
+  orbsym(1:mo_num) = 1
+
+  open(33, file='FCIDUMP', action='write')  
+
+    write(33,'("&",a)') 'FCI'
+    write(33,'(1x,a,"=",i0,",")') 'NORB', mo_num
+    write(33,'(1x,a,"=",i0,",")') 'NELEC', elec_num
+    write(33,'(1x,a,"=",i0,",")') 'MS2', ms2
+    write(33,'(1x,a,"=",i0,",")') 'ISYM', isym
+    write(corb,'(i0)') mo_num
+    write(33,'(1x,a,"=",'//corb//'(i0,","))') 'ORBSYM', orbsym
+    write(33,'(1x,a,"=",i0,",")') 'ST', st
+    write(33,'(1x,a,"=",i0,",")') 'III', iii
+    write(33,'(1x,a,"=",i0,",")') 'OCC', (elec_num-ms2)/2+ms2
+    write(33,'(1x,a,"=",i0,",")') 'CLOSED', 2*elec_alpha_num
+    write(33,'(1x,"/")')
+
+    do l = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do i = 1, mo_num
+            ! TCHint convention
+            write(33, '(E15.7, 4X, 4(I4, 2X))') mo_bi_ortho_tc_two_e_chemist(j,i,l,k), i, j, k, l
+          enddo
+        enddo
+      enddo
+    enddo
+
+    do j = 1, mo_num
+      do i = 1, mo_num
+        ! TCHint convention
+        write(33, '(E15.7, 4X, 4(I4, 2X))') mo_bi_ortho_tc_one_e(i,j), i, j, 0, 0
+      enddo
+    enddo
+
+  close(33)
+
+  deallocate(orbsym)
+
+  call wall_time(t2)
+  print *, ' end after (min)', (t2-t1)/60.d0
+
+  return
+end subroutine KMat_tilde_dump
+
+! ---
+
+subroutine ERI_dump()
+
+  implicit none
+  integer                       :: i, j, k, l
+  double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:)
+
+  PROVIDE mo_r_coef mo_l_coef
+
+  allocate(a2(ao_num,ao_num,ao_num,mo_num))
+
+  call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0   &
+            , ao_two_e_coul(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
+            , 0.d0 , a2(1,1,1,1), ao_num*ao_num*ao_num)
+
+  allocate(a1(ao_num,ao_num,mo_num,mo_num))
+
+  call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
+            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+            , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num)
+
+  deallocate(a2)
+  allocate(a2(ao_num,mo_num,mo_num,mo_num))
+
+  call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+            , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num          &
+            , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num)
+
+  deallocate(a1)
+  allocate(a1(mo_num,mo_num,mo_num,mo_num))
+
+  call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+            , 0.d0, a1(1,1,1,1), mo_num*mo_num*mo_num)
+
+  deallocate(a2)
+
+  open(33, file='ERI.dat', action='write')  
+    do l = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do i = 1, mo_num
+            write(33, '(4(I4, 2X), 4X, E15.7)') i, j, k, l, a1(i,j,k,l)
+          enddo
+        enddo
+      enddo
+    enddo
+  close(33)
+
+  deallocate(a1)
+
+  return
+end subroutine ERI_dump
+
+! ---
+
+subroutine LMat_tilde_dump()
+
+  implicit none
+  integer          :: i, j, k, l, m, n
+  double precision :: integral
+  double precision :: t1, t2
+
+  print *, ' generating TCDUMP'
+  call wall_time(t1)
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  open(33, file='TCDUMP', action='write')  
+    write(33, '(4X, I4)') mo_num
+    do n = 1, mo_num
+      do m = 1, mo_num
+        do l = 1, mo_num
+          do k = 1, mo_num
+            do j = 1, mo_num
+              do i = 1, mo_num
+                ! < i j k | -L | l m n > with a BI-ORTHONORMAL MOLECULAR ORBITALS
+                call give_integrals_3_body_bi_ort(i, j, k, l, m, n, integral)
+                !write(33, '(6(I4, 2X), 4X, E15.7)') i, j, k, l, m, n, integral
+                ! TCHint convention
+                if(dabs(integral).gt.1d-10) then
+                  write(33, '(E15.7, 4X, 6(I4, 2X))') -integral/3.d0, i, j, k, l, m, n
+                  !write(33, '(E15.7, 4X, 6(I4, 2X))') -integral/3.d0, l, m, n, i, j, k
+                endif
+              enddo
+            enddo
+          enddo
+        enddo
+      enddo
+    enddo
+  close(33)
+
+  call wall_time(t2)
+  print *, ' end after (min)', (t2-t1)/60.d0
+
+  return
+end subroutine LMat_tilde_dump
+
+! ---
diff --git a/src/tc_bi_ortho/print_tc_energy.irp.f b/src/tc_bi_ortho/print_tc_energy.irp.f
index 980d12de..7bca72a1 100644
--- a/src/tc_bi_ortho/print_tc_energy.irp.f
+++ b/src/tc_bi_ortho/print_tc_energy.irp.f
@@ -1,19 +1,26 @@
 program print_tc_energy
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
 
   PROVIDE j1b_type
   print*, 'j1b_type = ', j1b_type
 
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call write_tc_energy
+  call write_tc_energy()
+
 end
 
diff --git a/src/tc_bi_ortho/print_tc_spin_dens.irp.f b/src/tc_bi_ortho/print_tc_spin_dens.irp.f
index 8308140d..c7da5bc8 100644
--- a/src/tc_bi_ortho/print_tc_spin_dens.irp.f
+++ b/src/tc_bi_ortho/print_tc_spin_dens.irp.f
@@ -1,16 +1,26 @@
+
+! ---
+
 program test_spin_dens
-  implicit none
+
   BEGIN_DOC
-! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together with the energy. Saves the left-right wave functions at the end. 
+  ! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together with the energy. Saves the left-right wave functions at the end. 
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call tc_print_mulliken_sd
-! call test
+
+  call tc_print_mulliken_sd()
+  !call test
 
 end
diff --git a/src/tc_bi_ortho/print_tc_var.irp.f b/src/tc_bi_ortho/print_tc_var.irp.f
index fa0a4363..bec34f18 100644
--- a/src/tc_bi_ortho/print_tc_var.irp.f
+++ b/src/tc_bi_ortho/print_tc_var.irp.f
@@ -7,12 +7,15 @@ program print_tc_var
   implicit none
 
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   call write_tc_var()
 
diff --git a/src/tc_bi_ortho/print_tc_wf.irp.f b/src/tc_bi_ortho/print_tc_wf.irp.f
index 0c4198a9..c755485b 100644
--- a/src/tc_bi_ortho/print_tc_wf.irp.f
+++ b/src/tc_bi_ortho/print_tc_wf.irp.f
@@ -1,20 +1,31 @@
+
+! ---
+
 program print_tc_bi_ortho
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
 !  if(three_body_h_tc)then
 !   call provide_all_three_ints_bi_ortho
 !  endif
 !  call routine
- call write_l_r_wf
+  call write_l_r_wf
+
 end
 
 subroutine write_l_r_wf
diff --git a/src/tc_bi_ortho/pt2_tc_cisd.irp.f b/src/tc_bi_ortho/pt2_tc_cisd.irp.f
index 9cb9a600..8940a4f6 100644
--- a/src/tc_bi_ortho/pt2_tc_cisd.irp.f
+++ b/src/tc_bi_ortho/pt2_tc_cisd.irp.f
@@ -7,12 +7,16 @@ program pt2_tc_cisd
   !
   END_DOC
 
+  implicit none
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   print*, ' nb of states = ', N_states
   print*, ' nb of det    = ', N_det
diff --git a/src/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f b/src/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f
index 8b6eb1d1..47ade8df 100644
--- a/src/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f
+++ b/src/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f
@@ -1,35 +1,59 @@
- program tc_natorb_bi_ortho
-   implicit none
-   BEGIN_DOC
- ! TODO : Put the documentation of the program here
-   END_DOC
-   print *, 'Hello world'
-   my_grid_becke = .True.
-   my_n_pt_r_grid = 30
-   my_n_pt_a_grid = 50
-   read_wf = .True.
-   touch read_wf
-   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-   call print_energy_and_mos
-   call save_tc_natorb
-!   call minimize_tc_orb_angles
- end
- 
- subroutine save_tc_natorb 
+
+! ---
+
+program tc_natorb_bi_ortho
+
+  BEGIN_DOC
+  ! TODO : Put the documentation of the program here
+  END_DOC
+
   implicit none
+
+  print *, 'Hello world'
+
+  my_grid_becke = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  read_wf = .True.
+  touch read_wf
+
+  call print_energy_and_mos()
+  call save_tc_natorb()
+  !call minimize_tc_orb_angles()
+
+end
+
+! ---
+ 
+subroutine save_tc_natorb()
+
+  implicit none
+
   print*,'Saving the natorbs '
+
   provide natorb_tc_leigvec_ao natorb_tc_reigvec_ao
+
   call ezfio_set_bi_ortho_mos_mo_l_coef(natorb_tc_leigvec_ao)
   call ezfio_set_bi_ortho_mos_mo_r_coef(natorb_tc_reigvec_ao)
-  call save_ref_determinant_nstates_1
+  call save_ref_determinant_nstates_1()
   call ezfio_set_determinants_read_wf(.False.)
- end
+
+end
+
+! ---
  
- subroutine save_ref_determinant_nstates_1
-   implicit none
+subroutine save_ref_determinant_nstates_1()
+
    use bitmasks
-   double precision               :: buffer(1,N_states)
+   implicit none
+   double precision :: buffer(1,N_states)
+
    buffer = 0.d0
    buffer(1,1) = 1.d0
-   call save_wavefunction_general(1,1,ref_bitmask,1,buffer)
- end
+   call save_wavefunction_general(1, 1, ref_bitmask, 1, buffer)
+
+end
+
diff --git a/src/tc_bi_ortho/select_dets_bi_ortho.irp.f b/src/tc_bi_ortho/select_dets_bi_ortho.irp.f
index e6bf3d6e..e923548a 100644
--- a/src/tc_bi_ortho/select_dets_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/select_dets_bi_ortho.irp.f
@@ -1,15 +1,24 @@
-program tc_bi_ortho
-  implicit none
+
+! ---
+
+program select_dets_bi_ortho()
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   !!!!!!!!!!!!!!! WARNING NO 3-BODY
   !!!!!!!!!!!!!!! WARNING NO 3-BODY
@@ -22,6 +31,8 @@ program tc_bi_ortho
 ! call test
 end
 
+! ---
+
 subroutine routine_test
  implicit none
  use bitmasks ! you need to include the bitmasks_module.f90 features
@@ -57,5 +68,7 @@ subroutine routine_test
  enddo
  call save_wavefunction_general(n_good,n_states,dets,n_good,coef_new)
 
-
 end
+
+! ---
+
diff --git a/src/tc_bi_ortho/slater_tc_3e_slow.irp.f b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
index 76539cb3..35abbbc4 100644
--- a/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
+++ b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
@@ -1,4 +1,6 @@
 
+! ---
+
 subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
 
   BEGIN_DOC
@@ -14,81 +16,89 @@ subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
   integer                       :: occ(Nint*bit_kind_size,2)
   integer                       :: Ne(2),i,j,ii,jj,ispin,jspin,m,mm
   integer(bit_kind)             :: key_i_core(Nint,2)
-  double precision              :: direct_int, exchange_int
-  double precision              :: sym_3_e_int_from_6_idx_tensor
-  double precision              :: three_e_diag_parrallel_spin
+  double precision              :: direct_int, exchange_int, ref
+  double precision, external    :: sym_3_e_int_from_6_idx_tensor
+  double precision, external    :: three_e_diag_parrallel_spin
 
-  if(core_tc_op)then
-   do i = 1, Nint
-    key_i_core(i,1) = xor(key_i(i,1),core_bitmask(i,1))
-    key_i_core(i,2) = xor(key_i(i,2),core_bitmask(i,2))
-   enddo
-   call bitstring_to_list_ab(key_i_core,occ,Ne,Nint)
+  PROVIDE mo_l_coef mo_r_coef
+
+  if(core_tc_op) then
+    do i = 1, Nint
+      key_i_core(i,1) = xor(key_i(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(key_i(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, Nint)
   else
-   call bitstring_to_list_ab(key_i,occ,Ne,Nint)
+    call bitstring_to_list_ab(key_i, occ, Ne, Nint)
   endif
+
   hthree = 0.d0
 
-  if(Ne(1)+Ne(2).ge.3)then
-!!  ! alpha/alpha/beta three-body
-   do i = 1, Ne(1)
-    ii = occ(i,1)
-    do j = i+1, Ne(1)
-     jj = occ(j,1)
-     do m = 1, Ne(2)
-      mm = occ(m,2)
-!      direct_int = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) USES THE 6-IDX TENSOR
-!      exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) USES THE 6-IDX TENSOR
-      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii) ! USES 3-IDX TENSOR
-      exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii) ! USES 3-IDX TENSOR
-      hthree += direct_int - exchange_int
-     enddo
-    enddo
-   enddo
+  if((Ne(1)+Ne(2)) .ge. 3) then
 
-   ! beta/beta/alpha three-body
-   do i = 1, Ne(2)
-    ii = occ(i,2)
-    do j = i+1, Ne(2)
-     jj = occ(j,2)
-     do m = 1, Ne(1)
-      mm = occ(m,1)
-      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii)
-      exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)
-      hthree += direct_int - exchange_int
-     enddo
+    ! alpha/alpha/beta three-body
+    do i = 1, Ne(1)
+      ii = occ(i,1)
+      do j = i+1, Ne(1)
+        jj = occ(j,1)
+        do m = 1, Ne(2)
+          mm = occ(m,2)
+          !direct_int   = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor
+          !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor
+          direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,ii)      !uses 3-idx tensor
+          exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)      !uses 3-idx tensor
+          hthree      += direct_int - exchange_int
+        enddo
+      enddo
     enddo
-   enddo
 
-   ! alpha/alpha/alpha three-body
-   do i = 1, Ne(1)
-    ii = occ(i,1) ! 1
-    do j = i+1, Ne(1)
-     jj = occ(j,1) ! 2
-     do m = j+1, Ne(1)
-      mm = occ(m,1) ! 3
-!      ref =  sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) USES THE 6 IDX TENSOR
-      hthree += three_e_diag_parrallel_spin(mm,jj,ii) ! USES ONLY 3-IDX TENSORS
-     enddo
+    ! beta/beta/alpha three-body
+    do i = 1, Ne(2)
+      ii = occ(i,2)
+      do j = i+1, Ne(2)
+        jj = occ(j,2)
+        do m = 1, Ne(1)
+          mm = occ(m,1)
+          !direct_int   = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor
+          !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor
+          direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,ii)
+          exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)
+          hthree      += direct_int - exchange_int
+        enddo
+      enddo
     enddo
-   enddo
 
-   ! beta/beta/beta three-body
-   do i = 1, Ne(2)
-    ii = occ(i,2) ! 1
-    do j = i+1, Ne(2)
-     jj = occ(j,2) ! 2
-     do m = j+1, Ne(2)
-      mm = occ(m,2) ! 3
-!      ref =  sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) USES THE 6 IDX TENSOR
-      hthree += three_e_diag_parrallel_spin(mm,jj,ii) ! USES ONLY 3-IDX TENSORS
-     enddo
+    ! alpha/alpha/alpha three-body
+    do i = 1, Ne(1)
+      ii = occ(i,1) ! 1
+      do j = i+1, Ne(1)
+        jj = occ(j,1) ! 2
+        do m = j+1, Ne(1)
+          mm = occ(m,1) ! 3
+          !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor
+          hthree += three_e_diag_parrallel_spin(mm,jj,ii)             !uses only 3-idx tensors
+        enddo
+      enddo
     enddo
-   enddo
+
+    ! beta/beta/beta three-body
+    do i = 1, Ne(2)
+      ii = occ(i,2) ! 1
+      do j = i+1, Ne(2)
+        jj = occ(j,2) ! 2
+        do m = j+1, Ne(2)
+          mm = occ(m,2) ! 3
+          !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor
+          hthree += three_e_diag_parrallel_spin(mm,jj,ii)             !uses only 3-idx tensors
+        enddo
+      enddo
+    enddo
+
   endif
 
 end
 
+! ---
 
 subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
 
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index 933479e9..72f55aca 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -1,3 +1,4 @@
+
 ! ---
 
 subroutine provide_all_three_ints_bi_ortho()
@@ -25,9 +26,9 @@ subroutine provide_all_three_ints_bi_ortho()
       PROVIDE normal_two_body_bi_orth
     endif
 
- endif
+  endif
 
- return
+  return
 end
 
 ! ---
@@ -46,13 +47,19 @@ subroutine htilde_mu_mat_opt_bi_ortho_tot(key_j, key_i, Nint, htot)
   END_DOC
 
   use bitmasks
- integer,           intent(in) :: Nint
- integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
- double precision, intent(out) :: htot
- double precision :: hmono, htwoe, hthree
- call htilde_mu_mat_opt_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
+  double precision, intent(out) :: htot
+  double precision :: hmono, htwoe, hthree
+
+  call htilde_mu_mat_opt_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+
 end
+
+! ---
+
 subroutine htilde_mu_mat_opt_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+
   BEGIN_DOC
   !
   ! <key_j |H_tilde | key_i> where |key_j> is developed on the LEFT basis and |key_i> is developed on the RIGHT basis
@@ -80,11 +87,11 @@ subroutine htilde_mu_mat_opt_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree,
   call get_excitation_degree(key_i, key_j, degree, Nint)
   if(degree.gt.2) return
 
-  if(degree == 0)then
+  if(degree == 0) then
     call diag_htilde_mu_mat_fock_bi_ortho  (Nint, key_i, hmono, htwoe, hthree, htot)
-  else if (degree == 1)then
-    call single_htilde_mu_mat_fock_bi_ortho(Nint,key_j, key_i , hmono, htwoe, hthree, htot)
-  else if(degree == 2)then
+  else if (degree == 1) then
+    call single_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i , hmono, htwoe, hthree, htot)
+  else if(degree == 2) then
     call double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe, hthree, htot)
   endif
 
diff --git a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
index 531f0141..d95c87b1 100644
--- a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
@@ -1,32 +1,48 @@
+
+! ---
+
  BEGIN_PROVIDER [ double precision, ref_tc_energy_tot]
 &BEGIN_PROVIDER [ double precision, ref_tc_energy_1e]
 &BEGIN_PROVIDER [ double precision, ref_tc_energy_2e]
 &BEGIN_PROVIDER [ double precision, ref_tc_energy_3e]
- implicit none
- BEGIN_DOC
-! Various component of the TC energy for the reference "HF" Slater determinant
- END_DOC 
- double precision :: hmono, htwoe, htot, hthree
- call diag_htilde_mu_mat_bi_ortho_slow(N_int,HF_bitmask , hmono, htwoe, htot)
- ref_tc_energy_1e = hmono
- ref_tc_energy_2e = htwoe 
- if(three_body_h_tc)then
-  call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree)
-  ref_tc_energy_3e = hthree
- else
-  ref_tc_energy_3e = 0.d0
- endif
- ref_tc_energy_tot = ref_tc_energy_1e + ref_tc_energy_2e + ref_tc_energy_3e + nuclear_repulsion
- END_PROVIDER 
+
+  BEGIN_DOC
+  ! Various component of the TC energy for the reference "HF" Slater determinant
+  END_DOC 
+
+  implicit none
+  double precision :: hmono, htwoe, htot, hthree
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  call diag_htilde_mu_mat_bi_ortho_slow(N_int, HF_bitmask, hmono, htwoe, htot)
+
+  ref_tc_energy_1e = hmono
+  ref_tc_energy_2e = htwoe 
+
+  if(three_body_h_tc) then
+    call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree)
+    ref_tc_energy_3e = hthree
+  else
+    ref_tc_energy_3e = 0.d0
+  endif
+
+  ref_tc_energy_tot = ref_tc_energy_1e + ref_tc_energy_2e + ref_tc_energy_3e + nuclear_repulsion
+
+END_PROVIDER 
+
+! ---
 
 subroutine diag_htilde_mu_mat_fock_bi_ortho(Nint, det_in, hmono, htwoe, hthree, htot)
-  implicit none
+
   BEGIN_DOC
   ! Computes $\langle i|H|i \rangle$.
   END_DOC
-  integer,intent(in)             :: Nint
-  integer(bit_kind),intent(in)   :: det_in(Nint,2)
-  double precision, intent(out)  :: hmono,htwoe,htot,hthree
+
+  implicit none
+  integer,           intent(in)  :: Nint
+  integer(bit_kind), intent(in)  :: det_in(Nint,2)
+  double precision,  intent(out) :: hmono, htwoe, htot, hthree
 
   integer(bit_kind)              :: hole(Nint,2)
   integer(bit_kind)              :: particle(Nint,2)
@@ -40,7 +56,6 @@ subroutine diag_htilde_mu_mat_fock_bi_ortho(Nint, det_in, hmono, htwoe, hthree,
   ASSERT (sum(popcnt(det_in(:,1))) == elec_alpha_num)
   ASSERT (sum(popcnt(det_in(:,2))) == elec_beta_num)
 
-
   nexc(1) = 0
   nexc(2) = 0
   do i=1,Nint
@@ -55,15 +70,15 @@ subroutine diag_htilde_mu_mat_fock_bi_ortho(Nint, det_in, hmono, htwoe, hthree,
   enddo
 
   if (nexc(1)+nexc(2) == 0) then
-    hmono = ref_tc_energy_1e
-    htwoe = ref_tc_energy_2e
-    hthree= ref_tc_energy_3e
-    htot = ref_tc_energy_tot
+    hmono  = ref_tc_energy_1e
+    htwoe  = ref_tc_energy_2e
+    hthree = ref_tc_energy_3e
+    htot   = ref_tc_energy_tot
     return
   endif
 
   !call debug_det(det_in,Nint)
-  integer                        :: tmp(2)
+  integer  :: tmp(2)
   !DIR$ FORCEINLINE
   call bitstring_to_list_ab(particle, occ_particle, tmp, Nint)
   ASSERT (tmp(1) == nexc(1)) ! Number of particles alpha
@@ -73,27 +88,31 @@ subroutine diag_htilde_mu_mat_fock_bi_ortho(Nint, det_in, hmono, htwoe, hthree,
   ASSERT (tmp(1) == nexc(1)) ! Number of holes alpha
   ASSERT (tmp(2) == nexc(2)) ! Number of holes beta 
 
-  
+  hmono  = ref_tc_energy_1e
+  htwoe  = ref_tc_energy_2e 
+  hthree = ref_tc_energy_3e
+
   det_tmp = ref_bitmask
-  hmono = ref_tc_energy_1e
-  htwoe = ref_tc_energy_2e 
-  hthree= ref_tc_energy_3e
-  do ispin=1,2
+
+  do ispin = 1, 2
     na = elec_num_tab(ispin)
     nb = elec_num_tab(iand(ispin,1)+1)
-    do i=1,nexc(ispin)
+    do i = 1, nexc(ispin)
       !DIR$ FORCEINLINE
-      call ac_tc_operator( occ_particle(i,ispin), ispin, det_tmp, hmono,htwoe,hthree, Nint,na,nb)
+      call ac_tc_operator(occ_particle(i,ispin), ispin, det_tmp, hmono, htwoe, hthree, Nint, na, nb)
       !DIR$ FORCEINLINE
-      call a_tc_operator ( occ_hole    (i,ispin), ispin, det_tmp, hmono,htwoe,hthree, Nint,na,nb)
+      call a_tc_operator (occ_hole    (i,ispin), ispin, det_tmp, hmono, htwoe, hthree, Nint, na, nb)
     enddo
   enddo
-  htot = hmono+htwoe+hthree+nuclear_repulsion
+
+  htot = hmono + htwoe + hthree + nuclear_repulsion
+
 end
 
-subroutine ac_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
-  use bitmasks
-  implicit none
+! ---
+
+subroutine ac_tc_operator(iorb, ispin, key, hmono, htwoe, hthree, Nint, na, nb)
+
   BEGIN_DOC
   ! Routine that computes one- and two-body energy corresponding 
   ! 
@@ -105,17 +124,20 @@ subroutine ac_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
   !
   ! and the quantities hmono,htwoe,hthree are INCREMENTED 
   END_DOC
-  integer, intent(in)            :: iorb, ispin, Nint
-  integer, intent(inout)         :: na, nb
+
+  use bitmasks
+  implicit none
+  integer,           intent(in)    :: iorb, ispin, Nint
+  integer,           intent(inout) :: na, nb
   integer(bit_kind), intent(inout) :: key(Nint,2)
-  double precision, intent(inout) :: hmono,htwoe,hthree
+  double precision,  intent(inout) :: hmono, htwoe, hthree
 
-  integer                        :: occ(Nint*bit_kind_size,2)
-  integer                        :: other_spin
-  integer                        :: k,l,i,jj,mm,j,m
-  double precision ::  direct_int, exchange_int
+  integer                          :: occ(Nint*bit_kind_size,2)
+  integer                          :: other_spin
+  integer                          :: k, l, i, jj, mm, j, m
+  integer                          :: tmp(2)
+  double precision                 :: direct_int, exchange_int
   
-
   if (iorb < 1) then
     print *,  irp_here, ': iorb < 1'
     print *,  iorb, mo_num
@@ -131,7 +153,6 @@ subroutine ac_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
   ASSERT (ispin < 3)
   ASSERT (Nint > 0)
 
-  integer                        :: tmp(2)
   !DIR$ FORCEINLINE
   call bitstring_to_list_ab(key, occ, tmp, Nint)
   ASSERT (tmp(1) == elec_alpha_num)
@@ -147,50 +168,54 @@ subroutine ac_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
   hmono = hmono + mo_bi_ortho_tc_one_e(iorb,iorb)
 
   ! Same spin
-  do i=1,na
+  do i = 1, na
     htwoe = htwoe + mo_bi_ortho_tc_two_e_jj_anti(occ(i,ispin),iorb)
   enddo
 
   ! Opposite spin
-  do i=1,nb
+  do i = 1, nb
     htwoe = htwoe + mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
   enddo
 
-  if(three_body_h_tc.and.elec_num.gt.2.and.three_e_3_idx_term)then
-   !!!!! 3-e part 
-   !! same-spin/same-spin
-   do j = 1, na
-    jj = occ(j,ispin)
-    do m = j+1, na
-     mm = occ(m,ispin)
-     hthree += three_e_diag_parrallel_spin_prov(mm,jj,iorb)
+  if(three_body_h_tc .and. (elec_num.gt.2) .and. three_e_3_idx_term) then
+
+    !!!!! 3-e part 
+    !! same-spin/same-spin
+    do j = 1, na
+      jj = occ(j,ispin)
+      do m = j+1, na
+        mm = occ(m,ispin)
+        hthree += three_e_diag_parrallel_spin_prov(mm,jj,iorb)
+      enddo
     enddo
-   enddo
-   !! same-spin/oposite-spin
-   do j = 1, na
-    jj = occ(j,ispin)
-    do m = 1, nb
-     mm = occ(m,other_spin)
-     direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
-     exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
-     hthree += direct_int - exchange_int
+    !! same-spin/oposite-spin
+    do j = 1, na
+      jj = occ(j,ispin)
+      do m = 1, nb
+        mm = occ(m,other_spin)
+        direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
+        exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
+        hthree += direct_int - exchange_int
+      enddo
     enddo
-   enddo
-   !! oposite-spin/opposite-spin
+    !! oposite-spin/opposite-spin
     do j = 1, nb
-     jj = occ(j,other_spin) 
-     do m = j+1, nb 
-      mm = occ(m,other_spin) 
-      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
-      exchange_int = three_e_3_idx_exch23_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
-      hthree += direct_int - exchange_int
-     enddo
+      jj = occ(j,other_spin) 
+      do m = j+1, nb 
+        mm = occ(m,other_spin) 
+        direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
+        exchange_int = three_e_3_idx_exch23_bi_ort(mm,jj,iorb) ! USES 3-IDX TENSOR 
+        hthree += direct_int - exchange_int
+      enddo
     enddo
   endif
 
   na = na+1
+
 end
 
+! ---
+
 subroutine a_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
   use bitmasks
   implicit none
@@ -460,14 +485,16 @@ subroutine a_tc_operator_no_3e(iorb,ispin,key,hmono,htwoe,Nint,na,nb)
   hmono = hmono - mo_bi_ortho_tc_one_e(iorb,iorb)
 
   ! Same spin
-  do i=1,na
-    htwoe= htwoe- mo_bi_ortho_tc_two_e_jj_anti(occ(i,ispin),iorb)
+  do i = 1, na
+    htwoe = htwoe- mo_bi_ortho_tc_two_e_jj_anti(occ(i,ispin),iorb)
   enddo
 
   ! Opposite spin
-  do i=1,nb
-    htwoe= htwoe- mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
+  do i = 1, nb
+    htwoe = htwoe- mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
   enddo
 
 end
 
+! ---
+
diff --git a/src/tc_bi_ortho/slater_tc_slow.irp.f b/src/tc_bi_ortho/slater_tc_slow.irp.f
index 0e0b5812..83a56d2d 100644
--- a/src/tc_bi_ortho/slater_tc_slow.irp.f
+++ b/src/tc_bi_ortho/slater_tc_slow.irp.f
@@ -21,7 +21,7 @@ subroutine htilde_mu_mat_bi_ortho_tot_slow(key_j, key_i, Nint, htot)
   integer :: degree
 
   call get_excitation_degree(key_j, key_i, degree, Nint)
-  if(degree.gt.2)then
+  if(degree.gt.2) then
     htot = 0.d0
   else
     call htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
@@ -55,27 +55,27 @@ subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree,
   hmono  = 0.d0
   htwoe  = 0.d0
   htot   = 0.d0
-  hthree = 0.D0
+  hthree = 0.d0
 
   call get_excitation_degree(key_i, key_j, degree, Nint)
   if(degree.gt.2) return
 
-  if(degree == 0)then
+  if(degree == 0) then
     call diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
-  else if (degree == 1)then
+  else if (degree == 1) then
     call single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
-  else if(degree == 2)then
+  else if(degree == 2) then
     call double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
   endif
 
   if(three_body_h_tc) then
     if(degree == 2) then
-      if(.not.double_normal_ord.and.elec_num.gt.2.and.three_e_5_idx_term) then
+      if((.not.double_normal_ord) .and. (elec_num .gt. 2) .and. three_e_5_idx_term) then
         call double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
       endif
-    else if(degree == 1.and.elec_num.gt.2.and.three_e_4_idx_term) then
+    else if((degree == 1) .and. (elec_num .gt. 2) .and. three_e_4_idx_term) then
       call single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
-    else if(degree == 0.and.elec_num.gt.2.and.three_e_3_idx_term) then
+    else if((degree == 0) .and. (elec_num .gt. 2) .and. three_e_3_idx_term) then
       call diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
     endif
   endif
@@ -106,6 +106,8 @@ subroutine diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
   double precision               :: get_mo_two_e_integral_tc_int
   integer(bit_kind)              :: key_i_core(Nint,2)
 
+  PROVIDE mo_bi_ortho_tc_two_e
+
 !  PROVIDE mo_two_e_integrals_tc_int_in_map mo_bi_ortho_tc_two_e
 !
 !  PROVIDE mo_integrals_erf_map core_energy nuclear_repulsion core_bitmask
@@ -135,15 +137,6 @@ subroutine diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
     ii = occ(i,ispin) 
     hmono += mo_bi_ortho_tc_one_e(ii,ii)
 
-!    if(j1b_gauss .eq. 1) then
-!      print*,'j1b not implemented for bi ortho TC'
-!      print*,'stopping  ....'
-!      stop
-!      !hmono += mo_j1b_gauss_hermI  (ii,ii) &
-!      !       + mo_j1b_gauss_hermII (ii,ii) &
-!      !       + mo_j1b_gauss_nonherm(ii,ii)
-!    endif
-
 !    if(core_tc_op)then
 !   print*,'core_tc_op not already taken into account for bi ortho'
 !   print*,'stopping ...'
diff --git a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
index 3180d946..66360c36 100644
--- a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
+++ b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
@@ -41,14 +41,21 @@ subroutine give_all_perm_for_three_e(n,l,k,m,j,i,idx_list,phase)
 
 end
 
-double precision function sym_3_e_int_from_6_idx_tensor(n,l,k,m,j,i)
- implicit none
- BEGIN_DOC
- ! returns all good combinations of permutations of integrals with the good signs 
- !
- ! for a given (k^dagger l^dagger n^dagger m j i)  <nlk|L|mji> when all indices have the same spins
- END_DOC
- integer, intent(in)  :: n,l,k,m,j,i
+! ---
+
+double precision function sym_3_e_int_from_6_idx_tensor(n, l, k, m, j, i)
+
+  BEGIN_DOC
+  ! returns all good combinations of permutations of integrals with the good signs 
+  !
+  ! for a given (k^dagger l^dagger n^dagger m j i)  <nlk|L|mji> when all indices have the same spins
+  END_DOC
+ 
+  implicit none
+  integer, intent(in) :: n, l, k, m, j, i
+
+  PROVIDE mo_l_coef mo_r_coef
+
  sym_3_e_int_from_6_idx_tensor = three_body_ints_bi_ort(n,l,k,m,j,i) & ! direct 
                                + three_body_ints_bi_ort(n,l,k,j,i,m) & ! 1st cyclic permutation  
                                + three_body_ints_bi_ort(n,l,k,i,m,j) & ! 2nd cyclic permutation  
@@ -56,8 +63,11 @@ double precision function sym_3_e_int_from_6_idx_tensor(n,l,k,m,j,i)
                                - three_body_ints_bi_ort(n,l,k,i,j,m) & ! elec 2 is kept fixed
                                - three_body_ints_bi_ort(n,l,k,m,i,j)   ! elec 3 is kept fixed
 
+  return
 end
 
+! ---
+
 double precision function direct_sym_3_e_int(n,l,k,m,j,i)
  implicit none
  BEGIN_DOC
@@ -83,15 +93,25 @@ double precision function direct_sym_3_e_int(n,l,k,m,j,i)
 
 end
 
-double precision function three_e_diag_parrallel_spin(m,j,i)
- implicit none
- integer, intent(in) :: i,j,m
+! ---
+
+double precision function three_e_diag_parrallel_spin(m, j, i)
+
+  implicit none
+  integer, intent(in) :: i, j, m
+
+  PROVIDE mo_l_coef mo_r_coef
+
   three_e_diag_parrallel_spin = three_e_3_idx_direct_bi_ort(m,j,i)  ! direct
   three_e_diag_parrallel_spin += three_e_3_idx_cycle_1_bi_ort(m,j,i) + three_e_3_idx_cycle_2_bi_ort(m,j,i) & ! two cyclic permutations 
-  - three_e_3_idx_exch23_bi_ort(m,j,i) - three_e_3_idx_exch13_bi_ort(m,j,i)  & ! two first exchange 
-  - three_e_3_idx_exch12_bi_ort(m,j,i) ! last exchange 
+                               - three_e_3_idx_exch23_bi_ort (m,j,i) - three_e_3_idx_exch13_bi_ort(m,j,i)  & ! two first exchange 
+                               - three_e_3_idx_exch12_bi_ort (m,j,i)                                         ! last exchange 
+
+  return
 end
 
+! ---
+
 double precision function three_e_single_parrallel_spin(m,j,k,i)
  implicit none
  integer, intent(in) :: i,k,j,m
diff --git a/src/tc_bi_ortho/tc_bi_ortho.irp.f b/src/tc_bi_ortho/tc_bi_ortho.irp.f
index f69684c0..2887c7be 100644
--- a/src/tc_bi_ortho/tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/tc_bi_ortho.irp.f
@@ -8,11 +8,13 @@ program tc_bi_ortho
   END_DOC
 
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   print*, ' nb of states = ', N_states
   print*, ' nb of det    = ', N_det
@@ -20,22 +22,29 @@ program tc_bi_ortho
   call routine_diag()
   call write_tc_energy()
   call save_tc_bi_ortho_wavefunction()
-end
-
-subroutine test
- implicit none
- integer :: i,j 
- double precision :: hmono,htwoe,hthree,htot
- use bitmasks
- print*,'reading the wave function '
- do i = 1, N_det
-  call debug_det(psi_det(1,1,i),N_int)
-  print*,i,psi_l_coef_bi_ortho(i,1)*psi_r_coef_bi_ortho(i,1)
-  print*,i,psi_l_coef_bi_ortho(i,1),psi_r_coef_bi_ortho(i,1)
- enddo
 
 end
 
+! ---
+
+subroutine test()
+
+  use bitmasks
+  implicit none
+  integer          :: i, j 
+  double precision :: hmono, htwoe, hthree, htot
+
+  print*, 'reading the wave function '
+  do i = 1, N_det
+    call debug_det(psi_det(1,1,i), N_int)
+    print*, i, psi_l_coef_bi_ortho(i,1)*psi_r_coef_bi_ortho(i,1)
+    print*, i, psi_l_coef_bi_ortho(i,1),psi_r_coef_bi_ortho(i,1)
+  enddo
+
+end
+
+! ---
+
 subroutine routine_diag()
 
   implicit none
diff --git a/src/tc_bi_ortho/tc_bi_ortho_prop.irp.f b/src/tc_bi_ortho/tc_bi_ortho_prop.irp.f
index 28f122ee..9168fb3d 100644
--- a/src/tc_bi_ortho/tc_bi_ortho_prop.irp.f
+++ b/src/tc_bi_ortho/tc_bi_ortho_prop.irp.f
@@ -1,19 +1,32 @@
+
+! ---
+
 program tc_bi_ortho_prop
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-!  call routine_diag
- call test
+
+  !call routine_diag
+  call test
+
 end
 
+! ---
+
 subroutine test
  implicit none
  integer :: i
diff --git a/src/tc_bi_ortho/tc_cisd_sc2.irp.f b/src/tc_bi_ortho/tc_cisd_sc2.irp.f
index 0fb9f524..d4c8c55d 100644
--- a/src/tc_bi_ortho/tc_cisd_sc2.irp.f
+++ b/src/tc_bi_ortho/tc_cisd_sc2.irp.f
@@ -1,20 +1,32 @@
-program tc_bi_ortho
-  implicit none
+
+! ---
+
+program tc_cisd_sc2
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   call test
+
 end
 
-subroutine test
+! ---
+
+subroutine test()
  implicit none
 ! double precision, allocatable :: dressing_dets(:),e_corr_dets(:)
 ! allocate(dressing_dets(N_det),e_corr_dets(N_det))
diff --git a/src/tc_bi_ortho/tc_h_eigvectors.irp.f b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
index fa946d6a..f027c38f 100644
--- a/src/tc_bi_ortho/tc_h_eigvectors.irp.f
+++ b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
@@ -1,42 +1,56 @@
+
+! ---
+
   use bitmasks
 
- BEGIN_PROVIDER [ integer, index_HF_psi_det]                                                                                                            
- implicit none
- integer :: i,degree
- do i = 1, N_det
-   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
-   if(degree == 0)then
-    index_HF_psi_det = i
-    exit
-   endif
- enddo
- END_PROVIDER
+! ---
+
+BEGIN_PROVIDER [integer, index_HF_psi_det]                                                                                                            
 
-subroutine diagonalize_CI_tc
   implicit none
+  integer :: i, degree
+
+  do i = 1, N_det
+    call get_excitation_degree(HF_bitmask, psi_det(1,1,i), degree, N_int)
+    if(degree == 0) then
+      index_HF_psi_det = i
+      exit
+    endif
+  enddo
+
+END_PROVIDER
+
+! ---
+
+subroutine diagonalize_CI_tc()
+
   BEGIN_DOC
-!  Replace the coefficients of the |CI| states by the coefficients of the
-!  eigenstates of the |CI| matrix.
+  !  Replace the coefficients of the |CI| states by the coefficients of the
+  !  eigenstates of the |CI| matrix.
   END_DOC
-  integer                        :: i,j
-  do j=1,N_states
-    do i=1,N_det
+
+  implicit none
+  integer :: i, j
+
+  do j = 1, N_states
+    do i = 1, N_det
       psi_l_coef_bi_ortho(i,j) = leigvec_tc_bi_orth(i,j)
       psi_r_coef_bi_ortho(i,j) = reigvec_tc_bi_orth(i,j)
     enddo
   enddo
 
   SOFT_TOUCH psi_l_coef_bi_ortho psi_r_coef_bi_ortho
+
 end
 
+! ---
 
-
- BEGIN_PROVIDER [double precision, eigval_right_tc_bi_orth, (N_states)]
-&BEGIN_PROVIDER [double precision, eigval_left_tc_bi_orth, (N_states)]
-&BEGIN_PROVIDER [double precision, reigvec_tc_bi_orth, (N_det,N_states)]
-&BEGIN_PROVIDER [double precision, leigvec_tc_bi_orth, (N_det,N_states)]
-&BEGIN_PROVIDER [double precision, s2_eigvec_tc_bi_orth, (N_states)]
-&BEGIN_PROVIDER [double precision, norm_ground_left_right_bi_orth ]
+ BEGIN_PROVIDER [double precision, eigval_right_tc_bi_orth, (N_states)      ]
+&BEGIN_PROVIDER [double precision, eigval_left_tc_bi_orth , (N_states)      ]
+&BEGIN_PROVIDER [double precision, reigvec_tc_bi_orth     , (N_det,N_states)]
+&BEGIN_PROVIDER [double precision, leigvec_tc_bi_orth     , (N_det,N_states)]
+&BEGIN_PROVIDER [double precision, s2_eigvec_tc_bi_orth   , (N_states)      ]
+&BEGIN_PROVIDER [double precision, norm_ground_left_right_bi_orth           ]
 
   BEGIN_DOC
   ! eigenvalues, right and left eigenvectors of the transcorrelated Hamiltonian on the BI-ORTHO basis 
@@ -44,29 +58,29 @@ end
 
   implicit none
   integer                       :: i, idx_dress, j, istate, k
+  integer                       :: i_good_state, i_other_state, i_state
+  integer                       :: n_real_tc_bi_orth_eigval_right, igood_r, igood_l
   logical                       :: converged, dagger
-  integer                       :: n_real_tc_bi_orth_eigval_right,igood_r,igood_l
-  double precision, allocatable :: reigvec_tc_bi_orth_tmp(:,:),leigvec_tc_bi_orth_tmp(:,:),eigval_right_tmp(:)
+  double precision, parameter   :: alpha = 0.1d0
+  integer,          allocatable :: index_good_state_array(:)
+  integer,          allocatable :: iorder(:)
+  logical,          allocatable :: good_state_array(:)
+  double precision, allocatable :: reigvec_tc_bi_orth_tmp(:,:), leigvec_tc_bi_orth_tmp(:,:),eigval_right_tmp(:)
   double precision, allocatable :: s2_values_tmp(:), H_prime(:,:), expect_e(:)
-  double precision, parameter :: alpha = 0.1d0
-  integer                        :: i_good_state,i_other_state, i_state
-  integer, allocatable           :: index_good_state_array(:)
-  logical, allocatable           :: good_state_array(:)
-  double precision, allocatable  :: coef_hf_r(:),coef_hf_l(:)
-  double precision, allocatable  :: Stmp(:,:)
-  integer, allocatable :: iorder(:)
+  double precision, allocatable :: coef_hf_r(:),coef_hf_l(:)
+  double precision, allocatable :: Stmp(:,:)
 
   PROVIDE N_det N_int
 
-  if(n_det .le. N_det_max_full) then
+  if(N_det .le. N_det_max_full) then
 
-    allocate(reigvec_tc_bi_orth_tmp(N_det,N_det),leigvec_tc_bi_orth_tmp(N_det,N_det),eigval_right_tmp(N_det),expect_e(N_det))
-    allocate (H_prime(N_det,N_det),s2_values_tmp(N_det))
+    allocate(reigvec_tc_bi_orth_tmp(N_det,N_det), leigvec_tc_bi_orth_tmp(N_det,N_det), eigval_right_tmp(N_det), expect_e(N_det))
+    allocate(H_prime(N_det,N_det), s2_values_tmp(N_det))
 
     H_prime(1:N_det,1:N_det) = htilde_matrix_elmt_bi_ortho(1:N_det,1:N_det)
     if(s2_eig) then
       H_prime(1:N_det,1:N_det) += alpha * S2_matrix_all_dets(1:N_det,1:N_det)
-      do j=1,N_det
+      do j = 1, N_det
         H_prime(j,j) = H_prime(j,j) - alpha*expected_s2
       enddo
     endif
diff --git a/src/tc_bi_ortho/tc_hmat.irp.f b/src/tc_bi_ortho/tc_hmat.irp.f
index e2c6f010..ceabf853 100644
--- a/src/tc_bi_ortho/tc_hmat.irp.f
+++ b/src/tc_bi_ortho/tc_hmat.irp.f
@@ -31,7 +31,9 @@
 
 END_PROVIDER 
 
- BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho_tranp, (N_det,N_det)]
+! ---
+
+BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho_tranp, (N_det,N_det)]
  implicit none
  integer ::i,j
   do i = 1, N_det
diff --git a/src/tc_bi_ortho/tc_som.irp.f b/src/tc_bi_ortho/tc_som.irp.f
index a7e4d09e..427508d2 100644
--- a/src/tc_bi_ortho/tc_som.irp.f
+++ b/src/tc_bi_ortho/tc_som.irp.f
@@ -12,10 +12,9 @@ program tc_som
   print *, ' do not forget to do tc-scf first'
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   PROVIDE mu_erf 
diff --git a/src/tc_bi_ortho/tc_utils.irp.f b/src/tc_bi_ortho/tc_utils.irp.f
index 9023e2f0..53fe5884 100644
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@@ -5,27 +5,34 @@ subroutine write_tc_energy()
   integer          :: i, j, k
   double precision :: hmono, htwoe, hthree, htot
   double precision :: E_TC, O_TC
+  double precision :: E_1e, E_2e, E_3e
 
   do k = 1, n_states
 
     E_TC = 0.d0
+    E_1e = 0.d0
+    E_2e = 0.d0
+    E_3e = 0.d0
     do i = 1, N_det
       do j = 1, N_det
-        !htot = htilde_matrix_elmt_bi_ortho(i,j)
         call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
         E_TC = E_TC + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htot
-        !E_TC = E_TC + leigvec_tc_bi_orth(i,k) * reigvec_tc_bi_orth(j,k) * htot
+        E_1e = E_1e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * hmono
+        E_2e = E_2e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htwoe
+        E_3e = E_3e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * hthree
       enddo
     enddo
 
     O_TC = 0.d0
     do i = 1, N_det
-      !O_TC = O_TC + leigvec_tc_bi_orth(i,k) * reigvec_tc_bi_orth(i,k)
       O_TC = O_TC + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(i,k)
     enddo
 
     print *, ' state :', k
     print *, " E_TC = ", E_TC / O_TC
+    print *, " E_1e = ", E_1e / O_TC
+    print *, " E_2e = ", E_2e / O_TC
+    print *, " E_3e = ", E_3e / O_TC
     print *, " O_TC = ", O_TC
 
   enddo
diff --git a/src/tc_bi_ortho/test_natorb.irp.f b/src/tc_bi_ortho/test_natorb.irp.f
index 54c9a827..5b8801f7 100644
--- a/src/tc_bi_ortho/test_natorb.irp.f
+++ b/src/tc_bi_ortho/test_natorb.irp.f
@@ -1,21 +1,34 @@
+
+! ---
+
 program test_natorb
-  implicit none
+
   BEGIN_DOC
-! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together with the energy. Saves the left-right wave functions at the end. 
+  ! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together with the energy. Saves the left-right wave functions at the end. 
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call routine
-! call test
+
+  call routine()
+  ! call test()
 
 end
 
-subroutine routine
+! ---
+
+subroutine routine()
+
  implicit none
  double precision, allocatable :: fock_diag(:),eigval(:),leigvec(:,:),reigvec(:,:),mat_ref(:,:)
  allocate(eigval(mo_num),leigvec(mo_num,mo_num),reigvec(mo_num,mo_num),fock_diag(mo_num),mat_ref(mo_num, mo_num))
diff --git a/src/tc_bi_ortho/test_normal_order.irp.f b/src/tc_bi_ortho/test_normal_order.irp.f
index cb0c355c..e805eecb 100644
--- a/src/tc_bi_ortho/test_normal_order.irp.f
+++ b/src/tc_bi_ortho/test_normal_order.irp.f
@@ -1,19 +1,32 @@
+
+! ---
+
 program test_normal_order
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-  call provide_all_three_ints_bi_ortho
-  call test
+
+  call provide_all_three_ints_bi_ortho()
+  call test()
+
 end
 
+! ---
+
 subroutine test
  implicit none
   use bitmasks ! you need to include the bitmasks_module.f90 features
diff --git a/src/tc_bi_ortho/test_s2_tc.irp.f b/src/tc_bi_ortho/test_s2_tc.irp.f
index 1f7bdfda..b398507a 100644
--- a/src/tc_bi_ortho/test_s2_tc.irp.f
+++ b/src/tc_bi_ortho/test_s2_tc.irp.f
@@ -1,14 +1,22 @@
+
+! ---
+
 program test_tc 
- implicit none
- read_wf = .True.
- my_grid_becke = .True.
- my_n_pt_r_grid = 30
- my_n_pt_a_grid = 50
- read_wf = .True.
- touch read_wf
- touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call routine_test_s2
- call routine_test_s2_davidson
+
+  implicit none
+ 
+  my_grid_becke = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  read_wf = .True.
+  touch read_wf
+
+  call routine_test_s2
+  call routine_test_s2_davidson
+
 end
 
 subroutine routine_test_s2
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index 33b5c5aa..b6beb65b 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -1,15 +1,24 @@
+
+! ---
+
 program tc_bi_ortho
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
 ! call test_h_u0
 ! call test_slater_tc_opt
@@ -19,6 +28,9 @@ program tc_bi_ortho
 ! call timing_double
 
   call test_no()
+  !call test_no_aba()
+  !call test_no_aab()
+  !call test_no_aaa()
 end
 
 subroutine test_h_u0
@@ -297,4 +309,126 @@ end
 
 ! ---
 
+subroutine test_no_aba()
 
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  print*, ' testing no_aba_contraction ...'
+
+  thr = 1d-8
+
+  PROVIDE no_aba_contraction_v0
+  PROVIDE no_aba_contraction
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = no_aba_contraction   (l,k,j,i)
+          ref = no_aba_contraction_v0(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on no_aba_contraction'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on no_aba_contraction = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---
+
+
+subroutine test_no_aab()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  print*, ' testing no_aab_contraction ...'
+
+  thr = 1d-8
+
+  PROVIDE no_aab_contraction_v0
+  PROVIDE no_aab_contraction
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = no_aab_contraction   (l,k,j,i)
+          ref = no_aab_contraction_v0(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on no_aab_contraction'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on no_aab_contraction = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---
+
+subroutine test_no_aaa()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  print*, ' testing no_aaa_contraction ...'
+
+  thr = 1d-8
+
+  PROVIDE no_aaa_contraction_v0
+  PROVIDE no_aaa_contraction
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = no_aaa_contraction   (l,k,j,i)
+          ref = no_aaa_contraction_v0(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on no_aaa_contraction'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on no_aaa_contraction = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---
diff --git a/src/tc_bi_ortho/test_tc_fock.irp.f b/src/tc_bi_ortho/test_tc_fock.irp.f
index b7de067f..182c03d7 100644
--- a/src/tc_bi_ortho/test_tc_fock.irp.f
+++ b/src/tc_bi_ortho/test_tc_fock.irp.f
@@ -1,22 +1,32 @@
+
+! ---
+
 program test_tc_fock
- implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   print *, 'Hello world'
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
   read_wf = .True.
   touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   !call routine_1
   !call routine_2
 !  call routine_3()
 
 ! call test_3e
- call routine_tot
+  call routine_tot
+
 end
 
 ! ---
diff --git a/src/tc_keywords/EZFIO.cfg b/src/tc_keywords/EZFIO.cfg
index a69f5bac..29c238e1 100644
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@@ -130,6 +130,12 @@ doc: exponents of the 1-body Jastrow
 interface: ezfio
 size: (nuclei.nucl_num)
 
+[j1b_pen_coef]
+type: double precision
+doc: coefficients of the 1-body Jastrow
+interface: ezfio
+size: (nuclei.nucl_num)
+
 [j1b_coeff]
 type: double precision
 doc: coeff of the 1-body Jastrow
@@ -256,3 +262,16 @@ doc: If |true|, use Manu IPP
 interface: ezfio,provider,ocaml
 default: True
 
+[tc_grid1_a]
+type: integer
+doc: size of angular grid over r1
+interface: ezfio,provider,ocaml
+default: 50
+
+[tc_grid1_r]
+type: integer
+doc: size of radial grid over r1
+interface: ezfio,provider,ocaml
+default: 30
+
+
diff --git a/src/tc_keywords/j1b_pen.irp.f b/src/tc_keywords/j1b_pen.irp.f
index 57250b52..2d5e59a9 100644
--- a/src/tc_keywords/j1b_pen.irp.f
+++ b/src/tc_keywords/j1b_pen.irp.f
@@ -1,17 +1,22 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
+ BEGIN_PROVIDER [ double precision, j1b_pen     , (nucl_num) ]
+&BEGIN_PROVIDER [ double precision, j1b_pen_coef, (nucl_num) ]
 
   BEGIN_DOC
-  ! exponents of the 1-body Jastrow
+  ! parameters of the 1-body Jastrow
   END_DOC
 
   implicit none
   logical :: exists
+  integer :: i
+  integer :: ierr
 
   PROVIDE ezfio_filename
 
+  ! ---
+
   if (mpi_master) then
     call ezfio_has_tc_keywords_j1b_pen(exists)
   endif
@@ -23,7 +28,6 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
 
   IRP_IF MPI
     include 'mpif.h'
-    integer :: ierr
     call MPI_BCAST(j1b_pen, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
     if (ierr /= MPI_SUCCESS) then
       stop 'Unable to read j1b_pen with MPI'
@@ -31,7 +35,6 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
   IRP_ENDIF
 
   if (exists) then
-
     if (mpi_master) then
       write(6,'(A)') '.. >>>>> [ IO READ: j1b_pen ] <<<<< ..'
       call ezfio_get_tc_keywords_j1b_pen(j1b_pen)
@@ -42,19 +45,54 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
         endif
       IRP_ENDIF
     endif
-
   else
- 
-    integer :: i
     do i = 1, nucl_num
       j1b_pen(i) = 1d5
     enddo
-
   endif
- print*,'parameters for nuclei jastrow'
- do i = 1, nucl_num
-  print*,'i,Z,j1b_pen(i)',i,nucl_charge(i),j1b_pen(i)
- enddo
+
+  ! ---
+
+  if (mpi_master) then
+    call ezfio_has_tc_keywords_j1b_pen_coef(exists)
+  endif
+
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+
+  IRP_IF MPI
+    call MPI_BCAST(j1b_pen_coef, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      stop 'Unable to read j1b_pen_coef with MPI'
+    endif
+  IRP_ENDIF
+
+  if (exists) then
+    if (mpi_master) then
+      write(6,'(A)') '.. >>>>> [ IO READ: j1b_pen_coef ] <<<<< ..'
+      call ezfio_get_tc_keywords_j1b_pen_coef(j1b_pen_coef)
+      IRP_IF MPI
+        call MPI_BCAST(j1b_pen_coef, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          stop 'Unable to read j1b_pen_coef with MPI'
+        endif
+      IRP_ENDIF
+    endif
+  else
+    do i = 1, nucl_num
+      j1b_pen_coef(i) = 1d0
+    enddo
+  endif
+
+  ! ---
+
+  print *, ' parameters for nuclei jastrow'
+  print *, ' i, Z, j1b_pen, j1b_pen_coef'
+  do i = 1, nucl_num
+    write(*,"(I4, 2x, 3(E15.7, 2X))"), i, nucl_charge(i), j1b_pen(i), j1b_pen_coef(i)
+  enddo
 
 END_PROVIDER
 
@@ -114,3 +152,4 @@ BEGIN_PROVIDER [ double precision, j1b_coeff, (nucl_num) ]
 END_PROVIDER
 
 ! ---
+
diff --git a/src/tc_scf/combine_lr_tcscf.irp.f b/src/tc_scf/combine_lr_tcscf.irp.f
index b257f4a5..a22614ba 100644
--- a/src/tc_scf/combine_lr_tcscf.irp.f
+++ b/src/tc_scf/combine_lr_tcscf.irp.f
@@ -10,8 +10,9 @@ program combine_lr_tcscf
   implicit none
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   bi_ortho = .True.
diff --git a/src/tc_scf/fock_three_hermit.irp.f b/src/tc_scf/fock_three_hermit.irp.f
index fe8fbfd7..a936da9b 100644
--- a/src/tc_scf/fock_three_hermit.irp.f
+++ b/src/tc_scf/fock_three_hermit.irp.f
@@ -1,30 +1,36 @@
+
+! ---
+
 BEGIN_PROVIDER [ double precision, fock_3_mat, (mo_num, mo_num)] 
- implicit none
+
+  implicit none
   integer :: i,j
   double precision :: contrib
+
   fock_3_mat = 0.d0
-  if(.not.bi_ortho.and.three_body_h_tc)then
-   call give_fock_ia_three_e_total(1,1,contrib)
-!!  !$OMP PARALLEL                  &
-!!  !$OMP DEFAULT (NONE)            &
-!!  !$OMP PRIVATE (i,j,m,integral) & 
-!!  !$OMP SHARED (mo_num,three_body_3_index)
-!!  !$OMP DO SCHEDULE (guided) COLLAPSE(3)
-   do i = 1, mo_num
-    do j = 1, mo_num
-     call give_fock_ia_three_e_total(j,i,contrib)
-     fock_3_mat(j,i) = -contrib
-    enddo
-   enddo
-  else if(bi_ortho.and.three_body_h_tc)then
-!!  !$OMP END DO
-!!  !$OMP END PARALLEL
-!!  do i = 1, mo_num
-!!   do j = 1, i-1
-!!    mat_three(j,i) = mat_three(i,j)
-!!   enddo
-!!  enddo
- endif
+  if(.not.bi_ortho .and. three_body_h_tc) then
+
+    call give_fock_ia_three_e_total(1, 1, contrib)
+    !!  !$OMP PARALLEL                  &
+    !!  !$OMP DEFAULT (NONE)            &
+    !!  !$OMP PRIVATE (i,j,m,integral) & 
+    !!  !$OMP SHARED (mo_num,three_body_3_index)
+    !!  !$OMP DO SCHEDULE (guided) COLLAPSE(3)
+    do i = 1, mo_num
+      do j = 1, mo_num
+        call give_fock_ia_three_e_total(j,i,contrib)
+        fock_3_mat(j,i) = -contrib
+       enddo
+     enddo
+    !else if(bi_ortho.and.three_body_h_tc) then
+    !!  !$OMP END DO
+    !!  !$OMP END PARALLEL
+    !!  do i = 1, mo_num
+    !!   do j = 1, i-1
+    !!    mat_three(j,i) = mat_three(i,j)
+    !!   enddo
+    !!  enddo
+  endif
 
 END_PROVIDER 
 
@@ -72,6 +78,7 @@ end
 
 ! ---
 
+! TODO DGEMM
 BEGIN_PROVIDER [double precision, diag_three_elem_hf]
 
   implicit none
@@ -100,7 +107,7 @@ BEGIN_PROVIDER [double precision, diag_three_elem_hf]
       do i = 1, elec_beta_num
         do j = 1, elec_beta_num
           do k = 1, elec_beta_num
-            call give_integrals_3_body(k, j, i, j, i, k,exchange_int_231)   
+            call give_integrals_3_body(k, j, i, j, i, k, exchange_int_231)   
             diag_three_elem_hf += two_third * exchange_int_231
           enddo
         enddo
diff --git a/src/tc_scf/minimize_tc_angles.irp.f b/src/tc_scf/minimize_tc_angles.irp.f
index 5d3ff7f0..c7752930 100644
--- a/src/tc_scf/minimize_tc_angles.irp.f
+++ b/src/tc_scf/minimize_tc_angles.irp.f
@@ -1,17 +1,26 @@
-program print_angles
- implicit none
- BEGIN_DOC
-! program that minimizes the angle between left- and right-orbitals when degeneracies are found in the TC-Fock matrix
- END_DOC
+
+! ---
+
+program minimize_tc_angles
+
+  BEGIN_DOC
+  ! program that minimizes the angle between left- and right-orbitals when degeneracies are found in the TC-Fock matrix
+  END_DOC
+
+  implicit none
+
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_n_pt_r_grid my_n_pt_a_grid
-!  call sort_by_tc_fock
+
+  !  call sort_by_tc_fock
 
   ! TODO
   ! check if rotations of orbitals affect the TC energy
   ! and refuse the step
   call minimize_tc_orb_angles
+
 end
 
diff --git a/src/tc_scf/molden_lr_mos.irp.f b/src/tc_scf/molden_lr_mos.irp.f
index 735349ba..b86009ee 100644
--- a/src/tc_scf/molden_lr_mos.irp.f
+++ b/src/tc_scf/molden_lr_mos.irp.f
@@ -1,6 +1,9 @@
-program molden
+! ---
+
+program molden_lr_mos
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
 
   implicit none
@@ -8,19 +11,26 @@ program molden
   print *, 'starting ...'
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
-  call molden_lr
+  !call molden_lr
+  call molden_l()
+  call molden_r()
+
 end
+
+! ---
+
 subroutine molden_lr
-  implicit none
+
   BEGIN_DOC
   ! Produces a Molden file
   END_DOC
+
+  implicit none
   character*(128)                :: output
   integer                        :: i_unit_output,getUnitAndOpen
   integer                        :: i,j,k,l
@@ -37,7 +47,7 @@ subroutine molden_lr
 
   write(i_unit_output,'(A)') '[Atoms] Angs'
   do i = 1, nucl_num
-    write(i_unit_output,'(A2,2X,I4,2X,I4,3(2X,F15.10))')                   &
+    write(i_unit_output,'(A2,2X,I4,2X,I4,3(2X,F15.10))')             &
         trim(element_name(int(nucl_charge(i)))),                     &
         i,                                                           &
         int(nucl_charge(i)),                                         &
@@ -174,3 +184,314 @@ subroutine molden_lr
   close(i_unit_output)
 end
 
+! ---
+
+subroutine molden_l()
+
+  BEGIN_DOC
+  ! Produces a Molden file
+  END_DOC
+
+  implicit none
+  character*(128)             :: output
+  integer                     :: i_unit_output, getUnitAndOpen
+  integer                     :: i, j, k, l
+  double precision, parameter :: a0 = 0.529177249d0
+
+  PROVIDE ezfio_filename
+  PROVIDE mo_l_coef
+
+  output=trim(ezfio_filename)//'_left.mol'
+  print*,'output = ',trim(output)
+
+  i_unit_output = getUnitAndOpen(output,'w')
+
+  write(i_unit_output,'(A)') '[Molden Format]'
+
+  write(i_unit_output,'(A)') '[Atoms] Angs'
+  do i = 1, nucl_num
+    write(i_unit_output,'(A2,2X,I4,2X,I4,3(2X,F15.10))')             &
+        trim(element_name(int(nucl_charge(i)))),                     &
+        i,                                                           &
+        int(nucl_charge(i)),                                         &
+        nucl_coord(i,1)*a0, nucl_coord(i,2)*a0, nucl_coord(i,3)*a0
+  enddo
+
+  write(i_unit_output,'(A)') '[GTO]'
+
+  character*(1)                  :: character_shell
+  integer                        :: i_shell,i_prim,i_ao
+  integer                        :: iorder(ao_num)
+  integer                        :: nsort(ao_num)
+
+  i_shell = 0
+  i_prim = 0
+  do i=1,nucl_num
+    write(i_unit_output,*) i, 0
+    do j=1,nucl_num_shell_aos(i)
+      i_shell +=1
+      i_ao = nucl_list_shell_aos(i,j)
+      character_shell = trim(ao_l_char(i_ao))
+      write(i_unit_output,*) character_shell, ao_prim_num(i_ao), '1.00'
+      do k = 1, ao_prim_num(i_ao)
+        i_prim +=1
+        write(i_unit_output,'(E20.10,2X,E20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k)
+      enddo
+      l = i_ao
+      do while ( ao_l(l) == ao_l(i_ao) )
+        nsort(l) = i*10000 + j*100
+        l += 1
+        if (l > ao_num) exit
+      enddo
+    enddo
+    write(i_unit_output,*)''
+  enddo
+
+
+  do i=1,ao_num
+    iorder(i) = i
+    ! p
+    if      ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 3
+    ! d
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 3
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 4
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 5
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 6
+    ! f
+    else if ((ao_power(i,1) == 3 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 3 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 3 )) then
+      nsort(i) += 3
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 4
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 5
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 6
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 7
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 8
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 9
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 10
+    ! g
+    else if ((ao_power(i,1) == 4 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 4 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 4 )) then
+      nsort(i) += 3
+    else if ((ao_power(i,1) == 3 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 4
+    else if ((ao_power(i,1) == 3 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 5
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 3 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 6
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 3 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 7
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 3 )) then
+      nsort(i) += 8
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 3 )) then
+      nsort(i) += 9
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 10
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 11
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 12
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 13
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 14
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 15
+    endif
+  enddo
+
+  call isort(nsort,iorder,ao_num)
+  write(i_unit_output,'(A)') '[MO]'
+  do i=1,mo_num
+    write (i_unit_output,*) 'Sym= 1'
+    write (i_unit_output,*) 'Ene=', Fock_matrix_tc_mo_tot(i,i)
+    write (i_unit_output,*) 'Spin= Alpha'
+    write (i_unit_output,*) 'Occup=', mo_occ(i)
+    do j=1,ao_num
+      write(i_unit_output, '(I6,2X,E20.10)') j, mo_l_coef(iorder(j),i)
+    enddo
+  enddo
+  close(i_unit_output)
+
+end
+
+! ---
+
+subroutine molden_r()
+
+  BEGIN_DOC
+  ! Produces a Molden file
+  END_DOC
+
+  implicit none
+  character*(128)             :: output
+  integer                     :: i_unit_output, getUnitAndOpen
+  integer                     :: i, j, k, l
+  double precision, parameter :: a0 = 0.529177249d0
+
+  PROVIDE ezfio_filename
+
+  output=trim(ezfio_filename)//'_right.mol'
+  print*,'output = ',trim(output)
+
+  i_unit_output = getUnitAndOpen(output,'w')
+
+  write(i_unit_output,'(A)') '[Molden Format]'
+
+  write(i_unit_output,'(A)') '[Atoms] Angs'
+  do i = 1, nucl_num
+    write(i_unit_output,'(A2,2X,I4,2X,I4,3(2X,F15.10))')             &
+        trim(element_name(int(nucl_charge(i)))),                     &
+        i,                                                           &
+        int(nucl_charge(i)),                                         &
+        nucl_coord(i,1)*a0, nucl_coord(i,2)*a0, nucl_coord(i,3)*a0
+  enddo
+
+  write(i_unit_output,'(A)') '[GTO]'
+
+  character*(1)                  :: character_shell
+  integer                        :: i_shell,i_prim,i_ao
+  integer                        :: iorder(ao_num)
+  integer                        :: nsort(ao_num)
+
+  i_shell = 0
+  i_prim = 0
+  do i=1,nucl_num
+    write(i_unit_output,*) i, 0
+    do j=1,nucl_num_shell_aos(i)
+      i_shell +=1
+      i_ao = nucl_list_shell_aos(i,j)
+      character_shell = trim(ao_l_char(i_ao))
+      write(i_unit_output,*) character_shell, ao_prim_num(i_ao), '1.00'
+      do k = 1, ao_prim_num(i_ao)
+        i_prim +=1
+        write(i_unit_output,'(E20.10,2X,E20.10)') ao_expo(i_ao,k), ao_coef(i_ao,k)
+      enddo
+      l = i_ao
+      do while ( ao_l(l) == ao_l(i_ao) )
+        nsort(l) = i*10000 + j*100
+        l += 1
+        if (l > ao_num) exit
+      enddo
+    enddo
+    write(i_unit_output,*)''
+  enddo
+
+
+  do i=1,ao_num
+    iorder(i) = i
+    ! p
+    if      ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 3
+    ! d
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 3
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 4
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 5
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 6
+    ! f
+    else if ((ao_power(i,1) == 3 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 3 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 3 )) then
+      nsort(i) += 3
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 4
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 5
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 6
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 7
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 8
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 9
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 10
+    ! g
+    else if ((ao_power(i,1) == 4 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 1
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 4 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 2
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 4 )) then
+      nsort(i) += 3
+    else if ((ao_power(i,1) == 3 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 4
+    else if ((ao_power(i,1) == 3 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 5
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 3 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 6
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 3 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 7
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 3 )) then
+      nsort(i) += 8
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 3 )) then
+      nsort(i) += 9
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 0 )) then
+      nsort(i) += 10
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 0 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 11
+    else if ((ao_power(i,1) == 0 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 12
+    else if ((ao_power(i,1) == 2 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 13
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 2 ).and.(ao_power(i,3) == 1 )) then
+      nsort(i) += 14
+    else if ((ao_power(i,1) == 1 ).and.(ao_power(i,2) == 1 ).and.(ao_power(i,3) == 2 )) then
+      nsort(i) += 15
+    endif
+  enddo
+
+  call isort(nsort, iorder, ao_num)
+  write(i_unit_output,'(A)') '[MO]'
+  do i=1,mo_num
+    write (i_unit_output,*) 'Sym= 1'
+    write (i_unit_output,*) 'Ene=', Fock_matrix_tc_mo_tot(i,i)
+    write (i_unit_output,*) 'Spin= Alpha'
+    write (i_unit_output,*) 'Occup=', mo_occ(i)
+    do j=1,ao_num
+      write(i_unit_output, '(I6,2X,E20.10)') j, mo_r_coef(iorder(j),i)
+    enddo
+  enddo
+  close(i_unit_output)
+
+end
+
diff --git a/src/tc_scf/print_fit_param.irp.f b/src/tc_scf/print_fit_param.irp.f
index f8bcfa7f..e62f0dde 100644
--- a/src/tc_scf/print_fit_param.irp.f
+++ b/src/tc_scf/print_fit_param.irp.f
@@ -7,10 +7,9 @@ program print_fit_param
   implicit none
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   !call create_guess
diff --git a/src/tc_scf/print_tcscf_energy.irp.f b/src/tc_scf/print_tcscf_energy.irp.f
new file mode 100644
index 00000000..05b8df23
--- /dev/null
+++ b/src/tc_scf/print_tcscf_energy.irp.f
@@ -0,0 +1,51 @@
+program print_tcscf_energy
+
+  BEGIN_DOC
+  ! TODO : Put the documentation of the program here
+  END_DOC
+
+  implicit none
+
+  print *, 'Hello world'
+  my_grid_becke = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+  double precision :: etc_tot, etc_1e, etc_2e, etc_3e
+
+  PROVIDE mu_erf
+  PROVIDE j1b_type
+
+  print*, ' mu_erf   = ', mu_erf
+  print*, ' j1b_type = ', j1b_type
+
+  etc_tot = TC_HF_energy
+  etc_1e  = TC_HF_one_e_energy
+  etc_2e  = TC_HF_two_e_energy
+  etc_3e  = 0.d0
+  if(three_body_h_tc) then
+    !etc_3e = diag_three_elem_hf
+    etc_3e = tcscf_energy_3e_naive
+  endif
+
+  print *, " E_TC = ", etc_tot
+  print *, " E_1e = ", etc_1e
+  print *, " E_2e = ", etc_2e
+  print *, " E_3e = ", etc_3e
+
+  return
+end subroutine main
+
+! ---
+
diff --git a/src/tc_scf/rotate_tcscf_orbitals.irp.f b/src/tc_scf/rotate_tcscf_orbitals.irp.f
index 2567faf0..0f2663e5 100644
--- a/src/tc_scf/rotate_tcscf_orbitals.irp.f
+++ b/src/tc_scf/rotate_tcscf_orbitals.irp.f
@@ -10,8 +10,9 @@ program rotate_tcscf_orbitals
   implicit none
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   bi_ortho = .True.
diff --git a/src/tc_scf/tc_petermann_factor.irp.f b/src/tc_scf/tc_petermann_factor.irp.f
index d3722098..2e9c67e2 100644
--- a/src/tc_scf/tc_petermann_factor.irp.f
+++ b/src/tc_scf/tc_petermann_factor.irp.f
@@ -10,10 +10,9 @@ program tc_petermann_factor
   implicit none
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   call main()
diff --git a/src/tc_scf/tc_scf.irp.f b/src/tc_scf/tc_scf.irp.f
index 04c4f92d..e4c38741 100644
--- a/src/tc_scf/tc_scf.irp.f
+++ b/src/tc_scf/tc_scf.irp.f
@@ -13,10 +13,11 @@ program tc_scf
   print *, ' starting ...'
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   PROVIDE mu_erf 
diff --git a/src/tc_scf/tcscf_energy_naive.irp.f b/src/tc_scf/tcscf_energy_naive.irp.f
new file mode 100644
index 00000000..82bb8799
--- /dev/null
+++ b/src/tc_scf/tcscf_energy_naive.irp.f
@@ -0,0 +1,80 @@
+
+! ---
+
+BEGIN_PROVIDER [double precision, tcscf_energy_3e_naive]
+
+  implicit none
+  integer          :: i, j, k
+  integer          :: neu, ned, D(elec_num)
+  integer          :: ii, jj, kk
+  integer          :: si, sj, sk
+  double precision :: I_ijk, I_jki, I_kij, I_jik, I_ikj, I_kji
+  double precision :: I_tot
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  neu = elec_alpha_num
+  ned = elec_beta_num
+  if (neu > 0) D(1:neu) = [(2*i-1, i = 1, neu)]
+  if (ned > 0) D(neu+1:neu+ned) = [(2*i, i = 1, ned)]
+
+  !print*, "D = "
+  !do i = 1, elec_num
+  !  ii = (D(i) - 1) / 2 + 1
+  !  si = mod(D(i), 2)
+  !  print*, i, D(i), ii, si
+  !enddo
+
+  tcscf_energy_3e_naive = 0.d0
+
+  do i = 1, elec_num - 2
+    ii = (D(i) - 1) / 2 + 1
+    si = mod(D(i), 2)
+
+    do j = i + 1, elec_num - 1
+      jj = (D(j) - 1) / 2 + 1
+      sj = mod(D(j), 2)
+
+      do k = j + 1, elec_num
+        kk = (D(k) - 1) / 2 + 1
+        sk = mod(D(k), 2)
+
+        call give_integrals_3_body_bi_ort(ii, jj, kk, ii, jj, kk, I_ijk)
+        I_tot = I_ijk
+
+        if(sj==si .and. sk==sj) then
+          call give_integrals_3_body_bi_ort(ii, jj, kk, jj, kk, ii, I_jki)
+          I_tot += I_jki
+        endif
+
+        if(sk==si .and. si==sj) then
+          call give_integrals_3_body_bi_ort(ii, jj, kk, kk, ii, jj, I_kij)
+          I_tot += I_kij
+        endif
+
+        if(sj==si) then
+          call give_integrals_3_body_bi_ort(ii, jj, kk, jj, ii, kk, I_jik)
+          I_tot -= I_jik
+        endif
+
+        if(sk==sj) then
+          call give_integrals_3_body_bi_ort(ii, jj, kk, ii, kk, jj, I_ikj)
+          I_tot -= I_ikj
+        endif
+
+        if(sk==si) then
+          call give_integrals_3_body_bi_ort(ii, jj, kk, kk, jj, ii, I_kji)
+          I_tot -= I_kji
+        endif
+
+        tcscf_energy_3e_naive += I_tot
+      enddo
+    enddo
+  enddo
+
+  tcscf_energy_3e_naive = -tcscf_energy_3e_naive
+
+END_PROVIDER
+
+! ---
+
diff --git a/src/tc_scf/test_int.irp.f b/src/tc_scf/test_int.irp.f
index b9287d58..649d0f3e 100644
--- a/src/tc_scf/test_int.irp.f
+++ b/src/tc_scf/test_int.irp.f
@@ -9,10 +9,9 @@ program test_ints
   print *, ' starting test_ints ...'
 
   my_grid_becke  = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-!  my_n_pt_r_grid = 15 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
   my_extra_grid_becke = .True.
@@ -280,7 +279,7 @@ subroutine routine_v_ij_u_cst_mu_j1b_test
     do i = 1, ao_num
      do j = 1, ao_num
       array(j,i,l,k)     += v_ij_u_cst_mu_j1b_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += v_ij_u_cst_mu_j1b(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
+      array_ref(j,i,l,k) += v_ij_u_cst_mu_j1b_fit (j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
      enddo
     enddo
    enddo
@@ -506,7 +505,7 @@ subroutine routine_v_ij_u_cst_mu_j1b
     do i = 1, ao_num
      do j = 1, ao_num
       array(j,i,l,k)     += v_ij_u_cst_mu_j1b_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += v_ij_u_cst_mu_j1b(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
+      array_ref(j,i,l,k) += v_ij_u_cst_mu_j1b_fit (j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
      enddo
     enddo
    enddo
diff --git a/src/tc_scf/three_e_energy_bi_ortho.irp.f b/src/tc_scf/three_e_energy_bi_ortho.irp.f
index 64212da8..0c9ebbd7 100644
--- a/src/tc_scf/three_e_energy_bi_ortho.irp.f
+++ b/src/tc_scf/three_e_energy_bi_ortho.irp.f
@@ -1,24 +1,32 @@
 
-subroutine contrib_3e_diag_sss(i,j,k,integral)
- implicit none
- integer, intent(in) :: i,j,k
- BEGIN_DOC
- ! returns the pure same spin contribution to diagonal matrix element of 3e term
- END_DOC
- double precision, intent(out) :: integral
- double precision :: direct_int, exch_13_int, exch_23_int, exch_12_int, c_3_int, c_minus_3_int
- call  give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int )!!! < i k j | i k j >
- call  give_integrals_3_body_bi_ort(i, k, j, j, i, k, c_3_int)      ! < i k j | j i k >
- call  give_integrals_3_body_bi_ort(i, k, j, k, j, i, c_minus_3_int)! < i k j | k j i >
- integral = direct_int + c_3_int + c_minus_3_int 
- ! negative terms :: exchange contrib
- call  give_integrals_3_body_bi_ort(i, k, j, j, k, i, exch_13_int)!!! < i k j | j k i > : E_13 
- call  give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)!!! < i k j | i j k > : E_23
- call  give_integrals_3_body_bi_ort(i, k, j, k, i, j, exch_12_int)!!! < i k j | k i j > : E_12
- integral += - exch_13_int - exch_23_int  - exch_12_int 
- integral = -integral
+subroutine contrib_3e_diag_sss(i, j, k, integral)
+
+  BEGIN_DOC
+  ! returns the pure same spin contribution to diagonal matrix element of 3e term
+  END_DOC
+
+  implicit none
+  integer,          intent(in)  :: i, j, k
+  double precision, intent(out) :: integral
+  double precision              :: direct_int, exch_13_int, exch_23_int, exch_12_int, c_3_int, c_minus_3_int
+
+  call give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int )!!! < i k j | i k j >
+  call give_integrals_3_body_bi_ort(i, k, j, j, i, k, c_3_int)      ! < i k j | j i k >
+  call give_integrals_3_body_bi_ort(i, k, j, k, j, i, c_minus_3_int)! < i k j | k j i >
+  integral = direct_int + c_3_int + c_minus_3_int 
+
+  ! negative terms :: exchange contrib
+  call give_integrals_3_body_bi_ort(i, k, j, j, k, i, exch_13_int)!!! < i k j | j k i > : E_13 
+  call give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)!!! < i k j | i j k > : E_23
+  call give_integrals_3_body_bi_ort(i, k, j, k, i, j, exch_12_int)!!! < i k j | k i j > : E_12
+
+  integral += - exch_13_int - exch_23_int  - exch_12_int 
+  integral = -integral
+
 end
 
+! ---
+
 subroutine contrib_3e_diag_soo(i,j,k,integral)
  implicit none
  integer, intent(in) :: i,j,k
@@ -51,23 +59,30 @@ subroutine give_aaa_contrib_bis(integral_aaa)
 
 end
 
+! ---
+
 subroutine give_aaa_contrib(integral_aaa)
- implicit none
- double precision, intent(out) :: integral_aaa
- double precision :: integral
- integer :: i,j,k
- integral_aaa = 0.d0
- do i = 1, elec_alpha_num
-  do j = 1, elec_alpha_num
-   do k = 1, elec_alpha_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_aaa += integral
-   enddo
+
+  implicit none
+  integer                       :: i, j, k
+  double precision              :: integral
+  double precision, intent(out) :: integral_aaa
+
+  integral_aaa = 0.d0
+  do i = 1, elec_alpha_num
+    do j = 1, elec_alpha_num
+      do k = 1, elec_alpha_num
+        call contrib_3e_diag_sss(i, j, k, integral)
+        integral_aaa += integral
+      enddo
+    enddo
   enddo
- enddo
- integral_aaa *= 1.d0/6.d0 
+  integral_aaa *= 1.d0/6.d0 
+
+  return
 end
 
+! ---
 
 subroutine give_aab_contrib(integral_aab)
  implicit none
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index b548b18a..72029c73 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -418,7 +418,7 @@ subroutine gaussian_product_x(a,xa,b,xb,k,p,xp)
   xab = xa-xb
   ab = ab*p_inv
   k = ab*xab*xab
-  if (k > 40.d0) then
+  if (k > 400.d0) then
     k=0.d0
     return
   endif