From 2d06e8fdaefff2905933a0b5ff52e93f8193f0a7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 May 2023 10:52:36 +0200
Subject: [PATCH 01/79] Cholesky false by default

---
 external/qp2-dependencies   | 2 +-
 src/ao_two_e_ints/EZFIO.cfg | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/qp2-dependencies b/external/qp2-dependencies
index e0d0e02e..6e23ebac 160000
--- a/external/qp2-dependencies
+++ b/external/qp2-dependencies
@@ -1 +1 @@
-Subproject commit e0d0e02e9f5ece138d1520106954a881ab0b8db2
+Subproject commit 6e23ebac001acae91d1c762ca934e09a9b7d614a
diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg
index caed4698..4ab080ec 100644
--- a/src/ao_two_e_ints/EZFIO.cfg
+++ b/src/ao_two_e_ints/EZFIO.cfg
@@ -22,4 +22,4 @@ ezfio_name: direct
 type: logical
 doc: Perform Cholesky decomposition of AO integrals
 interface: ezfio,provider,ocaml
-default: True
+default: False

From b8804f058a2872976af4712248609fab5bf6edaf Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 12 May 2023 21:38:01 +0200
Subject: [PATCH 02/79] Moved qp_import_trexio.py

---
 {src/trexio => scripts}/qp_import_trexio.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
 rename {src/trexio => scripts}/qp_import_trexio.py (98%)

diff --git a/src/trexio/qp_import_trexio.py b/scripts/qp_import_trexio.py
similarity index 98%
rename from src/trexio/qp_import_trexio.py
rename to scripts/qp_import_trexio.py
index de8d1269..d8a19160 100755
--- a/src/trexio/qp_import_trexio.py
+++ b/scripts/qp_import_trexio.py
@@ -13,12 +13,17 @@ Options:
 
 import sys
 import os
-import trexio
 import numpy as np
 from functools import reduce
 from ezfio import ezfio
 from docopt import docopt
 
+try:
+  import trexio
+except ImportError:
+    print("Error: trexio python module is not found. Try python3 -m pip install trexio")
+    sys.exit(1)
+
 
 try:
     QP_ROOT = os.environ["QP_ROOT"]

From 6289508c1e4e1ae7abce6388cf42fa12b5d28752 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 13 May 2023 13:32:52 +0200
Subject: [PATCH 03/79] Swapped indices in CCSD(T)

---
 scripts/qp_import_trexio.py         |  23 +++---
 src/ccsd/ccsd_space_orb_sub.irp.f   |   2 +-
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 108 +++++++++++++---------------
 src/utils/linear_algebra.irp.f      |  30 ++++----
 4 files changed, 79 insertions(+), 84 deletions(-)

diff --git a/scripts/qp_import_trexio.py b/scripts/qp_import_trexio.py
index d8a19160..eb19e16b 100755
--- a/scripts/qp_import_trexio.py
+++ b/scripts/qp_import_trexio.py
@@ -95,14 +95,15 @@ def write_ezfio(trexio_filename, filename):
         p = re.compile(r'(\d*)$')
         label = [p.sub("", x).capitalize() for x in label]
         ezfio.set_nuclei_nucl_label(label)
+        print("OK")
 
     else:
         ezfio.set_nuclei_nucl_num(1)
         ezfio.set_nuclei_nucl_charge([0.])
         ezfio.set_nuclei_nucl_coord([0.,0.,0.])
         ezfio.set_nuclei_nucl_label(["X"])
+        print("None")
 
-    print("OK")
 
 
     print("Electrons\t...\t", end=' ')
@@ -110,12 +111,12 @@ def write_ezfio(trexio_filename, filename):
     try:
         num_beta = trexio.read_electron_dn_num(trexio_file)
     except:
-        num_beta = sum(charge)//2
+        num_beta = int(sum(charge))//2
 
     try:
         num_alpha = trexio.read_electron_up_num(trexio_file)
     except:
-        num_alpha = sum(charge) - num_beta
+        num_alpha = int(sum(charge)) - num_beta
 
     if num_alpha == 0:
         print("\n\nError: There are zero electrons in the TREXIO file.\n\n")
@@ -123,7 +124,7 @@ def write_ezfio(trexio_filename, filename):
     ezfio.set_electrons_elec_alpha_num(num_alpha)
     ezfio.set_electrons_elec_beta_num(num_beta)
 
-    print("OK")
+    print(f"{num_alpha} {num_beta}")
 
     print("Basis\t\t...\t", end=' ')
 
@@ -263,7 +264,10 @@ def write_ezfio(trexio_filename, filename):
         ezfio.set_ao_basis_ao_expo(expo)
         ezfio.set_ao_basis_ao_basis("Read from TREXIO")
 
-    print("OK")
+        print("OK")
+
+    else:
+        print("None")
 
 
     #                _
@@ -308,10 +312,10 @@ def write_ezfio(trexio_filename, filename):
       for i in range(num_beta):
          mo_occ[i] += 1.
       ezfio.set_mo_basis_mo_occ(mo_occ)
+      print("OK")
     except:
-      pass
+      print("None")
 
-    print("OK")
 
 
     print("Pseudos\t\t...\t", end=' ')
@@ -391,9 +395,10 @@ def write_ezfio(trexio_filename, filename):
         ezfio.set_pseudo_pseudo_n_kl(pseudo_n_kl)
         ezfio.set_pseudo_pseudo_v_kl(pseudo_v_kl)
         ezfio.set_pseudo_pseudo_dz_kl(pseudo_dz_kl)
+        print("OK")
 
-
-    print("OK")
+    else:
+        print("None")
 
 
 
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index b63375cf..acd14034 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -169,7 +169,7 @@ subroutine run_ccsd_space_orb
     ! New
     print*,'Computing (T) correction...'
     call wall_time(ta)
-    call ccsd_par_t_space_v2(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
          ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
     call wall_time(tb)
     print*,'Time: ',tb-ta, ' s'
diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 3b762a06..acc2aaa9 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -15,8 +15,8 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, allocatable :: W_abc(:,:,:), V_abc(:,:,:)
   double precision, allocatable :: W_cab(:,:,:), W_cba(:,:,:)
   double precision, allocatable :: W_bca(:,:,:), V_cba(:,:,:)
-  double precision, allocatable :: X_vvvo(:,:,:,:), X_ovoo(:,:,:,:), X_vvoo(:,:,:,:)
-  double precision, allocatable :: T_vvoo(:,:,:,:), T_ovvo(:,:,:,:), T_vo(:,:)
+  double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
+  double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
   double precision              :: e,ta,tb, delta, delta_abc
 
@@ -24,25 +24,25 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !allocate(V(nV,nV,nV,nO,nO,nO))
   allocate(W_abc(nO,nO,nO), V_abc(nO,nO,nO), W_cab(nO,nO,nO))
   allocate(W_bca(nO,nO,nO), V_cba(nO,nO,nO), W_cba(nO,nO,nO))
-  allocate(X_vvvo(nV,nV,nV,nO), X_ovoo(nO,nV,nO,nO), X_vvoo(nV,nV,nO,nO))
-  allocate(T_vvoo(nV,nV,nO,nO), T_ovvo(nO,nV,nV,nO), T_vo(nV,nO))
+  allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
+  allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
 
   ! Temporary arrays
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,T_vvoo,T_ovvo,T_vo,X_vvvo,X_ovoo,X_vvoo, &
+  !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
   !$OMP t1,t2,v_vvvo,v_vooo,v_vvoo) &
   !$OMP PRIVATE(a,b,c,d,i,j,k,l) &
   !$OMP DEFAULT(NONE)
 
   !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
-  !X_vvvo(d,b,a,i) * T_vvoo(d,c,k,j)
+  !X_vovv(d,i,b,a) * T_voov(d,k,j,c)
 
   !$OMP DO collapse(3)
   do i = 1, nO
     do a = 1, nV
       do b = 1, nV
         do d = 1, nV
-          X_vvvo(d,b,a,i) = v_vvvo(b,a,d,i)
+          X_vovv(d,i,b,a) = v_vvvo(b,a,d,i)
         enddo
       enddo
     enddo
@@ -54,7 +54,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
     do k = 1, nO
       do c = 1, nV
         do d = 1, nV
-          T_vvoo(d,c,k,j) = t2(k,j,c,d)
+          T_voov(d,k,j,c) = t2(k,j,c,d)
         enddo
       enddo
     enddo
@@ -62,14 +62,14 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !$OMP END DO nowait
 
   !v_vooo(c,j,k,l) * t2(i,l,a,b) &
-  !X_ovoo(l,c,j,k) * T_ovvo(l,a,b,i) &
+  !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
 
   !$OMP DO collapse(3)
-  do k = 1, nO
-    do j = 1, nO
-      do c = 1, nV
+  do c = 1, nV
+    do k = 1, nO
+      do j = 1, nO
         do l = 1, nO
-           X_ovoo(l,c,j,k) = v_vooo(c,j,k,l)
+           X_ooov(l,j,k,c) = v_vooo(c,j,k,l)
         enddo
       enddo
     enddo
@@ -81,35 +81,27 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
     do b = 1, nV
       do a = 1, nV
         do l = 1, nO
-          T_ovvo(l,a,b,i) = t2(i,l,a,b)
+          T_oovv(l,i,a,b) = t2(i,l,a,b)
         enddo
       enddo
     enddo
   enddo
   !$OMP END DO nowait
 
-  !v_vvoo(b,c,j,k) * t1(i,a) &
-  !X_vvoo(b,c,k,j) * T1_vo(a,i) &
+  !X_oovv(j,k,b,c) * T1_vo(a,i) &
 
   !$OMP DO collapse(3)
-  do j = 1, nO
-    do k = 1, nO
-      do c = 1, nV
-        do b = 1, nV
-          X_vvoo(b,c,k,j) = v_vvoo(b,c,j,k)
+  do c = 1, nV
+    do b = 1, nV
+      do j = 1, nO
+        do k = 1, nO
+          X_oovv(j,k,b,c) = v_vvoo(b,c,j,k)
         enddo
       enddo
     enddo
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO collapse(1)
-  do i = 1, nO
-    do a = 1, nV
-      T_vo(a,i) = t1(i,a)
-    enddo
-  enddo
-  !$OMP END DO
   !$OMP END PARALLEL
 
   call wall_time(ta)
@@ -118,13 +110,13 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
     do b = 1, nV
       do a = 1, nV
         delta_abc = f_v(a) + f_v(b) + f_v(c)
-        call form_w_abc(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc)
-        call form_w_abc(nO,nV,b,c,a,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_bca)
-        call form_w_abc(nO,nV,c,a,b,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_cab)
-        call form_w_abc(nO,nV,c,b,a,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_cba)
+        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
+        call form_w_abc(nO,nV,b,c,a,T_voov,T_oovv,X_vovv,X_ooov,W_bca)
+        call form_w_abc(nO,nV,c,a,b,T_voov,T_oovv,X_vovv,X_ooov,W_cab)
+        call form_w_abc(nO,nV,c,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_cba)
 
-        call form_v_abc(nO,nV,a,b,c,T_vo,X_vvoo,W_abc,V_abc)
-        call form_v_abc(nO,nV,c,b,a,T_vo,X_vvoo,W_cba,V_cba)
+        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc)
+        call form_v_abc(nO,nV,c,b,a,t1,X_oovv,W_cba,V_cba)
         !$OMP PARALLEL                                               &
             !$OMP SHARED(energy,nO,a,b,c,W_abc,W_cab,W_bca,V_abc,V_cba,f_o,f_v,delta_abc)&
             !$OMP PRIVATE(i,j,k,e,delta)                             &
@@ -154,26 +146,26 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   energy = energy / 3d0
 
-  deallocate(W_abc,V_abc,W_cab,V_cba,W_bca,X_vvvo,X_ovoo,T_vvoo,T_ovvo,T_vo)
+  deallocate(W_abc,V_abc,W_cab,V_cba,W_bca,X_vovv,X_ooov,T_voov,T_oovv)
   !deallocate(V,W)
 end
 
 
-subroutine form_w_abc(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc)
+subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
 
   implicit none
 
   integer, intent(in)           :: nO,nV,a,b,c
   !double precision, intent(in) :: t2(nO,nO,nV,nV)
-  double precision, intent(in)  :: T_vvoo(nV,nV,nO,nO), T_ovvo(nO,nV,nV,nO)
-  double precision, intent(in)  :: X_vvvo(nV,nV,nV,nO), X_ovoo(nO,nV,nO,nO)
+  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
   double precision, intent(out) :: W_abc(nO,nO,nO)
 
   integer :: l,i,j,k,d
 
 
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc) &
+  !$OMP SHARED(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc) &
   !$OMP PRIVATE(i,j,k,d,l) &
   !$OMP DEFAULT(NONE)
 
@@ -185,23 +177,23 @@ subroutine form_w_abc(nO,nV,a,b,c,T_vvoo,T_ovvo,X_vvvo,X_ovoo,W_abc)
 
         do d = 1, nV
           W_abc(i,j,k) = W_abc(i,j,k) &
-                 + X_vvvo(d,b,a,i) * T_vvoo(d,c,k,j) &
-                 + X_vvvo(d,c,a,i) * T_vvoo(d,b,j,k) &
-                 + X_vvvo(d,a,c,k) * T_vvoo(d,b,j,i) &
-                 + X_vvvo(d,b,c,k) * T_vvoo(d,a,i,j) &
-                 + X_vvvo(d,c,b,j) * T_vvoo(d,a,i,k) &
-                 + X_vvvo(d,a,b,j) * T_vvoo(d,c,k,i)
+                 + X_vovv(d,i,b,a) * T_voov(d,k,j,c) &
+                 + X_vovv(d,i,c,a) * T_voov(d,j,k,b) &
+                 + X_vovv(d,k,a,c) * T_voov(d,j,i,b) &
+                 + X_vovv(d,k,b,c) * T_voov(d,i,j,a) &
+                 + X_vovv(d,j,c,b) * T_voov(d,i,k,a) &
+                 + X_vovv(d,j,a,b) * T_voov(d,k,i,c)
 
         enddo
 
         do l = 1, nO
           W_abc(i,j,k) = W_abc(i,j,k) &
-              - T_ovvo(l,a,b,i) * X_ovoo(l,c,j,k) &
-              - T_ovvo(l,a,c,i) * X_ovoo(l,b,k,j) & ! bc kj
-              - T_ovvo(l,c,a,k) * X_ovoo(l,b,i,j) & ! prev ac ik
-              - T_ovvo(l,c,b,k) * X_ovoo(l,a,j,i) & ! prev ab ij
-              - T_ovvo(l,b,c,j) * X_ovoo(l,a,k,i) & ! prev bc kj
-              - T_ovvo(l,b,a,j) * X_ovoo(l,c,i,k) ! prev ac ik
+              - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) &
+              - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) & ! bc kj
+              - T_oovv(l,k,c,a) * X_ooov(l,i,j,b) & ! prev ac ik
+              - T_oovv(l,k,c,b) * X_ooov(l,j,i,a) & ! prev ab ij
+              - T_oovv(l,j,b,c) * X_ooov(l,k,i,a) & ! prev bc kj
+              - T_oovv(l,j,b,a) * X_ooov(l,i,k,c) ! prev ac ik
         enddo
 
       enddo
@@ -216,21 +208,21 @@ end
 
 ! V_abc
 
-subroutine form_v_abc(nO,nV,a,b,c,T_vo,X_vvoo,W,V)
+subroutine form_v_abc(nO,nV,a,b,c,T_ov,X_oovv,W,V)
 
 implicit none
 
   integer, intent(in)           :: nO,nV,a,b,c
   !double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(in)  :: T_vo(nV,nO)
-  double precision, intent(in)  :: X_vvoo(nV,nV,nO,nO)
+  double precision, intent(in)  :: T_ov(nO,nV)
+  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
   double precision, intent(in)  :: W(nO,nO,nO)
   double precision, intent(out) :: V(nO,nO,nO)
 
   integer :: i,j,k
 
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,a,b,c,T_vo,X_vvoo,W,V) &
+  !$OMP SHARED(nO,nV,a,b,c,T_ov,X_oovv,W,V) &
   !$OMP PRIVATE(i,j,k) &
   !$OMP DEFAULT(NONE)
   !$OMP DO collapse(2)
@@ -239,9 +231,9 @@ implicit none
       do i = 1, nO
         !V(i,j,k,a,b,c) = V(i,j,k,a,b,c) + W(i,j,k,a,b,c) &
         V(i,j,k) = W(i,j,k) &
-           + X_vvoo(b,c,k,j) * T_vo(a,i) &
-           + X_vvoo(a,c,k,i) * T_vo(b,j) &
-           + X_vvoo(a,b,j,i) * T_vo(c,k)
+           + X_oovv(j,k,b,c) * T_ov(i,a) &
+           + X_oovv(i,k,a,c) * T_ov(j,b) &
+           + X_oovv(i,j,a,b) * T_ov(k,c)
       enddo
     enddo
   enddo
diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f
index 3b43d607..69873bc0 100644
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@@ -1823,41 +1823,39 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 ! U is allocated inside this subroutine
 ! rank is the number of Cholesky vectors depending on tol
 !
-integer :: ndim
-integer, intent(inout)                                        :: rank
-double precision, dimension(ndim, ndim), intent(inout)        :: A
-double precision, dimension(ndim, rank), intent(out)          :: U
-double precision, intent(in)                                  :: tol
+integer                          :: ndim
+integer, intent(inout)           :: rank
+double precision, intent(inout)  :: A(ndim, ndim)
+double precision, intent(out)    :: U(ndim, rank)
+double precision, intent(in)     :: tol
 
 integer, dimension(:), allocatable          :: piv
 double precision, dimension(:), allocatable :: work
 character, parameter :: uplo = "U"
-integer :: N, LDA
+integer :: LDA
 integer :: info
 integer :: k, l, rank0
-external :: dpstrf
 
 rank0 = rank
-N = size(A, dim=1)
-LDA = N
-allocate(piv(N))
-allocate(work(2*N))
-call dpstrf(uplo, N, A, LDA, piv, rank, tol, work, info)
+LDA = ndim
+allocate(piv(ndim))
+allocate(work(2*ndim))
+call dpstrf(uplo, ndim, A, LDA, piv, rank, tol, work, info)
 
 if (rank > rank0) then
   print *, 'Bug: rank > rank0 in pivoted cholesky. Increase rank before calling'
   stop
 end if
 
-do k = 1, N
-  A(k+1:, k) = 0.00D+0
+do k = 1, ndim
+  A(k+1:ndim, k) = 0.00D+0
 end do
 ! TODO: It should be possible to use only one vector of size (1:rank) as a buffer
 ! to do the swapping in-place
 U(:,:) = 0.00D+0
-do k = 1, N
+do k = 1, ndim
   l = piv(k)
-  U(l, :) = A(1:rank, k)
+  U(l, 1:rank) = A(1:rank, k)
 end do
 
 end subroutine pivoted_cholesky

From ca5857ac3630a452199bb25b29eed04e8674e6b3 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 13 May 2023 15:34:16 +0200
Subject: [PATCH 04/79] Added dgemm in ccsd_t_space_orb_abc.irp.f

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 92 +++++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index acc2aaa9..e960d47d 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -162,7 +162,86 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
   double precision, intent(out) :: W_abc(nO,nO,nO)
 
   integer :: l,i,j,k,d
+  double precision, allocatable, dimension(:,:,:) :: W_ikj, X
 
+  allocate(W_ikj(nO,nO,nO))
+  allocate(X(nV,nO,nO))
+
+  W_abc = 0.d0
+  W_ikj = 0.d0
+
+!   X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
+       X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
+
+!   T_voov(d,i,j,a) * X_vovv(d,k,b,c) : ij k
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
+       T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_abc, nO*nO)
+
+!  T_voov(d,k,i,c) * X_vovv(d,j,a,b) : ki j
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,k,d)
+  do k=1,nO
+    do i=1,nO
+      do d=1,nV
+        X(d,i,k) = T_voov(d,k,i,c)
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
+       X(1,1,1), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj, nO*nO)
+
+!   X_vovv(d,k,a,c) * T_voov(d,j,i,b) : k ji
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,k,d)
+  do j=1,nO
+    do i=1,nO
+      do d=1,nV
+        X(d,i,j) = T_voov(d,j,i,b)
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
+       X(1,1,1), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
+
+!   T_voov(d,i,k,a) * X_vovv(d,j,c,b) : ik j
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
+       T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_ikj, nO*nO)
+
+!   X_vovv(d,i,b,a) * T_voov(d,k,j,c) : i kj
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,k,d)
+  do k=1,nO
+    do j=1,nO
+      do d=1,nV
+        X(d,j,k) = T_voov(d,k,j,c)
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
+       X_vovv(1,1,b,a), nV, X(1,1,1), nV, 1.d0, W_abc, nO)
+
+
+
+!   - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) : i jk
+!   - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) : i kj
+!   - T_oovv(l,k,c,a) * X_ooov(l,i,j,b) : k ij
+!   - T_oovv(l,k,c,b) * X_ooov(l,j,i,a) : k ji
+!   - T_oovv(l,j,b,c) * X_ooov(l,k,i,a) : j ki
+!   - T_oovv(l,j,b,a) * X_ooov(l,i,k,c) : j ik
+
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j,k)
+  do k=1,nO
+    do j=1,nO
+      do i=1,nO
+        W_abc(i,j,k) = W_abc(i,j,k) + W_ikj(i,k,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
 
   !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc) &
@@ -173,18 +252,6 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
   do k = 1, nO
     do j = 1, nO
       do i = 1, nO
-        W_abc(i,j,k) = 0.d0
-
-        do d = 1, nV
-          W_abc(i,j,k) = W_abc(i,j,k) &
-                 + X_vovv(d,i,b,a) * T_voov(d,k,j,c) &
-                 + X_vovv(d,i,c,a) * T_voov(d,j,k,b) &
-                 + X_vovv(d,k,a,c) * T_voov(d,j,i,b) &
-                 + X_vovv(d,k,b,c) * T_voov(d,i,j,a) &
-                 + X_vovv(d,j,c,b) * T_voov(d,i,k,a) &
-                 + X_vovv(d,j,a,b) * T_voov(d,k,i,c)
-
-        enddo
 
         do l = 1, nO
           W_abc(i,j,k) = W_abc(i,j,k) &
@@ -202,7 +269,6 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
   !$OMP END DO
   !$OMP END PARALLEL
 
-
 end
 
 

From 1c0141d9a2be1b8025c76a178c81559b63432121 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 13 May 2023 21:25:49 +0200
Subject: [PATCH 05/79] Full dgemm in ccsd_t_space_orb_abc.irp.f

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 140 ++++++++++++----------------
 1 file changed, 62 insertions(+), 78 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index e960d47d..c5c15fb3 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -162,78 +162,97 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
   double precision, intent(out) :: W_abc(nO,nO,nO)
 
   integer :: l,i,j,k,d
-  double precision, allocatable, dimension(:,:,:) :: W_ikj, X
+  double precision, allocatable, dimension(:,:,:) :: W_ikj
+  double precision, allocatable :: X(:,:,:,:)
 
   allocate(W_ikj(nO,nO,nO))
-  allocate(X(nV,nO,nO))
+  allocate(X(nV,nO,nO,2))
 
-  W_abc = 0.d0
-  W_ikj = 0.d0
+  do k=1,nO
+    do i=1,nO
+      do d=1,nV
+        X(d,i,k,1) = T_voov(d,k,i,c)
+!        X(d,i,j,2) = T_voov(d,j,i,b)
+        X(d,i,k,2) = T_voov(d,k,i,b)
+!        X(d,j,k,1) = T_voov(d,k,j,c)
+      enddo
+    enddo
+  enddo
 
 !   X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk
+
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
        X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
 
 !   T_voov(d,i,j,a) * X_vovv(d,k,b,c) : ij k
+
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
        T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_abc, nO*nO)
 
-!  T_voov(d,k,i,c) * X_vovv(d,j,a,b) : ki j
-  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,k,d)
-  do k=1,nO
-    do i=1,nO
-      do d=1,nV
-        X(d,i,k) = T_voov(d,k,i,c)
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
-       X(1,1,1), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj, nO*nO)
-
 !   X_vovv(d,k,a,c) * T_voov(d,j,i,b) : k ji
-  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,k,d)
-  do j=1,nO
-    do i=1,nO
-      do d=1,nV
-        X(d,i,j) = T_voov(d,j,i,b)
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
 
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
-       X(1,1,1), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
+       X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
+
+!   X_vovv(d,i,b,a) * T_voov(d,k,j,c) : i kj
+
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
+       X_vovv(1,1,b,a), nV, X(1,1,1,1), nV, 1.d0, W_abc, nO)
+
+!  T_voov(d,k,i,c) * X_vovv(d,j,a,b) : ki j
+
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
+       X(1,1,1,1), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj, nO*nO)
 
 !   T_voov(d,i,k,a) * X_vovv(d,j,c,b) : ik j
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
        T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_ikj, nO*nO)
 
-!   X_vovv(d,i,b,a) * T_voov(d,k,j,c) : i kj
-  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,k,d)
+  deallocate(X)
+
+  allocate(X(nO,nO,nO,2))
+
   do k=1,nO
     do j=1,nO
-      do d=1,nV
-        X(d,j,k) = T_voov(d,k,j,c)
+      do l=1,nO
+        X(l,j,k,1) = X_ooov(l,k,j,b)
+!        X(l,i,j,2) = X_ooov(l,j,i,a)
+        X(l,j,k,2) = X_ooov(l,k,j,a)
+!        X(l,i,k,2) = X_ooov(l,k,i,a)
       enddo
     enddo
   enddo
-  !$OMP END PARALLEL DO
-
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
-       X_vovv(1,1,b,a), nV, X(1,1,1), nV, 1.d0, W_abc, nO)
-
 
 
 !   - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) : i jk
-!   - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) : i kj
-!   - T_oovv(l,k,c,a) * X_ooov(l,i,j,b) : k ij
-!   - T_oovv(l,k,c,b) * X_ooov(l,j,i,a) : k ji
-!   - T_oovv(l,j,b,c) * X_ooov(l,k,i,a) : j ki
-!   - T_oovv(l,j,b,a) * X_ooov(l,i,k,c) : j ik
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, &
+       T_oovv(1,1,a,b), nO, X_ooov(1,1,1,c), nO, 1.d0, W_abc, nO)
+
+!   - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) : i kj
+
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, &
+       T_oovv(1,1,a,c), nO, X(1,1,1,1), nO, 1.d0, W_abc, nO)
+
+!   - X_ooov(l,i,j,b) * T_oovv(l,k,c,a) : ij k 
+
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
+       X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_abc, nO*nO)
+
+!   - X_ooov(l,j,i,a) * T_oovv(l,k,c,b) : ji k
+
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
+       X(1,1,1,2), nO, T_oovv(1,1,c,b), nO, 1.d0, W_abc, nO*nO)
+
+!   - X_ooov(l,k,i,a) * T_oovv(l,j,b,c) : ki j
+
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
+       X(1,1,1,2), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj, nO*nO)
+
+!   - X_ooov(l,i,k,c) * T_oovv(l,j,b,a) : ik j
+
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
+       X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj, nO*nO)
 
-  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j,k)
   do k=1,nO
     do j=1,nO
       do i=1,nO
@@ -241,33 +260,6 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
       enddo
     enddo
   enddo
-  !$OMP END PARALLEL DO
-
-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc) &
-  !$OMP PRIVATE(i,j,k,d,l) &
-  !$OMP DEFAULT(NONE)
-
-  !$OMP DO collapse(3)
-  do k = 1, nO
-    do j = 1, nO
-      do i = 1, nO
-
-        do l = 1, nO
-          W_abc(i,j,k) = W_abc(i,j,k) &
-              - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) &
-              - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) & ! bc kj
-              - T_oovv(l,k,c,a) * X_ooov(l,i,j,b) & ! prev ac ik
-              - T_oovv(l,k,c,b) * X_ooov(l,j,i,a) & ! prev ab ij
-              - T_oovv(l,j,b,c) * X_ooov(l,k,i,a) & ! prev bc kj
-              - T_oovv(l,j,b,a) * X_ooov(l,i,k,c) ! prev ac ik
-        enddo
-
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
 
 end
 
@@ -287,15 +279,9 @@ implicit none
 
   integer :: i,j,k
 
-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,a,b,c,T_ov,X_oovv,W,V) &
-  !$OMP PRIVATE(i,j,k) &
-  !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(2)
   do k = 1, nO
     do j = 1, nO
       do i = 1, nO
-        !V(i,j,k,a,b,c) = V(i,j,k,a,b,c) + W(i,j,k,a,b,c) &
         V(i,j,k) = W(i,j,k) &
            + X_oovv(j,k,b,c) * T_ov(i,a) &
            + X_oovv(i,k,a,c) * T_ov(j,b) &
@@ -303,8 +289,6 @@ implicit none
       enddo
     enddo
   enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
 
 end
 

From 2ff4e61c9e283890d5c1819c034b788487f08405 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 13 May 2023 21:48:04 +0200
Subject: [PATCH 06/79] Better parallelism in (T)

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 61 ++++++++++++++---------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index c5c15fb3..8b6db915 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -14,19 +14,17 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, allocatable :: V(:,:,:,:,:,:)
   double precision, allocatable :: W_abc(:,:,:), V_abc(:,:,:)
   double precision, allocatable :: W_cab(:,:,:), W_cba(:,:,:)
-  double precision, allocatable :: W_bca(:,:,:), V_cba(:,:,:)
+  double precision, allocatable :: W_bca(:,:,:)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
   double precision              :: e,ta,tb, delta, delta_abc
 
-  !allocate(W(nV,nV,nV,nO,nO,nO))
-  !allocate(V(nV,nV,nV,nO,nO,nO))
-  allocate(W_abc(nO,nO,nO), V_abc(nO,nO,nO), W_cab(nO,nO,nO))
-  allocate(W_bca(nO,nO,nO), V_cba(nO,nO,nO), W_cba(nO,nO,nO))
   allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
   allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
 
+  call set_multiple_levels_omp(.False.)
+
   ! Temporary arrays
   !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
@@ -104,50 +102,48 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   !$OMP END PARALLEL
 
-  call wall_time(ta)
   energy = 0d0
+  !$OMP PARALLEL                                               &
+  !$OMP PRIVATE(a,b,c,W_abc,W_cab,W_bca,W_cba,V_abc)           &
+  !$OMP PRIVATE(i,j,k,e,delta,delta_abc)                       &
+  !$OMP DEFAULT(SHARED)
+  allocate(W_abc(nO,nO,nO), W_cab(nO,nO,nO), V_abc(nO,nO,nO), &
+           W_bca(nO,nO,nO), W_cba(nO,nO,nO) )
+  !$OMP DO
   do c = 1, nV
     do b = 1, nV
       do a = 1, nV
+        e = 0d0
         delta_abc = f_v(a) + f_v(b) + f_v(c)
         call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
+        call form_w_abc(nO,nV,c,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_cba)
         call form_w_abc(nO,nV,b,c,a,T_voov,T_oovv,X_vovv,X_ooov,W_bca)
         call form_w_abc(nO,nV,c,a,b,T_voov,T_oovv,X_vovv,X_ooov,W_cab)
-        call form_w_abc(nO,nV,c,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_cba)
 
-        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc)
-        call form_v_abc(nO,nV,c,b,a,t1,X_oovv,W_cba,V_cba)
-        !$OMP PARALLEL                                               &
-            !$OMP SHARED(energy,nO,a,b,c,W_abc,W_cab,W_bca,V_abc,V_cba,f_o,f_v,delta_abc)&
-            !$OMP PRIVATE(i,j,k,e,delta)                             &
-            !$OMP DEFAULT(NONE)
-        e = 0d0
-        !$OMP DO
+        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba)
         do i = 1, nO
           do j = 1, nO
             do k = 1, nO
               delta = 1d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
-              !energy = energy + (4d0 * W(i,j,k,a,b,c) + W(i,j,k,b,c,a) + W(i,j,k,c,a,b)) * (V(i,j,k,a,b,c) - V(i,j,k,c,b,a)) / (cc_space_f_o(i) + cc_space_f_o(j) + cc_space_f_o(k) - cc_space_f_v(a) - cc_space_f_v(b) - cc_space_f_v(c))  !delta_ooovvv(i,j,k,a,b,c)
               e = e + (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k))&
-                  * (V_abc(i,j,k) - V_cba(i,j,k)) * delta
+                  * V_abc(i,j,k) * delta
             enddo
           enddo
         enddo
-        !$OMP END DO NOWAIT
         !$OMP CRITICAL
         energy = energy + e
         !$OMP END CRITICAL
-        !$OMP END PARALLEL
       enddo
     enddo
-    call wall_time(tb)
-    write(*,'(F12.2,A5,F12.2,A2)') dble(i)/dble(nO)*100d0, '% in ', tb - ta, ' s'
   enddo
+  !$OMP END DO
 
-  energy = energy / 3d0
+  deallocate(W_abc,V_abc,W_cab,W_bca,W_cba)
+  !$OMP END PARALLEL
 
-  deallocate(W_abc,V_abc,W_cab,V_cba,W_bca,X_vovv,X_ooov,T_voov,T_oovv)
-  !deallocate(V,W)
+  energy = energy / 3.d0
+
+  deallocate(X_vovv,X_ooov,T_voov,T_oovv)
 end
 
 
@@ -233,7 +229,7 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
   call dgemm('T','N', nO, nO*nO, nO, -1.d0, &
        T_oovv(1,1,a,c), nO, X(1,1,1,1), nO, 1.d0, W_abc, nO)
 
-!   - X_ooov(l,i,j,b) * T_oovv(l,k,c,a) : ij k 
+!   - X_ooov(l,i,j,b) * T_oovv(l,k,c,a) : ij k
 
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
        X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_abc, nO*nO)
@@ -261,31 +257,34 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
     enddo
   enddo
 
+  deallocate(X,W_ikj)
 end
 
 
 ! V_abc
 
-subroutine form_v_abc(nO,nV,a,b,c,T_ov,X_oovv,W,V)
+subroutine form_v_abc(nO,nV,a,b,c,T_ov,X_oovv,W_abc,V_abc,W_cba)
 
 implicit none
 
   integer, intent(in)           :: nO,nV,a,b,c
-  !double precision, intent(in)  :: t1(nO,nV)
   double precision, intent(in)  :: T_ov(nO,nV)
   double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: W(nO,nO,nO)
-  double precision, intent(out) :: V(nO,nO,nO)
+  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cba(nO,nO,nO)
+  double precision, intent(out) :: V_abc(nO,nO,nO)
 
   integer :: i,j,k
 
   do k = 1, nO
     do j = 1, nO
       do i = 1, nO
-        V(i,j,k) = W(i,j,k) &
+        V_abc(i,j,k) = W_abc(i,j,k) - W_cba(i,j,k) &
            + X_oovv(j,k,b,c) * T_ov(i,a) &
            + X_oovv(i,k,a,c) * T_ov(j,b) &
-           + X_oovv(i,j,a,b) * T_ov(k,c)
+           + X_oovv(i,j,a,b) * T_ov(k,c) &
+           - X_oovv(j,k,b,a) * T_ov(i,c) &
+           - X_oovv(i,k,c,a) * T_ov(j,b) &
+           - X_oovv(i,j,c,b) * T_ov(k,a)
       enddo
     enddo
   enddo

From c18bea7e817af0142e2fd76577c9f7d90a39e533 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 13 May 2023 22:23:08 +0200
Subject: [PATCH 07/79] Merged 4 calls

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 109 ++++++++++++++++------------
 1 file changed, 64 insertions(+), 45 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 8b6db915..7f334a37 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -115,11 +115,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
       do a = 1, nV
         e = 0d0
         delta_abc = f_v(a) + f_v(b) + f_v(c)
-        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
-        call form_w_abc(nO,nV,c,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_cba)
-        call form_w_abc(nO,nV,b,c,a,T_voov,T_oovv,X_vovv,X_ooov,W_bca)
-        call form_w_abc(nO,nV,c,a,b,T_voov,T_oovv,X_vovv,X_ooov,W_cab)
-
+        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab)
         call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba)
         do i = 1, nO
           do j = 1, nO
@@ -147,112 +143,135 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 end
 
 
-subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc)
+subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab)
 
   implicit none
 
   integer, intent(in)           :: nO,nV,a,b,c
-  !double precision, intent(in) :: t2(nO,nO,nV,nV)
   double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
   double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
   double precision, intent(out) :: W_abc(nO,nO,nO)
+  double precision, intent(out) :: W_cba(nO,nO,nO)
+  double precision, intent(out) :: W_bca(nO,nO,nO)
+  double precision, intent(out) :: W_cab(nO,nO,nO)
 
   integer :: l,i,j,k,d
-  double precision, allocatable, dimension(:,:,:) :: W_ikj
+  double precision, allocatable, dimension(:,:,:,:) :: W_ikj
   double precision, allocatable :: X(:,:,:,:)
 
-  allocate(W_ikj(nO,nO,nO))
-  allocate(X(nV,nO,nO,2))
+  allocate(W_ikj(nO,nO,nO,4))
+  allocate(X(nV,nO,nO,3))
 
   do k=1,nO
     do i=1,nO
       do d=1,nV
         X(d,i,k,1) = T_voov(d,k,i,c)
-!        X(d,i,j,2) = T_voov(d,j,i,b)
         X(d,i,k,2) = T_voov(d,k,i,b)
-!        X(d,j,k,1) = T_voov(d,k,j,c)
+        X(d,i,k,3) = T_voov(d,k,i,a)
       enddo
     enddo
   enddo
 
 !   X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk
 
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
-       X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, T_voov(1,1,1,b), nV, 0.d0, W_cba, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,b), nV, T_voov(1,1,1,c), nV, 0.d0, W_bca, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, T_voov(1,1,1,a), nV, 0.d0, W_cab, nO)
 
 !   T_voov(d,i,j,a) * X_vovv(d,k,b,c) : ij k
 
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
-       T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_cba, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,c,a), nV, 1.d0, W_bca, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_cab, nO*nO)
+
 
 !   X_vovv(d,k,a,c) * T_voov(d,j,i,b) : k ji
 
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
-       X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 1.d0, W_cba, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,b,a), nV, 1.d0, W_bca, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,c,b), nV, 1.d0, W_cab, nO*nO)
 
 !   X_vovv(d,i,b,a) * T_voov(d,k,j,c) : i kj
 
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, &
-       X_vovv(1,1,b,a), nV, X(1,1,1,1), nV, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, X(1,1,1,1), nV, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, X(1,1,1,3), nV, 1.d0, W_cba, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,b), nV, X(1,1,1,3), nV, 1.d0, W_bca, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, X(1,1,1,2), nV, 1.d0, W_cab, nO)
 
 !  T_voov(d,k,i,c) * X_vovv(d,j,a,b) : ki j
 
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
-       X(1,1,1,1), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj(1,1,1,1), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,c,b), nV, 0.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,b,c), nV, 0.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 0.d0, W_ikj(1,1,1,4), nO*nO)
 
 !   T_voov(d,i,k,a) * X_vovv(d,j,c,b) : ik j
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, &
-       T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_ikj, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_ikj(1,1,1,1), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,a,c), nV, 1.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_ikj(1,1,1,4), nO*nO)
 
   deallocate(X)
 
-  allocate(X(nO,nO,nO,2))
+  allocate(X(nO,nO,nO,3))
 
   do k=1,nO
     do j=1,nO
       do l=1,nO
         X(l,j,k,1) = X_ooov(l,k,j,b)
-!        X(l,i,j,2) = X_ooov(l,j,i,a)
         X(l,j,k,2) = X_ooov(l,k,j,a)
-!        X(l,i,k,2) = X_ooov(l,k,i,a)
+        X(l,j,k,3) = X_ooov(l,k,j,c)
       enddo
     enddo
   enddo
 
 
 !   - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) : i jk
-  call dgemm('T','N', nO, nO*nO, nO, -1.d0, &
-       T_oovv(1,1,a,b), nO, X_ooov(1,1,1,c), nO, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,b), nO, X_ooov(1,1,1,c), nO, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X_ooov(1,1,1,a), nO, 1.d0, W_cba, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,c), nO, X_ooov(1,1,1,a), nO, 1.d0, W_bca, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X_ooov(1,1,1,b), nO, 1.d0, W_cab, nO)
 
 !   - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) : i kj
-
-  call dgemm('T','N', nO, nO*nO, nO, -1.d0, &
-       T_oovv(1,1,a,c), nO, X(1,1,1,1), nO, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,c), nO, X(1,1,1,1), nO, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X(1,1,1,1), nO, 1.d0, W_cba, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,a), nO, X(1,1,1,3), nO, 1.d0, W_bca, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X(1,1,1,2), nO, 1.d0, W_cab, nO)
 
 !   - X_ooov(l,i,j,b) * T_oovv(l,k,c,a) : ij k
-
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
-       X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_cba, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,a,b), nO, 1.d0, W_bca, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_cab, nO*nO)
 
 !   - X_ooov(l,j,i,a) * T_oovv(l,k,c,b) : ji k
-
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
-       X(1,1,1,2), nO, T_oovv(1,1,c,b), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,c,b), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_cba, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,a,c), nO, 1.d0, W_bca, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_cab, nO*nO)
 
 !   - X_ooov(l,k,i,a) * T_oovv(l,j,b,c) : ki j
-
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
-       X(1,1,1,2), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,c,a), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
 
 !   - X_ooov(l,i,k,c) * T_oovv(l,j,b,a) : ik j
-
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, &
-       X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,c,b), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
 
   do k=1,nO
     do j=1,nO
       do i=1,nO
-        W_abc(i,j,k) = W_abc(i,j,k) + W_ikj(i,k,j)
+        W_abc(i,j,k) = W_abc(i,j,k) + W_ikj(i,k,j,1)
+        W_cba(i,j,k) = W_cba(i,j,k) + W_ikj(i,k,j,2)
+        W_bca(i,j,k) = W_bca(i,j,k) + W_ikj(i,k,j,3)
+        W_cab(i,j,k) = W_cab(i,j,k) + W_ikj(i,k,j,4)
       enddo
     enddo
   enddo

From cad1da1768b7ab3d9a93b6d6439a0bb414fb8ab7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 13 May 2023 23:29:58 +0200
Subject: [PATCH 08/79] All permutations in ccsd_t_space_orb_abc.irp.f

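---
Review note (ignored by git am): in the first dgemm group of form_w_abc
("X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk"), the new W_bac call passes
X_vovv(1,1,b,a) together with T_voov(1,1,1,a).  Swapping a<->b in the
W_abc call (X_vovv(1,1,c,a) with T_voov(1,1,1,b)) gives X_vovv(1,1,c,b)
with T_voov(1,1,1,a).  The other dgemm calls added by this patch match
their index permutation, so the X_vovv(1,1,b,a) argument for W_bac looks
like a typo for X_vovv(1,1,c,b) -- please confirm before W_bac is used
in the energy expression.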
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 168 +++++++++++++++++++---------
 1 file changed, 114 insertions(+), 54 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 7f334a37..65a04549 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -12,9 +12,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   double precision, allocatable :: W(:,:,:,:,:,:)
   double precision, allocatable :: V(:,:,:,:,:,:)
-  double precision, allocatable :: W_abc(:,:,:), V_abc(:,:,:)
-  double precision, allocatable :: W_cab(:,:,:), W_cba(:,:,:)
-  double precision, allocatable :: W_bca(:,:,:)
+  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
+  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
+  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
+  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
@@ -103,26 +104,30 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !$OMP END PARALLEL
 
   energy = 0d0
-  !$OMP PARALLEL                                               &
-  !$OMP PRIVATE(a,b,c,W_abc,W_cab,W_bca,W_cba,V_abc)           &
-  !$OMP PRIVATE(i,j,k,e,delta,delta_abc)                       &
-  !$OMP DEFAULT(SHARED)
-  allocate(W_abc(nO,nO,nO), W_cab(nO,nO,nO), V_abc(nO,nO,nO), &
-           W_bca(nO,nO,nO), W_cba(nO,nO,nO) )
+  !$OMP PARALLEL                                                     &
+      !$OMP PRIVATE(a,b,c)                                           &
+      !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
+      !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
+      !$OMP PRIVATE(i,j,k,e,delta,delta_abc)                         &
+      !$OMP DEFAULT(SHARED)
+  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
+            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
+            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
+            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
   !$OMP DO
   do c = 1, nV
     do b = 1, nV
       do a = 1, nV
         e = 0d0
         delta_abc = f_v(a) + f_v(b) + f_v(c)
-        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab)
-        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba)
+        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
+        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
         do i = 1, nO
           do j = 1, nO
             do k = 1, nO
               delta = 1d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
               e = e + (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k))&
-                  * V_abc(i,j,k) * delta
+                  * (V_abc(i,j,k) - V_cba(i,j,k)) * delta
             enddo
           enddo
         enddo
@@ -134,7 +139,9 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO
 
-  deallocate(W_abc,V_abc,W_cab,W_bca,W_cba)
+  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
+             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
+
   !$OMP END PARALLEL
 
   energy = energy / 3.d0
@@ -143,7 +150,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 end
 
 
-subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab)
+subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
 
   implicit none
 
@@ -154,20 +161,22 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,
   double precision, intent(out) :: W_cba(nO,nO,nO)
   double precision, intent(out) :: W_bca(nO,nO,nO)
   double precision, intent(out) :: W_cab(nO,nO,nO)
+  double precision, intent(out) :: W_bac(nO,nO,nO)
+  double precision, intent(out) :: W_acb(nO,nO,nO)
 
   integer :: l,i,j,k,d
   double precision, allocatable, dimension(:,:,:,:) :: W_ikj
   double precision, allocatable :: X(:,:,:,:)
 
-  allocate(W_ikj(nO,nO,nO,4))
+  allocate(W_ikj(nO,nO,nO,6))
   allocate(X(nV,nO,nO,3))
 
   do k=1,nO
     do i=1,nO
       do d=1,nV
-        X(d,i,k,1) = T_voov(d,k,i,c)
+        X(d,i,k,1) = T_voov(d,k,i,a)
         X(d,i,k,2) = T_voov(d,k,i,b)
-        X(d,i,k,3) = T_voov(d,k,i,a)
+        X(d,i,k,3) = T_voov(d,k,i,c)
       enddo
     enddo
   enddo
@@ -175,44 +184,56 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,
 !   X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk
 
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, T_voov(1,1,1,a), nV, 0.d0, W_bac, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, T_voov(1,1,1,b), nV, 0.d0, W_cba, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,b), nV, T_voov(1,1,1,c), nV, 0.d0, W_bca, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, T_voov(1,1,1,a), nV, 0.d0, W_cab, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, T_voov(1,1,1,c), nV, 0.d0, W_acb, nO)
 
 !   T_voov(d,i,j,a) * X_vovv(d,k,b,c) : ij k
 
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,a,c), nV, 1.d0, W_bac, nO*nO)
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_cba, nO*nO)
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,c,a), nV, 1.d0, W_bca, nO*nO)
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_cab, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_acb, nO*nO)
 
 
 !   X_vovv(d,k,a,c) * T_voov(d,j,i,b) : k ji
 
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,b,c), nV, 1.d0, W_bac, nO*nO)
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 1.d0, W_cba, nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,b,a), nV, 1.d0, W_bca, nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,c,b), nV, 1.d0, W_cab, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,b,a), nV, 1.d0, W_bca, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,c,b), nV, 1.d0, W_cab, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,a,b), nV, 1.d0, W_acb, nO*nO)
 
 !   X_vovv(d,i,b,a) * T_voov(d,k,j,c) : i kj
 
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, X(1,1,1,1), nV, 1.d0, W_abc, nO)
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, X(1,1,1,3), nV, 1.d0, W_cba, nO)
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,b), nV, X(1,1,1,3), nV, 1.d0, W_bca, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, X(1,1,1,3), nV, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,b), nV, X(1,1,1,3), nV, 1.d0, W_bac, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, X(1,1,1,1), nV, 1.d0, W_cba, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,b), nV, X(1,1,1,1), nV, 1.d0, W_bca, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, X(1,1,1,2), nV, 1.d0, W_cab, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,a), nV, X(1,1,1,2), nV, 1.d0, W_acb, nO)
 
 !  T_voov(d,k,i,c) * X_vovv(d,j,a,b) : ki j
 
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj(1,1,1,1), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,c,b), nV, 0.d0, W_ikj(1,1,1,2), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,b,c), nV, 0.d0, W_ikj(1,1,1,3), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 0.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,a,b), nV, 0.d0, W_ikj(1,1,1,1), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,3), nV, X_vovv(1,1,b,a), nV, 0.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,c,b), nV, 0.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,1), nV, X_vovv(1,1,b,c), nV, 0.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,c,a), nV, 0.d0, W_ikj(1,1,1,5), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, X(1,1,1,2), nV, X_vovv(1,1,a,c), nV, 0.d0, W_ikj(1,1,1,6), nO*nO)
 
 !   T_voov(d,i,k,a) * X_vovv(d,j,c,b) : ik j
   call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,c,b), nV, 1.d0, W_ikj(1,1,1,1), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_ikj(1,1,1,2), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,a,c), nV, 1.d0, W_ikj(1,1,1,3), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,c,a), nV, 1.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,a,b), nV, 1.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,b), nV, X_vovv(1,1,a,c), nV, 1.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,c), nV, X_vovv(1,1,b,a), nV, 1.d0, W_ikj(1,1,1,5), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nV, 1.d0, T_voov(1,1,1,a), nV, X_vovv(1,1,b,c), nV, 1.d0, W_ikj(1,1,1,6), nO*nO)
 
   deallocate(X)
 
@@ -221,8 +242,8 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,
   do k=1,nO
     do j=1,nO
       do l=1,nO
-        X(l,j,k,1) = X_ooov(l,k,j,b)
-        X(l,j,k,2) = X_ooov(l,k,j,a)
+        X(l,j,k,1) = X_ooov(l,k,j,a)
+        X(l,j,k,2) = X_ooov(l,k,j,b)
         X(l,j,k,3) = X_ooov(l,k,j,c)
       enddo
     enddo
@@ -231,47 +252,61 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,
 
 !   - T_oovv(l,i,a,b) * X_ooov(l,j,k,c) : i jk
   call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,b), nO, X_ooov(1,1,1,c), nO, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,a), nO, X_ooov(1,1,1,c), nO, 1.d0, W_bac, nO)
   call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X_ooov(1,1,1,a), nO, 1.d0, W_cba, nO)
   call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,c), nO, X_ooov(1,1,1,a), nO, 1.d0, W_bca, nO)
   call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X_ooov(1,1,1,b), nO, 1.d0, W_cab, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,c), nO, X_ooov(1,1,1,b), nO, 1.d0, W_acb, nO)
 
 !   - T_oovv(l,i,a,c) * X_ooov(l,k,j,b) : i kj
-  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,c), nO, X(1,1,1,1), nO, 1.d0, W_abc, nO)
-  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X(1,1,1,1), nO, 1.d0, W_cba, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,c), nO, X(1,1,1,2), nO, 1.d0, W_abc, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,c), nO, X(1,1,1,1), nO, 1.d0, W_bac, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,a), nO, X(1,1,1,2), nO, 1.d0, W_cba, nO)
   call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,b,a), nO, X(1,1,1,3), nO, 1.d0, W_bca, nO)
-  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X(1,1,1,2), nO, 1.d0, W_cab, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,c,b), nO, X(1,1,1,1), nO, 1.d0, W_cab, nO)
+  call dgemm('T','N', nO, nO*nO, nO, -1.d0, T_oovv(1,1,a,b), nO, X(1,1,1,3), nO, 1.d0, W_acb, nO)
 
 !   - X_ooov(l,i,j,b) * T_oovv(l,k,c,a) : ij k
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,c,b), nO, 1.d0, W_bac, nO*nO)
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_cba, nO*nO)
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,a,b), nO, 1.d0, W_bca, nO*nO)
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_cab, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_acb, nO*nO)
 
 !   - X_ooov(l,j,i,a) * T_oovv(l,k,c,b) : ji k
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,c,b), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,c,b), nO, 1.d0, W_abc, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,c,a), nO, 1.d0, W_bac, nO*nO)
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_cba, nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,a,c), nO, 1.d0, W_bca, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,a,c), nO, 1.d0, W_bca, nO*nO)
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_cab, nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,b,c), nO, 1.d0, W_acb, nO*nO)
 
 !   - X_ooov(l,k,i,a) * T_oovv(l,j,b,c) : ki j
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,c,a), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,a,c), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,2), nO, T_oovv(1,1,c,a), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,3), nO, T_oovv(1,1,a,b), nO, 1.d0, W_ikj(1,1,1,5), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X(1,1,1,1), nO, T_oovv(1,1,c,b), nO, 1.d0, W_ikj(1,1,1,6), nO*nO)
 
 !   - X_ooov(l,i,k,c) * T_oovv(l,j,b,a) : ik j
   call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,b,a), nO, 1.d0, W_ikj(1,1,1,1), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,c,b), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
-  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,c), nO, T_oovv(1,1,a,b), nO, 1.d0, W_ikj(1,1,1,2), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,b,c), nO, 1.d0, W_ikj(1,1,1,3), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,a), nO, T_oovv(1,1,c,b), nO, 1.d0, W_ikj(1,1,1,4), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,a,c), nO, 1.d0, W_ikj(1,1,1,5), nO*nO)
+  call dgemm('T','N', nO*nO, nO, nO, -1.d0, X_ooov(1,1,1,b), nO, T_oovv(1,1,c,a), nO, 1.d0, W_ikj(1,1,1,6), nO*nO)
 
   do k=1,nO
     do j=1,nO
       do i=1,nO
         W_abc(i,j,k) = W_abc(i,j,k) + W_ikj(i,k,j,1)
-        W_cba(i,j,k) = W_cba(i,j,k) + W_ikj(i,k,j,2)
-        W_bca(i,j,k) = W_bca(i,j,k) + W_ikj(i,k,j,3)
-        W_cab(i,j,k) = W_cab(i,j,k) + W_ikj(i,k,j,4)
+        W_bac(i,j,k) = W_bac(i,j,k) + W_ikj(i,k,j,2)
+        W_cba(i,j,k) = W_cba(i,j,k) + W_ikj(i,k,j,3)
+        W_bca(i,j,k) = W_bca(i,j,k) + W_ikj(i,k,j,4)
+        W_cab(i,j,k) = W_cab(i,j,k) + W_ikj(i,k,j,5)
+        W_acb(i,j,k) = W_acb(i,j,k) + W_ikj(i,k,j,6)
       enddo
     enddo
   enddo
@@ -282,28 +317,53 @@ end
 
 ! V_abc
 
-subroutine form_v_abc(nO,nV,a,b,c,T_ov,X_oovv,W_abc,V_abc,W_cba)
+subroutine form_v_abc(nO,nV,a,b,c,T_ov,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
 
 implicit none
 
   integer, intent(in)           :: nO,nV,a,b,c
   double precision, intent(in)  :: T_ov(nO,nV)
   double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cba(nO,nO,nO)
-  double precision, intent(out) :: V_abc(nO,nO,nO)
+  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
+  double precision, intent(in)  :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
+  double precision, intent(out) :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
+  double precision, intent(out) :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
 
   integer :: i,j,k
 
   do k = 1, nO
     do j = 1, nO
       do i = 1, nO
-        V_abc(i,j,k) = W_abc(i,j,k) - W_cba(i,j,k) &
+        V_abc(i,j,k) = W_abc(i,j,k) &
            + X_oovv(j,k,b,c) * T_ov(i,a) &
            + X_oovv(i,k,a,c) * T_ov(j,b) &
-           + X_oovv(i,j,a,b) * T_ov(k,c) &
-           - X_oovv(j,k,b,a) * T_ov(i,c) &
-           - X_oovv(i,k,c,a) * T_ov(j,b) &
-           - X_oovv(i,j,c,b) * T_ov(k,a)
+           + X_oovv(i,j,a,b) * T_ov(k,c)
+
+        V_cba(i,j,k) = W_cba(i,j,k) &
+           + X_oovv(j,k,b,a) * T_ov(i,c) &
+           + X_oovv(i,k,c,a) * T_ov(j,b) &
+           + X_oovv(i,j,c,b) * T_ov(k,a)
+
+        V_bca(i,j,k) = W_bca(i,j,k) &
+           + X_oovv(j,k,c,a) * T_ov(i,b) &
+           + X_oovv(i,k,b,a) * T_ov(j,c) &
+           + X_oovv(i,j,b,c) * T_ov(k,a)
+
+        V_cab(i,j,k) = W_cab(i,j,k) &
+           + X_oovv(j,k,a,b) * T_ov(i,c) &
+           + X_oovv(i,k,c,b) * T_ov(j,a) &
+           + X_oovv(i,j,c,a) * T_ov(k,b)
+
+        V_bac(i,j,k) = W_bac(i,j,k) &
+           + X_oovv(j,k,a,c) * T_ov(i,b) &
+           + X_oovv(i,k,b,c) * T_ov(j,a) &
+           + X_oovv(i,j,b,a) * T_ov(k,c)
+
+        V_acb(i,j,k) = W_acb(i,j,k) &
+           + X_oovv(j,k,c,b) * T_ov(i,a) &
+           + X_oovv(i,k,a,b) * T_ov(j,c) &
+           + X_oovv(i,j,a,c) * T_ov(k,b)
+
       enddo
     enddo
   enddo

From d4ba229e6fdb6d567dd0c0258cb14aa14fa6524d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 14 May 2023 02:13:55 +0200
Subject: [PATCH 09/79] Symmetries in (T)

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 51 +++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 65a04549..a2e4ec7b 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -19,7 +19,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
-  double precision              :: e,ta,tb, delta, delta_abc
+  double precision              :: e,ta,tb, delta, delta_abc, x1, x2, x3
 
   allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
   allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
@@ -105,7 +105,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   energy = 0d0
   !$OMP PARALLEL                                                     &
-      !$OMP PRIVATE(a,b,c)                                           &
+      !$OMP PRIVATE(a,b,c,x1)                                           &
       !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
       !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
       !$OMP PRIVATE(i,j,k,e,delta,delta_abc)                         &
@@ -114,30 +114,55 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
             W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
             V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
             V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
+  e = 0d0
   !$OMP DO
-  do c = 1, nV
-    do b = 1, nV
-      do a = 1, nV
-        e = 0d0
+  do a = 1, nV
+    do b = 1, a-1
+      do c = 1, b-1
         delta_abc = f_v(a) + f_v(b) + f_v(c)
         call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
         call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
         do i = 1, nO
           do j = 1, nO
             do k = 1, nO
-              delta = 1d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
-              e = e + (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k))&
-                  * (V_abc(i,j,k) - V_cba(i,j,k)) * delta
+              delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+              e = e + delta * ( &
+                 (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
+                 (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
+                 (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) + &
+                 (4d0 * W_bca(i,j,k) + W_cab(i,j,k) + W_abc(i,j,k)) * (V_bca(i,j,k) - V_acb(i,j,k)) + &
+                 (4d0 * W_cba(i,j,k) + W_bac(i,j,k) + W_acb(i,j,k)) * (V_cba(i,j,k) - V_abc(i,j,k)) + &
+                 (4d0 * W_cab(i,j,k) + W_abc(i,j,k) + W_bca(i,j,k)) * (V_cab(i,j,k) - V_bac(i,j,k)) + &
+                  0.d0)
             enddo
           enddo
         enddo
-        !$OMP CRITICAL
-        energy = energy + e
-        !$OMP END CRITICAL
+      enddo
+    enddo
+
+    c = a
+    do b = 1, nV
+      delta_abc = f_v(a) + f_v(b) + f_v(c)
+      call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
+      call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
+      do i = 1, nO
+        do j = 1, nO
+          do k = 1, nO
+            delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+            e = e + delta * ( &
+               (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
+               (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
+               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) + &
+                0.d0)
+          enddo
+        enddo
       enddo
     enddo
   enddo
   !$OMP END DO
+  !$OMP CRITICAL
+  energy = energy + e
+  !$OMP END CRITICAL
 
   deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
              V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
@@ -184,7 +209,7 @@ subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,
 !   X_vovv(d,i,c,a) * T_voov(d,j,k,b) : i jk
 
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,a), nV, T_voov(1,1,1,b), nV, 0.d0, W_abc, nO)
-  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,a), nV, T_voov(1,1,1,a), nV, 0.d0, W_bac, nO)
+  call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,c,b), nV, T_voov(1,1,1,a), nV, 0.d0, W_bac, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,c), nV, T_voov(1,1,1,b), nV, 0.d0, W_cba, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,a,b), nV, T_voov(1,1,1,c), nV, 0.d0, W_bca, nO)
   call dgemm('T','N', nO, nO*nO, nV, 1.d0, X_vovv(1,1,b,c), nV, T_voov(1,1,1,a), nV, 0.d0, W_cab, nO)

From 2e54537f1547861586c3c078e8ce5b3e1a9df652 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 14 May 2023 02:41:34 +0200
Subject: [PATCH 10/79] v3 of (T) is fast!

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index a2e4ec7b..462d4adf 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -115,7 +115,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
             V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
             V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
   e = 0d0
-  !$OMP DO
+  !$OMP DO SCHEDULE(dynamic)
   do a = 1, nV
     do b = 1, a-1
       do c = 1, b-1
@@ -142,6 +142,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
     c = a
     do b = 1, nV
+      if (b == c) cycle
       delta_abc = f_v(a) + f_v(b) + f_v(c)
       call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
       call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
@@ -159,7 +160,8 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
       enddo
     enddo
   enddo
-  !$OMP END DO
+  !$OMP END DO NOWAIT
+
   !$OMP CRITICAL
   energy = energy + e
   !$OMP END CRITICAL

From df07c65980affa277b304a17d35f1636f598171a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 14 May 2023 10:07:50 +0200
Subject: [PATCH 11/79] Fixed trexio installation

---
 configure | 2 --
 1 file changed, 2 deletions(-)

diff --git a/configure b/configure
index 66bc9419..48e6fd12 100755
--- a/configure
+++ b/configure
@@ -215,7 +215,6 @@ EOF
               cd trexio-${VERSION}
               ./configure --prefix=\${QP_ROOT} --without-hdf5
               make -j 8 && make -j 8 check && make -j 8 install
-              cp ${QP_ROOT}/include/trexio_f.f90 ${QP_ROOT}/src/ezfio_files
               tar -zxvf "\${QP_ROOT}"/external/qp2-dependencies/${ARCHITECTURE}/ninja.tar.gz 
               mv ninja "\${QP_ROOT}"/bin/
 EOF
@@ -229,7 +228,6 @@ EOF
               cd trexio-${VERSION}
               ./configure --prefix=\${QP_ROOT}
               make -j 8 && make -j 8 check && make -j 8 install
-              cp ${QP_ROOT}/include/trexio_f.f90 ${QP_ROOT}/src/ezfio_files
 EOF
  
 

From 873d978348018e6e9774444c3532ffb45d323fb2 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 15 May 2023 13:06:06 +0200
Subject: [PATCH 12/79] Less multiplications in (T)

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 55 +++++++++++++++--------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 462d4adf..7c0ed929 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -36,10 +36,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
   !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
 
-  !$OMP DO collapse(3)
-  do i = 1, nO
-    do a = 1, nV
-      do b = 1, nV
+  !$OMP DO 
+  do a = 1, nV
+    do b = 1, nV
+      do i = 1, nO
         do d = 1, nV
           X_vovv(d,i,b,a) = v_vvvo(b,a,d,i)
         enddo
@@ -48,10 +48,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO collapse(3)
-  do j = 1, nO
-    do k = 1, nO
-      do c = 1, nV
+  !$OMP DO 
+  do c = 1, nV
+    do j = 1, nO
+      do k = 1, nO
         do d = 1, nV
           T_voov(d,k,j,c) = t2(k,j,c,d)
         enddo
@@ -63,7 +63,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vooo(c,j,k,l) * t2(i,l,a,b) &
   !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
 
-  !$OMP DO collapse(3)
+  !$OMP DO 
   do c = 1, nV
     do k = 1, nO
       do j = 1, nO
@@ -75,10 +75,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO collapse(3)
-  do i = 1, nO
+  !$OMP DO 
+  do a = 1, nV
     do b = 1, nV
-      do a = 1, nV
+      do i = 1, nO
         do l = 1, nO
           T_oovv(l,i,a,b) = t2(i,l,a,b)
         enddo
@@ -89,7 +89,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   !X_oovv(j,k,b,c) * T1_vo(a,i) &
 
-  !$OMP DO collapse(3)
+  !$OMP DO 
   do c = 1, nV
     do b = 1, nV
       do j = 1, nO
@@ -122,18 +122,20 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
         delta_abc = f_v(a) + f_v(b) + f_v(c)
         call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
         call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
-        do i = 1, nO
+        do k = 1, nO
           do j = 1, nO
-            do k = 1, nO
+            do i = 1, nO
               delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
               e = e + delta * ( &
-                 (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
-                 (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
-                 (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) + &
-                 (4d0 * W_bca(i,j,k) + W_cab(i,j,k) + W_abc(i,j,k)) * (V_bca(i,j,k) - V_acb(i,j,k)) + &
-                 (4d0 * W_cba(i,j,k) + W_bac(i,j,k) + W_acb(i,j,k)) * (V_cba(i,j,k) - V_abc(i,j,k)) + &
-                 (4d0 * W_cab(i,j,k) + W_abc(i,j,k) + W_bca(i,j,k)) * (V_cab(i,j,k) - V_bac(i,j,k)) + &
-                  0.d0)
+                 (4d0 * (W_abc(i,j,k) - W_cba(i,j,k)) + &
+                         W_bca(i,j,k) - W_bac(i,j,k)  + &
+                         W_cab(i,j,k) - W_acb(i,j,k)  ) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
+                 (4d0 * (W_acb(i,j,k) - W_bca(i,j,k)) + &
+                         W_cba(i,j,k) - W_cab(i,j,k)  + &
+                         W_bac(i,j,k) - W_abc(i,j,k)  ) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
+                 (4d0 * (W_bac(i,j,k) - W_cab(i,j,k)) + &
+                         W_acb(i,j,k) - W_abc(i,j,k)  + &
+                         W_cba(i,j,k) - W_bca(i,j,k)  ) * (V_bac(i,j,k) - V_cab(i,j,k)) )
             enddo
           enddo
         enddo
@@ -146,15 +148,14 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
       delta_abc = f_v(a) + f_v(b) + f_v(c)
       call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
       call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
-      do i = 1, nO
+      do k = 1, nO
         do j = 1, nO
-          do k = 1, nO
-            delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+          do i = 1, nO
+            delta = 1.0d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
             e = e + delta * ( &
                (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
                (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
-               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) + &
-                0.d0)
+               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) )
           enddo
         enddo
       enddo

From 738140547974f4e1ec9cac4cb25fa24edc963cc1 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 15 May 2023 19:37:34 +0200
Subject: [PATCH 13/79] Removed collapse in ccsd

---
 src/ccsd/ccsd_space_orb_sub.irp.f   | 150 ++++++++++++++--------------
 src/ccsd/ccsd_t_space_orb_abc.irp.f |  36 +++----
 src/utils_cc/update_t.irp.f         |   4 +-
 3 files changed, 93 insertions(+), 97 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index acd14034..75752f5c 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -109,7 +109,7 @@ subroutine run_ccsd_space_orb
       call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
       call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
     else
-      print*,'Unkonw cc_method_method: '//cc_update_method
+      print*,'Unknown cc_update_method: '//cc_update_method
     endif
 
     call update_tau_space(nO,nV,t1,t2,tau)
@@ -211,8 +211,8 @@ subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
   !$omp default(none)
   e = 0d0
   !$omp do
-  do i = 1, nO
-    do a = 1, nV
+  do a = 1, nV
+    do i = 1, nO
       e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
     enddo
   enddo
@@ -255,7 +255,7 @@ subroutine update_tau_space(nO,nV,t1,t2,tau)
   !$OMP SHARED(nO,nV,tau,t2,t1) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(3)
+  !$OMP DO 
   do b = 1, nV
     do a = 1, nV
       do j = 1, nO
@@ -373,7 +373,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,X_voov,t2,t1) &
   !$omp private(u,beta,i,a) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do beta = 1, nV
     do u = 1, nO
       do i = 1, nO
@@ -412,7 +412,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
   !$omp private(u,beta,i,a) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do beta = 1, nV
     do u = 1, nO
       do a = 1, nv
@@ -452,7 +452,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) &
   !$omp private(b,beta,i,a) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do beta = 1, nV
     do i = 1, nO
       do b = 1, nV
@@ -464,11 +464,11 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   enddo
   !$omp end do nowait
 
-  !$omp do collapse(3)
-  do i = 1, nO
-    do b = 1, nV
-      do a = 1, nV
-        do u = 1, nO
+  !$omp do 
+  do u = 1, nO
+    do i = 1, nO
+      do b = 1, nV
+        do a = 1, nV
           T_vvoo(a,b,i,u) = tau(i,u,a,b)  
         enddo
       enddo
@@ -504,8 +504,8 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,cc_space_v_vooo,W_oovo) &
   !$omp private(u,a,i,j) &
   !$omp default(none)
-  !$omp do collapse(3)
   do u = 1, nO
+    !$omp do 
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
@@ -513,8 +513,8 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
 
   call dgemm('T','N', nO, nV, nO*nO*nV, &
@@ -527,9 +527,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   max_r1 = 0d0
   do a = 1, nV
     do i = 1, nO
-      if (dabs(r1(i,a)) > max_r1) then
-        max_r1 = dabs(r1(i,a))
-      endif
+      max_r1 = max(dabs(r1(i,a)), max_r1)
     enddo
   enddo
 
@@ -657,7 +655,7 @@ subroutine compute_H_vv(nO,nV,t1,t2,tau,H_vv)
   ! H_vv(a,beta) = H_vv(a,beta) - cc_space_w_vvoo(a,b,i,j) * tau(i,j,beta,b)
   ! H_vv(a,beta) = H_vv(a,beta) - cc_space_w_vvoo(a,b,i,j) * tmp_tau(b,i,j,beta)
   
-  !$omp do collapse(3)
+  !$omp do 
   do beta = 1, nV
     do j = 1, nO
       do i = 1, nO
@@ -727,7 +725,7 @@ subroutine compute_H_vo(nO,nV,t1,t2,H_vo)
   ! H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b)
   ! H_vo(a,i) = H_vo(a,i) + w(a,i,j,b) * t1(j,b)
 
-  !$omp do collapse(3)
+  !$omp do 
   do b = 1, nV
     do j = 1, nO
       do i = 1, nO
@@ -787,7 +785,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,cc_space_v_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -863,7 +861,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,t2,X_oovv) &
   !$omp private(u,v,gam,a) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do a = 1, nV
     do gam = 1, nV
       do v = 1, nO
@@ -885,7 +883,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Y_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -921,7 +919,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -957,7 +955,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,X_vovv,cc_space_v_ovvv) &
   !$omp private(u,a,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do u = 1, nO
@@ -979,7 +977,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Y_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1014,8 +1012,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,X_vovo,cc_space_v_ovov) &
   !$omp private(u,v,gam,i) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do gam = 1, nV
       do u = 1, nO
         do a = 1, nV
@@ -1023,8 +1021,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
 
   call dgemm('N','N',nV*nO*nV,nV,nO, &
@@ -1041,7 +1039,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1079,7 +1077,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1116,8 +1114,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) &
   !$omp private(a,v,gam,i) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do gam = 1, nV
       do v = 1, nO
         do a = 1, nV
@@ -1125,8 +1123,8 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
 
   call dgemm('N','N',nO,nO*nV*nO,nV, &
@@ -1143,7 +1141,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1182,19 +1180,19 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,X_ovvo,Y_voov,K1,J1,t2) &
   !$omp private(u,v,gam,beta,i,a) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do a = 1, nV
       do beta = 1, nV
         do u = 1, nO
-          X_ovvo(u,beta,a,i) = 0.5d0 * (2d0 * J1(u,a,beta,i) - K1(u,a,i,beta))
+          X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta))
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do nowait
 
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do v = 1, nO
       do i = 1, nO
@@ -1216,7 +1214,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1252,7 +1250,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) &
   !$omp private(u,a,i,beta,gam) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do beta = 1, nV
     do u = 1, nO
       do a = 1, nV
@@ -1264,7 +1262,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do nowait
 
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do v = 1, nO
       do a = 1, nV
@@ -1286,7 +1284,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1319,7 +1317,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,K1,X_ovov,Z_ovov,t2) &
   !$omp private(u,v,gam,beta,i,a) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
   do a = 1, nV
     do i = 1, nO
       do gam = 1, nV
@@ -1331,7 +1329,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do nowait
 
-  !$omp do collapse(3)
+  !$omp do
   do beta = 1, nV
     do v = 1, nO
       do a = 1, nV
@@ -1353,7 +1351,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1373,7 +1371,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2) &
   !$omp private(i,j,a,b) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
   do b = 1, nV
     do a = 1, nV
       do j = 1, nO
@@ -1391,9 +1389,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
-          if (dabs(r2(i,j,a,b)) > max_r2) then
-            max_r2 = dabs(r2(i,j,a,b))
-          endif
+          max_r2 = max(dabs(r2(i,j,a,b)), max_r2)
         enddo
       enddo
     enddo
@@ -1448,7 +1444,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
   !$omp shared(nO,nV,A1,cc_space_v_oooo,cc_space_v_ovoo,X_vooo) &
   !$omp private(u,v,i,j) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do collapse(2)
   do j = 1, nO
     do i = 1, nO
       do v = 1, nO
@@ -1462,7 +1458,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
 
   ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) &
 
-  !$omp do collapse(3)
+  !$omp do collapse(2)
   do j = 1, nO
     do i = 1, nO
       do u = 1, nO
@@ -1484,7 +1480,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
   !$omp shared(nO,nV,A1,Y_oooo) &
   !$omp private(u,v,i,j) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do collapse(2)
   do j = 1, nO
     do i = 1, nO
       do v = 1, nO
@@ -1553,7 +1549,7 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
   !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo) &
   !$omp private(a,b,beta,gam) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do b = 1, nV
@@ -1564,8 +1560,8 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
     enddo
   enddo
   !$omp end do nowait
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do gam = 1, nV
       do b = 1, nV
         do a = 1, nV
@@ -1573,8 +1569,8 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
   
   ! B1(a,b,beta,gam) -= cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
@@ -1594,7 +1590,7 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
   !$omp shared(nV,B1,Y_vvvv) &
   !$omp private(a,b,beta,gam) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do gam = 1, nV
     do beta = 1, nV
       do b = 1, nV
@@ -1658,7 +1654,7 @@ subroutine compute_g_occ(nO,nV,t1,t2,H_oo,g_occ)
   enddo
   !$omp end do
   
-  !$omp do collapse(1)
+  !$omp do 
   do i = 1, nO
     do j = 1, nO
       do a = 1, nV
@@ -1720,7 +1716,7 @@ subroutine compute_g_vir(nO,nV,t1,t2,H_vv,g_vir)
   enddo
   !$omp end do
 
-  !$omp do collapse(1)
+  !$omp do 
   do beta = 1, nV
     do i = 1, nO
       do b = 1, nV
@@ -1788,8 +1784,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) &
   !$omp private(i,j,a,u,beta) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1797,10 +1793,10 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do nowait
 
-  !$omp do collapse(3)
+  !$omp do collapse(2)
   do j = 1, nO
     do i = 1, nO
       do a = 1, nV
@@ -1822,8 +1818,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp shared(nO,nV,J1,Y_ovov) &
   !$omp private(i,beta,a,u) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1831,8 +1827,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
   deallocate(X_ovoo)
 
@@ -1849,7 +1845,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) &
   !$omp private(i,beta,a,u,b,j) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do b = 1, nV
     do j = 1, nO
       do beta = 1, nV
@@ -1861,7 +1857,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   enddo
   !$omp end do nowait
 
-  !$omp do collapse(3)
+  !$omp do 
   do b = 1, nV
     do j = 1, nO
       do i = 1, nO
@@ -1886,8 +1882,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) &
   !$omp private(i,beta,a,u,j,b) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do 
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1895,12 +1891,12 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do nowait
   
   !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b)
-  !$omp do collapse(3)
   do j = 1, nO
+    !$omp do 
     do b = 1, nV
       do i = 1, nO
         do a = 1, nV
@@ -1908,11 +1904,11 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do nowait
   
-  !$omp do collapse(3)
   do j = 1, nO
+    !$omp do 
     do b = 1, nV
       do beta = 1, nV
         do u = 1, nO
@@ -1920,8 +1916,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
   
   call dgemm('N','T',nO*nV,nV*nO,nV*nO, &
@@ -1933,8 +1929,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp shared(nO,nV,J1,Z_ovvo) &
   !$omp private(i,beta,a,u) &
   !$omp default(none)
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1942,8 +1938,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do
   !$omp end parallel
 
   deallocate(X_ovvo,Z_ovvo,Y_ovov)  
@@ -2003,7 +1999,7 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
   !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) &
   !$omp private(i,beta,a,u,j,b) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do 
   do beta = 1, nV
     do i = 1, nO
       do a = 1, nV
@@ -2015,8 +2011,8 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
   enddo
   !$omp end do nowait
 
-  !$omp do collapse(3)
   do i = 1, nO
+    !$omp do
     do a = 1, nV
       do j = 1, nO
         do b = 1, nV
@@ -2024,11 +2020,11 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
         enddo
       enddo
     enddo
+    !$omp end do nowait
   enddo
-  !$omp end do nowait
 
-  !$omp do collapse(3)
   do j = 1, nO
+    !$omp do
     do b = 1, nV
       do beta = 1, nV
         do u = 1, nO
@@ -2036,8 +2032,8 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
         enddo
       enddo
     enddo
+    !$omp end do
   enddo
-  !$omp end do
   !$omp end parallel
 
   call dgemm('N','N',nO*nV*nO,nV,nO, &
@@ -2060,7 +2056,7 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
   !$omp shared(nO,nV,K1,Z) &
   !$omp private(i,beta,a,u) &
   !$omp default(none)
-  !$omp do collapse(3)
+  !$omp do
    do beta = 1, nV
     do i = 1, nO
       do a = 1, nV
diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 462d4adf..5cf27568 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -36,10 +36,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
   !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
 
-  !$OMP DO collapse(3)
-  do i = 1, nO
-    do a = 1, nV
-      do b = 1, nV
+  !$OMP DO 
+  do a = 1, nV
+    do b = 1, nV
+      do i = 1, nO
         do d = 1, nV
           X_vovv(d,i,b,a) = v_vvvo(b,a,d,i)
         enddo
@@ -48,10 +48,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO collapse(3)
-  do j = 1, nO
-    do k = 1, nO
-      do c = 1, nV
+  !$OMP DO 
+  do c = 1, nV
+    do j = 1, nO
+      do k = 1, nO
         do d = 1, nV
           T_voov(d,k,j,c) = t2(k,j,c,d)
         enddo
@@ -63,7 +63,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vooo(c,j,k,l) * t2(i,l,a,b) &
   !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
 
-  !$OMP DO collapse(3)
+  !$OMP DO 
   do c = 1, nV
     do k = 1, nO
       do j = 1, nO
@@ -75,10 +75,10 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO collapse(3)
-  do i = 1, nO
-    do b = 1, nV
-      do a = 1, nV
+  !$OMP DO
+  do b = 1, nV
+    do a = 1, nV
+      do i = 1, nO
         do l = 1, nO
           T_oovv(l,i,a,b) = t2(i,l,a,b)
         enddo
@@ -89,11 +89,11 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   !X_oovv(j,k,b,c) * T1_vo(a,i) &
 
-  !$OMP DO collapse(3)
+  !$OMP DO
   do c = 1, nV
     do b = 1, nV
-      do j = 1, nO
-        do k = 1, nO
+      do k = 1, nO
+        do j = 1, nO
           X_oovv(j,k,b,c) = v_vvoo(b,c,j,k)
         enddo
       enddo
@@ -117,8 +117,8 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   e = 0d0
   !$OMP DO SCHEDULE(dynamic)
   do a = 1, nV
-    do b = 1, a-1
-      do c = 1, b-1
+    do b = a+1, nV
+      do c = b+1, nV
         delta_abc = f_v(a) + f_v(b) + f_v(c)
         call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
         call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
diff --git a/src/utils_cc/update_t.irp.f b/src/utils_cc/update_t.irp.f
index dbd4f4bd..0cf8626c 100644
--- a/src/utils_cc/update_t.irp.f
+++ b/src/utils_cc/update_t.irp.f
@@ -22,7 +22,7 @@ subroutine update_t1(nO,nV,f_o,f_v,r1,t1)
   !$OMP SHARED(nO,nV,t1,r1,cc_level_shift,f_o,f_v) &
   !$OMP PRIVATE(i,a) &
   !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(1)
+  !$OMP DO 
   do a = 1, nV
     do i = 1, nO
       t1(i,a) = t1(i,a) - r1(i,a) / (f_o(i) - f_v(a) - cc_level_shift)
@@ -57,7 +57,7 @@ subroutine update_t2(nO,nV,f_o,f_v,r2,t2)
   !$OMP SHARED(nO,nV,t2,r2,cc_level_shift,f_o,f_v) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(3)
+  !$OMP DO 
   do b = 1, nV
     do a = 1, nV
       do j = 1, nO

From 5b427641a66047513227fc1ed9912f8784a17630 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 15 May 2023 19:46:06 +0200
Subject: [PATCH 14/79] Inlined multiply_poly

---
 src/ao_two_e_ints/two_e_integrals.irp.f | 232 +++++++++++++++++++++---
 src/utils/integration.irp.f             | 129 +++++++++++--
 2 files changed, 317 insertions(+), 44 deletions(-)

diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f
index 83fbadfd..4c3c6190 100644
--- a/src/ao_two_e_ints/two_e_integrals.irp.f
+++ b/src/ao_two_e_ints/two_e_integrals.irp.f
@@ -563,8 +563,20 @@ double precision function general_primitive_integral(dim,            &
     d_poly(i)=0.d0
   enddo
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(Ix_pol,n_Ix,Iy_pol,n_Iy,d_poly,n_pt_tmp)
+!  call multiply_poly(Ix_pol,n_Ix,Iy_pol,n_Iy,d_poly,n_pt_tmp)
+  integer :: ib, ic
+  if (ior(n_Ix,n_Iy) >= 0) then
+    do ib=0,n_Ix
+      do ic = 0,n_Iy
+        d_poly(ib+ic) = d_poly(ib+ic) + Iy_pol(ic) * Ix_pol(ib)
+      enddo
+    enddo
+
+    do n_pt_tmp = n_Ix+n_Iy, 0, -1
+      if (d_poly(n_pt_tmp) /= 0.d0) exit
+    enddo
+  endif
+
   if (n_pt_tmp == -1) then
     return
   endif
@@ -573,8 +585,21 @@ double precision function general_primitive_integral(dim,            &
     d1(i)=0.d0
   enddo
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(d_poly ,n_pt_tmp ,Iz_pol,n_Iz,d1,n_pt_out)
+!  call multiply_poly(d_poly ,n_pt_tmp ,Iz_pol,n_Iz,d1,n_pt_out)
+  if (ior(n_pt_tmp,n_Iz) >= 0) then
+    ! Bottleneck here
+    do ib=0,n_pt_tmp
+      do ic = 0,n_Iz
+        d1(ib+ic) = d1(ib+ic) + Iz_pol(ic) * d_poly(ib)
+      enddo
+    enddo
+
+    do n_pt_out = n_pt_tmp+n_Iz, 0, -1
+      if (d1(n_pt_out) /= 0.d0) exit
+    enddo
+  endif
+
+
   double precision               :: rint_sum
   accu = accu + rint_sum(n_pt_out,const,d1)
 
@@ -921,8 +946,20 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
     X(ix) *= dble(a-1)
   enddo
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_10,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(X,nx,B_10,2,d,nd)
+  if (nx >= 0) then
+    integer :: ib
+    do ib=0,nx
+      d(ib  ) = d(ib  ) + B_10(0) * X(ib)
+      d(ib+1) = d(ib+1) + B_10(1) * X(ib)
+      d(ib+2) = d(ib+2) + B_10(2) * X(ib)
+    enddo
+
+    do nd = nx+2,0,-1
+      if (d(nd) /= 0.d0) exit
+    enddo
+  endif
 
   nx = nd
   !DIR$ LOOP COUNT(8)
@@ -943,8 +980,19 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
         X(ix) *= c
       enddo
     endif
-    !DIR$ FORCEINLINE
-    call multiply_poly(X,nx,B_00,2,d,nd)
+!    !DIR$ FORCEINLINE
+!    call multiply_poly(X,nx,B_00,2,d,nd)
+    if (nx >= 0) then
+       do ib=0,nx
+           d(ib  ) = d(ib  ) + B_00(0) * X(ib)
+           d(ib+1) = d(ib+1) + B_00(1) * X(ib)
+           d(ib+2) = d(ib+2) + B_00(2) * X(ib)
+       enddo
+
+       do nd = nx+2,0,-1
+         if (d(nd) /= 0.d0) exit
+       enddo
+    endif
   endif
 
   ny=0
@@ -961,9 +1009,19 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
     call I_x1_pol_mult_recurs(a-1,c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in)
   endif
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(Y,ny,C_00,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(Y,ny,C_00,2,d,nd)
+   if (ny >= 0) then
+     do ib=0,ny
+         d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
+         d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
+         d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
+     enddo
 
+   do nd = ny+2,0,-1
+     if (d(nd) /= 0.d0) exit
+   enddo
+  endif
 end
 
 recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
@@ -1001,8 +1059,20 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
     enddo
   endif
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_00,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(X,nx,B_00,2,d,nd)
+  if (nx >= 0) then
+    integer                        :: ib
+    do ib=0,nx
+      d(ib  ) = d(ib  ) + B_00(0) * X(ib)
+      d(ib+1) = d(ib+1) + B_00(1) * X(ib)
+      d(ib+2) = d(ib+2) + B_00(2) * X(ib)
+    enddo
+
+    do nd = nx+2,0,-1
+      if (d(nd) /= 0.d0) exit
+    enddo
+  endif
 
   ny=0
 
@@ -1012,8 +1082,19 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
   enddo
   call I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in)
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(Y,ny,C_00,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(Y,ny,C_00,2,d,nd)
+  if (ny >= 0) then
+    do ib=0,ny
+      d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
+      d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
+      d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
+    enddo
+
+    do nd = ny+2,0,-1
+      if (d(nd) /= 0.d0) exit
+    enddo
+  endif
 
 end
 
@@ -1040,8 +1121,20 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
   nx = 0
   call I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,X,nx,n_pt_in)
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_10,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(X,nx,B_10,2,d,nd)
+  if (nx >= 0) then
+    integer :: ib
+    do ib=0,nx
+      d(ib  ) = d(ib  ) + B_10(0) * X(ib)
+      d(ib+1) = d(ib+1) + B_10(1) * X(ib)
+      d(ib+2) = d(ib+2) + B_10(2) * X(ib)
+    enddo
+
+    do nd = nx+2,0,-1
+      if (d(nd) /= 0.d0) exit
+    enddo
+  endif
 
   nx = nd
   !DIR$ LOOP COUNT(8)
@@ -1059,8 +1152,19 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
     enddo
   endif
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(X,nx,B_00,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(X,nx,B_00,2,d,nd)
+  if (nx >= 0) then
+    do ib=0,nx
+      d(ib  ) = d(ib  ) + B_00(0) * X(ib)
+      d(ib+1) = d(ib+1) + B_00(1) * X(ib)
+      d(ib+2) = d(ib+2) + B_00(2) * X(ib)
+    enddo
+
+    do nd = nx+2,0,-1
+      if (d(nd) /= 0.d0) exit
+    enddo
+  endif
 
   ny=0
   !DIR$ LOOP COUNT(8)
@@ -1070,9 +1174,19 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
   !DIR$ FORCEINLINE
   call I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,Y,ny,n_pt_in)
 
-  !DIR$ FORCEINLINE
-  call multiply_poly(Y,ny,C_00,2,d,nd)
+!  !DIR$ FORCEINLINE
+!  call multiply_poly(Y,ny,C_00,2,d,nd)
+  if (ny >= 0) then
+    do ib=0,ny
+        d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
+        d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
+        d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
+    enddo
 
+    do nd = ny+2,0,-1
+      if (d(nd) /= 0.d0) exit
+    enddo
+  endif
 end
 
 recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
@@ -1119,8 +1233,21 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
       Y(1) = D_00(1)
       Y(2) = D_00(2)
 
-      !DIR$ FORCEINLINE
-      call multiply_poly(Y,ny,D_00,2,d,nd)
+!      !DIR$ FORCEINLINE
+!      call multiply_poly(Y,ny,D_00,2,d,nd)
+      if (ny >= 0) then
+        integer :: ib
+        do ib=0,ny
+            d(ib  ) = d(ib  ) + D_00(0) * Y(ib)
+            d(ib+1) = d(ib+1) + D_00(1) * Y(ib)
+            d(ib+2) = d(ib+2) + D_00(2) * Y(ib)
+        enddo
+
+        do nd = ny+2,0,-1
+          if (d(nd) /= 0.d0) exit
+        enddo
+      endif
+
       return
 
       case default
@@ -1137,8 +1264,19 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
         X(ix) *= dble(c-1)
       enddo
 
-      !DIR$ FORCEINLINE
-      call multiply_poly(X,nx,B_01,2,d,nd)
+!      !DIR$ FORCEINLINE
+!      call multiply_poly(X,nx,B_01,2,d,nd)
+      if (nx >= 0) then
+        do ib=0,nx
+          d(ib  ) = d(ib  ) + B_01(0) * X(ib)
+          d(ib+1) = d(ib+1) + B_01(1) * X(ib)
+          d(ib+2) = d(ib+2) + B_01(2) * X(ib)
+        enddo
+
+        do nd = nx+2,0,-1
+          if (d(nd) /= 0.d0) exit
+        enddo
+      endif
 
       ny = 0
       !DIR$ LOOP COUNT(6)
@@ -1147,8 +1285,19 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
       enddo
       call I_x2_pol_mult(c-1,B_10,B_01,B_00,C_00,D_00,Y,ny,dim)
 
-      !DIR$ FORCEINLINE
-      call multiply_poly(Y,ny,D_00,2,d,nd)
+!      !DIR$ FORCEINLINE
+!      call multiply_poly(Y,ny,D_00,2,d,nd)
+      if (ny >= 0) then
+        do ib=0,ny
+            d(ib  ) = d(ib  ) + D_00(0) * Y(ib)
+            d(ib+1) = d(ib+1) + D_00(1) * Y(ib)
+            d(ib+2) = d(ib+2) + D_00(2) * Y(ib)
+        enddo
+
+        do nd = ny+2,0,-1
+          if (d(nd) /= 0.d0) exit
+        enddo
+      endif
 
   end select
 end
@@ -1206,3 +1355,34 @@ subroutine compute_ao_integrals_jl(j,l,n_integrals,buffer_i,buffer_value)
   enddo
 
 end
+
+
+subroutine multiply_poly_local(b,nb,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Accumulate the product of two polynomials into d: D(t) += B(t)*C(t)
+  ! On exit nd is the highest index with d(nd) /= 0, or -1 if none is found
+  END_DOC
+
+  integer, intent(in)            :: nb, nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:nc)
+  double precision, intent(inout) :: d(0:nb+nc)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(ior(nc,nb) < 0) return ! Negative nb or nc: return early, nd is left unassigned
+
+  do ib=0,nb
+    do ic = 0,nc
+      d(ib+ic) = d(ib+ic) + c(ic) * b(ib)
+    enddo
+  enddo
+  ! Scan downwards for the leading non-zero coefficient of d
+  do nd = nb+nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index 15d79622..c8a36775 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -428,6 +428,112 @@ end subroutine
 
 
 
+subroutine multiply_poly_0c(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply polynomial C(t) by the degree-0 polynomial b(0) and accumulate:
+  ! D(t) += B(t)*C(t). nd is the highest index with d(nd) /= 0 (-1 if none)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:0), c(0:nc)
+  double precision, intent(inout) :: d(0:0+nc)
+
+  integer                        :: ic
+  ! No early return for nc < 0 here: both loops below are then empty
+  do ic = 0,nc
+    d(ic) = d(ic) + c(ic) * b(0)
+  enddo
+  ! Scan downwards for the leading non-zero coefficient of d
+  do nd = nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+subroutine multiply_poly_1c(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply polynomial C(t) by the degree-1 polynomial b(0:1) and accumulate:
+  ! D(t) += B(t)*C(t). nd is the highest index with d(nd) /= 0 (-1 if none)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:1), c(0:nc)
+  double precision, intent(inout) :: d(0:1+nc)
+
+  integer                        :: ic, id
+  if(nc < 0) return
+  ! Each c(ic) contributes to d(ic) and d(ic+1)
+  do ic = 0,nc
+    d(  ic) = d(  ic) + c(ic) * b(0)
+    d(1+ic) = d(1+ic) + c(ic) * b(1)
+  enddo
+  ! Scan downwards for the leading non-zero coefficient of d
+  do nd = nc+1,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_2c(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply polynomial C(t) by the degree-2 polynomial b(0:2) and accumulate:
+  ! D(t) += B(t)*C(t). nd is the highest index with d(nd) /= 0 (-1 if none)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:2), c(0:nc)
+  double precision, intent(inout) :: d(0:2+nc)
+
+  integer                        :: ic, id, k
+  if (nc <0) return
+  ! Each c(ic) contributes to d(ic), d(ic+1) and d(ic+2)
+  do ic = 0,nc
+    d(  ic) = d(  ic) + c(ic) * b(0)
+    d(1+ic) = d(1+ic) + c(ic) * b(1)
+    d(2+ic) = d(2+ic) + c(ic) * b(2)
+  enddo
+  ! Scan downwards for the leading non-zero coefficient of d
+  do nd = nc+2,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+subroutine multiply_poly_3c(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply polynomial C(t) by the degree-3 polynomial b(0:3) and accumulate:
+  ! D(t) += B(t)*C(t). nd is the highest index with d(nd) /= 0 (-1 if none)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:3), c(0:nc)
+  double precision, intent(inout) :: d(0:3+nc)
+
+  integer                        :: ic
+  if (nc <0) return
+  ! Each c(ic), starting at ic=0, contributes to d(ic) ... d(ic+3)
+  do ic = 0,nc
+    d(  ic) = d(  ic) + c(ic) * b(0)
+    d(1+ic) = d(1+ic) + c(ic) * b(1)
+    d(2+ic) = d(2+ic) + c(ic) * b(2)
+    d(3+ic) = d(3+ic) + c(ic) * b(3)
+  enddo
+  ! Scan downwards for the leading non-zero coefficient of d
+  do nd = nc+3,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
 
 
 subroutine multiply_poly(b,nb,c,nc,d,nd)
@@ -444,29 +550,16 @@ subroutine multiply_poly(b,nb,c,nc,d,nd)
 
   integer                        :: ndtmp
   integer                        :: ib, ic, id, k
-  if(ior(nc,nb) >= 0) then ! True if nc>=0 and nb>=0
-    continue
-  else
-    return
-  endif
-  ndtmp = nb+nc
+  if(ior(nc,nb) < 0) return !False if nc>=0 and nb>=0
 
-  do ic = 0,nc
-    d(ic) = d(ic) + c(ic) * b(0)
-  enddo
-
-  do ib=1,nb
-    d(ib) = d(ib) + c(0) * b(ib)
-    do ic = 1,nc
+  do ib=0,nb
+    do ic = 0,nc
       d(ib+ic) = d(ib+ic) + c(ic) * b(ib)
     enddo
   enddo
 
-  do nd = ndtmp,0,-1
-    if (d(nd) == 0.d0) then
-      cycle
-    endif
-    exit
+  do nd = nb+nc,0,-1
+    if (d(nd) /= 0.d0) exit
   enddo
 
 end

From e3c0df574ee9bed8f2de3c21dc4506fd34fc7b7b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 16 May 2023 01:40:40 +0200
Subject: [PATCH 15/79] Implementing stochastic (T)

---
 src/ccsd/ccsd_space_orb_sub.irp.f     |   4 +-
 src/ccsd/ccsd_t_space_orb_abc.irp.f   | 153 ++++++++----
 src/ccsd/ccsd_t_space_orb_stoch.irp.f | 320 ++++++++++++++++++++++++++
 3 files changed, 428 insertions(+), 49 deletions(-)
 create mode 100644 src/ccsd/ccsd_t_space_orb_stoch.irp.f

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 75752f5c..29ecca1c 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -169,7 +169,9 @@ subroutine run_ccsd_space_orb
     ! New
     print*,'Computing (T) correction...'
     call wall_time(ta)
-    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+!         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
+    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
          ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
     call wall_time(tb)
     print*,'Time: ',tb-ta, ' s'
diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 70900738..294296bf 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -19,14 +19,13 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
-  double precision              :: e,ta,tb, delta, delta_abc, x1, x2, x3
+  double precision              :: e,ta,tb
+
+  call set_multiple_levels_omp(.False.)
 
   allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
   allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
 
-  call set_multiple_levels_omp(.False.)
-
-  ! Temporary arrays
   !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
   !$OMP t1,t2,v_vvvo,v_vooo,v_vvoo) &
@@ -36,7 +35,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
   !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
 
-  !$OMP DO 
+  !$OMP DO
   do a = 1, nV
     do b = 1, nV
       do i = 1, nO
@@ -48,7 +47,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO 
+  !$OMP DO
   do c = 1, nV
     do j = 1, nO
       do k = 1, nO
@@ -63,7 +62,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vooo(c,j,k,l) * t2(i,l,a,b) &
   !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
 
-  !$OMP DO 
+  !$OMP DO
   do c = 1, nV
     do k = 1, nO
       do j = 1, nO
@@ -103,12 +102,13 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   !$OMP END PARALLEL
 
-  energy = 0d0
+  double precision, external :: ccsd_t_task_aba
+  double precision, external :: ccsd_t_task_abc
+
   !$OMP PARALLEL                                                     &
-      !$OMP PRIVATE(a,b,c,x1)                                           &
+      !$OMP PRIVATE(a,b,c,e)                                         &
       !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
       !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
-      !$OMP PRIVATE(i,j,k,e,delta,delta_abc)                         &
       !$OMP DEFAULT(SHARED)
   allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
             W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
@@ -119,46 +119,18 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   do a = 1, nV
     do b = a+1, nV
       do c = b+1, nV
-        delta_abc = f_v(a) + f_v(b) + f_v(c)
-        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
-        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
-        do k = 1, nO
-          do j = 1, nO
-            do i = 1, nO
-              delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
-              e = e + delta * ( &
-                 (4d0 * (W_abc(i,j,k) - W_cba(i,j,k)) + &
-                         W_bca(i,j,k) - W_bac(i,j,k)  + &
-                         W_cab(i,j,k) - W_acb(i,j,k)  ) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
-                 (4d0 * (W_acb(i,j,k) - W_bca(i,j,k)) + &
-                         W_cba(i,j,k) - W_cab(i,j,k)  + &
-                         W_bac(i,j,k) - W_abc(i,j,k)  ) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
-                 (4d0 * (W_bac(i,j,k) - W_cab(i,j,k)) + &
-                         W_acb(i,j,k) - W_abc(i,j,k)  + &
-                         W_cba(i,j,k) - W_bca(i,j,k)  ) * (V_bac(i,j,k) - V_cab(i,j,k)) )
-            enddo
-          enddo
-        enddo
+        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                        V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                        W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
       enddo
-    enddo
 
-    c = a
-    do b = 1, nV
-      if (b == c) cycle
-      delta_abc = f_v(a) + f_v(b) + f_v(c)
-      call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
-      call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
-      do k = 1, nO
-        do j = 1, nO
-          do i = 1, nO
-            delta = 1.0d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
-            e = e + delta * ( &
-               (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
-               (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
-               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) )
-          enddo
-        enddo
-      enddo
+      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+
+      e = e + ccsd_t_task_aba(b,a,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
     enddo
   enddo
   !$OMP END DO NOWAIT
@@ -178,6 +150,91 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 end
 
 
+double precision function ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,&
+      V_abc,V_acb,V_bac,V_bca,V_cab,V_cba,                           &
+      W_abc,W_acb,W_bac,W_bca,W_cab,W_cba,                           &
+      X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
+  implicit none
+  integer, intent(in)           :: nO,nV,a,b,c
+  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
+  double precision, intent(out) :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
+  double precision, intent(out) :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
+  double precision, intent(out) :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
+  double precision, intent(out) :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
+  ! Returns the energy term e of the virtual triplet (a,b,c); W_*/V_* are caller scratch
+  double precision :: delta, delta_abc
+  integer  :: i,j,k
+
+  delta_abc = f_v(a) + f_v(b) + f_v(c)
+  e = 0.d0
+  ! Fill the W_* work arrays, then the V_* ones (form_v_abc takes both W_* and V_*)
+  call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
+
+  call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
+  ! Sum over occupied (i,j,k) with denominator f_o(i)+f_o(j)+f_o(k) - (f_v(a)+f_v(b)+f_v(c))
+  do k = 1, nO
+    do j = 1, nO
+      do i = 1, nO
+        delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+        e = e + delta * (                                    &
+            (4d0 * (W_abc(i,j,k) - W_cba(i,j,k)) +           &
+            W_bca(i,j,k) - W_bac(i,j,k)  +                   &
+            W_cab(i,j,k) - W_acb(i,j,k)  ) * (V_abc(i,j,k) - V_cba(i,j,k)) +&
+            (4d0 * (W_acb(i,j,k) - W_bca(i,j,k)) +           &
+            W_cba(i,j,k) - W_cab(i,j,k)  +                   &
+            W_bac(i,j,k) - W_abc(i,j,k)  ) * (V_acb(i,j,k) - V_bca(i,j,k)) +&
+            (4d0 * (W_bac(i,j,k) - W_cab(i,j,k)) +           &
+            W_acb(i,j,k) - W_abc(i,j,k)  +                   &
+            W_cba(i,j,k) - W_bca(i,j,k)  ) * (V_bac(i,j,k) - V_cab(i,j,k)) )
+      enddo
+    enddo
+  enddo
+
+end
+
+double precision function ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,&
+      V_abc,V_acb,V_bac,V_bca,V_cab,V_cba,                           &
+      W_abc,W_acb,W_bac,W_bca,W_cab,W_cba,                           &
+      X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
+  implicit none
+  integer, intent(in)           :: nO,nV,a,b
+  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
+  double precision, intent(out) :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
+  double precision, intent(out) :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
+  double precision, intent(out) :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
+  double precision, intent(out) :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
+  ! Returns the energy term e of the virtual triplet (a,b,a); W_*/V_* are caller scratch
+  double precision :: delta, delta_abc
+  integer  :: i,j,k
+
+  delta_abc = f_v(a) + f_v(b) + f_v(a)
+  e = 0.d0
+  ! Same work arrays as the (a,b,c) task, built with the third index set to a
+  call form_w_abc(nO,nV,a,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
+
+  call form_v_abc(nO,nV,a,b,a,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
+  ! Sum over occupied (i,j,k) with denominator f_o(i)+f_o(j)+f_o(k) - (2 f_v(a)+f_v(b))
+  do k = 1, nO
+    do j = 1, nO
+      do i = 1, nO
+        delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+        e = e + delta * (                                    &
+               (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
+               (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
+               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) )
+
+      enddo
+    enddo
+  enddo
+
+end
+
 subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
 
   implicit none
diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
new file mode 100644
index 00000000..e8fae5cd
--- /dev/null
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -0,0 +1,320 @@
+! Stochastic estimate of the (T) energy: virtual triplets (a,b,c) are sampled at random,
+! each contribution is computed once, memoized, and reweighted by 1/(3 P) when averaged.
+subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
+
+  implicit none
+
+  integer, intent(in)           :: nO,nV
+  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)  :: t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
+  double precision, intent(out) :: energy
+
+  double precision, allocatable :: W(:,:,:,:,:,:)
+  double precision, allocatable :: V(:,:,:,:,:,:)
+  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
+  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
+  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
+  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
+  double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
+  double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
+  integer                       :: i,j,k,l,a,b,c,d
+  double precision              :: e,ta,tb
+
+  call set_multiple_levels_omp(.False.)
+
+  allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
+  allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
+
+  !$OMP PARALLEL &
+  !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
+  !$OMP t1,t2,v_vvvo,v_vooo,v_vvoo) &
+  !$OMP PRIVATE(a,b,c,d,i,j,k,l) &
+  !$OMP DEFAULT(NONE)
+
+  !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
+  !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
+
+  !$OMP DO
+  do a = 1, nV
+    do b = 1, nV
+      do i = 1, nO
+        do d = 1, nV
+          X_vovv(d,i,b,a) = v_vvvo(b,a,d,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO nowait
+
+  !$OMP DO
+  do c = 1, nV
+    do j = 1, nO
+      do k = 1, nO
+        do d = 1, nV
+          T_voov(d,k,j,c) = t2(k,j,c,d)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO nowait
+
+  !v_vooo(c,j,k,l) * t2(i,l,a,b) &
+  !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
+
+  !$OMP DO
+  do c = 1, nV
+    do k = 1, nO
+      do j = 1, nO
+        do l = 1, nO
+           X_ooov(l,j,k,c) = v_vooo(c,j,k,l)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO nowait
+
+  !$OMP DO
+  do b = 1, nV
+    do a = 1, nV
+      do i = 1, nO
+        do l = 1, nO
+          T_oovv(l,i,a,b) = t2(i,l,a,b)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO nowait
+
+  !X_oovv(j,k,b,c) * T1_vo(a,i) &
+
+  !$OMP DO
+  do c = 1, nV
+    do b = 1, nV
+      do k = 1, nO
+        do j = 1, nO
+          X_oovv(j,k,b,c) = v_vvoo(b,c,j,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO nowait
+
+  !$OMP END PARALLEL
+
+  double precision, external :: ccsd_t_task_aba
+  double precision, external :: ccsd_t_task_abc
+
+  double precision, allocatable :: memo(:), Pabc(:), waccu(:)
+  logical         , allocatable :: computed(:)
+  integer*2       , allocatable :: abc(:,:)
+  integer*8                     :: Nabc, i8
+  integer*8, allocatable :: iorder(:)
+  double precision :: eocc
+  double precision :: Pabc_norm, sum_w
+
+
+  ! Prepare table of triplets (a,b,c)
+  ! Count: all multisets {a<=b<=c} minus the nV cases a=b=c, matching the loops below
+  Nabc = (int(nV,8) * int(nV+1,8) * int(nV+2,8))/6_8 - nV
+  allocate (memo(Nabc), computed(Nabc), Pabc(Nabc), waccu(0:Nabc))
+  allocate (abc(4,Nabc), iorder(Nabc))
+
+!  eocc = 3.d0/dble(nO) * sum(f_o(1:nO))
+  memo(:) = 0.d0
+  computed(:) = .False.
+  Nabc = 0_8
+  do a = 1, nV
+    do b = a+1, nV
+      do c = b+1, nV
+        Nabc = Nabc + 1_8
+!        Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(c))*(f_v(a)*f_v(b)*f_v(c))**(1.d0/2.d0))
+!        Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(c))**2)
+        Pabc(Nabc) = 1.d0/(f_v(a) + f_v(b) + f_v(c))
+        abc(1,Nabc) = a
+        abc(2,Nabc) = b
+        abc(3,Nabc) = c
+      enddo
+
+      Nabc = Nabc + 1_8
+      abc(1,Nabc) = a
+      abc(2,Nabc) = b
+      abc(3,Nabc) = a
+!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(a))*(f_v(a)*f_v(b)*f_v(a))**(1.d0/2.d0))
+!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(a))**2)
+      Pabc(Nabc) = 1.d0/(2.d0*f_v(a) + f_v(b))
+
+      Nabc = Nabc + 1_8
+      abc(1,Nabc) = b
+      abc(2,Nabc) = a
+      abc(3,Nabc) = b
+!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(b))*(f_v(b)*f_v(a)*f_v(b))**(1.d0/2.d0))
+!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(b))**2)
+      Pabc(Nabc) = 1.d0/(f_v(a) + 2.d0*f_v(b))
+    enddo
+  enddo
+
+  do i8=1,Nabc
+   iorder(i8) = i8
+  enddo
+
+  ! Sort triplets in decreasing Pabc
+  call dsort_big(Pabc, iorder, Nabc)
+
+  ! Normalize
+  Pabc_norm = 0.d0
+  do i8=Nabc,1,-1
+    Pabc_norm = Pabc_norm + Pabc(i8)
+  enddo
+  Pabc_norm = 1.d0/Pabc_norm
+  do i8=Nabc,1,-1
+    Pabc(i8) = Pabc(i8) * Pabc_norm
+  enddo
+
+  call i8set_order_big(abc, iorder, Nabc)
+
+
+  ! Cumulative distribution for sampling: waccu(i) = 1 - sum_{j>i} Pabc(j) = sum_{j<=i} Pabc(j)
+  waccu(Nabc) = 0.d0
+  sum_w = 0.d0
+  do i8=Nabc-1,1,-1
+   waccu(i8) = waccu(i8+1) - Pabc(i8+1)
+  enddo
+  waccu(:) = waccu(:) + 1.d0
+  waccu(0) = 0.d0
+  ! From here on Pabc holds the sample weight 1/(3 P), used to form 'sample' in the loop
+  Pabc(:) = 1.d0/Pabc(:) * (1.d0/3.d0)
+
+  logical :: converged
+  double precision :: ET, ET2, eta, variance, average, error, sample
+  integer*8 :: isample, ieta, Ncomputed
+  integer*8, external :: find_sample
+
+  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
+            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
+            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
+            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
+
+  converged = .False.
+  ET = 0.d0
+  ET2 = 0.d0
+  Ncomputed = 0_8
+  isample = 0_8
+
+  average = 0.d0
+  variance = 0.d0
+  double precision :: t00, t01
+  call wall_time(t00)
+!  do ieta=1,Nabc
+  do while (.not.converged)
+    call random_number(eta)
+!    eta = eta/dble(1000)
+!    do k=0,1000-1
+!    ieta = find_sample(eta+dble(k)/dble(1000),waccu,Nabc)
+    ieta = find_sample(eta,waccu,Nabc)
+    isample = isample+1_8
+
+    if (.not.computed(ieta)) then
+      a = abc(1,ieta)
+      b = abc(2,ieta)
+      c = abc(3,ieta)
+      if (a/=c) then
+         memo(ieta) = ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                         V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                         W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+      else
+         memo(ieta) =  ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                       V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                       W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+      endif
+      computed(ieta) = .True.
+      Ncomputed += 1_8
+      call wall_time(t01)
+      if (t01-t00 > 1.d0) then
+        t00 = t01
+        print *, average, dsqrt(variance/dble(isample)), real(Ncomputed)/real(Nabc), real(isample)/real(Nabc)
+      endif
+!       print *, memo(ieta), Pabc(ieta), memo(ieta) * Pabc(ieta)
+    endif
+    sample = memo(ieta) * Pabc(ieta)
+    ET = ET + sample
+    ET2 = ET2 + sample*sample
+    average  = ET/dble(isample)
+    variance = ET2/dble(isample) - average*average
+    converged = (Ncomputed >= (Nabc*90_8)/100_8) .or. (isample>=1000*Nabc)
+!    enddo
+  enddo
+        print *, average, dsqrt(variance/dble(isample)), real(Ncomputed)/real(Nabc), real(isample)/real(Nabc)
+  energy = average
+
+!  !$OMP PARALLEL                                                     &
+!      !$OMP PRIVATE(a,b,c,e)                                         &
+!      !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
+!      !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
+!      !$OMP DEFAULT(SHARED)
+!  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
+!            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
+!            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
+!            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
+!  e = 0d0
+!  !$OMP DO SCHEDULE(dynamic)
+!  do a = 1, nV
+!    do b = a+1, nV
+!      do c = b+1, nV
+!        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
+!                        V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+!                        W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+!      enddo
+!    enddo
+!
+!    do b = 1, nV
+!      if (b == a) cycle
+!      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
+!                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+!                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+!    enddo
+!  enddo
+!  !$OMP END DO NOWAIT
+!
+!  !$OMP CRITICAL
+!  energy = energy + e
+!  !$OMP END CRITICAL
+!
+!  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
+!             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
+!
+!  !$OMP END PARALLEL
+
+  deallocate(X_vovv,X_ooov,X_oovv,T_voov,T_oovv)
+end
+
+
+integer*8 function find_sample(v, w, n)
+  implicit none
+  BEGIN_DOC
+! Bisection on w(0:n) for v, then skip forward over entries equal to the one found
+  END_DOC
+  integer*8, intent(in) :: n
+  double precision, intent(in) :: v, w(0:n)
+  integer*8 :: i,l,r
+  ! Bracket [l,r]; presumably w is non-decreasing (cumulative weights) - TODO confirm
+  l=0
+  r=n
+  ! Halve the bracket until r = l+1: l moves up when w(i) < v, otherwise r moves down
+  do while(r-l > 1)
+    i = shiftr(r+l,1)
+    if(w(i) < v) then
+      l = i
+    else
+      r = i
+    end if
+  end do
+  i = r
+  do r=i+1,n
+    if (w(r) /= w(i)) then
+      exit
+    endif
+  enddo
+  find_sample = r-1
+end function
+

From 134b6d016301d41ca78dfed2443118616d849ec2 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 16 May 2023 01:43:32 +0200
Subject: [PATCH 16/79] Adding tasks

---
 src/ccsd/ccsd_t_space_orb_abc.irp.f | 153 +++++++++++++++++++---------
 1 file changed, 105 insertions(+), 48 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 70900738..294296bf 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -19,14 +19,13 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
-  double precision              :: e,ta,tb, delta, delta_abc, x1, x2, x3
+  double precision              :: e,ta,tb
+
+  call set_multiple_levels_omp(.False.)
 
   allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
   allocate(T_voov(nV,nO,nO,nV),T_oovv(nO,nO,nV,nV))
 
-  call set_multiple_levels_omp(.False.)
-
-  ! Temporary arrays
   !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,T_voov,T_oovv,X_vovv,X_ooov,X_oovv, &
   !$OMP t1,t2,v_vvvo,v_vooo,v_vvoo) &
@@ -36,7 +35,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vvvo(b,a,d,i) * t2(k,j,c,d) &
   !X_vovv(d,i,b,a,i) * T_voov(d,j,c,k)
 
-  !$OMP DO 
+  !$OMP DO
   do a = 1, nV
     do b = 1, nV
       do i = 1, nO
@@ -48,7 +47,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   enddo
   !$OMP END DO nowait
 
-  !$OMP DO 
+  !$OMP DO
   do c = 1, nV
     do j = 1, nO
       do k = 1, nO
@@ -63,7 +62,7 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   !v_vooo(c,j,k,l) * t2(i,l,a,b) &
   !X_ooov(l,j,k,c) * T_oovv(l,i,a,b) &
 
-  !$OMP DO 
+  !$OMP DO
   do c = 1, nV
     do k = 1, nO
       do j = 1, nO
@@ -103,12 +102,13 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   !$OMP END PARALLEL
 
-  energy = 0d0
+  double precision, external :: ccsd_t_task_aba
+  double precision, external :: ccsd_t_task_abc
+
   !$OMP PARALLEL                                                     &
-      !$OMP PRIVATE(a,b,c,x1)                                           &
+      !$OMP PRIVATE(a,b,c,e)                                         &
       !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
       !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
-      !$OMP PRIVATE(i,j,k,e,delta,delta_abc)                         &
       !$OMP DEFAULT(SHARED)
   allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
             W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
@@ -119,46 +119,18 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   do a = 1, nV
     do b = a+1, nV
       do c = b+1, nV
-        delta_abc = f_v(a) + f_v(b) + f_v(c)
-        call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
-        call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
-        do k = 1, nO
-          do j = 1, nO
-            do i = 1, nO
-              delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
-              e = e + delta * ( &
-                 (4d0 * (W_abc(i,j,k) - W_cba(i,j,k)) + &
-                         W_bca(i,j,k) - W_bac(i,j,k)  + &
-                         W_cab(i,j,k) - W_acb(i,j,k)  ) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
-                 (4d0 * (W_acb(i,j,k) - W_bca(i,j,k)) + &
-                         W_cba(i,j,k) - W_cab(i,j,k)  + &
-                         W_bac(i,j,k) - W_abc(i,j,k)  ) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
-                 (4d0 * (W_bac(i,j,k) - W_cab(i,j,k)) + &
-                         W_acb(i,j,k) - W_abc(i,j,k)  + &
-                         W_cba(i,j,k) - W_bca(i,j,k)  ) * (V_bac(i,j,k) - V_cab(i,j,k)) )
-            enddo
-          enddo
-        enddo
+        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                        V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                        W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
       enddo
-    enddo
 
-    c = a
-    do b = 1, nV
-      if (b == c) cycle
-      delta_abc = f_v(a) + f_v(b) + f_v(c)
-      call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
-      call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
-      do k = 1, nO
-        do j = 1, nO
-          do i = 1, nO
-            delta = 1.0d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
-            e = e + delta * ( &
-               (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
-               (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
-               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) )
-          enddo
-        enddo
-      enddo
+      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+
+      e = e + ccsd_t_task_aba(b,a,nO,nV,t1,T_oovv,T_voov,V_abc, &
+                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
+                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
     enddo
   enddo
   !$OMP END DO NOWAIT
@@ -178,6 +150,91 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 end
 
 
+double precision function ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,&
+      V_abc,V_acb,V_bac,V_bca,V_cab,V_cba,                           &
+      W_abc,W_acb,W_bac,W_bca,W_cab,W_cba,                           &
+      X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
+  implicit none
+  integer, intent(in)           :: nO,nV,a,b,c
+  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
+  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
+  double precision, intent(in)  :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
+  double precision, intent(in)  :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
+  double precision, intent(in)  :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
+
+  double precision :: delta, delta_abc
+  integer  :: i,j,k
+
+  delta_abc = f_v(a) + f_v(b) + f_v(c)
+  e = 0.d0
+
+  call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
+
+  call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
+
+  do k = 1, nO
+    do j = 1, nO
+      do i = 1, nO
+        delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+        e = e + delta * (                                    &
+            (4d0 * (W_abc(i,j,k) - W_cba(i,j,k)) +           &
+            W_bca(i,j,k) - W_bac(i,j,k)  +                   &
+            W_cab(i,j,k) - W_acb(i,j,k)  ) * (V_abc(i,j,k) - V_cba(i,j,k)) +&
+            (4d0 * (W_acb(i,j,k) - W_bca(i,j,k)) +           &
+            W_cba(i,j,k) - W_cab(i,j,k)  +                   &
+            W_bac(i,j,k) - W_abc(i,j,k)  ) * (V_acb(i,j,k) - V_bca(i,j,k)) +&
+            (4d0 * (W_bac(i,j,k) - W_cab(i,j,k)) +           &
+            W_acb(i,j,k) - W_abc(i,j,k)  +                   &
+            W_cba(i,j,k) - W_bca(i,j,k)  ) * (V_bac(i,j,k) - V_cab(i,j,k)) )
+      enddo
+    enddo
+  enddo
+
+end
+
+double precision function ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,&
+      V_abc,V_acb,V_bac,V_bca,V_cab,V_cba,                           &
+      W_abc,W_acb,W_bac,W_bca,W_cab,W_cba,                           &
+      X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
+  implicit none
+  integer, intent(in)           :: nO,nV,a,b
+  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
+  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
+  double precision, intent(in)  :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
+  double precision, intent(in)  :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
+  double precision, intent(in)  :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
+
+  double precision :: delta, delta_abc
+  integer  :: i,j,k
+
+  delta_abc = f_v(a) + f_v(b) + f_v(a)
+  e = 0.d0
+
+  call form_w_abc(nO,nV,a,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
+
+  call form_v_abc(nO,nV,a,b,a,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
+
+  do k = 1, nO
+    do j = 1, nO
+      do i = 1, nO
+        delta = 1.d0 / (f_o(i) + f_o(j) + f_o(k) - delta_abc)
+        e = e + delta * (                                    &
+               (4d0 * W_abc(i,j,k) + W_bca(i,j,k) + W_cab(i,j,k)) * (V_abc(i,j,k) - V_cba(i,j,k)) + &
+               (4d0 * W_acb(i,j,k) + W_cba(i,j,k) + W_bac(i,j,k)) * (V_acb(i,j,k) - V_bca(i,j,k)) + &
+               (4d0 * W_bac(i,j,k) + W_acb(i,j,k) + W_cba(i,j,k)) * (V_bac(i,j,k) - V_cab(i,j,k)) )
+
+      enddo
+    enddo
+  enddo
+
+end
+
 subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
 
   implicit none

From de07f73ed9da98850002c459015ffc9e1868ed16 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 16 May 2023 18:32:15 +0200
Subject: [PATCH 17/79] Semi-stochastic (T) OK

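---
Review notes (text between the three-dash marker and the diffstat is
dropped by git-am, so these notes do not become part of the commit):

 * binary_search is defined with three dummy arguments (arr, key, size),
   but the loop that fills bounds(:,:) calls it with five:
   binary_search(waccu,eta,Nabc,ileft,iright). Inside the function l_in
   and r_in are plain locals that are never referenced, and the BEGIN_DOC
   text still describes a search "between l_in and r_in". Because the
   function is used through an external declaration, the mismatch may go
   undiagnosed by the compiler.
 * "do while ((imin <= Nabc).and.(sampled(imin)>-1_8))": Fortran does not
   guarantee short-circuit evaluation of .and., so sampled(Nabc+1) can be
   referenced once imin moves past the end of the array.
 * The deterministic branch runs only when imin < Nabc, while the loop
   exits on imin >= Nabc, so the triplet stored at index Nabc is never
   computed by the deterministic branch -- looks like an off-by-one,
   please confirm.
 * "energy" is assigned only inside the once-per-second reporting branch
   (t01-t00 > 1.0d0). Nothing re-evaluates it after the last task has
   finished, so the value returned can be stale, or still the initial
   0.d0 when the whole run takes less than one second.
 * "if (imin >= Nabc) exit" is executed by every thread, but imin is
   updated only inside the MASTER block with no barrier or flush in
   between -- NOTE(review): possible data race, confirm.
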
 src/ccsd/ccsd_t_space_orb_abc.irp.f   | 101 ++++----
 src/ccsd/ccsd_t_space_orb_stoch.irp.f | 318 ++++++++++++++------------
 2 files changed, 224 insertions(+), 195 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_abc.irp.f b/src/ccsd/ccsd_t_space_orb_abc.irp.f
index 294296bf..1aab6bd7 100644
--- a/src/ccsd/ccsd_t_space_orb_abc.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_abc.irp.f
@@ -10,12 +10,6 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
   double precision, intent(out) :: energy
 
-  double precision, allocatable :: W(:,:,:,:,:,:)
-  double precision, allocatable :: V(:,:,:,:,:,:)
-  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
-  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
-  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
-  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
@@ -105,32 +99,22 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   double precision, external :: ccsd_t_task_aba
   double precision, external :: ccsd_t_task_abc
 
-  !$OMP PARALLEL                                                     &
-      !$OMP PRIVATE(a,b,c,e)                                         &
-      !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
-      !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
-      !$OMP DEFAULT(SHARED)
-  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
-            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
-            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
-            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
+  !$OMP PARALLEL PRIVATE(a,b,c,e) DEFAULT(SHARED)
   e = 0d0
   !$OMP DO SCHEDULE(dynamic)
   do a = 1, nV
     do b = a+1, nV
       do c = b+1, nV
-        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
-                        V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-                        W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov, &
+                        X_ooov,X_oovv,X_vovv,f_o,f_v)
       enddo
 
-      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
-                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
+      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov, &
+                      X_ooov,X_oovv,X_vovv,f_o,f_v)
+
+      e = e + ccsd_t_task_aba(b,a,nO,nV,t1,T_oovv,T_voov, &
+                      X_ooov,X_oovv,X_vovv,f_o,f_v)
 
-      e = e + ccsd_t_task_aba(b,a,nO,nV,t1,T_oovv,T_voov,V_abc, &
-                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
     enddo
   enddo
   !$OMP END DO NOWAIT
@@ -139,9 +123,6 @@ subroutine ccsd_par_t_space_v3(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
   energy = energy + e
   !$OMP END CRITICAL
 
-  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
-             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
-
   !$OMP END PARALLEL
 
   energy = energy / 3.d0
@@ -151,30 +132,34 @@ end
 
 
 double precision function ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,&
-      V_abc,V_acb,V_bac,V_bca,V_cab,V_cba,                           &
-      W_abc,W_acb,W_bac,W_bca,W_cab,W_cba,                           &
       X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
   implicit none
-  integer, intent(in)           :: nO,nV,a,b,c
-  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
-  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
-  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
-  double precision, intent(in)  :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
-  double precision, intent(in)  :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
-  double precision, intent(in)  :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
+  integer, intent(in)              :: nO,nV,a,b,c
+  double precision, intent(in)     :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)     :: X_oovv(nO,nO,nV,nV)
+  double precision, intent(in)     :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)     :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
 
   double precision :: delta, delta_abc
   integer  :: i,j,k
 
-  delta_abc = f_v(a) + f_v(b) + f_v(c)
-  e = 0.d0
+  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
+  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
+  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
+  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
+
+  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
+            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
+            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
+            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
 
   call form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
 
   call form_v_abc(nO,nV,a,b,c,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
 
+  delta_abc = f_v(a) + f_v(b) + f_v(c)
+  e = 0.d0
+
   do k = 1, nO
     do j = 1, nO
       do i = 1, nO
@@ -193,33 +178,40 @@ double precision function ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,&
     enddo
   enddo
 
+  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
+             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
+
 end
 
 double precision function ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,&
-      V_abc,V_acb,V_bac,V_bca,V_cab,V_cba,                           &
-      W_abc,W_acb,W_bac,W_bca,W_cab,W_cba,                           &
       X_ooov,X_oovv,X_vovv,f_o,f_v) result(e)
   implicit none
-  integer, intent(in)           :: nO,nV,a,b
-  double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
-  double precision, intent(in)  :: X_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
-  double precision, intent(in)  :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
-  double precision, intent(in)  :: W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO)
-  double precision, intent(in)  :: W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO)
-  double precision, intent(in)  :: V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO)
-  double precision, intent(in)  :: V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO)
+  integer, intent(in)              :: nO,nV,a,b
+  double precision, intent(in)     :: t1(nO,nV), f_o(nO), f_v(nV)
+  double precision, intent(in)     :: X_oovv(nO,nO,nV,nV)
+  double precision, intent(in)     :: T_voov(nV,nO,nO,nV), T_oovv(nO,nO,nV,nV)
+  double precision, intent(in)     :: X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV)
 
   double precision :: delta, delta_abc
   integer  :: i,j,k
 
-  delta_abc = f_v(a) + f_v(b) + f_v(a)
-  e = 0.d0
+  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
+  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
+  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
+  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
+
+  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
+            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
+            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
+            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
 
   call form_w_abc(nO,nV,a,b,a,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
 
   call form_v_abc(nO,nV,a,b,a,t1,X_oovv,W_abc,V_abc,W_cba,V_cba,W_bca,V_bca,W_cab,V_cab,W_bac,V_bac,W_acb,V_acb)
 
+  delta_abc = f_v(a) + f_v(b) + f_v(a)
+  e = 0.d0
+
   do k = 1, nO
     do j = 1, nO
       do i = 1, nO
@@ -233,6 +225,9 @@ double precision function ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,&
     enddo
   enddo
 
+  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
+             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
+
 end
 
 subroutine form_w_abc(nO,nV,a,b,c,T_voov,T_oovv,X_vovv,X_ooov,W_abc,W_cba,W_bca,W_cab,W_bac,W_acb)
diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index e8fae5cd..0081e9e7 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -1,5 +1,4 @@
 ! Main
-
 subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
 
   implicit none
@@ -10,12 +9,6 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
   double precision, intent(out) :: energy
 
-  double precision, allocatable :: W(:,:,:,:,:,:)
-  double precision, allocatable :: V(:,:,:,:,:,:)
-  double precision, allocatable :: W_abc(:,:,:), W_cab(:,:,:), W_bca(:,:,:)
-  double precision, allocatable :: W_bac(:,:,:), W_cba(:,:,:), W_acb(:,:,:)
-  double precision, allocatable :: V_abc(:,:,:), V_cab(:,:,:), V_bca(:,:,:)
-  double precision, allocatable :: V_bac(:,:,:), V_cba(:,:,:), V_acb(:,:,:)
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
@@ -104,33 +97,32 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
 
   double precision, external :: ccsd_t_task_aba
   double precision, external :: ccsd_t_task_abc
+!  logical, external :: omp_test_lock
 
   double precision, allocatable :: memo(:), Pabc(:), waccu(:)
-  logical         , allocatable :: computed(:)
+  integer*8, allocatable :: sampled(:)
+!  integer(omp_lock_kind), allocatable :: lock(:)
   integer*2       , allocatable :: abc(:,:)
   integer*8                     :: Nabc, i8
   integer*8, allocatable :: iorder(:)
   double precision :: eocc
-  double precision :: Pabc_norm, sum_w
+  double precision :: norm
+  integer :: kiter, isample
 
 
   ! Prepare table of triplets (a,b,c)
 
   Nabc = (int(nV,8) * int(nV+1,8) * int(nV+2,8))/6_8 - nV
-  allocate (memo(Nabc), computed(Nabc), Pabc(Nabc), waccu(0:Nabc))
-  allocate (abc(4,Nabc), iorder(Nabc))
+  allocate (memo(Nabc), sampled(Nabc), Pabc(Nabc), waccu(Nabc))
+  allocate (abc(4,Nabc), iorder(Nabc)) !, lock(Nabc))
 
 !  eocc = 3.d0/dble(nO) * sum(f_o(1:nO))
-  memo(:) = 0.d0
-  computed(:) = .False.
   Nabc = 0_8
   do a = 1, nV
     do b = a+1, nV
       do c = b+1, nV
         Nabc = Nabc + 1_8
-!        Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(c))*(f_v(a)*f_v(b)*f_v(c))**(1.d0/2.d0))
-!        Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(c))**2)
-        Pabc(Nabc) = 1.d0/(f_v(a) + f_v(b) + f_v(c))
+        Pabc(Nabc) = -1.d0/(f_v(a) + f_v(b) + f_v(c))
         abc(1,Nabc) = a
         abc(2,Nabc) = b
         abc(3,Nabc) = c
@@ -140,17 +132,13 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
       abc(1,Nabc) = a
       abc(2,Nabc) = b
       abc(3,Nabc) = a
-!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(a))*(f_v(a)*f_v(b)*f_v(a))**(1.d0/2.d0))
-!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(a))**2)
-      Pabc(Nabc) = 1.d0/(2.d0*f_v(a) + f_v(b))
+      Pabc(Nabc) = -1.d0/(2.d0*f_v(a) + f_v(b))
 
       Nabc = Nabc + 1_8
       abc(1,Nabc) = b
       abc(2,Nabc) = a
       abc(3,Nabc) = b
-!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(b))*(f_v(b)*f_v(a)*f_v(b))**(1.d0/2.d0))
-!      Pabc(Nabc) = 1.d0/((f_v(a) + f_v(b) + f_v(b))**2)
-      Pabc(Nabc) = 1.d0/(f_v(a) + 2.d0*f_v(b))
+      Pabc(Nabc) = -1.d0/(f_v(a) + 2.d0*f_v(b))
     enddo
   enddo
 
@@ -162,13 +150,13 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   call dsort_big(Pabc, iorder, Nabc)
 
   ! Normalize
-  Pabc_norm = 0.d0
+  norm = 0.d0
   do i8=Nabc,1,-1
-    Pabc_norm = Pabc_norm + Pabc(i8)
+    norm = norm + Pabc(i8)
   enddo
-  Pabc_norm = 1.d0/Pabc_norm
-  do i8=Nabc,1,-1
-    Pabc(i8) = Pabc(i8) * Pabc_norm
+  norm = 1.d0/norm
+  do i8=1,Nabc
+    Pabc(i8) = Pabc(i8) * norm
   enddo
 
   call i8set_order_big(abc, iorder, Nabc)
@@ -176,145 +164,191 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
 
   ! Cumulative distribution for sampling
   waccu(Nabc) = 0.d0
-  sum_w = 0.d0
   do i8=Nabc-1,1,-1
-   waccu(i8) = waccu(i8+1) - Pabc(i8)
+   waccu(i8) = waccu(i8+1) - Pabc(i8+1)
   enddo
   waccu(:) = waccu(:) + 1.d0
-  waccu(0) = 0.d0
 
-  Pabc(:) = 1.d0/Pabc(:) * (1.d0/3.d0)
+  logical :: converged, do_comp
+  double precision :: eta, variance, error, sample
+  double precision :: t00, t01
+  integer*8 :: ieta, Ncomputed
+  integer*8, external :: binary_search
 
-  logical :: converged
-  double precision :: ET, ET2, eta, variance, average, error, sample
-  integer*8 :: isample, ieta, Ncomputed
-  integer*8, external :: find_sample
+  integer :: nbuckets
+  nbuckets = 100
 
-  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
-            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
-            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
-            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
+  double precision, allocatable :: wsum(:)
+  allocate(wsum(nbuckets))
 
   converged = .False.
-  ET = 0.d0
-  ET2 = 0.d0
   Ncomputed = 0_8
-  isample = 0_8
 
-  average = 0.d0
+  energy = 0.d0
   variance = 0.d0
-  double precision :: t00, t01
-  call wall_time(t00)
-!  do ieta=1,Nabc
-  do while (.not.converged)
-    call random_number(eta)
-!    eta = eta/dble(1000)
-!    do k=0,1000-1
-!    ieta = find_sample(eta+dble(k)/dble(1000),waccu,Nabc)
-    ieta = find_sample(eta,waccu,Nabc)
-    isample = isample+1_8
+  memo(:) = 0.d0
+  sampled(:) = -1_8
 
-    if (.not.computed(ieta)) then
+  integer*8 :: ileft, iright, imin
+  ileft = 1_8
+  iright = Nabc
+  integer*8, allocatable :: bounds(:,:)
+
+  allocate (bounds(2,nbuckets))
+  do isample=1,nbuckets
+    eta = 1.d0/dble(nbuckets) * dble(isample)
+    ieta = binary_search(waccu,eta,Nabc,ileft,iright)
+    bounds(1,isample) = ileft
+    bounds(2,isample) = ieta
+    ileft = ieta+1
+    wsum(isample) = sum( Pabc(bounds(1,isample):bounds(2,isample) ) )
+  enddo
+
+  Pabc(:) = 1.d0/Pabc(:)
+
+  call wall_time(t00)
+  imin = 1_8
+  !$OMP PARALLEL                                                     &
+      !$OMP PRIVATE(ieta,eta,a,b,c,kiter,isample)                    &
+      !$OMP DEFAULT(SHARED)
+
+  do kiter=1,Nabc
+
+    !$OMP MASTER
+    do while ((imin <= Nabc).and.(sampled(imin)>-1_8))
+      imin = imin+1
+    enddo
+
+    ! Deterministic part
+    if (imin < Nabc) then
+      ieta=imin
+      sampled(ieta) = 0_8
       a = abc(1,ieta)
       b = abc(2,ieta)
       c = abc(3,ieta)
-      if (a/=c) then
-         memo(ieta) = ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
-                         V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-                         W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
-      else
-         memo(ieta) =  ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
-                       V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-                       W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
-      endif
-      computed(ieta) = .True.
       Ncomputed += 1_8
-      call wall_time(t01)
-      if (t01-t00 > 1.d0) then
-        t00 = t01
-        print *, average, dsqrt(variance/dble(isample)), real(Ncomputed)/real(Nabc), real(isample)/real(Nabc)
+      !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(a,b,c,ieta)
+      if (a/=c) then
+        memo(ieta) = ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov, &
+            X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
+      else
+        memo(ieta) =  ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,  &
+            X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
       endif
-!       print *, memo(ieta), Pabc(ieta), memo(ieta) * Pabc(ieta)
+      !$OMP END TASK
     endif
-    sample = memo(ieta) * Pabc(ieta)
-    ET = ET + sample
-    ET2 = ET2 + sample*sample
-    average  = ET/dble(isample)
-    variance = ET2/dble(isample) - average*average
-    converged = (Ncomputed >= (Nabc*90_8)/100_8) .or. (isample>=1000*Nabc)
-!    enddo
-  enddo
-        print *, average, dsqrt(variance/dble(isample)), real(Ncomputed)/real(Nabc), real(isample)/real(Nabc)
-  energy = average
 
-!  !$OMP PARALLEL                                                     &
-!      !$OMP PRIVATE(a,b,c,e)                                         &
-!      !$OMP PRIVATE(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb,        &
-!      !$OMP         V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )       &
-!      !$OMP DEFAULT(SHARED)
-!  allocate( W_abc(nO,nO,nO), W_cab(nO,nO,nO), W_bca(nO,nO,nO), &
-!            W_bac(nO,nO,nO), W_cba(nO,nO,nO), W_acb(nO,nO,nO), &
-!            V_abc(nO,nO,nO), V_cab(nO,nO,nO), V_bca(nO,nO,nO), &
-!            V_bac(nO,nO,nO), V_cba(nO,nO,nO), V_acb(nO,nO,nO) )
-!  e = 0d0
-!  !$OMP DO SCHEDULE(dynamic)
-!  do a = 1, nV
-!    do b = a+1, nV
-!      do c = b+1, nV
-!        e = e + ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov,V_abc, &
-!                        V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-!                        W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
-!      enddo
-!    enddo
-!
-!    do b = 1, nV
-!      if (b == a) cycle
-!      e = e + ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,V_abc, &
-!                      V_acb,V_bac,V_bca,V_cab,V_cba,W_abc,W_acb,W_bac, &
-!                      W_bca,W_cab,W_cba,X_ooov,X_oovv,X_vovv,f_o,f_v)
-!    enddo
-!  enddo
-!  !$OMP END DO NOWAIT
-!
-!  !$OMP CRITICAL
-!  energy = energy + e
-!  !$OMP END CRITICAL
-!
-!  deallocate(W_abc, W_cab, W_bca, W_bac, W_cba, W_acb, &
-!             V_abc, V_cab, V_bca, V_bac, V_cba, V_acb )
-!
-!  !$OMP END PARALLEL
+    ! Stochastic part
+    call random_number(eta)
+    do isample=1,nbuckets
+      if (imin >= bounds(2,isample)) then
+        cycle
+      endif
+      ieta = binary_search(waccu,(eta + dble(isample-1))/dble(nbuckets),Nabc)
+
+      if (sampled(ieta) == -1_8) then
+        sampled(ieta) = 0_8
+        a = abc(1,ieta)
+        b = abc(2,ieta)
+        c = abc(3,ieta)
+        Ncomputed += 1_8
+        !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(a,b,c,ieta)
+        if (a/=c) then
+          memo(ieta) = ccsd_t_task_abc(a,b,c,nO,nV,t1,T_oovv,T_voov, &
+              X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
+        else
+          memo(ieta) =  ccsd_t_task_aba(a,b,nO,nV,t1,T_oovv,T_voov,  &
+              X_ooov,X_oovv,X_vovv,f_o,f_v) / 3.d0
+        endif
+        !$OMP END TASK
+      endif
+      sampled(ieta) = sampled(ieta)+1_8
+
+    enddo
+
+    call wall_time(t01)
+    if (t01-t00 > 1.0d0) then
+      t00 = t01
+
+      !$OMP TASKWAIT
+
+      double precision :: ET, ET2
+      double precision :: energy_stoch, energy_det
+      double precision :: scale
+      double precision :: w
+      double precision :: tmp
+      energy_stoch = 0.d0
+      energy_det   = 0.d0
+      norm = 0.d0
+      scale = 1.d0
+      ET = 0.d0
+      ET2 = 0.d0
+
+
+      do isample=1,nbuckets
+        if (imin >= bounds(2,isample)) then
+          energy_det = energy_det + sum(memo(bounds(1,isample):bounds(2,isample)))
+          scale = scale - wsum(isample)
+        else
+          exit
+        endif
+      enddo
+
+      do ieta=bounds(1,isample), Nabc
+          w = dble(max(sampled(ieta),0_8))
+          tmp = w * memo(ieta) * Pabc(ieta)
+          ET = ET + tmp
+          ET2 = ET2 + tmp * memo(ieta) * Pabc(ieta)
+          norm = norm + w
+      enddo
+      norm = norm/scale
+      if (norm > 0.d0) then
+        energy_stoch = ET / norm
+        variance = ET2 / norm - energy_stoch*energy_stoch
+      endif
+
+      energy = energy_det + energy_stoch
+
+      print *, real(energy), ' +/- ', real(sqrt(variance/(norm-1.d0))), isample, real(Ncomputed)/real(Nabc)
+    endif
+    !$OMP END MASTER
+    if (imin >= Nabc) exit
+  enddo
+
+  !$OMP END PARALLEL
 
   deallocate(X_vovv,X_ooov,T_voov,T_oovv)
 end
 
 
-integer*8 function find_sample(v, w, n)
-  implicit none
-  BEGIN_DOC
-! Finds sample v in weights w
-  END_DOC
-  integer*8, intent(in) :: n
-  double precision, intent(in) :: v, w(0:n)
-  integer*8 :: i,l,r
 
-  l=0
-  r=n
+integer*8 function binary_search(arr, key, size)
+    implicit none
+    BEGIN_DOC
+! Searches the key in array arr(1:size) between l_in and r_in, and returns its index
+    END_DOC
+    integer*8 :: size, i, j, mid, l_in, r_in
+    double precision, dimension(size) :: arr(1:size)
+    double precision :: key
 
-  do while(r-l > 1)
-    i = shiftr(r+l,1)
-    if(w(i) < v) then
-      l = i
-    else
-      r = i
-    end if
-  end do
-  i = r
-  do r=i+1,n
-    if (w(r) /= w(i)) then
-      exit
-    endif
-  enddo
-  find_sample = r-1
-end function
+    i = 1_8
+    j = size
+
+    do while (j >= i)
+        mid = i + (j - i) / 2
+        if (arr(mid) >= key) then
+            if (mid > 1 .and. arr(mid - 1) < key) then
+                binary_search = mid
+                return
+            end if
+            j = mid - 1
+        else if (arr(mid) < key) then
+            i = mid + 1
+        else
+            binary_search = mid + 1
+            return
+        end if
+    end do
+    binary_search = i
+end function binary_search
 

From ee790fa1d82e94724cbc34cb5c4c802ca001d2b4 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 16 May 2023 19:54:30 +0200
Subject: [PATCH 18/79] Formatted output in (T)

---
 src/ccsd/ccsd_space_orb_sub.irp.f     | 289 +++++++++++++-------------
 src/ccsd/ccsd_t_space_orb_stoch.irp.f |  15 +-
 2 files changed, 158 insertions(+), 146 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 29ecca1c..287d5b03 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -1,5 +1,5 @@
 subroutine run_ccsd_space_orb
-  
+
   implicit none
 
   integer :: i,j,k,l,a,b,c,d,tmp_a,tmp_b,tmp_c,tmp_d
@@ -12,12 +12,12 @@ subroutine run_ccsd_space_orb
   double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:)
   double precision, allocatable :: t1(:,:), r1(:,:)
   double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
-  
+
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
   integer(bit_kind)             :: det(N_int,2)
   integer                       :: nO, nV, nOa, nOb, nVa, nVb, n_spin(4)
-  
+
   PROVIDE mo_two_e_integrals_in_map
 
   det = psi_det(:,:,cc_ref)
@@ -35,11 +35,11 @@ subroutine run_ccsd_space_orb
   if (cc_ref_is_open_shell) then
     call abort
   endif
-  
+
   ! Number of occ/vir spatial orb
   nO = nOa
   nV = nVa
-  
+
   allocate(list_occ(nO),list_vir(nV))
   list_occ = cc_list_occ
   list_vir = cc_list_vir
@@ -47,7 +47,7 @@ subroutine run_ccsd_space_orb
   !call extract_list_orb_space(det,nO,nV,list_occ,list_vir)
   !print*,'occ',list_occ
   !print*,'vir',list_vir
-  
+
   allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
   allocate(tau(nO,nO,nV,nV))
   allocate(t1(nO,nV), r1(nO,nV))
@@ -76,7 +76,7 @@ subroutine run_ccsd_space_orb
   print*,'Det energy', uncorr_energy
   call ccsd_energy_space(nO,nV,tau,t1,energy)
   print*,'Guess energy', uncorr_energy+energy, energy
-  
+
   nb_iter = 0
   not_converged = .True.
   max_r1 = 0d0
@@ -86,9 +86,9 @@ subroutine run_ccsd_space_orb
   write(*,'(A77)') ' |   It.  |       E(CCSD) (Ha) | Correlation (Ha) |  Conv. T1  |  Conv. T2  |'
   write(*,'(A77)') ' -----------------------------------------------------------------------------'
   call wall_time(ta)
-  
+
   do while (not_converged)
-    
+
     call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
     call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
     call compute_H_vo(nO,nV,t1,t2,H_vo)
@@ -97,7 +97,7 @@ subroutine run_ccsd_space_orb
     call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     max_r = max(max_r1,max_r2)
-    
+
     ! Update
     if (cc_update_method == 'diis') then
       !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
@@ -122,7 +122,7 @@ subroutine run_ccsd_space_orb
     if (max_r < cc_thresh_conv .or. nb_iter > cc_max_iter) then
       not_converged = .False.
     endif
-    
+
   enddo
   write(*,'(A77)') ' -----------------------------------------------------------------------------'
   call wall_time(tb)
@@ -141,18 +141,18 @@ subroutine run_ccsd_space_orb
 
   call write_t1(nO,nV,t1)
   call write_t2(nO,nV,t2)
-  
+
   ! Deallocation
   if (cc_update_method == 'diis') then
     deallocate(all_err,all_t)
   endif
 
   deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
-  
+
   ! CCSD(T)
   double precision :: e_t
 
-  if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then 
+  if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
 
     ! Dumb way
     !call wall_time(ta)
@@ -171,8 +171,11 @@ subroutine run_ccsd_space_orb
     call wall_time(ta)
 !    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
 !         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
+
+    e_t = uncorr_energy + energy ! For print in next call
     call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
          ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
+
     call wall_time(tb)
     print*,'Time: ',tb-ta, ' s'
 
@@ -182,7 +185,7 @@ subroutine run_ccsd_space_orb
     write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
     print*,''
   endif
-  
+
   print*,'Reference determinant:'
   call print_det(det,N_int)
 
@@ -234,7 +237,7 @@ subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
   energy = energy + e
   !$omp end critical
   !$omp end parallel
-  
+
 end
 
 ! Tau
@@ -252,12 +255,12 @@ subroutine update_tau_space(nO,nV,t1,t2,tau)
 
   ! internal
   integer                       :: i,j,a,b
-  
+
   !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,tau,t2,t1) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  !$OMP DO 
+  !$OMP DO
   do b = 1, nV
     do a = 1, nV
       do j = 1, nO
@@ -269,7 +272,7 @@ subroutine update_tau_space(nO,nV,t1,t2,tau)
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
-    
+
 end
 
 ! R1
@@ -285,7 +288,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
 
   ! out
   double precision, intent(out) :: r1(nO,nV), max_r1
-  
+
   ! internal
   integer                       :: u,i,j,beta,a,b
 
@@ -306,7 +309,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   ! cc_space_f_vo(a,i) * t1(i,beta) -> X1(nV,nV), O(nV*nV*nO)
   ! X1(a,beta) * t1(u,a) -> O(nO*nV*nV)
   ! cc_space_f_vo(a,i) * t1(u,a)    -> X1(nO,nO), O(nO*nO*nV)
-  ! X1(i,u) * t1(i,beta) -> O(nO*nO*nV)  
+  ! X1(i,u) * t1(i,beta) -> O(nO*nO*nV)
   !do beta = 1, nV
   !  do u = 1, nO
   !    do i = 1, nO
@@ -326,7 +329,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   call dgemm('T','N', nO, nV, nO, &
              1d0, X_oo, size(X_oo,2), &
                   t1  , size(t1,1), &
-             1d0, r1  , size(r1,1)) 
+             1d0, r1  , size(r1,1))
   deallocate(X_oo)
 
   ! r1(u,beta) = r1(u,beta) + H_vv(a,beta) * t1(u,a)
@@ -375,7 +378,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,X_voov,t2,t1) &
   !$omp private(u,beta,i,a) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do beta = 1, nV
     do u = 1, nO
       do i = 1, nO
@@ -387,16 +390,16 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   call dgemv('T', nV*nO, nO*nV, &
              1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
                   H_vo  , 1, &
              1d0, r1    , 1)
-  
+
   deallocate(X_voov)
 
   ! r1(u,beta) = r1(u,beta) + (2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)) * t1(i,a)
-  ! <=> 
+  ! <=>
   ! r1(u,beta) = r1(u,beta) + X(i,a,u,beta)
   !do beta = 1, nV
   !  do u = 1, nO
@@ -414,7 +417,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
   !$omp private(u,beta,i,a) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do beta = 1, nV
     do u = 1, nO
       do a = 1, nv
@@ -431,17 +434,17 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
              1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
                   t1     , 1, &
              1d0, r1     , 1)
-  
+
   deallocate(X_ovov)
 
-  ! r1(u,beta) = r1(u,beta) + (2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta)) * tau(i,u,a,b)  
-  ! r1(u,beta) = r1(u,beta) + W(a,b,i,beta) * T(u,a,b,i) 
+  ! r1(u,beta) = r1(u,beta) + (2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta)) * tau(i,u,a,b)
+  ! r1(u,beta) = r1(u,beta) + W(a,b,i,beta) * T(u,a,b,i)
   !do beta = 1, nV
   !  do u = 1, nO
   !    do i = 1, nO
   !      do a = 1, nV
   !        do b = 1, nV
-  !          r1(u,beta) = r1(u,beta) + (2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta)) * tau(i,u,a,b)  
+  !          r1(u,beta) = r1(u,beta) + (2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta)) * tau(i,u,a,b)
   !        enddo
   !      enddo
   !    enddo
@@ -454,24 +457,24 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,cc_space_v_vvov,W_vvov,T_vvoo,tau) &
   !$omp private(b,beta,i,a) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do beta = 1, nV
     do i = 1, nO
       do b = 1, nV
         do a = 1, nV
-          W_vvov(a,b,i,beta) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta)  
+          W_vvov(a,b,i,beta) = 2d0 * cc_space_v_vvov(a,b,i,beta) - cc_space_v_vvov(b,a,i,beta)
         enddo
       enddo
     enddo
   enddo
   !$omp end do nowait
 
-  !$omp do 
+  !$omp do
   do u = 1, nO
     do i = 1, nO
       do b = 1, nV
         do a = 1, nV
-          T_vvoo(a,b,i,u) = tau(i,u,a,b)  
+          T_vvoo(a,b,i,u) = tau(i,u,a,b)
         enddo
       enddo
     enddo
@@ -483,17 +486,17 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
              1d0, T_vvoo, size(T_vvoo,1) * size(T_vvoo,2) * size(T_vvoo,3), &
                   W_vvov, size(W_vvov,1) * size(W_vvov,2) * size(W_vvov,3), &
              1d0, r1    , size(r1,1))
-  
+
   deallocate(W_vvov,T_vvoo)
 
-  ! r1(u,beta) = r1(u,beta) - (2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)) * tau(i,j,a,beta) 
-  ! r1(u,beta) = r1(u,beta) - W(i,j,a,u) * tau(i,j,a,beta) 
+  ! r1(u,beta) = r1(u,beta) - (2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)) * tau(i,j,a,beta)
+  ! r1(u,beta) = r1(u,beta) - W(i,j,a,u) * tau(i,j,a,beta)
   !do beta = 1, nV
   !  do u = 1, nO
   !    do i = 1, nO
   !      do j = 1, nO
   !        do a = 1, nV
-  !          r1(u,beta) = r1(u,beta) - (2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)) * tau(i,j,a,beta) 
+  !          r1(u,beta) = r1(u,beta) - (2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)) * tau(i,j,a,beta)
   !        enddo
   !      enddo
   !    enddo
@@ -507,7 +510,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp private(u,a,i,j) &
   !$omp default(none)
   do u = 1, nO
-    !$omp do 
+    !$omp do
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
@@ -523,7 +526,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
              -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
                    tau   , size(tau,1) * size(tau,2) * size(tau,3), &
               1d0, r1    , size(r1,1))
-  
+
   deallocate(W_oovo)
 
   max_r1 = 0d0
@@ -538,7 +541,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp shared(nO,nV,r1) &
   !$omp private(a,i) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do a = 1, nV
     do i = 1, nO
       r1(i,a) = -r1(i,a)
@@ -546,7 +549,7 @@ subroutine compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
 end
 
 ! H_oo
@@ -578,7 +581,7 @@ subroutine compute_H_oo(nO,nV,t1,t2,tau,H_oo)
   !        enddo
   !      enddo
   !    enddo
-  !    
+  !
   !  enddo
   !enddo
 
@@ -601,8 +604,8 @@ subroutine compute_H_oo(nO,nV,t1,t2,tau,H_oo)
   call dgemm('N','T', nO, nO, nO*nV*nV,       &
              1d0, tau     , size(tau,1),      &
                   cc_space_w_oovv, size(cc_space_w_oovv,1), &
-             1d0, H_oo    , size(H_oo,1))     
-  
+             1d0, H_oo    , size(H_oo,1))
+
 end
 
 ! H_vv
@@ -633,7 +636,7 @@ subroutine compute_H_vv(nO,nV,t1,t2,tau,H_vv)
   !        enddo
   !      enddo
   !    enddo
-  !    
+  !
   !  enddo
   !enddo
 
@@ -656,13 +659,13 @@ subroutine compute_H_vv(nO,nV,t1,t2,tau,H_vv)
 
   ! H_vv(a,beta) = H_vv(a,beta) - cc_space_w_vvoo(a,b,i,j) * tau(i,j,beta,b)
   ! H_vv(a,beta) = H_vv(a,beta) - cc_space_w_vvoo(a,b,i,j) * tmp_tau(b,i,j,beta)
-  
-  !$omp do 
+
+  !$omp do
   do beta = 1, nV
     do j = 1, nO
       do i = 1, nO
         do b = 1, nV
-          tmp_tau(b,i,j,beta) = tau(i,j,beta,b) 
+          tmp_tau(b,i,j,beta) = tau(i,j,beta,b)
         enddo
       enddo
     enddo
@@ -676,7 +679,7 @@ subroutine compute_H_vv(nO,nV,t1,t2,tau,H_vv)
               1d0, H_vv    , size(H_vv,1))
 
   deallocate(tmp_tau)
-  
+
 end
 
 ! H_vo
@@ -704,7 +707,7 @@ subroutine compute_H_vo(nO,nV,t1,t2,H_vo)
   !        H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b)
   !      enddo
   !    enddo
-  !    
+  !
   !  enddo
   !enddo
 
@@ -727,7 +730,7 @@ subroutine compute_H_vo(nO,nV,t1,t2,H_vo)
   ! H_vo(a,i) = H_vo(a,i) + cc_space_w_vvoo(a,b,i,j) * t1(j,b)
   ! H_vo(a,i) = H_vo(a,i) + w(a,i,j,b) * t1(j,b)
 
-  !$omp do 
+  !$omp do
   do b = 1, nV
     do j = 1, nO
       do i = 1, nO
@@ -746,7 +749,7 @@ subroutine compute_H_vo(nO,nV,t1,t2,H_vo)
              1d0, H_vo, 1)
 
   deallocate(w)
-  
+
 end
 
 ! R2
@@ -771,7 +774,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   allocate(g_occ(nO,nO), g_vir(nV,nV))
   allocate(J1(nO,nV,nV,nO), K1(nO,nV,nO,nV))
   allocate(A1(nO,nO,nO,nO))
-  
+
   call compute_g_occ(nO,nV,t1,t2,H_oo,g_occ)
   call compute_g_vir(nO,nV,t1,t2,H_vv,g_vir)
   call compute_A1(nO,nV,t1,t2,tau,A1)
@@ -787,7 +790,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,cc_space_v_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -863,7 +866,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,t2,X_oovv) &
   !$omp private(u,v,gam,a) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do a = 1, nV
     do gam = 1, nV
       do v = 1, nO
@@ -875,7 +878,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   call dgemm('N','N',nO*nO*nV,nV,nV, &
              1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), &
                   g_vir, size(g_vir,1), &
@@ -885,7 +888,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Y_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -921,7 +924,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -957,7 +960,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,X_vovv,cc_space_v_ovvv) &
   !$omp private(u,a,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do u = 1, nO
@@ -979,7 +982,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,Y_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -991,7 +994,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   !do gam = 1, nV
   !  do beta = 1, nV
   !    do v = 1, nO
@@ -1009,13 +1012,13 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !enddo
   double precision, allocatable :: X_vovo(:,:,:,:), Y_vovv(:,:,:,:)
   allocate(X_vovo(nV,nO,nV,nO), Y_vovv(nV,nO,nV,nV),X_oovv(nO,nO,nV,nV))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,X_vovo,cc_space_v_ovov) &
   !$omp private(u,v,gam,i) &
   !$omp default(none)
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do gam = 1, nV
       do u = 1, nO
         do a = 1, nV
@@ -1036,12 +1039,12 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
              1d0, t1, size(t1,1), &
                   Y_vovv, size(Y_vovv,1), &
              0d0, X_oovv, size(X_oovv,1))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1055,7 +1058,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp end parallel
 
   deallocate(X_vovo,Y_vovv)
-  
+
   !do gam = 1, nV
   !  do beta = 1, nV
   !    do v = 1, nO
@@ -1079,7 +1082,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1092,7 +1095,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp end do
   !$omp end parallel
 
-  
+
   !do gam = 1, nV
   !  do beta = 1, nV
   !    do v = 1, nO
@@ -1111,13 +1114,13 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
 
   double precision, allocatable :: Y_oovo(:,:,:,:)
   allocate(X_vovo(nV,nO,nV,nO), Y_oovo(nO,nO,nV,nO))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) &
   !$omp private(a,v,gam,i) &
   !$omp default(none)
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do gam = 1, nV
       do v = 1, nO
         do a = 1, nV
@@ -1138,12 +1141,12 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
              1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), &
                   t1    , size(t1,1), &
              0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,r2,X_oovv) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1155,7 +1158,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   deallocate(X_vovo,Y_oovo)
 
   !do gam = 1, nV
@@ -1183,7 +1186,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp private(u,v,gam,beta,i,a) &
   !$omp default(none)
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do a = 1, nV
       do beta = 1, nV
         do u = 1, nO
@@ -1194,7 +1197,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     !$omp end do nowait
   enddo
 
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do v = 1, nO
       do i = 1, nO
@@ -1206,17 +1209,17 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   call dgemm('N','N', nO*nV,nO*nV,nV*nO, &
              1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), &
                   Y_voov, size(Y_voov,1) * size(Y_voov,2), &
              0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1228,9 +1231,9 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   deallocate(X_ovvo,Y_voov)
-  
+
   !do gam = 1, nV
   !  do beta = 1, nV
   !    do v = 1, nO
@@ -1252,7 +1255,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) &
   !$omp private(u,a,i,beta,gam) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do beta = 1, nV
     do u = 1, nO
       do a = 1, nV
@@ -1264,7 +1267,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do nowait
 
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do v = 1, nO
       do a = 1, nV
@@ -1281,12 +1284,12 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
              1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
                   Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
              0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do v = 1, nO
@@ -1298,7 +1301,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   !do gam = 1, nV
   !  do beta = 1, nV
   !    do v = 1, nO
@@ -1343,12 +1346,12 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   call dgemm('N','N',nO*nV,nO*nV,nO*nV, &
              1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
                   Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
              0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
@@ -1367,7 +1370,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp end parallel
 
   deallocate(X_ovov,Y_ovov,Z_ovov)
-  
+
   ! Change the sign for consistency with the code in spin orbitals
   !$omp parallel &
   !$omp shared(nO,nV,r2) &
@@ -1385,7 +1388,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   max_r2 = 0d0
   do b = 1, nV
     do a = 1, nV
@@ -1398,7 +1401,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   enddo
 
   deallocate(g_occ,g_vir,J1,K1,A1)
-  
+
 end
 
 ! A1
@@ -1427,12 +1430,12 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
   !          A1(u,v,i,j) = A1(u,v,i,j) &
   !          + cc_space_v_ovoo(u,a,i,j) * t1(v,a) &
   !          + cc_space_v_vooo(a,v,i,j) * t1(u,a)
-  !          
+  !
   !          do b = 1, nV
   !            A1(u,v,i,j) = A1(u,v,i,j) + cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b)
-  !          enddo  
+  !          enddo
   !        enddo
-  !        
+  !
   !      enddo
   !    enddo
   !  enddo
@@ -1440,7 +1443,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
 
   double precision, allocatable :: X_vooo(:,:,:,:), Y_oooo(:,:,:,:)
   allocate(X_vooo(nV,nO,nO,nO), Y_oooo(nO,nO,nO,nO))
-  
+
   ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j)
   !$omp parallel &
   !$omp shared(nO,nV,A1,cc_space_v_oooo,cc_space_v_ovoo,X_vooo) &
@@ -1494,7 +1497,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   deallocate(X_vooo,Y_oooo)
 
   ! A1(u,v,i,j) += cc_space_v_vooo(a,v,i,j) * t1(u,a)
@@ -1508,7 +1511,7 @@ subroutine compute_A1(nO,nV,t1,t2,tau,A1)
              1d0, tau     , size(tau,1) * size(tau,2), &
                   cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), &
              1d0, A1      , size(A1,1) * size(A1,2))
-   
+
 end
 
 ! B1
@@ -1530,28 +1533,28 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
   !  do beta = 1, nV
   !    do b = 1, nV
   !      do a = 1, nV
-  !        B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) 
+  !        B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam)
 
   !        do i = 1, nO
   !          B1(a,b,beta,gam) = B1(a,b,beta,gam) &
   !          - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
   !          - cc_space_v_vvov(a,b,i,gam) * t1(i,beta)
   !        enddo
-  !        
+  !
   !      enddo
   !    enddo
   !  enddo
   !enddo
-  
+
   double precision, allocatable :: X_vvvo(:,:,:,:), Y_vvvv(:,:,:,:)
   allocate(X_vvvo(nV,nV,nV,nO), Y_vvvv(nV,nV,nV,nV))
 
-  ! B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam) 
+  ! B1(a,b,beta,gam) = cc_space_v_vvvv(a,b,beta,gam)
   !$omp parallel &
   !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo) &
   !$omp private(a,b,beta,gam) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do b = 1, nV
@@ -1563,7 +1566,7 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
   enddo
   !$omp end do nowait
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do gam = 1, nV
       do b = 1, nV
         do a = 1, nV
@@ -1574,14 +1577,14 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
     !$omp end do nowait
   enddo
   !$omp end parallel
-  
+
   ! B1(a,b,beta,gam) -= cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
   call dgemm('N','N', nV*nV*nV, nV, nO, &
              -1d0, cc_space_v_vvvo, size(cc_space_v_vvvo,1) * size(cc_space_v_vvvo,2) * size(cc_space_v_vvvo,3), &
                    t1      , size(t1,1), &
               1d0, B1      , size(B1,1) * size(B1,2) * size(B1,3))
 
-  
+
   ! B1(a,b,beta,gam) -= cc_space_v_vvov(a,b,i,gam) * t1(i,beta)
   call dgemm('N','N', nV*nV*nV, nV, nO, &
              -1d0, X_vvvo, size(X_vvvo,1) * size(X_vvvo,2) * size(X_vvvo,3), &
@@ -1592,7 +1595,7 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
   !$omp shared(nV,B1,Y_vvvv) &
   !$omp private(a,b,beta,gam) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do gam = 1, nV
     do beta = 1, nV
       do b = 1, nV
@@ -1604,9 +1607,9 @@ subroutine compute_B1(nO,nV,t1,t2,B1)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   deallocate(X_vvvo,Y_vvvv)
-  
+
 end
 
 ! g_occ
@@ -1627,14 +1630,14 @@ subroutine compute_g_occ(nO,nV,t1,t2,H_oo,g_occ)
   !do i = 1, nO
   !  do u = 1, nO
   !    g_occ(u,i) = H_oo(u,i)
-  !    
+  !
   !    do a = 1, nV
   !      g_occ(u,i) = g_occ(u,i) + cc_space_f_vo(a,i) * t1(u,a)
-  !      
+  !
   !      do j = 1, nO
   !        g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a)
   !      enddo
-  !         
+  !
   !    enddo
   !  enddo
   !enddo
@@ -1655,8 +1658,8 @@ subroutine compute_g_occ(nO,nV,t1,t2,H_oo,g_occ)
     enddo
   enddo
   !$omp end do
-  
-  !$omp do 
+
+  !$omp do
   do i = 1, nO
     do j = 1, nO
       do a = 1, nV
@@ -1668,7 +1671,7 @@ subroutine compute_g_occ(nO,nV,t1,t2,H_oo,g_occ)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
 end
 
 ! g_vir
@@ -1689,23 +1692,23 @@ subroutine compute_g_vir(nO,nV,t1,t2,H_vv,g_vir)
   !do beta = 1, nV
   !  do a = 1, nV
   !    g_vir(a,beta) = H_vv(a,beta)
-  !    
+  !
   !    do i = 1, nO
   !      g_vir(a,beta) = g_vir(a,beta) - cc_space_f_vo(a,i) * t1(i,beta)
-  !      
+  !
   !      do b = 1, nV
   !        g_vir(a,beta) = g_vir(a,beta) + (2d0 * cc_space_v_vvvo(a,b,beta,i) - cc_space_v_vvvo(b,a,beta,i)) * t1(i,b)
   !      enddo
-  !         
+  !
   !    enddo
   !  enddo
   !enddo
-  
+
   call dgemm('N','N',nV,nV,nO, &
              -1d0, cc_space_f_vo , size(cc_space_f_vo,1), &
                    t1   , size(t1,1), &
               0d0, g_vir, size(g_vir,1))
-      
+
   !$omp parallel &
   !$omp shared(nO,nV,g_vir,H_vv, cc_space_v_vvvo,t1) &
   !$omp private(i,b,a,beta) &
@@ -1718,7 +1721,7 @@ subroutine compute_g_vir(nO,nV,t1,t2,H_vv,g_vir)
   enddo
   !$omp end do
 
-  !$omp do 
+  !$omp do
   do beta = 1, nV
     do i = 1, nO
       do b = 1, nV
@@ -1730,7 +1733,7 @@ subroutine compute_g_vir(nO,nV,t1,t2,H_vv,g_vir)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
 end
 
 ! J1
@@ -1763,7 +1766,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
 
   !        do b = 1, nV
   !          J1(u,a,beta,i) = J1(u,a,beta,i) &
-  !          + cc_space_v_vvvo(b,a,beta,i) * t1(u,b)    
+  !          + cc_space_v_vvvo(b,a,beta,i) * t1(u,b)
   !        enddo
 
   !        do j = 1, nO
@@ -1773,7 +1776,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !           + 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b)
   !          enddo
   !        enddo
-  !        
+  !
   !      enddo
   !    enddo
   !  enddo
@@ -1781,13 +1784,13 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
 
   double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:)
   allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) &
   !$omp private(i,j,a,u,beta) &
   !$omp default(none)
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1810,7 +1813,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   enddo
   !$omp end do
   !$omp end parallel
-  
+
   call dgemm('N','N',nO*nV*nO,nV,nO, &
             -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), &
                   t1    , size(t1,1), &
@@ -1821,7 +1824,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp private(i,beta,a,u) &
   !$omp default(none)
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1847,7 +1850,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) &
   !$omp private(i,beta,a,u,b,j) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do b = 1, nV
     do j = 1, nO
       do beta = 1, nV
@@ -1859,7 +1862,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   enddo
   !$omp end do nowait
 
-  !$omp do 
+  !$omp do
   do b = 1, nV
     do j = 1, nO
       do i = 1, nO
@@ -1885,7 +1888,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   !$omp private(i,beta,a,u,j,b) &
   !$omp default(none)
   do i = 1, nO
-    !$omp do 
+    !$omp do
     do beta = 1, nV
       do a = 1, nV
         do u = 1, nO
@@ -1895,10 +1898,10 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
     enddo
     !$omp end do nowait
   enddo
-  
+
   !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b)
   do j = 1, nO
-    !$omp do 
+    !$omp do
     do b = 1, nV
       do i = 1, nO
         do a = 1, nV
@@ -1908,9 +1911,9 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
     enddo
     !$omp end do nowait
   enddo
-  
+
   do j = 1, nO
-    !$omp do 
+    !$omp do
     do b = 1, nV
       do beta = 1, nV
         do u = 1, nO
@@ -1921,7 +1924,7 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
     !$omp end do nowait
   enddo
   !$omp end parallel
-  
+
   call dgemm('N','T',nO*nV,nV*nO,nV*nO, &
              1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), &
                   Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), &
@@ -1944,8 +1947,8 @@ subroutine compute_J1(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvvo,v_vvoo,J1)
   enddo
   !$omp end parallel
 
-  deallocate(X_ovvo,Z_ovvo,Y_ovov)  
-  
+  deallocate(X_ovvo,Z_ovvo,Y_ovov)
+
 end
 
 ! K1
@@ -1980,7 +1983,7 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
 
   !        do b = 1, nV
   !          K1(u,a,i,beta) = K1(u,a,i,beta) &
-  !          + cc_space_v_vvov(b,a,i,beta) * t1(u,b)    
+  !          + cc_space_v_vvov(b,a,i,beta) * t1(u,b)
   !        enddo
 
   !        do j = 1, nO
@@ -1989,19 +1992,19 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
   !           - cc_space_v_vvoo(b,a,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta))
   !          enddo
   !        enddo
-  !        
+  !
   !      enddo
   !    enddo
   !  enddo
   !enddo
 
   allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO))
-  
+
   !$omp parallel &
   !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) &
   !$omp private(i,beta,a,u,j,b) &
   !$omp default(none)
-  !$omp do 
+  !$omp do
   do beta = 1, nV
     do i = 1, nO
       do a = 1, nV
@@ -2072,5 +2075,5 @@ subroutine compute_K1(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,v_vvov,K1)
   !$omp end parallel
 
   deallocate(X,Y,Z)
-  
+
 end
diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index 0081e9e7..049c57e8 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -7,13 +7,14 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
   double precision, intent(in)  :: t2(nO,nO,nV,nV)
   double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
-  double precision, intent(out) :: energy
+  double precision, intent(inout) :: energy
 
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
   integer                       :: i,j,k,l,a,b,c,d
-  double precision              :: e,ta,tb
+  double precision              :: e,ta,tb,eccsd
 
+  eccsd = energy
   call set_multiple_levels_omp(.False.)
 
   allocate(X_vovv(nV,nO,nV,nV), X_ooov(nO,nO,nO,nV), X_oovv(nO,nO,nV,nV))
@@ -206,6 +207,12 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
 
   Pabc(:) = 1.d0/Pabc(:)
 
+  print '(A)', ''
+  print '(A)', ' +----------------------+--------------+----------+'
+  print '(A)', ' |      E(CCSD(T))      |   Error      |     %    |'
+  print '(A)', ' +----------------------+--------------+----------+'
+
+
   call wall_time(t00)
   imin = 1_8
   !$OMP PARALLEL                                                     &
@@ -309,13 +316,15 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
 
       energy = energy_det + energy_stoch
 
-      print *, real(energy), ' +/- ', real(sqrt(variance/(norm-1.d0))), isample, real(Ncomputed)/real(Nabc)
+      print '('' | '',F20.8, '' | '', E12.4,'' | '', F8.2,'' |'')', eccsd+energy, dsqrt(variance/(norm-1.d0)), 100.*real(Ncomputed)/real(Nabc)
     endif
     !$OMP END MASTER
     if (imin >= Nabc) exit
   enddo
 
   !$OMP END PARALLEL
+  print '(A)', ' +----------------------+--------------+----------+'
+  print '(A)', ''
 
   deallocate(X_vovv,X_ooov,T_voov,T_oovv)
 end

From 46cbd80b9596a6e2c19e2db13ea800376c8cfb55 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 17 May 2023 10:44:32 +0200
Subject: [PATCH 19/79] Accelerated Cholesky

---
 src/ao_two_e_ints/EZFIO.cfg       |  6 +++
 src/ao_two_e_ints/cholesky.irp.f  | 81 ++++++++++++++++---------------
 src/ccsd/ccsd_space_orb_sub.irp.f | 10 ++--
 3 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg
index 4ab080ec..9f523fca 100644
--- a/src/ao_two_e_ints/EZFIO.cfg
+++ b/src/ao_two_e_ints/EZFIO.cfg
@@ -11,6 +11,12 @@ interface: ezfio,provider,ocaml
 default: 1.e-15
 ezfio_name: threshold_ao
 
+[ao_cholesky_threshold]
+type: Threshold
+doc: If | (ii|jj) | < `ao_cholesky_threshold` then (ii|jj) is zero
+interface: ezfio,provider,ocaml
+default: 1.e-12
+
 [do_direct_integrals]
 type: logical
 doc: Compute integrals on the fly (very slow, only for debugging)
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index d4c201aa..3da827e1 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -4,29 +4,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num_guess ]
  ! Number of Cholesky vectors in AO basis
  END_DOC
 
- integer :: i,j,k,l
- double precision :: xnorm0, x, integral
- double precision, external :: ao_two_e_integral
-
- cholesky_ao_num_guess = 0
- xnorm0 = 0.d0
- x = 0.d0
- do j=1,ao_num
-   do i=1,ao_num
-     integral = ao_two_e_integral(i,i,j,j)
-     if (integral > ao_integrals_threshold) then
-       cholesky_ao_num_guess += 1
-     else
-       x += integral
-     endif
-   enddo
- enddo
- print *, 'Cholesky decomposition of AO integrals'
- print *, '--------------------------------------'
- print *, ''
- print *, 'Estimated Error: ', x
- print *, 'Guess size: ', cholesky_ao_num_guess, '(', 100.d0*dble(cholesky_ao_num_guess)/dble(ao_num*ao_num), ' %)'
-
+ cholesky_ao_num_guess = ao_num*ao_num / 2
 END_PROVIDER
 
  BEGIN_PROVIDER [ integer, cholesky_ao_num ]
@@ -39,7 +17,7 @@ END_PROVIDER
  END_DOC
 
  type(c_ptr) :: ptr
- integer :: fd, i,j,k,l, rank
+ integer :: fd, i,j,k,l,m,rank
  double precision, pointer :: ao_integrals(:,:,:,:)
  double precision, external :: ao_two_e_integral
 
@@ -49,24 +27,49 @@ END_PROVIDER
    8, fd, .False., ptr)
  call c_f_pointer(ptr, ao_integrals, (/ao_num, ao_num, ao_num, ao_num/))
 
- double precision :: integral
+ print*, 'Providing the AO integrals (Cholesky)'
+ call wall_time(wall_1)
+ call cpu_time(cpu_1)
+
+ ao_integrals = 0.d0
+
+ double precision :: integral, cpu_1, cpu_2, wall_1, wall_2
  logical, external :: ao_two_e_integral_zero
- !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k,l, integral) SCHEDULE(dynamic)
- do l=1,ao_num
-  do j=1,l
-   do k=1,ao_num
-    do i=1,k
-     if (ao_two_e_integral_zero(i,j,k,l)) cycle
-     integral = ao_two_e_integral(i,k,j,l)
-     ao_integrals(i,k,j,l) = integral
-     ao_integrals(k,i,j,l) = integral
-     ao_integrals(i,k,l,j) = integral
-     ao_integrals(k,i,l,j) = integral
-    enddo
+
+ !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2)
+ do m=0,9
+   do l=1+m,ao_num,10
+     !$OMP DO SCHEDULE(dynamic)
+     do j=1,l
+       do k=1,ao_num
+         do i=1,min(k,j)
+           if (ao_two_e_integral_zero(i,j,k,l)) cycle
+           integral = ao_two_e_integral(i,k,j,l)
+           ao_integrals(i,k,j,l) = integral
+           ao_integrals(k,i,j,l) = integral
+           ao_integrals(i,k,l,j) = integral
+           ao_integrals(k,i,l,j) = integral
+           ao_integrals(j,l,i,k) = integral
+           ao_integrals(j,l,k,i) = integral
+           ao_integrals(l,j,i,k) = integral
+           ao_integrals(l,j,k,i) = integral
+         enddo
+       enddo
+     enddo
+     !$OMP END DO NOWAIT
    enddo
-  enddo
+   !$OMP MASTER
+   call wall_time(wall_2)
+   print '(F10.2,'' %  in'', 4X, I10, '' s.'')', (m+1) * 10, wall_2-wall_1
+   !$OMP END MASTER
  enddo
- !$OMP END PARALLEL DO
+ !$OMP END PARALLEL
+
+ call wall_time(wall_2)
+ call cpu_time(cpu_2)
+ print*, 'AO integrals provided:'
+ print*, ' cpu  time :',cpu_2 - cpu_1, 's'
+ print*, ' wall time :',wall_2 - wall_1, 's  ( x ', (cpu_2-cpu_1)/(wall_2-wall_1+tiny(1.d0)), ' )'
 
  ! Call Lapack
  cholesky_ao_num = cholesky_ao_num_guess
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 287d5b03..2e0ccd8f 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -16,7 +16,7 @@ subroutine run_ccsd_space_orb
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
   integer(bit_kind)             :: det(N_int,2)
-  integer                       :: nO, nV, nOa, nOb, nVa, nVb, n_spin(4)
+  integer                       :: nO, nV, nOa, nVa
 
   PROVIDE mo_two_e_integrals_in_map
 
@@ -24,12 +24,8 @@ subroutine run_ccsd_space_orb
   print*,'Reference determinant:'
   call print_det(det,N_int)
 
-  ! Extract number of occ/vir alpha/beta spin orbitals
-  !call extract_n_spin(det,n_spin)
-  nOa = cc_nOa !n_spin(1)
-  nOb = cc_nOb !n_spin(2)
-  nVa = cc_nVa !n_spin(3)
-  nVb = cc_nVb !n_spin(4)
+  nOa = cc_nOa
+  nVa = cc_nVa
 
   ! Check that the reference is a closed shell determinant
   if (cc_ref_is_open_shell) then

From a8948d091667801acb7800a7510e45b136b59fd3 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 17 May 2023 16:55:29 +0200
Subject: [PATCH 20/79] cholesky in big_array

---
 src/ao_two_e_ints/cholesky.irp.f          |  97 +++++++----
 src/ccsd/ccsd_space_orb_sub.irp.f         |   2 +-
 src/ccsd/ccsd_t_space_orb_stoch.irp.f     |   2 +-
 src/mo_two_e_ints/cholesky.irp.f          |  30 ++++
 src/mo_two_e_ints/integrals_3_index.irp.f |  70 ++++++--
 src/mo_two_e_ints/mo_bi_integrals.irp.f   |  27 ++-
 src/utils_cc/energy.irp.f                 |   5 +-
 src/utils_cc/mo_integrals_cc.irp.f        | 197 +++++++++++-----------
 8 files changed, 281 insertions(+), 149 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 3da827e1..bb81b141 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -35,45 +35,82 @@ END_PROVIDER
 
  double precision :: integral, cpu_1, cpu_2, wall_1, wall_2
  logical, external :: ao_two_e_integral_zero
+  double precision, external :: get_ao_two_e_integral
 
- !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2)
- do m=0,9
-   do l=1+m,ao_num,10
-     !$OMP DO SCHEDULE(dynamic)
-     do j=1,l
-       do k=1,ao_num
-         do i=1,min(k,j)
-           if (ao_two_e_integral_zero(i,j,k,l)) cycle
-           integral = ao_two_e_integral(i,k,j,l)
-           ao_integrals(i,k,j,l) = integral
-           ao_integrals(k,i,j,l) = integral
-           ao_integrals(i,k,l,j) = integral
-           ao_integrals(k,i,l,j) = integral
-           ao_integrals(j,l,i,k) = integral
-           ao_integrals(j,l,k,i) = integral
-           ao_integrals(l,j,i,k) = integral
-           ao_integrals(l,j,k,i) = integral
+ if (read_ao_two_e_integrals) then
+   PROVIDE ao_two_e_integrals_in_map
+
+   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2)
+   do m=0,9
+     do l=1+m,ao_num,10
+       !$OMP DO SCHEDULE(dynamic)
+       do j=1,l
+         do k=1,ao_num
+           do i=1,min(k,j)
+             if (ao_two_e_integral_zero(i,j,k,l)) cycle
+             integral = get_ao_two_e_integral(i,j,k,l, ao_integrals_map)
+             ao_integrals(i,k,j,l) = integral
+             ao_integrals(k,i,j,l) = integral
+             ao_integrals(i,k,l,j) = integral
+             ao_integrals(k,i,l,j) = integral
+             ao_integrals(j,l,i,k) = integral
+             ao_integrals(j,l,k,i) = integral
+             ao_integrals(l,j,i,k) = integral
+             ao_integrals(l,j,k,i) = integral
+           enddo
          enddo
        enddo
+       !$OMP END DO NOWAIT
      enddo
-     !$OMP END DO NOWAIT
+     !$OMP MASTER
+     call wall_time(wall_2)
+     print '(I10,'' %  in'', 4X, F10.2, '' s.'')', (m+1) * 10, wall_2-wall_1
+     !$OMP END MASTER
    enddo
-   !$OMP MASTER
-   call wall_time(wall_2)
-   print '(F10.2,'' %  in'', 4X, I10, '' s.'')', (m+1) * 10, wall_2-wall_1
-   !$OMP END MASTER
- enddo
- !$OMP END PARALLEL
+   !$OMP END PARALLEL
 
- call wall_time(wall_2)
- call cpu_time(cpu_2)
- print*, 'AO integrals provided:'
- print*, ' cpu  time :',cpu_2 - cpu_1, 's'
- print*, ' wall time :',wall_2 - wall_1, 's  ( x ', (cpu_2-cpu_1)/(wall_2-wall_1+tiny(1.d0)), ' )'
+ else
+
+   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral, wall_2)
+   do m=0,9
+     do l=1+m,ao_num,10
+       !$OMP DO SCHEDULE(dynamic)
+       do j=1,l
+         do k=1,ao_num
+           do i=1,min(k,j)
+             if (ao_two_e_integral_zero(i,j,k,l)) cycle
+             integral = ao_two_e_integral(i,k,j,l)
+             ao_integrals(i,k,j,l) = integral
+             ao_integrals(k,i,j,l) = integral
+             ao_integrals(i,k,l,j) = integral
+             ao_integrals(k,i,l,j) = integral
+             ao_integrals(j,l,i,k) = integral
+             ao_integrals(j,l,k,i) = integral
+             ao_integrals(l,j,i,k) = integral
+             ao_integrals(l,j,k,i) = integral
+           enddo
+         enddo
+       enddo
+       !$OMP END DO NOWAIT
+     enddo
+     !$OMP MASTER
+     call wall_time(wall_2)
+     print '(I10,'' %  in'', 4X, F10.2, '' s.'')', (m+1) * 10, wall_2-wall_1
+     !$OMP END MASTER
+   enddo
+   !$OMP END PARALLEL
+
+   call wall_time(wall_2)
+   call cpu_time(cpu_2)
+   print*, 'AO integrals provided:'
+   print*, ' cpu  time :',cpu_2 - cpu_1, 's'
+   print*, ' wall time :',wall_2 - wall_1, 's  ( x ', (cpu_2-cpu_1)/(wall_2-wall_1+tiny(1.d0)), ' )'
+
+ endif
 
  ! Call Lapack
  cholesky_ao_num = cholesky_ao_num_guess
- call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_integrals_threshold, ao_num*ao_num, cholesky_ao)
+ call pivoted_cholesky(ao_integrals, cholesky_ao_num, ao_cholesky_threshold, ao_num*ao_num, cholesky_ao)
  print *, 'Rank: ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)'
 
  ! Remove mmap
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 2e0ccd8f..256117d6 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -18,7 +18,7 @@ subroutine run_ccsd_space_orb
   integer(bit_kind)             :: det(N_int,2)
   integer                       :: nO, nV, nOa, nVa
 
-  PROVIDE mo_two_e_integrals_in_map
+!  PROVIDE mo_two_e_integrals_in_map
 
   det = psi_det(:,:,cc_ref)
   print*,'Reference determinant:'
diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index 049c57e8..1f3bebc2 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -274,7 +274,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
     enddo
 
     call wall_time(t01)
-    if (t01-t00 > 1.0d0) then
+    if ((t01-t00 > 1.0d0).or.(imin >= Nabc)) then
       t00 = t01
 
       !$OMP TASKWAIT
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 14d3c696..b5b39b3b 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -6,11 +6,41 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num
 
  integer :: k
 
+ print *, 'AO->MO Transformation of Cholesky vectors'
  !$OMP PARALLEL DO PRIVATE(k)
  do k=1,cholesky_ao_num
   call ao_to_mo(cholesky_ao(1,1,k),ao_num,cholesky_mo(1,1,k),mo_num)
  enddo
  !$OMP END PARALLEL DO
+ print *, ''
+
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num, mo_num) ]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors in MO basis, transposed: cholesky_mo_transp(k,i,j) = cholesky_mo(i,j,k)
+ END_DOC
+
+ integer :: i,j,k
+ double precision, allocatable :: buffer(:,:)
+
+ print *, 'AO->MO Transformation of Cholesky vectors  .'
+ !$OMP PARALLEL PRIVATE(i,j,k,buffer)
+ allocate(buffer(mo_num,mo_num))
+ !$OMP DO SCHEDULE(static)
+ do k=1,cholesky_ao_num
+  call ao_to_mo(cholesky_ao(1,1,k),ao_num,buffer,mo_num)
+  do j=1,mo_num
+    do i=1,mo_num
+      cholesky_mo_transp(k,i,j) = buffer(i,j)
+    enddo
+  enddo
+ enddo
+ !$OMP END DO
+ deallocate(buffer)
+ !$OMP END PARALLEL
+ print *, ''
 
 END_PROVIDER
 
diff --git a/src/mo_two_e_ints/integrals_3_index.irp.f b/src/mo_two_e_ints/integrals_3_index.irp.f
index 4ffb0134..d807f619 100644
--- a/src/mo_two_e_ints/integrals_3_index.irp.f
+++ b/src/mo_two_e_ints/integrals_3_index.irp.f
@@ -4,24 +4,68 @@
  BEGIN_DOC
  ! big_array_coulomb_integrals(j,i,k)  = <ij|kj> = (ik|jj)
  !
- ! big_array_exchange_integrals(i,j,k) = <ij|jk> = (ij|kj)
+ ! big_array_exchange_integrals(j,i,k) = <ij|jk> = (ij|kj)
  END_DOC
- integer :: i,j,k,l
+ integer :: i,j,k,l,a
  double precision :: get_two_e_integral
  double precision :: integral
 
- do k = 1, mo_num
-  do i = 1, mo_num
-   do j = 1, mo_num
-     l = j
-     integral = get_two_e_integral(i,j,k,l,mo_integrals_map)
-     big_array_coulomb_integrals(j,i,k) = integral
-     l = j
-     integral = get_two_e_integral(i,j,l,k,mo_integrals_map)
-     big_array_exchange_integrals(j,i,k) = integral
+ if (do_ao_cholesky) then
+
+    double precision, allocatable :: buffer_jj(:,:), buffer(:,:,:)
+    allocate(buffer_jj(cholesky_ao_num,mo_num), buffer(mo_num,mo_num,mo_num))
+    do j=1,mo_num
+      buffer_jj(:,j) = cholesky_mo_transp(:,j,j)
+    enddo
+
+    call dgemm('T','N', mo_num*mo_num,mo_num,cholesky_ao_num, 1.d0, &
+        cholesky_mo_transp, cholesky_ao_num, &
+        buffer_jj, cholesky_ao_num, 0.d0, &
+        buffer, mo_num*mo_num)
+
+    do k = 1, mo_num
+      do i = 1, mo_num
+        do j = 1, mo_num
+          big_array_coulomb_integrals(j,i,k) = buffer(i,k,j)
+        enddo
+      enddo
+    enddo
+    deallocate(buffer_jj)
+
+    allocate(buffer_jj(mo_num,mo_num))
+
+    do j = 1, mo_num
+
+      call dgemm('T','N',mo_num,mo_num,cholesky_ao_num, 1.d0, &
+        cholesky_mo_transp(1,1,j), cholesky_ao_num, &
+        cholesky_mo_transp(1,1,j), cholesky_ao_num, 0.d0, &
+        buffer_jj, mo_num)
+
+      do k=1,mo_num
+        do i=1,mo_num
+          big_array_exchange_integrals(j,i,k) = buffer_jj(i,k)
+       enddo
+     enddo
+    enddo
+
+    deallocate(buffer_jj)
+
+ else
+
+   do k = 1, mo_num
+     do i = 1, mo_num
+       do j = 1, mo_num
+         l = j
+         integral = get_two_e_integral(i,j,k,l,mo_integrals_map)
+         big_array_coulomb_integrals(j,i,k) = integral
+         l = j
+         integral = get_two_e_integral(i,j,l,k,mo_integrals_map)
+         big_array_exchange_integrals(j,i,k) = integral
+       enddo
+     enddo
    enddo
-  enddo
- enddo
+
+ endif
 
 END_PROVIDER
 
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index b7ef901d..a461504e 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -1353,15 +1353,30 @@ END_PROVIDER
   integer                        :: i,j
   double precision               :: get_two_e_integral
 
-  PROVIDE mo_two_e_integrals_in_map
-  mo_two_e_integrals_jj = 0.d0
-  mo_two_e_integrals_jj_exchange = 0.d0
+
+  if (do_ao_cholesky) then
+    do j=1,mo_num
+      do i=1,mo_num
+        !TODO: use dgemm
+        mo_two_e_integrals_jj(i,j) = sum(cholesky_mo_transp(:,i,i)*cholesky_mo_transp(:,j,j))
+        mo_two_e_integrals_jj_exchange(i,j) = sum(cholesky_mo_transp(:,i,j)*cholesky_mo_transp(:,j,i))
+      enddo
+    enddo
+
+  else
+
+    do j=1,mo_num
+      do i=1,mo_num
+        mo_two_e_integrals_jj(i,j) = get_two_e_integral(i,j,i,j,mo_integrals_map)
+        mo_two_e_integrals_jj_exchange(i,j) = get_two_e_integral(i,j,j,i,mo_integrals_map)
+      enddo
+    enddo
+
+  endif
 
   do j=1,mo_num
     do i=1,mo_num
-      mo_two_e_integrals_jj(i,j) = get_two_e_integral(i,j,i,j,mo_integrals_map)
-      mo_two_e_integrals_jj_exchange(i,j) = get_two_e_integral(i,j,j,i,mo_integrals_map)
-      mo_two_e_integrals_jj_anti(i,j) = mo_two_e_integrals_jj(i,j) - mo_two_e_integrals_jj_exchange(i,j)
+        mo_two_e_integrals_jj_anti(i,j) = mo_two_e_integrals_jj(i,j) - mo_two_e_integrals_jj_exchange(i,j)
     enddo
   enddo
 
diff --git a/src/utils_cc/energy.irp.f b/src/utils_cc/energy.irp.f
index 33e0cbae..fc1451ba 100644
--- a/src/utils_cc/energy.irp.f
+++ b/src/utils_cc/energy.irp.f
@@ -5,9 +5,8 @@ subroutine det_energy(det,energy)
   integer(bit_kind), intent(in) :: det
 
   double precision, intent(out) :: energy
+  double precision, external :: diag_H_mat_elem
 
-  call i_H_j(det,det,N_int,energy)
+  energy = diag_H_mat_elem(det,N_int) + nuclear_repulsion
 
-  energy = energy + nuclear_repulsion
-  
 end
diff --git a/src/utils_cc/mo_integrals_cc.irp.f b/src/utils_cc/mo_integrals_cc.irp.f
index 9e244d82..485d7002 100644
--- a/src/utils_cc/mo_integrals_cc.irp.f
+++ b/src/utils_cc/mo_integrals_cc.irp.f
@@ -13,7 +13,7 @@ subroutine gen_f_space(det,n1,n2,list1,list2,f)
   integer                       :: i1,i2,idx1,idx2
 
   allocate(tmp_F(mo_num,mo_num))
-  
+
   call get_fock_matrix_spin(det,1,tmp_F)
 
   !$OMP PARALLEL &
@@ -32,7 +32,7 @@ subroutine gen_f_space(det,n1,n2,list1,list2,f)
   !$OMP END PARALLEL
 
   deallocate(tmp_F)
-  
+
 end
 
 ! V
@@ -45,63 +45,66 @@ subroutine gen_v_space(n1,n2,n3,n4,list1,list2,list3,list4,v)
   integer, intent(in)           :: list1(n1),list2(n2),list3(n3),list4(n4)
   double precision, intent(out) :: v(n1,n2,n3,n4)
 
-  integer                       :: i1,i2,i3,i4,idx1,idx2,idx3,idx4
-  double precision              :: get_two_e_integral
-  
-  PROVIDE mo_two_e_integrals_in_map
+  integer                       :: i1,i2,i3,i4,idx1,idx2,idx3,idx4,k
 
+  double precision, allocatable :: buffer(:,:,:)
   !$OMP PARALLEL &
-  !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,mo_integrals_map) &
-  !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4)&
+  !$OMP SHARED(n1,n2,n3,n4,list1,list2,list3,list4,v,mo_num,cholesky_mo_transp,cholesky_ao_num) &
+  !$OMP PRIVATE(i1,i2,i3,i4,idx1,idx2,idx3,idx4,k,buffer)&
   !$OMP DEFAULT(NONE)
-  !$OMP DO collapse(3)
+  allocate(buffer(mo_num,mo_num,mo_num))
+  !$OMP DO
   do i4 = 1, n4
-    do i3 = 1, n3
-      do i2 = 1, n2
+    idx4 = list4(i4)
+    call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_ao_num, 1.d0, &
+       cholesky_mo_transp, cholesky_ao_num, &
+       cholesky_mo_transp(1,1,idx4), cholesky_ao_num, 0.d0, buffer, mo_num*mo_num)
+    do i2 = 1, n2
+      idx2 = list2(i2)
+      do i3 = 1, n3
+        idx3 = list3(i3)
         do i1 = 1, n1
-          idx4 = list4(i4)
-          idx3 = list3(i3)
-          idx2 = list2(i2)
           idx1 = list1(i1)
-          v(i1,i2,i3,i4) = get_two_e_integral(idx1,idx2,idx3,idx4,mo_integrals_map)
+          v(i1,i2,i3,i4) = buffer(idx1,idx3,idx2)
         enddo
       enddo
     enddo
   enddo
   !$OMP END DO
+  deallocate(buffer)
   !$OMP END PARALLEL
-  
+
+
 end
 
 ! full
 
 BEGIN_PROVIDER [double precision, cc_space_v, (mo_num,mo_num,mo_num,mo_num)]
-
   implicit none
-
-  integer          :: i,j,k,l
-  double precision :: get_two_e_integral
-  
-  PROVIDE mo_two_e_integrals_in_map
-
+  integer                       :: i1,i2,i3,i4,k
+  double precision, allocatable :: buffer(:,:,:)
   !$OMP PARALLEL &
-  !$OMP SHARED(cc_space_v,mo_num,mo_integrals_map) &
-  !$OMP PRIVATE(i,j,k,l) &
+  !$OMP SHARED(cc_space_v,mo_num,cholesky_mo_transp,cholesky_ao_num) &
+  !$OMP PRIVATE(i1,i2,i3,i4,k,buffer)&
   !$OMP DEFAULT(NONE)
-  
-  !$OMP DO collapse(3)
-  do l = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do i = 1, mo_num
-          cc_space_v(i,j,k,l) = get_two_e_integral(i,j,k,l,mo_integrals_map)
+  allocate(buffer(mo_num,mo_num,mo_num))
+  !$OMP DO
+  do i4 = 1, mo_num
+    call dgemm('T','N', mo_num*mo_num, mo_num, cholesky_ao_num, 1.d0, &
+         cholesky_mo_transp, cholesky_ao_num, &
+         cholesky_mo_transp(1,1,i4), cholesky_ao_num, 0.d0, buffer, mo_num*mo_num)
+    do i2 = 1, mo_num
+      do i3 = 1, mo_num
+        do i1 = 1, mo_num
+          cc_space_v(i1,i2,i3,i4) = buffer(i1,i3,i2)
         enddo
       enddo
     enddo
   enddo
   !$OMP END DO
+  deallocate(buffer)
   !$OMP END PARALLEL
-       
+
 END_PROVIDER
 
 ! oooo
@@ -280,7 +283,7 @@ BEGIN_PROVIDER [double precision, cc_space_v_ppqq, (cc_n_mo, cc_n_mo)]
   allocate(tmp_v(cc_n_mo,cc_n_mo,cc_n_mo,cc_n_mo))
 
   call gen_v_space(cc_n_mo,cc_n_mo,cc_n_mo,cc_n_mo, cc_list_gen,cc_list_gen,cc_list_gen,cc_list_gen, tmp_v)
-  
+
   do q = 1, cc_n_mo
     do p = 1, cc_n_mo
       cc_space_v_ppqq(p,q) = tmp_v(p,p,q,q)
@@ -382,7 +385,7 @@ BEGIN_PROVIDER [double precision, cc_space_v_aabb, (cc_nVa,cc_nVa)]
   enddo
 
   FREE cc_space_v_vvvv
-  
+
 END_PROVIDER
 
 ! iaia
@@ -467,7 +470,7 @@ BEGIN_PROVIDER [double precision, cc_space_w_oovv, (cc_nOa, cc_nOa, cc_nVa, cc_n
   integer :: i,j,a,b
 
   allocate(tmp_v(cc_nOa,cc_nOa,cc_nVa,cc_nVa))
-  
+
   call gen_v_space(cc_nOa,cc_nOa,cc_nVa,cc_nVa, cc_list_occ,cc_list_occ,cc_list_vir,cc_list_vir, tmp_v)
 
   !$OMP PARALLEL &
@@ -501,7 +504,7 @@ BEGIN_PROVIDER [double precision, cc_space_w_vvoo, (cc_nVa, cc_nVa, cc_nOa, cc_n
   integer :: i,j,a,b
 
   allocate(tmp_v(cc_nVa,cc_nVa,cc_nOa,cc_nOa))
-  
+
   call gen_v_space(cc_nVa,cc_nVa,cc_nOa,cc_nOa, cc_list_vir,cc_list_vir,cc_list_occ,cc_list_occ, tmp_v)
 
   !$OMP PARALLEL &
@@ -613,7 +616,7 @@ subroutine shift_idx_spin(s,n_S,shift)
   else
     shift = n_S(1)
   endif
-  
+
 end
 
 ! F
@@ -626,21 +629,22 @@ subroutine gen_f_spin(det, n1,n2, n1_S,n2_S, list1,list2, dim1,dim2, f)
   ! Compute the Fock matrix corresponding to two lists of spin orbitals.
   ! Ex: occ/occ, occ/vir,...
   END_DOC
-  
+
   integer(bit_kind), intent(in) :: det(N_int,2)
   integer, intent(in)           :: n1,n2, n1_S(2), n2_S(2)
   integer, intent(in)           :: list1(n1,2), list2(n2,2)
   integer, intent(in)           :: dim1, dim2
-  
+
   double precision, intent(out) :: f(dim1, dim2)
 
   double precision, allocatable :: tmp_F(:,:)
   integer                       :: i,j, idx_i,idx_j,i_shift,j_shift
   integer                       :: tmp_i,tmp_j
   integer                       :: si,sj,s
+  PROVIDE big_array_exchange_integrals big_array_coulomb_integrals
 
   allocate(tmp_F(mo_num,mo_num))
-  
+
   do sj = 1, 2
     call shift_idx_spin(sj,n2_S,j_shift)
     do si = 1, 2
@@ -669,9 +673,9 @@ subroutine gen_f_spin(det, n1,n2, n1_S,n2_S, list1,list2, dim1,dim2, f)
 
     enddo
   enddo
-  
+
   deallocate(tmp_F)
-  
+
 end
 
 ! Get F
@@ -683,12 +687,12 @@ subroutine get_fock_matrix_spin(det,s,f)
   BEGIN_DOC
   ! Fock matrix alpha or beta of an arbitrary det
   END_DOC
-  
+
   integer(bit_kind), intent(in) :: det(N_int,2)
   integer, intent(in)           :: s
-  
+
   double precision, intent(out) :: f(mo_num,mo_num)
-  
+
   integer                       :: p,q,i,s1,s2
   integer(bit_kind)             :: res(N_int,2)
   logical                       :: ok
@@ -701,9 +705,11 @@ subroutine get_fock_matrix_spin(det,s,f)
     s1 = 2
     s2 = 1
   endif
-  
+
+  PROVIDE big_array_coulomb_integrals big_array_exchange_integrals
+
   !$OMP PARALLEL &
-  !$OMP SHARED(f,mo_num,s1,s2,N_int,det,mo_one_e_integrals) &
+  !$OMP SHARED(f,mo_num,s1,s2,N_int,det,mo_one_e_integrals,big_array_coulomb_integrals,big_array_exchange_integrals) &
   !$OMP PRIVATE(p,q,ok,i,res)&
   !$OMP DEFAULT(NONE)
   !$OMP DO collapse(1)
@@ -713,20 +719,21 @@ subroutine get_fock_matrix_spin(det,s,f)
       do i = 1, mo_num
         call apply_hole(det, s1, i, res, ok, N_int)
         if (ok) then
-          f(p,q) = f(p,q) + mo_two_e_integral(p,i,q,i) - mo_two_e_integral(p,i,i,q)
+!          f(p,q) = f(p,q) + mo_two_e_integral(p,i,q,i) - mo_two_e_integral(p,i,i,q)
+          f(p,q) = f(p,q) + big_array_coulomb_integrals(i,p,q) - big_array_exchange_integrals(i,p,q)
         endif
       enddo
       do i = 1, mo_num
         call apply_hole(det, s2, i, res, ok, N_int)
         if (ok) then
-          f(p,q) = f(p,q) + mo_two_e_integral(p,i,q,i)
+          f(p,q) = f(p,q) + big_array_coulomb_integrals(i,p,q)
         endif
       enddo
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
-    
+
 end
 
 ! V
@@ -752,14 +759,14 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
   integer                       :: si,sj,sk,sl,s
 
   PROVIDE cc_space_v
-  
+
   !$OMP PARALLEL &
   !$OMP SHARED(cc_space_v,n1_S,n2_S,n3_S,n4_S,list1,list2,list3,list4,v) &
   !$OMP PRIVATE(s,si,sj,sk,sl,i_shift,j_shift,k_shift,l_shift, &
   !$OMP i,j,k,l,idx_i,idx_j,idx_k,idx_l,&
   !$OMP tmp_i,tmp_j,tmp_k,tmp_l)&
   !$OMP DEFAULT(NONE)
-  
+
   do sl = 1, 2
     call shift_idx_spin(sl,n4_S,l_shift)
     do sk = 1, 2
@@ -768,7 +775,7 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
         call shift_idx_spin(sj,n2_S,j_shift)
         do si = 1, 2
           call shift_idx_spin(si,n1_S,i_shift)
-    
+
           s = si+sj+sk+sl
           ! <aa||aa> or <bb||bb>
           if (s == 4 .or. s == 8) then
@@ -776,7 +783,7 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
             do tmp_l = 1, n4_S(sl)
               do tmp_k = 1, n3_S(sk)
                 do tmp_j = 1, n2_S(sj)
-                  do tmp_i = 1, n1_S(si)  
+                  do tmp_i = 1, n1_S(si)
                     l = list4(tmp_l,sl)
                     idx_l = tmp_l + l_shift
                     k = list3(tmp_k,sk)
@@ -792,14 +799,14 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
               enddo
             enddo
             !$OMP END DO
-            
+
           ! <ab||ab> or <ba||ba>
           elseif (si == sk .and. sj == sl) then
             !$OMP DO collapse(3)
             do tmp_l = 1, n4_S(sl)
               do tmp_k = 1, n3_S(sk)
                 do tmp_j = 1, n2_S(sj)
-                  do tmp_i = 1, n1_S(si)  
+                  do tmp_i = 1, n1_S(si)
                     l = list4(tmp_l,sl)
                     idx_l = tmp_l + l_shift
                     k = list3(tmp_k,sk)
@@ -815,14 +822,14 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
               enddo
             enddo
             !$OMP END DO
-            
+
           ! <ab||ba> or <ba||ab>
           elseif (si == sl .and. sj == sk) then
             !$OMP DO collapse(3)
             do tmp_l = 1, n4_S(sl)
               do tmp_k = 1, n3_S(sk)
                 do tmp_j = 1, n2_S(sj)
-                  do tmp_i = 1, n1_S(si)  
+                  do tmp_i = 1, n1_S(si)
                     l = list4(tmp_l,sl)
                     idx_l = tmp_l + l_shift
                     k = list3(tmp_k,sk)
@@ -843,7 +850,7 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
             do tmp_l = 1, n4_S(sl)
               do tmp_k = 1, n3_S(sk)
                 do tmp_j = 1, n2_S(sj)
-                  do tmp_i = 1, n1_S(si)  
+                  do tmp_i = 1, n1_S(si)
                     l = list4(tmp_l,sl)
                     idx_l = tmp_l + l_shift
                     k = list3(tmp_k,sk)
@@ -859,13 +866,13 @@ subroutine gen_v_spin(n1,n2,n3,n4, n1_S,n2_S,n3_S,n4_S, list1,list2,list3,list4,
             enddo
             !$OMP END DO
           endif
-          
+
         enddo
       enddo
     enddo
   enddo
   !$OMP END PARALLEL
-  
+
 end
 
 ! V_3idx
@@ -900,28 +907,28 @@ subroutine gen_v_spin_3idx(n1,n2,n3,n4, idx_l, n1_S,n2_S,n3_S,n4_S, list1,list2,
   call shift_idx_spin(sl,n4_S,l_shift)
   tmp_l = idx_l - l_shift
   l = list4(tmp_l,sl)
-  
+
   !$OMP PARALLEL &
   !$OMP SHARED(l,sl,idx_l,cc_space_v,n1_S,n2_S,n3_S,n4_S,list1,list2,list3,list4,v_l) &
   !$OMP PRIVATE(s,si,sj,sk,i_shift,j_shift,k_shift, &
   !$OMP i,j,k,idx_i,idx_j,idx_k,&
   !$OMP tmp_i,tmp_j,tmp_k)&
   !$OMP DEFAULT(NONE)
-  
+
   do sk = 1, 2
     call shift_idx_spin(sk,n3_S,k_shift)
     do sj = 1, 2
       call shift_idx_spin(sj,n2_S,j_shift)
       do si = 1, 2
         call shift_idx_spin(si,n1_S,i_shift)
-  
+
         s = si+sj+sk+sl
         ! <aa||aa> or <bb||bb>
         if (s == 4 .or. s == 8) then
           !$OMP DO collapse(2)
           do tmp_k = 1, n3_S(sk)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 k = list3(tmp_k,sk)
                 idx_k = tmp_k + k_shift
                 j = list2(tmp_j,sj)
@@ -934,13 +941,13 @@ subroutine gen_v_spin_3idx(n1,n2,n3,n4, idx_l, n1_S,n2_S,n3_S,n4_S, list1,list2,
             enddo
           enddo
           !$OMP END DO
-          
+
         ! <ab||ab> or <ba||ba>
         elseif (si == sk .and. sj == sl) then
           !$OMP DO collapse(2)
           do tmp_k = 1, n3_S(sk)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 k = list3(tmp_k,sk)
                 idx_k = tmp_k + k_shift
                 j = list2(tmp_j,sj)
@@ -953,13 +960,13 @@ subroutine gen_v_spin_3idx(n1,n2,n3,n4, idx_l, n1_S,n2_S,n3_S,n4_S, list1,list2,
             enddo
           enddo
           !$OMP END DO
-          
+
         ! <ab||ba> or <ba||ab>
         elseif (si == sl .and. sj == sk) then
           !$OMP DO collapse(2)
           do tmp_k = 1, n3_S(sk)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 k = list3(tmp_k,sk)
                 idx_k = tmp_k + k_shift
                 j = list2(tmp_j,sj)
@@ -976,7 +983,7 @@ subroutine gen_v_spin_3idx(n1,n2,n3,n4, idx_l, n1_S,n2_S,n3_S,n4_S, list1,list2,
           !$OMP DO collapse(2)
           do tmp_k = 1, n3_S(sk)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 k = list3(tmp_k,sk)
                 idx_k = tmp_k + k_shift
                 j = list2(tmp_j,sj)
@@ -989,12 +996,12 @@ subroutine gen_v_spin_3idx(n1,n2,n3,n4, idx_l, n1_S,n2_S,n3_S,n4_S, list1,list2,
           enddo
           !$OMP END DO
         endif
-        
+
       enddo
     enddo
   enddo
   !$OMP END PARALLEL
-  
+
 end
 
 ! V_3idx_ij_l
@@ -1029,28 +1036,28 @@ subroutine gen_v_spin_3idx_ij_l(n1,n2,n3,n4, idx_k, n1_S,n2_S,n3_S,n4_S, list1,l
   call shift_idx_spin(sk,n3_S,k_shift)
   tmp_k = idx_k - k_shift
   k = list3(tmp_k,sk)
-  
+
   !$OMP PARALLEL &
   !$OMP SHARED(k,sk,idx_k,cc_space_v,n1_S,n2_S,n3_S,n4_S,list1,list2,list3,list4,v_k) &
   !$OMP PRIVATE(s,si,sj,sl,i_shift,j_shift,l_shift, &
   !$OMP i,j,l,idx_i,idx_j,idx_l,&
   !$OMP tmp_i,tmp_j,tmp_l)&
   !$OMP DEFAULT(NONE)
-  
+
   do sl = 1, 2
     call shift_idx_spin(sl,n4_S,l_shift)
     do sj = 1, 2
       call shift_idx_spin(sj,n2_S,j_shift)
       do si = 1, 2
         call shift_idx_spin(si,n1_S,i_shift)
-  
+
         s = si+sj+sk+sl
         ! <aa||aa> or <bb||bb>
         if (s == 4 .or. s == 8) then
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 j = list2(tmp_j,sj)
@@ -1063,13 +1070,13 @@ subroutine gen_v_spin_3idx_ij_l(n1,n2,n3,n4, idx_k, n1_S,n2_S,n3_S,n4_S, list1,l
             enddo
           enddo
           !$OMP END DO
-          
+
         ! <ab||ab> or <ba||ba>
         elseif (si == sk .and. sj == sl) then
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 j = list2(tmp_j,sj)
@@ -1082,13 +1089,13 @@ subroutine gen_v_spin_3idx_ij_l(n1,n2,n3,n4, idx_k, n1_S,n2_S,n3_S,n4_S, list1,l
             enddo
           enddo
           !$OMP END DO
-          
+
         ! <ab||ba> or <ba||ab>
         elseif (si == sl .and. sj == sk) then
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 j = list2(tmp_j,sj)
@@ -1105,7 +1112,7 @@ subroutine gen_v_spin_3idx_ij_l(n1,n2,n3,n4, idx_k, n1_S,n2_S,n3_S,n4_S, list1,l
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_j = 1, n2_S(sj)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 j = list2(tmp_j,sj)
@@ -1118,12 +1125,12 @@ subroutine gen_v_spin_3idx_ij_l(n1,n2,n3,n4, idx_k, n1_S,n2_S,n3_S,n4_S, list1,l
           enddo
           !$OMP END DO
         endif
-        
+
       enddo
     enddo
   enddo
   !$OMP END PARALLEL
-  
+
 end
 
 ! V_3idx_i_kl
@@ -1158,28 +1165,28 @@ subroutine gen_v_spin_3idx_i_kl(n1,n2,n3,n4, idx_j, n1_S,n2_S,n3_S,n4_S, list1,l
   call shift_idx_spin(sj,n2_S,j_shift)
   tmp_j = idx_j - j_shift
   j = list2(tmp_j,sj)
-  
+
   !$OMP PARALLEL &
   !$OMP SHARED(j,sj,idx_j,cc_space_v,n1_S,n2_S,n3_S,n4_S,list1,list2,list3,list4,v_j) &
   !$OMP PRIVATE(s,si,sk,sl,i_shift,l_shift,k_shift, &
   !$OMP i,k,l,idx_i,idx_k,idx_l,&
   !$OMP tmp_i,tmp_k,tmp_l)&
   !$OMP DEFAULT(NONE)
-  
+
   do sl = 1, 2
     call shift_idx_spin(sl,n4_S,l_shift)
     do sk = 1, 2
       call shift_idx_spin(sk,n3_S,k_shift)
       do si = 1, 2
         call shift_idx_spin(si,n1_S,i_shift)
-  
+
         s = si+sj+sk+sl
         ! <aa||aa> or <bb||bb>
         if (s == 4 .or. s == 8) then
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_k = 1, n3_S(sk)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 k = list3(tmp_k,sk)
@@ -1192,13 +1199,13 @@ subroutine gen_v_spin_3idx_i_kl(n1,n2,n3,n4, idx_j, n1_S,n2_S,n3_S,n4_S, list1,l
             enddo
           enddo
           !$OMP END DO
-          
+
         ! <ab||ab> or <ba||ba>
         elseif (si == sk .and. sj == sl) then
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_k = 1, n3_S(sk)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 k = list3(tmp_k,sk)
@@ -1211,13 +1218,13 @@ subroutine gen_v_spin_3idx_i_kl(n1,n2,n3,n4, idx_j, n1_S,n2_S,n3_S,n4_S, list1,l
             enddo
           enddo
           !$OMP END DO
-          
+
         ! <ab||ba> or <ba||ab>
         elseif (si == sl .and. sj == sk) then
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_k = 1, n3_S(sk)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 k = list3(tmp_k,sk)
@@ -1234,7 +1241,7 @@ subroutine gen_v_spin_3idx_i_kl(n1,n2,n3,n4, idx_j, n1_S,n2_S,n3_S,n4_S, list1,l
           !$OMP DO collapse(2)
           do tmp_l = 1, n4_S(sl)
             do tmp_k = 1, n3_S(sk)
-              do tmp_i = 1, n1_S(si)  
+              do tmp_i = 1, n1_S(si)
                 l = list4(tmp_l,sl)
                 idx_l = tmp_l + l_shift
                 k = list3(tmp_k,sk)
@@ -1247,10 +1254,10 @@ subroutine gen_v_spin_3idx_i_kl(n1,n2,n3,n4, idx_j, n1_S,n2_S,n3_S,n4_S, list1,l
           enddo
           !$OMP END DO
         endif
-        
+
       enddo
     enddo
   enddo
   !$OMP END PARALLEL
-  
+
 end

From 5817bbf573c5074ae4c31562cb03c47c69e148f7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 17 May 2023 17:50:35 +0200
Subject: [PATCH 21/79] Reduced memory in CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f | 105 +++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 8 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 256117d6..1467d9a4 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -764,7 +764,7 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
 
   ! internal
   double precision, allocatable :: g_occ(:,:), g_vir(:,:), J1(:,:,:,:), K1(:,:,:,:)
-  double precision, allocatable :: A1(:,:,:,:), B1(:,:,:,:)
+  double precision, allocatable :: A1(:,:,:,:), B1_gam(:,:,:)
   integer                       :: u,v,i,j,beta,gam,a,b
 
   allocate(g_occ(nO,nO), g_vir(nV,nV))
@@ -834,13 +834,18 @@ subroutine compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !  enddo
   !enddo
 
-  allocate(B1(nV,nV,nV,nV))
-  call compute_B1(nO,nV,t1,t2,B1)
-  call dgemm('N','N',nO*nO,nV*nV,nV*nV, &
-             1d0, tau, size(tau,1) * size(tau,2), &
-                  B1 , size(B1,1) * size(B1,2), &
-             1d0, r2, size(r2,1) * size(r2,2))
-  deallocate(B1)
+!  allocate(B1(nV,nV,nV,nV))
+!  call compute_B1(nO,nV,t1,t2,B1)
+  allocate(B1_gam(nV,nV,nV))
+  do gam=1,nV
+    call compute_B1_gam(nO,nV,t1,t2,B1_gam,gam)
+    call dgemm('N','N',nO*nO,nV,nV*nV, &
+                1d0, tau, size(tau,1) * size(tau,2), &
+                     B1_gam        , size(B1_gam,1) * size(B1_gam,2), &
+                1d0, r2(1,1,1,gam), size(r2,1) * size(r2,2))
+  enddo
+  deallocate(B1_gam)
+
 
   !do gam = 1, nV
   !  do beta = 1, nV
@@ -1512,6 +1517,90 @@ end
 
 ! B1
 
+subroutine compute_B1_gam(nO,nV,t1,t2,B1,gam)
+  ! Computes B1(a,b,beta), the slice at fixed last index gam of the 4-index B1 intermediate.
+  implicit none
+  ! The formula is spelled out by the commented reference loops below; the active code uses dgemm.
+  integer, intent(in)           :: nO,nV,gam
+  double precision, intent(in)  :: t1(nO, nV)
+  double precision, intent(in)  :: t2(nO, nO, nV, nV)
+  double precision, intent(out) :: B1(nV, nV, nV)
+  ! NOTE(review): t2 and the locals tmp_a,k,l,c,d,tmp_c,tmp_d,j,u,v are never referenced in this routine.
+  integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta
+  ! Reference (explicit loop) form of the computation, kept commented out:
+!  do beta = 1, nV
+!    do b = 1, nV
+!      do a = 1, nV
+!        B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam)
+!
+!        do i = 1, nO
+!          B1(a,b,beta) = B1(a,b,beta) &
+!          - cc_space_v_vvvo(a,b,beta,i) * t1(i,gam) &
+!          - cc_space_v_vvov(a,b,i,gam) * t1(i,beta)
+!        enddo
+!
+!      enddo
+!    enddo
+!  enddo
+  ! Work arrays: X_vvvo receives cc_space_v_vvov(:,:,:,gam); Y_vvvv receives the second t1 contraction.
+  double precision, allocatable :: X_vvvo(:,:,:), Y_vvvv(:,:,:)
+  allocate(X_vvvo(nV,nV,nO), Y_vvvv(nV,nV,nV))
+  ! B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam)  and  X_vvvo(a,b,i) = cc_space_v_vvov(a,b,i,gam)
+  !$omp parallel &
+  !$omp shared(nO,nV,B1,cc_space_v_vvvv,cc_space_v_vvov,X_vvvo,gam) &
+  !$omp private(a,b,beta) &
+  !$omp default(none)
+  !$omp do
+    do beta = 1, nV
+      do b = 1, nV
+        do a = 1, nV
+          B1(a,b,beta) = cc_space_v_vvvv(a,b,beta,gam)
+        enddo
+      enddo
+    enddo
+  !$omp end do nowait
+  do i = 1, nO
+    !$omp do
+      do b = 1, nV
+        do a = 1, nV
+          X_vvvo(a,b,i) = cc_space_v_vvov(a,b,i,gam)
+        enddo
+      enddo
+    !$omp end do nowait
+  enddo
+  !$omp end parallel
+
+  ! B1(a,b,beta) -= sum_i cc_space_v_vvvo(a,b,beta,i) * t1(i,gam)   (dgemm with a single column, n = 1)
+  call dgemm('N','N', nV*nV*nV, 1, nO, &
+             -1d0, cc_space_v_vvvo, size(cc_space_v_vvvo,1) * size(cc_space_v_vvvo,2) * size(cc_space_v_vvvo,3), &
+                   t1(1,gam), size(t1,1), &
+              1d0, B1      , size(B1,1) * size(B1,2) * size(B1,3))
+
+  ! Y_vvvv(a,b,beta) = - sum_i X_vvvo(a,b,i) * t1(i,beta), i.e. - sum_i cc_space_v_vvov(a,b,i,gam) * t1(i,beta)
+  call dgemm('N','N', nV*nV, nV, nO, &
+             -1d0, X_vvvo, size(X_vvvo,1) * size(X_vvvo,2), &
+                   t1    , size(t1,1), &
+              0d0, Y_vvvv, size(Y_vvvv,1) * size(Y_vvvv,2))
+  ! B1(a,b,beta) += Y_vvvv(a,b,beta)
+  !$omp parallel &
+  !$omp shared(nV,B1,Y_vvvv,gam) &
+  !$omp private(a,b,beta) &
+  !$omp default(none)
+  !$omp do
+  do beta = 1, nV
+    do b = 1, nV
+      do a = 1, nV
+        B1(a,b,beta) = B1(a,b,beta) + Y_vvvv(a,b,beta)
+      enddo
+    enddo
+  enddo
+  !$omp end do
+  !$omp end parallel
+
+  deallocate(X_vvvo,Y_vvvv)
+
+end
+
 subroutine compute_B1(nO,nV,t1,t2,B1)
 
   implicit none

From 1d5ff0df6629c3374829327df63c912dedd72e00 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 22 May 2023 11:52:16 +0200
Subject: [PATCH 22/79] Added the possibility to select the 3-idx, 4-idx and 5-idx three-electron terms

---
 src/tc_bi_ortho/slater_tc_3e.irp.f         | 24 +++++++++++++---------
 src/tc_bi_ortho/slater_tc_opt_diag.irp.f   |  4 ++--
 src/tc_bi_ortho/slater_tc_opt_double.irp.f |  8 ++++----
 src/tc_bi_ortho/slater_tc_opt_single.irp.f |  2 +-
 src/tc_bi_ortho/tc_hmat.irp.f              | 16 ++++-----------
 src/tc_keywords/EZFIO.cfg                  | 18 ++++++++++++++++
 6 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/src/tc_bi_ortho/slater_tc_3e.irp.f b/src/tc_bi_ortho/slater_tc_3e.irp.f
index 7b73d5f2..f95be64b 100644
--- a/src/tc_bi_ortho/slater_tc_3e.irp.f
+++ b/src/tc_bi_ortho/slater_tc_3e.irp.f
@@ -4,17 +4,21 @@ subroutine provide_all_three_ints_bi_ortho
 ! routine that provides all necessary three-electron integrals 
  END_DOC
  if(three_body_h_tc)then
-  PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
-  PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
-  PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
-  PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
+  if(three_e_3_idx_term)then
+   PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
+   PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
+  endif
+  if(three_e_4_idx_term)then
+   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
+   PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
+  endif
+  if(.not.double_normal_ord.and.three_e_5_idx_term)then
+   PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
+   PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
+  elseif (double_normal_ord .and. (.not. three_e_5_idx_term))then
+   PROVIDE normal_two_body_bi_orth
+  endif
  endif
-if(.not.double_normal_ord)then
-  PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
-  PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
-else
- PROVIDE normal_two_body_bi_orth
-endif
 end
 
 subroutine diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
diff --git a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
index 5a3f9935..1745e362 100644
--- a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
@@ -156,7 +156,7 @@ subroutine ac_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
     htwoe = htwoe + mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
   enddo
 
-  if(three_body_h_tc.and.elec_num.gt.2)then
+  if(three_body_h_tc.and.elec_num.gt.2.and.three_e_3_idx_term)then
    !!!!! 3-e part 
    !! same-spin/same-spin
    do j = 1, na
@@ -243,7 +243,7 @@ subroutine a_tc_operator(iorb,ispin,key,hmono,htwoe,hthree,Nint,na,nb)
     htwoe= htwoe- mo_bi_ortho_tc_two_e_jj(occ(i,other_spin),iorb)
   enddo
 
-  if(three_body_h_tc.and.elec_num.gt.2)then
+  if(three_body_h_tc.and.elec_num.gt.2.and.three_e_3_idx_term)then
    !!!!! 3-e part 
    !! same-spin/same-spin
    do j = 1, na
diff --git a/src/tc_bi_ortho/slater_tc_opt_double.irp.f b/src/tc_bi_ortho/slater_tc_opt_double.irp.f
index 1b0e43bb..2d6bfb27 100644
--- a/src/tc_bi_ortho/slater_tc_opt_double.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_double.irp.f
@@ -42,13 +42,13 @@ subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe,
    ! opposite spin two-body 
     htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) 
     if(three_body_h_tc.and.elec_num.gt.2)then
-     if(.not.double_normal_ord)then
+     if(.not.double_normal_ord.and.three_e_5_idx_term)then
       if(degree_i>degree_j)then
        call three_comp_two_e_elem(key_j,h1,h2,p1,p2,s1,s2,hthree)
       else
        call three_comp_two_e_elem(key_i,h1,h2,p1,p2,s1,s2,hthree)
       endif
-     elseif(double_normal_ord.and.elec_num.gt.2)then
+     elseif(double_normal_ord)then
       htwoe += normal_two_body_bi_orth(p2,h2,p1,h1)
      endif
     endif
@@ -59,13 +59,13 @@ subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe,
    ! exchange terms 
    htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) 
    if(three_body_h_tc.and.elec_num.gt.2)then
-    if(.not.double_normal_ord)then
+    if(.not.double_normal_ord.and.three_e_5_idx_term)then
      if(degree_i>degree_j)then
       call three_comp_two_e_elem(key_j,h1,h2,p1,p2,s1,s2,hthree)
      else
       call three_comp_two_e_elem(key_i,h1,h2,p1,p2,s1,s2,hthree)
      endif
-    elseif(double_normal_ord.and.elec_num.gt.2)then
+    elseif(double_normal_ord)then
       htwoe -= normal_two_body_bi_orth(h2,p1,h1,p2)
       htwoe += normal_two_body_bi_orth(h1,p1,h2,p2)
     endif
diff --git a/src/tc_bi_ortho/slater_tc_opt_single.irp.f b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
index 2f9d83bf..7178d6d9 100644
--- a/src/tc_bi_ortho/slater_tc_opt_single.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
@@ -106,7 +106,7 @@ subroutine get_single_excitation_from_fock_tc(key_i,key_j,h,p,spin,phase,hmono,h
   htwoe -= buffer_x(i)
  enddo
  hthree = 0.d0
- if (three_body_h_tc.and.elec_num.gt.2)then
+ if (three_body_h_tc.and.elec_num.gt.2.and.three_e_4_idx_term)then
   call three_comp_fock_elem(key_i,h,p,spin,hthree)
  endif
 
diff --git a/src/tc_bi_ortho/tc_hmat.irp.f b/src/tc_bi_ortho/tc_hmat.irp.f
index 3353d3e7..ec072531 100644
--- a/src/tc_bi_ortho/tc_hmat.irp.f
+++ b/src/tc_bi_ortho/tc_hmat.irp.f
@@ -9,33 +9,25 @@
  
   implicit none
   integer          :: i, j
-  double precision :: hmono,htwoe,hthree,htot
+  double precision :: htot
 
   PROVIDE N_int
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+  call htilde_mu_mat_opt_bi_ortho_tot(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
 
- !$OMP PARALLEL DO SCHEDULE(GUIDED) DEFAULT(NONE) PRIVATE(i,j,hmono, htwoe, hthree, htot) &
+ !$OMP PARALLEL DO SCHEDULE(GUIDED) DEFAULT(NONE) PRIVATE(i,j, htot) &
  !$OMP SHARED (N_det, psi_det, N_int,htilde_matrix_elmt_bi_ortho)
     do i = 1, N_det
       do j = 1, N_det
         ! < J | Htilde | I >
-        call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+        call htilde_mu_mat_opt_bi_ortho_tot(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
 
-        !print *, ' hmono  = ', hmono
-        !print *, ' htwoe  = ', htwoe
-        !print *, ' hthree = ', hthree
         htilde_matrix_elmt_bi_ortho(j,i) = htot
       enddo
     enddo
  !$OMP END PARALLEL DO
-! print*,'htilde_matrix_elmt_bi_ortho = '
-! do i = 1, min(100,N_det)
-!  write(*,'(100(F16.10,X))')htilde_matrix_elmt_bi_ortho(1:min(100,N_det),i)
-! enddo
-
 
 END_PROVIDER 
 
diff --git a/src/tc_keywords/EZFIO.cfg b/src/tc_keywords/EZFIO.cfg
index 3a26a6eb..484bd1f0 100644
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@@ -16,6 +16,24 @@ doc: If |true|, three-body terms are included
 interface: ezfio,provider,ocaml
 default: True
 
+[three_e_3_idx_term]
+type: logical
+doc: If |true|, the diagonal 3-idx terms of the 3-e interaction are included
+interface: ezfio,provider,ocaml
+default: True
+
+[three_e_4_idx_term]
+type: logical
+doc: If |true|, the off-diagonal 4-idx terms of the 3-e interaction are included
+interface: ezfio,provider,ocaml
+default: True
+
+[three_e_5_idx_term]
+type: logical
+doc: If |true|, the off-diagonal 5-idx terms of the 3-e interaction are included
+interface: ezfio,provider,ocaml
+default: True
+
 [pure_three_body_h_tc]
 type: logical
 doc: If |true|, pure triple excitation three-body terms are included

From daf8b1c3dcef5f066d1add8b2cc751f03544ba98 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 22 May 2023 18:17:17 +0200
Subject: [PATCH 23/79] Rename the naively built TC Slater-rule routines with
 a _slow suffix

---
 src/tc_bi_ortho/dressing_vectors_lr.irp.f     |  8 ++--
 src/tc_bi_ortho/e_corr_bi_ortho.irp.f         | 18 +++----
 src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f       | 12 ++---
 src/tc_bi_ortho/print_tc_wf.irp.f             |  6 +--
 src/tc_bi_ortho/pt2_tc_cisd.irp.f             |  8 ++--
 ...er_tc_3e.irp.f => slater_tc_3e_slow.irp.f} | 28 ++---------
 src/tc_bi_ortho/slater_tc_opt.irp.f           | 23 +++++++++
 src/tc_bi_ortho/slater_tc_opt_diag.irp.f      |  4 +-
 .../{slater_tc.irp.f => slater_tc_slow.irp.f} | 47 +++++++------------
 src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f       | 12 ++---
 src/tc_bi_ortho/tc_h_eigvectors.irp.f         |  5 +-
 src/tc_bi_ortho/tc_som.irp.f                  |  4 +-
 src/tc_bi_ortho/tc_utils.irp.f                |  4 +-
 src/tc_bi_ortho/test_normal_order.irp.f       |  6 +--
 src/tc_bi_ortho/test_s2_tc.irp.f              |  2 +-
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f        | 14 +++---
 src/tc_bi_ortho/test_tc_fock.irp.f            |  7 ++-
 17 files changed, 97 insertions(+), 111 deletions(-)
 rename src/tc_bi_ortho/{slater_tc_3e.irp.f => slater_tc_3e_slow.irp.f} (87%)
 rename src/tc_bi_ortho/{slater_tc.irp.f => slater_tc_slow.irp.f} (85%)

diff --git a/src/tc_bi_ortho/dressing_vectors_lr.irp.f b/src/tc_bi_ortho/dressing_vectors_lr.irp.f
index 08913bab..ed663f02 100644
--- a/src/tc_bi_ortho/dressing_vectors_lr.irp.f
+++ b/src/tc_bi_ortho/dressing_vectors_lr.irp.f
@@ -27,7 +27,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
   call hmat_bi_ortho         (psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)
 
   delta = 0.d0
@@ -39,7 +39,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
     do j = 1, ndet
 
       ! < I | Htilde | J >
-      call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
       ! < I | H | J >
       call hmat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)
 
@@ -78,7 +78,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
 
   delta = 0.d0
  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8)   &
@@ -88,7 +88,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
     do j = 1, ndet
 
       ! < I | Htilde | J >
-      call htilde_mu_mat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
 
       delta(i) = delta(i) + psicoef(j) * htc_tot
     enddo
diff --git a/src/tc_bi_ortho/e_corr_bi_ortho.irp.f b/src/tc_bi_ortho/e_corr_bi_ortho.irp.f
index 3a715b44..6d5c3b21 100644
--- a/src/tc_bi_ortho/e_corr_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/e_corr_bi_ortho.irp.f
@@ -2,7 +2,7 @@
  BEGIN_PROVIDER [ double precision, e_tilde_00]
  implicit none
  double precision :: hmono,htwoe,hthree,htot
- call htilde_mu_mat_bi_ortho(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,htot)
+ call htilde_mu_mat_bi_ortho_slow(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,htot)
  e_tilde_00 = htot
  END_PROVIDER 
 
@@ -18,11 +18,11 @@
  do i = 1, N_det
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
   if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
    delta_e = e_tilde_00 - e_i0
    coef_pt1 = htilde_ij / delta_e
-   call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
    e_pt2_tc_bi_orth += coef_pt1 * htilde_ij
    if(degree == 1)then
     e_pt2_tc_bi_orth_single += coef_pt1 * htilde_ij
@@ -37,7 +37,7 @@
  BEGIN_PROVIDER [ double precision, e_tilde_bi_orth_00]
  implicit none
  double precision :: hmono,htwoe,hthree,htilde_ij
- call htilde_mu_mat_bi_ortho(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,e_tilde_bi_orth_00)
+ call htilde_mu_mat_bi_ortho_slow(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,e_tilde_bi_orth_00)
  e_tilde_bi_orth_00 += nuclear_repulsion
  END_PROVIDER 
 
@@ -57,7 +57,7 @@
  e_corr_double_bi_orth = 0.d0
  do i = 1, N_det
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
-  call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+  call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
   if(degree == 1)then
    e_corr_single_bi_orth += reigvec_tc_bi_orth(i,1) * htilde_ij/reigvec_tc_bi_orth(1,1)
    e_corr_single_bi_orth_abs += dabs(reigvec_tc_bi_orth(i,1) * htilde_ij/reigvec_tc_bi_orth(1,1))
@@ -80,7 +80,7 @@
  do i = 1, N_det
   accu += reigvec_tc_bi_orth(i,1) * leigvec_tc_bi_orth(i,1)
   do j = 1, N_det
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j),psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j),psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
    e_tc_left_right += htilde_ij * reigvec_tc_bi_orth(i,1) * leigvec_tc_bi_orth(j,1)
   enddo
  enddo
@@ -99,8 +99,8 @@ BEGIN_PROVIDER [ double precision, coef_pt1_bi_ortho, (N_det)]
   if(degree==0)then
    coef_pt1_bi_ortho(i) = 1.d0
   else
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
    delta_e = e_tilde_00 - e_i0
    coef_pt1 = htilde_ij / delta_e
    coef_pt1_bi_ortho(i)= coef_pt1
diff --git a/src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f b/src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f
index b7129d36..1d1b26cc 100644
--- a/src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f
+++ b/src/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f
@@ -1,4 +1,4 @@
-subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
+subroutine htc_bi_ortho_calc_tdav_slow(v, u, N_st, sze)
 
   use bitmasks
 
@@ -27,7 +27,7 @@ subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
 
   v = 0.d0
  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) &
@@ -36,7 +36,7 @@ subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
   do istate = 1, N_st
     do i = 1, sze
       do j = 1, sze
-        call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
+        call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
         v(i,istate) = v(i,istate) + htot * u(j,istate)
       enddo
     enddo 
@@ -45,7 +45,7 @@ subroutine htc_bi_ortho_calc_tdav(v, u, N_st, sze)
 
 end 
 
-subroutine htcdag_bi_ortho_calc_tdav(v, u, N_st, sze)
+subroutine htcdag_bi_ortho_calc_tdav_slow(v, u, N_st, sze)
 
   use bitmasks
 
@@ -71,7 +71,7 @@ subroutine htcdag_bi_ortho_calc_tdav(v, u, N_st, sze)
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, htot)
 
   v = 0.d0
 
@@ -81,7 +81,7 @@ subroutine htcdag_bi_ortho_calc_tdav(v, u, N_st, sze)
   do istate = 1, N_st
     do i = 1, sze
       do j = 1, sze
-        call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
+        call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, htot)
         v(i,istate) = v(i,istate) + htot * u(j,istate)
       enddo
     enddo
diff --git a/src/tc_bi_ortho/print_tc_wf.irp.f b/src/tc_bi_ortho/print_tc_wf.irp.f
index 0cf3ca87..0c4198a9 100644
--- a/src/tc_bi_ortho/print_tc_wf.irp.f
+++ b/src/tc_bi_ortho/print_tc_wf.irp.f
@@ -49,12 +49,12 @@ subroutine routine
  do i = 1, N_det
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
    if(degree == 1 .or. degree == 2)then
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
     delta_e = e_tilde_00 - e_i0
     coef_pt1 = htilde_ij / delta_e
  
-    call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
     contrib_pt = coef_pt1 * htilde_ij
     e_pt2 += contrib_pt
  
diff --git a/src/tc_bi_ortho/pt2_tc_cisd.irp.f b/src/tc_bi_ortho/pt2_tc_cisd.irp.f
index 50d9dd45..9cb9a600 100644
--- a/src/tc_bi_ortho/pt2_tc_cisd.irp.f
+++ b/src/tc_bi_ortho/pt2_tc_cisd.irp.f
@@ -36,11 +36,11 @@ subroutine routine
  e_corr_abs = 0.d0
  e_corr_pos = 0.d0
  e_corr_neg = 0.d0
- call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,1), psi_det(1,1,1), N_int, e00) 
+ call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,1), psi_det(1,1,1), N_int, e00) 
  do i = 2, N_det
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,1), N_int, hi0) 
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,1), psi_det(1,1,i), N_int, h0i) 
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, ei) 
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,1), N_int, hi0) 
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,1), psi_det(1,1,i), N_int, h0i) 
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, ei) 
   call get_excitation_degree(psi_det(1,1,1), psi_det(1,1,i),degree,N_int)
   call get_excitation(psi_det(1,1,1), psi_det(1,1,i),exc,degree,phase,N_int)
   call decode_exc(exc,degree,h1,p1,h2,p2,s1,s2)
diff --git a/src/tc_bi_ortho/slater_tc_3e.irp.f b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
similarity index 87%
rename from src/tc_bi_ortho/slater_tc_3e.irp.f
rename to src/tc_bi_ortho/slater_tc_3e_slow.irp.f
index f95be64b..6abb6b78 100644
--- a/src/tc_bi_ortho/slater_tc_3e.irp.f
+++ b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
@@ -1,27 +1,5 @@
-subroutine provide_all_three_ints_bi_ortho
- implicit none
- BEGIN_DOC
-! routine that provides all necessary three-electron integrals 
- END_DOC
- if(three_body_h_tc)then
-  if(three_e_3_idx_term)then
-   PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
-   PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
-  endif
-  if(three_e_4_idx_term)then
-   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
-   PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
-  endif
-  if(.not.double_normal_ord.and.three_e_5_idx_term)then
-   PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
-   PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
-  elseif (double_normal_ord .and. (.not. three_e_5_idx_term))then
-   PROVIDE normal_two_body_bi_orth
-  endif
- endif
-end
 
-subroutine diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
+subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
 
   BEGIN_DOC
   !  diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
@@ -112,7 +90,7 @@ subroutine diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
 end
 
 
-subroutine single_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
 
   BEGIN_DOC
   ! <key_j | H_tilde | key_i> for single excitation ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
@@ -207,7 +185,7 @@ end
 
 ! ---
 
-subroutine double_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+subroutine double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
 
   BEGIN_DOC
   ! <key_j | H_tilde | key_i> for double excitation ONLY FOR THREE-BODY TERMS  WITH BI ORTHONORMAL ORBITALS
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index a19d4688..3fd2576a 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -1,3 +1,26 @@
+subroutine provide_all_three_ints_bi_ortho
+ implicit none
+ BEGIN_DOC
+! Provides the 3-e integrals required by the enabled TC terms, and normal_two_body_bi_orth whenever double_normal_ord is set
+ END_DOC
+ if(three_body_h_tc)then
+  if(three_e_3_idx_term)then
+   PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
+   PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
+  endif
+  if(three_e_4_idx_term)then
+   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
+   PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
+  endif
+  if(.not.double_normal_ord.and.three_e_5_idx_term)then
+   PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
+   PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
+  elseif (double_normal_ord)then
+   PROVIDE normal_two_body_bi_orth
+  endif
+ endif
+end
+
 subroutine htilde_mu_mat_opt_bi_ortho_tot(key_j, key_i, Nint, htot)
  implicit none
   BEGIN_DOC
diff --git a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
index 1745e362..531f0141 100644
--- a/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_diag.irp.f
@@ -7,11 +7,11 @@
 ! Various component of the TC energy for the reference "HF" Slater determinant
  END_DOC 
  double precision :: hmono, htwoe, htot, hthree
- call diag_htilde_mu_mat_bi_ortho(N_int,HF_bitmask , hmono, htwoe, htot)
+ call diag_htilde_mu_mat_bi_ortho_slow(N_int,HF_bitmask , hmono, htwoe, htot)
  ref_tc_energy_1e = hmono
  ref_tc_energy_2e = htwoe 
  if(three_body_h_tc)then
-  call diag_htilde_three_body_ints_bi_ort(N_int, HF_bitmask, hthree)
+  call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree)
   ref_tc_energy_3e = hthree
  else
   ref_tc_energy_3e = 0.d0
diff --git a/src/tc_bi_ortho/slater_tc.irp.f b/src/tc_bi_ortho/slater_tc_slow.irp.f
similarity index 85%
rename from src/tc_bi_ortho/slater_tc.irp.f
rename to src/tc_bi_ortho/slater_tc_slow.irp.f
index 2c0ae2ca..1833d20f 100644
--- a/src/tc_bi_ortho/slater_tc.irp.f
+++ b/src/tc_bi_ortho/slater_tc_slow.irp.f
@@ -1,7 +1,7 @@
 
 ! ---
 
-subroutine htilde_mu_mat_bi_ortho_tot(key_j, key_i, Nint, htot)
+subroutine htilde_mu_mat_bi_ortho_tot_slow(key_j, key_i, Nint, htot)
 
   BEGIN_DOC
   ! <key_j | H_tilde | key_i> where |key_j> is developed on the LEFT basis and |key_i> is developed on the RIGHT basis
@@ -24,14 +24,14 @@ subroutine htilde_mu_mat_bi_ortho_tot(key_j, key_i, Nint, htot)
   if(degree.gt.2)then
     htot = 0.d0
   else
-    call htilde_mu_mat_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+    call htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
   endif
 
-end subroutine htilde_mu_mat_bi_ortho_tot
+end subroutine htilde_mu_mat_bi_ortho_tot_slow
 
 ! --
 
-subroutine htilde_mu_mat_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
+subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, htot)
 
   BEGIN_DOC
   !
@@ -61,22 +61,22 @@ subroutine htilde_mu_mat_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, htot
   if(degree.gt.2) return
 
   if(degree == 0)then
-    call diag_htilde_mu_mat_bi_ortho(Nint, key_i, hmono, htwoe, htot)
+    call diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
   else if (degree == 1)then
-    call single_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+    call single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
   else if(degree == 2)then
-    call double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+    call double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
   endif
 
   if(three_body_h_tc) then
     if(degree == 2) then
-      if(.not.double_normal_ord) then
-        call double_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
+      if(.not.double_normal_ord.and.elec_num.gt.2.and.three_e_5_idx_term) then
+        call double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
       endif
-    else if(degree == 1) then
-      call single_htilde_three_body_ints_bi_ort(Nint, key_j, key_i, hthree)
-    else if(degree == 0) then
-      call diag_htilde_three_body_ints_bi_ort(Nint, key_i, hthree)
+    else if(degree == 1.and.elec_num.gt.2.and.three_e_4_idx_term) then
+      call single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
+    else if(degree == 0.and.elec_num.gt.2.and.three_e_3_idx_term) then
+      call diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
     endif
   endif
 
@@ -89,7 +89,7 @@ end
 
 ! ---
 
-subroutine diag_htilde_mu_mat_bi_ortho(Nint, key_i, hmono, htwoe, htot)
+subroutine diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
 
   BEGIN_DOC
   !  diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS 
@@ -188,7 +188,7 @@ end
 
 
 
-subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+subroutine double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
 
   BEGIN_DOC
   ! <key_j | H_tilde | key_i> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS 
@@ -227,18 +227,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
    return
   endif
 
-!  if(core_tc_op)then
-!   print*,'core_tc_op not already taken into account for bi ortho'
-!   print*,'stopping ...'
-!   stop
-!   do i = 1, Nint
-!    key_i_core(i,1) = xor(key_i(i,1),core_bitmask(i,1))
-!    key_i_core(i,2) = xor(key_i(i,2),core_bitmask(i,2))
-!   enddo
-!   call bitstring_to_list_ab(key_i_core, occ, Ne, Nint)
-!  else
    call bitstring_to_list_ab(key_i, occ, Ne, Nint)
-!  endif
   call get_double_excitation(key_i, key_j, exc, phase, Nint)
   call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
 
@@ -246,7 +235,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
    ! opposite spin two-body 
 !   key_j, key_i
     htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) 
-    if(double_normal_ord.and.+Ne(1).gt.2)then
+    if(three_body_h_tc.and.double_normal_ord.and.+Ne(1).gt.2)then
      htwoe += normal_two_body_bi_orth(p2,h2,p1,h1)!!! WTF ???
     endif
   else
@@ -255,7 +244,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)  
    ! exchange terms 
    htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) 
-   if(double_normal_ord.and.+Ne(1).gt.2)then
+   if(three_body_h_tc.and.double_normal_ord.and.+Ne(1).gt.2)then
     htwoe -= normal_two_body_bi_orth(h2,p1,h1,p2)!!! WTF ???
     htwoe += normal_two_body_bi_orth(h1,p1,h2,p2)!!! WTF ???
    endif
@@ -266,7 +255,7 @@ subroutine double_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
 end
 
 
-subroutine single_htilde_mu_mat_bi_ortho(Nint, key_j, key_i, hmono, htwoe, htot)
+subroutine single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
 
   BEGIN_DOC
   ! <key_j | H_tilde | key_i> for single excitation ONLY FOR ONE- AND TWO-BODY TERMS 
diff --git a/src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f b/src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
index 4ae44148..4c3c0788 100644
--- a/src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
+++ b/src/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
@@ -11,10 +11,10 @@
  allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),eigval_tmp(N_states))
  dressing_dets = 0.d0
  do i = 1, N_det
-  call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
   if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
+   call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
   endif
  enddo
  reigvec_tc_bi_orth_tmp = 0.d0
@@ -29,7 +29,7 @@
   vec_tmp(istate,istate) = 1.d0
  enddo
  print*,'Diagonalizing the TC CISD '
- call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav)
+ call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow)
  do i = 1, N_det 
   e_corr_dets(i) = reigvec_tc_bi_orth_tmp(i,1) * h0j(i)/reigvec_tc_bi_orth_tmp(1,1)
  enddo
@@ -41,8 +41,8 @@
  it = 0
  dressing_dets = 0.d0
   double precision, allocatable :: H_jj(:),vec_tmp(:,:),eigval_tmp(:)
-  external                         htc_bi_ortho_calc_tdav
-  external                         htcdag_bi_ortho_calc_tdav
+  external                         htc_bi_ortho_calc_tdav_slow
+  external                         htcdag_bi_ortho_calc_tdav_slow
   logical                       :: converged
  do while (dabs(E_before-E_current).gt.thr)
   it += 1
@@ -66,7 +66,7 @@
   do istate = N_states+1, n_states_diag
    vec_tmp(istate,istate) = 1.d0
   enddo
-  call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav)
+  call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow)
   print*,'outside Davidson'
   print*,'eigval_tmp(1) = ',eigval_tmp(1)
   do i = 1, N_det 
diff --git a/src/tc_bi_ortho/tc_h_eigvectors.irp.f b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
index a83d6cd0..db4c5e28 100644
--- a/src/tc_bi_ortho/tc_h_eigvectors.irp.f
+++ b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
@@ -207,8 +207,6 @@ end
   else ! n_det > N_det_max_full
 
     double precision, allocatable :: H_jj(:),vec_tmp(:,:)
-    external                         htc_bi_ortho_calc_tdav
-    external                         htcdag_bi_ortho_calc_tdav
     external                         H_tc_u_0_opt
     external                         H_tc_dagger_u_0_opt
     external                         H_tc_s2_dagger_u_0_opt
@@ -217,7 +215,7 @@ end
     allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag))
 
     do i = 1, N_det
-      call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+      call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
     enddo
 
     print*,'---------------------------------'
@@ -259,7 +257,6 @@ end
     do istate = N_states+1, n_states_diag
       vec_tmp(istate,istate) = 1.d0
     enddo
-    !call davidson_general_ext_rout_nonsym_b1space(vec_tmp, H_jj, eigval_right_tc_bi_orth, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav)
     !call davidson_general_ext_rout_nonsym_b1space(vec_tmp, H_jj, eigval_right_tc_bi_orth, N_det, n_states, n_states_diag, converged, H_tc_u_0_opt)
     converged = .False.
     i_it = 0
diff --git a/src/tc_bi_ortho/tc_som.irp.f b/src/tc_bi_ortho/tc_som.irp.f
index 291c52ef..a7e4d09e 100644
--- a/src/tc_bi_ortho/tc_som.irp.f
+++ b/src/tc_bi_ortho/tc_som.irp.f
@@ -56,8 +56,8 @@ subroutine main()
   U_SOM = 0.d0 
   do i = 1, N_det
     if(i == i_HF) cycle
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
-    call htilde_mu_mat_bi_ortho(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
     U_SOM += htot_1 * htot_2
   enddo
   U_SOM = 0.5d0 * U_SOM
diff --git a/src/tc_bi_ortho/tc_utils.irp.f b/src/tc_bi_ortho/tc_utils.irp.f
index f8f648e8..24bb7017 100644
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@@ -12,7 +12,7 @@ subroutine write_tc_energy()
     do i = 1, N_det
       do j = 1, N_det
         !htot = htilde_matrix_elmt_bi_ortho(i,j)
-        call htilde_mu_mat_bi_ortho(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+        call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
         E_TC = E_TC + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htot
         !E_TC = E_TC + leigvec_tc_bi_orth(i,k) * reigvec_tc_bi_orth(j,k) * htot
       enddo
@@ -45,7 +45,7 @@ subroutine write_tc_var()
 
     SIGMA_TC = 0.d0
     do j = 2, N_det
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
       SIGMA_TC = SIGMA_TC + htot * htot
     enddo
 
diff --git a/src/tc_bi_ortho/test_normal_order.irp.f b/src/tc_bi_ortho/test_normal_order.irp.f
index 118e481a..cb0c355c 100644
--- a/src/tc_bi_ortho/test_normal_order.irp.f
+++ b/src/tc_bi_ortho/test_normal_order.irp.f
@@ -35,7 +35,7 @@ subroutine test
      det_i = ref_bitmask
      call do_single_excitation(det_i,h1,p1,s1,i_ok)
      call do_single_excitation(det_i,h2,p2,s2,i_ok)
-     call htilde_mu_mat_bi_ortho(det_i,HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+     call htilde_mu_mat_bi_ortho_slow(det_i,HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
      call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
      call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
      hthree *= phase
@@ -67,7 +67,7 @@ do h1 = 1, elec_alpha_num
     if(i_ok.ne.1)cycle
     call do_single_excitation(det_i,h2,p2,s2,i_ok)
     if(i_ok.ne.1)cycle
-    call htilde_mu_mat_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
     integer :: hh1, pp1, hh2, pp2, ss1, ss2
@@ -103,7 +103,7 @@ do h1 = 1, elec_beta_num
     if(i_ok.ne.1)cycle
     call do_single_excitation(det_i,h2,p2,s2,i_ok)
     if(i_ok.ne.1)cycle
-    call htilde_mu_mat_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
     call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2)
diff --git a/src/tc_bi_ortho/test_s2_tc.irp.f b/src/tc_bi_ortho/test_s2_tc.irp.f
index 4debe2e2..1f7bdfda 100644
--- a/src/tc_bi_ortho/test_s2_tc.irp.f
+++ b/src/tc_bi_ortho/test_s2_tc.irp.f
@@ -91,7 +91,7 @@ subroutine routine_test_s2_davidson
  external H_tc_s2_u_0_opt
  allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),energies(n_states_diag), s2(n_states_diag))
  do i = 1, N_det
-   call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+   call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
  enddo
  ! Preparing the left-eigenvector
  print*,'Computing the left-eigenvector '
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index 6721c285..df86ea65 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -31,7 +31,7 @@ subroutine test_h_u0
   u_0(i) = psi_r_coef_bi_ortho(i,1)
  enddo
  call H_tc_u_0_nstates_openmp(v_0_new,u_0,N_states,N_det, do_right)
- call htc_bi_ortho_calc_tdav (v_0_ref,u_0,N_states,N_det)
+ call htc_bi_ortho_calc_tdav_slow (v_0_ref,u_0,N_states,N_det)
  print*,'difference right '
  accu = 0.d0
  do i = 1, N_det
@@ -42,7 +42,7 @@ subroutine test_h_u0
  do_right = .False.
  v_0_new = 0.d0
  call H_tc_u_0_nstates_openmp(v_0_new,u_0,N_states,N_det, do_right)
- call htcdag_bi_ortho_calc_tdav(v_0_ref_dagger,u_0,N_states,N_det, do_right)
+ call htcdag_bi_ortho_calc_tdav_slow(v_0_ref_dagger,u_0,N_states,N_det, do_right)
  print*,'difference left'
  accu = 0.d0
  do i = 1, N_det
@@ -63,7 +63,7 @@ subroutine test_slater_tc_opt
  i_count = 0.d0
  do i = 1, N_det
   do j = 1,N_det
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hnewmono, hnewtwoe, hnewthree, hnewtot)
    if(dabs(htot).gt.1.d-15)then
      i_count += 1.D0
@@ -99,7 +99,7 @@ subroutine timing_tot
   do j = 1, N_det
 !   call get_excitation_degree(psi_det(1,1,j), psi_det(1,1,i),degree,N_int)
    i_count += 1.d0
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   enddo
  enddo
  call wall_time(wall1)
@@ -146,7 +146,7 @@ subroutine timing_diag
  do i = 1, N_det
   do j = i,i 
    i_count += 1.d0
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   enddo
  enddo
  call wall_time(wall1)
@@ -183,7 +183,7 @@ subroutine timing_single
    if(degree.ne.1)cycle
    i_count += 1.d0
    call wall_time(wall0)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
    call wall_time(wall1)
    accu += wall1 - wall0
   enddo
@@ -225,7 +225,7 @@ subroutine timing_double
    if(degree.ne.2)cycle
    i_count += 1.d0
    call wall_time(wall0)
-   call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
    call wall_time(wall1)
    accu += wall1 - wall0
   enddo
diff --git a/src/tc_bi_ortho/test_tc_fock.irp.f b/src/tc_bi_ortho/test_tc_fock.irp.f
index ebd43a7a..b7de067f 100644
--- a/src/tc_bi_ortho/test_tc_fock.irp.f
+++ b/src/tc_bi_ortho/test_tc_fock.irp.f
@@ -25,8 +25,7 @@ subroutine test_3e
  implicit none
  double precision :: integral_aaa,integral_aab,integral_abb,integral_bbb,accu
  double precision ::  hmono, htwoe, hthree, htot
- call htilde_mu_mat_bi_ortho(ref_bitmask, ref_bitmask, N_int, hmono, htwoe, hthree, htot)
-! call diag_htilde_three_body_ints_bi_ort(N_int, ref_bitmask, hthree)
+ call htilde_mu_mat_bi_ortho_slow(ref_bitmask, ref_bitmask, N_int, hmono, htwoe, hthree, htot)
  print*,'hmono = ',hmono
  print*,'htwoe = ',htwoe
  print*,'hthree= ',hthree
@@ -88,7 +87,7 @@ subroutine routine_3()
        print*, ' excited det'
        call debug_det(det_i, N_int)
  
-       call htilde_mu_mat_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
+       call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
        if(dabs(hthree).lt.1.d-10)cycle
        ref = hthree 
        if(s1 == 1)then
@@ -156,7 +155,7 @@ subroutine routine_tot()
         stop
        endif
  
-       call htilde_mu_mat_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
+       call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
        print*,htilde_ij
        if(dabs(htilde_ij).lt.1.d-10)cycle
        print*, ' excited det'

From fd051ae020927be5e495dc3da3fa661ba55cd6ee Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 22 May 2023 18:39:48 +0200
Subject: [PATCH 24/79] some cleaning in slow and no slow tc routines

---
 src/cipsi_tc_bi_ortho/selection.irp.f | 5 ++---
 src/tc_bi_ortho/tc_h_eigvectors.irp.f | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/cipsi_tc_bi_ortho/selection.irp.f b/src/cipsi_tc_bi_ortho/selection.irp.f
index 4c271a4b..77377554 100644
--- a/src/cipsi_tc_bi_ortho/selection.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection.irp.f
@@ -868,7 +868,6 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
         ! <det|H(j)|psi_0> and transpose
         ! -------------------------------------------
 
-!        call htilde_mu_mat_bi_ortho_tot(det, det, N_int, Hii)
         double precision               :: hmono, htwoe, hthree
         call diag_htilde_mu_mat_fock_bi_ortho(N_int, det, hmono, htwoe, hthree, hii)
         do istate = 1,N_states
@@ -878,8 +877,8 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
             psi_h_alpha = 0.d0
             alpha_h_psi = 0.d0
             do iii = 1, N_det_selectors
-              call htilde_mu_mat_bi_ortho_tot(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_bi_ortho_tot(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
+              call htilde_mu_mat_bi_ortho_tot_slow(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
+              call htilde_mu_mat_bi_ortho_tot_slow(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
               call get_excitation_degree(psi_selectors(1,1,iii), det,degree,N_int)
               if(degree == 0)then
                print*,'problem !!!'
diff --git a/src/tc_bi_ortho/tc_h_eigvectors.irp.f b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
index db4c5e28..fa946d6a 100644
--- a/src/tc_bi_ortho/tc_h_eigvectors.irp.f
+++ b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
@@ -215,7 +215,7 @@ end
     allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag))
 
     do i = 1, N_det
-      call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
+      call htilde_mu_mat_opt_bi_ortho_tot(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
     enddo
 
     print*,'---------------------------------'

From 4d9cdf9df1d8e0d61006c1d348f28e96c0946464 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 24 May 2023 11:06:00 +0200
Subject: [PATCH 25/79] added new mu(r) jastrow

---
 src/non_h_ints_mu/jast_deriv.irp.f | 167 ++++++++++++++++++++++++++++-
 src/tc_keywords/EZFIO.cfg          |   6 ++
 2 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/src/non_h_ints_mu/jast_deriv.irp.f b/src/non_h_ints_mu/jast_deriv.irp.f
index cbd0b406..5e99600e 100644
--- a/src/non_h_ints_mu/jast_deriv.irp.f
+++ b/src/non_h_ints_mu/jast_deriv.irp.f
@@ -187,6 +187,19 @@ end function j12_mu
 
 subroutine grad1_j12_mu(r1, r2, grad)
 
+  BEGIN_DOC
+!  gradient of j(mu(r1,r2),r12) form of jastrow. 
+!
+! if mu(r1,r2) = cst ---> j1b_type < 200 and 
+!
+!  d/dx1 j(mu,r12) = 0.5 * (1 - erf(mu *r12))/r12 * (x1 - x2)
+!
+! if mu(r1,r2) /= cst ---> 200 < j1b_type < 300 and 
+!
+! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 )   d/dx1 mu(r1,r2) 
+!
+!                          + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2)
+  END_DOC
   include 'constants.include.F'
 
   implicit none
@@ -515,6 +528,9 @@ subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der)
   double precision              :: r(3)
   double precision              :: dm_a(1), dm_b(1), grad_dm_a(3,1), grad_dm_b(3,1)
   double precision              :: dm_tot, tmp1, tmp2, tmp3
+  double precision :: rho1, grad_rho1(3),rho2,rho_tot,inv_rho_tot
+  double precision :: f_rho1, f_rho2, d_drho_f_rho1
+  double precision :: d_dx1_f_rho1(3),d_dx_rho_f_rho(3),nume
 
   if(j1b_type .eq. 200) then
 
@@ -578,8 +594,84 @@ subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der)
     mu_der(2) = tmp3 * (grad_dm_a(2,1) + grad_dm_b(2,1))
     mu_der(3) = tmp3 * (grad_dm_a(3,1) + grad_dm_b(3,1))
 
-  else
+  elseif(j1b_type .eq. 202) then
 
+    ! mu(r1,r2) = {rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]} / RHO
+    !
+    ! RHO = rho(r1) + rho(r2)
+    !
+    ! f[rho] = alpha rho^beta + mu0 exp(-rho)
+    !
+    ! d/dx1 mu(r1,r2) = 1/RHO^2 * {RHO * d/dx1 (rho(r1) f[rho(r1)]) 
+    !                              - d/dx1 rho(r1) * [rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]] }
+    !
+    ! d/dx1 f[rho(r1)] = [alpha beta rho(r1)^(beta-1) - mu0 exp(-rho(r1))] (d rho(r1) / dx1)
+    !
+    ! d/dx1 (rho(r1) f[rho(r1)]) = rho(r1) * d/dx1 f[rho(r1)] + f[rho(r1)] * d/dx1 rho(r1)
+     
+    !!!!!!!!! rho1,rho2,rho1+rho2
+    call get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
+    rho_tot = rho1 + rho2
+    if(rho_tot.lt.1.d-10)rho_tot = 1.d-10
+    inv_rho_tot = 1.d0/rho_tot
+    ! f(rho) = mu_r_ct * rho**beta_rho_power + mu_erf * exp(-rho)
+    call get_all_f_rho(rho1,rho2,mu_r_ct,mu_erf,beta_rho_power,f_rho1,d_drho_f_rho1,f_rho2)
+    d_dx1_f_rho1(1:3)   = d_drho_f_rho1 * grad_rho1(1:3)
+    d_dx_rho_f_rho(1:3) = rho1 * d_dx1_f_rho1(1:3) + f_rho1 * grad_rho1(1:3)
+    nume   = rho1 * f_rho1 + rho2 * f_rho2
+    mu_val = nume * inv_rho_tot
+    mu_der(1:3) = inv_rho_tot*inv_rho_tot * (rho_tot * d_dx_rho_f_rho(1:3) - grad_rho1(1:3) * nume)
+  elseif(j1b_type .eq. 203) then
+
+    ! mu(r1,r2) = {rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]} / RHO
+    !
+    ! RHO = rho(r1) + rho(r2)
+    !
+    ! f[rho] = alpha rho^beta + mu0 
+    !
+    ! d/dx1 mu(r1,r2) = 1/RHO^2 * {RHO * d/dx1 (rho(r1) f[rho(r1)]) 
+    !                              - d/dx1 rho(r1) * [rho(r1) f[rho(r1)] + rho(r2) f[rho(r2)]] }
+    !
+    ! d/dx1 f[rho(r1)] = [alpha beta rho(r1)^(beta-1)] (d rho(r1) / dx1)
+    !
+    ! d/dx1 (rho(r1) f[rho(r1)]) = rho(r1) * d/dx1 f[rho(r1)] + f[rho(r1)] * d/dx1 rho(r1)
+     
+    !!!!!!!!! rho1,rho2,rho1+rho2
+    call get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
+    rho_tot = rho1 + rho2
+    if(rho_tot.lt.1.d-10)rho_tot = 1.d-10
+    inv_rho_tot = 1.d0/rho_tot
+    ! f(rho) = mu_r_ct * rho**beta_rho_power + mu_erf 
+    call get_all_f_rho_simple(rho1,rho2,mu_r_ct,mu_erf,beta_rho_power,f_rho1,d_drho_f_rho1,f_rho2)
+    d_dx1_f_rho1(1:3)   = d_drho_f_rho1 * grad_rho1(1:3)
+    d_dx_rho_f_rho(1:3) = rho1 * d_dx1_f_rho1(1:3) + f_rho1 * grad_rho1(1:3)
+    nume   = rho1 * f_rho1 + rho2 * f_rho2
+    mu_val = nume * inv_rho_tot
+    mu_der(1:3) = inv_rho_tot*inv_rho_tot * (rho_tot * d_dx_rho_f_rho(1:3) - grad_rho1(1:3) * nume)
+  elseif(j1b_type .eq. 204) then
+
+    ! mu(r1,r2) = 1/2 * {f[rho(r1)] + f[rho(r2)]}
+    !
+    ! f[rho] = alpha rho^beta + mu0
+    !
+    ! d/dx1 mu(r1,r2) = 1/2 * d/dx1 f[rho(r1)]   (follows from the definition of mu above)
+    ! NOTE(review): the code below returns d/dx1 (rho(r1) f[rho(r1)]) as mu_der instead -- confirm which is intended
+    ! d/dx1 f[rho(r1)] = [alpha beta rho(r1)^(beta-1)] (d rho(r1) / dx1)
+    !
+    ! d/dx1 (rho(r1) f[rho(r1)]) = rho(r1) * d/dx1 f[rho(r1)] + f[rho(r1)] * d/dx1 rho(r1)
+     
+    !!!!!!!!! rho1,rho2,rho1+rho2
+    call get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
+    rho_tot = rho1 + rho2
+    if(rho_tot.lt.1.d-10)rho_tot = 1.d-10
+    inv_rho_tot = 1.d0/rho_tot
+    ! f(rho) = mu_r_ct * rho**beta_rho_power + mu_erf 
+    call get_all_f_rho_simple(rho1,rho2,mu_r_ct,mu_erf,beta_rho_power,f_rho1,d_drho_f_rho1,f_rho2)
+    d_dx1_f_rho1(1:3)   = d_drho_f_rho1 * grad_rho1(1:3)
+    d_dx_rho_f_rho(1:3) = rho1 * d_dx1_f_rho1(1:3) + f_rho1 * grad_rho1(1:3)
+    mu_val = 0.5d0 * ( f_rho1 + f_rho2)
+    mu_der(1:3) = d_dx_rho_f_rho(1:3) 
+  else
     print *, ' j1b_type = ', j1b_type, 'not implemented yet'
     stop
 
@@ -684,3 +776,76 @@ end function j12_mu_square
 
 ! ---
 
+subroutine f_mu_and_deriv_mu(rho,alpha,mu0,beta,f_mu,d_drho_f_mu)
+ implicit none
+ BEGIN_DOC
+! evaluates f_mu(rho) = alpha * rho**beta + mu0 * exp(-rho)
+!
+! and d_drho_f_mu = d f_mu / d rho = alpha * beta * rho**(beta-1) - mu0 * exp(-rho)
+!
+! NOTE(review): rho**(beta-1) diverges at rho = 0 when beta < 1 and nothing guards it here -- confirm callers never pass rho = 0
+ END_DOC
+ double precision, intent(in)  :: rho,alpha,mu0,beta
+ double precision, intent(out) :: f_mu,d_drho_f_mu
+ f_mu = alpha * (rho)**beta + mu0 * dexp(-rho)
+ d_drho_f_mu = alpha * beta * rho**(beta-1.d0) - mu0 * dexp(-rho)
+
+end
+
+
+subroutine get_all_rho_grad_rho(r1,r2,rho1,rho2,grad_rho1)
+ implicit none
+ BEGIN_DOC
+! returns rho1, rho2 = dm_a(1) + dm_b(1) at r1, r2 (presumably the total alpha+beta density) and grad_rho1, the gradient of that sum at r1
+ END_DOC
+ double precision, intent(in) :: r1(3),r2(3)
+ double precision, intent(out):: grad_rho1(3),rho1,rho2
+ double precision              :: dm_a(1), dm_b(1), grad_dm_a(3,1), grad_dm_b(3,1)
+ call density_and_grad_alpha_beta(r1, dm_a, dm_b, grad_dm_a, grad_dm_b)
+ rho1 = dm_a(1) + dm_b(1)
+ grad_rho1(1:3) = grad_dm_a(1:3,1) + grad_dm_b(1:3,1)
+ call density_and_grad_alpha_beta(r2, dm_a, dm_b, grad_dm_a, grad_dm_b) ! the gradients at r2 are computed but not returned
+ rho2 = dm_a(1) + dm_b(1)
+end
+
+subroutine get_all_f_rho(rho1,rho2,alpha,mu0,beta,f_rho1,d_drho_f_rho1,f_rho2)
+ implicit none
+ BEGIN_DOC
+! returns f_rho1 = f(rho1), f_rho2 = f(rho2) and d_drho_f_rho1 = df/drho at rho1, with f evaluated by f_mu_and_deriv_mu
+ END_DOC
+ double precision, intent(in) :: rho1,rho2,alpha,mu0,beta
+ double precision, intent(out):: f_rho1,d_drho_f_rho1,f_rho2
+ double precision :: tmp ! receives df/drho at rho2, which is discarded
+ call f_mu_and_deriv_mu(rho1,alpha,mu0,beta,f_rho1,d_drho_f_rho1)
+ call f_mu_and_deriv_mu(rho2,alpha,mu0,beta,f_rho2,tmp)
+end
+
+
+subroutine get_all_f_rho_simple(rho1,rho2,alpha,mu0,beta,f_rho1,d_drho_f_rho1,f_rho2)
+ implicit none
+ BEGIN_DOC
+! returns f_rho1 = f(rho1), f_rho2 = f(rho2) and d_drho_f_rho1 = df/drho at rho1, with f evaluated by f_mu_and_deriv_mu_simple
+ END_DOC
+ double precision, intent(in) :: rho1,rho2,alpha,mu0,beta
+ double precision, intent(out):: f_rho1,d_drho_f_rho1,f_rho2
+ double precision :: tmp ! receives df/drho at rho2, which is discarded
+ call f_mu_and_deriv_mu_simple(rho1,alpha,mu0,beta,f_rho1,d_drho_f_rho1)
+ call f_mu_and_deriv_mu_simple(rho2,alpha,mu0,beta,f_rho2,tmp)
+end
+
+subroutine f_mu_and_deriv_mu_simple(rho,alpha,mu0,beta,f_mu,d_drho_f_mu)
+ implicit none
+ BEGIN_DOC
+! evaluates f_mu(rho) = alpha * rho**beta + mu0   (constant shift mu0, no exponential damping)
+!
+! and d_drho_f_mu = d f_mu / d rho = alpha * beta * rho**(beta-1)
+!
+! NOTE(review): rho**(beta-1) diverges at rho = 0 when beta < 1 and nothing guards it here -- confirm callers never pass rho = 0
+ END_DOC
+ double precision, intent(in)  :: rho,alpha,mu0,beta
+ double precision, intent(out) :: f_mu,d_drho_f_mu
+ f_mu = alpha * (rho)**beta + mu0 
+ d_drho_f_mu = alpha * beta * rho**(beta-1.d0) 
+
+end
+
diff --git a/src/tc_keywords/EZFIO.cfg b/src/tc_keywords/EZFIO.cfg
index 484bd1f0..62adb068 100644
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@@ -148,6 +148,12 @@ doc: a parameter used to define mu(r)
 interface: ezfio, provider, ocaml
 default: 6.203504908994001e-1
 
+[beta_rho_power]
+type: double precision
+doc: exponent beta of the density in f[rho] = mu_r_ct * rho^beta + ..., used to define mu(r)
+interface: ezfio, provider, ocaml
+default: 0.5
+
 [thr_degen_tc]
 type: Threshold
 doc: Threshold to determine if two orbitals are degenerate in TCSCF in order to avoid random quasi orthogonality between the right- and left-eigenvector for the same eigenvalue

From 7e5f1ffc0c8fb9edc23e33f5be163b3d93ff124f Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 25 May 2023 12:57:58 +0200
Subject: [PATCH 26/79] added plot_mu_of_r.irp.f

---
 src/non_h_ints_mu/plot_mu_of_r.irp.f | 33 ++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 src/non_h_ints_mu/plot_mu_of_r.irp.f

diff --git a/src/non_h_ints_mu/plot_mu_of_r.irp.f b/src/non_h_ints_mu/plot_mu_of_r.irp.f
new file mode 100644
index 00000000..1100cd7c
--- /dev/null
+++ b/src/non_h_ints_mu/plot_mu_of_r.irp.f
@@ -0,0 +1,33 @@
+program plot_mu_of_r
+ implicit none
+ read_wf = .False. ! presumably disables reading a stored wave function (TODO confirm); the touch below tells IRPF90 the value changed
+ touch read_wf 
+ call routine_print
+
+end
+
+
+subroutine routine_print
+ implicit none ! tabulates mu, dm_a+dm_b and the norm of grad mu along the x axis into the file ezfio_filename//.mu_of_r
+ character*(128) :: output ! output file name, limited to 128 characters
+ integer :: i_unit_output,getUnitAndOpen
+ output=trim(ezfio_filename)//'.mu_of_r'
+ i_unit_output = getUnitAndOpen(output,'w')
+ integer :: ipoint,nx
+ double precision :: xmax,xmin,r(3),dx
+ double precision :: mu_val, mu_der(3),dm_a,dm_b,grad
+ xmax =  5.D0
+ xmin = -5.D0
+ nx = 10000
+ dx = (xmax - xmin)/dble(nx) ! grid step: the nx points run from xmin to xmax - dx, so xmax itself is not sampled
+ r = 0.d0 ! y and z stay at zero for the whole scan
+ r(1) = xmin
+ do ipoint = 1, nx
+  call mu_r_val_and_grad(r, r, mu_val, mu_der) ! both position arguments are the same point r
+  call dm_dft_alpha_beta_at_r(r,dm_a,dm_b) ! NOTE(review): dm_a and dm_b are scalars here -- confirm the callee does not expect arrays
+  grad = mu_der(1)**2 + mu_der(2)**2 + mu_der(3)**2 
+  grad = dsqrt(grad) ! Euclidean norm of mu_der
+  write(i_unit_output,'(100(F16.7,X))')r(1),mu_val,dm_a+dm_b,grad
+  r(1) += dx
+ enddo ! NOTE(review): i_unit_output is not closed explicitly before returning
+end

From f2ca86ef604b906c3c10032691ac5e0e0ff83b53 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 26 May 2023 11:48:08 +0200
Subject: [PATCH 27/79] Improved cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 25 +++++++++++++++++++++++--
 src/cipsi/selection.irp.f        |  3 +++
 src/mo_two_e_ints/cholesky.irp.f |  1 +
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index d4c201aa..5aab12d9 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -51,8 +51,9 @@ END_PROVIDER
 
  double precision :: integral
  logical, external :: ao_two_e_integral_zero
- !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k,l, integral) SCHEDULE(dynamic)
+ !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral)
  do l=1,ao_num
+  !$OMP DO SCHEDULE(dynamic)
   do j=1,l
    do k=1,ao_num
     do i=1,k
@@ -65,8 +66,28 @@ END_PROVIDER
     enddo
    enddo
   enddo
+  !$OMP END DO NOWAIT
  enddo
- !$OMP END PARALLEL DO
+ !$OMP END PARALLEL
+
+ !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral)
+ do l=1,ao_num
+  !$OMP DO SCHEDULE(dynamic)
+  do j=1,l
+   do k=1,ao_num
+    do i=1,k
+     if (ao_two_e_integral_zero(i,j,k,l)) cycle
+     integral = ao_two_e_integral(i,k,j,l)
+     ao_integrals(i,k,j,l) = integral
+     ao_integrals(k,i,j,l) = integral
+     ao_integrals(i,k,l,j) = integral
+     ao_integrals(k,i,l,j) = integral
+    enddo
+   enddo
+  enddo
+  !$OMP END DO NOWAIT
+ enddo
+ !$OMP END PARALLEL
 
  ! Call Lapack
  cholesky_ao_num = cholesky_ao_num_guess
diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 6f40a809..0705d103 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -76,6 +76,8 @@ subroutine select_connected(i_generator,E0,pt2_data,b,subset,csubset)
 
   double precision, allocatable  :: fock_diag_tmp(:,:)
 
+  if (csubset == 0) return
+
   allocate(fock_diag_tmp(2,mo_num+1))
 
   call build_fock_tmp(fock_diag_tmp,psi_det_generators(1,1,i_generator),N_int)
@@ -177,6 +179,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
   monoAdo = .true.
   monoBdo = .true.
 
+  if (csubset == 0) return
 
   do k=1,N_int
     hole    (k,1) = iand(psi_det_generators(k,1,i_generator), hole_mask(k,1))
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 14d3c696..1706b2ec 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -6,6 +6,7 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num
 
  integer :: k
 
+ call set_multiple_levels_omp(.False.)
  !$OMP PARALLEL DO PRIVATE(k)
  do k=1,cholesky_ao_num
   call ao_to_mo(cholesky_ao(1,1,k),ao_num,cholesky_mo(1,1,k),mo_num)

From b8bfab1d7cd8576c9597d92f70822d903628a6a6 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 27 May 2023 22:34:40 +0200
Subject: [PATCH 28/79] start working on NO

---
 src/tc_bi_ortho/normal_ordered.irp.f | 252 ++++++++++++++++-----------
 1 file changed, 150 insertions(+), 102 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 8adc7a63..c30cd1ef 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -1,3 +1,6 @@
+
+! ---
+
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC 
@@ -8,13 +11,16 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   implicit none
 
-  integer :: i,h1,p1,h2,p2
-  integer :: hh1,hh2,pp1,pp2
+  integer                        :: i, h1, p1, h2, p2
+  integer                        :: hh1, hh2, pp1, pp2
   integer                        :: Ne(2)
+  double precision               :: hthree_aba, hthree_aaa, hthree_aab
+  double precision               :: wall0, wall1
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision :: hthree_aba,hthree_aaa,hthree_aab
-  double precision :: wall0,wall1
+
+  print*,' Providing normal_two_body_bi_orth ...'
+  call wall_time(wall0)
  
   PROVIDE N_int
 
@@ -23,23 +29,21 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   if(core_tc_op) then
     do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
     enddo
-    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
   else
-    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
   endif
 
-  normal_two_body_bi_orth = 0.d0
-  print*,'Providing normal_two_body_bi_orth ...'
-  call wall_time(wall0)
+  normal_two_body_bi_orth(1:mo_num,1:mo_num,1:mo_num,1:mo_num) = 0.d0
 
- !$OMP PARALLEL                                                                         &
- !$OMP DEFAULT (NONE)                                                                   &
- !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
- !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
- !$OMP DO SCHEDULE (static) 
+  !$OMP PARALLEL                                                                         &
+  !$OMP DEFAULT (NONE)                                                                   &
+  !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+  !$OMP SHARED  (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
+  !$OMP DO SCHEDULE (static) 
   do hh1 = 1, n_act_orb
     h1 = list_act(hh1) 
     do pp1 = 1, n_act_orb
@@ -48,50 +52,57 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
         h2 = list_act(hh2) 
         do pp2 = 1, n_act_orb
           p2 = list_act(pp2)
+
           ! all contributions from the 3-e terms to the double excitations 
           ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
     
-
           ! opposite spin double excitations : s1 /= s2
           call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
 
           ! same spin double excitations : s1 == s2 
-          if(h1<h2.and.p1.gt.p2)then
-           ! with opposite spin contributions 
-           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-           ! same spin double excitations with same spin contributions 
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-           else
-             hthree_aaa = 0.d0
-           endif
-          else
-           ! with opposite spin contributions 
-           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-           if(Ne(2).ge.3)then
+          if((h1 .lt. h2) .and. (p1 .gt. p2)) then
+
+            ! with opposite spin contributions 
+            call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+
             ! same spin double excitations with same spin contributions 
-             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-           else
-             hthree_aaa = 0.d0
-           endif
+            if(Ne(2) .ge. 3) then
+              call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+            else
+              hthree_aaa = 0.d0
+            endif
+
+          else
+
+            ! with opposite spin contributions 
+            call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+
+            if(Ne(2) .ge. 3) then
+              ! same spin double excitations with same spin contributions 
+              call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+            else
+              hthree_aaa = 0.d0
+            endif
+
           endif
-          normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+
+          normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0 * (hthree_aba + hthree_aab + hthree_aaa)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print*,'Wall time for normal_two_body_bi_orth ',wall1-wall0
+  !$OMP END DO
+  !$OMP END PARALLEL
 
   deallocate( occ )
   deallocate( key_i_core )
 
+  call wall_time(wall1)
+  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
+
 END_PROVIDER 
 
-
+! ---
 
 subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
@@ -106,30 +117,41 @@ subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
   !!!! double alpha/beta
   hthree = 0.d0
+
   do ii = 1, Ne(2) ! purely closed shell part 
     i = occ(ii,2)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
     int_exc_13 = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2, i,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
     int_exc_12 = -1.d0 * integral
-    hthree += 2.d0 * int_direct - 1.d0 * ( int_exc_13 + int_exc_12)
+
+    hthree += 2.d0 * int_direct - 1.d0 * (int_exc_13 + int_exc_12)
   enddo
+
   do ii = Ne(2) + 1, Ne(1) ! purely open-shell part 
-   i = occ(ii,1)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
+    i = occ(ii,1)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
     int_exc_13 = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2, i,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
     int_exc_12 = -1.d0 * integral
-    hthree += 1.d0 * int_direct - 0.5d0* ( int_exc_13 + int_exc_12)
+
+    hthree += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
   enddo
 
-end subroutine give_aba_contraction
-
+  return
+end
 
+! ---
 
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num, mo_num, mo_num)]
 
@@ -152,29 +174,31 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num,
   allocate( key_i_core(N_int,2) )
   allocate( occ(N_int*bit_kind_size,2) )
 
-  if(core_tc_op)then
-   do i = 1, N_int
-    key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-    key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-   enddo
-   call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
   else
-   call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
   endif
+
   normal_two_body_bi_orth_ab = 0.d0
   do hh1 = 1, n_act_orb
-   h1 = list_act(hh1) 
-   do pp1 = 1, n_act_orb
-    p1 = list_act(pp1)
-    do hh2 = 1, n_act_orb
-     h2 = list_act(hh2) 
-     do pp2 = 1, n_act_orb
-      p2 = list_act(pp2)
-      call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
-      normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
-     enddo
+    h1 = list_act(hh1) 
+    do pp1 = 1, n_act_orb
+      p1 = list_act(pp1)
+      do hh2 = 1, n_act_orb
+        h2 = list_act(hh2) 
+        do pp2 = 1, n_act_orb
+          p2 = list_act(pp2)
+          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
+
+          normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
+        enddo
+      enddo
     enddo
-   enddo
   enddo
 
   deallocate( key_i_core )
@@ -182,7 +206,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num,
 
 END_PROVIDER 
 
-
+! ---
 
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_act_orb, n_act_orb, n_act_orb)]
 
@@ -250,13 +274,14 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_
 
 END_PROVIDER 
 
-
+! ---
 
 subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
   BEGIN_DOC
-! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
+  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
   END_DOC
+
   use bitmasks ! you need to include the bitmasks_module.f90 features
 
   implicit none
@@ -270,48 +295,64 @@ subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
   hthree = 0.d0
   do ii = 1, Ne(2) ! purely closed shell part 
     i = occ(ii,2)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,p1,i ,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
     int_exc_l = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,i ,p2,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
     int_exc_ll= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,i ,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
     int_exc_12= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
     int_exc_13= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(i ,p1,p2,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
     int_exc_23= -1.d0 * integral
 
-    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll -( int_exc_12+ int_exc_13+ int_exc_23  )
+    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
   enddo
+
   do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
     i = occ(ii,1)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
-    int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,p1,i ,i,h2,h1,integral)
-    int_exc_l = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,i ,p2,i,h2,h1,integral)
-    int_exc_ll= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,i ,p1,i,h2,h1,integral)
-    int_exc_12= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
-    int_exc_13= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(i ,p1,p2,i,h2,h1,integral)
-    int_exc_23= -1.d0 * integral
 
-    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll -( int_exc_12+ int_exc_13+ int_exc_23  ))
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
+    int_exc_l = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+    int_exc_ll = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+    int_exc_23 = -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
   enddo
 
-end subroutine give_aaa_contraction
-
+  return
+end
 
+! ---
 
 subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-  implicit none
+
   use bitmasks ! you need to include the bitmasks_module.f90 features
-  integer, intent(in)           :: Nint, h1, h2, p1, p2
-  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+
+  implicit none
+  integer,          intent(in)  :: Nint, h1, h2, p1, p2
+  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
   double precision, intent(out) :: hthree
   integer                       :: ii, i
   double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
@@ -320,11 +361,18 @@ subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
   hthree = 0.d0
   do ii = 1, Ne(2) ! purely closed shell part 
     i = occ(ii,2)
-    call give_integrals_3_body_bi_ort(p2,p1,i,h2,h1,i,integral)
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2,i,h2,h1,i,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
     int_exc_23= -1.d0 * integral
-    hthree  +=  1.d0 * int_direct - int_exc_23
+
+    hthree +=  1.d0 * int_direct - int_exc_23
   enddo
 
-end subroutine give_aab_contraction
+  return
+end
+
+! ---
+

From f0ad63966adf94b1bbe794186a0f35d07c744013 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 30 May 2023 13:48:34 +0200
Subject: [PATCH 29/79] Fixes for numerical orbitals in qp_import

---
 ocaml/Input_ao_basis.ml                  |  47 ++++----
 ocaml/Input_mo_basis.ml                  |   5 +-
 scripts/qp_import_trexio.py              | 146 +++++++++++++++--------
 src/trexio/import_trexio_integrals.irp.f | 136 ++++++++++++++++-----
 4 files changed, 236 insertions(+), 98 deletions(-)

diff --git a/ocaml/Input_ao_basis.ml b/ocaml/Input_ao_basis.ml
index 841089ea..506cf069 100644
--- a/ocaml/Input_ao_basis.ml
+++ b/ocaml/Input_ao_basis.ml
@@ -44,8 +44,12 @@ end = struct
   let get_default = Qpackage.get_ezfio_default "ao_basis";;
 
   let read_ao_basis () =
-    Ezfio.get_ao_basis_ao_basis ()
-    |> AO_basis_name.of_string
+    let result =
+      Ezfio.get_ao_basis_ao_basis ()
+    in
+    if result <> "None" then
+      AO_basis_name.of_string result
+    else failwith "No basis"
   ;;
 
   let read_ao_num () =
@@ -192,7 +196,7 @@ end = struct
          ao_expo         ;
          ao_cartesian    ;
          ao_normalized   ;
-         primitives_normalized ; 
+         primitives_normalized ;
        } = b
      in
      write_md5 b ;
@@ -207,7 +211,7 @@ end = struct
      Ezfio.set_ao_basis_ao_prim_num (Ezfio.ezfio_array_of_list
        ~rank:1 ~dim:[| ao_num |] ~data:ao_prim_num) ;
 
-     let ao_nucl = 
+     let ao_nucl =
        Array.to_list ao_nucl
        |> list_map Nucl_number.to_int
      in
@@ -215,7 +219,7 @@ end = struct
        ~rank:1 ~dim:[| ao_num |] ~data:ao_nucl) ;
 
      let ao_power =
-       let l = Array.to_list ao_power in 
+       let l = Array.to_list ao_power in
        List.concat [
          (list_map (fun a -> Positive_int.to_int a.Angmom.Xyz.x) l) ;
          (list_map (fun a -> Positive_int.to_int a.Angmom.Xyz.y) l) ;
@@ -227,7 +231,7 @@ end = struct
      Ezfio.set_ao_basis_ao_cartesian(ao_cartesian);
      Ezfio.set_ao_basis_ao_normalized(ao_normalized);
      Ezfio.set_ao_basis_primitives_normalized(primitives_normalized);
-     
+
      let ao_coef =
       Array.to_list ao_coef
       |> list_map AO_coef.to_float
@@ -267,7 +271,10 @@ end = struct
         |> Ezfio.set_ao_basis_ao_md5 ;
         Some result
       with
-      | _ -> (Ezfio.set_ao_basis_ao_md5 "None" ; None)
+      | _ -> ( "None"
+               |> Digest.string
+               |> Digest.to_hex
+               |> Ezfio.set_ao_basis_ao_md5 ; None)
   ;;
 
 
@@ -276,7 +283,7 @@ end = struct
       to_basis b
       |> Long_basis.of_basis
       |> Array.of_list
-    and unordered_basis = 
+    and unordered_basis =
       to_long_basis b
       |> Array.of_list
     in
@@ -289,15 +296,15 @@ end = struct
             (a.(i) <- None ; i)
           else
             find x a (i+1)
-      and find2 (s,g,n) a i = 
+      and find2 (s,g,n) a i =
         if i = Array.length a then -1
         else
-            match a.(i) with 
+            match a.(i) with
                 | None -> find2 (s,g,n) a (i+1)
                 | Some (s', g', n')  ->
                    if s <> s' || n <> n' then find2 (s,g,n) a (i+1)
                    else
-                   let lc  = list_map (fun (prim, _) -> prim) g.Gto.lc 
+                   let lc  = list_map (fun (prim, _) -> prim) g.Gto.lc
                    and lc' = list_map (fun (prim, _) -> prim) g'.Gto.lc
                    in
                    if lc <> lc' then find2 (s,g,n) a (i+1) else (a.(i) <- None ; i)
@@ -313,13 +320,13 @@ end = struct
       let ao_num = List.length long_basis |> AO_number.of_int in
       let ao_prim_num =
         list_map (fun (_,g,_) -> List.length g.Gto.lc
-          |> AO_prim_number.of_int ) long_basis 
+          |> AO_prim_number.of_int ) long_basis
         |> Array.of_list
       and ao_nucl =
-        list_map (fun (_,_,n) -> n) long_basis 
+        list_map (fun (_,_,n) -> n) long_basis
         |> Array.of_list
       and ao_power =
-        list_map (fun (x,_,_) -> x) long_basis 
+        list_map (fun (x,_,_) -> x) long_basis
         |> Array.of_list
       in
       let ao_prim_num_max = Array.fold_left (fun s x ->
@@ -329,16 +336,16 @@ end = struct
       in
 
       let gtos =
-        list_map (fun (_,x,_) -> x) long_basis 
+        list_map (fun (_,x,_) -> x) long_basis
       in
       let create_expo_coef ec =
           let coefs =
             begin match ec with
             | `Coefs -> list_map (fun x->
-              list_map (fun (_,coef) -> AO_coef.to_float coef) x.Gto.lc ) gtos 
+              list_map (fun (_,coef) -> AO_coef.to_float coef) x.Gto.lc ) gtos
             | `Expos -> list_map (fun x->
               list_map (fun (prim,_) -> AO_expo.to_float
-              prim.GaussianPrimitive.expo) x.Gto.lc ) gtos 
+              prim.GaussianPrimitive.expo) x.Gto.lc ) gtos
             end
           in
           let rec get_n n accu = function
@@ -360,7 +367,7 @@ end = struct
       let ao_coef = create_expo_coef `Coefs
       |> Array.of_list
       |> Array.map AO_coef.of_float
-      and ao_expo = create_expo_coef `Expos 
+      and ao_expo = create_expo_coef `Expos
       |> Array.of_list
       |> Array.map AO_expo.of_float
       in
@@ -372,7 +379,7 @@ end = struct
         }
   ;;
 
-  let reorder b = 
+  let reorder b =
     let order = ordering b in
     let f a = Array.init (Array.length a) (fun i -> a.(order.(i))) in
     let ao_prim_num_max = AO_prim_number.to_int b.ao_prim_num_max
@@ -464,7 +471,7 @@ Basis set (read-only) ::
     | line :: tail ->
       let line = String.trim line in
       if line = "Basis set (read-only) ::" then
-        String.concat "\n" tail 
+        String.concat "\n" tail
       else
         extract_basis tail
     in
diff --git a/ocaml/Input_mo_basis.ml b/ocaml/Input_mo_basis.ml
index a4e6176a..832b464e 100644
--- a/ocaml/Input_mo_basis.ml
+++ b/ocaml/Input_mo_basis.ml
@@ -56,7 +56,10 @@ end = struct
   let read_ao_md5 () =
     let ao_md5 =
       match (Input_ao_basis.Ao_basis.read ()) with
-      | None -> failwith "Unable to read AO basis"
+      | None -> ("None"
+                 |> Digest.string
+                 |> Digest.to_hex
+                 |> MD5.of_string)
       | Some result -> Input_ao_basis.Ao_basis.to_md5 result
     in
     let result =
diff --git a/scripts/qp_import_trexio.py b/scripts/qp_import_trexio.py
index eb19e16b..e7bc0f78 100755
--- a/scripts/qp_import_trexio.py
+++ b/scripts/qp_import_trexio.py
@@ -132,60 +132,113 @@ def write_ezfio(trexio_filename, filename):
     try:
         basis_type = trexio.read_basis_type(trexio_file)
 
-        if basis_type.lower() not in ["gaussian", "slater"]:
-            raise TypeError
+        if basis_type.lower() in ["gaussian", "slater"]:
+            shell_num   = trexio.read_basis_shell_num(trexio_file)
+            prim_num    = trexio.read_basis_prim_num(trexio_file)
+            ang_mom     = trexio.read_basis_shell_ang_mom(trexio_file)
+            nucl_index  = trexio.read_basis_nucleus_index(trexio_file)
+            exponent    = trexio.read_basis_exponent(trexio_file)
+            coefficient = trexio.read_basis_coefficient(trexio_file)
+            shell_index = trexio.read_basis_shell_index(trexio_file)
+            ao_shell    = trexio.read_ao_shell(trexio_file)
 
-        shell_num   = trexio.read_basis_shell_num(trexio_file)
-        prim_num    = trexio.read_basis_prim_num(trexio_file)
-        ang_mom     = trexio.read_basis_shell_ang_mom(trexio_file)
-        nucl_index  = trexio.read_basis_nucleus_index(trexio_file)
-        exponent    = trexio.read_basis_exponent(trexio_file)
-        coefficient = trexio.read_basis_coefficient(trexio_file)
-        shell_index = trexio.read_basis_shell_index(trexio_file)
-        ao_shell    = trexio.read_ao_shell(trexio_file)
+            ezfio.set_basis_basis("Read from TREXIO")
+            ezfio.set_ao_basis_ao_basis("Read from TREXIO")
+            ezfio.set_basis_shell_num(shell_num)
+            ezfio.set_basis_prim_num(prim_num)
+            ezfio.set_basis_shell_ang_mom(ang_mom)
+            ezfio.set_basis_basis_nucleus_index([ x+1 for x in nucl_index ])
+            ezfio.set_basis_prim_expo(exponent)
+            ezfio.set_basis_prim_coef(coefficient)
 
-        ezfio.set_basis_basis("Read from TREXIO")
-        ezfio.set_basis_shell_num(shell_num)
-        ezfio.set_basis_prim_num(prim_num)
-        ezfio.set_basis_shell_ang_mom(ang_mom)
-        ezfio.set_basis_basis_nucleus_index([ x+1 for x in nucl_index ])
-        ezfio.set_basis_prim_expo(exponent)
-        ezfio.set_basis_prim_coef(coefficient)
+            nucl_shell_num = []
+            prev = None
+            m = 0
+            for i in ao_shell:
+                if i != prev:
+                   m += 1
+                   if prev is None or nucl_index[i] != nucl_index[prev]:
+                        nucl_shell_num.append(m)
+                        m = 0
+                prev = i
+            assert (len(nucl_shell_num) == nucl_num)
 
-        nucl_shell_num = []
-        prev = None
-        m = 0
-        for i in ao_shell:
-            if i != prev:
-               m += 1
-               if prev is None or nucl_index[i] != nucl_index[prev]:
-                    nucl_shell_num.append(m)
-                    m = 0
-            prev = i
-        assert (len(nucl_shell_num) == nucl_num)
+            shell_prim_num = []
+            prev = shell_index[0]
+            count = 0
+            for i in shell_index:
+                if i != prev:
+                   shell_prim_num.append(count)
+                   count = 0
+                count += 1
+                prev = i
+            shell_prim_num.append(count)
 
-        shell_prim_num = []
-        prev = shell_index[0]
-        count = 0
-        for i in shell_index:
-            if i != prev:
-               shell_prim_num.append(count)
-               count = 0
-            count += 1
-            prev = i
-        shell_prim_num.append(count)
+            assert (len(shell_prim_num) == shell_num)
 
-        assert (len(shell_prim_num) == shell_num)
-
-        ezfio.set_basis_shell_prim_num(shell_prim_num)
-        ezfio.set_basis_shell_index([x+1 for x in shell_index])
-        ezfio.set_basis_nucleus_shell_num(nucl_shell_num)
+            ezfio.set_basis_shell_prim_num(shell_prim_num)
+            ezfio.set_basis_shell_index([x+1 for x in shell_index])
+            ezfio.set_basis_nucleus_shell_num(nucl_shell_num)
 
 
-        shell_factor = trexio.read_basis_shell_factor(trexio_file)
-        prim_factor  = trexio.read_basis_prim_factor(trexio_file)
+            shell_factor = trexio.read_basis_shell_factor(trexio_file)
+            prim_factor  = trexio.read_basis_prim_factor(trexio_file)
 
-        print("OK")
+        elif basis_type.lower() == "numerical":
+
+            shell_num   = trexio.read_basis_shell_num(trexio_file)
+            prim_num    = shell_num
+            ang_mom     = trexio.read_basis_shell_ang_mom(trexio_file)
+            nucl_index  = trexio.read_basis_nucleus_index(trexio_file)
+            exponent    = [1.]*prim_num
+            coefficient = [1.]*prim_num
+            shell_index = [i for i in range(shell_num)]
+            ao_shell    = trexio.read_ao_shell(trexio_file)
+
+            ezfio.set_basis_basis("None")
+            ezfio.set_ao_basis_ao_basis("None")
+            ezfio.set_basis_shell_num(shell_num)
+            ezfio.set_basis_prim_num(prim_num)
+            ezfio.set_basis_shell_ang_mom(ang_mom)
+            ezfio.set_basis_basis_nucleus_index([ x+1 for x in nucl_index ])
+            ezfio.set_basis_prim_expo(exponent)
+            ezfio.set_basis_prim_coef(coefficient)
+
+            nucl_shell_num = []
+            prev = None
+            m = 0
+            for i in ao_shell:
+                if i != prev:
+                   m += 1
+                   if prev is None or nucl_index[i] != nucl_index[prev]:
+                        nucl_shell_num.append(m)
+                        m = 0
+                prev = i
+            assert (len(nucl_shell_num) == nucl_num)
+
+            shell_prim_num = []
+            prev = shell_index[0]
+            count = 0
+            for i in shell_index:
+                if i != prev:
+                   shell_prim_num.append(count)
+                   count = 0
+                count += 1
+                prev = i
+            shell_prim_num.append(count)
+
+            assert (len(shell_prim_num) == shell_num)
+
+            ezfio.set_basis_shell_prim_num(shell_prim_num)
+            ezfio.set_basis_shell_index([x+1 for x in shell_index])
+            ezfio.set_basis_nucleus_shell_num(nucl_shell_num)
+
+            shell_factor = trexio.read_basis_shell_factor(trexio_file)
+            prim_factor  = [1.]*prim_num
+        else:
+           raise TypeError
+
+        print(basis_type)
     except:
         print("None")
         ezfio.set_ao_basis_ao_cartesian(True)
@@ -262,7 +315,6 @@ def write_ezfio(trexio_filename, filename):
 #        ezfio.set_ao_basis_ao_prim_num_max(prim_num_max)
         ezfio.set_ao_basis_ao_coef(coef)
         ezfio.set_ao_basis_ao_expo(expo)
-        ezfio.set_ao_basis_ao_basis("Read from TREXIO")
 
         print("OK")
 
diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f
index 9f9ad9d6..8c6b79d7 100644
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@@ -3,6 +3,7 @@ program import_integrals_ao
   implicit none
   integer(trexio_t)              :: f ! TREXIO file handle
   integer(trexio_exit_code)      :: rc
+  PROVIDE mo_num
 
   f = trexio_open(trexio_filename, 'r', TREXIO_AUTO, rc)
   if (f == 0_8) then
@@ -42,10 +43,10 @@ subroutine run(f)
 
   if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then
     rc = trexio_read_nucleus_repulsion(f, s)
-    call trexio_assert(rc, TREXIO_SUCCESS)
     if (rc /= TREXIO_SUCCESS) then
       print *, irp_here, rc
       print *, 'Error reading nuclear repulsion'
+      call trexio_assert(rc, TREXIO_SUCCESS)
       stop -1
     endif
     call ezfio_set_nuclei_nuclear_repulsion(s)
@@ -63,6 +64,7 @@ subroutine run(f)
     if (rc /= TREXIO_SUCCESS) then
       print *, irp_here
       print *, 'Error reading AO overlap'
+      call trexio_assert(rc, TREXIO_SUCCESS)
       stop -1
     endif
     call ezfio_set_ao_one_e_ints_ao_integrals_overlap(A)
@@ -74,6 +76,7 @@ subroutine run(f)
     if (rc /= TREXIO_SUCCESS) then
       print *, irp_here
       print *, 'Error reading AO kinetic integrals'
+      call trexio_assert(rc, TREXIO_SUCCESS)
       stop -1
     endif
     call ezfio_set_ao_one_e_ints_ao_integrals_kinetic(A)
@@ -85,6 +88,7 @@ subroutine run(f)
 !    if (rc /= TREXIO_SUCCESS) then
 !      print *, irp_here
 !      print *, 'Error reading AO ECP local integrals'
+!      call trexio_assert(rc, TREXIO_SUCCESS)
 !      stop -1
 !    endif
 !    call ezfio_set_ao_one_e_ints_ao_integrals_pseudo(A)
@@ -96,6 +100,7 @@ subroutine run(f)
     if (rc /= TREXIO_SUCCESS) then
       print *, irp_here
       print *, 'Error reading AO potential N-e integrals'
+      call trexio_assert(rc, TREXIO_SUCCESS)
       stop -1
     endif
     call ezfio_set_ao_one_e_ints_ao_integrals_n_e(A)
@@ -106,41 +111,112 @@ subroutine run(f)
 
   ! AO 2e integrals
   ! ---------------
-  PROVIDE ao_integrals_map
 
-  integer*4 :: BUFSIZE
-  BUFSIZE=ao_num**2
-  allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
-  allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+  rc = trexio_has_ao_2e_int(f)
+  PROVIDE ao_num
+  if (rc /= TREXIO_HAS_NOT) then
+      PROVIDE ao_integrals_map
 
-  integer*8 :: offset, icount
+      integer*4 :: BUFSIZE
+      BUFSIZE=ao_num**2
+      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
+      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
 
-  offset = 0_8
-  icount = BUFSIZE
-  rc = TREXIO_SUCCESS
-  do while (icount == size(V))
-    rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V)
-    do m=1,icount
-      i = Vi(1,m)
-      j = Vi(2,m)
-      k = Vi(3,m)
-      l = Vi(4,m)
-      integral = V(m)
-      call two_e_integrals_index(i, j, k, l, buffer_i(m) )
-      buffer_values(m) = integral
-    enddo
-    call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values)
-    offset = offset + icount
+      integer*8 :: offset, icount
+
+      offset = 0_8
+      icount = BUFSIZE
+      rc = TREXIO_SUCCESS
+      do while (icount == size(V))
+        rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V)
+        do m=1,icount
+          i = Vi(1,m)
+          j = Vi(2,m)
+          k = Vi(3,m)
+          l = Vi(4,m)
+          integral = V(m)
+          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
+          buffer_values(m) = integral
+        enddo
+        call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values)
+        offset = offset + icount
+        if (rc /= TREXIO_SUCCESS) then
+            exit
+        endif
+      end do
+      n_integrals = offset
+
+      call map_sort(ao_integrals_map)
+      call map_unique(ao_integrals_map)
+
+      call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
+      call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
+
+      deallocate(buffer_i, buffer_values, Vi, V)
+      print *, 'AO integrals read from TREXIO file'
+  else
+      print *, 'AO integrals not found in TREXIO file'
+  endif
+
+  ! MO integrals
+  ! ------------
+
+  allocate(A(mo_num, mo_num))
+  if (trexio_has_mo_1e_int_core_hamiltonian(f) == TREXIO_SUCCESS) then
+    rc = trexio_read_mo_1e_int_core_hamiltonian(f, A)
     if (rc /= TREXIO_SUCCESS) then
-        exit
+      print *, irp_here
+      print *, 'Error reading MO 1e integrals'
+      call trexio_assert(rc, TREXIO_SUCCESS)
+      stop -1
     endif
-  end do
-  n_integrals = offset
+    call ezfio_set_mo_one_e_ints_mo_one_e_integrals(A)
+    call ezfio_set_mo_one_e_ints_io_mo_one_e_integrals('Read')
+  endif
+  deallocate(A)
 
-  call map_sort(ao_integrals_map)
-  call map_unique(ao_integrals_map)
+  ! MO 2e integrals
+  ! ---------------
 
-  call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
-  call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
+  rc = trexio_has_mo_2e_int(f)
+  if (rc /= TREXIO_HAS_NOT) then
+
+      BUFSIZE=mo_num**2
+      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
+      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+
+
+      offset = 0_8
+      icount = BUFSIZE
+      rc = TREXIO_SUCCESS
+      do while (icount == size(V))
+        rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V)
+        do m=1,icount
+          i = Vi(1,m)
+          j = Vi(2,m)
+          k = Vi(3,m)
+          l = Vi(4,m)
+          integral = V(m)
+          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
+          buffer_values(m) = integral
+        enddo
+        call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4))
+        offset = offset + icount
+        if (rc /= TREXIO_SUCCESS) then
+            exit
+        endif
+      end do
+      n_integrals = offset
+
+      call map_sort(mo_integrals_map)
+      call map_unique(mo_integrals_map)
+
+      call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map)
+      call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
+      deallocate(buffer_i, buffer_values, Vi, V)
+      print *, 'MO integrals read from TREXIO file'
+  else
+      print *, 'MO integrals not found in TREXIO file'
+  endif
 
 end

From ff5d62f840ee0c685120df72724d6ba049a07037 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 31 May 2023 11:01:51 +0200
Subject: [PATCH 30/79] Fix normalization factor in trexio

---
 src/trexio/EZFIO.cfg                    |  30 ++--
 src/trexio/export_trexio.irp.f          |   2 +-
 src/trexio/export_trexio_routines.irp.f | 176 +++++++++++++-----------
 3 files changed, 119 insertions(+), 89 deletions(-)

diff --git a/src/trexio/EZFIO.cfg b/src/trexio/EZFIO.cfg
index 8606e908..8c11478e 100644
--- a/src/trexio/EZFIO.cfg
+++ b/src/trexio/EZFIO.cfg
@@ -10,11 +10,17 @@ doc: Name of the exported TREXIO file
 interface: ezfio, ocaml, provider
 default: None
 
-[export_rdm]
+[export_basis]
 type: logical
-doc: If True, export two-body reduced density matrix
+doc: If True, export basis set and AOs
 interface: ezfio, ocaml, provider
-default: False
+default: True
+
+[export_mos]
+type: logical
+doc: If True, export molecular orbitals (MOs)
+interface: ezfio, ocaml, provider
+default: True
 
 [export_ao_one_e_ints]
 type: logical
@@ -22,12 +28,6 @@ doc: If True, export one-electron integrals in AO basis
 interface: ezfio, ocaml, provider
 default: False
 
-[export_mo_one_e_ints]
-type: logical
-doc: If True, export one-electron integrals in MO basis
-interface: ezfio, ocaml, provider
-default: False
-
 [export_ao_two_e_ints]
 type: logical
 doc: If True, export two-electron integrals in AO basis
@@ -40,6 +40,12 @@ doc: If True, export Cholesky-decomposed two-electron integrals in AO basis
 interface: ezfio, ocaml, provider
 default: False
 
+[export_mo_one_e_ints]
+type: logical
+doc: If True, export one-electron integrals in MO basis
+interface: ezfio, ocaml, provider
+default: False
+
 [export_mo_two_e_ints]
 type: logical
 doc: If True, export two-electron integrals in MO basis
@@ -52,3 +58,9 @@ doc: If True, export Cholesky-decomposed two-electron integrals in MO basis
 interface: ezfio, ocaml, provider
 default: False
 
+[export_rdm]
+type: logical
+doc: If True, export two-body reduced density matrix
+interface: ezfio, ocaml, provider
+default: False
+
diff --git a/src/trexio/export_trexio.irp.f b/src/trexio/export_trexio.irp.f
index 3ae0dcb4..f9ecc17f 100644
--- a/src/trexio/export_trexio.irp.f
+++ b/src/trexio/export_trexio.irp.f
@@ -2,6 +2,6 @@ program export_trexio_prog
   implicit none
   read_wf = .True.
   SOFT_TOUCH read_wf
-  call export_trexio
+  call export_trexio(.False.)
 end
 
diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f
index c55ddc5e..f25ae370 100644
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@@ -1,15 +1,17 @@
-subroutine export_trexio
+subroutine export_trexio(update)
   use trexio
   implicit none
   BEGIN_DOC
   !     Exports the wave function in TREXIO format
   END_DOC
 
+  logical, intent(in)            :: update
   integer(trexio_t)              :: f(N_states) ! TREXIO file handle
   integer(trexio_exit_code)      :: rc
   integer                        :: k
   double precision, allocatable  :: factor(:)
   character*(256)  :: filenames(N_states)
+  character :: rw
 
   filenames(1) = trexio_filename
   do k=2,N_states
@@ -18,15 +20,26 @@ subroutine export_trexio
 
   do k=1,N_states
     print *, 'TREXIO file : ', trim(filenames(k))
-    call system('test -f '//trim(filenames(k))//' && mv '//trim(filenames(k))//' '//trim(filenames(k))//'.bak')
+    if (update) then
+      call system('test -f '//trim(filenames(k))//' && cp -r '//trim(filenames(k))//' '//trim(filenames(k))//'.bak')
+    else
+      call system('test -f '//trim(filenames(k))//' && mv '//trim(filenames(k))//' '//trim(filenames(k))//'.bak')
+    endif
   enddo
   print *, ''
 
+  if (update) then
+     rw = 'u'
+  else
+     rw = 'w'
+  endif
+
+
   do k=1,N_states
     if (backend == 0) then
-      f(k) = trexio_open(filenames(k), 'u', TREXIO_HDF5, rc)
+      f(k) = trexio_open(filenames(k), rw, TREXIO_HDF5, rc)
     else if (backend == 1) then
-      f(k) = trexio_open(filenames(k), 'u', TREXIO_TEXT, rc)
+      f(k) = trexio_open(filenames(k), rw, TREXIO_TEXT, rc)
     endif
     if (f(k) == 0_8) then
       print *, 'Unable to open TREXIO file for writing'
@@ -171,92 +184,95 @@ subroutine export_trexio
   endif
 
 
+  if (export_basis) then
+
 ! Basis
 ! -----
 
-  print *, 'Basis'
+    print *, 'Basis'
 
+    rc = trexio_write_basis_type(f(1), 'Gaussian', len('Gaussian'))
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_basis_type(f(1), 'Gaussian', len('Gaussian'))
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    rc = trexio_write_basis_prim_num(f(1), prim_num)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_basis_prim_num(f(1), prim_num)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+     rc = trexio_write_basis_shell_num(f(1), shell_num)
+     call trexio_assert(rc, TREXIO_SUCCESS)
 
-   rc = trexio_write_basis_shell_num(f(1), shell_num)
-   call trexio_assert(rc, TREXIO_SUCCESS)
+     rc = trexio_write_basis_nucleus_index(f(1), basis_nucleus_index)
+     call trexio_assert(rc, TREXIO_SUCCESS)
 
-   rc = trexio_write_basis_nucleus_index(f(1), basis_nucleus_index)
-   call trexio_assert(rc, TREXIO_SUCCESS)
+     rc = trexio_write_basis_shell_ang_mom(f(1), shell_ang_mom)
+     call trexio_assert(rc, TREXIO_SUCCESS)
 
-   rc = trexio_write_basis_shell_ang_mom(f(1), shell_ang_mom)
-   call trexio_assert(rc, TREXIO_SUCCESS)
+     allocate(factor(shell_num))
+!     if (ao_normalized) then
+!       factor(1:shell_num) = shell_normalization_factor(1:shell_num)
+!     else
+       factor(1:shell_num) = 1.d0
+!     endif
+     rc = trexio_write_basis_shell_factor(f(1), factor)
+     call trexio_assert(rc, TREXIO_SUCCESS)
 
-   allocate(factor(shell_num))
-   if (ao_normalized) then
-     factor(1:shell_num) = shell_normalization_factor(1:shell_num)
-   else
-     factor(1:shell_num) = 1.d0
-   endif
-   rc = trexio_write_basis_shell_factor(f(1), factor)
-   call trexio_assert(rc, TREXIO_SUCCESS)
+     deallocate(factor)
 
-   deallocate(factor)
+    rc = trexio_write_basis_shell_index(f(1), shell_index)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_basis_shell_index(f(1), shell_index)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    rc = trexio_write_basis_exponent(f(1), prim_expo)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_basis_exponent(f(1), prim_expo)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    rc = trexio_write_basis_coefficient(f(1), prim_coef)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_basis_coefficient(f(1), prim_coef)
-  call trexio_assert(rc, TREXIO_SUCCESS)
-
-  allocate(factor(prim_num))
-  if (primitives_normalized) then
-    factor(1:prim_num) = prim_normalization_factor(1:prim_num)
-  else
-    factor(1:prim_num) = 1.d0
-  endif
-  rc = trexio_write_basis_prim_factor(f(1), factor)
-  call trexio_assert(rc, TREXIO_SUCCESS)
-  deallocate(factor)
+    allocate(factor(prim_num))
+    if (primitives_normalized) then
+      factor(1:prim_num) = prim_normalization_factor(1:prim_num)
+    else
+      factor(1:prim_num) = 1.d0
+    endif
+    rc = trexio_write_basis_prim_factor(f(1), factor)
+    call trexio_assert(rc, TREXIO_SUCCESS)
+    deallocate(factor)
 
 
 ! Atomic orbitals
 ! ---------------
 
-  print *, 'AOs'
+    print *, 'AOs'
 
-  rc = trexio_write_ao_num(f(1), ao_num)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    rc = trexio_write_ao_num(f(1), ao_num)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_ao_cartesian(f(1), 1)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    rc = trexio_write_ao_cartesian(f(1), 1)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  rc = trexio_write_ao_shell(f(1), ao_shell)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    rc = trexio_write_ao_shell(f(1), ao_shell)
+    call trexio_assert(rc, TREXIO_SUCCESS)
 
-  integer :: i, pow0(3), powA(3), j, l, nz
-  double precision :: normA, norm0, C_A(3), overlap_x, overlap_z, overlap_y, c
-  nz=100
+    integer :: i, pow0(3), powA(3), j, l, nz
+    double precision :: normA, norm0, C_A(3), overlap_x, overlap_z, overlap_y, c
+    nz=100
 
-  C_A(1) = 0.d0
-  C_A(2) = 0.d0
-  C_A(3) = 0.d0
+    C_A(1) = 0.d0
+    C_A(2) = 0.d0
+    C_A(3) = 0.d0
+
+    allocate(factor(ao_num))
+    if (ao_normalized) then
+      do i=1,ao_num
+        l = ao_first_of_shell(ao_shell(i))
+        factor(i) = (ao_coef_normalized(i,1)+tiny(1.d0))/(ao_coef_normalized(l,1)+tiny(1.d0))
+      enddo
+    else
+      factor(:) = 1.d0
+    endif
+    rc = trexio_write_ao_normalization(f(1), factor)
+    call trexio_assert(rc, TREXIO_SUCCESS)
+    deallocate(factor)
 
-  allocate(factor(ao_num))
-  if (ao_normalized) then
-    do i=1,ao_num
-      l = ao_first_of_shell(ao_shell(i))
-      factor(i) = (ao_coef_normalized(i,1)+tiny(1.d0))/(ao_coef_normalized(l,1)+tiny(1.d0))
-    enddo
-  else
-    factor(:) = 1.d0
   endif
-  rc = trexio_write_ao_normalization(f(1), factor)
-  call trexio_assert(rc, TREXIO_SUCCESS)
-  deallocate(factor)
 
 ! One-e AO integrals
 ! ------------------
@@ -375,28 +391,30 @@ subroutine export_trexio
 ! Molecular orbitals
 ! ------------------
 
-  print *, 'MOs'
+  if (export_mos) then
+    print *, 'MOs'
 
-  rc = trexio_write_mo_type(f(1), mo_label, len(trim(mo_label)))
-  call trexio_assert(rc, TREXIO_SUCCESS)
-
-  do k=1,N_states
-    rc = trexio_write_mo_num(f(k), mo_num)
+    rc = trexio_write_mo_type(f(1), mo_label, len(trim(mo_label)))
     call trexio_assert(rc, TREXIO_SUCCESS)
-  enddo
 
-  rc = trexio_write_mo_coefficient(f(1), mo_coef)
-  call trexio_assert(rc, TREXIO_SUCCESS)
+    do k=1,N_states
+      rc = trexio_write_mo_num(f(k), mo_num)
+      call trexio_assert(rc, TREXIO_SUCCESS)
+    enddo
 
-  if ( (trim(mo_label) == 'Canonical').and. &
-       (export_mo_two_e_ints_cholesky.or.export_mo_two_e_ints) ) then
-    rc = trexio_write_mo_energy(f(1), fock_matrix_diag_mo)
+    rc = trexio_write_mo_coefficient(f(1), mo_coef)
+    call trexio_assert(rc, TREXIO_SUCCESS)
+
+    if ( (trim(mo_label) == 'Canonical').and. &
+         (export_mo_two_e_ints_cholesky.or.export_mo_two_e_ints) ) then
+      rc = trexio_write_mo_energy(f(1), fock_matrix_diag_mo)
+      call trexio_assert(rc, TREXIO_SUCCESS)
+    endif
+
+    rc = trexio_write_mo_class(f(1), mo_class, len(mo_class(1)))
     call trexio_assert(rc, TREXIO_SUCCESS)
   endif
 
-  rc = trexio_write_mo_class(f(1), mo_class, len(mo_class(1)))
-  call trexio_assert(rc, TREXIO_SUCCESS)
-
 ! One-e MO integrals
 ! ------------------
 

From 3306d26e0e0f08cd407df73de4f44388ebc6a919 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 31 May 2023 11:47:53 +0200
Subject: [PATCH 31/79] Fix import_trexio

---
 scripts/qp_import_trexio.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/qp_import_trexio.py b/scripts/qp_import_trexio.py
index e7bc0f78..89096387 100755
--- a/scripts/qp_import_trexio.py
+++ b/scripts/qp_import_trexio.py
@@ -340,6 +340,7 @@ def write_ezfio(trexio_filename, filename):
     except:
       label = "None"
     ezfio.set_mo_basis_mo_label(label)
+    ezfio.set_determinants_mo_label(label)
 
     try:
       clss = trexio.read_mo_class(trexio_file)

From 87090d73978169b167e103e3fd867682d9f5b32f Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 31 May 2023 18:11:54 +0200
Subject: [PATCH 32/79] fixed nuclear repulsion in fci_tc_bi_ortho

---
 src/fci_tc_bi/diagonalize_ci.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fci_tc_bi/diagonalize_ci.irp.f b/src/fci_tc_bi/diagonalize_ci.irp.f
index b6ec073f..df753449 100644
--- a/src/fci_tc_bi/diagonalize_ci.irp.f
+++ b/src/fci_tc_bi/diagonalize_ci.irp.f
@@ -39,7 +39,7 @@ subroutine diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
    write(*,'(A28,X,I10,X,100(F16.8,X))')'Ndet,E,E+PT2,E+RPT2,|PT2|=',ndet,E_tc ,E_tc  + pt2_tmp/norm,E_tc  + rpt2_tmp/norm,abs_pt2
    print*,'*****'
   endif
-  psi_energy(1:N_states) = eigval_right_tc_bi_orth(1:N_states)
+  psi_energy(1:N_states) = eigval_right_tc_bi_orth(1:N_states) - nuclear_repulsion
   psi_s2(1:N_states) = s2_eigvec_tc_bi_orth(1:N_states)
 
   E_tc  = eigval_right_tc_bi_orth(1)

From 00be08932321f73b425987f99e4e5fcc685425f9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 09:56:06 +0200
Subject: [PATCH 33/79] Removed duplicate provider in cosgto

---
 src/cosgtos_ao_int/EZFIO.cfg | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/cosgtos_ao_int/EZFIO.cfg b/src/cosgtos_ao_int/EZFIO.cfg
index 8edeecd0..fe57b652 100644
--- a/src/cosgtos_ao_int/EZFIO.cfg
+++ b/src/cosgtos_ao_int/EZFIO.cfg
@@ -10,10 +10,3 @@ doc: If true, use cosgtos for AO integrals
 interface: ezfio,provider,ocaml
 default: False
 
-[ao_integrals_threshold]
-type: Threshold
-doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero
-interface: ezfio,provider,ocaml
-default: 1.e-15
-ezfio_name: threshold_ao
-

From 77186e0560d7febb0cf0f0ce7c6c03f98e1b9d9c Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 11:11:29 +0200
Subject: [PATCH 34/79] Cleaned cosgtos

---
 src/ao_basis/EZFIO.cfg                               | 11 +++++++++++
 src/ao_one_e_ints/NEED                               |  1 -
 .../aos_cosgtos.irp.f                                |  0
 .../one_e_Coul_integrals_cosgtos.irp.f}              |  0
 .../one_e_kin_integrals_cosgtos.irp.f}               |  0
 .../gauss_legendre.irp.f                             |  0
 .../two_e_Coul_integrals_cosgtos.irp.f}              |  0
 src/cosgtos_ao_int/EZFIO.cfg                         | 12 ------------
 src/cosgtos_ao_int/NEED                              |  2 --
 src/cosgtos_ao_int/README.rst                        |  4 ----
 src/cosgtos_ao_int/cosgtos_ao_int.irp.f              |  7 -------
 11 files changed, 11 insertions(+), 26 deletions(-)
 rename src/{cosgtos_ao_int => ao_one_e_ints}/aos_cosgtos.irp.f (100%)
 rename src/{cosgtos_ao_int/one_e_Coul_integrals.irp.f => ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f} (100%)
 rename src/{cosgtos_ao_int/one_e_kin_integrals.irp.f => ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f} (100%)
 rename src/{cosgtos_ao_int => ao_two_e_ints}/gauss_legendre.irp.f (100%)
 rename src/{cosgtos_ao_int/two_e_Coul_integrals.irp.f => ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f} (100%)
 delete mode 100644 src/cosgtos_ao_int/EZFIO.cfg
 delete mode 100644 src/cosgtos_ao_int/NEED
 delete mode 100644 src/cosgtos_ao_int/README.rst
 delete mode 100644 src/cosgtos_ao_int/cosgtos_ao_int.irp.f

diff --git a/src/ao_basis/EZFIO.cfg b/src/ao_basis/EZFIO.cfg
index 51d726da..a203e3f0 100644
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@@ -67,3 +67,14 @@ doc: Use normalized primitive functions
 interface: ezfio, provider
 default: true
 
+[ao_expoim_cosgtos]
+type: double precision
+doc: imag part for Exponents for each primitive of each cosGTOs |AO|
+size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
+interface: ezfio, provider
+
+[use_cosgtos]
+type: logical
+doc: If true, use cosgtos for AO integrals
+interface: ezfio,provider
+default: False
diff --git a/src/ao_one_e_ints/NEED b/src/ao_one_e_ints/NEED
index b9caaf5d..61d23b1e 100644
--- a/src/ao_one_e_ints/NEED
+++ b/src/ao_one_e_ints/NEED
@@ -1,3 +1,2 @@
 ao_basis
 pseudo
-cosgtos_ao_int
diff --git a/src/cosgtos_ao_int/aos_cosgtos.irp.f b/src/ao_one_e_ints/aos_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/aos_cosgtos.irp.f
rename to src/ao_one_e_ints/aos_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/one_e_Coul_integrals.irp.f b/src/ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/one_e_Coul_integrals.irp.f
rename to src/ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/one_e_kin_integrals.irp.f b/src/ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/one_e_kin_integrals.irp.f
rename to src/ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/gauss_legendre.irp.f b/src/ao_two_e_ints/gauss_legendre.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/gauss_legendre.irp.f
rename to src/ao_two_e_ints/gauss_legendre.irp.f
diff --git a/src/cosgtos_ao_int/two_e_Coul_integrals.irp.f b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/two_e_Coul_integrals.irp.f
rename to src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/EZFIO.cfg b/src/cosgtos_ao_int/EZFIO.cfg
deleted file mode 100644
index fe57b652..00000000
--- a/src/cosgtos_ao_int/EZFIO.cfg
+++ /dev/null
@@ -1,12 +0,0 @@
-[ao_expoim_cosgtos]
-type: double precision
-doc: imag part for Exponents for each primitive of each cosGTOs |AO|
-size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
-interface: ezfio, provider
-
-[use_cosgtos]
-type: logical
-doc: If true, use cosgtos for AO integrals
-interface: ezfio,provider,ocaml
-default: False
-
diff --git a/src/cosgtos_ao_int/NEED b/src/cosgtos_ao_int/NEED
deleted file mode 100644
index 932f88a3..00000000
--- a/src/cosgtos_ao_int/NEED
+++ /dev/null
@@ -1,2 +0,0 @@
-ezfio_files
-ao_basis
diff --git a/src/cosgtos_ao_int/README.rst b/src/cosgtos_ao_int/README.rst
deleted file mode 100644
index 01f25d6d..00000000
--- a/src/cosgtos_ao_int/README.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-==============
-cosgtos_ao_int
-==============
-
diff --git a/src/cosgtos_ao_int/cosgtos_ao_int.irp.f b/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
deleted file mode 100644
index d65dfba5..00000000
--- a/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
+++ /dev/null
@@ -1,7 +0,0 @@
-program cosgtos_ao_int
-  implicit none
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-  print *, 'Hello world'
-end

From d05e4ed0b310fb083bf4318e8c7dee481dda302f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 17:46:07 +0200
Subject: [PATCH 35/79] Fix ao_basis/use_cosgtos not found in EZFIO file

---
 src/ao_basis/EZFIO.cfg     |  3 ++-
 src/ao_basis/cosgtos.irp.f | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 src/ao_basis/cosgtos.irp.f

diff --git a/src/ao_basis/EZFIO.cfg b/src/ao_basis/EZFIO.cfg
index a203e3f0..6ad9b998 100644
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@@ -76,5 +76,6 @@ interface: ezfio, provider
 [use_cosgtos]
 type: logical
 doc: If true, use cosgtos for AO integrals
-interface: ezfio,provider
+interface: ezfio
 default: False
+
diff --git a/src/ao_basis/cosgtos.irp.f b/src/ao_basis/cosgtos.irp.f
new file mode 100644
index 00000000..721a3e57
--- /dev/null
+++ b/src/ao_basis/cosgtos.irp.f
@@ -0,0 +1,33 @@
+BEGIN_PROVIDER [ logical, use_cosgtos  ]
+  implicit none
+  BEGIN_DOC
+! If true, use cosgtos for AO integrals. Falls back to .False. when ezfio_has_ao_basis_use_cosgtos reports no stored value.
+  END_DOC
+! Only the MPI master queries EZFIO; in MPI builds the result is then broadcast from rank 0 to every process.
+  logical                        :: has
+  PROVIDE ezfio_filename
+  if (mpi_master) then
+    call ezfio_has_ao_basis_use_cosgtos(has)   ! presumably .True. when the entry exists in the EZFIO file
+    if (has) then
+!      write(6,'(A)') '.. >>>>> [ IO READ: use_cosgtos ] <<<<< ..'
+      call ezfio_get_ao_basis_use_cosgtos(use_cosgtos)
+    else
+      use_cosgtos = .False.   ! default used instead of failing on a missing entry
+    endif
+  endif
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)   ! NOTE(review): ierr is declared only inside the IRP_IF MPI section below
+  IRP_ENDIF
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    call MPI_BCAST( use_cosgtos, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)   ! root 0 sends its value to all ranks
+    if (ierr /= MPI_SUCCESS) then
+      stop 'Unable to read use_cosgtos with MPI'
+    endif
+  IRP_ENDIF
+! Timing output is intentionally left disabled for this provider.
+!  call write_time(6)
+
+END_PROVIDER

From 5ab6a1d7fba6fbff88ac858747783bb9292b9a89 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 1 Jun 2023 19:59:25 +0200
Subject: [PATCH 36/79] few modifs

---
 src/bi_ort_ints/semi_num_ints_mo.irp.f       |  5 +++
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f |  2 +-
 src/non_h_ints_mu/grad_squared.irp.f         | 21 ++++++++++-
 src/non_h_ints_mu/new_grad_tc.irp.f          |  1 +
 src/non_h_ints_mu/tc_integ.irp.f             |  6 +++
 src/non_h_ints_mu/total_tc_int.irp.f         |  5 +++
 src/tc_scf/rh_tcscf_diis.irp.f               | 39 +++++++++++++++++++-
 src/tc_scf/tc_scf.irp.f                      |  4 ++
 8 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 0d727785..771d3274 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -138,10 +138,13 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
       enddo
     enddo
 
+    FREE int2_grad1_u12_ao
+
   endif
 
   call wall_time(wall1)
   print *, ' wall time for int2_grad1_u12_ao_transp ', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -200,6 +203,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
     enddo
   enddo
 
+  FREE int2_grad1_u12_bimo_transp
+
 END_PROVIDER 
 
 ! ---
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index e8b56307..5a3730b3 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -79,7 +79,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
   integer,          intent(in)  :: n, l, k, m, j, i
   double precision, intent(out) :: integral
   integer                       :: ipoint
-  double precision              :: weight
+  double precision              :: weight, tmp
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_t
diff --git a/src/non_h_ints_mu/grad_squared.irp.f b/src/non_h_ints_mu/grad_squared.irp.f
index 3f1a9bf5..44a6ae65 100644
--- a/src/non_h_ints_mu/grad_squared.irp.f
+++ b/src/non_h_ints_mu/grad_squared.irp.f
@@ -231,6 +231,7 @@ BEGIN_PROVIDER [ double precision, grad12_j12, (ao_num, ao_num, n_points_final_g
   call wall_time(time0)
 
   PROVIDE j1b_type
+  PROVIDE int2_grad1u2_grad2u2_j1b2
 
   do ipoint = 1, n_points_final_grid
     tmp1 = v_1b(ipoint)
@@ -242,6 +243,8 @@ BEGIN_PROVIDER [ double precision, grad12_j12, (ao_num, ao_num, n_points_final_g
     enddo
   enddo
 
+  FREE int2_grad1u2_grad2u2_j1b2
+
   !if(j1b_type .eq. 0) then 
   !  grad12_j12 = 0.d0
   !  do ipoint = 1, n_points_final_grid
@@ -262,6 +265,7 @@ BEGIN_PROVIDER [ double precision, grad12_j12, (ao_num, ao_num, n_points_final_g
 
   call wall_time(time1)
   print*, ' Wall time for grad12_j12 = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -278,6 +282,9 @@ BEGIN_PROVIDER [double precision, u12sq_j1bsq, (ao_num, ao_num, n_points_final_g
   print*, ' providing u12sq_j1bsq ...'
   call wall_time(time0)
 
+  ! do not free here
+  PROVIDE int2_u2_j1b2
+
   do ipoint = 1, n_points_final_grid
     tmp_x = v_1b_grad(1,ipoint)
     tmp_y = v_1b_grad(2,ipoint)
@@ -292,6 +299,7 @@ BEGIN_PROVIDER [double precision, u12sq_j1bsq, (ao_num, ao_num, n_points_final_g
 
   call wall_time(time1)
   print*, ' Wall time for u12sq_j1bsq = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -310,6 +318,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b, (ao_num, ao_num,
   print*, ' providing u12_grad1_u12_j1b_grad1_j1b ...'
   call wall_time(time0)
 
+  PROVIDE int2_u_grad1u_j1b2
+  PROVIDE int2_u_grad1u_x_j1b2
+
   do ipoint = 1, n_points_final_grid
 
     x     = final_grid_points(1,ipoint)
@@ -340,14 +351,17 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b, (ao_num, ao_num,
     enddo
   enddo
 
+  FREE int2_u_grad1u_j1b2
+  FREE int2_u_grad1u_x_j1b2
+
   call wall_time(time1)
   print*, ' Wall time for u12_grad1_u12_j1b_grad1_j1b = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
 ! ---
 
-
 BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
 
   BEGIN_DOC
@@ -401,6 +415,8 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
               , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid &
               , 0.d0, tc_grad_square_ao, ao_num*ao_num)
 
+    FREE int2_grad1_u12_square_ao
+
     ! ---
 
     if(((j1b_type .eq. 3) .or. (j1b_type .eq. 4)) .and. use_ipp) then
@@ -442,6 +458,8 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
       call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0     &
                 , int2_u2_j1b2(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid &
                 , 1.d0, tc_grad_square_ao, ao_num*ao_num)
+
+      FREE int2_u2_j1b2
     endif
 
     ! ---
@@ -478,6 +496,7 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
 
   call wall_time(time1)
   print*, ' Wall time for tc_grad_square_ao = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
diff --git a/src/non_h_ints_mu/new_grad_tc.irp.f b/src/non_h_ints_mu/new_grad_tc.irp.f
index 24e7e743..499ffe9d 100644
--- a/src/non_h_ints_mu/new_grad_tc.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc.irp.f
@@ -284,6 +284,7 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
 
   call wall_time(time1)
   print*, ' Wall time for tc_grad_and_lapl_ao = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/non_h_ints_mu/tc_integ.irp.f b/src/non_h_ints_mu/tc_integ.irp.f
index d5995ae5..8251fc71 100644
--- a/src/non_h_ints_mu/tc_integ.irp.f
+++ b/src/non_h_ints_mu/tc_integ.irp.f
@@ -176,6 +176,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
 
   call wall_time(time1)
   print*, ' wall time for int2_grad1_u12_ao =', time1-time0 
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -242,6 +243,8 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
       !$OMP END DO
       !$OMP END PARALLEL
 
+      FREE u12sq_j1bsq grad12_j12
+
     else
 
       PROVIDE u12sq_j1bsq u12_grad1_u12_j1b_grad1_j1b grad12_j12
@@ -262,6 +265,8 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
       !$OMP END DO
       !$OMP END PARALLEL
 
+      FREE u12sq_j1bsq u12_grad1_u12_j1b_grad1_j1b grad12_j12
+
     endif
 
   elseif(j1b_type .ge. 100) then
@@ -324,6 +329,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
 
   call wall_time(time1)
   print*, ' wall time for int2_grad1_u12_square_ao =', time1-time0 
+  call print_memory_usage()
 
 END_PROVIDER
 
diff --git a/src/non_h_ints_mu/total_tc_int.irp.f b/src/non_h_ints_mu/total_tc_int.irp.f
index 450bbef0..2034872a 100644
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@@ -84,8 +84,13 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist, (ao_num, ao_num, ao_num, ao
     enddo
   endif
 
+  FREE tc_grad_square_ao
+  FREE tc_grad_and_lapl_ao
+  FREE ao_two_e_coul
+
   call wall_time(wall1)
   print *, ' wall time for ao_tc_int_chemist ', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/tc_scf/rh_tcscf_diis.irp.f b/src/tc_scf/rh_tcscf_diis.irp.f
index 20260a95..0504373c 100644
--- a/src/tc_scf/rh_tcscf_diis.irp.f
+++ b/src/tc_scf/rh_tcscf_diis.irp.f
@@ -11,6 +11,7 @@ subroutine rh_tcscf_diis()
 
   integer                       :: i, j, it
   integer                       :: dim_DIIS, index_dim_DIIS
+  logical                       :: converged
   double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, e_save, e_delta
   double precision              :: tc_grad, g_save, g_delta, g_delta_th
   double precision              :: level_shift_save, rate_th
@@ -92,8 +93,9 @@ subroutine rh_tcscf_diis()
 
   PROVIDE FQS_SQF_ao Fock_matrix_tc_ao_tot
 
+  converged = .false.
   !do while((tc_grad .gt. dsqrt(thresh_tcscf)) .and. (er_DIIS .gt. dsqrt(thresh_tcscf)))
-  do while(er_DIIS .gt. dsqrt(thresh_tcscf))
+  do while(.not. converged)
 
     call wall_time(t0)
 
@@ -218,21 +220,56 @@ subroutine rh_tcscf_diis()
     !g_delta_th  = dabs(tc_grad) ! g_delta)
     er_delta_th = dabs(er_DIIS) !er_delta)
 
+    converged = er_DIIS .lt. dsqrt(thresh_tcscf)
+
     call wall_time(t1)
     !write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
     !  it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
     write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
       it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
 
+
+!   Write data in JSON file
+
+    call lock_io
+    if (it == 1) then
+      write(json_unit, json_dict_uopen_fmt)
+    else
+      write(json_unit, json_dict_close_uopen_fmt)
+    endif
+    write(json_unit, json_int_fmt)  ' iteration      ', it
+    write(json_unit, json_real_fmt) ' SCF TC Energy  ', etc_tot
+    write(json_unit, json_real_fmt) ' E(1e)          ', etc_1e
+    write(json_unit, json_real_fmt) ' E(2e)          ', etc_2e
+    write(json_unit, json_real_fmt) ' E(3e)          ', etc_3e
+    write(json_unit, json_real_fmt) ' delta Energy   ', e_delta
+    write(json_unit, json_real_fmt) ' DIIS error     ', er_DIIS
+    write(json_unit, json_real_fmt) ' level_shift    ', level_shift_tcscf
+    write(json_unit, json_real_fmt) ' DIIS           ', dim_DIIS
+    write(json_unit, json_real_fmt) ' Wall time (min)', (t1-t0)/60.d0
+    call unlock_io
+
     if(er_delta .lt. 0.d0) then
       call ezfio_set_tc_scf_bitc_energy(etc_tot)
       call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
       call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+      write(json_unit, json_true_fmt) 'saved'
+    else
+      write(json_unit, json_false_fmt) 'saved'
     endif
+    call lock_io
 
+    if (converged) then
+      write(json_unit, json_true_fmtx) 'converged'
+    else
+      write(json_unit, json_false_fmtx) 'converged'
+    endif
+    call unlock_io
     if(qp_stop()) exit
   enddo
 
+  write(json_unit, json_dict_close_fmtx)
+
   ! ---
 
   print *, ' TCSCF DIIS converged !'
diff --git a/src/tc_scf/tc_scf.irp.f b/src/tc_scf/tc_scf.irp.f
index 88ddd26c..04c4f92d 100644
--- a/src/tc_scf/tc_scf.irp.f
+++ b/src/tc_scf/tc_scf.irp.f
@@ -8,6 +8,8 @@ program tc_scf
 
   implicit none
 
+  write(json_unit,json_array_open_fmt) 'tc-scf'
+
   print *, ' starting ...'
 
   my_grid_becke  = .True.
@@ -57,6 +59,8 @@ program tc_scf
 
   endif
 
+  write(json_unit,json_array_close_fmtx)
+  call json_close
 
 end
 

From 6971bf186cf020ce66d0bac091d06ae850bd803f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 21:42:02 +0200
Subject: [PATCH 37/79] Accelerated multiply_poly

---
 src/ao_one_e_ints/pot_ao_ints.irp.f          |  21 +-
 src/ao_two_e_ints/two_e_integrals.irp.f      | 136 +------
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f |  52 +--
 src/utils/integration.irp.f                  | 366 +++++++++++++------
 4 files changed, 305 insertions(+), 270 deletions(-)

diff --git a/src/ao_one_e_ints/pot_ao_ints.irp.f b/src/ao_one_e_ints/pot_ao_ints.irp.f
index 928053ad..446bf730 100644
--- a/src/ao_one_e_ints/pot_ao_ints.irp.f
+++ b/src/ao_one_e_ints/pot_ao_ints.irp.f
@@ -455,10 +455,12 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
     do ix=0,nx
       X(ix) *= dble(c)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
     ny=0
     call I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,Y,ny,n_pt_in)
-    call multiply_poly(Y,ny,R1x,2,d,nd)
+!    call multiply_poly(Y,ny,R1x,2,d,nd)
+    call multiply_poly_c2(Y,ny,R1x,d,nd)
   else
     do ix=0,n_pt_in
       X(ix) = 0.d0
@@ -469,7 +471,8 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
     do ix=0,nx
       X(ix) *= dble(a-1)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
 
     nx = nd
     do ix=0,n_pt_in
@@ -479,10 +482,12 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
     do ix=0,nx
       X(ix) *= dble(c)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
     ny=0
     call I_x1_pol_mult_one_e(a-1,c,R1x,R1xp,R2x,Y,ny,n_pt_in)
-    call multiply_poly(Y,ny,R1x,2,d,nd)
+!    call multiply_poly(Y,ny,R1x,2,d,nd)
+    call multiply_poly_c2(Y,ny,R1x,d,nd)
   endif
 end
 
@@ -519,7 +524,8 @@ recursive subroutine I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,d,nd,dim)
     do ix=0,nx
       X(ix) *= dble(c-1)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
     ny = 0
     do ix=0,dim
       Y(ix) = 0.d0
@@ -527,7 +533,8 @@ recursive subroutine I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,d,nd,dim)
 
     call I_x1_pol_mult_one_e(0,c-1,R1x,R1xp,R2x,Y,ny,dim)
     if(ny.ge.0)then
-      call multiply_poly(Y,ny,R1xp,2,d,nd)
+!      call multiply_poly(Y,ny,R1xp,2,d,nd)
+      call multiply_poly_c2(Y,ny,R1xp,d,nd)
     endif
   endif
 end
diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f
index 835dc89a..85ff5bcf 100644
--- a/src/ao_two_e_ints/two_e_integrals.irp.f
+++ b/src/ao_two_e_ints/two_e_integrals.irp.f
@@ -975,18 +975,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_10,2,d,nd)
-  if (nx >= 0) then
-    integer :: ib
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_10(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_10(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_10(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_10,d,nd)
 
   nx = nd
   !DIR$ LOOP COUNT(8)
@@ -1009,17 +998,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
     endif
 !    !DIR$ FORCEINLINE
 !    call multiply_poly(X,nx,B_00,2,d,nd)
-    if (nx >= 0) then
-       do ib=0,nx
-           d(ib  ) = d(ib  ) + B_00(0) * X(ib)
-           d(ib+1) = d(ib+1) + B_00(1) * X(ib)
-           d(ib+2) = d(ib+2) + B_00(2) * X(ib)
-       enddo
-
-       do nd = nx+2,0,-1
-         if (d(nd) /= 0.d0) exit
-       enddo
-    endif
+    call multiply_poly_c2(X,nx,B_00,d,nd)
   endif
 
   ny=0
@@ -1038,17 +1017,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(Y,ny,C_00,2,d,nd)
-   if (ny >= 0) then
-     do ib=0,ny
-         d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
-         d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
-         d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
-     enddo
-
-   do nd = ny+2,0,-1
-     if (d(nd) /= 0.d0) exit
-   enddo
-  endif
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
 
 recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
@@ -1088,18 +1057,7 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_00,2,d,nd)
-  if (nx >= 0) then
-    integer                        :: ib
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_00(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_00(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_00(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_00,d,nd)
 
   ny=0
 
@@ -1111,17 +1069,7 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(Y,ny,C_00,2,d,nd)
-  if (ny >= 0) then
-    do ib=0,ny
-      d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
-      d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
-      d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
-    enddo
-
-    do nd = ny+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 
 end
 
@@ -1150,18 +1098,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_10,2,d,nd)
-  if (nx >= 0) then
-    integer :: ib
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_10(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_10(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_10(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_10,d,nd)
 
   nx = nd
   !DIR$ LOOP COUNT(8)
@@ -1181,17 +1118,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_00,2,d,nd)
-  if (nx >= 0) then
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_00(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_00(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_00(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_00,d,nd)
 
   ny=0
   !DIR$ LOOP COUNT(8)
@@ -1203,17 +1130,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(Y,ny,C_00,2,d,nd)
-  if (ny >= 0) then
-    do ib=0,ny
-        d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
-        d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
-        d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
-    enddo
-
-    do nd = ny+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
 
 recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
@@ -1262,18 +1179,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
 
 !      !DIR$ FORCEINLINE
 !      call multiply_poly(Y,ny,D_00,2,d,nd)
-      if (ny >= 0) then
-        integer :: ib
-        do ib=0,ny
-            d(ib  ) = d(ib  ) + D_00(0) * Y(ib)
-            d(ib+1) = d(ib+1) + D_00(1) * Y(ib)
-            d(ib+2) = d(ib+2) + D_00(2) * Y(ib)
-        enddo
-
-        do nd = ny+2,0,-1
-          if (d(nd) /= 0.d0) exit
-        enddo
-      endif
+      call multiply_poly_c2(Y,ny,D_00,d,nd)
 
       return
 
@@ -1293,17 +1199,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
 
 !      !DIR$ FORCEINLINE
 !      call multiply_poly(X,nx,B_01,2,d,nd)
-      if (nx >= 0) then
-        do ib=0,nx
-          d(ib  ) = d(ib  ) + B_01(0) * X(ib)
-          d(ib+1) = d(ib+1) + B_01(1) * X(ib)
-          d(ib+2) = d(ib+2) + B_01(2) * X(ib)
-        enddo
-
-        do nd = nx+2,0,-1
-          if (d(nd) /= 0.d0) exit
-        enddo
-      endif
+      call multiply_poly_c2(X,nx,B_01,d,nd)
 
       ny = 0
       !DIR$ LOOP COUNT(6)
@@ -1314,17 +1210,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
 
 !      !DIR$ FORCEINLINE
 !      call multiply_poly(Y,ny,D_00,2,d,nd)
-      if (ny >= 0) then
-        do ib=0,ny
-            d(ib  ) = d(ib  ) + D_00(0) * Y(ib)
-            d(ib+1) = d(ib+1) + D_00(1) * Y(ib)
-            d(ib+2) = d(ib+2) + D_00(2) * Y(ib)
-        enddo
-
-        do nd = ny+2,0,-1
-          if (d(nd) /= 0.d0) exit
-        enddo
-      endif
+      call multiply_poly_c2(Y,ny,D_00,d,nd)
 
   end select
 end
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index e8b56307..a72cd682 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -4,7 +4,7 @@
 BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num, mo_num)]
 
  BEGIN_DOC
-! matrix element of the -L  three-body operator 
+! matrix element of the -L  three-body operator
 !
 ! notice the -1 sign: in this way three_body_ints_bi_ort can be directly used to compute Slater rules :)
  END_DOC
@@ -12,7 +12,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
  implicit none
  integer          :: i, j, k, l, m, n
  double precision :: integral, wall1, wall0
- character*(128)  :: name_file 
+ character*(128)  :: name_file
 
   three_body_ints_bi_ort = 0.d0
   print *, ' Providing the three_body_ints_bi_ort ...'
@@ -27,12 +27,12 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
 !   call read_array_6_index_tensor(mo_num,three_body_ints_bi_ort,name_file)
 !  else
 
-  !provide x_W_ki_bi_ortho_erf_rk 
+  !provide x_W_ki_bi_ortho_erf_rk
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
  !$OMP PARALLEL                       &
  !$OMP DEFAULT (NONE)                 &
- !$OMP PRIVATE (i,j,k,l,m,n,integral) & 
+ !$OMP PRIVATE (i,j,k,l,m,n,integral) &
  !$OMP SHARED (mo_num,three_body_ints_bi_ort)
  !$OMP DO SCHEDULE (dynamic)
   do i = 1, mo_num
@@ -43,7 +43,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
             do n = 1, mo_num
               call give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
 
-              three_body_ints_bi_ort(n,l,k,m,j,i) = -1.d0 * integral 
+              three_body_ints_bi_ort(n,l,k,m,j,i) = -1.d0 * integral
             enddo
           enddo
         enddo
@@ -63,7 +63,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
 !  call ezfio_set_three_body_ints_bi_ort_io_three_body_ints_bi_ort("Read")
 ! endif
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -71,7 +71,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
 
   BEGIN_DOC
   !
-  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS 
+  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS
   !
   END_DOC
 
@@ -79,28 +79,30 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
   integer,          intent(in)  :: n, l, k, m, j, i
   double precision, intent(out) :: integral
   integer                       :: ipoint
-  double precision              :: weight
+  double precision              :: weight, tmp
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_t
 
   integral = 0.d0
   do ipoint = 1, n_points_final_grid
-    weight = final_weight_at_r_vector(ipoint)                                                                          
 
-    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+    tmp =     mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
               * ( int2_grad1_u12_bimo_t(ipoint,1,n,m) * int2_grad1_u12_bimo_t(ipoint,1,l,j)    &
                 + int2_grad1_u12_bimo_t(ipoint,2,n,m) * int2_grad1_u12_bimo_t(ipoint,2,l,j)    &
                 + int2_grad1_u12_bimo_t(ipoint,3,n,m) * int2_grad1_u12_bimo_t(ipoint,3,l,j) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+
+    tmp = tmp + mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
               * ( int2_grad1_u12_bimo_t(ipoint,1,n,m) * int2_grad1_u12_bimo_t(ipoint,1,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,2,n,m) * int2_grad1_u12_bimo_t(ipoint,2,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,3,n,m) * int2_grad1_u12_bimo_t(ipoint,3,k,i) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,m) &
+
+    tmp = tmp + mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,m) &
               * ( int2_grad1_u12_bimo_t(ipoint,1,l,j) * int2_grad1_u12_bimo_t(ipoint,1,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,2,l,j) * int2_grad1_u12_bimo_t(ipoint,2,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,3,l,j) * int2_grad1_u12_bimo_t(ipoint,3,k,i) )
 
+    integral = integral + tmp * final_weight_at_r_vector(ipoint)
   enddo
 
 end subroutine give_integrals_3_body_bi_ort
@@ -111,7 +113,7 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 
   BEGIN_DOC
   !
-  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS 
+  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS
   !
   END_DOC
 
@@ -123,13 +125,13 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 
   integral = 0.d0
   do ipoint = 1, n_points_final_grid
-    weight = final_weight_at_r_vector(ipoint)                                                                          
+    weight = final_weight_at_r_vector(ipoint)
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
 !              * ( x_W_ki_bi_ortho_erf_rk(ipoint,1,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,1,l,j)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,2,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,2,l,j)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,3,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,3,l,j)  )
-!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
 !              * ( x_W_ki_bi_ortho_erf_rk(ipoint,1,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,1,k,i)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,2,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,2,k,i)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,3,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,3,k,i)  )
@@ -138,11 +140,11 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,2,l,j) * x_W_ki_bi_ortho_erf_rk(ipoint,2,k,i)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,3,l,j) * x_W_ki_bi_ortho_erf_rk(ipoint,3,k,i)  )
 
-!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
 !              * ( int2_grad1_u12_bimo(1,n,m,ipoint) * int2_grad1_u12_bimo(1,l,j,ipoint)        &
 !                + int2_grad1_u12_bimo(2,n,m,ipoint) * int2_grad1_u12_bimo(2,l,j,ipoint)        &
 !                + int2_grad1_u12_bimo(3,n,m,ipoint) * int2_grad1_u12_bimo(3,l,j,ipoint)        )
-!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
 !              * ( int2_grad1_u12_bimo(1,n,m,ipoint) * int2_grad1_u12_bimo(1,k,i,ipoint)        &
 !                + int2_grad1_u12_bimo(2,n,m,ipoint) * int2_grad1_u12_bimo(2,k,i,ipoint)        &
 !                + int2_grad1_u12_bimo(3,n,m,ipoint) * int2_grad1_u12_bimo(3,k,i,ipoint)        )
@@ -151,13 +153,13 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 !                + int2_grad1_u12_bimo(2,l,j,ipoint) * int2_grad1_u12_bimo(2,k,i,ipoint)        &
 !                + int2_grad1_u12_bimo(3,l,j,ipoint) * int2_grad1_u12_bimo(3,k,i,ipoint)        )
 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i)        & 
+    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i)        &
               * ( int2_grad1_u12_bimo_transp(n,m,1,ipoint) * int2_grad1_u12_bimo_transp(l,j,1,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,2,ipoint) * int2_grad1_u12_bimo_transp(l,j,2,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,3,ipoint) * int2_grad1_u12_bimo_transp(l,j,3,ipoint) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j)        & 
+    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j)        &
               * ( int2_grad1_u12_bimo_transp(n,m,1,ipoint) * int2_grad1_u12_bimo_transp(k,i,1,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,2,ipoint) * int2_grad1_u12_bimo_transp(k,i,2,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,3,ipoint) * int2_grad1_u12_bimo_transp(k,i,3,ipoint) )
@@ -176,7 +178,7 @@ subroutine give_integrals_3_body_bi_ort_ao(n, l, k, m, j, i, integral)
 
   BEGIN_DOC
   !
-  ! < n l k | -L | m j i > with a BI-ORTHONORMAL ATOMIC ORBITALS 
+  ! < n l k | -L | m j i > with a BI-ORTHONORMAL ATOMIC ORBITALS
   !
   END_DOC
 
@@ -188,13 +190,13 @@ subroutine give_integrals_3_body_bi_ort_ao(n, l, k, m, j, i, integral)
 
   integral = 0.d0
   do ipoint = 1, n_points_final_grid
-    weight = final_weight_at_r_vector(ipoint)                                                                          
+    weight = final_weight_at_r_vector(ipoint)
 
-    integral += weight * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,i) & 
+    integral += weight * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,i) &
               * ( int2_grad1_u12_ao_t(ipoint,1,n,m) * int2_grad1_u12_ao_t(ipoint,1,l,j)    &
                 + int2_grad1_u12_ao_t(ipoint,2,n,m) * int2_grad1_u12_ao_t(ipoint,2,l,j)    &
                 + int2_grad1_u12_ao_t(ipoint,3,n,m) * int2_grad1_u12_ao_t(ipoint,3,l,j) )
-    integral += weight * aos_in_r_array_transp(ipoint,l) * aos_in_r_array_transp(ipoint,j) & 
+    integral += weight * aos_in_r_array_transp(ipoint,l) * aos_in_r_array_transp(ipoint,j) &
               * ( int2_grad1_u12_ao_t(ipoint,1,n,m) * int2_grad1_u12_ao_t(ipoint,1,k,i)    &
                 + int2_grad1_u12_ao_t(ipoint,2,n,m) * int2_grad1_u12_ao_t(ipoint,2,k,i)    &
                 + int2_grad1_u12_ao_t(ipoint,3,n,m) * int2_grad1_u12_ao_t(ipoint,3,k,i) )
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index b60e3bc1..21179dac 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -56,7 +56,7 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
   !               * [ sum (l_y = 0,i_order(2)) P_new(l_y,2) * (y-P_center(2))^l_y ] exp (- p (y-P_center(2))^2 )
   !               * [ sum (l_z = 0,i_order(3)) P_new(l_z,3) * (z-P_center(3))^l_z ] exp (- p (z-P_center(3))^2 )
   !
-  ! WARNING ::: IF fact_k is too smal then: 
+  ! WARNING ::: IF fact_k is too smal then:
   ! returns a "s" function centered in zero
   ! with an inifinite exponent and a zero polynom coef
   END_DOC
@@ -86,7 +86,7 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
   !DIR$ FORCEINLINE
   call gaussian_product(alpha,A_center,beta,B_center,fact_k,p,P_center)
   if (fact_k < thresh) then
-    ! IF fact_k is too smal then: 
+    ! IF fact_k is too smal then:
     ! returns a "s" function centered in zero
     ! with an inifinite exponent and a zero polynom coef
     P_center = 0.d0
@@ -468,114 +468,6 @@ end subroutine
 
 
 
-subroutine multiply_poly_0c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:0), c(0:nc)
-  double precision, intent(inout) :: d(0:0+nc)
-
-  integer                        :: ic
-
-  do ic = 0,nc
-    d(ic) = d(ic) + c(ic) * b(0)
-  enddo
-
-  do nd = nc,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-subroutine multiply_poly_1c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:1), c(0:nc)
-  double precision, intent(inout) :: d(0:1+nc)
-
-  integer                        :: ic, id
-  if(nc < 0) return
-
-  do ic = 0,nc
-    d(  ic) = d(  ic) + c(ic) * b(0)
-    d(1+ic) = d(1+ic) + c(ic) * b(1)
-  enddo
-
-  do nd = nc+1,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-
-subroutine multiply_poly_2c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:2), c(0:nc)
-  double precision, intent(inout) :: d(0:2+nc)
-
-  integer                        :: ic, id, k
-  if (nc <0) return
-
-  do ic = 0,nc
-    d(  ic) = d(  ic) + c(ic) * b(0)
-    d(1+ic) = d(1+ic) + c(ic) * b(1)
-    d(2+ic) = d(2+ic) + c(ic) * b(2)
-  enddo
-
-  do nd = nc+2,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-subroutine multiply_poly_3c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:3), c(0:nc)
-  double precision, intent(inout) :: d(0:3+nc)
-
-  integer                        :: ic, id
-  if (nc <0) return
-
-  do ic = 1,nc
-    d(  ic) = d(1+ic) + c(ic) * b(0)
-    d(1+ic) = d(1+ic) + c(ic) * b(1)
-    d(2+ic) = d(1+ic) + c(ic) * b(2)
-    d(3+ic) = d(1+ic) + c(ic) * b(3)
-  enddo
-
-  do nd = nc+3,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-
-
 subroutine multiply_poly(b,nb,c,nc,d,nd)
   implicit none
   BEGIN_DOC
@@ -604,6 +496,254 @@ subroutine multiply_poly(b,nb,c,nc,d,nd)
 
 end
 
+
+subroutine multiply_poly_b0(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:0), c(0:nc)
+  double precision, intent(inout) :: d(0:nc)
+
+  integer                        :: ndtmp
+  integer                        :: ic, id, k
+  if(nc < 0) return !False if nc>=0
+
+  do ic = 0,nc
+    d(ic) = d(ic) + c(ic) * b(0)
+  enddo
+
+  do nd = nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+subroutine multiply_poly_b1(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:1), c(0:nc)
+  double precision, intent(inout) :: d(0:1+nc)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nc < 0) return !False if nc>=0
+
+
+  select case (nc)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(1) * b(1)
+
+    case default
+      d(0) = d(0) + c(0) * b(0)
+      do ic = 1,nc
+        d(ic) = d(ic) + c(ic) * b(0) + c(ic-1) * b(1)
+      enddo
+      d(nc+1) = d(nc+1) + c(nc) * b(1)
+
+  end select
+
+  do nd = 1+nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_b2(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:2), c(0:nc)
+  double precision, intent(inout) :: d(0:2+nc)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nc < 0) return !False if nc>=0
+
+  select case (nc)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1)
+      d(2) = d(2) + c(0) * b(2)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(0) * b(2) + c(1) * b(1)
+      d(3) = d(3) + c(1) * b(2)
+
+    case (2)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
+      d(3) = d(3) + c(2) * b(1) + c(1) * b(2)
+      d(4) = d(4) + c(2) * b(2)
+
+    case default
+
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      do ic = 2,nc
+        d(ic) = d(ic) + c(ic) * b(0) + c(ic-1) * b(1) + c(ic-2) * b(2)
+      enddo
+      d(nc+1) = d(nc+1) + c(nc) * b(1) + c(nc-1) * b(2)
+      d(nc+2) = d(nc+2) + c(nc) * b(2)
+
+  end select
+
+  do nd = 2+nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_c0(b,nb,c,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nb
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:0)
+  double precision, intent(inout) :: d(0:nb)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nb < 0) return !False if nb>=0
+
+  do ib=0,nb
+      d(ib) = d(ib) + c(0) * b(ib)
+  enddo
+
+  do nd = nb,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_c1(b,nb,c,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nb
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:1)
+  double precision, intent(inout) :: d(0:nb+1)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nb < 0) return !False if nb>=0
+
+  select case (nb)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(1) * b(0)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(1) * b(1)
+
+    case default
+      d(0) = d(0) + c(0) * b(0)
+      do ib=1,nb
+        d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1)
+      enddo
+      d(nb+1) = d(nb+1) + c(1) * b(nb)
+
+  end select
+
+  do nd = nb+1,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_c2(b,nb,c,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nb
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:2)
+  double precision, intent(inout) :: d(0:nb+2)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nb < 0) return !False if nb>=0
+
+  select case (nb)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(1) * b(0)
+      d(2) = d(2) + c(2) * b(0)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(1) * b(1) + c(2) * b(0)
+      d(3) = d(3) + c(2) * b(1)
+
+    case (2)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
+      d(3) = d(3) + c(1) * b(2) + c(2) * b(1)
+      d(4) = d(4) + c(2) * b(2)
+
+    case default
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      do ib=2,nb
+        d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1) + c(2) * b(ib-2)
+      enddo
+      d(nb+1) = d(nb+1) + c(1) * b(nb) + c(2) * b(nb-1)
+      d(nb+2) = d(nb+2) + c(2) * b(nb)
+
+  end select
+
+  do nd = nb+2,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+
+
 subroutine multiply_poly_v(b,nb,c,nc,d,nd,n_points)
   implicit none
   BEGIN_DOC
@@ -778,11 +918,11 @@ end subroutine recentered_poly2_v
 subroutine recentered_poly2_v0(P_new, lda, x_A, LD_xA, x_P, a, n_points)
 
   BEGIN_DOC
-  ! 
+  !
   ! Recenter two polynomials. Special case for b=(0,0,0)
-  ! 
+  !
   ! (x - A)^a (x - B)^0 = (x - P + P - A)^a  (x - Q + Q - B)^0
-  !                     = (x - P + P - A)^a 
+  !                     = (x - P + P - A)^a
   !
   END_DOC
 

From b9c18338960064f01bdd3e7bf6427ab3510575c9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 00:33:37 +0200
Subject: [PATCH 38/79] Optimized three_e_5_idx_exch12_bi_ort

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |  12 +-
 src/bi_ort_ints/three_body_ijmkl.irp.f | 162 ++++++++++++++++++++-----
 src/utils/integration.irp.f            |  24 ++++
 3 files changed, 166 insertions(+), 32 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index ca50dd56..63b2aa8c 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -16,23 +16,27 @@ subroutine test_3e
  double precision :: accu, contrib,new,ref
  i = 1
  k = 1
+ n = 0
  accu = 0.d0
  do i = 1, mo_num
   do k = 1, mo_num 
    do j = 1, mo_num
     do l = 1, mo_num 
      do m = 1, mo_num
-      do n = 1, mo_num
-        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
-        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
+      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
+!      do n = 1, mo_num
+!        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
+!        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
         contrib = dabs(new - ref)
         accu += contrib
         if(contrib .gt. 1.d-10)then
          print*,'pb !!'
          print*,i,k,j,l,m,n
          print*,ref,new,contrib
+         stop
         endif
-      enddo
+!      enddo
      enddo
     enddo
    enddo
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index ae4c9bd5..af2cb353 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -24,7 +24,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -33,7 +33,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
-            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -45,7 +45,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -73,7 +73,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -82,7 +82,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -94,7 +94,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -122,7 +122,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -131,7 +131,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
         do m = 1, mo_num
           do l = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -143,7 +143,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -171,7 +171,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -180,7 +180,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -192,7 +192,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -220,7 +220,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -229,7 +229,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
-            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -241,7 +241,57 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  three_e_5_idx_exch12_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
+            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
 
 ! ---
 
@@ -259,38 +309,94 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
 
   implicit none
   integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  three_e_5_idx_exch12_bi_ort = 0.d0
   print *, ' Providing the three_e_5_idx_exch12_bi_ort ...'
   call wall_time(wall0)
 
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+ do m = 1, mo_num
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
- !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         m2grad_r, m2grad_l, grad_mli, tmp_mat, orb_mat)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+         grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
+               int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
+               int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
+               int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+         m2grad_l(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         m2grad_l(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         m2grad_l(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+         m2grad_r(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         m2grad_r(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         m2grad_r(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
+      orb_mat, n_points_final_grid,  &
+      grad_mli, n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
   do i = 1, mo_num
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
-            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = -1.d0 * integral 
-          enddo
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,k,j) - tmp_mat(k,j,l,i)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      m2grad_l, 3*n_points_final_grid,  &
+      m2grad_r, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = &
+                three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  enddo
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index 21179dac..b548b18a 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -484,6 +484,30 @@ subroutine multiply_poly(b,nb,c,nc,d,nd)
   integer                        :: ib, ic, id, k
   if(ior(nc,nb) < 0) return !False if nc>=0 and nb>=0
 
+  select case (nb)
+    case (0)
+      call multiply_poly_b0(b,c,nc,d,nd)
+      return
+    case (1)
+      call multiply_poly_b1(b,c,nc,d,nd)
+      return
+    case (2)
+      call multiply_poly_b2(b,c,nc,d,nd)
+      return
+  end select
+
+  select case (nc)
+    case (0)
+      call multiply_poly_c0(b,nb,c,d,nd)
+      return
+    case (1)
+      call multiply_poly_c1(b,nb,c,d,nd)
+      return
+    case (2)
+      call multiply_poly_c2(b,nb,c,d,nd)
+      return
+  end select
+
   do ib=0,nb
     do ic = 0,nc
       d(ib+ic) = d(ib+ic) + c(ic) * b(ib)

From fb5300a8e59d4dc08c4ce118317f4deffef7daba Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 08:51:04 +0200
Subject: [PATCH 39/79] Preparing for optimization of 5idx in TC

---
 external/qp2-dependencies                  |   2 +-
 src/bi_ort_ints/three_body_ijmkl.irp.f     |  58 +---
 src/bi_ort_ints/three_body_ijmkl_old.irp.f | 295 +++++++++++++++++++++
 3 files changed, 303 insertions(+), 52 deletions(-)
 create mode 100644 src/bi_ort_ints/three_body_ijmkl_old.irp.f

diff --git a/external/qp2-dependencies b/external/qp2-dependencies
index 6e23ebac..e0d0e02e 160000
--- a/external/qp2-dependencies
+++ b/external/qp2-dependencies
@@ -1 +1 @@
-Subproject commit 6e23ebac001acae91d1c762ca934e09a9b7d614a
+Subproject commit e0d0e02e9f5ece138d1520106954a881ab0b8db2
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index af2cb353..5220d8c7 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -245,56 +245,6 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  three_e_5_idx_exch12_bi_ort_old = 0.d0
-  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
-            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
@@ -305,6 +255,12 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
+  ! Equivalent to:
+  !
+  !    call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
+  !
+  !    three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+  !
   END_DOC
 
   implicit none
@@ -314,10 +270,10 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   double precision :: weight
   double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
   allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
   allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
   allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
diff --git a/src/bi_ort_ints/three_body_ijmkl_old.irp.f b/src/bi_ort_ints/three_body_ijmkl_old.irp.f
new file mode 100644
index 00000000..105cd179
--- /dev/null
+++ b/src/bi_ort_ints/three_body_ijmkl_old.irp.f
@@ -0,0 +1,295 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_direct_bi_ort_old(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_5_idx_direct_bi_ort_old can be directly used to compute Slater rules with a + sign
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_direct_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_direct_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
+            three_e_5_idx_direct_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_direct_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_5_idx_cycle_1_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_cycle_1_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
+            three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE SECOND CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_5_idx_cycle_2_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_cycle_2_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          do l = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
+            three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE exch23 EXCHANGE TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_5_idx_exch23_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_exch23_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch23_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
+            three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch23_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE exch13 EXCHANGE TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_5_idx_exch13_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_exch13_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch13_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
+            three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch13_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE exch12 EXCHANGE TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_5_idx_exch12_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  three_e_5_idx_exch12_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
+            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+

From c4612318ae9cce73c3cf668703827eb9c7bfd093 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 09:11:32 +0200
Subject: [PATCH 40/79] Optimized direct 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |  62 ++-
 src/bi_ort_ints/three_body_ijmkl.irp.f | 500 ++++++++++++-------------
 2 files changed, 297 insertions(+), 265 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 63b2aa8c..d0367f6f 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -7,7 +7,8 @@ program bi_ort_ints
   my_n_pt_r_grid = 10
   my_n_pt_a_grid = 14
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call test_3e
+! call test_3e
+ call test_5idx
 end
 
 subroutine test_3e
@@ -19,15 +20,13 @@ subroutine test_3e
  n = 0
  accu = 0.d0
  do i = 1, mo_num
-  do k = 1, mo_num 
+  do k = 1, mo_num
    do j = 1, mo_num
-    do l = 1, mo_num 
+    do l = 1, mo_num
      do m = 1, mo_num
-      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
-!      do n = 1, mo_num
-!        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
-!        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
+      do n = 1, mo_num
+        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
+        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
         contrib = dabs(new - ref)
         accu += contrib
         if(contrib .gt. 1.d-10)then
@@ -36,7 +35,7 @@ subroutine test_3e
          print*,ref,new,contrib
          stop
         endif
-!      enddo
+      enddo
      enddo
     enddo
    enddo
@@ -46,3 +45,48 @@ subroutine test_3e
 
 
 end
+
+subroutine test_5idx
+ implicit none
+ integer :: i,k,j,l,m,n,ipoint
+ double precision :: accu, contrib,new,ref
+ i = 1
+ k = 1
+ n = 0
+ accu = 0.d0
+ do i = 1, mo_num
+  do k = 1, mo_num
+   do j = 1, mo_num
+    do l = 1, mo_num
+     do m = 1, mo_num
+      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'direct'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+
+!      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'exch12'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+     enddo
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/dble(mo_num)**5
+
+
+end
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 5220d8c7..1db773f1 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,7 +1,8 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -12,257 +13,6 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   END_DOC
 
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_direct_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
-            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_cycle_1_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_cycle_2_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          do l = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_exch23_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_exch13_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
-            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  ! Equivalent to:
-  !
-  !    call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
-  !
-  !    three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
-  !
-  END_DOC
-
   implicit none
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
@@ -279,7 +29,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  print *, ' Providing the three_e_5_idx_exch12_bi_ort ...'
+  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
   call wall_time(wall0)
 
  do m = 1, mo_num
@@ -322,6 +72,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
             three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,k,j) - tmp_mat(k,j,l,i)
         enddo
       enddo
@@ -339,8 +90,8 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = &
-                three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
         enddo
       enddo
     enddo
@@ -350,9 +101,246 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   enddo
 
   call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
 
 END_PROVIDER
 
 ! ---
 
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          do l = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
+            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
+            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+

From 00bd8e2fcc8d435a1484af065a443efee3ca3c9f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 10:34:05 +0200
Subject: [PATCH 41/79] Optimized cyclic 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f            |  65 ++++-
 src/bi_ort_ints/three_body_ijmkl.irp.f       | 288 +++++++------------
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f |   1 +
 3 files changed, 152 insertions(+), 202 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index d0367f6f..eae0affe 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -59,17 +59,18 @@ subroutine test_5idx
    do j = 1, mo_num
     do l = 1, mo_num
      do m = 1, mo_num
-      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'direct'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
 
+!      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'direct'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+!
 !      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
 !      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
 !      contrib = dabs(new - ref)
@@ -81,6 +82,50 @@ subroutine test_5idx
 !       stop
 !      endif
 
+!      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'cycle1'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+!      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'cycle2'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+!      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'exch23'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+      new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'exch13'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+
      enddo
     enddo
    enddo
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 1db773f1..9f316771 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,4 +1,3 @@
-
 ! ---
 
  BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
@@ -17,7 +16,6 @@
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
   integer          :: ipoint
-  double precision :: weight
   double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
   allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
@@ -45,17 +43,22 @@
   do i=1,mo_num
     do l=1,mo_num
        do ipoint=1, n_points_final_grid
+
          grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
                int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
                int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
                int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+
+         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+
          m2grad_l(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
          m2grad_l(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
          m2grad_l(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
          m2grad_r(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
          m2grad_r(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
          m2grad_r(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+
        enddo
     enddo
   enddo
@@ -107,240 +110,141 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  double precision :: integral
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  double precision :: integral
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          do l = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
   ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
   !
-  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
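+  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim>, three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj>, three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi>, three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO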
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
   END_DOC
 
   implicit none
-  double precision :: integral
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
   integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
+  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:)
+  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
+  print *, ' Providing the three_e_5_idx_cycle_bi_ort ...'
   call wall_time(wall0)
 
+ do m = 1, mo_num
+
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         rk_grad_im, rm_grad_ik, lk_grad_mi, lm_grad_ik, tmp_mat)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
+         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
+         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lk_grad_mi, 3*n_points_final_grid,  &
+      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
   do i = 1, mo_num
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -tmp_mat(k,j,l,i)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -tmp_mat(l,i,k,j)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = -tmp_mat(l,j,k,i)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = -tmp_mat(k,i,l,j)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END PARALLEL DO
 
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lk_grad_mi, 3*n_points_final_grid,  &
+      rk_grad_im, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
 
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  double precision :: integral
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
   do i = 1, mo_num
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
-            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,j,i,k)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,i,j,l)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,j,i,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,i,j,k)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lm_grad_ik, 3*n_points_final_grid,  &
+      rk_grad_im, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,i,j,k)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,j,i,l)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,i,j,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,j,i,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+  enddo
 
   call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_cycle_bi_ort', wall1 - wall0
 
 END_PROVIDER
 
 ! ---
 
 
+
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index a72cd682..1962c8d6 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -85,6 +85,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
   PROVIDE int2_grad1_u12_bimo_t
 
   integral = 0.d0
+  ! (n, l, k, m, j, i)
   do ipoint = 1, n_points_final_grid
 
     tmp =     mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &

From b2c005eccb7e05eaebb59e9dcbc3c0e771c9a87f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 11:08:30 +0200
Subject: [PATCH 42/79] Finished optimizing 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f      | 102 +++++++++----------
 src/bi_ort_ints/three_body_ijmkl.irp.f | 135 +++++++------------------
 2 files changed, 87 insertions(+), 150 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index eae0affe..5653a2e2 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -60,60 +60,60 @@ subroutine test_5idx
     do l = 1, mo_num
      do m = 1, mo_num
 
-!      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'direct'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'direct'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+
+      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'exch12'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 !
-!      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'exch12'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'cycle1'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 
-!      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'cycle1'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'cycle2'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 
-!      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'cycle2'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
-
-!      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'exch23'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'exch23'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 
       new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
       ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 9f316771..c9e88ab9 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,7 +1,11 @@
 ! ---
 
- BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -16,18 +20,22 @@
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
   integer          :: ipoint
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  double precision, allocatable :: grad_mli(:,:,:), orb_mat(:,:,:)
+  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
+  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:)
+  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
   allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
   allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
+  print *, ' Providing the three_e_5_idx_bi_ort ...'
   call wall_time(wall0)
 
  do m = 1, mo_num
@@ -38,7 +46,8 @@
  !$OMP SHARED (m,mo_num,n_points_final_grid, &
  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         m2grad_r, m2grad_l, grad_mli, tmp_mat, orb_mat)
+ !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi, &
+ !$OMP         grad_mli, tmp_mat, orb_mat)
  !$OMP DO COLLAPSE(2)
   do i=1,mo_num
     do l=1,mo_num
@@ -51,13 +60,21 @@
 
          orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
 
-         m2grad_l(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         m2grad_l(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         m2grad_l(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
 
-         m2grad_r(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         m2grad_r(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         m2grad_r(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
+         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
 
        enddo
     enddo
@@ -84,8 +101,8 @@
   !$OMP END PARALLEL DO
 
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      m2grad_l, 3*n_points_final_grid,  &
-      m2grad_r, 3*n_points_final_grid,  0.d0, &
+      lm_grad_ik, 3*n_points_final_grid,  &
+      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
       tmp_mat, mo_num*mo_num)
 
   !$OMP PARALLEL DO PRIVATE(i,j,k,l)
@@ -101,83 +118,6 @@
   enddo
   !$OMP END PARALLEL DO
 
-  enddo
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
- BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_direct_bi_ort(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
-  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:)
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_cycle_bi_ort ...'
-  call wall_time(wall0)
-
- do m = 1, mo_num
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rk_grad_im, rm_grad_ik, lk_grad_mi, lm_grad_ik, tmp_mat)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
-
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
-
-       enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
       lk_grad_mi, 3*n_points_final_grid,  &
       rm_grad_ik, 3*n_points_final_grid,  0.d0, &
@@ -237,14 +177,11 @@ END_PROVIDER
     enddo
   enddo
   !$OMP END PARALLEL DO
+
   enddo
 
   call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_bi_ort', wall1 - wall0
 
 END_PROVIDER
 
-! ---
-
-
-

From 896ac96e7e7339c710b6325972e878d94003b9e5 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 11:40:21 +0200
Subject: [PATCH 43/79] Reduced memory in 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |   5 +
 src/bi_ort_ints/three_body_ijmkl.irp.f | 139 ++++++++++++++++---------
 2 files changed, 96 insertions(+), 48 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 5653a2e2..f7a42f37 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -9,6 +9,11 @@ program bi_ort_ints
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 ! call test_3e
  call test_5idx
+!  call test_5idx2
+end
+
+subroutine test_5idx2
+  PROVIDE three_e_5_idx_cycle_2_bi_ort
 end
 
 subroutine test_3e
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index c9e88ab9..bd669163 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -24,12 +24,6 @@
   double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
   double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:)
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
@@ -40,14 +34,15 @@
 
  do m = 1, mo_num
 
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
  !$OMP PRIVATE (i,l,ipoint) &
  !$OMP SHARED (m,mo_num,n_points_final_grid, &
  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi, &
- !$OMP         grad_mli, tmp_mat, orb_mat)
+ !$OMP         grad_mli, orb_mat)
  !$OMP DO COLLAPSE(2)
   do i=1,mo_num
     do l=1,mo_num
@@ -60,22 +55,6 @@
 
          orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
 
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
-
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
-
        enddo
     enddo
   enddo
@@ -100,6 +79,41 @@
   enddo
   !$OMP END PARALLEL DO
 
+  deallocate(orb_mat,grad_mli)
+
+  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+
+         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
+         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
       lm_grad_ik, 3*n_points_final_grid,  &
       rm_grad_ik, 3*n_points_final_grid,  0.d0, &
@@ -118,6 +132,52 @@
   enddo
   !$OMP END PARALLEL DO
 
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lm_grad_ik, 3*n_points_final_grid,  &
+      rk_grad_im, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,j,k)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = - tmp_mat(k,j,i,l)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = - tmp_mat(k,i,j,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = - tmp_mat(l,j,i,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(lm_grad_ik)
+
+  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         lk_grad_mi)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+
+         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
       lk_grad_mi, 3*n_points_final_grid,  &
       rm_grad_ik, 3*n_points_final_grid,  0.d0, &
@@ -128,10 +188,10 @@
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -tmp_mat(k,j,l,i)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -tmp_mat(l,i,k,j)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = -tmp_mat(l,j,k,i)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = -tmp_mat(k,i,l,j)
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l,i)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k,i)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(k,i,l,j)
         enddo
       enddo
     enddo
@@ -158,26 +218,9 @@
   enddo
   !$OMP END PARALLEL DO
 
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,i,j,k)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,j,i,l)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,i,j,l)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,j,i,k)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
+  deallocate(lk_grad_mi)
+  deallocate(rm_grad_ik)
+  deallocate(rk_grad_im)
   enddo
 
   call wall_time(wall1)

From 81b7751b00f54a988e2df30fb92edc98f0e49474 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 16:10:04 +0200
Subject: [PATCH 44/79] Fix bug in number of args

---
 src/ccsd/ccsd_t_space_orb_stoch.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index 1f3bebc2..b669025e 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -198,7 +198,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   allocate (bounds(2,nbuckets))
   do isample=1,nbuckets
     eta = 1.d0/dble(nbuckets) * dble(isample)
-    ieta = binary_search(waccu,eta,Nabc,ileft,iright)
+    ieta = binary_search(waccu,eta,Nabc)
     bounds(1,isample) = ileft
     bounds(2,isample) = ieta
     ileft = ieta+1

From 6d01eb42ca24a4265710b20913d64c9fb3117298 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:16:39 +0200
Subject: [PATCH 45/79] print mem details

---
 src/bi_ort_ints/three_body_ijm.irp.f         | 7 +++++++
 src/bi_ort_ints/three_body_ijmk.irp.f        | 6 ++++++
 src/bi_ort_ints/three_body_ijmkl.irp.f       | 6 ++++++
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f | 1 +
 src/non_h_ints_mu/tc_integ.irp.f             | 2 ++
 src/non_h_ints_mu/total_tc_int.irp.f         | 4 +---
 6 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/bi_ort_ints/three_body_ijm.irp.f b/src/bi_ort_ints/three_body_ijm.irp.f
index 4d21cb93..b34638b8 100644
--- a/src/bi_ort_ints/three_body_ijm.irp.f
+++ b/src/bi_ort_ints/three_body_ijm.irp.f
@@ -49,6 +49,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_direct_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -102,6 +103,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_cycle_1_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_cycle_1_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -155,6 +157,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_cycle_2_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_cycle_2_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -208,6 +211,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch23_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch23_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -261,6 +265,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch13_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch13_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -306,6 +311,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch12_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch12_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -359,6 +365,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch12_bi_ort_new, (mo_num, mo_
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch12_bi_ort_new', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 5afd49ab..95b57e37 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -43,6 +43,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_direct_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -90,6 +91,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_cycle_1_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -137,6 +139,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_cycle_2_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -184,6 +187,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -230,6 +234,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_exch13_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -277,6 +282,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_exch12_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index ae4c9bd5..507408e5 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -44,6 +44,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -93,6 +94,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -142,6 +144,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -191,6 +194,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -240,6 +244,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -289,6 +294,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index 5a3730b3..f82e8725 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -57,6 +57,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
 
   call wall_time(wall1)
   print *, ' wall time for three_body_ints_bi_ort', wall1 - wall0
+  call print_memory_usage()
 ! if(write_three_body_ints_bi_ort)then
 !  print*,'Writing three_body_ints_bi_ort on disk ...'
 !  call write_array_6_index_tensor(mo_num,three_body_ints_bi_ort,name_file)
diff --git a/src/non_h_ints_mu/tc_integ.irp.f b/src/non_h_ints_mu/tc_integ.irp.f
index 8251fc71..b2c0df31 100644
--- a/src/non_h_ints_mu/tc_integ.irp.f
+++ b/src/non_h_ints_mu/tc_integ.irp.f
@@ -100,6 +100,8 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
       !$OMP END DO
       !$OMP END PARALLEL
 
+      FREE v_ij_erf_rk_cst_mu_j1b v_ij_u_cst_mu_j1b x_v_ij_erf_rk_cst_mu_j1b
+
     elseif(j1b_type .ge. 100) then
 
       PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
diff --git a/src/non_h_ints_mu/total_tc_int.irp.f b/src/non_h_ints_mu/total_tc_int.irp.f
index 2034872a..afa10305 100644
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@@ -84,9 +84,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist, (ao_num, ao_num, ao_num, ao
     enddo
   endif
 
-  FREE tc_grad_square_ao
-  FREE tc_grad_and_lapl_ao
-  FREE ao_two_e_coul
+  FREE tc_grad_square_ao tc_grad_and_lapl_ao ao_two_e_coul
 
   call wall_time(wall1)
   print *, ' wall time for ao_tc_int_chemist ', wall1 - wall0

From 107cc3f2fbfb13fdea4bad734c8bc5d11d9d8df8 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:19:25 +0200
Subject: [PATCH 46/79] fixed bug in TC-VAR

---
 src/tc_bi_ortho/tc_utils.irp.f | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/tc_bi_ortho/tc_utils.irp.f b/src/tc_bi_ortho/tc_utils.irp.f
index f8f648e8..737c393b 100644
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@@ -38,15 +38,16 @@ subroutine write_tc_var()
 
   implicit none
   integer          :: i, j, k
-  double precision :: hmono, htwoe, hthree, htot
+  double precision :: hmono, htwoe, hthree, htot_1j, htot_j1
   double precision :: SIGMA_TC
 
   do k = 1, n_states
 
     SIGMA_TC = 0.d0
     do j = 2, N_det
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
-      SIGMA_TC = SIGMA_TC + htot * htot
+      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
+      call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
+      SIGMA_TC = SIGMA_TC + htot_1j * htot_j1
     enddo
 
     print *, " state    : ", k

From 4cc8dae42010e062f82ace4373e2d5927e9074b0 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 20:32:31 +0200
Subject: [PATCH 47/79] Improve 5idx

---
 scripts/qp_import_trexio.py                |  15 ++
 scripts/utility/qp_bitmasks.py             |   6 +-
 src/ao_basis/cosgtos.irp.f                 |   3 +-
 src/bi_ort_ints/bi_ort_ints.irp.f          | 115 ++++++-------
 src/bi_ort_ints/three_body_ijmkl.irp.f     |  10 +-
 src/tc_bi_ortho/31.tc_bi_ortho.bats        |  34 ++--
 src/tc_bi_ortho/slater_tc_3e_slow.irp.f    |  89 +++++-----
 src/tc_bi_ortho/slater_tc_opt.irp.f        |   3 +-
 src/tc_bi_ortho/slater_tc_opt_double.irp.f | 180 +++++++++++----------
 src/tc_bi_ortho/symmetrized_3_e_int.irp.f  |   3 +-
 10 files changed, 247 insertions(+), 211 deletions(-)

diff --git a/scripts/qp_import_trexio.py b/scripts/qp_import_trexio.py
index 89096387..2c829f5c 100755
--- a/scripts/qp_import_trexio.py
+++ b/scripts/qp_import_trexio.py
@@ -17,6 +17,7 @@ import numpy as np
 from functools import reduce
 from ezfio import ezfio
 from docopt import docopt
+import qp_bitmasks
 
 try:
   import trexio
@@ -453,6 +454,20 @@ def write_ezfio(trexio_filename, filename):
     else:
         print("None")
 
+    print("Determinant\t\t...\t", end=' ')
+    alpha = [ i for i in range(num_alpha) ]
+    beta  = [ i for i in range(num_beta) ]
+    if trexio.has_mo_spin(trexio_file):
+       spin = trexio.read_mo_spin(trexio_file)
+       beta  = [ i for i in range(mo_num) if spin[i] == 1 ]
+       beta  = [ beta[i] for i in range(num_beta) ]
+
+    alpha = qp_bitmasks.BitMask(alpha)
+    beta  = qp_bitmasks.BitMask(beta )
+    print(alpha)
+    print(beta)
+    print("OK")
+
 
 
 
diff --git a/scripts/utility/qp_bitmasks.py b/scripts/utility/qp_bitmasks.py
index 38aa48d7..11965b72 100644
--- a/scripts/utility/qp_bitmasks.py
+++ b/scripts/utility/qp_bitmasks.py
@@ -22,7 +22,7 @@ def int_to_string(s):
         assert s>=0
     AssertionError
     """
-    assert type(s) in (int, long)
+    assert type(s) == int
     assert s>=0
     return '{s:0b}'.format(s=s)
 
@@ -62,7 +62,7 @@ def int_to_bitmask(s,bit_kind_size=BIT_KIND_SIZE):
     ['1111111111111111111111111111111111111111111111111111111111110110']
     >>>
     """
-    assert type(s) in (int, long)
+    assert type(s) == int
     if s < 0:
         s = s + (1 << bit_kind_size)
     return ['{s:0{width}b}'.format(s=s,width=bit_kind_size)]
@@ -104,7 +104,7 @@ class BitMask(object):
     return self._data_int[i]
 
   def __setitem__(self,i,value):
-    if type(value) in (int,long):
+    if type(value) == int :
         self._data_int[i] = value
     elif type(value) == str:
         s = string_to_bitmask(value,bit_kind_size=self.bit_kind_size)[0]
diff --git a/src/ao_basis/cosgtos.irp.f b/src/ao_basis/cosgtos.irp.f
index 721a3e57..dfa7d6b9 100644
--- a/src/ao_basis/cosgtos.irp.f
+++ b/src/ao_basis/cosgtos.irp.f
@@ -6,13 +6,14 @@ BEGIN_PROVIDER [ logical, use_cosgtos  ]
 
   logical                        :: has
   PROVIDE ezfio_filename
+  use_cosgtos = .False.
   if (mpi_master) then
     call ezfio_has_ao_basis_use_cosgtos(has)
     if (has) then
 !      write(6,'(A)') '.. >>>>> [ IO READ: use_cosgtos ] <<<<< ..'
       call ezfio_get_ao_basis_use_cosgtos(use_cosgtos)
     else
-      use_cosgtos = .False.
+      call ezfio_set_ao_basis_use_cosgtos(use_cosgtos)
     endif
   endif
   IRP_IF MPI_DEBUG
diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index f7a42f37..42bbe315 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -55,6 +55,7 @@ subroutine test_5idx
  implicit none
  integer :: i,k,j,l,m,n,ipoint
  double precision :: accu, contrib,new,ref
+ double precision, external :: three_e_5_idx_exch12_bi_ort
  i = 1
  k = 1
  n = 0
@@ -64,18 +65,21 @@ subroutine test_5idx
    do j = 1, mo_num
     do l = 1, mo_num
      do m = 1, mo_num
+!      if (dabs(three_e_5_idx_direct_bi_ort(m,l,j,k,i) - three_e_5_idx_exch12_bi_ort(m,l,i,k,j)) > 1.d-10) then 
+!         stop
+!      endif
 
-      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'direct'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
-
+!      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'direct'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+!
       new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
       ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
       contrib = dabs(new - ref)
@@ -86,51 +90,52 @@ subroutine test_5idx
        print*,ref,new,contrib
        stop
       endif
+
+!
+!      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'cycle1'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+!
+!      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'cycle2'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+!
+!      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'exch23'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+!
+!      new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'exch13'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
 !
-      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'cycle1'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
-
-      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'cycle2'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
-
-      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'exch23'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
-
-      new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'exch13'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
-
      enddo
     enddo
    enddo
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index bd669163..7b39235b 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,7 +1,11 @@
 ! ---
+double precision function three_e_5_idx_exch12_bi_ort(m,l,i,k,j) result(integral)
+  implicit none  ! returns one element of three_e_5_idx_direct_bi_ort, nothing is computed here
+  integer, intent(in) :: m,l,j,k,i  ! NOTE: the header lists the dummies as (m,l,i,k,j)
+  integral = three_e_5_idx_direct_bi_ort(m,l,j,k,i)  ! 3rd and 5th actual arguments end up swapped
+end
 
  BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
@@ -14,6 +18,8 @@
   ! three_e_5_idx_direct_bi_ort(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+
   END_DOC
 
   implicit none
@@ -72,7 +78,6 @@
       do j = 1, mo_num
         do l = 1, mo_num
             three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
-            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,k,j) - tmp_mat(k,j,l,i)
         enddo
       enddo
     enddo
@@ -125,7 +130,6 @@
       do j = 1, mo_num
         do l = 1, mo_num
             three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
-            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
         enddo
       enddo
     enddo
diff --git a/src/tc_bi_ortho/31.tc_bi_ortho.bats b/src/tc_bi_ortho/31.tc_bi_ortho.bats
index f5b9d8c0..93bed2ab 100644
--- a/src/tc_bi_ortho/31.tc_bi_ortho.bats
+++ b/src/tc_bi_ortho/31.tc_bi_ortho.bats
@@ -4,46 +4,50 @@ source $QP_ROOT/tests/bats/common.bats.sh
 source $QP_ROOT/quantum_package.rc
 
 
+function get_e() {  # print what follows '=' on the eigval_right_tc_bi_orth lines of file $1
+  grep "eigval_right_tc_bi_orth" "$1" | cut -d '=' -f 2 | xargs  # xargs strips the surrounding blanks
+}
+
 function run_Ne() {
-  qp set_file Ne_tc_scf 
-  qp run cisd 
-  qp run tc_bi_ortho | tee Ne_tc_scf.cisd_tc_bi_ortho.out  
+  qp set_file Ne_tc_scf
+  qp run cisd
+  qp run tc_bi_ortho | tee Ne_tc_scf.cisd_tc_bi_ortho.out
   eref=-128.77020441279302
-  energy="$(grep "eigval_right_tc_bi_orth =" Ne_tc_scf.cisd_tc_bi_ortho.out)"
+  energy=$(get_e Ne_tc_scf.cisd_tc_bi_ortho.out)
   eq $energy $eref 1e-6
 }
 
 
 @test "Ne" {
- run_Ne 
+ run_Ne
 }
 
 
 function run_C() {
-  qp set_file C_tc_scf 
-  qp run cisd 
-  qp run tc_bi_ortho | tee C_tc_scf.cisd_tc_bi_ortho.out  
+  qp set_file C_tc_scf
+  qp run cisd
+  qp run tc_bi_ortho | tee C_tc_scf.cisd_tc_bi_ortho.out
   eref=-37.757536149952514
-  energy="$(grep "eigval_right_tc_bi_orth =" C_tc_scf.cisd_tc_bi_ortho.out)"
+  energy=$(get_e C_tc_scf.cisd_tc_bi_ortho.out)
   eq $energy $eref 1e-6
 }
 
 
 @test "C" {
- run_C 
+ run_C
 }
 
 function run_O() {
-  qp set_file C_tc_scf 
-  qp run cisd 
-  qp run tc_bi_ortho | tee O_tc_scf.cisd_tc_bi_ortho.out  
+  qp set_file O_tc_scf  # oxygen EZFIO: must match the O_tc_scf output name and the oxygen eref used below
+  qp run cisd
+  qp run tc_bi_ortho | tee O_tc_scf.cisd_tc_bi_ortho.out
   eref=-74.908518517716161
-  energy="$(grep "eigval_right_tc_bi_orth =" O_tc_scf.cisd_tc_bi_ortho.out)"
+  energy=$(get_e O_tc_scf.cisd_tc_bi_ortho.out)
   eq $energy $eref 1e-6
 }
 
 
 @test "O" {
- run_O 
+ run_O
 }
 
diff --git a/src/tc_bi_ortho/slater_tc_3e_slow.irp.f b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
index 6abb6b78..49977f37 100644
--- a/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
+++ b/src/tc_bi_ortho/slater_tc_3e_slow.irp.f
@@ -32,28 +32,28 @@ subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
   if(Ne(1)+Ne(2).ge.3)then
 !!  ! alpha/alpha/beta three-body
    do i = 1, Ne(1)
-    ii = occ(i,1) 
+    ii = occ(i,1)
     do j = i+1, Ne(1)
-     jj = occ(j,1) 
+     jj = occ(j,1)
      do m = 1, Ne(2)
-      mm = occ(m,2) 
-!      direct_int = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) USES THE 6-IDX TENSOR 
-!      exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) USES THE 6-IDX TENSOR 
-      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii) ! USES 3-IDX TENSOR 
-      exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii) ! USES 3-IDX TENSOR 
+      mm = occ(m,2)
+!      direct_int = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) USES THE 6-IDX TENSOR
+!      exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) USES THE 6-IDX TENSOR
+      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii) ! USES 3-IDX TENSOR
+      exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii) ! USES 3-IDX TENSOR
       hthree += direct_int - exchange_int
      enddo
     enddo
    enddo
-  
+
    ! beta/beta/alpha three-body
    do i = 1, Ne(2)
-    ii = occ(i,2) 
+    ii = occ(i,2)
     do j = i+1, Ne(2)
-     jj = occ(j,2) 
+     jj = occ(j,2)
      do m = 1, Ne(1)
-      mm = occ(m,1) 
-      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii) 
+      mm = occ(m,1)
+      direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii)
       exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)
       hthree += direct_int - exchange_int
      enddo
@@ -64,10 +64,10 @@ subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
    do i = 1, Ne(1)
     ii = occ(i,1) ! 1
     do j = i+1, Ne(1)
-     jj = occ(j,1) ! 2 
+     jj = occ(j,1) ! 2
      do m = j+1, Ne(1)
-      mm = occ(m,1) ! 3 
-!      ref =  sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) USES THE 6 IDX TENSOR 
+      mm = occ(m,1) ! 3
+!      ref =  sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) USES THE 6 IDX TENSOR
       hthree += three_e_diag_parrallel_spin(mm,jj,ii) ! USES ONLY 3-IDX TENSORS
      enddo
     enddo
@@ -80,7 +80,7 @@ subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
      jj = occ(j,2) ! 2
      do m = j+1, Ne(2)
       mm = occ(m,2) ! 3
-!      ref =  sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) USES THE 6 IDX TENSOR 
+!      ref =  sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) USES THE 6 IDX TENSOR
       hthree += three_e_diag_parrallel_spin(mm,jj,ii) ! USES ONLY 3-IDX TENSORS
      enddo
     enddo
@@ -96,7 +96,7 @@ subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
   ! <key_j | H_tilde | key_i> for single excitation ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
   !!
   !! WARNING !!
-  ! 
+  !
   ! Non hermitian !!
   END_DOC
 
@@ -110,7 +110,7 @@ subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
   integer                       :: Ne(2),i,j,ii,jj,ispin,jspin,k,kk
   integer                       :: degree,exc(0:2,2,2)
   integer                       :: h1, p1, h2, p2, s1, s2
-  double precision              :: direct_int,phase,exchange_int,three_e_single_parrallel_spin 
+  double precision              :: direct_int,phase,exchange_int,three_e_single_parrallel_spin
   double precision              :: sym_3_e_int_from_6_idx_tensor
   integer                       :: other_spin(2)
   integer(bit_kind)             :: key_j_core(Nint,2),key_i_core(Nint,2)
@@ -142,26 +142,26 @@ subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
    ! alpha/alpha/beta three-body
 !   print*,'IN SLAT RULES'
    if(Ne(1)+Ne(2).ge.3)then
-     ! hole of spin s1 :: contribution from purely other spin 
+     ! hole of spin s1 :: contribution from purely other spin
      ispin = other_spin(s1) ! ispin is the other spin than s1
-     do i = 1, Ne(ispin)  ! i is the orbitals of the other spin than s1  
-      ii = occ(i,ispin)  
-      do j = i+1, Ne(ispin) ! j has the same spin than s1 
-       jj = occ(j,ispin) 
+     do i = 1, Ne(ispin)  ! i is the orbitals of the other spin than s1
+      ii = occ(i,ispin)
+      do j = i+1, Ne(ispin) ! j has the same spin than s1
+       jj = occ(j,ispin)
        !   is == ispin  in :::   s1 is is  s1 is is      s1 is is s1 is is
        !                       < h1 j  i | p1 j  i > - < h1 j  i | p1 i j >
-       !                                                   
-       direct_int   = three_e_4_idx_direct_bi_ort(jj,ii,p1,h1)  
-       exchange_int = three_e_4_idx_exch23_bi_ort(jj,ii,p1,h1) 
+       !
+       direct_int   = three_e_4_idx_direct_bi_ort(jj,ii,p1,h1)
+       exchange_int = three_e_4_idx_exch23_bi_ort(jj,ii,p1,h1)
        hthree += direct_int - exchange_int
       enddo
      enddo
-  
+
      ! hole of spin s1 :: contribution from mixed other spin / same spin
-     do i = 1, Ne(ispin) ! other spin 
-      ii = occ(i,ispin)  ! other spin 
-      do j = 1, Ne(s1)   ! same spin 
-       jj = occ(j,s1)    ! same spin 
+     do i = 1, Ne(ispin) ! other spin
+      ii = occ(i,ispin)  ! other spin
+      do j = 1, Ne(s1)   ! same spin
+       jj = occ(j,s1)    ! same spin
        direct_int   = three_e_4_idx_direct_bi_ort(jj,ii,p1,h1)
        exchange_int = three_e_4_idx_exch13_bi_ort(jj,ii,p1,h1)
        !              < h1 j  i | p1 j i > - < h1 j i | j p1 i >
@@ -174,8 +174,8 @@ subroutine single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
       ii = occ(i,s1)
       do j = i+1, Ne(s1)
        jj = occ(j,s1)
-!       ref = sym_3_e_int_from_6_idx_tensor(jj,ii,p1,jj,ii,h1) 
-       hthree += three_e_single_parrallel_spin(jj,ii,p1,h1) ! USES THE 4-IDX TENSOR 
+!       ref = sym_3_e_int_from_6_idx_tensor(jj,ii,p1,jj,ii,h1)
+       hthree += three_e_single_parrallel_spin(jj,ii,p1,h1) ! USES THE 4-IDX TENSOR
       enddo
      enddo
    endif
@@ -191,7 +191,7 @@ subroutine double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
   ! <key_j | H_tilde | key_i> for double excitation ONLY FOR THREE-BODY TERMS  WITH BI ORTHONORMAL ORBITALS
   !!
   !! WARNING !!
-  ! 
+  !
   ! Non hermitian !!
   END_DOC
 
@@ -235,29 +235,30 @@ subroutine double_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
   call get_double_excitation(key_i, key_j, exc, phase, Nint)
   call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
 
-    
+
     if(Ne(1)+Ne(2).ge.3)then
-     if(s1==s2)then ! same spin excitation 
+     if(s1==s2)then ! same spin excitation
       ispin = other_spin(s1)
       do m = 1, Ne(ispin) ! direct(other_spin) - exchange(s1)
        mm = occ(m,ispin)
-       direct_int = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1) 
-       exchange_int = three_e_5_idx_exch12_bi_ort(mm,p2,h2,p1,h1)
+       direct_int = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
+!       exchange_int = three_e_5_idx_exch12_bi_ort(mm,p2,h2,p1,h1)
+       exchange_int = three_e_5_idx_direct_bi_ort(mm,p2,h1,p1,h2)
        hthree += direct_int - exchange_int
       enddo
-      do m = 1, Ne(s1) ! pure contribution from s1 
+      do m = 1, Ne(s1) ! pure contribution from s1
        mm = occ(m,s1)
        hthree += three_e_double_parrallel_spin(mm,p2,h2,p1,h1)
-      enddo 
-     else ! different spin excitation 
+      enddo
+     else ! different spin excitation
        do m = 1, Ne(s1)
-        mm = occ(m,s1) ! 
-        direct_int = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1) 
+        mm = occ(m,s1) !
+        direct_int = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
         exchange_int = three_e_5_idx_exch13_bi_ort(mm,p2,h2,p1,h1)
         hthree += direct_int - exchange_int
        enddo
        do m = 1, Ne(s2)
-        mm = occ(m,s2) ! 
+        mm = occ(m,s2) !
         direct_int = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
         exchange_int = three_e_5_idx_exch23_bi_ort(mm,p2,h2,p1,h1)
         hthree += direct_int - exchange_int
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index 3fd2576a..882470ed 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -13,8 +13,7 @@ subroutine provide_all_three_ints_bi_ortho
    PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
   endif
   if(.not.double_normal_ord.and.three_e_5_idx_term)then
-   PROVIDE three_e_5_idx_direct_bi_ort three_e_5_idx_cycle_1_bi_ort three_e_5_idx_cycle_2_bi_ort
-   PROVIDE three_e_5_idx_exch23_bi_ort three_e_5_idx_exch13_bi_ort three_e_5_idx_exch12_bi_ort
+   PROVIDE three_e_5_idx_direct_bi_ort 
   elseif (double_normal_ord .and. (.not. three_e_5_idx_term))then
    PROVIDE normal_two_body_bi_orth
   endif
diff --git a/src/tc_bi_ortho/slater_tc_opt_double.irp.f b/src/tc_bi_ortho/slater_tc_opt_double.irp.f
index 2d6bfb27..12bbbec0 100644
--- a/src/tc_bi_ortho/slater_tc_opt_double.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_double.irp.f
@@ -2,17 +2,17 @@
 subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe, hthree, htot)
 
   BEGIN_DOC
-  ! <key_j | H_tilde | key_i> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS 
+  ! <key_j | H_tilde | key_i> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS
   !!
   !! WARNING !!
-  ! 
+  !
   ! Non hermitian !!
   END_DOC
 
   use bitmasks
 
   implicit none
-  integer,           intent(in) :: Nint 
+  integer,           intent(in) :: Nint
   integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2)
   double precision, intent(out) :: hmono, htwoe, hthree, htot
   integer                       :: occ(Nint*bit_kind_size,2)
@@ -39,8 +39,8 @@ subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe,
   call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
 
   if(s1.ne.s2)then
-   ! opposite spin two-body 
-    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) 
+   ! opposite spin two-body
+    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
     if(three_body_h_tc.and.elec_num.gt.2)then
      if(.not.double_normal_ord.and.three_e_5_idx_term)then
       if(degree_i>degree_j)then
@@ -53,11 +53,11 @@ subroutine double_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe,
      endif
     endif
   else
-   ! same spin two-body 
-   ! direct terms 
-   htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)  
-   ! exchange terms 
-   htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) 
+   ! same spin two-body
+   ! direct terms
+   htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
+   ! exchange terms
+   htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1)
    if(three_body_h_tc.and.elec_num.gt.2)then
     if(.not.double_normal_ord.and.three_e_5_idx_term)then
      if(degree_i>degree_j)then
@@ -112,72 +112,76 @@ subroutine three_comp_two_e_elem(key_i,h1,h2,p1,p2,s1,s2,hthree)
   !DIR$ FORCEINLINE
   call bitstring_to_list_ab(particle, occ_particle, tmp, N_int)
   ASSERT (tmp(1) == nexc(1)) ! Number of particles alpha
-  ASSERT (tmp(2) == nexc(2)) ! Number of particle beta 
+  ASSERT (tmp(2) == nexc(2)) ! Number of particle beta
   !DIR$ FORCEINLINE
   call bitstring_to_list_ab(hole, occ_hole, tmp, N_int)
   ASSERT (tmp(1) == nexc(1)) ! Number of holes alpha
-  ASSERT (tmp(2) == nexc(2)) ! Number of holes beta 
+  ASSERT (tmp(2) == nexc(2)) ! Number of holes beta
   if(s1==s2.and.s1==1)then
    !!!!!!!!!!!!!!!!!!!!!!!!!! alpha/alpha double exc
-   hthree = eff_2_e_from_3_e_aa(p2,p1,h2,h1) 
-   if(nexc(1)+nexc(2) ==0)return !! if you're on the reference determinant 
-    !!!!!!!! the matrix element is already exact 
-    !!!!!!!! else you need to take care of holes and particles 
+   hthree = eff_2_e_from_3_e_aa(p2,p1,h2,h1)
+   if(nexc(1)+nexc(2) ==0)return !! if you're on the reference determinant
+    !!!!!!!! the matrix element is already exact
+    !!!!!!!! else you need to take care of holes and particles
     !!!!!!!!!!!!! Holes and particles !!!!!!!!!!!!!!!!!!!!!!!
     ispin = 1 ! i==alpha ==> pure same spin terms
-    do i = 1, nexc(ispin) ! number of couple of holes/particles 
+    do i = 1, nexc(ispin) ! number of couple of holes/particles
      ipart=occ_particle(i,ispin)
      hthree += three_e_double_parrallel_spin_prov(ipart,p2,h2,p1,h1)
      ihole=occ_hole(i,ispin)
      hthree -= three_e_double_parrallel_spin_prov(ihole,p2,h2,p1,h1)
     enddo
     ispin = 2 ! i==beta ==> alpha/alpha/beta terms
-    do i = 1, nexc(ispin) ! number of couple of holes/particles 
+    do i = 1, nexc(ispin) ! number of couple of holes/particles
      ! exchange between (h1,p1) and (h2,p2)
      ipart=occ_particle(i,ispin)
      direct_int  = three_e_5_idx_direct_bi_ort(ipart,p2,h2,p1,h1)
-     exchange_int = three_e_5_idx_exch12_bi_ort(ipart,p2,h2,p1,h1)
+!     exchange_int = three_e_5_idx_exch12_bi_ort(ipart,p2,h2,p1,h1)
+     exchange_int = three_e_5_idx_direct_bi_ort(ipart,p2,h1,p1,h2)
      hthree += direct_int - exchange_int
      ihole=occ_hole(i,ispin)
      direct_int  = three_e_5_idx_direct_bi_ort(ihole,p2,h2,p1,h1)
-     exchange_int = three_e_5_idx_exch12_bi_ort(ihole,p2,h2,p1,h1)
+!     exchange_int = three_e_5_idx_exch12_bi_ort(ihole,p2,h2,p1,h1)
+     exchange_int = three_e_5_idx_direct_bi_ort(ihole,p2,h1,p1,h2)
      hthree -= direct_int - exchange_int
     enddo
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  elseif(s1==s2.and.s1==2)then 
+  elseif(s1==s2.and.s1==2)then
    !!!!!!!!!!!!!!!!!!!!!!!!!! beta/beta double exc
    hthree = eff_2_e_from_3_e_bb(p2,p1,h2,h1)
-   if(nexc(1)+nexc(2) ==0)return !! if you're on the reference determinant 
-   !!!!!!!! the matrix element is already exact 
-   !!!!!!!! else you need to take care of holes and particles 
+   if(nexc(1)+nexc(2) ==0)return !! if you're on the reference determinant
+   !!!!!!!! the matrix element is already exact
+   !!!!!!!! else you need to take care of holes and particles
    !!!!!!!!!!!!! Holes and particles !!!!!!!!!!!!!!!!!!!!!!!
    ispin = 2 ! i==beta  ==> pure same spin terms
-   do i = 1, nexc(ispin) ! number of couple of holes/particles 
+   do i = 1, nexc(ispin) ! number of couple of holes/particles
     ipart=occ_particle(i,ispin)
     hthree += three_e_double_parrallel_spin_prov(ipart,p2,h2,p1,h1)
     ihole=occ_hole(i,ispin)
     hthree -= three_e_double_parrallel_spin_prov(ihole,p2,h2,p1,h1)
    enddo
    ispin = 1 ! i==alpha==> beta/beta/alpha terms
-   do i = 1, nexc(ispin) ! number of couple of holes/particles 
+   do i = 1, nexc(ispin) ! number of couple of holes/particles
     ! exchange between (h1,p1) and (h2,p2)
     ipart=occ_particle(i,ispin)
     direct_int  = three_e_5_idx_direct_bi_ort(ipart,p2,h2,p1,h1)
-    exchange_int = three_e_5_idx_exch12_bi_ort(ipart,p2,h2,p1,h1)
+!    exchange_int = three_e_5_idx_exch12_bi_ort(ipart,p2,h2,p1,h1)
+    exchange_int = three_e_5_idx_direct_bi_ort(ipart,p2,h1,p1,h2)
     hthree += direct_int - exchange_int
     ihole=occ_hole(i,ispin)
     direct_int  = three_e_5_idx_direct_bi_ort(ihole,p2,h2,p1,h1)
-    exchange_int = three_e_5_idx_exch12_bi_ort(ihole,p2,h2,p1,h1)
+!    exchange_int = three_e_5_idx_exch12_bi_ort(ihole,p2,h2,p1,h1)
+    exchange_int = three_e_5_idx_direct_bi_ort(ihole,p2,h1,p1,h2)
     hthree -= direct_int - exchange_int
    enddo
-  else                         ! (h1,p1) == alpha/(h2,p2) == beta 
+  else                         ! (h1,p1) == alpha/(h2,p2) == beta
    hthree = eff_2_e_from_3_e_ab(p2,p1,h2,h1)
-   if(nexc(1)+nexc(2) ==0)return !! if you're on the reference determinant 
-   !!!!!!!! the matrix element is already exact 
-   !!!!!!!! else you need to take care of holes and particles 
+   if(nexc(1)+nexc(2) ==0)return !! if you're on the reference determinant
+   !!!!!!!! the matrix element is already exact
+   !!!!!!!! else you need to take care of holes and particles
    !!!!!!!!!!!!! Holes and particles !!!!!!!!!!!!!!!!!!!!!!!
-   ispin = 1 ! i==alpha ==> alpha/beta/alpha terms 
-   do i = 1, nexc(ispin) ! number of couple of holes/particles 
+   ispin = 1 ! i==alpha ==> alpha/beta/alpha terms
+   do i = 1, nexc(ispin) ! number of couple of holes/particles
     ! exchange between (h1,p1) and i
     ipart=occ_particle(i,ispin)
     direct_int  = three_e_5_idx_direct_bi_ort(ipart,p2,h2,p1,h1)
@@ -188,8 +192,8 @@ subroutine three_comp_two_e_elem(key_i,h1,h2,p1,p2,s1,s2,hthree)
     exchange_int = three_e_5_idx_exch13_bi_ort(ihole,p2,h2,p1,h1)
     hthree -= direct_int - exchange_int
    enddo
-   ispin = 2 ! i==beta  ==> alpha/beta/beta  terms 
-   do i = 1, nexc(ispin) ! number of couple of holes/particles 
+   ispin = 2 ! i==beta  ==> alpha/beta/beta  terms
+   do i = 1, nexc(ispin) ! number of couple of holes/particles
     ! exchange between (h2,p2) and i
     ipart=occ_particle(i,ispin)
     direct_int  = three_e_5_idx_direct_bi_ort(ipart,p2,h2,p1,h1)
@@ -207,7 +211,7 @@ end
 BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_ab, (mo_num, mo_num, mo_num, mo_num)]
  implicit none
  BEGIN_DOC
-! eff_2_e_from_3_e_ab(p2,p1,h2,h1) = Effective Two-electron operator for alpha/beta double excitations 
+! eff_2_e_from_3_e_ab(p2,p1,h2,h1) = Effective Two-electron operator for alpha/beta double excitations
 !
 ! from contraction with HF density = a^{dagger}_p1_alpha a^{dagger}_p2_beta a_h2_beta a_h1_alpha
  END_DOC
@@ -222,16 +226,16 @@ BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_ab, (mo_num, mo_num, mo_num,
  eff_2_e_from_3_e_ab = 0.d0
  !$OMP PARALLEL                                                                         &
  !$OMP DEFAULT (NONE)                                                                   &
- !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, contrib) & 
+ !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, contrib) &
  !$OMP SHARED (n_act_orb, list_act, Ne,occ, eff_2_e_from_3_e_ab)
- !$OMP DO SCHEDULE (static) 
-  do hh1 = 1, n_act_orb !! alpha 
-    h1 = list_act(hh1) 
-    do hh2 = 1, n_act_orb !! beta 
-      h2 = list_act(hh2) 
+ !$OMP DO SCHEDULE (static)
+  do hh1 = 1, n_act_orb !! alpha
+    h1 = list_act(hh1)
+    do hh2 = 1, n_act_orb !! beta
+      h2 = list_act(hh2)
       do pp1 = 1, n_act_orb !! alpha
         p1 = list_act(pp1)
-        do pp2 = 1, n_act_orb !! beta 
+        do pp2 = 1, n_act_orb !! beta
           p2 = list_act(pp2)
           call give_contrib_for_abab(h1,h2,p1,p2,occ,Ne,contrib)
           eff_2_e_from_3_e_ab(p2,p1,h2,h1) = contrib
@@ -242,25 +246,25 @@ BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_ab, (mo_num, mo_num, mo_num,
  !$OMP END DO
  !$OMP END PARALLEL
 
-END_PROVIDER 
+END_PROVIDER
 
 subroutine give_contrib_for_abab(h1,h2,p1,p2,occ,Ne,contrib)
  implicit none
- BEGIN_DOC 
+ BEGIN_DOC
 ! gives the contribution for a double excitation (h1,p1)_alpha (h2,p2)_beta
 !
 ! on top of a determinant whose occupied orbitals is in (occ, Ne)
  END_DOC
  integer, intent(in) :: h1,h2,p1,p2,occ(N_int*bit_kind_size,2),Ne(2)
  double precision, intent(out) :: contrib
- integer :: mm,m 
+ integer :: mm,m
  double precision :: direct_int, exchange_int
- !! h1,p1 == alpha 
+ !! h1,p1 == alpha
  !! h2,p2 == beta
  contrib = 0.d0
- do mm = 1, Ne(1) !! alpha 
+ do mm = 1, Ne(1) !! alpha
    m = occ(mm,1)
-   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1) 
+   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
    ! exchange between (h1,p1) and m
    exchange_int = three_e_5_idx_exch13_bi_ort(mm,p2,h2,p1,h1)
    contrib += direct_int - exchange_int
@@ -268,7 +272,7 @@ subroutine give_contrib_for_abab(h1,h2,p1,p2,occ,Ne,contrib)
 
  do mm = 1, Ne(2) !! beta
    m = occ(mm,2)
-   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1) 
+   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
    ! exchange between (h2,p2) and m
    exchange_int = three_e_5_idx_exch23_bi_ort(mm,p2,h2,p1,h1)
    contrib += direct_int - exchange_int
@@ -278,11 +282,11 @@ end
 BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_aa, (mo_num, mo_num, mo_num, mo_num)]
  implicit none
  BEGIN_DOC
-! eff_2_e_from_3_e_ab(p2,p1,h2,h1) = Effective Two-electron operator for alpha/alpha double excitations 
+! eff_2_e_from_3_e_aa(p2,p1,h2,h1) = Effective Two-electron operator for alpha/alpha double excitations
 !
 ! from contractionelec_alpha_num with HF density = a^{dagger}_p1_alpha a^{dagger}_p2_alpha a_h2_alpha a_h1_alpha
 !
-! WARNING :: to be coherent with the phase convention used in the Hamiltonian matrix elements, you must fulfill 
+! WARNING :: to be coherent with the phase convention used in the Hamiltonian matrix elements, you must fulfill
 !
 ! ||||    h2>h1, p2>p1   ||||
  END_DOC
@@ -297,13 +301,13 @@ BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_aa, (mo_num, mo_num, mo_num,
  eff_2_e_from_3_e_aa = 100000000.d0
  !$OMP PARALLEL                                                                         &
  !$OMP DEFAULT (NONE)                                                                   &
- !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, contrib) & 
+ !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, contrib) &
  !$OMP SHARED (n_act_orb, list_act, Ne,occ, eff_2_e_from_3_e_aa)
- !$OMP DO SCHEDULE (static) 
-  do hh1 = 1, n_act_orb !! alpha 
-    h1 = list_act(hh1) 
+ !$OMP DO SCHEDULE (static)
+  do hh1 = 1, n_act_orb !! alpha
+    h1 = list_act(hh1)
     do hh2 = hh1+1, n_act_orb !! alpha
-      h2 = list_act(hh2) 
+      h2 = list_act(hh2)
       do pp1 = 1, n_act_orb !! alpha
         p1 = list_act(pp1)
         do pp2 = pp1+1, n_act_orb !! alpha
@@ -317,20 +321,20 @@ BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_aa, (mo_num, mo_num, mo_num,
  !$OMP END DO
  !$OMP END PARALLEL
 
-END_PROVIDER 
+END_PROVIDER
 
 subroutine give_contrib_for_aaaa(h1,h2,p1,p2,occ,Ne,contrib)
  implicit none
- BEGIN_DOC 
+ BEGIN_DOC
 ! gives the contribution for a double excitation (h1,p1)_alpha (h2,p2)_alpha
 !
 ! on top of a determinant whose occupied orbitals is in (occ, Ne)
  END_DOC
  integer, intent(in) :: h1,h2,p1,p2,occ(N_int*bit_kind_size,2),Ne(2)
  double precision, intent(out) :: contrib
- integer :: mm,m 
+ integer :: mm,m
  double precision :: direct_int, exchange_int
- !! h1,p1 == alpha 
+ !! h1,p1 == alpha
  !! h2,p2 == alpha
  contrib = 0.d0
  do mm = 1, Ne(1) !! alpha ==> pure parallele spin contribution
@@ -340,9 +344,10 @@ subroutine give_contrib_for_aaaa(h1,h2,p1,p2,occ,Ne,contrib)
 
  do mm = 1, Ne(2) !! beta
    m = occ(mm,2)
-   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1) 
+   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
    ! exchange between (h1,p1) and (h2,p2)
-   exchange_int = three_e_5_idx_exch12_bi_ort(mm,p2,h2,p1,h1)
+!   exchange_int = three_e_5_idx_exch12_bi_ort(mm,p2,h2,p1,h1)
+   exchange_int = three_e_5_idx_direct_bi_ort(mm,p2,h1,p1,h2)
    contrib += direct_int - exchange_int
  enddo
 end
@@ -351,11 +356,11 @@ end
 BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_bb, (mo_num, mo_num, mo_num, mo_num)]
  implicit none
  BEGIN_DOC
-! eff_2_e_from_3_e_ab(p2,p1,h2,h1) = Effective Two-electron operator for beta/beta double excitations 
+! eff_2_e_from_3_e_bb(p2,p1,h2,h1) = Effective Two-electron operator for beta/beta double excitations
 !
 ! from contractionelec_beta_num with HF density = a^{dagger}_p1_beta a^{dagger}_p2_beta a_h2_beta a_h1_beta
 !
-! WARNING :: to be coherent with the phase convention used in the Hamiltonian matrix elements, you must fulfill 
+! WARNING :: to be coherent with the phase convention used in the Hamiltonian matrix elements, you must fulfill
 !
 ! ||||    h2>h1, p2>p1   ||||
  END_DOC
@@ -370,13 +375,13 @@ BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_bb, (mo_num, mo_num, mo_num,
  eff_2_e_from_3_e_bb = 100000000.d0
  !$OMP PARALLEL                                                                         &
  !$OMP DEFAULT (NONE)                                                                   &
- !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, contrib) & 
+ !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, contrib) &
  !$OMP SHARED (n_act_orb, list_act, Ne,occ, eff_2_e_from_3_e_bb)
- !$OMP DO SCHEDULE (static) 
-  do hh1 = 1, n_act_orb !! beta 
-    h1 = list_act(hh1) 
+ !$OMP DO SCHEDULE (static)
+  do hh1 = 1, n_act_orb !! beta
+    h1 = list_act(hh1)
     do hh2 = hh1+1, n_act_orb !! beta
-      h2 = list_act(hh2) 
+      h2 = list_act(hh2)
       do pp1 = 1, n_act_orb !! beta
         p1 = list_act(pp1)
         do pp2 = pp1+1, n_act_orb !! beta
@@ -390,18 +395,18 @@ BEGIN_PROVIDER [ double precision, eff_2_e_from_3_e_bb, (mo_num, mo_num, mo_num,
  !$OMP END DO
  !$OMP END PARALLEL
 
-END_PROVIDER 
+END_PROVIDER
 
 subroutine give_contrib_for_bbbb(h1,h2,p1,p2,occ,Ne,contrib)
  implicit none
- BEGIN_DOC 
+ BEGIN_DOC
 ! gives the contribution for a double excitation (h1,p1)_beta (h2,p2)_beta
 !
 ! on top of a determinant whose occupied orbitals is in (occ, Ne)
  END_DOC
  integer, intent(in) :: h1,h2,p1,p2,occ(N_int*bit_kind_size,2),Ne(2)
  double precision, intent(out) :: contrib
- integer :: mm,m 
+ integer :: mm,m
  double precision :: direct_int, exchange_int
  !! h1,p1 == beta
  !! h2,p2 == beta
@@ -413,9 +418,10 @@ subroutine give_contrib_for_bbbb(h1,h2,p1,p2,occ,Ne,contrib)
 
  do mm = 1, Ne(1) !! alpha
    m = occ(mm,1)
-   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1) 
+   direct_int   = three_e_5_idx_direct_bi_ort(mm,p2,h2,p1,h1)
    ! exchange between (h1,p1) and (h2,p2)
-   exchange_int = three_e_5_idx_exch12_bi_ort(mm,p2,h2,p1,h1)
+!   exchange_int = three_e_5_idx_exch12_bi_ort(mm,p2,h2,p1,h1)
+   exchange_int = three_e_5_idx_direct_bi_ort(mm,p2,h1,p1,h2)
    contrib += direct_int - exchange_int
  enddo
 end
@@ -424,17 +430,17 @@ end
 subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_j, key_i, htot)
 
   BEGIN_DOC
-  ! <key_j | H_tilde | key_i> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS 
+  ! <key_j | H_tilde | key_i> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS
   !!
   !! WARNING !!
-  ! 
+  !
   ! Non hermitian !!
   END_DOC
 
   use bitmasks
 
   implicit none
-  integer,           intent(in) :: Nint 
+  integer,           intent(in) :: Nint
   integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2)
   double precision, intent(out) :: htot
   double precision :: hmono, htwoe
@@ -461,17 +467,17 @@ subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_j, key_i, htot)
   call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
 
   if(s1.ne.s2)then
-   ! opposite spin two-body 
-    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) 
+   ! opposite spin two-body
+    htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
   else
-   ! same spin two-body 
-   ! direct terms 
-   htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)  
-   ! exchange terms 
-   htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) 
+   ! same spin two-body
+   ! direct terms
+   htwoe  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
+   ! exchange terms
+   htwoe -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1)
   endif
   htwoe  *= phase
-  htot    =  htwoe 
+  htot    =  htwoe
 
 end
 
diff --git a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
index e4f7ca93..e725d8e5 100644
--- a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
+++ b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
@@ -107,5 +107,6 @@ double precision function three_e_double_parrallel_spin(m,l,j,k,i)
   three_e_double_parrallel_spin = three_e_5_idx_direct_bi_ort(m,l,j,k,i)  ! direct
   three_e_double_parrallel_spin += three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) + three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) & ! two cyclic permutations 
   - three_e_5_idx_exch23_bi_ort(m,l,j,k,i) - three_e_5_idx_exch13_bi_ort(m,l,j,k,i)  & ! two first exchange 
-  - three_e_5_idx_exch12_bi_ort(m,l,j,k,i) ! last exchange 
+!  - three_e_5_idx_exch12_bi_ort(m,l,j,k,i) ! last exchange 
+  - three_e_5_idx_direct_bi_ort(m,l,i,k,j) ! last exchange 
 end

From 82b2d8bd98e9f3d543b74f766553d28166486094 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:48:23 +0200
Subject: [PATCH 48/79] avoid long name in cosgtos

---
 .../two_e_Coul_integrals_cosgtos.irp.f        | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
index 527a98d5..ea9ff009 100644
--- a/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
+++ b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
@@ -29,14 +29,14 @@ double precision function ao_two_e_integral_cosgtos(i, j, k, l)
   complex*16          :: integral5, integral6, integral7, integral8
   complex*16          :: integral_tot
 
-  double precision    :: ao_two_e_integral_cosgtos_schwartz_accel
+  double precision    :: ao_2e_cosgtos_schwartz_accel
   complex*16          :: ERI_cosgtos
   complex*16          :: general_primitive_integral_cosgtos
 
   if(ao_prim_num(i) * ao_prim_num(j) * ao_prim_num(k) * ao_prim_num(l) > 1024) then
 
     !print *, ' with shwartz acc '
-    ao_two_e_integral_cosgtos = ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
+    ao_two_e_integral_cosgtos = ao_2e_cosgtos_schwartz_accel(i, j, k, l)
 
   else
     !print *, ' without shwartz acc '
@@ -294,7 +294,7 @@ end function ao_two_e_integral_cosgtos
 
 ! ---
 
-double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
+double precision function ao_2e_cosgtos_schwartz_accel(i, j, k, l)
 
   BEGIN_DOC
   !  integral of the AO basis <ik|jl> or (ij|kl)
@@ -329,7 +329,7 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
   complex*16                    :: ERI_cosgtos
   complex*16                    :: general_primitive_integral_cosgtos
 
-  ao_two_e_integral_cosgtos_schwartz_accel = 0.d0
+  ao_2e_cosgtos_schwartz_accel = 0.d0
 
   dim1 = n_pt_max_integrals
 
@@ -519,8 +519,7 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
 
             integral_tot = integral1 + integral2 + integral3 + integral4 + integral5 + integral6 + integral7 + integral8
 
-            ao_two_e_integral_cosgtos_schwartz_accel = ao_two_e_integral_cosgtos_schwartz_accel &
-                                                     + coef4 * 2.d0 * real(integral_tot)
+            ao_2e_cosgtos_schwartz_accel = ao_2e_cosgtos_schwartz_accel + coef4 * 2.d0 * real(integral_tot)
           enddo ! s
         enddo  ! r
       enddo   ! q
@@ -698,8 +697,7 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
 
             integral_tot = integral1 + integral2 + integral3 + integral4 + integral5 + integral6 + integral7 + integral8
 
-            ao_two_e_integral_cosgtos_schwartz_accel = ao_two_e_integral_cosgtos_schwartz_accel &
-                                      + coef4 * 2.d0 * real(integral_tot)
+            ao_2e_cosgtos_schwartz_accel = ao_2e_cosgtos_schwartz_accel + coef4 * 2.d0 * real(integral_tot)
           enddo ! s
         enddo  ! r
       enddo   ! q
@@ -709,11 +707,11 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
 
   deallocate(schwartz_kl)
 
-end function ao_two_e_integral_cosgtos_schwartz_accel
+end function ao_2e_cosgtos_schwartz_accel
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, ao_two_e_integral_cosgtos_schwartz, (ao_num,ao_num)  ]
+BEGIN_PROVIDER [ double precision, ao_2e_cosgtos_schwartz, (ao_num,ao_num)]
 
   BEGIN_DOC
   !  Needed to compute Schwartz inequalities
@@ -723,16 +721,16 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_cosgtos_schwartz, (ao_num,a
   integer          :: i, k
   double precision :: ao_two_e_integral_cosgtos
 
-  ao_two_e_integral_cosgtos_schwartz(1,1) = ao_two_e_integral_cosgtos(1, 1, 1, 1)
+  ao_2e_cosgtos_schwartz(1,1) = ao_two_e_integral_cosgtos(1, 1, 1, 1)
 
- !$OMP PARALLEL DO PRIVATE(i,k)                                       &
- !$OMP             DEFAULT(NONE)                                      &
- !$OMP             SHARED(ao_num, ao_two_e_integral_cosgtos_schwartz) &
+ !$OMP PARALLEL DO PRIVATE(i,k)                           &
+ !$OMP             DEFAULT(NONE)                          &
+ !$OMP             SHARED(ao_num, ao_2e_cosgtos_schwartz) &
  !$OMP             SCHEDULE(dynamic)
   do i = 1, ao_num
     do k = 1, i
-      ao_two_e_integral_cosgtos_schwartz(i,k) = dsqrt(ao_two_e_integral_cosgtos(i, i, k, k))
-      ao_two_e_integral_cosgtos_schwartz(k,i) = ao_two_e_integral_cosgtos_schwartz(i,k)
+      ao_2e_cosgtos_schwartz(i,k) = dsqrt(ao_two_e_integral_cosgtos(i, i, k, k))
+      ao_2e_cosgtos_schwartz(k,i) = ao_2e_cosgtos_schwartz(i,k)
     enddo
   enddo
  !$OMP END PARALLEL DO

From cab3b12b9b397933ca438717846d28d3164d4804 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:55:51 +0200
Subject: [PATCH 49/79] minor modif in names

---
 src/tc_bi_ortho/tc_utils.irp.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tc_bi_ortho/tc_utils.irp.f b/src/tc_bi_ortho/tc_utils.irp.f
index e0f29eb8..9023e2f0 100644
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@@ -45,8 +45,8 @@ subroutine write_tc_var()
 
     SIGMA_TC = 0.d0
     do j = 2, N_det
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
       SIGMA_TC = SIGMA_TC + htot_1j * htot_j1
     enddo
 

From 072bea8041a5414da00bc8ddc001186c3c9ff269 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 3 Jun 2023 22:12:30 +0200
Subject: [PATCH 50/79] Improve 4idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f          | 186 +++++-
 src/bi_ort_ints/three_body_ijm.irp.f       |  14 +-
 src/bi_ort_ints/three_body_ijmk.irp.f      | 686 +++++++++++++--------
 src/bi_ort_ints/three_body_ijmk_old.irp.f  | 290 +++++++++
 src/bi_ort_ints/three_body_ijmkl.irp.f     | 299 +++++----
 src/tc_bi_ortho/slater_tc_opt.irp.f        |   3 +-
 src/tc_bi_ortho/slater_tc_opt_single.irp.f |   7 +-
 src/tc_bi_ortho/symmetrized_3_e_int.irp.f  |   6 +-
 8 files changed, 1071 insertions(+), 420 deletions(-)
 create mode 100644 src/bi_ort_ints/three_body_ijmk_old.irp.f

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 42bbe315..bb0424cd 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -8,8 +8,9 @@ program bi_ort_ints
   my_n_pt_a_grid = 14
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 ! call test_3e
- call test_5idx
-!  call test_5idx2
+! call test_5idx
+! call test_5idx2
+ call test_4idx
 end
 
 subroutine test_5idx2
@@ -145,3 +146,184 @@ subroutine test_5idx
 
 
 end
+
+! ---
+
+subroutine test_4idx()
+  ! Regression test: compares each three_e_4_idx_*_bi_ort provider, element by element, with its *_old reference provider.
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+  ! thr: largest |new - ref| accepted for a single element; a larger deviation prints the indices and values, then stops.
+  thr = 1d-5
+  ! --- direct term: accu sums |new - ref| over all (l,k,j,i); the printed value is accu / mo_num**4 (mean deviation).
+  PROVIDE three_e_4_idx_direct_bi_ort_old
+  PROVIDE three_e_4_idx_direct_bi_ort 
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_direct_bi_ort    (l,k,j,i)
+          ref = three_e_4_idx_direct_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_direct_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_direct_bi_ort = ', accu / dble(mo_num)**4
+
+  ! --- exch13 term: same comparison (new provider vs. *_old reference), accu is reset first.
+
+  PROVIDE three_e_4_idx_exch13_bi_ort_old
+  PROVIDE three_e_4_idx_exch13_bi_ort 
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch13_bi_ort   (l,k,j,i)
+          ref = three_e_4_idx_exch13_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch13_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch13_bi_ort = ', accu / dble(mo_num)**4
+
+  ! --- exch12 term: check disabled; the new exch12 provider is commented out in three_body_ijmk.irp.f.
+
+!  PROVIDE three_e_4_idx_exch12_bi_ort_old
+!  PROVIDE three_e_4_idx_exch12_bi_ort 
+!
+!  accu = 0.d0
+!  do i = 1, mo_num
+!    do j = 1, mo_num
+!      do k = 1, mo_num
+!        do l = 1, mo_num
+!
+!          new = three_e_4_idx_exch12_bi_ort    (l,k,j,i)
+!          ref = three_e_4_idx_exch12_bi_ort_old(l,k,j,i)
+!          contrib = dabs(new - ref)
+!          accu += contrib
+!          if(contrib .gt. thr) then
+!            print*, ' problem in three_e_4_idx_exch12_bi_ort'
+!            print*, l, k, j, i
+!            print*, ref, new, contrib
+!            stop
+!          endif
+!
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  print*, ' accu on three_e_4_idx_exch12_bi_ort = ', accu / dble(mo_num)**4
+
+  ! --- cycle_1 term: same comparison (new provider vs. *_old reference), accu is reset first.
+
+  PROVIDE three_e_4_idx_cycle_1_bi_ort_old
+  PROVIDE three_e_4_idx_cycle_1_bi_ort
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_cycle_1_bi_ort    (l,k,j,i)
+          ref = three_e_4_idx_cycle_1_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_cycle_1_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_cycle_1_bi_ort = ', accu / dble(mo_num)**4
+
+  ! --- cycle_2 term: check disabled; the new cycle_2 provider is commented out in three_body_ijmk.irp.f.
+
+!  PROVIDE three_e_4_idx_cycle_2_bi_ort_old
+!  PROVIDE three_e_4_idx_cycle_2_bi_ort
+!
+!  accu = 0.d0
+!  do i = 1, mo_num
+!    do j = 1, mo_num
+!      do k = 1, mo_num
+!        do l = 1, mo_num
+!
+!          new = three_e_4_idx_cycle_2_bi_ort    (l,k,j,i)
+!          ref = three_e_4_idx_cycle_2_bi_ort_old(l,k,j,i)
+!          contrib = dabs(new - ref)
+!          accu += contrib
+!          if(contrib .gt. thr) then
+!            print*, ' problem in three_e_4_idx_cycle_2_bi_ort'
+!            print*, l, k, j, i
+!            print*, ref, new, contrib
+!            stop
+!          endif
+!
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  print*, ' accu on three_e_4_idx_cycle_2_bi_ort = ', accu / dble(mo_num)**4
+
+  ! --- exch23 term: same comparison (new provider vs. *_old reference), accu is reset first.
+
+  PROVIDE three_e_4_idx_exch23_bi_ort_old
+  PROVIDE three_e_4_idx_exch23_bi_ort
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch23_bi_ort    (l,k,j,i)
+          ref = three_e_4_idx_exch23_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch23_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch23_bi_ort = ', accu / dble(mo_num)**4
+
+  ! --- reaching this point means no enabled check exceeded thr (each failure stops the program).
+
+  return
+end
diff --git a/src/bi_ort_ints/three_body_ijm.irp.f b/src/bi_ort_ints/three_body_ijm.irp.f
index b34638b8..ae100fb5 100644
--- a/src/bi_ort_ints/three_body_ijm.irp.f
+++ b/src/bi_ort_ints/three_body_ijm.irp.f
@@ -23,11 +23,11 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
- !$OMP PARALLEL                 &
- !$OMP DEFAULT (NONE)           &
- !$OMP PRIVATE (i,j,m,integral) & 
- !$OMP SHARED (mo_num,three_e_3_idx_direct_bi_ort)
- !$OMP DO SCHEDULE (dynamic)
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (i,j,m,integral) & 
+  !$OMP SHARED (mo_num,three_e_3_idx_direct_bi_ort)
+  !$OMP DO SCHEDULE (dynamic)
   do i = 1, mo_num
     do j = 1, mo_num
       do m = j, mo_num
@@ -36,8 +36,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END DO
+  !$OMP END PARALLEL
 
   do i = 1, mo_num
     do j = 1, mo_num
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 95b57e37..39a31751 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -1,287 +1,467 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
   ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
   !
-  ! three_e_4_idx_direct_bi_ort(m,j,k,i) = <mjk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_direct_bi_ort (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch13_bi_ort (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch12_bi_ort (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                       = three_e_4_idx_exch13_bi_ort (j,m,k,i) 
+  ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                       = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
   !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
-  END_DOC
-
- implicit none
- integer          :: i, j, k, m
- double precision :: integral, wall1, wall0
-
-  three_e_4_idx_direct_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_direct_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_direct_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, m, j, i, integral)
-          three_e_4_idx_direct_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_direct_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = <mjk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  ! three_e_4_idx_direct_bi_ort (m,j,k,i) : Lk Ri Imm Ijj + Lj Rj Imm Iki + Lm Rm Ijj Iki 
+  ! three_e_4_idx_exch13_bi_ort (m,j,k,i) : Lk Rm Imi Ijj + Lj Rj Imi Ikm + Lm Ri Ijj Ikm 
+  ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) : Lk Rm Imj Iji + Lj Ri Imj Ikm + Lm Rj Iji Ikm 
   !
   END_DOC
 
   implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
+  integer                       :: ipoint, i, j, k, l, m
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:), tmp4(:,:,:,:)
+  double precision, allocatable :: tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp5(:,:,:)
+  double precision, allocatable :: tmp7(:,:)
+  double precision, allocatable :: tmp_3d(:,:,:)
 
-  three_e_4_idx_cycle_1_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_cycle_1_bi_ort ...'
+  print *, ' Providing the three_e_4_idx_bi_ort ...'
   call wall_time(wall0)
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_cycle_1_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp4(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1, tmp2, tmp3, tmp4)
+  !$OMP DO COLLAPSE(2)
   do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, j, i, m, integral)
-          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
 
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_cycle_1_bi_ort', wall1 - wall0
-  call print_memory_usage()
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
 
-END_PROVIDER 
+        tmp2(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
 
-! --
+        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
 
-BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = <mjk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_cycle_2_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_cycle_2_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_cycle_2_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, i, m, j, integral)
-          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_cycle_2_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch23_bi_ort(m,j,k,i) = <mjk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_exch23_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_exch23_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_exch23_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, j, m, i, integral)
-          three_e_4_idx_exch23_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch13_bi_ort(m,j,k,i) = <mjk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_exch13_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_exch13_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_exch13_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, i, j, m, integral)
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch13_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  ! 
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch12_bi_ort(m,j,k,i) = <mjk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  ! 
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_exch12_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_exch12_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_exch12_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, m, i, j, integral)
-          three_e_4_idx_exch12_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
+        tmp4(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp4(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp4(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
       enddo
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp4, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_4d(m,i,j,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp3, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp2)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp3, 3*n_points_final_grid, tmp4, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp3)
+  deallocate(tmp4)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+
+
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
+            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort(m,j,k,i) = three_e_4_idx_direct_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp7(n_points_final_grid,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, ipoint)                                       &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector,  &
+  !$OMP         tmp7)
+  !$OMP DO
+  do i = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+      tmp7(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+
+  do m = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (i, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         int2_grad1_u12_bimo_t,                           &
+    !$OMP         tmp5)
+    !$OMP DO COLLAPSE(2)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp5(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0 &
+              , tmp7, n_points_final_grid, tmp5, n_points_final_grid       &
+              , 0.d0, tmp_3d, mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (j, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         mos_l_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp5)
+    !$OMP DO COLLAPSE(2)
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp5(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
+              , tmp5, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
+              , 0.d0, tmp_3d, mo_num*mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+  enddo
+
+  deallocate(tmp7)
+  deallocate(tmp_3d)
+
+
+
+  do i = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (m, j, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
+    !$OMP         mos_r_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp5)
+    !$OMP DO COLLAPSE(2)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp5(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
+              , tmp5, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
+              , 1.d0, three_e_4_idx_cycle_1_bi_ort(1,1,1,i), mo_num*mo_num)
+
+  enddo
+
+  deallocate(tmp5)
+
+
+!  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+!  do i = 1, mo_num
+!    do k = 1, mo_num
+!      do j = 1, mo_num
+!        do m = 1, mo_num
+!          three_e_4_idx_exch12_bi_ort (m,j,k,i) = three_e_4_idx_exch13_bi_ort (j,m,k,i)
+!          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END PARALLEL DO
+
+
   call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch12_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_4_idx_bi_ort', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE exch23 EXCHANGE TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_4_idx_exch23_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: i, j, k, l, m, ipoint
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp5(:,:,:), tmp6(:,:,:)
+
+  print *, ' Providing the three_e_4_idx_exch23_bi_ort ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+
+  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp6(n_points_final_grid,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp5, tmp6)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp5(ipoint,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * int2_grad1_u12_bimo_t(ipoint,1,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,l,i) * int2_grad1_u12_bimo_t(ipoint,2,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,l,i) * int2_grad1_u12_bimo_t(ipoint,3,i,l) 
+
+        tmp6(ipoint,l,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, -1.d0 &
+            , tmp5, n_points_final_grid, tmp6, n_points_final_grid               &
+            , 0.d0, three_e_4_idx_exch23_bi_ort, mo_num*mo_num)
+
+  deallocate(tmp5)
+  deallocate(tmp6)
+
+
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
+            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch23_bi_ort(m,j,k,i) = three_e_4_idx_exch23_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
   call print_memory_usage()
 
 END_PROVIDER 
diff --git a/src/bi_ort_ints/three_body_ijmk_old.irp.f b/src/bi_ort_ints/three_body_ijmk_old.irp.f
new file mode 100644
index 00000000..1a67f35b
--- /dev/null
+++ b/src/bi_ort_ints/three_body_ijmk_old.irp.f
@@ -0,0 +1,290 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_direct_bi_ort_old(m,j,k,i) = <mjk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign (the stored value is -1.d0 * integral): in this way three_e_4_idx_direct_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+ implicit none
+ integer          :: i, j, k, m
+ double precision :: integral, wall1, wall0
+
+  three_e_4_idx_direct_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_direct_bi_ort_old ...'
+  call wall_time(wall0)
+  ! NOTE(review): these providers are not referenced in this block; presumably needed by give_integrals_3_body_bi_ort -- confirm
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  ! one call to give_integrals_3_body_bi_ort per (m,j,k,i), i.e. mo_num**4 calls; the collapsed (i,k) loops are shared among threads
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_direct_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, m, j, i, integral)
+          three_e_4_idx_direct_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_direct_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_cycle_1_bi_ort_old(m,j,k,i) = <mjk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign (the stored value is -1.d0 * integral): in this way three_e_4_idx_cycle_1_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_cycle_1_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_cycle_1_bi_ort_old ...'
+  call wall_time(wall0)
+  ! NOTE(review): these providers are not referenced in this block; presumably needed by give_integrals_3_body_bi_ort -- confirm
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  ! one call to give_integrals_3_body_bi_ort per (m,j,k,i), i.e. mo_num**4 calls; the collapsed (i,k) loops are shared among threads
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_cycle_1_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, j, i, m, integral)
+          three_e_4_idx_cycle_1_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_cycle_1_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! --
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE SECOND CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_cycle_2_bi_ort_old(m,j,k,i) = <mjk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign (the stored value is -1.d0 * integral): in this way three_e_4_idx_cycle_2_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_cycle_2_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_cycle_2_bi_ort_old ...'
+  call wall_time(wall0)
+  ! NOTE(review): these providers are not referenced in this block; presumably needed by give_integrals_3_body_bi_ort -- confirm
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  ! one call to give_integrals_3_body_bi_ort per (m,j,k,i), i.e. mo_num**4 calls; the collapsed (i,k) loops are shared among threads
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_cycle_2_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, i, m, j, integral)
+          three_e_4_idx_cycle_2_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_cycle_2_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE EXCHANGE TERMS (exch23 permutation, see below) OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch23_bi_ort_old(m,j,k,i) = <mjk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign (the stored value is -1.d0 * integral): in this way three_e_4_idx_exch23_bi_ort_old can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_exch23_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_exch23_bi_ort_old ...'
+  call wall_time(wall0)
+  ! NOTE(review): these providers are not referenced in this block; presumably needed by give_integrals_3_body_bi_ort -- confirm
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  ! one call to give_integrals_3_body_bi_ort per (m,j,k,i), i.e. mo_num**4 calls; the collapsed (i,k) loops are shared among threads
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_exch23_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, j, m, i, integral)
+          three_e_4_idx_exch23_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch23_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE EXCHANGE TERMS (exch13 permutation, see below) OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch13_bi_ort_old(m,j,k,i) = <mjk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign (the stored value is -1.d0 * integral): in this way three_e_4_idx_exch13_bi_ort_old can be directly used to compute Slater rules with a + sign
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_exch13_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_exch13_bi_ort_old ...'
+  call wall_time(wall0)
+  ! NOTE(review): these providers are not referenced in this block; presumably needed by give_integrals_3_body_bi_ort -- confirm
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  ! one call to give_integrals_3_body_bi_ort per (m,j,k,i), i.e. mo_num**4 calls; the collapsed (i,k) loops are shared among threads
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_exch13_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, i, j, m, integral)
+          three_e_4_idx_exch13_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch13_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  ! 
+  ! matrix element of the -L  three-body operator FOR THE EXCHANGE TERMS (exch12 permutation, see below) OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch12_bi_ort_old(m,j,k,i) = <mjk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign (the stored value is -1.d0 * integral): in this way three_e_4_idx_exch12_bi_ort_old can be directly used to compute Slater rules with a + sign
+  ! 
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_exch12_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_exch12_bi_ort_old ...'
+  call wall_time(wall0)
+  ! NOTE(review): these providers are not referenced in this block; presumably needed by give_integrals_3_body_bi_ort -- confirm
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  ! one call to give_integrals_3_body_bi_ort per (m,j,k,i), i.e. mo_num**4 calls; the collapsed (i,k) loops are shared among threads
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_exch12_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, m, i, j, integral)
+          three_e_4_idx_exch12_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch12_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index d67e1434..3e4412a3 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -19,17 +19,17 @@ end
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
-
   END_DOC
 
   implicit none
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
+  integer                       :: i, j, k, m, l
+  integer                       :: ipoint
+  double precision              :: wall1, wall0
   double precision, allocatable :: grad_mli(:,:,:), orb_mat(:,:,:)
   double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
   double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:)
+
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
@@ -38,201 +38,196 @@ end
   print *, ' Providing the three_e_5_idx_bi_ort ...'
   call wall_time(wall0)
 
- do m = 1, mo_num
+  do m = 1, mo_num
 
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         grad_mli, orb_mat)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
+    allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+    allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 
-         grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
-               int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
-               int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
-               int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (i,l,ipoint)                                      &
+    !$OMP SHARED (m,mo_num,n_points_final_grid,                     &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         grad_mli, orb_mat)
+    !$OMP DO COLLAPSE(2)
+    do i = 1, mo_num
+      do l = 1, mo_num
+         do ipoint = 1, n_points_final_grid
 
-         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+           grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
+                 int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
+                 int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
+                 int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
 
-       enddo
+           orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+
+         enddo
+      enddo
     enddo
-  enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
 
-  !$OMP END DO
-  !$OMP END PARALLEL
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
+        orb_mat, n_points_final_grid,  &
+        grad_mli, n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
 
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
-      orb_mat, n_points_final_grid,  &
-      grad_mli, n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
+    !$OMP END PARALLEL DO
 
-  deallocate(orb_mat,grad_mli)
+    deallocate(orb_mat,grad_mli)
 
+    allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+    allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+    allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
 
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (i,l,ipoint)                                      &
+    !$OMP SHARED (m,mo_num,n_points_final_grid,                     &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
+    !$OMP DO COLLAPSE(2)
+    do i=1,mo_num
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
 
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
+          lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+          lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+          lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
 
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+          rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+          rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+          rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
 
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
-
-       enddo
+          rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+          rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+          rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+        enddo
+      enddo
     enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lm_grad_ik, 3*n_points_final_grid,        &
+        rm_grad_ik, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
 
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP END PARALLEL DO
+  
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lm_grad_ik, 3*n_points_final_grid,        &
+        rk_grad_im, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
+  
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,j,k)
             three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = - tmp_mat(k,j,i,l)
             three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = - tmp_mat(k,i,j,l)
             three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = - tmp_mat(l,j,i,k)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-  deallocate(lm_grad_ik)
-
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         lk_grad_mi)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
-
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-       enddo
+    !$OMP END PARALLEL DO
+  
+    deallocate(lm_grad_ik)
+  
+    allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+  
+   !$OMP PARALLEL                                                  &
+   !$OMP DEFAULT (NONE)                                            &
+   !$OMP PRIVATE (i,l,ipoint)                                      &
+   !$OMP SHARED (m,mo_num,n_points_final_grid,                     &
+   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+   !$OMP         lk_grad_mi)
+   !$OMP DO COLLAPSE(2)
+    do i=1,mo_num
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
+  
+          lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+          lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+          lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+  
+        enddo
+      enddo
     enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lk_grad_mi, 3*n_points_final_grid,  &
-      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP END DO
+    !$OMP END PARALLEL
+  
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lk_grad_mi, 3*n_points_final_grid,  &
+        rm_grad_ik, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
+  
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l,i)
             three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
             three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k,i)
             three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(k,i,l,j)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lk_grad_mi, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP END PARALLEL DO
+  
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lk_grad_mi, 3*n_points_final_grid,  &
+        rk_grad_im, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
+  
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,j,i,k)
             three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,i,j,l)
             three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,j,i,l)
             three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,i,j,k)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  deallocate(lk_grad_mi)
-  deallocate(rm_grad_ik)
-  deallocate(rk_grad_im)
+    !$OMP END PARALLEL DO
+  
+    deallocate(lk_grad_mi)
+    deallocate(rm_grad_ik)
+    deallocate(rk_grad_im)
   enddo
 
+  deallocate(tmp_mat)
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_bi_ort', wall1 - wall0
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index 882470ed..a2077f0f 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -9,8 +9,7 @@ subroutine provide_all_three_ints_bi_ortho
    PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
   endif
   if(three_e_4_idx_term)then
-   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
-   PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
+   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
   endif
   if(.not.double_normal_ord.and.three_e_5_idx_term)then
    PROVIDE three_e_5_idx_direct_bi_ort 
diff --git a/src/tc_bi_ortho/slater_tc_opt_single.irp.f b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
index 7178d6d9..9719a6e7 100644
--- a/src/tc_bi_ortho/slater_tc_opt_single.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
@@ -243,7 +243,9 @@ subroutine fock_ac_tc_operator(iorb,ispin,key, h_fock,p_fock, ispin_fock,hthree,
    do j = 1, nb
     jj = occ(j,other_spin) 
     direct_int = three_e_4_idx_direct_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
-    exchange_int = three_e_4_idx_exch12_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
+    ! TODO
+    ! use transpose
+    exchange_int = three_e_4_idx_exch13_bi_ort(iorb,jj,p_fock,h_fock) ! USES 4-IDX TENSOR 
     hthree += direct_int - exchange_int
    enddo
   else !! ispin NE to ispin_fock
@@ -322,7 +324,8 @@ subroutine fock_a_tc_operator(iorb,ispin,key, h_fock,p_fock, ispin_fock,hthree,N
    do j = 1, nb
     jj = occ(j,other_spin) 
     direct_int = three_e_4_idx_direct_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
-    exchange_int = three_e_4_idx_exch12_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
+    ! TODO use transpose 
+    exchange_int = three_e_4_idx_exch13_bi_ort(iorb,jj,p_fock,h_fock) ! USES 4-IDX TENSOR 
     hthree -= direct_int - exchange_int
    enddo
   else !! ispin NE to ispin_fock
diff --git a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
index e725d8e5..3180d946 100644
--- a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
+++ b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
@@ -96,9 +96,11 @@ double precision function three_e_single_parrallel_spin(m,j,k,i)
  implicit none
  integer, intent(in) :: i,k,j,m
   three_e_single_parrallel_spin = three_e_4_idx_direct_bi_ort(m,j,k,i)  ! direct
-  three_e_single_parrallel_spin += three_e_4_idx_cycle_1_bi_ort(m,j,k,i) + three_e_4_idx_cycle_2_bi_ort(m,j,k,i) & ! two cyclic permutations 
+  three_e_single_parrallel_spin += three_e_4_idx_cycle_1_bi_ort(m,j,k,i) + three_e_4_idx_cycle_1_bi_ort(j,m,k,i) & ! two cyclic permutations 
   - three_e_4_idx_exch23_bi_ort(m,j,k,i) - three_e_4_idx_exch13_bi_ort(m,j,k,i)  & ! two first exchange 
-  - three_e_4_idx_exch12_bi_ort(m,j,k,i) ! last exchange 
+  - three_e_4_idx_exch13_bi_ort(j,m,k,i) ! last exchange 
+  ! TODO
+  ! use transpose
 end
 
 double precision function three_e_double_parrallel_spin(m,l,j,k,i)

From a791a28523b787618d571947452fffbc4e7340c6 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Jun 2023 09:19:34 +0200
Subject: [PATCH 51/79] working on memory footprint

---
 .../grid_becke_vector.irp.f                   | 18 +++-
 src/bi_ort_ints/semi_num_ints_mo.irp.f        | 23 ++++--
 src/bi_ort_ints/three_body_ijmk.irp.f         | 82 +++++++++++--------
 src/bi_ortho_mos/bi_ort_mos_in_r.irp.f        |  6 +-
 src/fci_tc_bi/fci_tc_bi_ortho.irp.f           |  8 +-
 src/tc_bi_ortho/normal_ordered.irp.f          |  6 --
 src/tc_bi_ortho/slater_tc_opt.irp.f           | 50 +++++++----
 7 files changed, 121 insertions(+), 72 deletions(-)

diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index fd185641..8982fe83 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -62,20 +62,30 @@ END_PROVIDER
     enddo
   enddo
 
+  FREE grid_points_per_atom
+  FREE final_weight_at_r
+
 END_PROVIDER
 
 ! ---
 
 BEGIN_PROVIDER [double precision, final_grid_points_transp, (n_points_final_grid,3)]
-  implicit none
+
   BEGIN_DOC
-! Transposed final_grid_points
+  ! Transposed final_grid_points
   END_DOC
 
+  implicit none
   integer :: i,j
-  do j=1,3
-    do i=1,n_points_final_grid
+
+  do j = 1, 3
+    do i = 1, n_points_final_grid
       final_grid_points_transp(i,j) = final_grid_points(j,i)
     enddo
   enddo
+
 END_PROVIDER
+
+! ---
+
+
diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 771d3274..6354b393 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -124,6 +124,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
       enddo
     enddo
 
+    FREE int2_grad1_u12_ao_test
+
   else
 
     PROVIDE int2_grad1_u12_ao
@@ -153,14 +155,14 @@ END_PROVIDER
 BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)]
 
   implicit none
-  integer :: ipoint
+  integer          :: ipoint
   double precision :: wall0, wall1
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_ao_transp
 
-  !print *, ' providing int2_grad1_u12_bimo_transp'
-  !call wall_time(wall0)
+  print *, ' providing int2_grad1_u12_bimo_transp'
+  call wall_time(wall0)
 
   !$OMP PARALLEL         &
   !$OMP DEFAULT (NONE)   &
@@ -178,8 +180,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   !$OMP END DO
   !$OMP END PARALLEL
 
-  !call wall_time(wall1)
-  !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  call wall_time(wall1)
+  print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -188,7 +191,11 @@ END_PROVIDER
 BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)]
 
   implicit none
-  integer :: i, j, ipoint
+  integer          :: i, j, ipoint
+  double precision :: wall0, wall1
+
+  call wall_time(wall0)
+  print *, ' Providing int2_grad1_u12_bimo_t ...'
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_transp
@@ -205,6 +212,10 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
 
   FREE int2_grad1_u12_bimo_transp
 
+  call wall_time(wall1)
+  print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
+  call print_memory_usage()
+
 END_PROVIDER 
 
 ! ---
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 39a31751..ee7e88ef 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -30,10 +30,10 @@
   implicit none
   integer                       :: ipoint, i, j, k, l, m
   double precision              :: wall1, wall0
-  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:), tmp4(:,:,:,:)
+  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:)
   double precision, allocatable :: tmp_4d(:,:,:,:)
-  double precision, allocatable :: tmp5(:,:,:)
-  double precision, allocatable :: tmp7(:,:)
+  double precision, allocatable :: tmp4(:,:,:)
+  double precision, allocatable :: tmp5(:,:)
   double precision, allocatable :: tmp_3d(:,:,:)
 
   print *, ' Providing the three_e_4_idx_bi_ort ...'
@@ -47,7 +47,6 @@
   allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp4(n_points_final_grid,3,mo_num,mo_num))
 
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
@@ -55,7 +54,7 @@
   !$OMP SHARED (mo_num, n_points_final_grid,                      &
   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1, tmp2, tmp3, tmp4)
+  !$OMP         tmp1, tmp2, tmp3)
   !$OMP DO COLLAPSE(2)
   do i = 1, mo_num
     do l = 1, mo_num
@@ -69,13 +68,9 @@
         tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
         tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
 
-        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-
-        tmp4(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp4(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp4(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
       enddo
     enddo
   enddo
@@ -99,7 +94,7 @@
   !$OMP END PARALLEL DO
 
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp4, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
+            , tmp3, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
             , 0.d0, tmp_4d, mo_num*mo_num)
 
   deallocate(tmp1)
@@ -116,8 +111,30 @@
   enddo
   !$OMP END PARALLEL DO
 
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp3, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
+            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
             , 0.d0, tmp_4d, mo_num*mo_num)
 
   deallocate(tmp2)
@@ -135,11 +152,10 @@
   !$OMP END PARALLEL DO
 
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp3, 3*n_points_final_grid, tmp4, 3*n_points_final_grid            &
+            , tmp1, 3*n_points_final_grid, tmp3, 3*n_points_final_grid            &
             , 0.d0, tmp_4d, mo_num*mo_num)
 
   deallocate(tmp3)
-  deallocate(tmp4)
 
   !$OMP PARALLEL DO PRIVATE(i,j,k,m)
   do i = 1, mo_num
@@ -155,8 +171,6 @@
 
 
 
-  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
-
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
   !$OMP PRIVATE (i, l, ipoint)                                    &
@@ -199,26 +213,26 @@
 
 
   allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp7(n_points_final_grid,mo_num))
+  allocate(tmp5(n_points_final_grid,mo_num))
 
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
   !$OMP PRIVATE (i, ipoint)                                       &
   !$OMP SHARED (mo_num, n_points_final_grid,                      &
   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector,  &
-  !$OMP         tmp7)
+  !$OMP         final_weight_at_r_vector,                         &
+  !$OMP         tmp5)
   !$OMP DO
   do i = 1, mo_num
     do ipoint = 1, n_points_final_grid
-      tmp7(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+      tmp5(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
 
-  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp4(n_points_final_grid,mo_num,mo_num))
 
   do m = 1, mo_num
 
@@ -227,13 +241,13 @@
     !$OMP PRIVATE (i, k, ipoint)                                   &
     !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
     !$OMP         int2_grad1_u12_bimo_t,                           &
-    !$OMP         tmp5)
+    !$OMP         tmp4)
     !$OMP DO COLLAPSE(2)
     do i = 1, mo_num
       do k = 1, mo_num
         do ipoint = 1, n_points_final_grid
 
-          tmp5(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
+          tmp4(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
                            + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
                            + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
         enddo
@@ -243,7 +257,7 @@
     !$OMP END PARALLEL
 
     call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0 &
-              , tmp7, n_points_final_grid, tmp5, n_points_final_grid       &
+              , tmp5, n_points_final_grid, tmp4, n_points_final_grid       &
               , 0.d0, tmp_3d, mo_num)
 
     !$OMP PARALLEL DO PRIVATE(i,j,k)
@@ -264,13 +278,13 @@
     !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
     !$OMP         mos_l_in_r_array_transp,                         &
     !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp5)
+    !$OMP         tmp4)
     !$OMP DO COLLAPSE(2)
     do k = 1, mo_num
       do j = 1, mo_num
         do ipoint = 1, n_points_final_grid
 
-          tmp5(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
+          tmp4(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
                            * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
                              + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
                              + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
@@ -281,7 +295,7 @@
     !$OMP END PARALLEL
 
     call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
-              , tmp5, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
+              , tmp4, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
               , 0.d0, tmp_3d, mo_num*mo_num)
 
     !$OMP PARALLEL DO PRIVATE(i,j,k)
@@ -296,7 +310,7 @@
 
   enddo
 
-  deallocate(tmp7)
+  deallocate(tmp5)
   deallocate(tmp_3d)
 
 
@@ -309,13 +323,13 @@
     !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
     !$OMP         mos_r_in_r_array_transp,                         &
     !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp5)
+    !$OMP         tmp4)
     !$OMP DO COLLAPSE(2)
     do j = 1, mo_num
       do m = 1, mo_num
         do ipoint = 1, n_points_final_grid
 
-          tmp5(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
+          tmp4(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
                            * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
                              + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
                              + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
@@ -326,12 +340,12 @@
     !$OMP END PARALLEL
 
     call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
-              , tmp5, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
+              , tmp4, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
               , 1.d0, three_e_4_idx_cycle_1_bi_ort(1,1,1,i), mo_num*mo_num)
 
   enddo
 
-  deallocate(tmp5)
+  deallocate(tmp4)
 
 
 !  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
diff --git a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
index 42130575..8667683e 100644
--- a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
+++ b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
@@ -46,6 +46,8 @@ BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid,
       mos_r_in_r_array_transp(i,j) = mos_r_in_r_array(j,i) 
     enddo
   enddo
+
+  FREE mos_r_in_r_array
  
 END_PROVIDER
 
@@ -116,7 +118,7 @@ end subroutine give_all_mos_l_at_r
 
 ! ---
 
-BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp,(n_points_final_grid,mo_num)]
+BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid,mo_num)]
 
   BEGIN_DOC
   ! mos_l_in_r_array_transp(i,j) = value of the jth mo on the ith grid point
@@ -130,6 +132,8 @@ BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp,(n_points_final_grid,mo
       mos_l_in_r_array_transp(i,j) = mos_l_in_r_array(j,i) 
     enddo
   enddo
+
+  FREE mos_l_in_r_array
  
 END_PROVIDER
 
diff --git a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
index ed75c882..f9bda058 100644
--- a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -54,11 +54,13 @@ subroutine run_cipsi_tc
 
   implicit none
 
-  if (.not.is_zmq_slave) then
+  if (.not. is_zmq_slave) then
+
     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
-    if(elec_alpha_num+elec_beta_num.ge.3)then
+
+    if(elec_alpha_num+elec_beta_num .ge. 3) then
       if(three_body_h_tc)then
-        call provide_all_three_ints_bi_ortho
+        call provide_all_three_ints_bi_ortho()
       endif
     endif
     ! ---
diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index cc01d144..f8e310df 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -24,9 +24,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
  
   PROVIDE N_int
 
-  print*,' Providing normal_two_body_bi_orth ...'
-  call wall_time(wall0)
-
   if(read_tc_norm_ord) then
 
     open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
@@ -115,9 +112,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   call wall_time(wall1)
   print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
 
-  call wall_time(wall1)
-  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
-
 END_PROVIDER 
 
 ! ---
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index a2077f0f..42c59308 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -1,24 +1,38 @@
-subroutine provide_all_three_ints_bi_ortho
- implicit none
- BEGIN_DOC
-! routine that provides all necessary three-electron integrals 
- END_DOC
- if(three_body_h_tc)then
-  if(three_e_3_idx_term)then
-   PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
-   PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
-  endif
-  if(three_e_4_idx_term)then
-   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
-  endif
-  if(.not.double_normal_ord.and.three_e_5_idx_term)then
-   PROVIDE three_e_5_idx_direct_bi_ort 
-  elseif (double_normal_ord .and. (.not. three_e_5_idx_term))then
-   PROVIDE normal_two_body_bi_orth
-  endif
+
+! ---
+
+subroutine provide_all_three_ints_bi_ortho()
+
+  BEGIN_DOC
+  ! routine that provides all necessary three-electron integrals 
+  END_DOC
+
+  implicit none
+
+  if(three_body_h_tc) then
+
+    if(three_e_3_idx_term) then
+      PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
+      PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
+    endif
+
+    if(three_e_4_idx_term) then
+      PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
+    endif
+
+    if(.not. double_normal_ord. and. three_e_5_idx_term) then
+      PROVIDE three_e_5_idx_direct_bi_ort 
+    elseif(double_normal_ord .and. (.not. three_e_5_idx_term)) then
+      PROVIDE normal_two_body_bi_orth
+    endif
+
  endif
+
+ return
 end
 
+! ---
+
 subroutine htilde_mu_mat_opt_bi_ortho_tot(key_j, key_i, Nint, htot)
  implicit none
   BEGIN_DOC

From 501b9d648702c0f5a2ba0f684ef40ce69d0cb6ce Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Jun 2023 09:58:29 +0200
Subject: [PATCH 52/79] Minor modifs: time final_grid_points, silence int2_grad1_u12_bimo prints, stop freeing mos_{l,r}_in_r_array

---
 .../grid_becke_vector.irp.f                   | 21 +++++++++++++---
 src/bi_ort_ints/bi_ort_ints.irp.f             | 25 ++++++++++++++-----
 src/bi_ort_ints/semi_num_ints_mo.irp.f        | 22 ++++++++--------
 src/bi_ortho_mos/bi_ort_mos_in_r.irp.f        |  4 ---
 src/tc_bi_ortho/slater_tc_opt.irp.f           |  2 +-
 5 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index 8982fe83..0386f3c6 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -1,10 +1,13 @@
 
 BEGIN_PROVIDER [integer, n_points_final_grid]
-  implicit none
+
   BEGIN_DOC
   ! Number of points which are non zero
   END_DOC
-  integer                        :: i,j,k,l
+
+  implicit none
+  integer :: i, j, k, l
+
   n_points_final_grid = 0
   do j = 1, nucl_num
     do i = 1, n_points_radial_grid -1
@@ -16,9 +19,11 @@ BEGIN_PROVIDER [integer, n_points_final_grid]
       enddo
     enddo
   enddo
-  print*,'n_points_final_grid = ',n_points_final_grid
-  print*,'n max point         = ',n_points_integration_angular*(n_points_radial_grid*nucl_num - 1)
+
+  print*,' n_points_final_grid = ', n_points_final_grid
+  print*,' n max point         = ', n_points_integration_angular*(n_points_radial_grid*nucl_num - 1)
   call ezfio_set_becke_numerical_grid_n_points_final_grid(n_points_final_grid)
+
 END_PROVIDER
 
 ! ---
@@ -41,6 +46,10 @@ END_PROVIDER
   implicit none
   integer          :: i, j, k, l, i_count
   double precision :: r(3)
+  double precision :: wall0, wall1
+
+  call wall_time(wall0)
+  print *, ' Providing final_grid_points ...'
 
   i_count = 0
   do j = 1, nucl_num
@@ -65,6 +74,10 @@ END_PROVIDER
   FREE grid_points_per_atom
   FREE final_weight_at_r
 
+  call wall_time(wall1)
+  print *, ' wall time for final_grid_points,', wall1 - wall0
+  call print_memory_usage()
+
 END_PROVIDER
 
 ! ---
diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index bb0424cd..e64892d7 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -1,22 +1,35 @@
+! ---
+
 program bi_ort_ints
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 10
-  my_n_pt_a_grid = 14
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+  !my_n_pt_r_grid = 10
+  !my_n_pt_a_grid = 14
+  my_n_pt_r_grid = 30
+  my_n_pt_a_grid = 50
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
 ! call test_3e
 ! call test_5idx
 ! call test_5idx2
- call test_4idx
+ !call test_4idx
+  call test_4idx2()
 end
 
 subroutine test_5idx2
   PROVIDE three_e_5_idx_cycle_2_bi_ort
 end
 
+subroutine test_4idx2()
+  PROVIDE three_e_4_idx_direct_bi_ort 
+end
+
 subroutine test_3e
  implicit none
  integer :: i,k,j,l,m,n,ipoint
diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 6354b393..355fa38f 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -54,7 +54,7 @@ BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_poi
     enddo
   enddo
 
-! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu
+  !FREE mo_v_ki_bi_ortho_erf_rk_cst_mu
 
 END_PROVIDER 
 
@@ -161,8 +161,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_ao_transp
 
-  print *, ' providing int2_grad1_u12_bimo_transp'
-  call wall_time(wall0)
+  !print *, ' providing int2_grad1_u12_bimo_transp'
+  !call wall_time(wall0)
 
   !$OMP PARALLEL         &
   !$OMP DEFAULT (NONE)   &
@@ -180,9 +180,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call wall_time(wall1)
-  print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
-  call print_memory_usage()
+  !call wall_time(wall1)
+  !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  !call print_memory_usage()
 
 END_PROVIDER 
 
@@ -194,8 +194,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
   integer          :: i, j, ipoint
   double precision :: wall0, wall1
 
-  call wall_time(wall0)
-  print *, ' Providing int2_grad1_u12_bimo_t ...'
+  !call wall_time(wall0)
+  !print *, ' Providing int2_grad1_u12_bimo_t ...'
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_transp
@@ -212,9 +212,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
 
   FREE int2_grad1_u12_bimo_transp
 
-  call wall_time(wall1)
-  print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
-  call print_memory_usage()
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
+  !call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
index 8667683e..25572854 100644
--- a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
+++ b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
@@ -47,8 +47,6 @@ BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid,
     enddo
   enddo
 
-  FREE mos_r_in_r_array
- 
 END_PROVIDER
 
 ! ---
@@ -133,8 +131,6 @@ BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid,m
     enddo
   enddo
 
-  FREE mos_l_in_r_array
- 
 END_PROVIDER
 
 ! ---
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index 42c59308..7acb0d0f 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -20,7 +20,7 @@ subroutine provide_all_three_ints_bi_ortho()
       PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
     endif
 
-    if(.not. double_normal_ord. and. three_e_5_idx_term) then
+    if(.not. double_normal_ord .and. three_e_5_idx_term) then
       PROVIDE three_e_5_idx_direct_bi_ort 
     elseif(double_normal_ord .and. (.not. three_e_5_idx_term)) then
       PROVIDE normal_two_body_bi_orth

From b984d7a1f4a734ae459a3c91e2ca9ee2ea26bc50 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Jun 2023 15:27:07 +0200
Subject: [PATCH 53/79] Minor modif: test_4idx2 now provides three_e_4_idx_exch23_bi_ort

---
 src/bi_ort_ints/bi_ort_ints.irp.f | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index e64892d7..5e465d0f 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -27,7 +27,8 @@ subroutine test_5idx2
 end
 
 subroutine test_4idx2()
-  PROVIDE three_e_4_idx_direct_bi_ort 
+  !PROVIDE three_e_4_idx_direct_bi_ort 
+  PROVIDE three_e_4_idx_exch23_bi_ort
 end
 
 subroutine test_3e

From b03709020df45806e59d2dfc18303285d69696d4 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 4 Jun 2023 16:45:38 +0200
Subject: [PATCH 54/79] 5idx arrays need O(N^3) temp arrays

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |  34 +--
 src/bi_ort_ints/three_body_ijmkl.irp.f | 330 +++++++++++++------------
 2 files changed, 187 insertions(+), 177 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 42bbe315..5618a2cd 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -8,8 +8,8 @@ program bi_ort_ints
   my_n_pt_a_grid = 14
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 ! call test_3e
+  call test_5idx2
  call test_5idx
-!  call test_5idx2
 end
 
 subroutine test_5idx2
@@ -60,6 +60,8 @@ subroutine test_5idx
  k = 1
  n = 0
  accu = 0.d0
+ PROVIDE three_e_5_idx_direct_bi_ort_old
+
  do i = 1, mo_num
   do k = 1, mo_num
    do j = 1, mo_num
@@ -69,28 +71,28 @@ subroutine test_5idx
 !         stop
 !      endif
 
-!      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'direct'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+!
+!      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
 !      contrib = dabs(new - ref)
 !      accu += contrib
 !      if(contrib .gt. 1.d-10)then
-!       print*,'direct'
+!       print*,'exch12'
 !       print*,i,k,j,l,m
 !       print*,ref,new,contrib
 !       stop
 !      endif
 !
-      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'exch12'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
-
 !
 !      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
 !      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
@@ -135,7 +137,7 @@ subroutine test_5idx
 !       print*,ref,new,contrib
 !       stop
 !      endif
-!
+
      enddo
     enddo
    enddo
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 7b39235b..6e46637d 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -15,7 +15,7 @@ end
   !
   ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
   !
-  ! three_e_5_idx_direct_bi_ort(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_5_idx_direct_bi_ort(m,l,j,k,i) = <mlk|-L|mji> :: : notice that i is the RIGHT MO and k is the LEFT MO
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
@@ -23,210 +23,218 @@ end
   END_DOC
 
   implicit none
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision, allocatable :: grad_mli(:,:,:), orb_mat(:,:,:)
-  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
-  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:)
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  integer                        :: i, j, k, m, l
+  double precision               :: wall1, wall0
+  integer                        :: ipoint
+  double precision, allocatable  :: grad_mli(:,:), orb_mat(:,:,:)
+  double precision, allocatable  :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:)
+  double precision, allocatable  :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:)
+  double precision, allocatable  :: tmp_mat(:,:,:)
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
+  call print_memory_usage
   print *, ' Providing the three_e_5_idx_bi_ort ...'
   call wall_time(wall0)
 
- do m = 1, mo_num
+  three_e_5_idx_direct_bi_ort (:,:,:,:,:) = 0.d0
+  three_e_5_idx_cycle_1_bi_ort(:,:,:,:,:) = 0.d0
+  three_e_5_idx_cycle_2_bi_ort(:,:,:,:,:) = 0.d0
+  three_e_5_idx_exch23_bi_ort (:,:,:,:,:) = 0.d0
+  three_e_5_idx_exch13_bi_ort (:,:,:,:,:) = 0.d0
 
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  call print_memory_usage
+
+  allocate(tmp_mat(mo_num,mo_num,mo_num))
   allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         grad_mli, orb_mat)
- !$OMP DO COLLAPSE(2)
+
+  !$OMP PARALLEL DO PRIVATE (i,l,ipoint)
   do i=1,mo_num
     do l=1,mo_num
-       do ipoint=1, n_points_final_grid
+      do ipoint=1, n_points_final_grid
 
-         grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
-               int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
-               int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
-               int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+        orb_mat(ipoint,l,i) = final_weight_at_r_vector(ipoint)       &
+            * mos_l_in_r_array_transp(ipoint,l)                      &
+            * mos_r_in_r_array_transp(ipoint,i)
 
-         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
-
-       enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
-      orb_mat, n_points_final_grid,  &
-      grad_mli, n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
-        enddo
       enddo
     enddo
   enddo
   !$OMP END PARALLEL DO
 
-  deallocate(orb_mat,grad_mli)
+  tmp_mat = 0.d0
+  call print_memory_usage
+!
+  do m = 1, mo_num
 
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+    allocate(grad_mli(n_points_final_grid,mo_num))
 
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
+    do i=1,mo_num
+      !$OMP PARALLEL DO PRIVATE (l,ipoint)
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
 
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+          grad_mli(ipoint,l) =                                       &
+              int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) +&
+              int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) +&
+              int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
 
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
 
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+      call dgemm('T','N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0,&
+          orb_mat, n_points_final_grid,                              &
+          grad_mli, n_points_final_grid,  0.d0,                      &
+          tmp_mat, mo_num*mo_num)
 
-       enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
+      !$OMP PARALLEL PRIVATE(j,k,l)
+      !$OMP DO
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP DO
       do j = 1, mo_num
         do l = 1, mo_num
-            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
+          do k = 1, mo_num
+            three_e_5_idx_direct_bi_ort(m,k,i,l,j) = three_e_5_idx_direct_bi_ort(m,k,i,l,j) - tmp_mat(l,j,k)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+    enddo
+
+    deallocate(grad_mli)
+
+    allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+    allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+
+    !$OMP PARALLEL DO PRIVATE (i,l,ipoint)
+    do i=1,mo_num
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
+
+          lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+          lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+          lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
+          lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+          lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+          lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
+    !$OMP END PARALLEL DO
 
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
 
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,j,k)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = - tmp_mat(k,j,i,l)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = - tmp_mat(k,i,j,l)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = - tmp_mat(l,j,i,k)
+    allocate(rm_grad_ik(n_points_final_grid,3,mo_num))
+    allocate(rk_grad_im(n_points_final_grid,3,mo_num))
+
+    do i=1,mo_num
+      !$OMP PARALLEL DO PRIVATE (l,ipoint)
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
+
+          rm_grad_ik(ipoint,1,l) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+          rm_grad_ik(ipoint,2,l) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+          rm_grad_ik(ipoint,3,l) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+          rk_grad_im(ipoint,1,l) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+          rk_grad_im(ipoint,2,l) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+          rk_grad_im(ipoint,3,l) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+
         enddo
       enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
 
-  deallocate(lm_grad_ik)
+      call dgemm('T','N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0,&
+          lm_grad_ik, 3*n_points_final_grid,                         &
+          rm_grad_ik, 3*n_points_final_grid,  0.d0,                  &
+          tmp_mat, mo_num*mo_num)
 
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         lk_grad_mi)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
-
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-       enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lk_grad_mi, 3*n_points_final_grid,  &
-      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l,i)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k,i)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(k,i,l,j)
+      !$OMP PARALLEL DO PRIVATE(j,k,l)
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k)
+          enddo
         enddo
       enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
 
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lk_grad_mi, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
+      call dgemm('T','N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0,&
+          lm_grad_ik, 3*n_points_final_grid,                         &
+          rk_grad_im, 3*n_points_final_grid,  0.d0,                  &
+          tmp_mat, mo_num*mo_num)
 
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,j,i,k)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,i,j,l)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,j,i,l)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,i,j,k)
+      !$OMP PARALLEL DO PRIVATE(j,k,l)
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,i,k) = three_e_5_idx_cycle_1_bi_ort(m,l,j,i,k) - tmp_mat(l,k,j)
+            three_e_5_idx_cycle_2_bi_ort(m,i,j,k,l) = three_e_5_idx_cycle_2_bi_ort(m,i,j,k,l) - tmp_mat(k,j,l)
+            three_e_5_idx_exch23_bi_ort (m,i,j,k,l) = three_e_5_idx_exch23_bi_ort (m,i,j,k,l) - tmp_mat(k,l,j)
+            three_e_5_idx_exch13_bi_ort (m,l,j,i,k) = three_e_5_idx_exch13_bi_ort (m,l,j,i,k) - tmp_mat(l,j,k)
+          enddo
         enddo
       enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
+
+
+      call dgemm('T','N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0,&
+          lk_grad_mi, 3*n_points_final_grid,                         &
+          rm_grad_ik, 3*n_points_final_grid,  0.d0,                  &
+          tmp_mat, mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(j,k,l)
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l)
+            three_e_5_idx_cycle_2_bi_ort(m,l,i,k,j) = three_e_5_idx_cycle_2_bi_ort(m,l,i,k,j) - tmp_mat(l,j,k)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k)
+            three_e_5_idx_exch13_bi_ort (m,l,i,k,j) = three_e_5_idx_exch13_bi_ort (m,l,i,k,j) - tmp_mat(k,j,l)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      call dgemm('T','N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0,&
+          lk_grad_mi, 3*n_points_final_grid,                         &
+          rk_grad_im, 3*n_points_final_grid,  0.d0,                  &
+          tmp_mat, mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(j,k,l)
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,i,k) = three_e_5_idx_cycle_1_bi_ort(m,l,j,i,k) - tmp_mat(l,j,k)
+            three_e_5_idx_cycle_2_bi_ort(m,i,j,k,l) = three_e_5_idx_cycle_2_bi_ort(m,i,j,k,l) - tmp_mat(k,l,j)
+            three_e_5_idx_exch23_bi_ort (m,i,j,k,l) = three_e_5_idx_exch23_bi_ort (m,i,j,k,l) - tmp_mat(k,j,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,i,k) = three_e_5_idx_exch13_bi_ort (m,l,j,i,k) - tmp_mat(l,k,j)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+    enddo
+    deallocate(rm_grad_ik)
+    deallocate(rk_grad_im)
+    deallocate(lk_grad_mi)
+    deallocate(lm_grad_ik)
 
-  deallocate(lk_grad_mi)
-  deallocate(rm_grad_ik)
-  deallocate(rk_grad_im)
   enddo
 
+  deallocate(orb_mat)
+
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_bi_ort', wall1 - wall0
 

From b48e6b269d624ecfabcbb4895c75bd397646c0d8 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 01:33:55 +0200
Subject: [PATCH 55/79] start optim normal ordering

---
 src/tc_bi_ortho/normal_ordered.irp.f     | 438 ++++++++++++-----------
 src/tc_bi_ortho/normal_ordered_old.irp.f | 390 ++++++++++++++++++++
 2 files changed, 625 insertions(+), 203 deletions(-)
 create mode 100644 src/tc_bi_ortho/normal_ordered_old.irp.f

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index f8e310df..a092762b 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -14,7 +14,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   integer                        :: i, h1, p1, h2, p2
   integer                        :: hh1, hh2, pp1, pp2
   integer                        :: Ne(2)
-  double precision               :: hthree_aba, hthree_aaa, hthree_aab
+  double precision               :: hthree_aaa, hthree_aab
   double precision               :: wall0, wall1
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
@@ -39,57 +39,65 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
     if(core_tc_op) then
       do i = 1, N_int
-        key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-        key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+        key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
       enddo
-      call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+      call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
     else
-      call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+      call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
     endif
 
-    normal_two_body_bi_orth = 0.d0
+    ! opposite spin double excitations : s1 /= s2
+    normal_two_body_bi_orth(:,:,:,:) = no_aba_contraction(:,:,:,:)
 
-    !$OMP PARALLEL                                                                         &
-    !$OMP DEFAULT (NONE)                                                                   &
-    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+    !$OMP PARALLEL                                                             &
+    !$OMP DEFAULT (NONE)                                                       &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa) & 
     !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
+
       do pp1 = 1, n_act_orb
         p1 = list_act(pp1)
+
         do hh2 = 1, n_act_orb
           h2 = list_act(hh2) 
+
           do pp2 = 1, n_act_orb
             p2 = list_act(pp2)
+
             ! all contributions from the 3-e terms to the double excitations 
             ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
-      
-
-            ! opposite spin double excitations : s1 /= s2
-            call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
 
             ! same spin double excitations : s1 == s2 
-            if(h1<h2.and.p1.gt.p2)then
-             ! with opposite spin contributions 
-             call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-             ! same spin double excitations with same spin contributions 
-             if(Ne(2).ge.3)then
-               call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-             else
-               hthree_aaa = 0.d0
-             endif
-            else
-             ! with opposite spin contributions 
-             call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-             if(Ne(2).ge.3)then
+            if((h1 < h2) .and. (p1 > p2)) then
+
+              ! with opposite spin contributions 
+              call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+
               ! same spin double excitations with same spin contributions 
-               call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-             else
-               hthree_aaa = 0.d0
-             endif
+              if(Ne(2) .ge. 3) then
+                call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+              else
+                hthree_aaa = 0.d0
+              endif
+
+            else
+
+              ! with opposite spin contributions 
+              call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+
+              if(Ne(2) .ge. 3) then
+                ! same spin double excitations with same spin contributions 
+                call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+              else
+                hthree_aaa = 0.d0
+              endif
+
             endif
-            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+
+            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aab + hthree_aaa)
           enddo
         enddo
       enddo
@@ -116,178 +124,6 @@ END_PROVIDER
 
 ! ---
 
-subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer, intent(in)           :: Nint, h1, h2, p1, p2
-  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
-  double precision, intent(out) :: hthree
-  integer                       :: ii, i
-  double precision              :: int_direct, int_exc_12, int_exc_13, integral
-
-  !!!! double alpha/beta
-  hthree = 0.d0
-
-  do ii = 1, Ne(2) ! purely closed shell part 
-    i = occ(ii,2)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12 = -1.d0 * integral
-
-    hthree += 2.d0 * int_direct - 1.d0 * (int_exc_13 + int_exc_12)
-  enddo
-
-  do ii = Ne(2) + 1, Ne(1) ! purely open-shell part 
-    i = occ(ii,1)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12 = -1.d0 * integral
-
-    hthree += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
-  enddo
-
-  return
-end
-
-! ---
-
-BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  ! Normal ordered two-body sector of the three-body terms for opposite spin double excitations 
-  END_DOC
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: h1, p1, h2, p2, i
-  integer                        :: hh1, hh2, pp1, pp2
-  integer                        :: Ne(2)
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision               :: hthree
-
-  PROVIDE N_int
-
-  allocate( key_i_core(N_int,2) )
-  allocate( occ(N_int*bit_kind_size,2) )
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
-  endif
-
-  normal_two_body_bi_orth_ab = 0.d0
-  do hh1 = 1, n_act_orb
-    h1 = list_act(hh1) 
-    do pp1 = 1, n_act_orb
-      p1 = list_act(pp1)
-      do hh2 = 1, n_act_orb
-        h2 = list_act(hh2) 
-        do pp2 = 1, n_act_orb
-          p2 = list_act(pp2)
-          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
-
-          normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
-        enddo
-      enddo
-    enddo
-  enddo
-
-  deallocate( key_i_core )
-  deallocate( occ )
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_act_orb, n_act_orb, n_act_orb)]
-
-  BEGIN_DOC
-  ! Normal ordered two-body sector of the three-body terms for same spin double excitations 
-  END_DOC
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i,ii,j,h1,p1,h2,p2
-  integer                        :: hh1,hh2,pp1,pp2
-  integer                        :: Ne(2)
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision               :: hthree_aab, hthree_aaa
-
-  PROVIDE N_int
-
-  allocate( key_i_core(N_int,2) )
-  allocate( occ(N_int*bit_kind_size,2) )
-
-  if(core_tc_op)then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  normal_two_body_bi_orth_aa_bb = 0.d0
-  do hh1 = 1, n_act_orb
-    h1 = list_act(hh1) 
-    do pp1 = 1 , n_act_orb
-      p1 = list_act(pp1)
-      do hh2 = 1, n_act_orb
-        h2 = list_act(hh2) 
-        do pp2 = 1 , n_act_orb
-          p2 = list_act(pp2)
-          if(h1<h2.and.p1.gt.p2)then
-           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-           else
-             hthree_aaa = 0.d0
-           endif
-          else
-           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-           else
-             hthree_aaa = 0.d0
-           endif
-          endif
-          normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1) = hthree_aab + hthree_aaa
-        enddo
-      enddo
-    enddo
-  enddo
-
-  deallocate( key_i_core )
-  deallocate( occ )
-
-END_PROVIDER 
-
-! ---
-
 subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
   BEGIN_DOC
@@ -388,3 +224,199 @@ end
 
 ! ---
 
+BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  ! Opposite-spin (s1 /= s2) contraction of the three-body terms, indexed as
+  ! (p2,h2,p1,h1). The closed-shell part is accumulated with DGEMM over the grid
+  ! points; the result is scaled by -0.5 and symmetrized in (p2,h2) <-> (p1,h1).
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+
+  print*,' Providing no_aba_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+  allocate(tmpvec_2(n_points_final_grid,3))
+
+  no_aba_contraction = 0.d0 ! every term below is accumulated into the provider
+  ! purely closed shell part 
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+    ! to avoid tmp(N^4)
+    do h1 = 1, mo_num
+      ! to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      ! C = tmp_3d is a (mo_num*mo_num) x mo_num matrix, hence ldc = mo_num*mo_num
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0              &
+                , int2_grad1_u12_bimo_t, 3*n_points_final_grid, tmp1, 3*n_points_final_grid &
+                , 0.d0, tmp_3d, mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4)
+      do p1 = 1, mo_num
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        ! accumulate the mo_num x mo_num (p2,h2) block of the current (p1,h1): ldc = mo_num
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0                     &
+                  , mos_l_in_r_array_transp, n_points_final_grid, tmp2, n_points_final_grid &
+                  , 1.d0, no_aba_contraction(1,1,p1,h1), mo_num)
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  double precision :: integral, int_direct, int_exc_13, int_exc_12
+  ! TODO
+  ! purely open-shell part 
+  ! NOTE(review): h1, p1, h2, p2 are not set by any enclosing loop at this point,
+  ! so this branch is unfinished; it needs loops over the four indices before use.
+  if(Ne(2) < Ne(1)) then
+    do ii = Ne(2) + 1, Ne(1)
+      i = occ(ii,1)
+      call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+      int_direct = -1.d0 * integral
+      call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+      int_exc_13 = -1.d0 * integral
+      call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+      int_exc_12 = -1.d0 * integral
+      no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
+    enddo
+  endif
+
+  deallocate(tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+
+  ! -0.5*(A + A^T) in place: each (p2,h2)/(p1,h1) pair is visited once, so no element is reused after being overwritten
+  !$OMP PARALLEL DO PRIVATE(h1,h2,p1,p2)
+  do h1 = 1, mo_num
+    do p1 = 1, mo_num
+      do h2 = 1, h1
+        do p2 = 1, merge(mo_num, p1, h2 < h1)
+          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (no_aba_contraction(p2,h2,p1,h1) + no_aba_contraction(p1,h1,p2,h2))
+          no_aba_contraction(p1,h1,p2,h2) = no_aba_contraction(p2,h2,p1,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+END_PROVIDER
+
+! ---
+
+
diff --git a/src/tc_bi_ortho/normal_ordered_old.irp.f b/src/tc_bi_ortho/normal_ordered_old.irp.f
new file mode 100644
index 00000000..553cafdb
--- /dev/null
+++ b/src/tc_bi_ortho/normal_ordered_old.irp.f
@@ -0,0 +1,390 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC 
+  ! Normal ordering of the three-body interaction on the HF density. Entry (p2,h2,p1,h1) = 0.5 * (aba + aab + aaa contractions) for active orbitals, zero elsewhere.
+  END_DOC 
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+
+  integer                        :: i, h1, p1, h2, p2
+  integer                        :: hh1, hh2, pp1, pp2
+  integer                        :: Ne(2)
+  double precision               :: hthree_aba, hthree_aaa, hthree_aab
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+
+  print*,' Providing normal_two_body_bi_orth_old ...'
+  call wall_time(wall0)
+ 
+  PROVIDE N_int
+  ! Reload the tensor from the EZFIO work directory if read_tc_norm_ord is set, otherwise recompute it.
+  if(read_tc_norm_ord) then
+
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_old', action="read")
+      read(11) normal_two_body_bi_orth_old
+    close(11)
+
+  else
+    ! NOTE(review): N_int is already PROVIDEd above; this second PROVIDE looks redundant.
+    PROVIDE N_int
+    ! occ and Ne are filled by bitstring_to_list_ab below and passed on to the contraction routines.
+    allocate( occ(N_int*bit_kind_size,2) )
+    allocate( key_i_core(N_int,2) )
+    ! With core_tc_op the occupations come from ref_bitmask XOR core_bitmask instead of ref_bitmask itself.
+    if(core_tc_op) then
+      do i = 1, N_int
+        key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+      enddo
+      call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+    else
+      call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    endif
+
+    normal_two_body_bi_orth_old = 0.d0
+
+    !$OMP PARALLEL                                                                         &
+    !$OMP DEFAULT (NONE)                                                                   &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth_old)
+    !$OMP DO SCHEDULE (static) 
+    do hh1 = 1, n_act_orb
+      h1 = list_act(hh1) 
+      do pp1 = 1, n_act_orb
+        p1 = list_act(pp1)
+        do hh2 = 1, n_act_orb
+          h2 = list_act(hh2) 
+          do pp2 = 1, n_act_orb
+            p2 = list_act(pp2)
+            ! all contributions from the 3-e terms to the double excitations 
+            ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
+      
+
+            ! opposite spin double excitations : s1 /= s2
+            call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
+
+            ! same spin double excitations : s1 == s2 (h1 and h2 are swapped in the calls when h1 < h2 and p1 > p2)
+            if(h1<h2.and.p1.gt.p2)then
+             ! with opposite spin contributions 
+             call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+             ! same spin double excitations with same spin contributions (skipped when Ne(2) < 3)
+             if(Ne(2).ge.3)then
+               call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+             else
+               hthree_aaa = 0.d0
+             endif
+            else
+             ! with opposite spin contributions 
+             call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+             if(Ne(2).ge.3)then
+              ! same spin double excitations with same spin contributions 
+               call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+             else
+               hthree_aaa = 0.d0
+             endif
+            endif
+            normal_two_body_bi_orth_old(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    deallocate( occ )
+    deallocate( key_i_core )
+  endif
+  ! Dump the tensor to the EZFIO work directory (MPI master only) when write_tc_norm_ord is set.
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_old', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth_old
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read') ! NOTE(review): presumably sets the io_tc_integ keyword; confirm this is the intended flag
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for normal_two_body_bi_orth_old ', wall1-wall0
+
+END_PROVIDER 
+
+! ---
+
+subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+  ! Returns in hthree the sum over occupied orbitals i of direct and exchange 3-body integrals for the pair (h1->p1, h2->p2).
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer, intent(in)           :: Nint, h1, h2, p1, p2
+  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+  double precision, intent(out) :: hthree
+  integer                       :: ii, i
+  double precision              :: int_direct, int_exc_12, int_exc_13, integral
+  ! Nint only sizes occ; entries 1..Ne(2) of occ(:,2) and entries Ne(2)+1..Ne(1) of occ(:,1) are visited.
+  !!!! double alpha/beta
+  hthree = 0.d0
+
+  do ii = 1, Ne(2) ! purely closed shell part 
+    i = occ(ii,2)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    hthree += 2.d0 * int_direct - 1.d0 * (int_exc_13 + int_exc_12)
+  enddo
+  ! Same three integrals as above with half the weights; this loop is empty unless Ne(1) > Ne(2).
+  do ii = Ne(2) + 1, Ne(1) ! purely open-shell part 
+    i = occ(ii,1)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    hthree += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
+  enddo
+
+  return
+end
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  ! Normal-ordered two-body sector of the three-body terms for opposite-spin double excitations: entry (p2,h2,p1,h1) holds the give_aba_contraction result for active orbitals, zero elsewhere.
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: h1, p1, h2, p2, i
+  integer                        :: hh1, hh2, pp1, pp2
+  integer                        :: Ne(2)
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision               :: hthree
+
+  PROVIDE N_int
+
+  allocate( key_i_core(N_int,2) )
+  allocate( occ(N_int*bit_kind_size,2) )
+  ! With core_tc_op the occupations come from ref_bitmask XOR core_bitmask instead of ref_bitmask itself.
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+  endif
+  ! Serial loop over every active (h1,p1,h2,p2) quadruplet; entries involving non-active orbitals keep the zero set here.
+  normal_two_body_bi_orth_ab = 0.d0
+  do hh1 = 1, n_act_orb
+    h1 = list_act(hh1) 
+    do pp1 = 1, n_act_orb
+      p1 = list_act(pp1)
+      do hh2 = 1, n_act_orb
+        h2 = list_act(hh2) 
+        do pp2 = 1, n_act_orb
+          p2 = list_act(pp2)
+          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
+
+          normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
+        enddo
+      enddo
+    enddo
+  enddo
+
+  deallocate( key_i_core )
+  deallocate( occ )
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_act_orb, n_act_orb, n_act_orb)]
+
+  BEGIN_DOC
+  ! Normal-ordered two-body sector of the three-body terms for same-spin double excitations: entry (p2,h2,p1,h1) = aab contraction + aaa contraction.
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i,ii,j,h1,p1,h2,p2
+  integer                        :: hh1,hh2,pp1,pp2
+  integer                        :: Ne(2)
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision               :: hthree_aab, hthree_aaa
+
+  PROVIDE N_int
+
+  allocate( key_i_core(N_int,2) )
+  allocate( occ(N_int*bit_kind_size,2) )
+  ! With core_tc_op the occupations come from ref_bitmask XOR core_bitmask instead of ref_bitmask itself.
+  if(core_tc_op)then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+  ! NOTE(review): the array is dimensioned n_act_orb but indexed below with list_act(...) values; confirm these never exceed n_act_orb.
+  normal_two_body_bi_orth_aa_bb = 0.d0
+  do hh1 = 1, n_act_orb
+    h1 = list_act(hh1) 
+    do pp1 = 1 , n_act_orb
+      p1 = list_act(pp1)
+      do hh2 = 1, n_act_orb
+        h2 = list_act(hh2) 
+        do pp2 = 1 , n_act_orb
+          p2 = list_act(pp2)
+          if(h1<h2.and.p1.gt.p2)then
+           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+           if(Ne(2).ge.3)then
+             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+           else
+             hthree_aaa = 0.d0
+           endif
+          else
+           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+           if(Ne(2).ge.3)then
+             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+           else
+             hthree_aaa = 0.d0
+           endif
+          endif
+          normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1) = hthree_aab + hthree_aaa
+        enddo
+      enddo
+    enddo
+  enddo
+
+  deallocate( key_i_core )
+  deallocate( occ )
+
+END_PROVIDER 
+
+! ---
+
+subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+
+  BEGIN_DOC
+  ! Pure same-spin contribution to the same-spin double excitation s1=(h1,p1), s2=(h2,p2), with s1==s2; the result is returned in hthree.
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer, intent(in)           :: Nint, h1, h2, p1, p2
+  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+  double precision, intent(out) :: hthree
+  integer                       :: ii,i
+  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
+  double precision              :: integral,int_exc_l,int_exc_ll
+  ! The first three arguments run over the six orderings of (i,p2,p1): direct, two cyclic shifts (exc_l, exc_ll) added, three pair swaps (exc_12/13/23) subtracted.
+  hthree = 0.d0
+  do ii = 1, Ne(2) ! purely closed shell part 
+    i = occ(ii,2)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
+    int_exc_l = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+    int_exc_ll= -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12= -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13= -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+    int_exc_23= -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
+  enddo
+  ! Same six integrals for occ(Ne(2)+1..Ne(1),1): the direct term keeps weight 1, every exchange term is halved.
+  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
+    i = occ(ii,1)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
+    int_exc_l = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+    int_exc_ll = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+    int_exc_23 = -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+  enddo
+
+  return
+end
+
+! ---
+
+subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+  ! Returns in hthree, summed over i = occ(1..Ne(2),2), a direct minus one exchange 3-body integral (labelled "opposite spin contributions" at the call sites).
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer,          intent(in)  :: Nint, h1, h2, p1, p2
+  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
+  double precision, intent(out) :: hthree
+  integer                       :: ii, i
+  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
+  double precision              :: integral, int_exc_l, int_exc_ll
+  ! Only a closed-shell loop exists here; int_exc_12, int_exc_13, int_exc_l and int_exc_ll are declared but never used.
+  hthree = 0.d0
+  do ii = 1, Ne(2) ! purely closed shell part 
+    i = occ(ii,2)
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
+    int_exc_23= -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct - int_exc_23
+  enddo
+
+  return
+end
+
+! ---
+

From 471283634919dd134e294aa71c0bac0a37d4872c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 11:17:08 +0200
Subject: [PATCH 56/79]  normal ordering: aba-DGEMM OK

---
 src/tc_bi_ortho/normal_ordered.irp.f   | 495 ++++++++++++++++++-------
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f |  48 ++-
 2 files changed, 411 insertions(+), 132 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index a092762b..59e78b92 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -22,8 +22,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   print*,' Providing normal_two_body_bi_orth ...'
   call wall_time(wall0)
  
-  PROVIDE N_int
-
   if(read_tc_norm_ord) then
 
     open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
@@ -48,12 +46,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
     endif
 
     ! opposite spin double excitations : s1 /= s2
-    normal_two_body_bi_orth(:,:,:,:) = no_aba_contraction(:,:,:,:)
+    PROVIDE no_aba_contraction
 
-    !$OMP PARALLEL                                                             &
-    !$OMP DEFAULT (NONE)                                                       &
-    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa) & 
-    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
+    !$OMP PARALLEL                                                              &
+    !$OMP DEFAULT (NONE)                                                        &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
+    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
+    !$OMP         no_aba_contraction)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
@@ -97,7 +96,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
             endif
 
-            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aab + hthree_aaa)
+            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + 0.5d0*(hthree_aab + hthree_aaa)
           enddo
         enddo
       enddo
@@ -124,103 +123,103 @@ END_PROVIDER
 
 ! ---
 
-subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-
-  BEGIN_DOC
-  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
-  END_DOC
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer, intent(in)           :: Nint, h1, h2, p1, p2
-  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
-  double precision, intent(out) :: hthree
-  integer                       :: ii,i
-  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
-  double precision              :: integral,int_exc_l,int_exc_ll
-
-  hthree = 0.d0
-  do ii = 1, Ne(2) ! purely closed shell part 
-    i = occ(ii,2)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
-    int_exc_l = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-    int_exc_ll= -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12= -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13= -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-    int_exc_23= -1.d0 * integral
-
-    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
-  enddo
-
-  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
-    i = occ(ii,1)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
-    int_exc_l = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-    int_exc_ll = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-    int_exc_23 = -1.d0 * integral
-
-    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
-  enddo
-
-  return
-end
+!subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+!
+!  BEGIN_DOC
+!  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
+!  END_DOC
+!
+!  use bitmasks ! you need to include the bitmasks_module.f90 features
+!
+!  implicit none
+!  integer, intent(in)           :: Nint, h1, h2, p1, p2
+!  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+!  double precision, intent(out) :: hthree
+!  integer                       :: ii,i
+!  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
+!  double precision              :: integral,int_exc_l,int_exc_ll
+!
+!  hthree = 0.d0
+!  do ii = 1, Ne(2) ! purely closed shell part 
+!    i = occ(ii,2)
+!
+!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!    int_direct = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
+!    int_exc_l = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+!    int_exc_ll= -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!    int_exc_12= -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!    int_exc_13= -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+!    int_exc_23= -1.d0 * integral
+!
+!    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
+!  enddo
+!
+!  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
+!    i = occ(ii,1)
+!
+!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!    int_direct = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
+!    int_exc_l = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+!    int_exc_ll = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!    int_exc_12 = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!    int_exc_13 = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+!    int_exc_23 = -1.d0 * integral
+!
+!    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+!  enddo
+!
+!  return
+!end
 
 ! ---
 
-subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer,          intent(in)  :: Nint, h1, h2, p1, p2
-  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
-  double precision, intent(out) :: hthree
-  integer                       :: ii, i
-  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
-  double precision              :: integral, int_exc_l, int_exc_ll
-
-  hthree = 0.d0
-  do ii = 1, Ne(2) ! purely closed shell part 
-    i = occ(ii,2)
-
-    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
-    int_exc_23= -1.d0 * integral
-
-    hthree +=  1.d0 * int_direct - int_exc_23
-  enddo
-
-  return
-end
+!subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+!
+!  use bitmasks ! you need to include the bitmasks_module.f90 features
+!
+!  implicit none
+!  integer,          intent(in)  :: Nint, h1, h2, p1, p2
+!  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
+!  double precision, intent(out) :: hthree
+!  integer                       :: ii, i
+!  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
+!  double precision              :: integral, int_exc_l, int_exc_ll
+!
+!  hthree = 0.d0
+!  do ii = 1, Ne(2) ! purely closed shell part 
+!    i = occ(ii,2)
+!
+!    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
+!    int_direct = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
+!    int_exc_23= -1.d0 * integral
+!
+!    hthree +=  1.d0 * int_direct - int_exc_23
+!  enddo
+!
+!  return
+!end
 
 ! ---
 
@@ -264,6 +263,10 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   allocate(tmpvec_1(n_points_final_grid,3))
   allocate(tmpvec_2(n_points_final_grid,3))
 
+  double precision, allocatable :: tmp_2d(:,:)
+  allocate(tmp_2d(mo_num,mo_num))
+
+
   ! purely closed shell part 
   do ii = 1, Ne(2)
     i = occ(ii,2)
@@ -313,9 +316,10 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       !$OMP END DO
       !$OMP END PARALLEL
 
-      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0              &
-                , int2_grad1_u12_bimo_t, 3*n_points_final_grid, tmp1, 3*n_points_final_grid &
-                , 0.d0, tmp_3d, mo_num)
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
       !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
       do p1 = 1, mo_num
@@ -364,38 +368,163 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP END DO
         !$OMP END PARALLEL
 
-        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0                     &
-                  , mos_l_in_r_array_transp, n_points_final_grid, tmp2, n_points_final_grid &
-                  , 1.d0, no_aba_contraction(p2,h2,1,1), mo_num*mo_num)
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
 
       enddo ! p1
     enddo ! h1
   enddo ! i
 
 
-  double precision :: integral, int_direct, int_exc_13, int_exc_12
 
-  ! TODO
+
+
+
+
+
+
   ! purely open-shell part 
   if(Ne(2) < Ne(1)) then
-
     do ii = Ne(2) + 1, Ne(1)
       i = occ(ii,1)
 
-      call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-      int_direct = -1.d0 * integral
+      do h1 = 1, mo_num
 
-      call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-      int_exc_13 = -1.d0 * integral
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
 
-      call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-      int_exc_12 = -1.d0 * integral
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
 
-      no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
-    enddo
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        do p1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                    , tmp2(1,1), n_points_final_grid                       &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo !i
   endif
 
-  ! ---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
   deallocate(tmp_3d)
   deallocate(tmp1, tmp2)
@@ -403,17 +532,121 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpvec_1, tmpvec_2)
 
 
-  !$OMP PARALLEL DO PRIVATE(h1,h2,p1,p2)
-  do h1 = 1, mo_num
-    do p1 = 1, mo_num
-      do h2 = 1, mo_num
-        do p2 = 1, mo_num
-          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (no_aba_contraction(p2,h2,p1,h1) + no_aba_contraction(p1,h1,p2,h2))
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
+
+
+
+
+
+
+
+  no_aba_contraction = -0.5d0 * no_aba_contraction
+  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+
+!  do h1 = 1, mo_num
+!    do p1 = 1, mo_num
+!      do h2 = 1, mo_num
+!        do p2 = 1, mo_num
+!          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (tmp_4d(p2,h2,p1,h1) + tmp_4d(p1,h1,p2,h2))
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+
+
+  ! ---
+
+  double precision :: integral, int_direct, int_exc_13, int_exc_12
+
+!  no_aba_contraction = 0.d0
+!
+!  ! purely closed shell part 
+!  do ii = 1, Ne(2)
+!    i = occ(ii,1)
+!
+!    !$OMP PARALLEL                                                               &
+!    !$OMP DEFAULT (NONE)                                                         &
+!    !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
+!    !$OMP SHARED (mo_num, i, no_aba_contraction)
+!    !$OMP DO SCHEDULE (static) 
+!    do h1 = 1, mo_num
+!      do p1 = 1, mo_num
+!        do h2 = 1, mo_num
+!          do p2 = 1, mo_num
+!
+!            call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!            int_direct = -1.d0 * integral
+!
+!            call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!            int_exc_13 = -1.d0 * integral
+!
+!            call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!            int_exc_12 = -1.d0 * integral
+!
+!            !no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
+!          enddo
+!        enddo
+!      enddo
+!    enddo
+!    !$OMP END DO
+!    !$OMP END PARALLEL
+!  enddo
+
+!  ! purely open-shell part 
+!  if(Ne(2) < Ne(1)) then
+!
+!    do ii = Ne(2) + 1, Ne(1)
+!      i = occ(ii,1)
+!
+!      !$OMP PARALLEL                                                               &
+!      !$OMP DEFAULT (NONE)                                                         &
+!      !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
+!      !$OMP SHARED (mo_num, i, no_aba_contraction)
+!      !$OMP DO SCHEDULE (static) 
+!      do h1 = 1, mo_num
+!        do p1 = 1, mo_num
+!          do h2 = 1, mo_num
+!            do p2 = 1, mo_num
+!
+!              call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!              int_direct = -1.d0 * integral
+!           
+!              call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!              int_exc_13 = -1.d0 * integral
+!           
+!              call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!              int_exc_12 = -1.d0 * integral
+!           
+!              no_aba_contraction(p2,h2,p1,h1) += 0.5d0 * int_direct - 0.25d0 * (int_exc_13 + int_exc_12)
+!            enddo
+!          enddo
+!        enddo
+!      enddo
+!      !$OMP END DO
+!      !$OMP END PARALLEL
+!    enddo
+!  endif
+
+  ! ---
+
+!  !$OMP PARALLEL                                         &
+!  !$OMP DEFAULT (NONE)                                   &
+!  !$OMP PRIVATE (h1, h2, p1, p2, integral)               & 
+!  !$OMP SHARED (mo_num, N_int,Ne, occ, no_aba_contraction)
+!  !$OMP DO SCHEDULE (static) 
+!  do h1 = 1, mo_num
+!    do p1 = 1, mo_num
+!      do h2 = 1, mo_num
+!        do p2 = 1, mo_num
+!          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, integral)
+!          no_aba_contraction(p2,h2,p1,h1) = 0.5d0 * integral
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END DO
+!  !$OMP END PARALLEL
+
+
 
 END_PROVIDER
 
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index df86ea65..33b5c5aa 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -11,12 +11,14 @@ program tc_bi_ortho
   touch read_wf
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
- call test_h_u0
+! call test_h_u0
 ! call test_slater_tc_opt
 ! call timing_tot
 ! call timing_diag
 ! call timing_single
 ! call timing_double
+
+  call test_no()
 end
 
 subroutine test_h_u0
@@ -252,3 +254,47 @@ subroutine timing_double
 
 end
 
+! ---
+
+subroutine test_no()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  print*, ' testing normal_two_body_bi_orth ...'
+
+  thr = 1d-8
+
+  PROVIDE normal_two_body_bi_orth_old
+  PROVIDE normal_two_body_bi_orth
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = normal_two_body_bi_orth    (l,k,j,i)
+          ref = normal_two_body_bi_orth_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on normal_two_body_bi_orth'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on normal_two_body_bi_orth = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---
+
+

From 3a5dd05d7eb61dc21b1ec16eba330e3687b54001 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 15:13:18 +0200
Subject: [PATCH 57/79] NO: working on AAB contractions

---
 src/tc_bi_ortho/normal_ordered.irp.f | 428 +++++++++++----------------
 src/utils/util.irp.f                 |  37 ++-
 2 files changed, 217 insertions(+), 248 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 59e78b92..b3c413d3 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -45,14 +45,14 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
       call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
     endif
 
-    ! opposite spin double excitations : s1 /= s2
     PROVIDE no_aba_contraction
+    PROVIDE no_aab_contraction
 
     !$OMP PARALLEL                                                              &
     !$OMP DEFAULT (NONE)                                                        &
     !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
     !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
-    !$OMP         no_aba_contraction)
+    !$OMP         no_aba_contraction,no_aab_contraction)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
@@ -72,9 +72,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
             ! same spin double excitations : s1 == s2 
             if((h1 < h2) .and. (p1 > p2)) then
 
-              ! with opposite spin contributions 
-              call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-
               ! same spin double excitations with same spin contributions 
               if(Ne(2) .ge. 3) then
                 call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
@@ -84,9 +81,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
             else
 
-              ! with opposite spin contributions 
-              call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-
               if(Ne(2) .ge. 3) then
                 ! same spin double excitations with same spin contributions 
                 call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
@@ -96,7 +90,9 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
             endif
 
-            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + 0.5d0*(hthree_aab + hthree_aaa)
+            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) &
+                                                 + no_aab_contraction(p2,h2,p1,h1) &
+                                                 + 0.5d0 * hthree_aaa
           enddo
         enddo
       enddo
@@ -123,106 +119,6 @@ END_PROVIDER
 
 ! ---
 
-!subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-!
-!  BEGIN_DOC
-!  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
-!  END_DOC
-!
-!  use bitmasks ! you need to include the bitmasks_module.f90 features
-!
-!  implicit none
-!  integer, intent(in)           :: Nint, h1, h2, p1, p2
-!  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
-!  double precision, intent(out) :: hthree
-!  integer                       :: ii,i
-!  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
-!  double precision              :: integral,int_exc_l,int_exc_ll
-!
-!  hthree = 0.d0
-!  do ii = 1, Ne(2) ! purely closed shell part 
-!    i = occ(ii,2)
-!
-!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!    int_direct = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
-!    int_exc_l = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-!    int_exc_ll= -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!    int_exc_12= -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!    int_exc_13= -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-!    int_exc_23= -1.d0 * integral
-!
-!    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
-!  enddo
-!
-!  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
-!    i = occ(ii,1)
-!
-!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!    int_direct = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
-!    int_exc_l = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-!    int_exc_ll = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!    int_exc_12 = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!    int_exc_13 = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-!    int_exc_23 = -1.d0 * integral
-!
-!    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
-!  enddo
-!
-!  return
-!end
-
-! ---
-
-!subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-!
-!  use bitmasks ! you need to include the bitmasks_module.f90 features
-!
-!  implicit none
-!  integer,          intent(in)  :: Nint, h1, h2, p1, p2
-!  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
-!  double precision, intent(out) :: hthree
-!  integer                       :: ii, i
-!  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
-!  double precision              :: integral, int_exc_l, int_exc_ll
-!
-!  hthree = 0.d0
-!  do ii = 1, Ne(2) ! purely closed shell part 
-!    i = occ(ii,2)
-!
-!    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
-!    int_direct = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
-!    int_exc_23= -1.d0 * integral
-!
-!    hthree +=  1.d0 * int_direct - int_exc_23
-!  enddo
-!
-!  return
-!end
-
-! ---
-
 BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
@@ -236,6 +132,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmp_3d(:,:,:)
   double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
   double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
 
   print*,' Providing no_aba_contraction ...'
   call wall_time(wall0)
@@ -262,8 +159,6 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   allocate(tmpval_2(n_points_final_grid))
   allocate(tmpvec_1(n_points_final_grid,3))
   allocate(tmpvec_2(n_points_final_grid,3))
-
-  double precision, allocatable :: tmp_2d(:,:)
   allocate(tmp_2d(mo_num,mo_num))
 
 
@@ -386,13 +281,6 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   enddo ! i
 
 
-
-
-
-
-
-
-
   ! purely open-shell part 
   if(Ne(2) < Ne(1)) then
     do ii = Ne(2) + 1, Ne(1)
@@ -510,146 +398,192 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
     enddo !i
   endif
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
   deallocate(tmp_3d)
   deallocate(tmp1, tmp2)
   deallocate(tmpval_1, tmpval_2)
   deallocate(tmpvec_1, tmpvec_2)
 
-
-
-
-
-
-
-
-
   no_aba_contraction = -0.5d0 * no_aba_contraction
   call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
 
-!  do h1 = 1, mo_num
-!    do p1 = 1, mo_num
-!      do h2 = 1, mo_num
-!        do p2 = 1, mo_num
-!          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (tmp_4d(p2,h2,p1,h1) + tmp_4d(p1,h1,p2,h2))
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-
-
-  ! ---
-
-  double precision :: integral, int_direct, int_exc_13, int_exc_12
-
-!  no_aba_contraction = 0.d0
-!
-!  ! purely closed shell part 
-!  do ii = 1, Ne(2)
-!    i = occ(ii,1)
-!
-!    !$OMP PARALLEL                                                               &
-!    !$OMP DEFAULT (NONE)                                                         &
-!    !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
-!    !$OMP SHARED (mo_num, i, no_aba_contraction)
-!    !$OMP DO SCHEDULE (static) 
-!    do h1 = 1, mo_num
-!      do p1 = 1, mo_num
-!        do h2 = 1, mo_num
-!          do p2 = 1, mo_num
-!
-!            call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!            int_direct = -1.d0 * integral
-!
-!            call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!            int_exc_13 = -1.d0 * integral
-!
-!            call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!            int_exc_12 = -1.d0 * integral
-!
-!            !no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
-!          enddo
-!        enddo
-!      enddo
-!    enddo
-!    !$OMP END DO
-!    !$OMP END PARALLEL
-!  enddo
-
-!  ! purely open-shell part 
-!  if(Ne(2) < Ne(1)) then
-!
-!    do ii = Ne(2) + 1, Ne(1)
-!      i = occ(ii,1)
-!
-!      !$OMP PARALLEL                                                               &
-!      !$OMP DEFAULT (NONE)                                                         &
-!      !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
-!      !$OMP SHARED (mo_num, i, no_aba_contraction)
-!      !$OMP DO SCHEDULE (static) 
-!      do h1 = 1, mo_num
-!        do p1 = 1, mo_num
-!          do h2 = 1, mo_num
-!            do p2 = 1, mo_num
-!
-!              call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!              int_direct = -1.d0 * integral
-!           
-!              call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!              int_exc_13 = -1.d0 * integral
-!           
-!              call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!              int_exc_12 = -1.d0 * integral
-!           
-!              no_aba_contraction(p2,h2,p1,h1) += 0.5d0 * int_direct - 0.25d0 * (int_exc_13 + int_exc_12)
-!            enddo
-!          enddo
-!        enddo
-!      enddo
-!      !$OMP END DO
-!      !$OMP END PARALLEL
-!    enddo
-!  endif
-
-  ! ---
-
-!  !$OMP PARALLEL                                         &
-!  !$OMP DEFAULT (NONE)                                   &
-!  !$OMP PRIVATE (h1, h2, p1, p2, integral)               & 
-!  !$OMP SHARED (mo_num, N_int,Ne, occ, no_aba_contraction)
-!  !$OMP DO SCHEDULE (static) 
-!  do h1 = 1, mo_num
-!    do p1 = 1, mo_num
-!      do h2 = 1, mo_num
-!        do p2 = 1, mo_num
-!          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, integral)
-!          no_aba_contraction(p2,h2,p1,h1) = 0.5d0 * integral
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
-
+  call wall_time(wall1)
+  print*,' Wall time for no_aba_contraction', wall1-wall0
 
 END_PROVIDER
 
 ! ---
 
+BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
 
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aab_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+  allocate(tmp_2d(mo_num,mo_num))
+
+
+  ! purely closed shell part 
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    ! to avoid tmp(N^4)
+    do h1 = 1, mo_num
+
+      ! to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpvec_1)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpvec_1, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4)
+      do p1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  deallocate(tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1)
+  deallocate(tmpvec_1)
+
+  no_aab_contraction = 0.5d0 * no_aab_contraction
+  call sub_A_At(no_aab_contraction(1,1,1,1), mo_num*mo_num)
+
+  do h1 = 1, mo_num-1
+    do h2 = h1+1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+        enddo
+      enddo
+    enddo
+  enddo
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aab_contraction', wall1-wall0
+
+
+END_PROVIDER
+
+! ---
diff --git a/src/utils/util.irp.f b/src/utils/util.irp.f
index aba99c2b..a9f1a438 100644
--- a/src/utils/util.irp.f
+++ b/src/utils/util.irp.f
@@ -490,7 +490,7 @@ end subroutine check_sym
 subroutine sum_A_At(A, N)
 
   !BEGIN_DOC
-  ! useful for symmetrizing a tensor without a temporary tensor
+  ! add a tensor with its transpose without a temporary tensor
   !END_DOC
 
   implicit none
@@ -521,3 +521,38 @@ subroutine sum_A_At(A, N)
 
 end
 
+! ---
+
+subroutine sub_A_At(A, N)
+
+  !BEGIN_DOC
+  ! subtract the transpose of a tensor from the tensor itself (A <- A - A^T), in place, without a temporary tensor
+  !END_DOC
+
+  implicit none
+  integer,          intent(in)    :: N
+  double precision, intent(inout) :: A(N,N)
+  integer                         :: i, j
+
+ !$OMP PARALLEL       &
+ !$OMP DEFAULT (NONE) &
+ !$OMP PRIVATE (i, j) & 
+ !$OMP SHARED (A, N)
+ !$OMP DO 
+  do j = 1, N
+    do i = j, N
+      A(i,j) -= A(j,i)
+    enddo
+  enddo
+ !$OMP END DO
+
+ !$OMP DO 
+  do j = 2, N
+    do i = 1, j-1
+      A(i,j) = -A(j,i)
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+end

From b0da0ac04d49b3fbbbe0eb9649b0b6da87cce6d2 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 16:08:46 +0200
Subject: [PATCH 58/79] normal ordering: aab-DGEMM OK

---
 src/tc_bi_ortho/normal_ordered.irp.f | 73 +++++++++++++++++++---------
 1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index b3c413d3..3a1e79fd 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -230,11 +230,11 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
 
         ! to minimize the number of operations
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT (NONE)                                            &
-        !$OMP PRIVATE (ipoint)                                          &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
         !$OMP         tmpval_1)
         !$OMP DO
         do ipoint = 1, n_points_final_grid
@@ -398,7 +398,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
     enddo !i
   endif
 
-  deallocate(tmp_3d)
+  deallocate(tmp_2d, tmp_3d)
   deallocate(tmp1, tmp2)
   deallocate(tmpval_1, tmpval_2)
   deallocate(tmpvec_1, tmpvec_2)
@@ -446,12 +446,12 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
     call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
   endif
 
+  allocate(tmp_2d(mo_num,mo_num))
   allocate(tmp_3d(mo_num,mo_num,mo_num))
   allocate(tmp1(n_points_final_grid,3,mo_num))
   allocate(tmp2(n_points_final_grid,mo_num))
   allocate(tmpval_1(n_points_final_grid))
   allocate(tmpvec_1(n_points_final_grid,3))
-  allocate(tmp_2d(mo_num,mo_num))
 
 
   ! purely closed shell part 
@@ -471,10 +471,10 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       !$OMP         tmpval_1, tmpvec_1)
       !$OMP DO
       do ipoint = 1, n_points_final_grid
-        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
-        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
       enddo
       !$OMP END DO
       !$OMP END PARALLEL
@@ -515,17 +515,17 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
 
         ! to minimize the number of operations
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT (NONE)                                            &
-        !$OMP PRIVATE (ipoint)                                          &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
         !$OMP         tmpval_1)
         !$OMP DO
         do ipoint = 1, n_points_final_grid
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
         enddo
         !$OMP END DO
         !$OMP END PARALLEL
@@ -567,9 +567,38 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpval_1)
   deallocate(tmpvec_1)
 
-  no_aab_contraction = 0.5d0 * no_aab_contraction
-  call sub_A_At(no_aab_contraction(1,1,1,1), mo_num*mo_num)
+  no_aab_contraction = -0.5d0 * no_aab_contraction
 
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (h1, h2, p1, p2) & 
+  !$OMP SHARED (no_aab_contraction, mo_num)
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 1, mo_num
+        do p2 = p1, mo_num
+          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
   do h1 = 1, mo_num-1
     do h2 = h1+1, mo_num
       do p1 = 2, mo_num
@@ -579,11 +608,11 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       enddo
     enddo
   enddo
+  !$OMP END PARALLEL
 
   call wall_time(wall1)
   print*,' Wall time for no_aab_contraction', wall1-wall0
 
-
 END_PROVIDER
 
 ! ---

From aafca191f1fe271575f80f5d16eb80587290213c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 20:59:34 +0200
Subject: [PATCH 59/79] normal ordering: aaa-DGEMM OK

---
 src/tc_bi_ortho/normal_ordered.irp.f     | 593 +++++++++++++++++++++--
 src/tc_bi_ortho/normal_ordered_old.irp.f |   4 +-
 2 files changed, 568 insertions(+), 29 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 3a1e79fd..fea229c9 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -47,12 +47,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
     PROVIDE no_aba_contraction
     PROVIDE no_aab_contraction
+    PROVIDE no_aaa_contraction
 
     !$OMP PARALLEL                                                              &
     !$OMP DEFAULT (NONE)                                                        &
     !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
     !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
-    !$OMP         no_aba_contraction,no_aab_contraction)
+    !$OMP         no_aba_contraction, no_aab_contraction, no_aaa_contraction)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
@@ -66,33 +67,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
           do pp2 = 1, n_act_orb
             p2 = list_act(pp2)
 
-            ! all contributions from the 3-e terms to the double excitations 
-            ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
-
-            ! same spin double excitations : s1 == s2 
-            if((h1 < h2) .and. (p1 > p2)) then
-
-              ! same spin double excitations with same spin contributions 
-              if(Ne(2) .ge. 3) then
-                call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-              else
-                hthree_aaa = 0.d0
-              endif
-
-            else
-
-              if(Ne(2) .ge. 3) then
-                ! same spin double excitations with same spin contributions 
-                call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-              else
-                hthree_aaa = 0.d0
-              endif
-
-            endif
-
-            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) &
-                                                 + no_aab_contraction(p2,h2,p1,h1) &
-                                                 + 0.5d0 * hthree_aaa
+            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + no_aab_contraction(p2,h2,p1,h1) + no_aaa_contraction(p2,h2,p1,h1)
           enddo
         enddo
       enddo
@@ -616,3 +591,565 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
 END_PROVIDER
 
 ! ---
+
+BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  !
+  ! if:
+  !    h1 < h2
+  !    p1 > p2
+  !
+  !   no_aaa_contraction(p2,h2,p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
+  !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! else:
+  !
+  !   no_aaa_contraction(p2,h2,p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! 
+  ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
+  ! J(p2,h2,p1,h1) = \sum_i [ <  i p2 p1 | i h2 h1 >
+  !                         + < p2 p1  i | i h2 h1 >
+  !                         + < p1  i p2 | i h2 h1 > ]
+  !
+  !
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+
+  print*,' Providing no_aaa_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  if(Ne(2) .lt. 3) then
+
+    no_aaa_contraction = 0.d0
+
+  else
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmp3(n_points_final_grid,3,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmpvec_3(n_points_final_grid,3))
+
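+    ! purely closed-shell part -- NOTE(review): no_aaa_contraction is only accumulated below and is not set to 0.d0 in this branch; confirm it is zeroed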
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      ! to avoid tmp(N^4)
+      do h1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpvec_1, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p2, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_2, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , 0.d0, tmp_3d(1,1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        ! to avoid tmp(N^4)
+        do p1 = 1, mo_num
+
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                &
+          !$OMP DEFAULT (NONE)                          &
+          !$OMP PRIVATE (h2, ipoint)                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, &
+          !$OMP         mos_r_in_r_array_transp,        &
+          !$OMP         int2_grad1_u12_bimo_t,          &
+          !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                    &
+          !$OMP DEFAULT (NONE)                              &
+          !$OMP PRIVATE (p2, ipoint)                        &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+          !$OMP         int2_grad1_u12_bimo_t,              &
+          !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp3(1,1,1), 3*n_points_final_grid                    &
+                    , tmp1(1,1,1), 3*n_points_final_grid                    &
+                    , 1.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+
+        ! to avoid tmp(N^4)
+        do h1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1,                       &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p1, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_1, tmpvec_1, tmp1)
+          !$OMP DO 
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p2, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_2, tmpvec_2, tmp1)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          ! to avoid tmp(N^4)
+          do p1 = 1, mo_num
+
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+            !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            !$OMP PARALLEL                                &
+            !$OMP DEFAULT (NONE)                          &
+            !$OMP PRIVATE (h2, ipoint)                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, &
+            !$OMP         mos_r_in_r_array_transp,        &
+            !$OMP         int2_grad1_u12_bimo_t,          &
+            !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+            !$OMP DO 
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            !$OMP PARALLEL                                    &
+            !$OMP DEFAULT (NONE)                              &
+            !$OMP PRIVATE (p2, ipoint)                        &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+            !$OMP         int2_grad1_u12_bimo_t,              &
+            !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+            !$OMP DO 
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                     &
+                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+    endif
+
+    deallocate(tmp_2d, tmp_3d)
+    deallocate(tmp1, tmp2, tmp3)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
+
+    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (no_aaa_contraction, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aaa_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
diff --git a/src/tc_bi_ortho/normal_ordered_old.irp.f b/src/tc_bi_ortho/normal_ordered_old.irp.f
index 553cafdb..417580dd 100644
--- a/src/tc_bi_ortho/normal_ordered_old.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_old.irp.f
@@ -89,6 +89,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_old, (mo_num, mo_num,
                hthree_aaa = 0.d0
              endif
             endif
+
             normal_two_body_bi_orth_old(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
           enddo
         enddo
@@ -350,7 +351,8 @@ subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
     call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
     int_exc_23 = -1.d0 * integral
 
-    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+    !hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+    hthree +=  0.5d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
   enddo
 
   return

From 4b9b2a25603cda0d7687938e13384cb53877b9a9 Mon Sep 17 00:00:00 2001
From: ydamour <yann.damour@hotmail.fr>
Date: Thu, 8 Jun 2023 10:49:06 +0200
Subject: [PATCH 60/79] update ROHF F matrix with gamess parametrization

---
 src/scf_utils/fock_matrix.irp.f | 194 +++++++++++++++++++++++---------
 1 file changed, 138 insertions(+), 56 deletions(-)

diff --git a/src/scf_utils/fock_matrix.irp.f b/src/scf_utils/fock_matrix.irp.f
index 61633d3b..1942e542 100644
--- a/src/scf_utils/fock_matrix.irp.f
+++ b/src/scf_utils/fock_matrix.irp.f
@@ -5,6 +5,90 @@
    ! Fock matrix on the MO basis.
    ! For open shells, the ROHF Fock Matrix is ::
    !
+   !       |  Rcc  |  F^b  |  Fcv  |
+   !       |-----------------------|
+   !       |  F^b  |  Roo  |  F^a  |
+   !       |-----------------------|
+   !       |  Fcv  |  F^a  |  Rvv  |
+   !
+   ! C: Core, O: Open, V: Virtual 
+   ! 
+   ! Rcc = Acc Fcc^a + Bcc Fcc^b
+   ! Roo = Aoo Foo^a + Boo Foo^b
+   ! Rvv = Avv Fvv^a + Bvv Fvv^b
+   ! Fcv = (F^a + F^b)/2
+   ! 
+   ! F^a: Fock matrix alpha (MO), F^b: Fock matrix beta (MO)
+   ! A,B: Coupling parameters
+   !
+   ! J. Chem. Phys. 133, 141102 (2010), https://doi.org/10.1063/1.3503173
+   ! Coupling parameters from J. Chem. Phys. 125, 204110 (2006); https://doi.org/10.1063/1.2393223.
+   !       cc   oo   vv
+   !  A  -0.5  0.5  1.5
+   !  B   1.5  0.5 -0.5
+   ! 
+   END_DOC
+   integer                        :: i,j,n
+   if (elec_alpha_num == elec_beta_num) then
+     Fock_matrix_mo = Fock_matrix_mo_alpha
+   else
+     ! Core columns (j = 1 .. elec_beta_num)
+     do j = 1, elec_beta_num
+       ! Core
+       do i = 1, elec_beta_num
+         fock_matrix_mo(i,j) = - 0.5d0 * fock_matrix_mo_alpha(i,j) &
+                               + 1.5d0 * fock_matrix_mo_beta(i,j)
+       enddo
+       ! Open
+       do i = elec_beta_num+1, elec_alpha_num
+         fock_matrix_mo(i,j) = fock_matrix_mo_beta(i,j)
+       enddo
+       ! Virtual
+       do i = elec_alpha_num+1, mo_num
+         fock_matrix_mo(i,j) =   0.5d0 * fock_matrix_mo_alpha(i,j) &
+                               + 0.5d0 * fock_matrix_mo_beta(i,j)
+       enddo
+     enddo
+     ! Open columns (j = elec_beta_num+1 .. elec_alpha_num)
+     do j = elec_beta_num+1, elec_alpha_num
+       ! Core
+       do i = 1, elec_beta_num
+         fock_matrix_mo(i,j) = fock_matrix_mo_beta(i,j)
+       enddo
+       ! Open
+       do i = elec_beta_num+1, elec_alpha_num
+         fock_matrix_mo(i,j) =   0.5d0 * fock_matrix_mo_alpha(i,j) &
+                               + 0.5d0 * fock_matrix_mo_beta(i,j)
+       enddo
+       ! Virtual
+       do i = elec_alpha_num+1, mo_num
+         fock_matrix_mo(i,j) = fock_matrix_mo_alpha(i,j)
+       enddo
+     enddo
+     ! Virtual columns (j = elec_alpha_num+1 .. mo_num)
+     do j = elec_alpha_num+1, mo_num
+       ! Core
+       do i = 1, elec_beta_num
+         fock_matrix_mo(i,j) =   0.5d0 * fock_matrix_mo_alpha(i,j) &
+                               + 0.5d0 * fock_matrix_mo_beta(i,j)
+       enddo
+       ! Open
+       do i = elec_beta_num+1, elec_alpha_num
+         fock_matrix_mo(i,j) = fock_matrix_mo_alpha(i,j)
+       enddo
+       ! Virtual
+       do i = elec_alpha_num+1, mo_num
+         fock_matrix_mo(i,j) =   1.5d0 * fock_matrix_mo_alpha(i,j) &
+                               - 0.5d0 * fock_matrix_mo_beta(i,j)
+       enddo
+     enddo
+   endif
+
+   ! Old parametrization (documentation and code kept below, commented out, for reference)
+   ! BEGIN_DOC
+   ! Fock matrix on the MO basis.
+   ! For open shells, the ROHF Fock Matrix is ::
+   !
    !       |   F-K    |  F + K/2  |    F     |
    !       |---------------------------------|
    !       | F + K/2  |     F     |  F - K/2 |
@@ -16,64 +100,64 @@
    !
    ! K = Fb - Fa
    !
-   END_DOC
-   integer                        :: i,j,n
-   if (elec_alpha_num == elec_beta_num) then
-     Fock_matrix_mo = Fock_matrix_mo_alpha
-   else
+   ! END_DOC
+   !integer                        :: i,j,n
+   !if (elec_alpha_num == elec_beta_num) then
+   !  Fock_matrix_mo = Fock_matrix_mo_alpha
+   !else
 
-     do j=1,elec_beta_num
-       ! F-K
-       do i=1,elec_beta_num !CC
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
-             - (Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
-       enddo
-       ! F+K/2
-       do i=elec_beta_num+1,elec_alpha_num  !CA
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
-             + 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
-       enddo
-       ! F
-       do i=elec_alpha_num+1, mo_num !CV
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))
-       enddo
-     enddo
+   !  do j=1,elec_beta_num
+   !    ! F-K
+   !    do i=1,elec_beta_num !CC
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
+   !          - (Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
+   !    enddo
+   !    ! F+K/2
+   !    do i=elec_beta_num+1,elec_alpha_num  !CA
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
+   !          + 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
+   !    enddo
+   !    ! F
+   !    do i=elec_alpha_num+1, mo_num !CV
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))
+   !    enddo
+   !  enddo
 
-     do j=elec_beta_num+1,elec_alpha_num
-       ! F+K/2
-       do i=1,elec_beta_num !AC
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
-             + 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
-       enddo
-       ! F
-       do i=elec_beta_num+1,elec_alpha_num !AA
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))
-       enddo
-       ! F-K/2
-       do i=elec_alpha_num+1, mo_num !AV
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
-             - 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
-       enddo
-     enddo
+   !  do j=elec_beta_num+1,elec_alpha_num
+   !    ! F+K/2
+   !    do i=1,elec_beta_num !AC
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
+   !          + 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
+   !    enddo
+   !    ! F
+   !    do i=elec_beta_num+1,elec_alpha_num !AA
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))
+   !    enddo
+   !    ! F-K/2
+   !    do i=elec_alpha_num+1, mo_num !AV
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
+   !          - 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
+   !    enddo
+   !  enddo
 
-     do j=elec_alpha_num+1, mo_num
-       ! F
-       do i=1,elec_beta_num !VC
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))
-       enddo
-       ! F-K/2
-       do i=elec_beta_num+1,elec_alpha_num !VA
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
-             - 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
-       enddo
-       ! F+K
-       do i=elec_alpha_num+1,mo_num !VV
-         Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j)) &
-             + (Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
-       enddo
-     enddo
+   !  do j=elec_alpha_num+1, mo_num
+   !    ! F
+   !    do i=1,elec_beta_num !VC
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))
+   !    enddo
+   !    ! F-K/2
+   !    do i=elec_beta_num+1,elec_alpha_num !VA
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j))&
+   !          - 0.5d0*(Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
+   !    enddo
+   !    ! F+K
+   !    do i=elec_alpha_num+1,mo_num !VV
+   !      Fock_matrix_mo(i,j) = 0.5d0*(Fock_matrix_mo_alpha(i,j)+Fock_matrix_mo_beta(i,j)) &
+   !          + (Fock_matrix_mo_beta(i,j) - Fock_matrix_mo_alpha(i,j))
+   !    enddo
+   !  enddo
 
-   endif
+   !endif
 
    do i = 1, mo_num
      Fock_matrix_diag_mo(i) = Fock_matrix_mo(i,i)
@@ -115,8 +199,6 @@
 
 END_PROVIDER
 
-
-
 BEGIN_PROVIDER [ double precision, Fock_matrix_mo_alpha, (mo_num,mo_num) ]
    implicit none
    BEGIN_DOC

From 374a88bc624396370660182f6da3d876934b35b9 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 8 Jun 2023 15:51:52 +0200
Subject: [PATCH 61/79] normal ordering with DGEMM: OK

---
 src/tc_bi_ortho/normal_ordered.irp.f          | 1230 ++++++++---------
 .../normal_ordered_contractions.irp.f         | 1062 ++++++++++++++
 2 files changed, 1615 insertions(+), 677 deletions(-)
 create mode 100644 src/tc_bi_ortho/normal_ordered_contractions.irp.f

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index fea229c9..7259c270 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -11,16 +11,15 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   implicit none
 
-  integer                        :: i, h1, p1, h2, p2
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
   integer                        :: hh1, hh2, pp1, pp2
   integer                        :: Ne(2)
-  double precision               :: hthree_aaa, hthree_aab
-  double precision               :: wall0, wall1
+  double precision               :: wall0, wall1, walli, wallf
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
 
   print*,' Providing normal_two_body_bi_orth ...'
-  call wall_time(wall0)
+  call wall_time(walli)
  
   if(read_tc_norm_ord) then
 
@@ -30,6 +29,11 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   else
 
+    double precision, allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+    double precision, allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+    double precision, allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+    double precision, allocatable :: tmp(:,:,:,:)
+
     PROVIDE N_int
 
     allocate( occ(N_int*bit_kind_size,2) )
@@ -45,224 +49,33 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
       call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
     endif
 
-    PROVIDE no_aba_contraction
-    PROVIDE no_aab_contraction
-    PROVIDE no_aaa_contraction
+    allocate(tmp(mo_num,mo_num,mo_num,mo_num))
 
-    !$OMP PARALLEL                                                              &
-    !$OMP DEFAULT (NONE)                                                        &
-    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
-    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
-    !$OMP         no_aba_contraction, no_aab_contraction, no_aaa_contraction)
-    !$OMP DO SCHEDULE (static) 
-    do hh1 = 1, n_act_orb
-      h1 = list_act(hh1) 
+    ! ---
+    ! aba contraction
 
-      do pp1 = 1, n_act_orb
-        p1 = list_act(pp1)
+    print*,' Providing aba_contraction ...'
+    call wall_time(wall0)
 
-        do hh2 = 1, n_act_orb
-          h2 = list_act(hh2) 
+    tmp = 0.d0
 
-          do pp2 = 1, n_act_orb
-            p2 = list_act(pp2)
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmp_2d(mo_num,mo_num))
 
-            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + no_aab_contraction(p2,h2,p1,h1) + no_aaa_contraction(p2,h2,p1,h1)
-          enddo
-        enddo
-      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    deallocate( occ )
-    deallocate( key_i_core )
-  endif
-
-  if(write_tc_norm_ord.and.mpi_master) then
-    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="write")
-      call ezfio_set_work_empty(.False.)
-      write(11) normal_two_body_bi_orth
-      close(11)
-      call ezfio_set_tc_keywords_io_tc_integ('Read')
-  endif
-
-  call wall_time(wall1)
-  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i, ii, h1, p1, h2, p2, ipoint
-  integer                        :: Ne(2)
-  double precision               :: wall0, wall1
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision,  allocatable :: tmp_3d(:,:,:)
-  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
-  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
-  double precision,  allocatable :: tmp_2d(:,:)
-
-  print*,' Providing no_aba_contraction ...'
-  call wall_time(wall0)
-
-  PROVIDE N_int
-
-  allocate(occ(N_int*bit_kind_size,2))
-  allocate(key_i_core(N_int,2))
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num))
-  allocate(tmp2(n_points_final_grid,mo_num))
-  allocate(tmpval_1(n_points_final_grid))
-  allocate(tmpval_2(n_points_final_grid))
-  allocate(tmpvec_1(n_points_final_grid,3))
-  allocate(tmpvec_2(n_points_final_grid,3))
-  allocate(tmp_2d(mo_num,mo_num))
-
-
-  ! purely closed shell part 
-  do ii = 1, Ne(2)
-    i = occ(ii,2)
-
-    ! to avoid tmp(N^4)
-    do h1 = 1, mo_num
-
-      ! to minimize the number of operations
-      !$OMP PARALLEL                                                  &
-      !$OMP DEFAULT (NONE)                                            &
-      !$OMP PRIVATE (ipoint)                                          &
-      !$OMP SHARED (n_points_final_grid, i, h1,                       &
-      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
-      !$OMP DO
-      do ipoint = 1, n_points_final_grid
-        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
-        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
-        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
-        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      !$OMP PARALLEL                                                &
-      !$OMP DEFAULT (NONE)                                          &
-      !$OMP PRIVATE (p1, ipoint)                                    &
-      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
-      !$OMP DO 
-      do p1 = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
-                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
-          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
-                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
-          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
-                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
-        enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
-                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
-                , tmp1(1,1,1), 3*n_points_final_grid                           &
-                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
-
-      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
-      do p1 = 1, mo_num
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
-          enddo
-        enddo
-      enddo
-      !$OMP END PARALLEL DO
+    ! purely closed shell part 
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
 
       ! to avoid tmp(N^4)
-      do p1 = 1, mo_num
-
-        ! to minimize the number of operations
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT (NONE)                                           &
-        !$OMP PRIVATE (ipoint)                                         &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-        !$OMP         tmpval_1)
-        !$OMP DO
-        do ipoint = 1, n_points_final_grid
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
-                                                                - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
-                                                                - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
-                                                                - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        !$OMP PARALLEL                             &
-        !$OMP DEFAULT (NONE)                       &
-        !$OMP PRIVATE (h2, ipoint)                 &
-        !$OMP SHARED (mo_num, n_points_final_grid, &
-        !$OMP         mos_r_in_r_array_transp,     &
-        !$OMP         tmpval_1, tmp2)
-        !$OMP DO 
-        do h2 = 1, mo_num
-          do ipoint = 1, n_points_final_grid
-            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
-          enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
-                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
-                  , tmp2(1,1), n_points_final_grid                      &
-                  , 0.d0, tmp_2d(1,1), mo_num)
-
-        !$OMP PARALLEL DO PRIVATE(h2,p2)
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
-          enddo
-        enddo
-        !$OMP END PARALLEL DO
-
-      enddo ! p1
-    enddo ! h1
-  enddo ! i
-
-
-  ! purely open-shell part 
-  if(Ne(2) < Ne(1)) then
-    do ii = Ne(2) + 1, Ne(1)
-      i = occ(ii,1)
-
       do h1 = 1, mo_num
 
+        ! to minimize the number of operations
         !$OMP PARALLEL                                                  &
         !$OMP DEFAULT (NONE)                                            &
         !$OMP PRIVATE (ipoint)                                          &
@@ -304,29 +117,30 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP END DO
         !$OMP END PARALLEL
 
-        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
-                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
-                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
                   , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
         !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
         !$OMP END PARALLEL DO
 
+        ! to avoid tmp(N^4)
         do p1 = 1, mo_num
 
           ! to minimize the number of operations
-          !$OMP PARALLEL                                                  &
-          !$OMP DEFAULT (NONE)                                            &
-          !$OMP PRIVATE (ipoint)                                          &
-          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP PARALLEL                                                 &
+          !$OMP DEFAULT (NONE)                                           &
+          !$OMP PRIVATE (ipoint)                                         &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
           !$OMP         tmpval_1)
           !$OMP DO
           do ipoint = 1, n_points_final_grid
@@ -355,313 +169,171 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP END DO
           !$OMP END PARALLEL
 
-          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
-                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
-                    , tmp2(1,1), n_points_final_grid                       &
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
                     , 0.d0, tmp_2d(1,1), mo_num)
 
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
 
         enddo ! p1
       enddo ! h1
-    enddo !i
-  endif
+    enddo ! i
 
-  deallocate(tmp_2d, tmp_3d)
-  deallocate(tmp1, tmp2)
-  deallocate(tmpval_1, tmpval_2)
-  deallocate(tmpvec_1, tmpvec_2)
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
 
-  no_aba_contraction = -0.5d0 * no_aba_contraction
-  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+        do h1 = 1, mo_num
 
-  call wall_time(wall1)
-  print*,' Wall time for no_aba_contraction', wall1-wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i, ii, h1, p1, h2, p2, ipoint
-  integer                        :: Ne(2)
-  double precision               :: wall0, wall1
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision,  allocatable :: tmp_3d(:,:,:)
-  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
-  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
-  double precision,  allocatable :: tmp_2d(:,:)
-
-  print*,' Providing no_aab_contraction ...'
-  call wall_time(wall0)
-
-  PROVIDE N_int
-
-  allocate(occ(N_int*bit_kind_size,2))
-  allocate(key_i_core(N_int,2))
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  allocate(tmp_2d(mo_num,mo_num))
-  allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num))
-  allocate(tmp2(n_points_final_grid,mo_num))
-  allocate(tmpval_1(n_points_final_grid))
-  allocate(tmpvec_1(n_points_final_grid,3))
-
-
-  ! purely closed shell part 
-  do ii = 1, Ne(2)
-    i = occ(ii,2)
-
-    ! to avoid tmp(N^4)
-    do h1 = 1, mo_num
-
-      ! to minimize the number of operations
-      !$OMP PARALLEL                                                  &
-      !$OMP DEFAULT (NONE)                                            &
-      !$OMP PRIVATE (ipoint)                                          &
-      !$OMP SHARED (n_points_final_grid, i, h1,                       &
-      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-      !$OMP         tmpval_1, tmpvec_1)
-      !$OMP DO
-      do ipoint = 1, n_points_final_grid
-        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      !$OMP PARALLEL                                                &
-      !$OMP DEFAULT (NONE)                                          &
-      !$OMP PRIVATE (p1, ipoint)                                    &
-      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-      !$OMP         tmpval_1, tmpvec_1, tmp1)
-      !$OMP DO 
-      do p1 = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
-          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
-          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
-        enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
-                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
-                , tmp1(1,1,1), 3*n_points_final_grid                           &
-                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
-
-      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
-      do p1 = 1, mo_num
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
-          enddo
-        enddo
-      enddo
-      !$OMP END PARALLEL DO
-
-      ! to avoid tmp(N^4)
-      do p1 = 1, mo_num
-
-        ! to minimize the number of operations
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT (NONE)                                           &
-        !$OMP PRIVATE (ipoint)                                         &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-        !$OMP         tmpval_1)
-        !$OMP DO
-        do ipoint = 1, n_points_final_grid
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        !$OMP PARALLEL                             &
-        !$OMP DEFAULT (NONE)                       &
-        !$OMP PRIVATE (h2, ipoint)                 &
-        !$OMP SHARED (mo_num, n_points_final_grid, &
-        !$OMP         mos_r_in_r_array_transp,     &
-        !$OMP         tmpval_1, tmp2)
-        !$OMP DO 
-        do h2 = 1, mo_num
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1,                       &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+          !$OMP DO
           do ipoint = 1, n_points_final_grid
-            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+            tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
           enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
+          !$OMP END DO
+          !$OMP END PARALLEL
 
-        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
-                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
-                  , tmp2(1,1), n_points_final_grid                      &
-                  , 0.d0, tmp_2d(1,1), mo_num)
-
-        !$OMP PARALLEL DO PRIVATE(h2,p2)
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p1, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+          !$OMP DO 
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+            enddo
           enddo
-        enddo
-        !$OMP END PARALLEL DO
+          !$OMP END DO
+          !$OMP END PARALLEL
 
-      enddo ! p1
-    enddo ! h1
-  enddo ! i
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
-  deallocate(tmp_3d)
-  deallocate(tmp1, tmp2)
-  deallocate(tmpval_1)
-  deallocate(tmpvec_1)
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
 
-  no_aab_contraction = -0.5d0 * no_aab_contraction
+          do p1 = 1, mo_num
 
-  !$OMP PARALLEL                 &
-  !$OMP DEFAULT (NONE)           &
-  !$OMP PRIVATE (h1, h2, p1, p2) & 
-  !$OMP SHARED (no_aab_contraction, mo_num)
+            ! to minimize the number of operations
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1)
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                    - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                    - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                    - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
 
-  !$OMP DO 
-  do h1 = 1, mo_num
-    do h2 = 1, mo_num
-      do p1 = 1, mo_num
-        do p2 = p1, mo_num
-          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
+            !$OMP PARALLEL                             &
+            !$OMP DEFAULT (NONE)                       &
+            !$OMP PRIVATE (h2, ipoint)                 &
+            !$OMP SHARED (mo_num, n_points_final_grid, &
+            !$OMP         mos_r_in_r_array_transp,     &
+            !$OMP         tmpval_1, tmp2)
+            !$OMP DO 
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
 
-  !$OMP DO 
-  do h1 = 1, mo_num
-    do h2 = 1, mo_num
-      do p1 = 2, mo_num
-        do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
 
-  !$OMP DO 
-  do h1 = 1, mo_num-1
-    do h2 = h1+1, mo_num
-      do p1 = 2, mo_num
-        do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
 
-  call wall_time(wall1)
-  print*,' Wall time for no_aab_contraction', wall1-wall0
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+    endif
 
-END_PROVIDER
+    deallocate(tmp_3d)
+    deallocate(tmp1)
+    deallocate(tmp2)
+    deallocate(tmpval_1)
+    deallocate(tmpval_2)
+    deallocate(tmpvec_1)
+    deallocate(tmpvec_2)
+    deallocate(tmp_2d)
 
-! ---
+    tmp = -0.5d0 * tmp
+    call sum_A_At(tmp(1,1,1,1), mo_num*mo_num)
 
-BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+    call wall_time(wall1)
+    print*,' Wall time for aba_contraction', wall1-wall0
 
-  BEGIN_DOC
-  !
-  ! if:
-  !    h1 < h2
-  !    p1 > p2
-  !
-  !   no_aaa_contraction(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
-  !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
-  !
-  ! else:
-  !
-  !   no_aaa_contraction(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
-  !
-  ! 
-  ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
-  ! J(p2,h2,p1,h1) = \sum_i [ <  i p2 p1 | i h2 h1 >
-  !                         + < p2 p1  i | i h2 h1 >
-  !                         + < p1  i p2 | i h2 h1 > ]
-  !
-  !
-  END_DOC
+    normal_two_body_bi_orth = tmp
 
-  use bitmasks ! you need to include the bitmasks_module.f90 features
+    ! ---
+    ! aab contraction
 
-  implicit none
-  integer                        :: i, ii, h1, p1, h2, p2, ipoint
-  integer                        :: Ne(2)
-  double precision               :: wall0, wall1
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
-  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
-  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+    print*,' Providing aab_contraction ...'
+    call wall_time(wall0)
 
-  print*,' Providing no_aaa_contraction ...'
-  call wall_time(wall0)
-
-  PROVIDE N_int
-
-  allocate(occ(N_int*bit_kind_size,2))
-  allocate(key_i_core(N_int,2))
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  if(Ne(2) .lt. 3) then
-
-    no_aaa_contraction = 0.d0
-
-  else
+    tmp = 0.d0
 
     allocate(tmp_2d(mo_num,mo_num))
     allocate(tmp_3d(mo_num,mo_num,mo_num))
     allocate(tmp1(n_points_final_grid,3,mo_num))
     allocate(tmp2(n_points_final_grid,mo_num))
-    allocate(tmp3(n_points_final_grid,3,mo_num))
     allocate(tmpval_1(n_points_final_grid))
-    allocate(tmpval_2(n_points_final_grid))
     allocate(tmpvec_1(n_points_final_grid,3))
-    allocate(tmpvec_2(n_points_final_grid,3))
-    allocate(tmpvec_3(n_points_final_grid,3))
 
     ! purely closed shell part 
     do ii = 1, Ne(2)
@@ -677,21 +349,13 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP SHARED (n_points_final_grid, i, h1,                       &
         !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
         !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+        !$OMP         tmpval_1, tmpvec_1)
         !$OMP DO
         do ipoint = 1, n_points_final_grid
-
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-
-          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
-
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
           tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
           tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
           tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-
-          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
-          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
-          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
         enddo
         !$OMP END DO
         !$OMP END PARALLEL
@@ -722,39 +386,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
-            enddo
-          enddo
-        enddo
-        !$OMP END PARALLEL DO
-
-        !$OMP PARALLEL                                                &
-        !$OMP DEFAULT (NONE)                                          &
-        !$OMP PRIVATE (p2, ipoint)                                    &
-        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-        !$OMP         tmpval_2, tmpvec_2, tmp1)
-        !$OMP DO 
-        do p2 = 1, mo_num
-          do ipoint = 1, n_points_final_grid
-            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
-            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
-            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
-          enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-                  , tmp1(1,1,1), 3*n_points_final_grid                           &
-                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
-                  , 0.d0, tmp_3d(1,1,1), mo_num)
-
-        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
-        do p1 = 1, mo_num
-          do h2 = 1, mo_num
-            do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
@@ -763,58 +395,32 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         ! to avoid tmp(N^4)
         do p1 = 1, mo_num
 
-          !$OMP PARALLEL                                                  &
-          !$OMP DEFAULT (NONE)                                            &
-          !$OMP PRIVATE (ipoint)                                          &
-          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                 &
+          !$OMP DEFAULT (NONE)                                           &
+          !$OMP PRIVATE (ipoint)                                         &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+          !$OMP         tmpval_1)
           !$OMP DO
           do ipoint = 1, n_points_final_grid
-
-            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
-                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
-
-            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
-
-            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
-            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
-            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
-
-            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
-            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
-            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
-
-            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
           enddo
           !$OMP END DO
           !$OMP END PARALLEL
 
-          !$OMP PARALLEL                                &
-          !$OMP DEFAULT (NONE)                          &
-          !$OMP PRIVATE (h2, ipoint)                    &
-          !$OMP SHARED (mo_num, n_points_final_grid, i, &
-          !$OMP         mos_r_in_r_array_transp,        &
-          !$OMP         int2_grad1_u12_bimo_t,          &
-          !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
           !$OMP DO 
           do h2 = 1, mo_num
             do ipoint = 1, n_points_final_grid
-
-              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
-                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
-                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
-                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
-
-              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
-              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
-              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
-
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
             enddo
           enddo
           !$OMP END DO
@@ -828,47 +434,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
-            enddo
-          enddo
-          !$OMP END PARALLEL DO
-
-          !$OMP PARALLEL                                    &
-          !$OMP DEFAULT (NONE)                              &
-          !$OMP PRIVATE (p2, ipoint)                        &
-          !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
-          !$OMP         int2_grad1_u12_bimo_t,              &
-          !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
-          !$OMP DO 
-          do p2 = 1, mo_num
-            do ipoint = 1, n_points_final_grid
-
-              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
-                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
-                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
-
-              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
-              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
-              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
-            enddo
-          enddo
-          !$OMP END DO
-          !$OMP END PARALLEL
-
-          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
-                    , tmp2(1,1), n_points_final_grid                      &
-                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
-                    , 0.d0, tmp_2d(1,1), mo_num)
-
-          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
-                    , tmp3(1,1,1), 3*n_points_final_grid                    &
-                    , tmp1(1,1,1), 3*n_points_final_grid                    &
-                    , 1.d0, tmp_2d(1,1), mo_num)
-
-          !$OMP PARALLEL DO PRIVATE(h2,p2)
-          do h2 = 1, mo_num
-            do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -877,14 +443,85 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
       enddo ! h1
     enddo ! i
 
+    deallocate(tmp_2d)
+    deallocate(tmp_3d)
+    deallocate(tmp1)
+    deallocate(tmp2)
+    deallocate(tmpval_1)
+    deallocate(tmpvec_1)
 
+    tmp = -0.5d0 * tmp
 
-    ! purely open-shell part 
-    if(Ne(2) < Ne(1)) then
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (tmp, mo_num)
 
-      do ii = Ne(2) + 1, Ne(1)
-        i = occ(ii,1)
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
 
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL
+
+    call wall_time(wall1)
+    print*,' Wall time for aab_contraction', wall1-wall0
+
+    normal_two_body_bi_orth += tmp
+
+    ! ---
+    ! aaa contraction
+
+    if(Ne(2) .ge. 3) then
+
+      print*,' Providing aaa_contraction ...'
+      call wall_time(wall0)
+
+      tmp = 0.d0
+
+      allocate(tmp_2d(mo_num,mo_num))
+      allocate(tmp_3d(mo_num,mo_num,mo_num))
+      allocate(tmp1(n_points_final_grid,3,mo_num))
+      allocate(tmp2(n_points_final_grid,mo_num))
+      allocate(tmp3(n_points_final_grid,3,mo_num))
+      allocate(tmpval_1(n_points_final_grid))
+      allocate(tmpval_2(n_points_final_grid))
+      allocate(tmpvec_1(n_points_final_grid,3))
+      allocate(tmpvec_2(n_points_final_grid,3))
+      allocate(tmpvec_3(n_points_final_grid,3))
+
+      ! purely closed shell part 
+      do ii = 1, Ne(2)
+        i = occ(ii,2)
 
         ! to avoid tmp(N^4)
         do h1 = 1, mo_num
@@ -932,16 +569,16 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP END DO
           !$OMP END PARALLEL
 
-          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
-                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
-                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
                     , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
           !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
               enddo
             enddo
           enddo
@@ -964,16 +601,16 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP END DO
           !$OMP END PARALLEL
 
-          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
-                    , tmp1(1,1,1), 3*n_points_final_grid                            &
-                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
                     , 0.d0, tmp_3d(1,1,1), mo_num)
 
           !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
               enddo
             enddo
           enddo
@@ -1039,15 +676,15 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP END DO
             !$OMP END PARALLEL
 
-            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
-                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
-                      , tmp2(1,1), n_points_final_grid                       &
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                      , tmp2(1,1), n_points_final_grid                      &
                       , 0.d0, tmp_2d(1,1), mo_num)
 
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
@@ -1074,82 +711,321 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP END DO
             !$OMP END PARALLEL
 
-            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
-                      , tmp2(1,1), n_points_final_grid                       &
-                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , tmp2(1,1), n_points_final_grid                      &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
                       , 0.d0, tmp_2d(1,1), mo_num)
 
-            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
-                      , tmp3(1,1,1), 3*n_points_final_grid                     &
-                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                    &
+                      , tmp1(1,1,1), 3*n_points_final_grid                    &
                       , 1.d0, tmp_2d(1,1), mo_num)
 
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
 
           enddo ! p1
         enddo ! h1
-      enddo !i
-    endif
+      enddo ! i
 
-    deallocate(tmp_2d, tmp_3d)
-    deallocate(tmp1, tmp2, tmp3)
-    deallocate(tmpval_1, tmpval_2)
-    deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
+      ! purely open-shell part 
+      if(Ne(2) < Ne(1)) then
 
-    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+        do ii = Ne(2) + 1, Ne(1)
+          i = occ(ii,1)
 
-    !$OMP PARALLEL                 &
-    !$OMP DEFAULT (NONE)           &
-    !$OMP PRIVATE (h1, h2, p1, p2) & 
-    !$OMP SHARED (no_aaa_contraction, mo_num)
+          ! to avoid tmp(N^4)
+          do h1 = 1, mo_num
 
-    !$OMP DO 
-    do h1 = 1, mo_num
-      do h2 = 1, mo_num
-        do p1 = 1, mo_num
-          do p2 = p1, mo_num
-            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+            ! to minimize the number of operations
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1,                       &
+            !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            !$OMP PARALLEL                                                &
+            !$OMP DEFAULT (NONE)                                          &
+            !$OMP PRIVATE (p1, ipoint)                                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+            !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+            !$OMP         tmpval_1, tmpvec_1, tmp1)
+            !$OMP DO 
+            do p1 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+                tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+                tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+            do p1 = 1, mo_num
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                enddo
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            !$OMP PARALLEL                                                &
+            !$OMP DEFAULT (NONE)                                          &
+            !$OMP PRIVATE (p2, ipoint)                                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+            !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+            !$OMP         tmpval_2, tmpvec_2, tmp1)
+            !$OMP DO 
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+                tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+                tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , 0.d0, tmp_3d(1,1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+            do p1 = 1, mo_num
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                enddo
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            ! to avoid tmp(N^4)
+            do p1 = 1, mo_num
+
+              !$OMP PARALLEL                                                  &
+              !$OMP DEFAULT (NONE)                                            &
+              !$OMP PRIVATE (ipoint)                                          &
+              !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+              !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+              !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+              !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+              !$OMP DO
+              do ipoint = 1, n_points_final_grid
+
+                tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                                 ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+                tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+                tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+                tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+                tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              enddo
+              !$OMP END DO
+              !$OMP END PARALLEL
+
+              !$OMP PARALLEL                                &
+              !$OMP DEFAULT (NONE)                          &
+              !$OMP PRIVATE (h2, ipoint)                    &
+              !$OMP SHARED (mo_num, n_points_final_grid, i, &
+              !$OMP         mos_r_in_r_array_transp,        &
+              !$OMP         int2_grad1_u12_bimo_t,          &
+              !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+              !$OMP DO 
+              do h2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                  + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                  tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                  tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                  tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+                enddo
+              enddo
+              !$OMP END DO
+              !$OMP END PARALLEL
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              !$OMP PARALLEL DO PRIVATE(h2,p2)
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                enddo
+              enddo
+              !$OMP END PARALLEL DO
+
+              !$OMP PARALLEL                                    &
+              !$OMP DEFAULT (NONE)                              &
+              !$OMP PRIVATE (p2, ipoint)                        &
+              !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+              !$OMP         int2_grad1_u12_bimo_t,              &
+              !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+              !$OMP DO 
+              do p2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                  tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                  tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                  tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+                enddo
+              enddo
+              !$OMP END DO
+              !$OMP END PARALLEL
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                        , tmp3(1,1,1), 3*n_points_final_grid                     &
+                        , tmp1(1,1,1), 3*n_points_final_grid                     &
+                        , 1.d0, tmp_2d(1,1), mo_num)
+
+              !$OMP PARALLEL DO PRIVATE(h2,p2)
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                enddo
+              enddo
+              !$OMP END PARALLEL DO
+
+            enddo ! p1
+          enddo ! h1
+        enddo !i
+      endif
+
+      deallocate(tmp_2d)
+      deallocate(tmp_3d)
+      deallocate(tmp1)
+      deallocate(tmp2)
+      deallocate(tmp3)
+      deallocate(tmpval_1)
+      deallocate(tmpval_2)
+      deallocate(tmpvec_1)
+      deallocate(tmpvec_2)
+      deallocate(tmpvec_3)
+
+      tmp = -0.5d0 * tmp
+
+      !$OMP PARALLEL                 &
+      !$OMP DEFAULT (NONE)           &
+      !$OMP PRIVATE (h1, h2, p1, p2) & 
+      !$OMP SHARED (tmp, mo_num)
+
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 1, mo_num
+            do p2 = p1, mo_num
+              tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END DO
+      !$OMP END DO
 
-    !$OMP DO 
-    do h1 = 1, mo_num
-      do h2 = 1, mo_num
-        do p1 = 2, mo_num
-          do p2 = 1, p1-1
-            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END DO
+      !$OMP END DO
 
-    !$OMP DO 
-    do h1 = 1, mo_num-1
-      do h2 = h1+1, mo_num
-        do p1 = 2, mo_num
-          do p2 = 1, p1-1
-            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+      !$OMP DO 
+      do h1 = 1, mo_num-1
+        do h2 = h1+1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) *= -1.d0
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END PARALLEL
+      !$OMP END PARALLEL
 
+      call wall_time(wallf)
+      print*,' Wall time for aaa_contraction', wall1-wall0
+
+      normal_two_body_bi_orth += tmp
+    endif ! Ne(2) .ge. 3
+
+    deallocate(tmp)
+
+  endif ! read_tc_norm_ord
+
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read')
   endif
 
-  call wall_time(wall1)
-  print*,' Wall time for no_aaa_contraction', wall1-wall0
+  call wall_time(wallf)
+  print*,' Wall time for normal_two_body_bi_orth ', wallf-walli
 
-END_PROVIDER
+END_PROVIDER 
 
 ! ---
+
diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
new file mode 100644
index 00000000..855cfd17
--- /dev/null
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -0,0 +1,1062 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+  ! Sum over the occupied orbitals i of grid contractions of int2_grad1_u12_bimo_t with the left/right MOs, stored as (p2,h2,p1,h1).
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aba_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+  ! build occ/Ne from ref_bitmask (xor-ed with core_bitmask when core_tc_op is set)
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+  allocate(tmpvec_2(n_points_final_grid,3))
+  allocate(tmp_2d(mo_num,mo_num))
+  no_aba_contraction = 0.d0 ! every term below is accumulated into the array, so it must start from zero
+
+  ! purely closed shell part: i runs over occ(1:Ne(2),2)
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    ! loop over h1 explicitly so that the 3-index tmp_3d is enough (avoids tmp(N^4))
+    do h1 = 1, mo_num
+
+      ! precompute the p1-independent grid factors once per (i,h1) to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      ! tmp_3d(p2,h2,p1) = sum over grid points and the 3 components of int2_grad1_u12_bimo_t(:,:,p2,h2) * tmp1(:,:,p1)
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! loop over p1 explicitly as well, so that the 2-index tmp_2d is enough (avoids tmp(N^4))
+      do p1 = 1, mo_num
+
+        ! precompute the h2-independent grid factor once per (i,h1,p1) to minimize the number of operations
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        ! tmp_2d(p2,h2) = sum over grid points of mos_l_in_r_array_transp(:,p2) * tmp2(:,h2)
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+
+  ! purely open-shell part: i runs over occ(Ne(2)+1:Ne(1),1); same grid contractions as above but with a 0.5d0 dgemm prefactor
+  if(Ne(2) < Ne(1)) then
+    do ii = Ne(2) + 1, Ne(1)
+      i = occ(ii,1)
+
+      do h1 = 1, mo_num
+
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        do p1 = 1, mo_num
+
+          ! precompute the h2-independent grid factor once per (i,h1,p1) to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                    , tmp2(1,1), n_points_final_grid                       &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo !i
+  endif
+
+  deallocate(tmp_2d, tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+  ! global -0.5d0 prefactor, then sum_A_At on the array viewed as a (mo_num*mo_num)-sized square matrix (presumably A <- A + A^T; confirm in sum_A_At)
+  no_aba_contraction = -0.5d0 * no_aba_contraction
+  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aba_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
+  BEGIN_DOC
+  ! -0.5 * K(p2,h2,p1,h1), antisymmetrized in (p1,p2) by the final loops (signs documented there), with
+  !   K(p2,h2,p1,h1) = \sum_i \sum_r w(r) [ G(r,p2,h2).G(r,i,i)   L(r,p1) R(r,h1)
+  !                                       + G(r,p2,h2).G(r,p1,h1) L(r,i)  R(r,i)
+  !                                       + G(r,i,i).G(r,p1,h1)   L(r,p2) R(r,h2) ]
+  ! G(r,a,b) = int2_grad1_u12_bimo_t(r,1:3,a,b) ('.' is the dot product), w = final_weight_at_r_vector,
+  ! L/R = mos_l/r_in_r_array_transp, r = grid point, i = the Ne(2) MOs listed in occ(:,2).
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
+
+  print*,' Providing no_aab_contraction ...'
+  call wall_time(wall0)
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_2d(mo_num,mo_num))
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+
+  no_aab_contraction = 0.d0 ! the loops below only accumulate, so the provider must start from zero
+
+  ! purely closed shell part: i runs over the Ne(2) MOs listed in occ(:,2)
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+    ! to avoid tmp(N^4): the (p2,h2,p1) slice is built for one h1 at a time
+    do h1 = 1, mo_num
+      ! to minimize the number of operations: factors depending only on (i,h1) are computed once per grid point
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpvec_1)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpvec_1, tmp1)
+      !$OMP DO
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      ! first two terms of K: tmp_3d(p2,h2,p1) = \sum_{r,x} int2_grad1_u12_bimo_t(r,x,p2,h2) * tmp1(r,x,p1)
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4): the last term of K is built for one (p1,h1) pair at a time
+      do p1 = 1, mo_num
+        ! to minimize the number of operations: tmpval_1(r) = w(r) * G(r,i,i).G(r,p1,h1) is computed once per grid point
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        ! last term of K: tmp_2d(p2,h2) = \sum_r mos_l_in_r_array_transp(r,p2) * tmp2(r,h2)
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  deallocate(tmp_2d, tmp_3d, tmp1, tmp2, tmpval_1, tmpvec_1)
+
+  no_aab_contraction = -0.5d0 * no_aab_contraction
+
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (h1, h2, p1, p2) &
+  !$OMP SHARED (no_aab_contraction, mo_num)
+  ! p2 >= p1: subtract the element with p1 and p2 swapped (this zeroes the p1 = p2 diagonal)
+  !$OMP DO
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 1, mo_num
+        do p2 = p1, mo_num
+          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  ! p2 < p1: overwrite with minus the swapped (p2 > p1) element obtained in the previous loop
+  !$OMP DO
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  ! p2 < p1 and h1 < h2: flip the sign once more, so these elements equal the swapped element
+  !$OMP DO
+  do h1 = 1, mo_num-1
+    do h2 = h1+1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aab_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  !
+  ! if:
+  !    h1 < h2
+  !    p1 > p2
+  !
+  !   no_aaa_contraction(p2,h2,p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
+  !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! else:
+  !
+  !   no_aaa_contraction(p2,h2,p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! 
+  ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
+  ! J(p2,h2,p1,h1) = \sum_i [ <  i p2 p1 | i h2 h1 >
+  !                         + < p2 p1  i | i h2 h1 >
+  !                         + < p1  i p2 | i h2 h1 > ]
+  !
+  !
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+
+  print*,' Providing no_aaa_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  if(Ne(2) .lt. 3) then
+
+    no_aaa_contraction = 0.d0
+
+  else
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmp3(n_points_final_grid,3,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmpvec_3(n_points_final_grid,3))
+
+    ! purely closed shell part 
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      ! to avoid tmp(N^4)
+      do h1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpvec_1, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p2, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_2, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , 0.d0, tmp_3d(1,1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        ! to avoid tmp(N^4)
+        do p1 = 1, mo_num
+
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                &
+          !$OMP DEFAULT (NONE)                          &
+          !$OMP PRIVATE (h2, ipoint)                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, &
+          !$OMP         mos_r_in_r_array_transp,        &
+          !$OMP         int2_grad1_u12_bimo_t,          &
+          !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                    &
+          !$OMP DEFAULT (NONE)                              &
+          !$OMP PRIVATE (p2, ipoint)                        &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+          !$OMP         int2_grad1_u12_bimo_t,              &
+          !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp3(1,1,1), 3*n_points_final_grid                    &
+                    , tmp1(1,1,1), 3*n_points_final_grid                    &
+                    , 1.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+
+        ! to avoid tmp(N^4)
+        do h1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1,                       &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p1, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_1, tmpvec_1, tmp1)
+          !$OMP DO 
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p2, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_2, tmpvec_2, tmp1)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          ! to avoid tmp(N^4)
+          do p1 = 1, mo_num
+
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+            !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            !$OMP PARALLEL                                &
+            !$OMP DEFAULT (NONE)                          &
+            !$OMP PRIVATE (h2, ipoint)                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, &
+            !$OMP         mos_r_in_r_array_transp,        &
+            !$OMP         int2_grad1_u12_bimo_t,          &
+            !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+            !$OMP DO 
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            !$OMP PARALLEL                                    &
+            !$OMP DEFAULT (NONE)                              &
+            !$OMP PRIVATE (p2, ipoint)                        &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+            !$OMP         int2_grad1_u12_bimo_t,              &
+            !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+            !$OMP DO 
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                     &
+                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+    endif
+
+    deallocate(tmp_2d, tmp_3d)
+    deallocate(tmp1, tmp2, tmp3)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
+
+    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (no_aaa_contraction, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aaa_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---

From ee06ddf85e2b3fc83faa25515e80b262a2932aa7 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 8 Jun 2023 15:59:14 +0200
Subject: [PATCH 62/79] free two (3xN_gridxMOxMO) tables in TC-CIPSI

---
 src/fci_tc_bi/fci_tc_bi_ortho.irp.f | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
index f9bda058..3e6f229b 100644
--- a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -63,7 +63,9 @@ subroutine run_cipsi_tc
         call provide_all_three_ints_bi_ortho()
       endif
     endif
-    ! ---
+
+    FREE int2_grad1_u12_bimo_transp int2_grad1_u12_ao_transp
+
     write(json_unit,json_array_open_fmt) 'fci_tc'
 
     if (do_pt2) then
@@ -78,13 +80,16 @@ subroutine run_cipsi_tc
     call json_close
 
   else
+
     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+
     if(elec_alpha_num+elec_beta_num.ge.3)then
       if(three_body_h_tc)then
         call provide_all_three_ints_bi_ortho
       endif
     endif
-    ! ---
+
+    FREE int2_grad1_u12_bimo_transp int2_grad1_u12_ao_transp
 
     call run_slave_cipsi
 

From 22e1dcd1c4a5cdce7159d926443968b83dfb271c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 9 Jun 2023 21:32:13 +0200
Subject: [PATCH 63/79] 4-idx tensors: DGEMM with tmp(N3) added

---
 src/bi_ort_ints/bi_ort_ints.irp.f        | 192 +++++---
 src/bi_ort_ints/three_body_ijmk.irp.f    | 564 +++++++----------------
 src/bi_ort_ints/three_body_ijmk_n4.irp.f | 484 +++++++++++++++++++
 3 files changed, 771 insertions(+), 469 deletions(-)
 create mode 100644 src/bi_ort_ints/three_body_ijmk_n4.irp.f

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 75af8fb1..7f90c6f3 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -18,10 +18,11 @@ program bi_ort_ints
 ! call test_3e
 ! call test_5idx
 ! call test_5idx2
- !call test_4idx
-  call test_4idx2()
-  call test_5idx2
- call test_5idx
+  call test_4idx()
+  call test_4idx_n4()
+  !call test_4idx2()
+  !call test_5idx2
+  !call test_5idx
 end
 
 subroutine test_5idx2
@@ -167,13 +168,138 @@ end
 
 ! ---
 
+subroutine test_4idx_n4()
+  ! Element-wise check of the four three_e_4_idx_*_bi_ort_n4 tensors against their *_old references.
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+  ! thr: largest tolerated absolute deviation for a single element; the run stops at the first violation
+  thr = 1d-10
+  ! --- direct tensor
+  PROVIDE three_e_4_idx_direct_bi_ort_old
+  PROVIDE three_e_4_idx_direct_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_direct_bi_ort_n4 (l,k,j,i)
+          ref = three_e_4_idx_direct_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_direct_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_direct_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! --- exch13 tensor (accu is reset, the printed value is the mean absolute deviation)
+
+  PROVIDE three_e_4_idx_exch13_bi_ort_old
+  PROVIDE three_e_4_idx_exch13_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch13_bi_ort_n4 (l,k,j,i)
+          ref = three_e_4_idx_exch13_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch13_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch13_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! --- cycle_1 tensor
+
+  PROVIDE three_e_4_idx_cycle_1_bi_ort_old
+  PROVIDE three_e_4_idx_cycle_1_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_cycle_1_bi_ort_n4 (l,k,j,i)
+          ref = three_e_4_idx_cycle_1_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_cycle_1_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_cycle_1_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! --- exch23 tensor
+
+  PROVIDE three_e_4_idx_exch23_bi_ort_old
+  PROVIDE three_e_4_idx_exch23_bi_ort_n4
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch23_bi_ort_n4  (l,k,j,i)
+          ref = three_e_4_idx_exch23_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch23_bi_ort_n4'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch23_bi_ort_n4 = ', accu / dble(mo_num)**4
+
+  ! --- reaching this point means no element of the four tensors deviated by more than thr
+
+  return
+end
+
+! ---
+
 subroutine test_4idx()
 
   implicit none
   integer          :: i, j, k, l
   double precision :: accu, contrib, new, ref, thr
 
-  thr = 1d-5
+  thr = 1d-10
 
   PROVIDE three_e_4_idx_direct_bi_ort_old
   PROVIDE three_e_4_idx_direct_bi_ort 
@@ -231,34 +357,6 @@ subroutine test_4idx()
 
   ! ---
 
-!  PROVIDE three_e_4_idx_exch12_bi_ort_old
-!  PROVIDE three_e_4_idx_exch12_bi_ort 
-!
-!  accu = 0.d0
-!  do i = 1, mo_num
-!    do j = 1, mo_num
-!      do k = 1, mo_num
-!        do l = 1, mo_num
-!
-!          new = three_e_4_idx_exch12_bi_ort    (l,k,j,i)
-!          ref = three_e_4_idx_exch12_bi_ort_old(l,k,j,i)
-!          contrib = dabs(new - ref)
-!          accu += contrib
-!          if(contrib .gt. thr) then
-!            print*, ' problem in three_e_4_idx_exch12_bi_ort'
-!            print*, l, k, j, i
-!            print*, ref, new, contrib
-!            stop
-!          endif
-!
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  print*, ' accu on three_e_4_idx_exch12_bi_ort = ', accu / dble(mo_num)**4
-
-  ! ---
-
   PROVIDE three_e_4_idx_cycle_1_bi_ort_old
   PROVIDE three_e_4_idx_cycle_1_bi_ort
 
@@ -287,34 +385,6 @@ subroutine test_4idx()
 
   ! ---
 
-!  PROVIDE three_e_4_idx_cycle_2_bi_ort_old
-!  PROVIDE three_e_4_idx_cycle_2_bi_ort
-!
-!  accu = 0.d0
-!  do i = 1, mo_num
-!    do j = 1, mo_num
-!      do k = 1, mo_num
-!        do l = 1, mo_num
-!
-!          new = three_e_4_idx_cycle_2_bi_ort    (l,k,j,i)
-!          ref = three_e_4_idx_cycle_2_bi_ort_old(l,k,j,i)
-!          contrib = dabs(new - ref)
-!          accu += contrib
-!          if(contrib .gt. thr) then
-!            print*, ' problem in three_e_4_idx_cycle_2_bi_ort'
-!            print*, l, k, j, i
-!            print*, ref, new, contrib
-!            stop
-!          endif
-!
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  print*, ' accu on three_e_4_idx_cycle_2_bi_ort = ', accu / dble(mo_num)**4
-
-  ! ---
-
   PROVIDE three_e_4_idx_exch23_bi_ort_old
   PROVIDE three_e_4_idx_exch23_bi_ort
 
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index ee7e88ef..0d466f9f 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -3,9 +3,8 @@
 
  BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
 &BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
-!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -13,28 +12,25 @@
   !
   ! three_e_4_idx_direct_bi_ort (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
   ! three_e_4_idx_exch13_bi_ort (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  ! three_e_4_idx_exch12_bi_ort (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !                                       = three_e_4_idx_exch13_bi_ort (j,m,k,i) 
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
   ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !                                       = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
   !
   ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
   ! three_e_4_idx_direct_bi_ort (m,j,k,i) : Lk Ri Imm Ijj + Lj Rj Imm Iki + Lm Rm Ijj Iki 
   ! three_e_4_idx_exch13_bi_ort (m,j,k,i) : Lk Rm Imi Ijj + Lj Rj Imi Ikm + Lm Ri Ijj Ikm 
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
   ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) : Lk Rm Imj Iji + Lj Ri Imj Ikm + Lm Rj Iji Ikm 
   !
   END_DOC
 
   implicit none
-  integer                       :: ipoint, i, j, k, l, m
+  integer                       :: ipoint, i, j, k, m, n
   double precision              :: wall1, wall0
-  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:)
-  double precision, allocatable :: tmp_4d(:,:,:,:)
-  double precision, allocatable :: tmp4(:,:,:)
-  double precision, allocatable :: tmp5(:,:)
-  double precision, allocatable :: tmp_3d(:,:,:)
+  double precision              :: tmp_loc_1, tmp_loc_2
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)
+  double precision, allocatable :: tmp_2d(:,:)
+  double precision, allocatable :: tmp_aux_1(:,:,:), tmp_aux_2(:,:)
 
   print *, ' Providing the three_e_4_idx_bi_ort ...'
   call wall_time(wall0)
@@ -42,324 +38,188 @@
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
 
-  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
-
-  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
+  ! to reduce the number of operations
+  allocate(tmp_aux_1(n_points_final_grid,4,mo_num))
+  allocate(tmp_aux_2(n_points_final_grid,mo_num))
 
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP PRIVATE (n, ipoint)                                       &
   !$OMP SHARED (mo_num, n_points_final_grid,                      &
   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1, tmp2, tmp3)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-
-        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
-
-        tmp2(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_r_in_r_array_transp(ipoint,i)
-        tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
-        tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
-
-        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp3, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp1)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_4d(m,i,j,k)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp2)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_4d(m,k,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp1, 3*n_points_final_grid, tmp3, 3*n_points_final_grid            &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp3)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
-            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp1)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_direct_bi_ort(m,j,k,i) = three_e_4_idx_direct_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  deallocate(tmp_4d)
-
-
-  allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp5(n_points_final_grid,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, ipoint)                                       &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector,                         &
-  !$OMP         tmp5)
+  !$OMP         tmp_aux_1, tmp_aux_2)
   !$OMP DO
-  do i = 1, mo_num
+  do n = 1, mo_num
     do ipoint = 1, n_points_final_grid
-      tmp5(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+        tmp_aux_1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * final_weight_at_r_vector(ipoint)
+        tmp_aux_1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,n) * final_weight_at_r_vector(ipoint)
+        tmp_aux_1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,n) * final_weight_at_r_vector(ipoint)
+        tmp_aux_1(ipoint,4,n) = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,n) * final_weight_at_r_vector(ipoint)
+
+        tmp_aux_2(ipoint,n) = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,n)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  allocate(tmp_2d(mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,4,mo_num))
+  allocate(tmp2(n_points_final_grid,4,mo_num))
 
-  allocate(tmp4(n_points_final_grid,mo_num,mo_num))
-
-  do m = 1, mo_num
-
-    !$OMP PARALLEL                                                 &
-    !$OMP DEFAULT (NONE)                                           &
-    !$OMP PRIVATE (i, k, ipoint)                                   &
-    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
-    !$OMP         int2_grad1_u12_bimo_t,                           &
-    !$OMP         tmp4)
-    !$OMP DO COLLAPSE(2)
+  ! loops approach to break the O(N^4) scaling in memory
+  do k = 1, mo_num
     do i = 1, mo_num
-      do k = 1, mo_num
-        do ipoint = 1, n_points_final_grid
 
-          tmp4(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
-                           + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
-                           + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
-        enddo
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2)                 &
+    !$OMP SHARED (mo_num, n_points_final_grid, i, k,                &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp_aux_2, tmp1)
+    !$OMP DO
+    do n = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i)
+        tmp_loc_2 = tmp_aux_2(ipoint,n)
+
+        tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,i) * tmp_loc_2
+        tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,i) * tmp_loc_2
+        tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,n) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,i) * tmp_loc_2
+        tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,n) * int2_grad1_u12_bimo_t(ipoint,1,k,i) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,n,n) * int2_grad1_u12_bimo_t(ipoint,2,k,i) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,n,n) * int2_grad1_u12_bimo_t(ipoint,3,k,i)
+
       enddo
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0 &
-              , tmp5, n_points_final_grid, tmp4, n_points_final_grid       &
-              , 0.d0, tmp_3d, mo_num)
+    call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                       &
+              , tmp_aux_1(1,1,1), 4*n_points_final_grid, tmp1(1,1,1), 4*n_points_final_grid &
+              , 0.d0, tmp_2d(1,1), mo_num)
 
-    !$OMP PARALLEL DO PRIVATE(i,j,k)
-    do i = 1, mo_num
-      do k = 1, mo_num
-        do j = 1, mo_num
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
-        enddo
-      enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-
-
-    !$OMP PARALLEL                                                 &
-    !$OMP DEFAULT (NONE)                                           &
-    !$OMP PRIVATE (j, k, ipoint)                                   &
-    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
-    !$OMP         mos_l_in_r_array_transp,                         &
-    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp4)
-    !$OMP DO COLLAPSE(2)
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-
-          tmp4(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
-                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
-                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
-                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
-        enddo
-      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
-              , tmp4, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
-              , 0.d0, tmp_3d, mo_num*mo_num)
-
-    !$OMP PARALLEL DO PRIVATE(i,j,k)
-    do i = 1, mo_num
-      do k = 1, mo_num
-        do j = 1, mo_num
-          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
-        enddo
-      enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-  enddo
-
-  deallocate(tmp5)
-  deallocate(tmp_3d)
-
-
-
-  do i = 1, mo_num
-
-    !$OMP PARALLEL                                                 &
-    !$OMP DEFAULT (NONE)                                           &
-    !$OMP PRIVATE (m, j, ipoint)                                   &
-    !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
-    !$OMP         mos_r_in_r_array_transp,                         &
-    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp4)
-    !$OMP DO COLLAPSE(2)
+    !$OMP PARALLEL DO PRIVATE(j,m)
     do j = 1, mo_num
       do m = 1, mo_num
-        do ipoint = 1, n_points_final_grid
+        three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_2d(m,j)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
 
-          tmp4(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
-                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
-        enddo
+
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2)                 &
+    !$OMP SHARED (mo_num, n_points_final_grid, i, k,                &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp1, tmp2)
+    !$OMP DO
+    do n = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp_loc_1 = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n)
+        tmp_loc_2 = mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,i)
+
+        tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,k,n) * tmp_loc_2
+        tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,k,n) * tmp_loc_2
+        tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,i) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,k,n) * tmp_loc_2
+        tmp1(ipoint,4,n) = int2_grad1_u12_bimo_t(ipoint,1,n,i) * int2_grad1_u12_bimo_t(ipoint,1,k,n) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,n,i) * int2_grad1_u12_bimo_t(ipoint,2,k,n) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,n,i) * int2_grad1_u12_bimo_t(ipoint,3,k,n)
+
+        tmp2(ipoint,1,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,n)
+        tmp2(ipoint,2,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,n)
+        tmp2(ipoint,3,n) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,n)
+        tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,n)
       enddo
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
-              , tmp4, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
-              , 1.d0, three_e_4_idx_cycle_1_bi_ort(1,1,1,i), mo_num*mo_num)
+    call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                       &
+              , tmp1(1,1,1), 4*n_points_final_grid, tmp_aux_1(1,1,1), 4*n_points_final_grid &
+              , 0.d0, tmp_2d(1,1), mo_num)
 
-  enddo
+    !$OMP PARALLEL DO PRIVATE(j,m)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_2d(m,j)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
 
-  deallocate(tmp4)
+    call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                  &
+              , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid &
+              , 0.d0, tmp_2d(1,1), mo_num)
 
+    !$OMP PARALLEL DO PRIVATE(j,m)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        three_e_4_idx_cycle_1_bi_ort(m,i,k,j) = -tmp_2d(m,j)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
 
-!  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-!  do i = 1, mo_num
-!    do k = 1, mo_num
-!      do j = 1, mo_num
-!        do m = 1, mo_num
-!          three_e_4_idx_exch12_bi_ort (m,j,k,i) = three_e_4_idx_exch13_bi_ort (j,m,k,i)
-!          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END PARALLEL DO
+  enddo ! i
+
+    do j = 1, mo_num
+      ! exch23(m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki ; tmp1 holds the weighted (m,j) factors, tmp2 the unweighted (k,i) factors
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (n, ipoint, tmp_loc_1, tmp_loc_2)                 &
+      !$OMP SHARED (mo_num, n_points_final_grid, j, k,                &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp1, tmp2)
+      !$OMP DO
+      do n = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp_loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,n)
+          tmp_loc_2 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,j)
+
+          tmp1(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,n,j) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,1,j,n) * tmp_loc_2
+          tmp1(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,n,j) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,2,j,n) * tmp_loc_2
+          tmp1(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,n,j) * tmp_loc_1 + int2_grad1_u12_bimo_t(ipoint,3,j,n) * tmp_loc_2
+          ! 4th component pairs with tmp2(:,4,:) = Lk Ri, which is unweighted: the quadrature weight has to be applied here
+          tmp1(ipoint,4,n) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,n,j) * int2_grad1_u12_bimo_t(ipoint,1,j,n) &
+                                                                 + int2_grad1_u12_bimo_t(ipoint,2,n,j) * int2_grad1_u12_bimo_t(ipoint,2,j,n) &
+                                                                 + int2_grad1_u12_bimo_t(ipoint,3,n,j) * int2_grad1_u12_bimo_t(ipoint,3,j,n) )
+
+          tmp2(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,k,n)
+          tmp2(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,k,n)
+          tmp2(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,k,n)
+          tmp2(ipoint,4,n) = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num, mo_num, 4*n_points_final_grid, 1.d0                  &
+                , tmp1(1,1,1), 4*n_points_final_grid, tmp2(1,1,1), 4*n_points_final_grid &
+                , 0.d0, tmp_2d(1,1), mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(i,m)
+      do i = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch23_bi_ort(m,j,k,i) = -tmp_2d(m,i)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+    enddo ! j
+  enddo !k
+
+  deallocate(tmp_2d)
+  deallocate(tmp1)
+  deallocate(tmp2)
+  deallocate(tmp_aux_1)
+  deallocate(tmp_aux_2)
 
 
   call wall_time(wall1)
@@ -370,115 +230,3 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: i, j, k, l, m, ipoint
-  double precision              :: wall1, wall0
-  double precision, allocatable :: tmp1(:,:,:,:), tmp_4d(:,:,:,:)
-  double precision, allocatable :: tmp5(:,:,:), tmp6(:,:,:)
-
-  print *, ' Providing the three_e_4_idx_exch23_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
-
-  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
-  allocate(tmp6(n_points_final_grid,mo_num,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp5, tmp6)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-
-        tmp5(ipoint,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * int2_grad1_u12_bimo_t(ipoint,1,i,l) &
-                         + int2_grad1_u12_bimo_t(ipoint,2,l,i) * int2_grad1_u12_bimo_t(ipoint,2,i,l) &
-                         + int2_grad1_u12_bimo_t(ipoint,3,l,i) * int2_grad1_u12_bimo_t(ipoint,3,i,l) 
-
-        tmp6(ipoint,l,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, -1.d0 &
-            , tmp5, n_points_final_grid, tmp6, n_points_final_grid               &
-            , 0.d0, three_e_4_idx_exch23_bi_ort, mo_num*mo_num)
-
-  deallocate(tmp5)
-  deallocate(tmp6)
-
-
-  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (i, l, ipoint)                                    &
-  !$OMP SHARED (mo_num, n_points_final_grid,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1)
-  !$OMP DO COLLAPSE(2)
-  do i = 1, mo_num
-    do l = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
-            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
-            , 0.d0, tmp_4d, mo_num*mo_num)
-
-  deallocate(tmp1)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_4_idx_exch23_bi_ort(m,j,k,i) = three_e_4_idx_exch23_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  deallocate(tmp_4d)
-
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
diff --git a/src/bi_ort_ints/three_body_ijmk_n4.irp.f b/src/bi_ort_ints/three_body_ijmk_n4.irp.f
new file mode 100644
index 00000000..157b70f4
--- /dev/null
+++ b/src/bi_ort_ints/three_body_ijmk_n4.irp.f
@@ -0,0 +1,484 @@
+
+! ---
+
+ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort_n4 , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort_n4 , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort_n4, (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_direct_bi_ort_n4 (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch13_bi_ort_n4 (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch12_bi_ort (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                       = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i) 
+  ! three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                       = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
+  !
+  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort_n4 can be directly used to compute Slater rules with a + sign
+  !
+  ! three_e_4_idx_direct_bi_ort_n4 (m,j,k,i) : Lk Ri Imm Ijj + Lj Rj Imm Iki + Lm Rm Ijj Iki 
+  ! three_e_4_idx_exch13_bi_ort_n4 (m,j,k,i) : Lk Rm Imi Ijj + Lj Rj Imi Ikm + Lm Ri Ijj Ikm 
+  ! three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) : Lk Rm Imj Iji + Lj Ri Imj Ikm + Lm Rj Iji Ikm 
+  !
+  END_DOC
+  ! Comment notation below: w = final_weight_at_r_vector(ipoint), L_a / R_a = mos_l / mos_r_in_r_array_transp(ipoint,a), I(a,b) = int2_grad1_u12_bimo_t(ipoint,1:3,a,b)
+  implicit none
+  integer                       :: ipoint, i, j, k, l, m
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:)
+  double precision, allocatable :: tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp4(:,:,:)
+  double precision, allocatable :: tmp5(:,:)
+  double precision, allocatable :: tmp_3d(:,:,:)
+
+  print *, ' Providing the O(N^4) three_e_4_idx_bi_ort ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+  ! tmp_4d receives each (mo_num^2 x mo_num^2) dgemm product before it is scattered into the output tensors
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
+  ! grid intermediates, stored as (ipoint, x, l, i) with x = 1..3
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1, tmp2, tmp3)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        ! tmp1 = w * I(l,l) * L_i
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        ! tmp2 = I(l,l) * R_i
+        tmp2(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        ! tmp3 = I(l,i) * R_l
+        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, tmp2(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+  ! tmp_4d(a,b,c,d) = sum_{ipoint,x} tmp1(ipoint,x,a,b) * tmp2(ipoint,x,c,d)  ->  direct term "Lk Ri Imm Ijj"
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort_n4(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+  ! tmp_4d(a,b,c,d) = sum_{ipoint,x} tmp3(ipoint,x,a,b) * tmp1(ipoint,x,c,d)  ->  exch13 term "Lk Rm Imi Ijj"
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp3(1,1,1,1), 3*n_points_final_grid, tmp1(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) = -tmp_4d(m,i,j,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+
+  ! tmp1 is refilled with w * I(i,l) * L_l; tmp1 x tmp2 then gives the exch13 term "Lm Ri Ijj Ikm"
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, tmp2(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp2)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) - tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+  ! tmp_4d(a,b,c,d) = sum_{ipoint,x} tmp1(ipoint,x,a,b) * tmp3(ipoint,x,c,d)  ->  cycle_1 term "Lm Rj Iji Ikm"
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, tmp3(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp3)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+
+  ! tmp1 is refilled with w * I(l,l) * L_i * R_i and contracted with int2_grad1_u12_bimo_t itself
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0                         &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp1)
+  ! subtract tmp_4d(m,j,k,i) ("Lj Rj Imm Iki") and tmp_4d(j,m,k,i) ("Lm Rm Ijj Iki")
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort_n4(m,j,k,i) = three_e_4_idx_direct_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp5(n_points_final_grid,mo_num))
+  ! tmp5(ipoint,i) = w * L_i * R_i
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, ipoint)                                       &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector,                         &
+  !$OMP         tmp5)
+  !$OMP DO
+  do i = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+      tmp5(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  allocate(tmp4(n_points_final_grid,mo_num,mo_num))
+  ! the next two terms are built one m-slab at a time, reusing tmp4 and tmp_3d as scratch
+  do m = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (i, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         int2_grad1_u12_bimo_t,                           &
+    !$OMP         tmp4)
+    !$OMP DO COLLAPSE(2)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          ! tmp4(ipoint,k,i) = I(k,m) . I(m,i)   (dot product over x)
+          tmp4(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0       &
+              , tmp5(1,1), n_points_final_grid, tmp4(1,1,1), n_points_final_grid &
+              , 0.d0, tmp_3d(1,1,1), mo_num)
+    ! tmp_3d(j,k,i) = sum_ipoint tmp5(ipoint,j) * tmp4(ipoint,k,i)  ->  exch13 term "Lj Rj Imi Ikm"
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+
+    ! cycle_1 term "Lj Ri Imj Ikm": tmp_3d(j,k,i) = sum_ipoint tmp4(ipoint,j,k) * mos_r_in_r_array_transp(ipoint,i)
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (j, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         mos_l_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp4)
+    !$OMP DO COLLAPSE(2)
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          ! tmp4(ipoint,j,k) = w * L_j * ( I(m,j) . I(k,m) )
+          tmp4(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
+              , tmp4, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
+              , 0.d0, tmp_3d, mo_num*mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+  enddo
+
+  deallocate(tmp5)
+  deallocate(tmp_3d)
+
+
+  ! cycle_1 term "Lk Rm Imj Iji", accumulated in place (alpha = -1.d0, beta = 1.d0) for each i
+  do i = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (m, j, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
+    !$OMP         mos_r_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp4)
+    !$OMP DO COLLAPSE(2)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          ! tmp4(ipoint,m,j) = w * R_m * ( I(m,j) . I(j,i) )
+          tmp4(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
+              , tmp4, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
+              , 1.d0, three_e_4_idx_cycle_1_bi_ort_n4(1,1,1,i), mo_num*mo_num)
+
+  enddo
+
+  deallocate(tmp4)
+
+
+!  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+!  do i = 1, mo_num
+!    do k = 1, mo_num
+!      do j = 1, mo_num
+!        do m = 1, mo_num
+!          three_e_4_idx_exch12_bi_ort (m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i)
+!          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END PARALLEL DO
+
+
+  call wall_time(wall1)
+  print *, ' wall time for O(N^4) three_e_4_idx_bi_ort', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort_n4 , (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch23_bi_ort_n4 (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_4_idx_exch23_bi_ort_n4 can be directly used to compute Slater rules with a + sign
+  !
+  ! three_e_4_idx_exch23_bi_ort_n4 (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
+  !
+  END_DOC
+  ! Comment notation below: w = final_weight_at_r_vector(ipoint), L_a / R_a = mos_l / mos_r_in_r_array_transp(ipoint,a), I(a,b) = int2_grad1_u12_bimo_t(ipoint,1:3,a,b)
+  implicit none
+  integer                       :: i, j, k, l, m, ipoint
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp5(:,:,:), tmp6(:,:,:)
+
+  print *, ' Providing the O(N^4) three_e_4_idx_exch23_bi_ort_n4 ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+  ! first term "Lk Ri Imj Ijm": tmp5 x tmp6, written straight into the output tensor with alpha = -1.d0
+  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp6(n_points_final_grid,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp5, tmp6)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        ! tmp5(ipoint,l,i) = I(l,i) . I(i,l)   (dot product over x)
+        tmp5(ipoint,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * int2_grad1_u12_bimo_t(ipoint,1,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,l,i) * int2_grad1_u12_bimo_t(ipoint,2,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,l,i) * int2_grad1_u12_bimo_t(ipoint,3,i,l) 
+        ! tmp6(ipoint,l,i) = w * L_l * R_i
+        tmp6(ipoint,l,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, -1.d0 &
+            , tmp5(1,1,1), n_points_final_grid, tmp6(1,1,1), n_points_final_grid &
+            , 0.d0, three_e_4_idx_exch23_bi_ort_n4(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp5)
+  deallocate(tmp6)
+
+  ! remaining two terms share one contraction: tmp_4d(a,b,c,d) = sum_{ipoint,x} tmp1(ipoint,x,a,b) * int2_grad1_u12_bimo_t(ipoint,x,c,d)
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+  ! tmp1(ipoint,x,l,i) = w * I_x(l,i) * L_i * R_l
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0                         &
+            , tmp1(1,1,1,1), 3*n_points_final_grid, int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid &
+            , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
+
+  deallocate(tmp1)
+  ! tmp_4d(m,j,k,i) -> "Lj Rm Imj Iki",  tmp_4d(j,m,k,i) -> "Lm Rj Ijm Iki"; both are subtracted from the first term
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch23_bi_ort_n4(m,j,k,i) = three_e_4_idx_exch23_bi_ort_n4(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  call wall_time(wall1)
+  print *, ' wall time for O(N^4) three_e_4_idx_exch23_bi_ort_n4', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+

From ba65e672166d5f9f41cebdf28b05f26f3adfef61 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 9 Jun 2023 22:05:55 +0200
Subject: [PATCH 64/79] 4-idx tensors seems to be correct

---
 src/bi_ort_ints/three_body_ijmk.irp.f    |  2 +-
 src/bi_ort_ints/three_body_ijmk_n4.irp.f | 38 +++++++++++++-----------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 0d466f9f..669861b7 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -194,7 +194,7 @@
           tmp2(ipoint,1,n) = int2_grad1_u12_bimo_t(ipoint,1,k,n)
           tmp2(ipoint,2,n) = int2_grad1_u12_bimo_t(ipoint,2,k,n)
           tmp2(ipoint,3,n) = int2_grad1_u12_bimo_t(ipoint,3,k,n)
-          tmp2(ipoint,4,n) = mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n)
+          tmp2(ipoint,4,n) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,n)
         enddo
       enddo
       !$OMP END DO
diff --git a/src/bi_ort_ints/three_body_ijmk_n4.irp.f b/src/bi_ort_ints/three_body_ijmk_n4.irp.f
index 157b70f4..e3faeff0 100644
--- a/src/bi_ort_ints/three_body_ijmk_n4.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk_n4.irp.f
@@ -1,11 +1,11 @@
 
 ! ---
 
- BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort_n4 , (mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort_n4 , (mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort_n4, (mo_num, mo_num, mo_num, mo_num)]
-!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
-!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort_n4 ,  (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort_n4 ,  (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort_n4,  (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort_n4,  (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort_n4, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -13,11 +13,11 @@
   !
   ! three_e_4_idx_direct_bi_ort_n4 (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
   ! three_e_4_idx_exch13_bi_ort_n4 (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  ! three_e_4_idx_exch12_bi_ort (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !                                       = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i) 
+  ! three_e_4_idx_exch12_bi_ort_n4 (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                          = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i) 
   ! three_e_4_idx_cycle_1_bi_ort_n4(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !                                       = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
+  ! three_e_4_idx_cycle_2_bi_ort_n4(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                          = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
   !
   ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort_n4 can be directly used to compute Slater rules with a + sign
   !
@@ -77,6 +77,7 @@
   !$OMP END DO
   !$OMP END PARALLEL
 
+
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
             , tmp1(1,1,1,1), 3*n_points_final_grid, tmp2(1,1,1,1), 3*n_points_final_grid &
             , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
@@ -97,7 +98,6 @@
             , tmp3(1,1,1,1), 3*n_points_final_grid, tmp1(1,1,1,1), 3*n_points_final_grid &
             , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
 
-  deallocate(tmp1)
 
   !$OMP PARALLEL DO PRIVATE(i,j,k,m)
   do i = 1, mo_num
@@ -133,10 +133,12 @@
   !$OMP END DO
   !$OMP END PARALLEL
 
+
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0        &
             , tmp1(1,1,1,1), 3*n_points_final_grid, tmp2(1,1,1,1), 3*n_points_final_grid &
             , 0.d0, tmp_4d(1,1,1,1), mo_num*mo_num)
 
+
   deallocate(tmp2)
 
   !$OMP PARALLEL DO PRIVATE(i,j,k,m)
@@ -202,7 +204,7 @@
     do k = 1, mo_num
       do j = 1, mo_num
         do m = 1, mo_num
-          three_e_4_idx_direct_bi_ort_n4(m,j,k,i) = three_e_4_idx_direct_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+          three_e_4_idx_direct_bi_ort_n4(m,j,k,i) = three_e_4_idx_direct_bi_ort_n4(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
         enddo
       enddo
     enddo
@@ -294,9 +296,9 @@
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
-              , tmp4, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
-              , 0.d0, tmp_3d, mo_num*mo_num)
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0                          &
+              , tmp4(1,1,1), n_points_final_grid, mos_r_in_r_array_transp(1,1), n_points_final_grid &
+              , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
     !$OMP PARALLEL DO PRIVATE(i,j,k)
     do i = 1, mo_num
@@ -339,8 +341,8 @@
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
-              , tmp4, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0                         &
+              , tmp4(1,1,1), n_points_final_grid, mos_l_in_r_array_transp(1,1), n_points_final_grid &
               , 1.d0, three_e_4_idx_cycle_1_bi_ort_n4(1,1,1,i), mo_num*mo_num)
 
   enddo
@@ -353,8 +355,8 @@
 !    do k = 1, mo_num
 !      do j = 1, mo_num
 !        do m = 1, mo_num
-!          three_e_4_idx_exch12_bi_ort (m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i)
-!          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
+!          three_e_4_idx_exch12_bi_ort_n4 (m,j,k,i) = three_e_4_idx_exch13_bi_ort_n4 (j,m,k,i)
+!          three_e_4_idx_cycle_2_bi_ort_n4(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort_n4(j,m,k,i)
 !        enddo
 !      enddo
 !    enddo

From 6e31ca280d5a11db7b09c5fa04e2f36a7d11c39f Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 10 Jun 2023 10:42:32 +0200
Subject: [PATCH 65/79] // in Norm_Ord

---
 .../normal_ordered_contractions.irp.f         | 289 +++++++++++++++++-
 1 file changed, 280 insertions(+), 9 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
index 855cfd17..6f70516d 100644
--- a/src/tc_bi_ortho/normal_ordered_contractions.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -1,7 +1,7 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+BEGIN_PROVIDER [ double precision, no_aba_contraction_v0, (mo_num,mo_num,mo_num,mo_num)]
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
 
@@ -16,7 +16,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
   double precision,  allocatable :: tmp_2d(:,:)
 
-  print*,' Providing no_aba_contraction ...'
+  print*,' Providing no_aba_contraction_v0 ...'
   call wall_time(wall0)
 
   PROVIDE N_int
@@ -102,7 +102,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
           enddo
         enddo
       enddo
@@ -153,7 +153,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP PARALLEL DO PRIVATE(h2,p2)
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
           enddo
         enddo
         !$OMP END PARALLEL DO
@@ -220,7 +220,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
@@ -270,7 +270,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              no_aba_contraction_v0(p2,h2,p1,h1) = no_aba_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -285,11 +285,11 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpval_1, tmpval_2)
   deallocate(tmpvec_1, tmpvec_2)
 
-  no_aba_contraction = -0.5d0 * no_aba_contraction
-  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+  no_aba_contraction_v0 = -0.5d0 * no_aba_contraction_v0
+  call sum_A_At(no_aba_contraction_v0(1,1,1,1), mo_num*mo_num)
 
   call wall_time(wall1)
-  print*,' Wall time for no_aba_contraction', wall1-wall0
+  print*,' Wall time for no_aba_contraction_v0', wall1-wall0
 
 END_PROVIDER
 
@@ -1060,3 +1060,274 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
 END_PROVIDER
 
 ! ---
+
+BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aba_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+  !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+  !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+  !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         no_aba_contraction)
+
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+  !$OMP DO
+
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    do h1 = 1, mo_num
+
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+      enddo
+
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+        enddo
+      enddo
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+
+      do p1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                           ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                           + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                           + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                           - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                           - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                           - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+        enddo
+
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  !$OMP END DO
+
+  deallocate(tmp_3d, tmp_2d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+
+  !$OMP END PARALLEL
+
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+
+  ! purely open-shell part 
+  if(Ne(2) < Ne(1)) then
+
+    do ii = Ne(2) + 1, Ne(1)
+      i = occ(ii,1)
+
+      do h1 = 1, mo_num
+
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        do p1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                    , tmp2(1,1), n_points_final_grid                       &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo !i
+  endif
+
+  deallocate(tmp_2d, tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+
+  no_aba_contraction = -0.5d0 * no_aba_contraction
+  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aba_contraction', wall1-wall0
+
+END_PROVIDER

From d9921922fc00efd0146aa5669219c15bb0c408e9 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 10 Jun 2023 11:24:06 +0200
Subject: [PATCH 66/79] NO aba // ok

---
 .../normal_ordered_contractions.irp.f         | 118 +++++++++---------
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f        |  44 ++++++-
 2 files changed, 101 insertions(+), 61 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
index 6f70516d..980181e7 100644
--- a/src/tc_bi_ortho/normal_ordered_contractions.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -1104,12 +1104,20 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
   !$OMP         no_aba_contraction)
 
-
   allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
   allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
   allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
   allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
 
+  tmp_3d   = 0.d0
+  tmp_2d   = 0.d0
+  tmp1     = 0.d0
+  tmp2     = 0.d0
+  tmpval_1 = 0.d0
+  tmpval_2 = 0.d0
+  tmpvec_1 = 0.d0
+  tmpvec_2 = 0.d0 
+
   !$OMP DO
 
   do ii = 1, Ne(2)
@@ -1147,7 +1155,9 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
         do h2 = 1, mo_num
           do p2 = 1, mo_num
+            !$OMP CRITICAL
             no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            !$OMP END CRITICAL
           enddo
         enddo
       enddo
@@ -1177,7 +1187,9 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
 
         do h2 = 1, mo_num
           do p2 = 1, mo_num
+            !$OMP CRITICAL
             no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            !$OMP END CRITICAL
           enddo
         enddo
 
@@ -1195,28 +1207,40 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   !$OMP END PARALLEL
 
 
-  allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
-  allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
-  allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
-
-
   ! purely open-shell part 
   if(Ne(2) < Ne(1)) then
 
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+    !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+    !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+    !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         no_aba_contraction)
+
+    Allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+    Allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+    Allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+    Allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+    Tmp_3d   = 0.d0
+    Tmp_2d   = 0.d0
+    Tmp1     = 0.d0
+    Tmp2     = 0.d0
+    Tmpval_1 = 0.d0
+    Tmpval_2 = 0.d0
+    Tmpvec_1 = 0.d0
+    Tmpvec_2 = 0.d0 
+
+    !$OMP DO
+
     do ii = Ne(2) + 1, Ne(1)
       i = occ(ii,1)
 
       do h1 = 1, mo_num
 
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT (NONE)                                            &
-        !$OMP PRIVATE (ipoint)                                          &
-        !$OMP SHARED (n_points_final_grid, i, h1,                       &
-        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
-        !$OMP DO
         do ipoint = 1, n_points_final_grid
           tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
           tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
@@ -1227,16 +1251,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
           tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
           tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
         enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
 
-        !$OMP PARALLEL                                                &
-        !$OMP DEFAULT (NONE)                                          &
-        !$OMP PRIVATE (p1, ipoint)                                    &
-        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
-        !$OMP DO 
         do p1 = 1, mo_num
           do ipoint = 1, n_points_final_grid
             tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
@@ -1247,82 +1262,65 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
                               + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
           enddo
         enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
 
         call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
                   , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
                   , tmp1(1,1,1), 3*n_points_final_grid                            &
                   , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
-        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
+              !$OMP CRITICAL
               no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
             enddo
           enddo
         enddo
-        !$OMP END PARALLEL DO
 
         do p1 = 1, mo_num
 
-          ! to minimize the number of operations
-          !$OMP PARALLEL                                                  &
-          !$OMP DEFAULT (NONE)                                            &
-          !$OMP PRIVATE (ipoint)                                          &
-          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-          !$OMP         tmpval_1)
-          !$OMP DO
           do ipoint = 1, n_points_final_grid
-            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                  + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                  + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
-                                                                  - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
-                                                                  - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
-                                                                  - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                             ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
           enddo
-          !$OMP END DO
-          !$OMP END PARALLEL
 
-          !$OMP PARALLEL                             &
-          !$OMP DEFAULT (NONE)                       &
-          !$OMP PRIVATE (h2, ipoint)                 &
-          !$OMP SHARED (mo_num, n_points_final_grid, &
-          !$OMP         mos_r_in_r_array_transp,     &
-          !$OMP         tmpval_1, tmp2)
-          !$OMP DO 
           do h2 = 1, mo_num
             do ipoint = 1, n_points_final_grid
               tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
             enddo
           enddo
-          !$OMP END DO
-          !$OMP END PARALLEL
 
           call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
                     , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
                     , tmp2(1,1), n_points_final_grid                       &
                     , 0.d0, tmp_2d(1,1), mo_num)
 
-          !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
+              !$OMP CRITICAL
               no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
             enddo
           enddo
-          !$OMP END PARALLEL DO
 
         enddo ! p1
       enddo ! h1
     enddo !i
-  endif
+    !$OMP END DO
 
-  deallocate(tmp_2d, tmp_3d)
-  deallocate(tmp1, tmp2)
-  deallocate(tmpval_1, tmpval_2)
-  deallocate(tmpvec_1, tmpvec_2)
+    deallocate(tmp_3d, tmp_2d)
+    deallocate(tmp1, tmp2)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2)
+
+    !$OMP END PARALLEL
+  endif
 
   no_aba_contraction = -0.5d0 * no_aba_contraction
   call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index 33b5c5aa..a3cb1692 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -18,7 +18,8 @@ program tc_bi_ortho
 ! call timing_single
 ! call timing_double
 
-  call test_no()
+  !call test_no()
+  call test_no_aba()
 end
 
 subroutine test_h_u0
@@ -297,4 +298,45 @@ end
 
 ! ---
 
+subroutine test_no_aba()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  print*, ' testing no_aba_contraction ...'
+
+  thr = 1d-8
+
+  PROVIDE no_aba_contraction_v0
+  PROVIDE no_aba_contraction
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = no_aba_contraction   (l,k,j,i)
+          ref = no_aba_contraction_v0(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on no_aba_contraction'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on no_aba_contraction = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---
+
 

From 92a72a096840c829d9ae5bb8ec0d683bc62ec0d9 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 10 Jun 2023 11:38:41 +0200
Subject: [PATCH 67/79] no aab //

---
 .../normal_ordered_contractions.irp.f         | 210 +++++++++++++++++-
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f        |  43 ++++
 2 files changed, 243 insertions(+), 10 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
index 980181e7..f066c958 100644
--- a/src/tc_bi_ortho/normal_ordered_contractions.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -295,7 +295,7 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
+BEGIN_PROVIDER [ double precision, no_aab_contraction_v0, (mo_num,mo_num,mo_num,mo_num)]
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
 
@@ -310,7 +310,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
   double precision,  allocatable :: tmp_2d(:,:)
 
-  print*,' Providing no_aab_contraction ...'
+  print*,' Providing no_aab_contraction_v0 ...'
   call wall_time(wall0)
 
   PROVIDE N_int
@@ -387,7 +387,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            no_aab_contraction_v0(p2,h2,p1,h1) = no_aab_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
           enddo
         enddo
       enddo
@@ -435,7 +435,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP PARALLEL DO PRIVATE(h2,p2)
         do h2 = 1, mo_num
           do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            no_aab_contraction_v0(p2,h2,p1,h1) = no_aab_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
           enddo
         enddo
         !$OMP END PARALLEL DO
@@ -449,19 +449,19 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpval_1)
   deallocate(tmpvec_1)
 
-  no_aab_contraction = -0.5d0 * no_aab_contraction
+  no_aab_contraction_v0 = -0.5d0 * no_aab_contraction_v0
 
   !$OMP PARALLEL                 &
   !$OMP DEFAULT (NONE)           &
   !$OMP PRIVATE (h1, h2, p1, p2) & 
-  !$OMP SHARED (no_aab_contraction, mo_num)
+  !$OMP SHARED (no_aab_contraction_v0, mo_num)
 
   !$OMP DO 
   do h1 = 1, mo_num
     do h2 = 1, mo_num
       do p1 = 1, mo_num
         do p2 = p1, mo_num
-          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+          no_aab_contraction_v0(p2,h2,p1,h1) -= no_aab_contraction_v0(p1,h2,p2,h1)
         enddo
       enddo
     enddo
@@ -473,7 +473,7 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
     do h2 = 1, mo_num
       do p1 = 2, mo_num
         do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+          no_aab_contraction_v0(p2,h2,p1,h1) = -no_aab_contraction_v0(p1,h2,p2,h1)
         enddo
       enddo
     enddo
@@ -485,15 +485,16 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
     do h2 = h1+1, mo_num
       do p1 = 2, mo_num
         do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+          no_aab_contraction_v0(p2,h2,p1,h1) *= -1.d0
         enddo
       enddo
     enddo
   enddo
+  !$OMP END DO
   !$OMP END PARALLEL
 
   call wall_time(wall1)
-  print*,' Wall time for no_aab_contraction', wall1-wall0
+  print*,' Wall time for no_aab_contraction_v0', wall1-wall0
 
 END_PROVIDER
 
@@ -1329,3 +1330,192 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   print*,' Wall time for no_aba_contraction', wall1-wall0
 
 END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
+  ! Sums two grid contractions over the orbitals listed in occ(1:Ne(2),2), scales by -0.5, then applies the (p1,p2)/(h1,h2) sign passes.
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+  ! Expected to agree with no_aab_contraction_v0 (the two providers are compared in test_no_aab).
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aab_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+  ! The loops below only accumulate into the provider, so it has to start from zero.
+  no_aab_contraction = 0.d0
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, ii, i, h1, p1, h2, p2,                   &
+  !$OMP          tmp_2d, tmp_3d, tmp1, tmp2,                      &
+  !$OMP          tmpval_1, tmpvec_1)                              &
+  !$OMP SHARED (n_points_final_grid, mo_num, Ne, occ,             &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         no_aab_contraction)
+
+  ! Work arrays are PRIVATE: each thread allocates its own copies.
+  allocate(tmp_2d(mo_num,mo_num))
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+
+  tmp_2d   = 0.d0
+  tmp_3d   = 0.d0
+  tmp1     = 0.d0
+  tmp2     = 0.d0
+  tmpval_1 = 0.d0
+  tmpvec_1 = 0.d0
+  ! The ii loop is shared among threads; all of them add into the same shared elements (hence CRITICAL).
+  !$OMP DO
+
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    do h1 = 1, mo_num
+
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+      enddo
+
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+        enddo
+      enddo
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP CRITICAL
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END CRITICAL
+
+      do p1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+        enddo
+
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP CRITICAL
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END CRITICAL
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  !$OMP END DO
+
+  deallocate(tmp_2d, tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1)
+  deallocate(tmpvec_1)
+
+  !$OMP END PARALLEL
+
+  no_aab_contraction = -0.5d0 * no_aab_contraction
+
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (h1, h2, p1, p2) &
+  !$OMP SHARED (no_aab_contraction, mo_num)
+  ! Pass 1: for p2 >= p1, subtract the element with p1 and p2 exchanged.
+  !$OMP DO
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 1, mo_num
+        do p2 = p1, mo_num
+          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  ! Pass 2: fill the p2 < p1 part with minus the p1 <-> p2 exchanged element.
+  !$OMP DO
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  ! Pass 3: flip the sign of the p2 < p1 elements whenever h2 > h1.
+  !$OMP DO
+  do h1 = 1, mo_num-1
+    do h2 = h1+1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aab_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index a3cb1692..4f190407 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -20,6 +20,7 @@ program tc_bi_ortho
 
   !call test_no()
   call test_no_aba()
+  call test_no_aab()
 end
 
 subroutine test_h_u0
@@ -340,3 +341,45 @@ end
 ! ---
 
 
+subroutine test_no_aab()
+
+  ! Element-by-element comparison of no_aab_contraction against
+  ! no_aab_contraction_v0: stops at the first absolute difference above
+  ! thr, otherwise prints the mean absolute difference.
+
+  implicit none
+  integer                     :: h1, p1, h2, p2
+  double precision            :: diff, sum_diff, val_new, val_ref
+  double precision, parameter :: thr = 1d-8
+
+  print*, ' testing no_aab_contraction ...'
+
+  PROVIDE no_aab_contraction_v0
+  PROVIDE no_aab_contraction
+
+  sum_diff = 0.d0
+  do h1 = 1, mo_num
+    do p1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p2 = 1, mo_num
+          val_ref  = no_aab_contraction_v0(p2,h2,p1,h1)
+          val_new  = no_aab_contraction   (p2,h2,p1,h1)
+          diff     = dabs(val_new - val_ref)
+          sum_diff = sum_diff + diff
+          if(diff .gt. thr) then
+            print*, ' problem on no_aab_contraction'
+            print*, p2, h2, p1, h1
+            print*, val_ref, val_new, diff
+            stop
+          endif
+        enddo
+      enddo
+    enddo
+  enddo
+
+  print*, ' accu on no_aab_contraction = ', sum_diff / dble(mo_num)**4
+end
+
+! ---
+
+

From caa11f20ea4a9aa812e6bc0c6dcd2faa3e0d485b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 10 Jun 2023 11:56:07 +0200
Subject: [PATCH 68/79] Fixed singles when no beta exc

---
 scripts/compilation/cache_compile.py |   2 +-
 src/cipsi/selection.irp.f            | 425 ++++-----------------------
 src/cipsi/selection_old.irp.f        | 350 ++++++++++++++++++++++
 src/cipsi/selection_singles.irp.f    | 356 ++++++++++++++++++++++
 src/ezfio_files/00.create.bats       |  28 ++
 src/fci/40.fci.bats                  | 113 ++++---
 src/hartree_fock/10.hf.bats          |  29 ++
 src/tools/print_wf.irp.f             |   1 +
 tests/input/h2_1.xyz                 |   6 +
 tests/input/h2_3.xyz                 |   6 +
 tests/input/h3_2.xyz                 |   7 +
 tests/input/h3_4.xyz                 |   7 +
 tests/input/h4_1.xyz                 |   7 +
 tests/input/h4_3.xyz                 |   7 +
 tests/input/h4_5.xyz                 |   7 +
 15 files changed, 947 insertions(+), 404 deletions(-)
 create mode 100644 src/cipsi/selection_old.irp.f
 create mode 100644 src/cipsi/selection_singles.irp.f
 create mode 100644 tests/input/h2_1.xyz
 create mode 100644 tests/input/h2_3.xyz
 create mode 100644 tests/input/h3_2.xyz
 create mode 100644 tests/input/h3_4.xyz
 create mode 100644 tests/input/h4_1.xyz
 create mode 100644 tests/input/h4_3.xyz
 create mode 100644 tests/input/h4_5.xyz

diff --git a/scripts/compilation/cache_compile.py b/scripts/compilation/cache_compile.py
index 440f6498..473976e7 100755
--- a/scripts/compilation/cache_compile.py
+++ b/scripts/compilation/cache_compile.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Save the .o from a .f90
-and is the .o is asked a second time, retur it
+and if the .o is asked a second time, return it
 Take in argv command like:
      ifort -g  -openmp -I IRPF90_temp/Ezfio_files/ -c IRPF90_temp/Integrals_Monoelec/kin_ao_ints.irp.module.F90 -o IRPF90_temp/Integrals_Monoelec/kin_ao_ints.irp.module.o
 """
diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 0705d103..b8fa2895 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -88,6 +88,10 @@ subroutine select_connected(i_generator,E0,pt2_data,b,subset,csubset)
       particle_mask(k,1) = iand(generators_bitmask(k,1,s_part), not(psi_det_generators(k,1,i_generator)) )
       particle_mask(k,2) = iand(generators_bitmask(k,2,s_part), not(psi_det_generators(k,2,i_generator)) )
   enddo
+  if ((subset == 1).and.(sum(hole_mask(:,2)) == 0_bit_kind)) then
+     ! No beta electron to excite
+     call select_singles(i_generator,hole_mask,particle_mask,fock_diag_tmp,E0,pt2_data,b)
+  endif
   call select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_diag_tmp,E0,pt2_data,b,subset,csubset)
   deallocate(fock_diag_tmp)
 end subroutine
@@ -142,7 +146,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
   use selection_types
   implicit none
   BEGIN_DOC
-!            WARNING /!\ : It is assumed that the generators and selectors are psi_det_sorted
+!  WARNING /!\ : It is assumed that the generators and selectors are psi_det_sorted
   END_DOC
 
   integer, intent(in)            :: i_generator, subset, csubset
@@ -237,7 +241,6 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
   enddo
 
   ! Iterate on 0S alpha, and find betas TQ such that exc_degree <= 4
-  ! Remove also contributions < 1.d-20)
   do j=1,N_det_alpha_unique
     call get_excitation_degree_spin(psi_det_alpha_unique(1,j),       &
         psi_det_generators(1,1,i_generator), nt, N_int)
@@ -480,7 +483,9 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
       do s2=s1,2
         sp = s1
 
-        if(s1 /= s2) sp = 3
+        if(s1 /= s2) then
+          sp = 3
+        endif
 
         ib = 1
         if(s1 == s2) ib = i1+1
@@ -528,7 +533,10 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
   deallocate(preinteresting, prefullinteresting, interesting, fullinteresting)
   deallocate(banned, bannedOrb,mat)
 end subroutine
-subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_diag_tmp, E0, pt2_data, mat, buf)
+
+BEGIN_TEMPLATE
+
+subroutine fill_buffer_$DOUBLE(i_generator, sp, h1, h2, bannedOrb, banned, fock_diag_tmp, E0, pt2_data, mat, buf)
   use bitmasks
   use selection_types
   implicit none
@@ -562,7 +570,20 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
     s1 = sp
     s2 = sp
   end if
-  call apply_holes(psi_det_generators(1,1,i_generator), s1, h1, s2, h2, mask, ok, N_int)
+
+  if ($IS_DOUBLE) then  ! double variant of the template: a second hole index is required
+    if (h2 == 0) then
+       print *, 'h2=0 in '//trim(irp_here)
+       stop
+    endif
+    call apply_holes(psi_det_generators(1,1,i_generator), s1, h1, s2, h2, mask, ok, N_int)
+  else  ! single variant of the template: only h1 is applied, so h2 must be 0
+    if (h2 /= 0) then
+       print *, 'h2 /= 0 in '//trim(irp_here)
+       stop
+    endif
+    call apply_hole(psi_det_generators(1,1,i_generator), s1, h1, mask, ok, N_int)
+  endif
   E_shift = 0.d0
 
   if (h0_type == 'CFG') then
@@ -570,12 +591,15 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
     E_shift = psi_det_Hii(i_generator) - psi_configuration_Hii(j)
   endif
 
-  do p1=1,mo_num
-    if(bannedOrb(p1, s1)) cycle
+  $DO_p1
+! do p1=1,mo_num
+
+    if (bannedOrb(p1, s1)) cycle
     ib = 1
     if(sp /= 3) ib = p1+1
 
-    do p2=ib,mo_num
+    $DO_p2
+  ! do p2=ib,mo_num
 
 ! -----
 ! /!\ Generating only single excited determinants doesn't work because a
@@ -584,9 +608,10 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
 ! detected as already generated when generating in the future with a
 ! double excitation.
 ! -----
-
-      if(bannedOrb(p2, s2)) cycle
-      if(banned(p1,p2)) cycle
+      if ($IS_DOUBLE) then
+        if(bannedOrb(p2, s2)) cycle
+        if(banned(p1,p2)) cycle
+      endif
 
       if(pseudo_sym)then
         if(dabs(mat(1, p1, p2)).lt.thresh_sym)then
@@ -596,7 +621,11 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
 
       val = maxval(abs(mat(1:N_states, p1, p2)))
       if( val == 0d0) cycle
-      call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
+      if ($IS_DOUBLE) then
+        call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
+      else
+        call apply_particle(mask, s1, p1, det, ok, N_int)
+      endif
 
       if (do_only_cas) then
         integer, external :: number_of_holes, number_of_particles
@@ -797,7 +826,7 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
           case(5)
             ! Variance selection
             if (h0_type == 'CFG') then
-              w = min(w, - alpha_h_psi * alpha_h_psi * s_weight(istate,istate)) & 
+              w = min(w, - alpha_h_psi * alpha_h_psi * s_weight(istate,istate)) &
                 / c0_weight(istate)
             else
               w = min(w, - alpha_h_psi * alpha_h_psi * s_weight(istate,istate))
@@ -857,10 +886,19 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
       if(w <= buf%mini) then
         call add_to_selection_buffer(buf, det, w)
       end if
-    end do
-  end do
+  ! enddo
+    $ENDDO_p1
+! enddo
+  $ENDDO_p2
 end
 
+SUBST [ DOUBLE , DO_p1 , ENDDO_p1 , DO_p2 , ENDDO_p2 , IS_DOUBLE ]
+
+double ; do p1=1,mo_num ; enddo ; do p2=ib,mo_num ; enddo ; .True.  ;;
+single ; do p1=1,mo_num ; enddo ; p2=1            ;       ; .False. ;;
+
+END_TEMPLATE
+
 subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, interesting)
   use bitmasks
   implicit none
@@ -882,6 +920,7 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
 
   PROVIDE psi_selectors_coef_transp psi_det_sorted
   mat = 0d0
+  p=0
 
   do i=1,N_int
     negMask(i,1) = not(mask(i,1))
@@ -1435,7 +1474,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     p1 = p(1,sp)
     p2 = p(2,sp)
     do puti=1, mo_num
-      if(bannedOrb(puti, sp)) cycle
+      if (bannedOrb(puti, sp)) cycle
       call get_mo_two_e_integrals(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map)
       call get_mo_two_e_integrals(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map)
       do putj=puti+1, mo_num
@@ -1446,7 +1485,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           call i_h_j(gen, det, N_int, hij)
           if (hij == 0.d0) cycle
         else
-          hij = (mo_two_e_integral(p1, p2, puti, putj) -  mo_two_e_integral(p2, p1, puti, putj))
+          hij = hij_cache1(putj) - hij_cache2(putj)
           if (hij == 0.d0) cycle
           hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
         end if
@@ -1506,7 +1545,7 @@ subroutine spot_isinwf(mask, det, i_gen, N, banned, fullMatch, interesting)
   use bitmasks
   implicit none
   BEGIN_DOC
-! Identify the determinants in det which are in the internal space. These are
+! Identify the determinants in det that are in the internal space. These are
 ! the determinants that can be produced by creating two particles on the mask.
   END_DOC
 
@@ -1534,7 +1573,7 @@ subroutine spot_isinwf(mask, det, i_gen, N, banned, fullMatch, interesting)
       if(iand(det(j,2,i), mask(j,2)) /= mask(j, 2)) cycle genl
     end do
 
-    ! If det(i) < det(i_gen), it hs already been considered
+    ! If det(i) < det(i_gen), it has already been considered
     if(interesting(i) < i_gen) then
       fullMatch = .true.
       return
@@ -1585,352 +1624,4 @@ end
 
 
 
-! OLD unoptimized routines for debugging
-! ======================================
-
-subroutine get_d0_reference(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
-  use bitmasks
-  implicit none
-
-  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
-  integer(bit_kind), intent(in) :: phasemask(N_int,2)
-  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
-  integer(bit_kind) :: det(N_int, 2)
-  double precision, intent(in) :: coefs(N_states)
-  double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
-  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
-
-  integer :: i, j, s, h1, h2, p1, p2, puti, putj
-  double precision :: hij, phase
-  double precision, external :: get_phase_bi, mo_two_e_integral
-  logical :: ok
-
-  integer :: bant
-  bant = 1
-
-
-  if(sp == 3) then ! AB
-    h1 = p(1,1)
-    h2 = p(1,2)
-    do p1=1, mo_num
-      if(bannedOrb(p1, 1)) cycle
-      do p2=1, mo_num
-        if(bannedOrb(p2,2)) cycle
-        if(banned(p1, p2, bant)) cycle ! rentable?
-        if(p1 == h1 .or. p2 == h2) then
-          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
-          call i_h_j(gen, det, N_int, hij)
-        else
-          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
-          hij = mo_two_e_integral(p1, p2, h1, h2) * phase
-        end if
-        mat(:, p1, p2) = mat(:, p1, p2) + coefs(:) * hij
-      end do
-    end do
-  else ! AA BB
-    p1 = p(1,sp)
-    p2 = p(2,sp)
-    do puti=1, mo_num
-      if(bannedOrb(puti, sp)) cycle
-      do putj=puti+1, mo_num
-        if(bannedOrb(putj, sp)) cycle
-        if(banned(puti, putj, bant)) cycle ! rentable?
-        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
-          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
-          call i_h_j(gen, det, N_int, hij)
-        else
-          hij = (mo_two_e_integral(p1, p2, puti, putj) -  mo_two_e_integral(p2, p1, puti, putj))* get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
-        end if
-        mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
-      end do
-    end do
-  end if
-end
-
-subroutine get_d1_reference(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
-  use bitmasks
-  implicit none
-
-  integer(bit_kind), intent(in)  :: mask(N_int, 2), gen(N_int, 2)
-  integer(bit_kind), intent(in)  :: phasemask(N_int,2)
-  logical, intent(in)            :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
-  integer(bit_kind)              :: det(N_int, 2)
-  double precision, intent(in)   :: coefs(N_states)
-  double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
-  integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
-  double precision               :: hij, tmp_row(N_states, mo_num), tmp_row2(N_states, mo_num)
-  double precision, external     :: get_phase_bi, mo_two_e_integral
-  logical                        :: ok
-
-  logical, allocatable           :: lbanned(:,:)
-  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j
-  integer                        :: hfix, pfix, h1, h2, p1, p2, ib
-
-  integer, parameter             :: turn2(2) = (/2,1/)
-  integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
-
-  integer                        :: bant
-
-
-  allocate (lbanned(mo_num, 2))
-  lbanned = bannedOrb
-
-  do i=1, p(0,1)
-    lbanned(p(i,1), 1) = .true.
-  end do
-  do i=1, p(0,2)
-    lbanned(p(i,2), 2) = .true.
-  end do
-
-  ma = 1
-  if(p(0,2) >= 2) ma = 2
-  mi = turn2(ma)
-
-  bant = 1
-
-  if(sp == 3) then
-    !move MA
-    if(ma == 2) bant = 2
-    puti = p(1,mi)
-    hfix = h(1,ma)
-    p1 = p(1,ma)
-    p2 = p(2,ma)
-    if(.not. bannedOrb(puti, mi)) then
-      tmp_row = 0d0
-      do putj=1, hfix-1
-        if(lbanned(putj, ma) .or. banned(putj, puti,bant)) cycle
-        hij = (mo_two_e_integral(p1, p2, putj, hfix)-mo_two_e_integral(p2,p1,putj,hfix)) * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
-        tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states)
-      end do
-      do putj=hfix+1, mo_num
-        if(lbanned(putj, ma) .or. banned(putj, puti,bant)) cycle
-        hij = (mo_two_e_integral(p1, p2, hfix, putj)-mo_two_e_integral(p2,p1,hfix,putj)) * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
-        tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states)
-      end do
-
-      if(ma == 1) then
-        mat(1:N_states,1:mo_num,puti) = mat(1:N_states,1:mo_num,puti) + tmp_row(1:N_states,1:mo_num)
-      else
-        mat(1:N_states,puti,1:mo_num) = mat(1:N_states,puti,1:mo_num) + tmp_row(1:N_states,1:mo_num)
-      end if
-    end if
-
-    !MOVE MI
-    pfix = p(1,mi)
-    tmp_row = 0d0
-    tmp_row2 = 0d0
-    do puti=1,mo_num
-      if(lbanned(puti,mi)) cycle
-      !p1 fixed
-      putj = p1
-      if(.not. banned(putj,puti,bant)) then
-        hij = mo_two_e_integral(p2,pfix,hfix,puti) * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
-        tmp_row(:,puti) = tmp_row(:,puti) + hij * coefs(:)
-      end if
-
-      putj = p2
-      if(.not. banned(putj,puti,bant)) then
-        hij = mo_two_e_integral(p1,pfix,hfix,puti) * get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
-        tmp_row2(:,puti) = tmp_row2(:,puti) + hij * coefs(:)
-      end if
-    end do
-
-    if(mi == 1) then
-      mat(:,:,p1) = mat(:,:,p1) + tmp_row(:,:)
-      mat(:,:,p2) = mat(:,:,p2) + tmp_row2(:,:)
-    else
-      mat(:,p1,:) = mat(:,p1,:) + tmp_row(:,:)
-      mat(:,p2,:) = mat(:,p2,:) + tmp_row2(:,:)
-    end if
-  else
-    if(p(0,ma) == 3) then
-      do i=1,3
-        hfix = h(1,ma)
-        puti = p(i, ma)
-        p1 = p(turn3(1,i), ma)
-        p2 = p(turn3(2,i), ma)
-        tmp_row = 0d0
-        do putj=1,hfix-1
-          if(lbanned(putj,ma) .or. banned(puti,putj,1)) cycle
-          hij = (mo_two_e_integral(p1, p2, putj, hfix)-mo_two_e_integral(p2,p1,putj,hfix)) * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
-          tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:)
-        end do
-        do putj=hfix+1,mo_num
-          if(lbanned(putj,ma) .or. banned(puti,putj,1)) cycle
-          hij = (mo_two_e_integral(p1, p2, hfix, putj)-mo_two_e_integral(p2,p1,hfix,putj)) * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
-          tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:)
-        end do
-
-        mat(:, :puti-1, puti) = mat(:, :puti-1, puti) + tmp_row(:,:puti-1)
-        mat(:, puti, puti:) = mat(:, puti, puti:) + tmp_row(:,puti:)
-      end do
-    else
-      hfix = h(1,mi)
-      pfix = p(1,mi)
-      p1 = p(1,ma)
-      p2 = p(2,ma)
-      tmp_row = 0d0
-      tmp_row2 = 0d0
-      do puti=1,mo_num
-        if(lbanned(puti,ma)) cycle
-        putj = p2
-        if(.not. banned(puti,putj,1)) then
-          hij = mo_two_e_integral(pfix, p1, hfix, puti) * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
-          tmp_row(:,puti) = tmp_row(:,puti) + hij * coefs(:)
-        end if
-
-        putj = p1
-        if(.not. banned(puti,putj,1)) then
-          hij = mo_two_e_integral(pfix, p2, hfix, puti) * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
-          tmp_row2(:,puti) = tmp_row2(:,puti) + hij * coefs(:)
-        end if
-      end do
-      mat(:,:p2-1,p2) = mat(:,:p2-1,p2) + tmp_row(:,:p2-1)
-      mat(:,p2,p2:) = mat(:,p2,p2:) + tmp_row(:,p2:)
-      mat(:,:p1-1,p1) = mat(:,:p1-1,p1) + tmp_row2(:,:p1-1)
-      mat(:,p1,p1:) = mat(:,p1,p1:) + tmp_row2(:,p1:)
-    end if
-  end if
-  deallocate(lbanned)
-
- !! MONO
-    if(sp == 3) then
-      s1 = 1
-      s2 = 2
-    else
-      s1 = sp
-      s2 = sp
-    end if
-
-    do i1=1,p(0,s1)
-      ib = 1
-      if(s1 == s2) ib = i1+1
-      do i2=ib,p(0,s2)
-        p1 = p(i1,s1)
-        p2 = p(i2,s2)
-        if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle
-        call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
-        call i_h_j(gen, det, N_int, hij)
-        mat(:, p1, p2) = mat(:, p1, p2) + coefs(:) * hij
-      end do
-    end do
-end
-
-subroutine get_d2_reference(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
-  use bitmasks
-  implicit none
-
-  integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2)
-  integer(bit_kind), intent(in) :: phasemask(2,N_int)
-  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
-  double precision, intent(in) :: coefs(N_states)
-  double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
-  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
-
-  double precision, external :: get_phase_bi, mo_two_e_integral
-
-  integer :: i, j, tip, ma, mi, puti, putj
-  integer :: h1, h2, p1, p2, i1, i2
-  double precision :: hij, phase
-
-  integer, parameter:: turn2d(2,3,4) = reshape((/0,0, 0,0, 0,0,  3,4, 0,0, 0,0,  2,4, 1,4, 0,0,  2,3, 1,3, 1,2 /), (/2,3,4/))
-  integer, parameter :: turn2(2) = (/2, 1/)
-  integer, parameter :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
-
-  integer :: bant
-  bant = 1
-
-  tip = p(0,1) * p(0,2)
-
-  ma = sp
-  if(p(0,1) > p(0,2)) ma = 1
-  if(p(0,1) < p(0,2)) ma = 2
-  mi = mod(ma, 2) + 1
-
-  if(sp == 3) then
-    if(ma == 2) bant = 2
-
-    if(tip == 3) then
-      puti = p(1, mi)
-      do i = 1, 3
-        putj = p(i, ma)
-        if(banned(putj,puti,bant)) cycle
-        i1 = turn3(1,i)
-        i2 = turn3(2,i)
-        p1 = p(i1, ma)
-        p2 = p(i2, ma)
-        h1 = h(1, ma)
-        h2 = h(2, ma)
-
-        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
-        if(ma == 1) then
-          mat(:, putj, puti) = mat(:, putj, puti) + coefs(:) * hij
-        else
-          mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
-        end if
-      end do
-    else
-      h1 = h(1,1)
-      h2 = h(1,2)
-      do j = 1,2
-        putj = p(j, 2)
-        p2 = p(turn2(j), 2)
-        do i = 1,2
-          puti = p(i, 1)
-
-          if(banned(puti,putj,bant)) cycle
-          p1 = p(turn2(i), 1)
-
-          hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2,N_int)
-          mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
-        end do
-      end do
-    end if
-
-  else
-    if(tip == 0) then
-      h1 = h(1, ma)
-      h2 = h(2, ma)
-      do i=1,3
-      puti = p(i, ma)
-      do j=i+1,4
-        putj = p(j, ma)
-        if(banned(puti,putj,1)) cycle
-
-        i1 = turn2d(1, i, j)
-        i2 = turn2d(2, i, j)
-        p1 = p(i1, ma)
-        p2 = p(i2, ma)
-        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2,N_int)
-        mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
-      end do
-      end do
-    else if(tip == 3) then
-      h1 = h(1, mi)
-      h2 = h(1, ma)
-      p1 = p(1, mi)
-      do i=1,3
-        puti = p(turn3(1,i), ma)
-        putj = p(turn3(2,i), ma)
-        if(banned(puti,putj,1)) cycle
-        p2 = p(i, ma)
-
-        hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2,N_int)
-        mat(:, min(puti, putj), max(puti, putj)) = mat(:, min(puti, putj), max(puti, putj)) + coefs(:) * hij
-      end do
-    else ! tip == 4
-      puti = p(1, sp)
-      putj = p(2, sp)
-      if(.not. banned(puti,putj,1)) then
-        p1 = p(1, mi)
-        p2 = p(2, mi)
-        h1 = h(1, mi)
-        h2 = h(2, mi)
-        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2,N_int)
-        mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
-      end if
-    end if
-  end if
-end
-
 
diff --git a/src/cipsi/selection_old.irp.f b/src/cipsi/selection_old.irp.f
new file mode 100644
index 00000000..8fd5bc2b
--- /dev/null
+++ b/src/cipsi/selection_old.irp.f
@@ -0,0 +1,350 @@
+
+! OLD unoptimized routines for debugging
+! ======================================
+
+subroutine get_d0_reference(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
+  use bitmasks
+  implicit none
+  ! Reference version: adds coefs(:)*hij to mat(:,i,j) for every candidate orbital pair (i,j) that is not banned.
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind) :: det(N_int, 2)
+  double precision, intent(in) :: coefs(N_states)
+  double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+  ! gen is the determinant given to i_h_j; mask is the determinant to which apply_particles adds the two orbitals.
+  integer :: i, j, s, h1, h2, p1, p2, puti, putj
+  double precision :: hij, phase
+  double precision, external :: get_phase_bi, mo_two_e_integral
+  logical :: ok
+
+  integer :: bant
+  bant = 1
+  ! Only the slice banned(:,:,1) is consulted in this routine (bant is never changed).
+  ! h1/h2 (AB) or p1/p2 (AA, BB) are read from p(1:2,:); pairs touching them use i_h_j instead of the integral formula.
+  if(sp == 3) then ! AB
+    h1 = p(1,1)
+    h2 = p(1,2)
+    do p1=1, mo_num
+      if(bannedOrb(p1, 1)) cycle
+      do p2=1, mo_num
+        if(bannedOrb(p2,2)) cycle
+        if(banned(p1, p2, bant)) cycle ! rentable?
+        if(p1 == h1 .or. p2 == h2) then
+          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
+          call i_h_j(gen, det, N_int, hij)
+        else
+          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+          hij = mo_two_e_integral(p1, p2, h1, h2) * phase
+        end if
+        mat(:, p1, p2) = mat(:, p1, p2) + coefs(:) * hij
+      end do
+    end do
+  else ! AA BB
+    p1 = p(1,sp)
+    p2 = p(2,sp)
+    do puti=1, mo_num
+!      do not cycle here? otherwise singles will be missed??
+      if(bannedOrb(puti, sp)) cycle
+      do putj=puti+1, mo_num
+        if(bannedOrb(putj, sp)) cycle
+        if(banned(puti, putj, bant)) cycle ! rentable?
+        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
+          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
+          call i_h_j(gen, det, N_int, hij)
+        else
+          hij = (mo_two_e_integral(p1, p2, puti, putj) -  mo_two_e_integral(p2, p1, puti, putj))* get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+        end if
+        mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
+      end do
+    end do
+  end if
+end
+
+subroutine get_d1_reference(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
+  use bitmasks
+  implicit none
+  ! Reference version: every contribution coefs(:)*hij goes to a pair of mat with at least one orbital taken from p.
+  integer(bit_kind), intent(in)  :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in)  :: phasemask(N_int,2)
+  logical, intent(in)            :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind)              :: det(N_int, 2)
+  double precision, intent(in)   :: coefs(N_states)
+  double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
+  integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
+  double precision               :: hij, tmp_row(N_states, mo_num), tmp_row2(N_states, mo_num)
+  double precision, external     :: get_phase_bi, mo_two_e_integral
+  logical                        :: ok
+
+  logical, allocatable           :: lbanned(:,:)
+  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j
+  integer                        :: hfix, pfix, h1, h2, p1, p2, ib
+
+  integer, parameter             :: turn2(2) = (/2,1/)
+  integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer                        :: bant
+
+  ! lbanned = bannedOrb plus every orbital listed in p(1:p(0,s),s), for both spins s.
+  allocate (lbanned(mo_num, 2))
+  lbanned = bannedOrb
+
+  do i=1, p(0,1)
+    lbanned(p(i,1), 1) = .true.
+  end do
+  do i=1, p(0,2)
+    lbanned(p(i,2), 2) = .true.
+  end do
+  ! ma is spin 2 when p(0,2) >= 2 and spin 1 otherwise; mi is the other spin.
+  ma = 1
+  if(p(0,2) >= 2) ma = 2
+  mi = turn2(ma)
+
+  bant = 1
+
+  if(sp == 3) then
+    !move MA
+    if(ma == 2) bant = 2
+    puti = p(1,mi)
+    hfix = h(1,ma)
+    p1 = p(1,ma)
+    p2 = p(2,ma)
+    if(.not. bannedOrb(puti, mi)) then
+      tmp_row = 0d0
+      do putj=1, hfix-1
+        if(lbanned(putj, ma) .or. banned(putj, puti,bant)) cycle
+        hij = (mo_two_e_integral(p1, p2, putj, hfix)-mo_two_e_integral(p2,p1,putj,hfix)) * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+        tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states)
+      end do
+      do putj=hfix+1, mo_num
+        if(lbanned(putj, ma) .or. banned(putj, puti,bant)) cycle
+        hij = (mo_two_e_integral(p1, p2, hfix, putj)-mo_two_e_integral(p2,p1,hfix,putj)) * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+        tmp_row(1:N_states,putj) = tmp_row(1:N_states,putj) + hij * coefs(1:N_states)
+      end do
+
+      if(ma == 1) then
+        mat(1:N_states,1:mo_num,puti) = mat(1:N_states,1:mo_num,puti) + tmp_row(1:N_states,1:mo_num)
+      else
+        mat(1:N_states,puti,1:mo_num) = mat(1:N_states,puti,1:mo_num) + tmp_row(1:N_states,1:mo_num)
+      end if
+    end if
+
+    !MOVE MI
+    pfix = p(1,mi)
+    tmp_row = 0d0
+    tmp_row2 = 0d0
+    do puti=1,mo_num
+      if(lbanned(puti,mi)) cycle
+      !p1 fixed
+      putj = p1
+      if(.not. banned(putj,puti,bant)) then
+        hij = mo_two_e_integral(p2,pfix,hfix,puti) * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
+        tmp_row(:,puti) = tmp_row(:,puti) + hij * coefs(:)
+      end if
+
+      putj = p2
+      if(.not. banned(putj,puti,bant)) then
+        hij = mo_two_e_integral(p1,pfix,hfix,puti) * get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
+        tmp_row2(:,puti) = tmp_row2(:,puti) + hij * coefs(:)
+      end if
+    end do
+
+    if(mi == 1) then
+      mat(:,:,p1) = mat(:,:,p1) + tmp_row(:,:)
+      mat(:,:,p2) = mat(:,:,p2) + tmp_row2(:,:)
+    else
+      mat(:,p1,:) = mat(:,p1,:) + tmp_row(:,:)
+      mat(:,p2,:) = mat(:,p2,:) + tmp_row2(:,:)
+    end if
+  else
+    if(p(0,ma) == 3) then
+      do i=1,3
+        hfix = h(1,ma)
+        puti = p(i, ma)
+        p1 = p(turn3(1,i), ma)
+        p2 = p(turn3(2,i), ma)
+        tmp_row = 0d0
+        do putj=1,hfix-1
+          if(lbanned(putj,ma) .or. banned(puti,putj,1)) cycle
+          hij = (mo_two_e_integral(p1, p2, putj, hfix)-mo_two_e_integral(p2,p1,putj,hfix)) * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+          tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:)
+        end do
+        do putj=hfix+1,mo_num
+          if(lbanned(putj,ma) .or. banned(puti,putj,1)) cycle
+          hij = (mo_two_e_integral(p1, p2, hfix, putj)-mo_two_e_integral(p2,p1,hfix,putj)) * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+          tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:)
+        end do
+
+        mat(:, :puti-1, puti) = mat(:, :puti-1, puti) + tmp_row(:,:puti-1)
+        mat(:, puti, puti:) = mat(:, puti, puti:) + tmp_row(:,puti:)
+      end do
+    else
+      hfix = h(1,mi)
+      pfix = p(1,mi)
+      p1 = p(1,ma)
+      p2 = p(2,ma)
+      tmp_row = 0d0
+      tmp_row2 = 0d0
+      do puti=1,mo_num
+        if(lbanned(puti,ma)) cycle
+        putj = p2
+        if(.not. banned(puti,putj,1)) then
+          hij = mo_two_e_integral(pfix, p1, hfix, puti) * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
+          tmp_row(:,puti) = tmp_row(:,puti) + hij * coefs(:)
+        end if
+
+        putj = p1
+        if(.not. banned(puti,putj,1)) then
+          hij = mo_two_e_integral(pfix, p2, hfix, puti) * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
+          tmp_row2(:,puti) = tmp_row2(:,puti) + hij * coefs(:)
+        end if
+      end do
+      mat(:,:p2-1,p2) = mat(:,:p2-1,p2) + tmp_row(:,:p2-1)
+      mat(:,p2,p2:) = mat(:,p2,p2:) + tmp_row(:,p2:)
+      mat(:,:p1-1,p1) = mat(:,:p1-1,p1) + tmp_row2(:,:p1-1)
+      mat(:,p1,p1:) = mat(:,p1,p1:) + tmp_row2(:,p1:)
+    end if
+  end if
+  deallocate(lbanned)
+  ! Pairs made of two orbitals of p: build the determinant with apply_particles and get hij from i_h_j.
+ !! MONO
+    if(sp == 3) then
+      s1 = 1
+      s2 = 2
+    else
+      s1 = sp
+      s2 = sp
+    end if
+
+    do i1=1,p(0,s1)
+      ib = 1
+      if(s1 == s2) ib = i1+1
+      do i2=ib,p(0,s2)
+        p1 = p(i1,s1)
+        p2 = p(i2,s2)
+        if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle
+        call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
+        call i_h_j(gen, det, N_int, hij)
+        mat(:, p1, p2) = mat(:, p1, p2) + coefs(:) * hij
+      end do
+    end do
+end
+
+subroutine get_d2_reference(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
+  use bitmasks
+  implicit none
+  ! Reference version: adds coefs(:)*hij to mat for pairs (puti,putj) taken from p; hij uses the other orbitals of p and the holes h.
+  integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  double precision, intent(in) :: coefs(N_states)
+  double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+  ! phasemask is declared (N_int,2) as in get_d0_reference and get_d1_reference; here it is only forwarded to get_phase_bi.
+  double precision, external :: get_phase_bi, mo_two_e_integral
+
+  integer :: i, j, tip, ma, mi, puti, putj
+  integer :: h1, h2, p1, p2, i1, i2
+  double precision :: hij, phase
+
+  integer, parameter:: turn2d(2,3,4) = reshape((/0,0, 0,0, 0,0,  3,4, 0,0, 0,0,  2,4, 1,4, 0,0,  2,3, 1,3, 1,2 /), (/2,3,4/))
+  integer, parameter :: turn2(2) = (/2, 1/)
+  integer, parameter :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer :: bant
+  bant = 1
+  ! tip is the product of the particle counts of the two spins.
+  tip = p(0,1) * p(0,2)
+  ! ma is the spin with more particles, or sp when both counts are equal; mi = mod(ma,2)+1.
+  ma = sp
+  if(p(0,1) > p(0,2)) ma = 1
+  if(p(0,1) < p(0,2)) ma = 2
+  mi = mod(ma, 2) + 1
+
+  if(sp == 3) then
+    if(ma == 2) bant = 2
+
+    if(tip == 3) then
+      puti = p(1, mi)
+      do i = 1, 3
+        putj = p(i, ma)
+        if(banned(putj,puti,bant)) cycle
+        i1 = turn3(1,i)
+        i2 = turn3(2,i)
+        p1 = p(i1, ma)
+        p2 = p(i2, ma)
+        h1 = h(1, ma)
+        h2 = h(2, ma)
+
+        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        if(ma == 1) then
+          mat(:, putj, puti) = mat(:, putj, puti) + coefs(:) * hij
+        else
+          mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
+        end if
+      end do
+    else
+      h1 = h(1,1)
+      h2 = h(1,2)
+      do j = 1,2
+        putj = p(j, 2)
+        p2 = p(turn2(j), 2)
+        do i = 1,2
+          puti = p(i, 1)
+
+          if(banned(puti,putj,bant)) cycle
+          p1 = p(turn2(i), 1)
+
+          hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2,N_int)
+          mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
+        end do
+      end do
+    end if
+
+  else
+    if(tip == 0) then
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+      do i=1,3
+      puti = p(i, ma)
+      do j=i+1,4
+        putj = p(j, ma)
+        if(banned(puti,putj,1)) cycle
+
+        i1 = turn2d(1, i, j)
+        i2 = turn2d(2, i, j)
+        p1 = p(i1, ma)
+        p2 = p(i2, ma)
+        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2,N_int)
+        mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
+      end do
+      end do
+    else if(tip == 3) then
+      h1 = h(1, mi)
+      h2 = h(1, ma)
+      p1 = p(1, mi)
+      do i=1,3
+        puti = p(turn3(1,i), ma)
+        putj = p(turn3(2,i), ma)
+        if(banned(puti,putj,1)) cycle
+        p2 = p(i, ma)
+
+        hij = mo_two_e_integral(p1, p2, h1, h2) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2,N_int)
+        mat(:, min(puti, putj), max(puti, putj)) = mat(:, min(puti, putj), max(puti, putj)) + coefs(:) * hij
+      end do
+    else ! tip == 4
+      puti = p(1, sp)
+      putj = p(2, sp)
+      if(.not. banned(puti,putj,1)) then
+        p1 = p(1, mi)
+        p2 = p(2, mi)
+        h1 = h(1, mi)
+        h2 = h(2, mi)
+        hij = (mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2,N_int)
+        mat(:, puti, putj) = mat(:, puti, putj) + coefs(:) * hij
+      end if
+    end if
+  end if
+end
+
diff --git a/src/cipsi/selection_singles.irp.f b/src/cipsi/selection_singles.irp.f
new file mode 100644
index 00000000..3821576c
--- /dev/null
+++ b/src/cipsi/selection_singles.irp.f
@@ -0,0 +1,356 @@
+use bitmasks
+
+subroutine select_singles(i_gen,hole_mask,particle_mask,fock_diag_tmp,E0,pt2_data,buf)
+  use bitmasks
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Select determinants obtained from generator i_gen by removing one hole and adding one particle of the same spin
+  END_DOC
+  integer, intent(in)             :: i_gen
+  integer(bit_kind), intent(in)   :: hole_mask(N_int,2), particle_mask(N_int,2)
+  double precision, intent(in)    :: fock_diag_tmp(mo_num)
+  double precision, intent(in)    :: E0(N_states)
+  type(pt2_type),   intent(inout) :: pt2_data
+  type(selection_buffer), intent(inout) :: buf
+
+  logical, allocatable            :: banned(:,:), bannedOrb(:)
+  double precision, allocatable   :: mat(:,:,:)
+  integer                         :: i, j, k
+  integer                         :: h1,h2,s1,s2,i1,i2,ib,sp
+  integer(bit_kind)               :: hole(N_int,2), particle(N_int,2), mask(N_int, 2)
+  logical                         :: fullMatch, ok
+
+  ! hole / particle: occupied (resp. empty) orbitals of generator i_gen allowed by hole_mask (resp. particle_mask).
+  do k=1,N_int
+    hole    (k,1) = iand(psi_det_generators(k,1,i_gen), hole_mask(k,1))
+    hole    (k,2) = iand(psi_det_generators(k,2,i_gen), hole_mask(k,2))
+    particle(k,1) = iand(not(psi_det_generators(k,1,i_gen)), particle_mask(k,1))
+    particle(k,2) = iand(not(psi_det_generators(k,2,i_gen)), particle_mask(k,2))
+  enddo
+
+  allocate(banned(mo_num,mo_num), bannedOrb(mo_num), mat(N_states, mo_num, 1))
+  banned = .False.
+
+  ! Create lists of holes and particles
+  ! -----------------------------------
+
+  integer                        :: N_holes(2), N_particles(2)
+  integer                        :: hole_list(N_int*bit_kind_size,2)
+  integer                        :: particle_list(N_int*bit_kind_size,2)
+
+  call bitstring_to_list_ab(hole    , hole_list    , N_holes    , N_int)
+  call bitstring_to_list_ab(particle, particle_list, N_particles, N_int)
+  ! Per spin and per hole: un-ban the particles of that spin, call spot_hasBeen, accumulate in mat, then fill the buffer.
+  do sp=1,2
+    do i=1, N_holes(sp)
+      h1 = hole_list(i,sp)
+      call apply_hole(psi_det_generators(1,1,i_gen), sp, h1, mask, ok, N_int)
+      bannedOrb = .true.
+      do j=1,N_particles(sp)
+        bannedOrb(particle_list(j, sp)) = .false.
+      end do
+      call spot_hasBeen(mask, sp, psi_det_sorted, i_gen, N_det, bannedOrb, fullMatch)
+      if(fullMatch) cycle
+      mat = 0d0
+      call splash_p(mask, sp, psi_selectors(1,1,i_gen), psi_selectors_coef_transp(1,i_gen), N_det_selectors - i_gen + 1, bannedOrb, mat(1,1,1))
+      call fill_buffer_single(i_gen, sp, h1, 0, bannedOrb, banned, fock_diag_tmp, E0, pt2_data, mat, buf)
+    end do
+  enddo
+end subroutine
+
+
+subroutine spot_hasBeen(mask, sp, det, i_gen, N, banned, fullMatch)
+  use bitmasks
+  implicit none
+  ! Scans det(:,:,1:N): sets fullMatch or bans orbitals of spin sp, depending on how many electrons each determinant has outside mask.
+  integer(bit_kind),intent(in) :: mask(N_int, 2), det(N_int, 2, N)
+  integer, intent(in) :: i_gen, N, sp
+  logical, intent(inout) :: banned(mo_num)
+  logical, intent(out) :: fullMatch
+
+
+  integer :: i, j, na, nb, list(3), nt
+  integer(bit_kind) :: myMask(N_int, 2), negMask(N_int, 2)
+
+  fullMatch = .false.
+
+  do i=1,N_int
+    negMask(i,1) = not(mask(i,1))
+    negMask(i,2) = not(mask(i,2))
+  end do
+
+  genl : do i=1, N
+    nt = 0
+    ! myMask = electrons of det(:,:,i) that are not in mask; nt = their number over both spins.
+    do j=1, N_int
+      myMask(j, 1) = iand(det(j, 1, i), negMask(j, 1))
+      myMask(j, 2) = iand(det(j, 2, i), negMask(j, 2))
+      nt += popcnt(myMask(j, 1)) + popcnt(myMask(j, 2))
+    end do
+
+    if(nt > 3) cycle
+    ! A determinant ranked before i_gen with at most 2 electrons outside mask ends the scan with fullMatch = .true.
+    if(nt <= 2 .and. i < i_gen) then
+      fullMatch = .true.
+      return
+    end if
+    ! list(1:na) = orbitals of spin sp among those electrons.
+    call bitstring_to_list(myMask(1,sp), list(1), na, N_int)
+
+    if(nt == 3 .and. i < i_gen) then
+      do j=1,na
+        banned(list(j)) = .true.
+      end do
+    else if(nt == 1 .and. na == 1) then
+      banned(list(1)) = .true.
+    end if
+  end do genl
+end subroutine
+
+
+subroutine splash_p(mask, sp, det, coefs, N_sel, bannedOrb, vect)
+  use bitmasks
+  implicit none
+  ! For each of the N_sel determinants with at most 3 electrons outside mask, dispatch to get_m2 / get_m1 / get_m0, which accumulate into vect.
+  integer(bit_kind),intent(in) :: mask(N_int, 2), det(N_int,2,N_sel)
+  double precision, intent(in) :: coefs(N_states, N_sel)
+  integer, intent(in) :: sp, N_sel
+  logical, intent(inout) :: bannedOrb(mo_num)
+  double precision, intent(inout)     :: vect(N_states, mo_num)
+
+  integer :: i, j, h(0:2,2), p(0:3,2), nt
+  integer(bit_kind) :: perMask(N_int, 2), mobMask(N_int, 2), negMask(N_int, 2)
+  integer(bit_kind) :: phasemask(N_int, 2)
+
+  do i=1,N_int
+    negMask(i,1) = not(mask(i,1))
+    negMask(i,2) = not(mask(i,2))
+  end do
+
+  do i=1, N_sel
+    nt = 0
+    do j=1,N_int
+      mobMask(j,1) = iand(negMask(j,1), det(j,1,i))
+      mobMask(j,2) = iand(negMask(j,2), det(j,2,i))
+      nt += popcnt(mobMask(j, 1)) + popcnt(mobMask(j, 2))
+    end do
+    ! mobMask: electrons of det(:,:,i) absent from mask (listed in p); perMask: electrons of mask absent from det(:,:,i) (listed in h).
+    if(nt > 3) cycle
+
+    do j=1,N_int
+      perMask(j,1) = iand(mask(j,1), not(det(j,1,i)))
+      perMask(j,2) = iand(mask(j,2), not(det(j,2,i)))
+    end do
+
+    call bitstring_to_list(perMask(1,1), h(1,1), h(0,1), N_int)
+    call bitstring_to_list(perMask(1,2), h(1,2), h(0,2), N_int)
+
+    call bitstring_to_list(mobMask(1,1), p(1,1), p(0,1), N_int)
+    call bitstring_to_list(mobMask(1,2), p(1,2), p(0,2), N_int)
+    ! The phase mask must be that of det(:,:,i), the determinant handed to get_m0/get_m1/get_m2 as gen (det need not start at psi_det_sorted(:,:,1)).
+    call get_mask_phase(det(1,1,i), phasemask, N_int)
+
+    if(nt == 3) then
+      call get_m2(det(1,1,i), phasemask, bannedOrb, vect, mask, h, p, sp, coefs(1, i))
+    else if(nt == 2) then
+      call get_m1(det(1,1,i), phasemask, bannedOrb, vect, mask, h, p, sp, coefs(1, i))
+    else
+      call get_m0(det(1,1,i), phasemask, bannedOrb, vect, mask, h, p, sp, coefs(1, i))
+    end if
+  end do
+end subroutine
+
+
+subroutine get_m2(gen, phasemask, bannedOrb, vect, mask, h, p, sp, coefs)
+  use bitmasks
+  implicit none
+  ! Adds hij*coefs to vect(:,puti) for each non-banned orbital puti of spin sp listed in p; hij uses the other orbitals of p and the holes h.
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int, 2)
+  logical, intent(in) :: bannedOrb(mo_num)
+  double precision, intent(in) :: coefs(N_states)
+  double precision, intent(inout) :: vect(N_states, mo_num)
+  integer, intent(in) :: sp, h(0:2, 2), p(0:3, 2)
+  integer :: i, j, h1, h2, p1, p2, sfix, hfix, pfix, hmob, pmob, puti
+  double precision :: hij
+  double precision, external :: get_phase_bi, mo_two_e_integral
+  ! get_phase_bi is external: N_int is passed explicitly as its last argument, as the other callers in this module do.
+  integer, parameter :: turn3_2(2,3) = reshape((/2,3, 1,3, 1,2/), (/2,3/))
+  integer, parameter :: turn2(2) = (/2,1/)
+
+  if(h(0,sp) == 2) then
+    h1 = h(1, sp)
+    h2 = h(2, sp)
+    do i=1,3
+      puti = p(i, sp)
+      if(bannedOrb(puti)) cycle
+      p1 = p(turn3_2(1,i), sp)
+      p2 = p(turn3_2(2,i), sp)
+      hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2, p1, h1, h2)
+      hij *= get_phase_bi(phasemask, sp, sp, h1, p1, h2, p2, N_int)
+      vect(:, puti) += hij * coefs
+    end do
+  else if(h(0,sp) == 1) then
+    sfix = turn2(sp)
+    hfix = h(1,sfix)
+    pfix = p(1,sfix)
+    hmob = h(1,sp)
+    do j=1,2
+      puti = p(j, sp)
+      if(bannedOrb(puti)) cycle
+      pmob = p(turn2(j), sp)
+      hij = mo_two_e_integral(pfix, pmob, hfix, hmob)
+      hij *= get_phase_bi(phasemask, sp, sfix, hmob, pmob, hfix, pfix, N_int)
+      vect(:, puti) += hij * coefs
+    end do
+  else
+    puti = p(1,sp)
+    if(.not. bannedOrb(puti)) then
+      sfix = turn2(sp)
+      p1 = p(1,sfix)
+      p2 = p(2,sfix)
+      h1 = h(1,sfix)
+      h2 = h(2,sfix)
+      hij = (mo_two_e_integral(p1,p2,h1,h2) - mo_two_e_integral(p2,p1,h1,h2))
+      hij *= get_phase_bi(phasemask, sfix, sfix, h1, p1, h2, p2, N_int)
+      vect(:, puti) += hij * coefs
+    end if
+  end if
+end subroutine
+
+subroutine get_m1(gen, phasemask, bannedOrb, vect, mask, h, p, sp, coefs)
+  use bitmasks
+  implicit none
+  ! Adds hij*coefs to vect(:,i) for the non-banned orbitals i, then handles p(1,sp) (and p(2,sp) when sp == sh) through i_h_j.
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int, 2)
+  logical, intent(in) :: bannedOrb(mo_num)
+  double precision, intent(in) :: coefs(N_states)
+  double precision, intent(inout) :: vect(N_states, mo_num)
+  integer, intent(in) :: sp, h(0:2, 2), p(0:3, 2)
+  integer :: i, hole, p1, p2, sh
+  logical :: ok, lbanned(mo_num)
+  integer(bit_kind) :: det(N_int, 2)
+  double precision :: hij
+  double precision, external :: get_phase_bi,mo_two_e_integral
+  ! sh is the spin of the hole (2 when h(0,2) == 1, otherwise 1); the particles of spin sp are excluded from the integral loops.
+  lbanned = bannedOrb
+  sh = 1
+  if(h(0,2) == 1) sh = 2
+  hole = h(1, sh)
+  lbanned(p(1,sp)) = .true.
+  if(p(0,sp) == 2) lbanned(p(2,sp)) = .true.
+  !print *, "SPm1", sp, sh
+  ! get_phase_bi is external: N_int is passed explicitly as its last argument.
+  p1 = p(1, sp)
+
+  if(sp == sh) then
+    p2 = p(2, sp)
+    lbanned(p2) = .true.
+
+    do i=1,hole-1
+      if(lbanned(i)) cycle
+      hij = (mo_two_e_integral(p1, p2, i, hole) - mo_two_e_integral(p2, p1, i, hole))
+      hij *= get_phase_bi(phasemask, sp, sp, i, p1, hole, p2, N_int)
+      vect(:,i) += hij * coefs
+    end do
+    do i=hole+1,mo_num
+      if(lbanned(i)) cycle
+      hij = (mo_two_e_integral(p1, p2, hole, i) - mo_two_e_integral(p2, p1, hole, i))
+      hij *= get_phase_bi(phasemask, sp, sp, hole, p1, i, p2, N_int)
+      vect(:,i) += hij * coefs
+    end do
+
+    call apply_particle(mask, sp, p2, det, ok,  N_int)
+    call i_h_j(gen, det, N_int, hij)
+    vect(:, p2) += hij * coefs
+  else
+    p2 = p(1, sh)
+    do i=1,mo_num
+      if(lbanned(i)) cycle
+      hij = mo_two_e_integral(p1, p2, i, hole)
+      hij *= get_phase_bi(phasemask, sp, sh, i, p1, hole, p2, N_int)
+      vect(:,i) += hij * coefs
+    end do
+  end if
+
+  call apply_particle(mask, sp, p1, det, ok,  N_int)
+  call i_h_j(gen, det, N_int, hij)
+  vect(:, p1) += hij * coefs
+end subroutine
+
+subroutine get_m0(gen, phasemask, bannedOrb, vect, mask, h, p, sp, coefs)
+  use bitmasks
+  implicit none
+  ! Adds hij*coefs to vect(:,i) for every orbital i that is neither banned nor equal to p(1,sp); hij comes from i_h_j on mask + particle i.
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int, 2)
+  logical, intent(in) :: bannedOrb(mo_num)
+  double precision, intent(in) :: coefs(N_states)
+  double precision, intent(inout) :: vect(N_states, mo_num)
+  integer, intent(in) :: sp, h(0:2, 2), p(0:3, 2)
+  integer :: i
+  logical :: ok, lbanned(mo_num)
+  integer(bit_kind) :: det(N_int, 2)
+  double precision :: hij
+  ! lbanned is a local copy, so bannedOrb itself is left untouched.
+  lbanned = bannedOrb
+  lbanned(p(1,sp)) = .true.
+  do i=1,mo_num
+    if(lbanned(i)) cycle
+    call apply_particle(mask, sp, i, det, ok, N_int)
+    call i_h_j(gen, det, N_int, hij)
+    vect(:, i) += hij * coefs
+  end do
+end subroutine
+
+
+
+!
+!subroutine fill_buffer_single(i_generator, sp, h1, bannedOrb, fock_diag_tmp, E0, pt2, vect, buf)
+!  use bitmasks
+!  use selection_types
+!  implicit none
+!
+!  integer, intent(in) :: i_generator, sp, h1
+!  double precision, intent(in) :: vect(N_states, mo_num)
+!  logical, intent(in) :: bannedOrb(mo_num)
+!  double precision, intent(in)           :: fock_diag_tmp(mo_num)
+!  double precision, intent(in)    :: E0(N_states)
+!  double precision, intent(inout) :: pt2(N_states)
+!  type(selection_buffer), intent(inout) :: buf
+!  logical :: ok
+!  integer :: s1, s2, p1, p2, ib, istate
+!  integer(bit_kind) :: mask(N_int, 2), det(N_int, 2)
+!  double precision :: e_pert, delta_E, val, Hii, max_e_pert, tmp
+!  double precision, external :: diag_H_mat_elem_fock
+!
+!
+!  call apply_hole(psi_det_generators(1,1,i_generator), sp, h1, mask, ok, N_int)
+!
+!  do p1=1,mo_num
+!    if(bannedOrb(p1)) cycle
+!    if(vect(1, p1) == 0d0) cycle
+!    call apply_particle(mask, sp, p1, det, ok, N_int)
+!
+!
+!    Hii = diag_H_mat_elem_fock(psi_det_generators(1,1,i_generator),det,fock_diag_tmp,N_int)
+!    max_e_pert = 0d0
+!
+!    do istate=1,N_states
+!      val = vect(istate, p1) + vect(istate, p1)
+!      delta_E = E0(istate) - Hii
+!      tmp = dsqrt(delta_E * delta_E + val * val)
+!      if (delta_E < 0.d0) then
+!        tmp = -tmp
+!      endif
+!      e_pert = 0.5d0 * ( tmp - delta_E)
+!      pt2(istate) += e_pert
+!      if(dabs(e_pert) > dabs(max_e_pert)) max_e_pert = e_pert
+!    end do
+!
+!    if(dabs(max_e_pert) > buf%mini) call add_to_selection_buffer(buf, det, max_e_pert)
+!  end do
+!end subroutine
+!
diff --git a/src/ezfio_files/00.create.bats b/src/ezfio_files/00.create.bats
index cfa6247d..49430a0b 100644
--- a/src/ezfio_files/00.create.bats
+++ b/src/ezfio_files/00.create.bats
@@ -23,6 +23,34 @@ function run {
   qp set mo_two_e_ints io_mo_two_e_integrals "Write" 
 }
 
+@test "H2_1" {
+  run h2_1.xyz 1 0 cc-pvdz
+}
+
+@test "H2_3" {
+  run h2_3.xyz 3 0 cc-pvdz
+}
+
+@test "H3_2" {
+  run h3_2.xyz 2 0 cc-pvdz
+}
+
+@test "H3_4" {
+  run h3_4.xyz 4 0 cc-pvdz
+}
+
+@test "H4_1" {
+  run h4_1.xyz 1 0 cc-pvdz
+}
+
+@test "H4_3" {
+  run h4_3.xyz 3 0 cc-pvdz
+}
+
+@test "H4_5" {
+  run h4_5.xyz 5 0 cc-pvdz
+}
+
 
 @test "B-B" {
   qp set_file b2_stretched.ezfio
diff --git a/src/fci/40.fci.bats b/src/fci/40.fci.bats
index 4523d0e0..3c4a93c7 100644
--- a/src/fci/40.fci.bats
+++ b/src/fci/40.fci.bats
@@ -10,8 +10,8 @@ function run() {
   qp set perturbation do_pt2 False
   qp set determinants n_det_max 8000
   qp set determinants n_states  1
-  qp set davidson threshold_davidson 1.e-10
-  qp set davidson n_states_diag 8
+  qp set davidson_keywords threshold_davidson 1.e-10
+  qp set davidson_keywords n_states_diag 8
   qp run fci
   energy1="$(ezfio get fci energy | tr '[]' ' ' | cut -d ',' -f 1)"
   eq $energy1 $1 $thresh
@@ -24,99 +24,134 @@ function run_stoch() {
   qp set perturbation do_pt2 True
   qp set determinants n_det_max $3
   qp set determinants n_states  1
-  qp set davidson threshold_davidson 1.e-10
-  qp set davidson n_states_diag 1
+  qp set davidson_keywords threshold_davidson 1.e-10
+  qp set davidson_keywords n_states_diag 1
   qp run fci
   energy1="$(ezfio get fci energy_pt2 | tr '[]' ' ' | cut -d ',' -f 1)"
   eq $energy1 $1 $thresh
 }
 
-@test "B-B" {
+@test "H2_1" { # 1s
+  qp set_file h2_1.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -1.06415255 1.e-8 10000
+}
+
+@test "H2_3" { # 1s
+  qp set_file h2_3.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -0.96029881 1.e-8 10000
+}
+
+@test "H3_2" { # 3s
+  qp set_file h3_2.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -1.61003132 1.e-8 10000
+}
+
+@test "H3_4" { # 2s
+  qp set_file h3_4.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -1.02434843 1.e-8 10000
+}
+
+@test "H4_1" { # 13s
+  qp set_file h4_1.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -2.01675062 1.e-8 10000
+}
+
+@test "H4_3" { # 10s
+  qp set_file h4_3.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -1.95927626 1.e-8 10000
+}
+
+@test "H4_5" { # 3s
+  qp set_file h4_5.ezfio
+  qp set perturbation pt2_max 0.
+  run_stoch -1.25852765 1.e-8 10000
+}
+
+@test "B-B" { # 10s
   qp set_file b2_stretched.ezfio
   qp set determinants n_det_max 10000
   qp set_frozen_core
   run_stoch -49.14103054419 3.e-4 10000
 }
 
-@test "F2" { # 4.07m
-  [[ -n $TRAVIS ]] && skip
-  qp set_file f2.ezfio
-  qp set_frozen_core
-  run_stoch -199.304922384814 3.e-3  100000
-}
-
-@test "NH3" { # 10.6657s
+@test "NH3" { # 8s
   qp set_file nh3.ezfio
   qp set_mo_class --core="[1-4]" --act="[5-72]"
   run -56.244753429144986  3.e-4  100000
 }
 
-@test "DHNO" { # 11.4721s
+@test "DHNO" { # 8s
   qp set_file dhno.ezfio
   qp set_mo_class --core="[1-7]" --act="[8-64]"
-  run -130.459020029816 3.e-4  100000
+  run -130.466208113547 3.e-4  100000
 }
 
-@test "HCO" { # 12.2868s
+@test "HCO" { # 32s
   qp set_file hco.ezfio
-  run -113.393356604085 1.e-3  100000
+  run -113.395751656985 1.e-3  100000
 }
 
-@test "H2O2" { # 12.9214s
+@test "H2O2" { # 21s
   qp set_file h2o2.ezfio
   qp set_mo_class --core="[1-2]" --act="[3-24]" --del="[25-38]"
   run -151.005848404095 1.e-3  100000
 }
 
-@test "HBO" { # 13.3144s
+@test "HBO" { # 18s
   [[ -n $TRAVIS ]] && skip
   qp set_file hbo.ezfio
-  run -100.213 1.5e-3  100000
+  run -100.214 1.5e-3  100000
 }
 
-@test "H2O" { # 11.3727s
+@test "H2O" { # 16s
   [[ -n $TRAVIS ]] && skip
   qp set_file h2o.ezfio
-  run -76.2361605151999 5.e-4  100000
+  run -76.238051555276  5.e-4  100000
 }
 
-@test "ClO" { # 13.3755s
+@test "ClO" { # 47s
   [[ -n $TRAVIS ]] && skip
   qp set_file clo.ezfio
-  run -534.546453546852 1.e-3  100000
+  run -534.548529710256 1.e-3  100000
 }
 
-@test "SO" { # 13.4952s
+@test "SO" { # 23s
   [[ -n $TRAVIS ]] && skip
   qp set_file so.ezfio
   run -26.015 3.e-3  100000
 }
 
-@test "H2S" { # 13.6745s
+@test "H2S" { # 37s
   [[ -n $TRAVIS ]] && skip
   qp set_file h2s.ezfio
-  run -398.859577605891 5.e-4  100000
+  run -398.864853669111 5.e-4  100000
 }
 
-@test "OH" { # 13.865s
+@test "OH" { # 12s
   [[ -n $TRAVIS ]] && skip
   qp set_file oh.ezfio
-  run -75.6121856748294 3.e-4   100000
+  run -75.615 1.5e-3   100000
 }
 
-@test "SiH2_3B1" { # 13.938ss
+@test "SiH2_3B1" { # 10s
   [[ -n $TRAVIS ]] && skip
   qp set_file sih2_3b1.ezfio
-  run -290.0175411299477 3.e-4  100000
+  run -290.0206626734517 3.e-4  100000
 }
 
-@test "H3COH" { # 14.7299s
+@test "H3COH" { # 33s
   [[ -n $TRAVIS ]] && skip
   qp set_file h3coh.ezfio
-  run -115.205632960026 1.e-3  100000
+  run -115.206784386204 1.e-3  100000
 }
 
-@test "SiH3" { # 15.99s
+@test "SiH3" { # 15s
   [[ -n $TRAVIS ]] && skip
   qp set_file sih3.ezfio
   run -5.572 1.e-3   100000
@@ -132,7 +167,7 @@ function run_stoch() {
 @test "ClF" { # 16.8864s
   [[ -n $TRAVIS ]] && skip
   qp set_file clf.ezfio
-  run -559.169748890031 1.5e-3  100000
+  run -559.174371468224 1.5e-3  100000
 }
 
 @test "SO2" { # 17.5645s
@@ -170,7 +205,6 @@ function run_stoch() {
   run -187.970184372047 1.6e-3  100000
 }
 
-
 @test "[Cu(NH3)4]2+" { # 25.0417s
   [[ -n $TRAVIS ]] && skip
   qp set_file cu_nh3_4_2plus.ezfio
@@ -185,3 +219,10 @@ function run_stoch() {
   run -93.078 2.e-3  100000
 }
 
+@test "F2" { # 4.07m
+  [[ -n $TRAVIS ]] && skip
+  qp set_file f2.ezfio
+  qp set_frozen_core
+  run_stoch -199.304922384814 3.e-3  100000
+}
+
diff --git a/src/hartree_fock/10.hf.bats b/src/hartree_fock/10.hf.bats
index df566032..3647b775 100644
--- a/src/hartree_fock/10.hf.bats
+++ b/src/hartree_fock/10.hf.bats
@@ -48,6 +48,35 @@ good=-92.76613324421798
 rm -rf $EZFIO
 }
 
+
+@test "H2_1" { # 1s
+  run h2_1.ezfio -1.005924963288527
+}
+
+@test "H2_3" { # 1s
+  run h2_3.ezfio -0.9591011604845440
+}
+
+@test "H3_2" { # 1s
+  run h3_2.ezfio -1.558273529860488
+}
+
+@test "H3_4" { # 1s
+  run h3_4.ezfio -1.0158684760025190
+}
+
+@test "H4_1" { # 1s
+  run h4_1.ezfio -1.932022805374405
+}
+
+@test "H4_3" { # 1s
+  run h4_3.ezfio -1.8948449927787350
+}
+
+@test "H4_5" { # 1s
+  run h4_5.ezfio -1.2408338805496990
+}
+
 @test "point charges" { 
  run_pt_charges
 }
diff --git a/src/tools/print_wf.irp.f b/src/tools/print_wf.irp.f
index 64eb1a1f..9621ee89 100644
--- a/src/tools/print_wf.irp.f
+++ b/src/tools/print_wf.irp.f
@@ -47,6 +47,7 @@ subroutine routine
  do i = 1, min(N_det_print_wf,N_det)
   print*,''
   print*,'i = ',i
+  print *,psi_det_sorted(1,1,i)
   call debug_det(psi_det_sorted(1,1,i),N_int)
   call get_excitation_degree(psi_det_sorted(1,1,i),psi_det_sorted(1,1,1),degree,N_int)
   print*,'degree = ',degree
diff --git a/tests/input/h2_1.xyz b/tests/input/h2_1.xyz
new file mode 100644
index 00000000..8ecd7dab
--- /dev/null
+++ b/tests/input/h2_1.xyz
@@ -0,0 +1,6 @@
+2
+H2
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+
+
diff --git a/tests/input/h2_3.xyz b/tests/input/h2_3.xyz
new file mode 100644
index 00000000..8ecd7dab
--- /dev/null
+++ b/tests/input/h2_3.xyz
@@ -0,0 +1,6 @@
+2
+H2
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+
+
diff --git a/tests/input/h3_2.xyz b/tests/input/h3_2.xyz
new file mode 100644
index 00000000..7c251c35
--- /dev/null
+++ b/tests/input/h3_2.xyz
@@ -0,0 +1,7 @@
+3
+h3
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+H   0.0   0.0    0.0
+
+
diff --git a/tests/input/h3_4.xyz b/tests/input/h3_4.xyz
new file mode 100644
index 00000000..7c251c35
--- /dev/null
+++ b/tests/input/h3_4.xyz
@@ -0,0 +1,7 @@
+3
+h3
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+H   0.0   0.0    0.0
+
+
diff --git a/tests/input/h4_1.xyz b/tests/input/h4_1.xyz
new file mode 100644
index 00000000..fe163388
--- /dev/null
+++ b/tests/input/h4_1.xyz
@@ -0,0 +1,7 @@
+4
+h4
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+H   0.0   0.74  0.0
+H   0.0   0.0    0.0
+
diff --git a/tests/input/h4_3.xyz b/tests/input/h4_3.xyz
new file mode 100644
index 00000000..fe163388
--- /dev/null
+++ b/tests/input/h4_3.xyz
@@ -0,0 +1,7 @@
+4
+h4
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+H   0.0   0.74  0.0
+H   0.0   0.0    0.0
+
diff --git a/tests/input/h4_5.xyz b/tests/input/h4_5.xyz
new file mode 100644
index 00000000..fe163388
--- /dev/null
+++ b/tests/input/h4_5.xyz
@@ -0,0 +1,7 @@
+4
+h4
+H   0.0   0.0   -0.74
+H   0.0   0.0    0.74
+H   0.0   0.74  0.0
+H   0.0   0.0    0.0
+

From 93adc8d6c1316f95151fe7ab32829db657cbd34c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 10 Jun 2023 11:57:28 +0200
Subject: [PATCH 69/79] no aaa //

---
 .../normal_ordered_contractions.irp.f         | 539 +++++++++++++++++-
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f        |  40 ++
 2 files changed, 560 insertions(+), 19 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
index f066c958..d11c6727 100644
--- a/src/tc_bi_ortho/normal_ordered_contractions.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -500,7 +500,7 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+BEGIN_PROVIDER [ double precision, no_aaa_contraction_v0, (mo_num,mo_num,mo_num,mo_num)]
 
   BEGIN_DOC
   !
@@ -508,12 +508,12 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
   !    h1 < h2
   !    p1 > p2
   !
-  !   no_aaa_contraction(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
+  !   no_aaa_contraction_v0(p2,h2,p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
   !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
   !
   ! else:
   !
-  !   no_aaa_contraction(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !   no_aaa_contraction_v0(p2,h2,p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
   !
   ! 
   ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
@@ -536,7 +536,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
   double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
 
-  print*,' Providing no_aaa_contraction ...'
+  print*,' Providing no_aaa_contraction_v0 ...'
   call wall_time(wall0)
 
   PROVIDE N_int
@@ -556,7 +556,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
 
   if(Ne(2) .lt. 3) then
 
-    no_aaa_contraction = 0.d0
+    no_aaa_contraction_v0 = 0.d0
 
   else
 
@@ -630,7 +630,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
@@ -662,7 +662,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
             enddo
           enddo
         enddo
@@ -736,7 +736,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -776,7 +776,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -849,7 +849,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
               enddo
             enddo
           enddo
@@ -881,7 +881,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
               enddo
             enddo
           enddo
@@ -955,7 +955,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
@@ -995,7 +995,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                no_aaa_contraction_v0(p2,h2,p1,h1) = no_aaa_contraction_v0(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
@@ -1010,19 +1010,19 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
     deallocate(tmpval_1, tmpval_2)
     deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
 
-    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+    no_aaa_contraction_v0 = -0.5d0 * no_aaa_contraction_v0
 
     !$OMP PARALLEL                 &
     !$OMP DEFAULT (NONE)           &
     !$OMP PRIVATE (h1, h2, p1, p2) & 
-    !$OMP SHARED (no_aaa_contraction, mo_num)
+    !$OMP SHARED (no_aaa_contraction_v0, mo_num)
 
     !$OMP DO 
     do h1 = 1, mo_num
       do h2 = 1, mo_num
         do p1 = 1, mo_num
           do p2 = p1, mo_num
-            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+            no_aaa_contraction_v0(p2,h2,p1,h1) -= no_aaa_contraction_v0(p1,h2,p2,h1)
           enddo
         enddo
       enddo
@@ -1034,7 +1034,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
       do h2 = 1, mo_num
         do p1 = 2, mo_num
           do p2 = 1, p1-1
-            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+            no_aaa_contraction_v0(p2,h2,p1,h1) = -no_aaa_contraction_v0(p1,h2,p2,h1)
           enddo
         enddo
       enddo
@@ -1046,17 +1046,18 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
       do h2 = h1+1, mo_num
         do p1 = 2, mo_num
           do p2 = 1, p1-1
-            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+            no_aaa_contraction_v0(p2,h2,p1,h1) *= -1.d0
           enddo
         enddo
       enddo
     enddo
+    !$OMP END DO
     !$OMP END PARALLEL
 
   endif
 
   call wall_time(wall1)
-  print*,' Wall time for no_aaa_contraction', wall1-wall0
+  print*,' Wall time for no_aaa_contraction_v0', wall1-wall0
 
 END_PROVIDER
 
@@ -1519,3 +1520,503 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
 END_PROVIDER
 
 ! ---
+
+BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  !
+  ! OpenMP-parallel evaluation of the quantity provided by no_aaa_contraction_v0
+  ! (test_no_aaa compares the two element by element).
+  !
+  ! Every occupied orbital i adds grid contractions to no_aaa_contraction(p2,h2,p1,h1):
+  !   - weight 1.0 for the Ne(2) orbitals listed in occ(:,2)
+  !   - weight 0.5 for the open-shell orbitals occ(Ne(2)+1:Ne(1),1)
+  ! The sum is scaled by -0.5, antisymmetrized in (p1,p2), and its sign is
+  ! flipped for h1 < h2 and p2 < p1.
+  !
+  ! The array is set to zero when Ne(2) < 3.
+  !
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+
+  print*,' Providing no_aaa_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  if(Ne(2) .lt. 3) then
+
+    no_aaa_contraction = 0.d0
+
+  else
+
+    ! every update below is an accumulation, so the array must start from zero
+    no_aaa_contraction = 0.d0
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+    !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+    !$OMP          tmpval_1, tmpval_2,                              &
+    !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+    !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         no_aaa_contraction)
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmp3(n_points_final_grid,3,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmpvec_3(n_points_final_grid,3))
+
+    tmp_2d   = 0.d0
+    tmp_3d   = 0.d0
+    tmp1     = 0.d0
+    tmp2     = 0.d0
+    tmp3     = 0.d0
+    tmpval_1 = 0.d0
+    tmpval_2 = 0.d0
+    tmpvec_1 = 0.d0
+    tmpvec_2 = 0.d0
+    tmpvec_3 = 0.d0
+
+    !$OMP DO
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        ! threads hold different i but update the same elements: lock once per block, not per element
+        !$OMP CRITICAL
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END CRITICAL
+
+        do p2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , 0.d0, tmp_3d(1,1,1), mo_num)
+
+        !$OMP CRITICAL
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+            enddo
+          enddo
+        enddo
+        !$OMP END CRITICAL
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     &
+                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP CRITICAL
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END CRITICAL
+
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3)
+
+              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1)
+              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1)
+              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp3(1,1,1), 3*n_points_final_grid                    &
+                    , tmp1(1,1,1), 3*n_points_final_grid                    &
+                    , 1.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP CRITICAL
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END CRITICAL
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+    !$OMP END DO
+
+    deallocate(tmp_2d, tmp_3d, tmp1, tmp2, tmp3)
+    deallocate(tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+
+    !$OMP END PARALLEL
+
+    ! purely open-shell part
+    if(Ne(2) < Ne(1)) then
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+      !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+      !$OMP          tmpval_1, tmpval_2,                              &
+      !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+      !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         no_aaa_contraction)
+
+      allocate(tmp_2d(mo_num,mo_num))
+      allocate(tmp_3d(mo_num,mo_num,mo_num))
+      allocate(tmp1(n_points_final_grid,3,mo_num))
+      allocate(tmp2(n_points_final_grid,mo_num))
+      allocate(tmp3(n_points_final_grid,3,mo_num))
+      allocate(tmpval_1(n_points_final_grid))
+      allocate(tmpval_2(n_points_final_grid))
+      allocate(tmpvec_1(n_points_final_grid,3))
+      allocate(tmpvec_2(n_points_final_grid,3))
+      allocate(tmpvec_3(n_points_final_grid,3))
+
+      tmp_2d   = 0.d0
+      tmp_3d   = 0.d0
+      tmp1     = 0.d0
+      tmp2     = 0.d0
+      tmp3     = 0.d0
+      tmpval_1 = 0.d0
+      tmpval_2 = 0.d0
+      tmpvec_1 = 0.d0
+      tmpvec_2 = 0.d0
+      tmpvec_3 = 0.d0
+
+      !$OMP DO
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+        do h1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          !$OMP CRITICAL
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END CRITICAL
+
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          !$OMP CRITICAL
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              enddo
+            enddo
+          enddo
+          !$OMP END CRITICAL
+
+          do p1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     &
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP CRITICAL
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END CRITICAL
+
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3)
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1)
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1)
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1)
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                     &
+                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP CRITICAL
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END CRITICAL
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+      !$OMP END DO
+
+      deallocate(tmp_2d, tmp_3d, tmp1, tmp2, tmp3)
+      deallocate(tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+
+      !$OMP END PARALLEL
+    endif
+
+    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) &
+    !$OMP SHARED (no_aaa_contraction, mo_num)
+
+    !$OMP DO
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aaa_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index 4f190407..4404bc02 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -21,6 +21,7 @@ program tc_bi_ortho
   !call test_no()
   call test_no_aba()
   call test_no_aab()
+  call test_no_aaa()
 end
 
 subroutine test_h_u0
@@ -382,4 +383,43 @@ end
 
 ! ---
 
+subroutine test_no_aaa()
 
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+  ! Regression check: compare no_aaa_contraction (new) element by element against no_aaa_contraction_v0 (ref).
+  print*, ' testing no_aaa_contraction ...'
+  ! thr: largest absolute difference |new - ref| tolerated on a single element.
+  thr = 1d-8
+  ! Request both IRPF90 entities explicitly before the comparison loops read them.
+  PROVIDE no_aaa_contraction_v0
+  PROVIDE no_aaa_contraction
+  ! accu sums |new - ref| over all mo_num**4 elements; its average is printed after the loops.
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+          ! The first element whose difference exceeds thr is reported (indices, ref, new, diff) and stops the program.
+          new = no_aaa_contraction   (l,k,j,i)
+          ref = no_aaa_contraction_v0(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on no_aaa_contraction'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on no_aaa_contraction = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---

From 373c46303337fcaaea795425da5ea1cd53364c02 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 10 Jun 2023 18:09:20 +0200
Subject: [PATCH 70/79] Normal Ordering: Enhanced //

---
 src/tc_bi_ortho/normal_ordered.irp.f   | 954 ++++++++++++++++++++++++-
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f |   8 +-
 2 files changed, 951 insertions(+), 11 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 7259c270..ca5875c9 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -1,7 +1,7 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_v0, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC 
   ! Normal ordering of the three body interaction on the HF density
@@ -18,13 +18,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
 
-  print*,' Providing normal_two_body_bi_orth ...'
+  print*,' Providing normal_two_body_bi_orth_v0 ...'
   call wall_time(walli)
  
   if(read_tc_norm_ord) then
 
-    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
-      read(11) normal_two_body_bi_orth
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_v0', action="read")
+      read(11) normal_two_body_bi_orth_v0
     close(11)
 
   else
@@ -318,7 +318,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
     call wall_time(wall1)
     print*,' Wall time for aba_contraction', wall1-wall0
 
-    normal_two_body_bi_orth = tmp
+    normal_two_body_bi_orth_v0 = tmp
 
     ! ---
     ! aab contraction
@@ -491,12 +491,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
         enddo
       enddo
     enddo
+    !$OMP END DO
     !$OMP END PARALLEL
 
     call wall_time(wall1)
     print*,' Wall time for aab_contraction', wall1-wall0
 
-    normal_two_body_bi_orth += tmp
+    normal_two_body_bi_orth_v0 += tmp
 
     ! ---
     ! aaa contraction
@@ -1002,9 +1003,948 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
           enddo
         enddo
       enddo
+      !$OMP END DO
       !$OMP END PARALLEL
 
-      call wall_time(wallf)
+      call wall_time(wall1)
+      print*,' Wall time for aaa_contraction', wall1-wall0
+
+      normal_two_body_bi_orth_v0 += tmp
+    endif ! Ne(2) .ge. 3
+
+    deallocate(tmp)
+
+  endif ! read_tc_norm_ord
+
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_v0', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth_v0
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read')
+  endif
+
+  call wall_time(wallf)
+  print*,' Wall time for normal_two_body_bi_orth_v0 ', wallf-walli
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC 
+  ! Normal ordering of the three body interaction on the HF density
+  END_DOC 
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: hh1, hh2, pp1, pp2
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1, walli, wallf
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+
+  print*,' Providing normal_two_body_bi_orth ...'
+  call wall_time(walli)
+ 
+  if(read_tc_norm_ord) then
+
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
+      read(11) normal_two_body_bi_orth
+    close(11)
+
+  else
+
+    double precision, allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+    double precision, allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+    double precision, allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+    double precision, allocatable :: tmp(:,:,:,:)
+
+    PROVIDE N_int
+
+    allocate( occ(N_int*bit_kind_size,2) )
+    allocate( key_i_core(N_int,2) )
+
+    if(core_tc_op) then
+      do i = 1, N_int
+        key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+      enddo
+      call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+    else
+      call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+    endif
+
+    allocate(tmp(mo_num,mo_num,mo_num,mo_num))
+
+    ! ---
+    ! aba contraction
+
+    print*,' Providing aba_contraction ...'
+    call wall_time(wall0)
+
+    tmp = 0.d0
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+    !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+    !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+    !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp)
+
+    allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+    allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+    tmp_3d   = 0.d0
+    tmp_2d   = 0.d0
+    tmp1     = 0.d0
+    tmp2     = 0.d0
+    tmpval_1 = 0.d0
+    tmpval_2 = 0.d0
+    tmpvec_1 = 0.d0
+    tmpvec_2 = 0.d0 
+
+    !$OMP DO
+
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                             ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                             - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+    !$OMP END DO
+
+    deallocate(tmp_3d, tmp_2d)
+    deallocate(tmp1, tmp2)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2)
+
+    !$OMP END PARALLEL
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, h1, p1, h2, p2, i, ii,                   &
+      !$OMP          tmp_3d, tmp_2d, tmp1, tmp2,                      &
+      !$OMP          tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)          & 
+      !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp)
+
+      Allocate(tmp_3d(mo_num,mo_num,mo_num), tmp_2d(mo_num,mo_num))
+      Allocate(tmp1(n_points_final_grid,3,mo_num), tmp2(n_points_final_grid,mo_num))
+      Allocate(tmpval_1(n_points_final_grid), tmpval_2(n_points_final_grid))
+      Allocate(tmpvec_1(n_points_final_grid,3), tmpvec_2(n_points_final_grid,3))
+
+      Tmp_3d   = 0.d0
+      Tmp_2d   = 0.d0
+      Tmp1     = 0.d0
+      Tmp2     = 0.d0
+      Tmpval_1 = 0.d0
+      Tmpval_2 = 0.d0
+      Tmpvec_1 = 0.d0
+      Tmpvec_2 = 0.d0 
+
+      !$OMP DO
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+        do h1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+            tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          enddo
+
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                           &
+                               ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                               - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                               - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                               - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+            enddo
+
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+      !$OMP END DO
+
+      deallocate(tmp_3d, tmp_2d)
+      deallocate(tmp1, tmp2)
+      deallocate(tmpval_1, tmpval_2)
+      deallocate(tmpvec_1, tmpvec_2)
+
+      !$OMP END PARALLEL
+    endif
+
+    tmp = -0.5d0 * tmp
+    call sum_A_At(tmp(1,1,1,1), mo_num*mo_num)
+
+    call wall_time(wall1)
+    print*,' Wall time for aba_contraction', wall1-wall0
+
+    normal_two_body_bi_orth = tmp
+
+    ! ---
+    ! aab contraction
+
+    print*,' Providing aab_contraction ...'
+    call wall_time(wall0)
+
+    tmp = 0.d0
+
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, ii, i, h1, p1, h2, p2,                   &
+    !$OMP          tmp_2d, tmp_3d, tmp1, tmp2,                      &
+    !$OMP          tmpval_1, tmpvec_1)                              &
+    !$OMP SHARED (n_points_final_grid, mo_num, Ne, occ,             &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp)
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+
+    tmp_2d   = 0.d0
+    tmp_3d   = 0.d0
+    tmp1     = 0.d0
+    tmp2     = 0.d0
+    tmpval_1 = 0.d0
+    tmpvec_1 = 0.d0
+
+    !$OMP DO
+
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      do h1 = 1, mo_num
+
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        enddo
+
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+        enddo
+
+        do p1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+          enddo
+
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              !$OMP CRITICAL
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              !$OMP END CRITICAL
+            enddo
+          enddo
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+    !$OMP END DO
+
+    deallocate(tmp_3d)
+    deallocate(tmp1, tmp2)
+    deallocate(tmpval_1)
+    deallocate(tmpvec_1)
+
+    !$OMP END PARALLEL
+
+    tmp = -0.5d0 * tmp
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (tmp, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call wall_time(wall1)
+    print*,' Wall time for aab_contraction', wall1-wall0
+
+    normal_two_body_bi_orth += tmp
+
+    ! ---
+    ! aaa contraction
+
+    if(Ne(2) .ge. 3) then
+
+      print*,' Providing aaa_contraction ...'
+      call wall_time(wall0)
+
+      tmp = 0.d0
+
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+      !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+      !$OMP          tmpval_1, tmpval_2,                              &
+      !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+      !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp)
+
+      allocate(tmp_2d(mo_num,mo_num))
+      allocate(tmp_3d(mo_num,mo_num,mo_num))
+      allocate(tmp1(n_points_final_grid,3,mo_num))
+      allocate(tmp2(n_points_final_grid,mo_num))
+      allocate(tmp3(n_points_final_grid,3,mo_num))
+      allocate(tmpval_1(n_points_final_grid))
+      allocate(tmpval_2(n_points_final_grid))
+      allocate(tmpvec_1(n_points_final_grid,3))
+      allocate(tmpvec_2(n_points_final_grid,3))
+      allocate(tmpvec_3(n_points_final_grid,3))
+
+      tmp_2d   = 0.d0
+      tmp_3d   = 0.d0
+      tmp1     = 0.d0
+      tmp2     = 0.d0
+      tmp3     = 0.d0
+      tmpval_1 = 0.d0
+      tmpval_2 = 0.d0
+      tmpvec_1 = 0.d0
+      tmpvec_2 = 0.d0
+      tmpvec_3 = 0.d0
+
+      !$OMP DO
+      do ii = 1, Ne(2)
+        i = occ(ii,2)
+
+        do h1 = 1, mo_num
+
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+          enddo
+
+          do p1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                      , tmp2(1,1), n_points_final_grid                      &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                !$OMP CRITICAL
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                !$OMP END CRITICAL
+              enddo
+            enddo
+
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , tmp2(1,1), n_points_final_grid                      &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                    &
+                      , tmp1(1,1,1), 3*n_points_final_grid                    &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP CRITICAL
+            do h2 = 1, mo_num  ! lock taken once per (p1,h1) tile, not once per element
+              do p2 = 1, mo_num
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END CRITICAL
+
+          enddo ! p1
+        enddo ! h1
+      enddo ! i
+      !$OMP END DO
+
+      deallocate(tmp_2d)
+      deallocate(tmp_3d)
+      deallocate(tmp1)
+      deallocate(tmp2)
+      deallocate(tmp3)
+      deallocate(tmpval_1)
+      deallocate(tmpval_2)
+      deallocate(tmpvec_1)
+      deallocate(tmpvec_2)
+      deallocate(tmpvec_3)
+
+      !$OMP END PARALLEL
+
+      ! purely open-shell part 
+      if(Ne(2) < Ne(1)) then
+
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint, i, ii, h1, h2, p1, p2,                   &
+        !$OMP          tmp_2d, tmp_3d, tmp1, tmp2, tmp3,                &
+        !$OMP          tmpval_1, tmpval_2,                              &
+        !$OMP          tmpvec_1, tmpvec_2, tmpvec_3)                    &
+        !$OMP SHARED (n_points_final_grid, Ne, occ, mo_num,             &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmp)
+
+        allocate(tmp_2d(mo_num,mo_num))
+        allocate(tmp_3d(mo_num,mo_num,mo_num))
+        allocate(tmp1(n_points_final_grid,3,mo_num))
+        allocate(tmp2(n_points_final_grid,mo_num))
+        allocate(tmp3(n_points_final_grid,3,mo_num))
+        allocate(tmpval_1(n_points_final_grid))
+        allocate(tmpval_2(n_points_final_grid))
+        allocate(tmpvec_1(n_points_final_grid,3))
+        allocate(tmpvec_2(n_points_final_grid,3))
+        allocate(tmpvec_3(n_points_final_grid,3))
+
+        tmp_2d   = 0.d0
+        tmp_3d   = 0.d0
+        tmp1     = 0.d0
+        tmp2     = 0.d0
+        tmp3     = 0.d0
+        tmpval_1 = 0.d0
+        tmpval_2 = 0.d0
+        tmpvec_1 = 0.d0
+        tmpvec_2 = 0.d0
+        tmpvec_3 = 0.d0
+
+        !$OMP DO
+
+        do ii = Ne(2) + 1, Ne(1)
+          i = occ(ii,1)
+
+          do h1 = 1, mo_num
+
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            enddo
+
+            do p1 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+                tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+                tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+            !$OMP CRITICAL
+            do p1 = 1, mo_num  ! lock taken once per h1 block, not once per element
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                enddo
+              enddo
+            enddo
+            !$OMP END CRITICAL
+
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+                tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+                tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+              enddo
+            enddo
+
+            call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , 0.d0, tmp_3d(1,1,1), mo_num)
+
+            !$OMP CRITICAL
+            do p1 = 1, mo_num  ! lock taken once per h1 block, not once per element
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                enddo
+              enddo
+            enddo
+            !$OMP END CRITICAL
+
+            do p1 = 1, mo_num
+
+              do ipoint = 1, n_points_final_grid
+
+                tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                                 ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+                tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+                tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+                tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+                tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              enddo
+
+              do h2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                  + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                  tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                  tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                  tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+                enddo
+              enddo
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              !$OMP CRITICAL
+              do h2 = 1, mo_num  ! lock taken once per (p1,h1) tile, not once per element
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                enddo
+              enddo
+              !$OMP END CRITICAL
+
+              do p2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                  tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                  tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                  tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+                enddo
+              enddo
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                        , tmp3(1,1,1), 3*n_points_final_grid                     &
+                        , tmp1(1,1,1), 3*n_points_final_grid                     &
+                        , 1.d0, tmp_2d(1,1), mo_num)
+
+              !$OMP CRITICAL
+              do h2 = 1, mo_num  ! lock taken once per (p1,h1) tile, not once per element
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                enddo
+              enddo
+              !$OMP END CRITICAL
+
+            enddo ! p1
+          enddo ! h1
+        enddo !i
+        !$OMP END DO
+
+        deallocate(tmp_2d)
+        deallocate(tmp_3d)
+        deallocate(tmp1)
+        deallocate(tmp2)
+        deallocate(tmp3)
+        deallocate(tmpval_1)
+        deallocate(tmpval_2)
+        deallocate(tmpvec_1)
+        deallocate(tmpvec_2)
+        deallocate(tmpvec_3)
+
+        !$OMP END PARALLEL
+      endif
+
+      tmp = -0.5d0 * tmp
+
+      !$OMP PARALLEL                 &
+      !$OMP DEFAULT (NONE)           &
+      !$OMP PRIVATE (h1, h2, p1, p2) & 
+      !$OMP SHARED (tmp, mo_num)
+
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 1, mo_num
+            do p2 = p1, mo_num
+              tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+
+      !$OMP DO 
+      do h1 = 1, mo_num-1
+        do h2 = h1+1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) *= -1.d0
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call wall_time(wall1)
       print*,' Wall time for aaa_contraction', wall1-wall0
 
       normal_two_body_bi_orth += tmp
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index 4404bc02..902f7295 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -18,10 +18,10 @@ program tc_bi_ortho
 ! call timing_single
 ! call timing_double
 
-  !call test_no()
-  call test_no_aba()
-  call test_no_aab()
-  call test_no_aaa()
+  call test_no()
+  !call test_no_aba()
+  !call test_no_aab()
+  !call test_no_aaa()
 end
 
 subroutine test_h_u0

From 24f91e9bec8b255e9e790cb977d37c2d86877ce0 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 11 Jun 2023 11:41:48 +0200
Subject: [PATCH 71/79] Choose a port number based on PID

---
 ocaml/qp_run.ml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ocaml/qp_run.ml b/ocaml/qp_run.ml
index b9d14efe..0cb862ae 100644
--- a/ocaml/qp_run.ml
+++ b/ocaml/qp_run.ml
@@ -38,7 +38,8 @@ let run slave ?prefix exe ezfio_file =
       | Unix.Unix_error _ -> try_new_port (port_number+100)
     in
     let result =
-      try_new_port 41279
+      let port = 10*(Unix.getpid () mod 2823) + 32_769 in
+      try_new_port port
     in
     Zmq.Socket.close dummy_socket;
     Zmq.Context.terminate zmq_context;

From 2f6c7e4ba00a7fb9a2dc6f1cabba87e8f036211f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 11 Jun 2023 12:19:39 +0200
Subject: [PATCH 72/79] Update test in FCI

---
 src/fci/40.fci.bats | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fci/40.fci.bats b/src/fci/40.fci.bats
index 3c4a93c7..889bf90a 100644
--- a/src/fci/40.fci.bats
+++ b/src/fci/40.fci.bats
@@ -209,7 +209,7 @@ function run_stoch() {
   [[ -n $TRAVIS ]] && skip
   qp set_file cu_nh3_4_2plus.ezfio
   qp set_mo_class --core="[1-24]" --act="[25-45]" --del="[46-87]"
-  run -1862.9869374387192  3.e-04  100000
+  run -1862.98320066637   3.e-04  100000
 }
 
 @test "HCN" { # 20.3273s

From bb23d6a5b5160387ef0695d1a07e7f4ef86f71b6 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 12 Jun 2023 13:36:01 +0200
Subject: [PATCH 73/79] Fixed the pt_charges bug:   + added the pt_charges
 integrals to the usual v_ne   + added only the nuclei_pt_charge interaction
 to the usual nuclear_repulsion (and not the pt_charge_pt_charge interaction)

---
 src/ao_one_e_ints/pot_ao_ints.irp.f | 3 +++
 src/hartree_fock/10.hf.bats         | 5 +++--
 src/nuclei/nuclei.irp.f             | 7 ++++++-
 src/nuclei/point_charges.irp.f      | 3 +++
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/ao_one_e_ints/pot_ao_ints.irp.f b/src/ao_one_e_ints/pot_ao_ints.irp.f
index 446bf730..4f9ae76d 100644
--- a/src/ao_one_e_ints/pot_ao_ints.irp.f
+++ b/src/ao_one_e_ints/pot_ao_ints.irp.f
@@ -104,6 +104,9 @@ BEGIN_PROVIDER [ double precision, ao_integrals_n_e, (ao_num,ao_num)]
     IF(do_pseudo) THEN
        ao_integrals_n_e += ao_pseudo_integrals
     ENDIF
+    IF(point_charges) THEN
+       ao_integrals_n_e += ao_integrals_pt_chrg
+    ENDIF
 
   endif
 
diff --git a/src/hartree_fock/10.hf.bats b/src/hartree_fock/10.hf.bats
index 3647b775..6e7d0233 100644
--- a/src/hartree_fock/10.hf.bats
+++ b/src/hartree_fock/10.hf.bats
@@ -43,12 +43,11 @@ python write_pt_charges.py ${EZFIO}
 qp set nuclei point_charges True
 qp run scf | tee ${EZFIO}.pt_charges.out
   energy="$(ezfio get hartree_fock energy)"
-good=-92.76613324421798
+good=-92.79920682236470
   eq $energy $good $thresh
 rm -rf $EZFIO
 }
 
-
 @test "H2_1" { # 1s
   run h2_1.ezfio -1.005924963288527
 }
@@ -85,6 +84,8 @@ rm -rf $EZFIO
   run hcn.ezfio -92.88717500035233
 }
 
+
+
 @test "B-B" { # 3s
   run b2_stretched.ezfio -48.9950585434279
 }
diff --git a/src/nuclei/nuclei.irp.f b/src/nuclei/nuclei.irp.f
index fabdc42e..bb8cc782 100644
--- a/src/nuclei/nuclei.irp.f
+++ b/src/nuclei/nuclei.irp.f
@@ -206,7 +206,12 @@ BEGIN_PROVIDER [ double precision, nuclear_repulsion ]
      enddo
      nuclear_repulsion *= 0.5d0
      if(point_charges)then
-      nuclear_repulsion += pt_chrg_nuclei_interaction + pt_chrg_interaction
+      print*,'bare nuclear repulsion = ',nuclear_repulsion 
+      print*,'adding the interaction between the nuclei and the point charges'
+      print*,'to the usual nuclear repulsion '
+      nuclear_repulsion += pt_chrg_nuclei_interaction 
+      print*,'new nuclear repulsion =  ',nuclear_repulsion 
+      print*,'WARNING: we do not add the interaction between the point charges themselves'
      endif
    end if
 
diff --git a/src/nuclei/point_charges.irp.f b/src/nuclei/point_charges.irp.f
index b955537f..66905c8c 100644
--- a/src/nuclei/point_charges.irp.f
+++ b/src/nuclei/point_charges.irp.f
@@ -205,5 +205,8 @@ BEGIN_PROVIDER [ double precision, pt_chrg_nuclei_interaction]
  enddo
  print*,'Interaction between point charges and nuclei'
  print*,'pt_chrg_nuclei_interaction = ',pt_chrg_nuclei_interaction
+ if(point_charges)then
+  provide pt_chrg_interaction
+ endif
 END_PROVIDER 
 

From 4d9e28438c199c7f8956913b7380c5ba6ec07932 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 12 Jun 2023 14:05:36 +0200
Subject: [PATCH 74/79] Improved  I/O in CCSD

---
 src/ao_two_e_ints/cholesky.irp.f  | 13 ++----
 src/ccsd/EZFIO.cfg                | 11 +++++
 src/ccsd/ccsd_space_orb_sub.irp.f | 11 +++--
 src/ccsd/ccsd_spin_orb_sub.irp.f  | 14 +++---
 src/ccsd/save_energy.irp.f        | 13 ++++++
 src/mo_two_e_ints/cholesky.irp.f  |  2 +
 src/utils/linear_algebra.irp.f    |  6 +--
 src/utils_cc/EZFIO.cfg            | 16 +++----
 src/utils_cc/guess_t.irp.f        | 75 +++++++++++++++----------------
 9 files changed, 88 insertions(+), 73 deletions(-)
 create mode 100644 src/ccsd/EZFIO.cfg
 create mode 100644 src/ccsd/save_energy.irp.f

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index bb81b141..77eb6ddc 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -4,7 +4,7 @@ BEGIN_PROVIDER [ integer, cholesky_ao_num_guess ]
  ! Number of Cholesky vectors in AO basis
  END_DOC
 
- cholesky_ao_num_guess = ao_num*ao_num / 2
+ cholesky_ao_num_guess = ao_num*ao_num 
 END_PROVIDER
 
  BEGIN_PROVIDER [ integer, cholesky_ao_num ]
@@ -44,19 +44,12 @@ END_PROVIDER
    do m=0,9
      do l=1+m,ao_num,10
        !$OMP DO SCHEDULE(dynamic)
-       do j=1,l
+       do j=1,ao_num
          do k=1,ao_num
-           do i=1,min(k,j)
+           do i=1,ao_num
              if (ao_two_e_integral_zero(i,j,k,l)) cycle
              integral = get_ao_two_e_integral(i,j,k,l, ao_integrals_map)
              ao_integrals(i,k,j,l) = integral
-             ao_integrals(k,i,j,l) = integral
-             ao_integrals(i,k,l,j) = integral
-             ao_integrals(k,i,l,j) = integral
-             ao_integrals(j,l,i,k) = integral
-             ao_integrals(j,l,k,i) = integral
-             ao_integrals(l,j,i,k) = integral
-             ao_integrals(l,j,k,i) = integral
            enddo
          enddo
        enddo
diff --git a/src/ccsd/EZFIO.cfg b/src/ccsd/EZFIO.cfg
new file mode 100644
index 00000000..328cd981
--- /dev/null
+++ b/src/ccsd/EZFIO.cfg
@@ -0,0 +1,11 @@
+[energy]
+type: double precision
+doc: CCSD energy
+interface: ezfio
+
+[energy_t]
+type: double precision
+doc: CCSD(T) energy
+interface: ezfio
+
+
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 1467d9a4..40c57188 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -135,8 +135,11 @@ subroutine run_ccsd_space_orb
   write(*,'(A15,1pE10.2,A3)')' Conv        = ', max_r
   print*,''
 
-  call write_t1(nO,nV,t1)
-  call write_t2(nO,nV,t2)
+  if (write_amplitudes) then
+    call write_t1(nO,nV,t1)
+    call write_t2(nO,nV,t2)
+    call ezfio_set_utils_cc_io_amplitudes('Read')
+  endif
 
   ! Deallocation
   if (cc_update_method == 'diis') then
@@ -147,6 +150,7 @@ subroutine run_ccsd_space_orb
 
   ! CCSD(T)
   double precision :: e_t
+  e_t = 0.d0
 
   if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
 
@@ -182,8 +186,7 @@ subroutine run_ccsd_space_orb
     print*,''
   endif
 
-  print*,'Reference determinant:'
-  call print_det(det,N_int)
+  call save_energy(uncorr_energy + energy, e_t)
 
   deallocate(t1,t2)
 
diff --git a/src/ccsd/ccsd_spin_orb_sub.irp.f b/src/ccsd/ccsd_spin_orb_sub.irp.f
index 23e2cef1..a267cc45 100644
--- a/src/ccsd/ccsd_spin_orb_sub.irp.f
+++ b/src/ccsd/ccsd_spin_orb_sub.irp.f
@@ -269,8 +269,11 @@ subroutine run_ccsd_spin_orb
   write(*,'(A15,1pE10.2,A3)')' Conv        = ', max_r
   print*,''
 
-  call write_t1(nO,nV,t1)
-  call write_t2(nO,nV,t2)
+  if (write_amplitudes) then
+    call write_t1(nO,nV,t1)
+    call write_t2(nO,nV,t2)
+    call ezfio_set_utils_cc_io_amplitudes('Read')
+  endif
 
   ! Deallocate
   if (cc_update_method == 'diis') then
@@ -284,8 +287,9 @@ subroutine run_ccsd_spin_orb
   deallocate(v_ovoo,v_oovo)
   deallocate(v_ovvo,v_ovov,v_oovv)
   
+  double precision :: t_corr
+  t_corr = 0.d0
   if (cc_par_t .and. elec_alpha_num  +elec_beta_num > 2) then
-    double precision :: t_corr
     print*,'CCSD(T) calculation...'
     call wall_time(ta)
     !allocate(v_vvvo(nV,nV,nV,nO))
@@ -307,8 +311,8 @@ subroutine run_ccsd_spin_orb
     write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + t_corr, ' Ha'
     print*,''
   endif
-  print*,'Reference determinant:'
-  call print_det(det,N_int)
+
+  call save_energy(uncorr_energy + energy, t_corr)
   
   deallocate(f_oo,f_ov,f_vv,f_o,f_v)
   deallocate(v_ooov,v_vvoo,t1,t2)
diff --git a/src/ccsd/save_energy.irp.f b/src/ccsd/save_energy.irp.f
new file mode 100644
index 00000000..30d93ec3
--- /dev/null
+++ b/src/ccsd/save_energy.irp.f
@@ -0,0 +1,13 @@
+subroutine save_energy(E,ET)
+  implicit none
+  BEGIN_DOC
+! Saves E as the CCSD energy in |EZFIO|; if ET is non-zero, also saves E+ET as the CCSD(T) energy.
+  END_DOC
+  double precision, intent(in) :: E, ET
+  call ezfio_set_ccsd_energy(E)
+  if (ET /= 0.d0) then
+    call ezfio_set_ccsd_energy_t(E+ET)
+  endif
+end
+
+
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 8b1e6e1c..32c0dccd 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -27,6 +27,8 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_ao_num, mo_num,
  double precision, allocatable :: buffer(:,:)
 
  print *, 'AO->MO Transformation of Cholesky vectors  .'
+
+ call set_multiple_levels_omp(.False.)
  !$OMP PARALLEL PRIVATE(i,j,k,buffer)
  allocate(buffer(mo_num,mo_num))
  !$OMP DO SCHEDULE(static)
diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f
index 69873bc0..76a539a6 100644
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@@ -1831,7 +1831,7 @@ double precision, intent(in)     :: tol
 
 integer, dimension(:), allocatable          :: piv
 double precision, dimension(:), allocatable :: work
-character, parameter :: uplo = "U"
+character, parameter :: uplo = 'L'
 integer :: LDA
 integer :: info
 integer :: k, l, rank0
@@ -1848,14 +1848,14 @@ if (rank > rank0) then
 end if
 
 do k = 1, ndim
-  A(k+1:ndim, k) = 0.00D+0
+  A(k,k+1:ndim) = 0.00D+0
 end do
 ! TODO: It should be possible to use only one vector of size (1:rank) as a buffer
 ! to do the swapping in-place
 U(:,:) = 0.00D+0
 do k = 1, ndim
   l = piv(k)
-  U(l, 1:rank) = A(1:rank, k)
+  U(l, 1:rank) = A(k,1:rank)
 end do
 
 end subroutine pivoted_cholesky
diff --git a/src/utils_cc/EZFIO.cfg b/src/utils_cc/EZFIO.cfg
index 71ee87e3..fb6d9034 100644
--- a/src/utils_cc/EZFIO.cfg
+++ b/src/utils_cc/EZFIO.cfg
@@ -46,17 +46,11 @@ doc: Guess used to initialize the T2 amplitudes. none -> 0, MP -> perturbation t
 interface: ezfio,ocaml,provider
 default: MP
 
-[cc_write_t1]
-type: logical
-doc: If true, it will write on disk the T1 amplitudes at the end of the calculation.
-interface: ezfio,ocaml,provider
-default: False
-
-[cc_write_t2]
-type: logical
-doc: If true, it will write on disk the T2 amplitudes at the end of the calculation.
-interface: ezfio,ocaml,provider
-default: False
+[io_amplitudes]
+type: Disk_access
+doc: Read/Write |CCSD| amplitudes from/to disk [ Write | Read | None ]
+interface: ezfio,provider,ocaml
+default: None
 
 [cc_par_t]
 type: logical
diff --git a/src/utils_cc/guess_t.irp.f b/src/utils_cc/guess_t.irp.f
index 42acdf78..bb26e133 100644
--- a/src/utils_cc/guess_t.irp.f
+++ b/src/utils_cc/guess_t.irp.f
@@ -91,16 +91,17 @@ subroutine write_t1(nO,nV,t1)
   double precision, intent(in) :: t1(nO, nV)
 
   ! internal
-  integer                      :: i,a
+  integer                      :: i,a, iunit
+  integer, external :: getunitandopen
 
-  if (cc_write_t1) then
-    open(unit=11, file=trim(ezfio_filename)//'/cc_utils/T1')
+  if (write_amplitudes) then
+    iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','w')
     do a = 1, nV
       do i = 1, nO
-         write(11,'(F20.12)') t1(i,a)
+         write(iunit,'(F20.12)') t1(i,a)
       enddo
     enddo
-    close(11)
+    close(iunit)
   endif
   
 end
@@ -120,20 +121,21 @@ subroutine write_t2(nO,nV,t2)
   double precision, intent(in) :: t2(nO, nO, nV, nV)
 
   ! internal
-  integer                      :: i,j,a,b
+  integer                      :: i,j,a,b, iunit
+  integer, external :: getunitandopen
 
-  if (cc_write_t2) then
-    open(unit=11, file=trim(ezfio_filename)//'/cc_utils/T2')
+  if (write_amplitudes) then
+    iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','w')
     do b = 1, nV
       do a = 1, nV
         do j = 1, nO
           do i = 1, nO
-             write(11,'(F20.12)') t2(i,j,a,b)
+             write(iunit,'(F20.12)') t2(i,j,a,b)
           enddo
         enddo
       enddo
     enddo
-    close(11)
+    close(iunit)
   endif
   
 end
@@ -153,23 +155,19 @@ subroutine read_t1(nO,nV,t1)
   double precision, intent(out) :: t1(nO, nV)
 
   ! internal
-  integer                       :: i,a
+  integer                       :: i,a, iunit
   logical                       :: ok
+  integer, external :: getunitandopen
 
-  inquire(file=trim(ezfio_filename)//'/cc_utils/T1', exist=ok)
-  if (.not. ok) then
-     print*, 'There is no file'// trim(ezfio_filename)//'/cc_utils/T1'
-     print*, 'Do a first calculation with cc_write_t1 = True'
-     print*, 'and cc_guess_t1 /= read before setting cc_guess_t1 = read'
-     call abort
-  endif
-  open(unit=11, file=trim(ezfio_filename)//'/cc_utils/T1')
-  do a = 1, nV
-    do i = 1, nO
-       read(11,'(F20.12)') t1(i,a)
+  if (read_amplitudes) then
+    iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T1','r')
+    do a = 1, nV
+      do i = 1, nO
+         read(iunit,'(F20.12)') t1(i,a)
+      enddo
     enddo
-  enddo
-  close(11)
+    close(iunit)
+  endif
   
 end
 
@@ -188,26 +186,23 @@ subroutine read_t2(nO,nV,t2)
   double precision, intent(out) :: t2(nO, nO, nV, nV)
 
   ! internal
-  integer                       :: i,j,a,b
+  integer                       :: i,j,a,b, iunit
   logical                       :: ok
 
-  inquire(file=trim(ezfio_filename)//'/cc_utils/T1', exist=ok)
-  if (.not. ok) then
-     print*, 'There is no file'// trim(ezfio_filename)//'/cc_utils/T1'
-     print*, 'Do a first calculation with cc_write_t2 = True'
-     print*, 'and cc_guess_t2 /= read before setting cc_guess_t2 = read'
-     call abort
-  endif
-  open(unit=11, file=trim(ezfio_filename)//'/cc_utils/T2')
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-           read(11,'(F20.12)') t2(i,j,a,b)
+  integer, external :: getunitandopen
+
+  if (read_amplitudes) then
+    iunit = getUnitAndOpen(trim(ezfio_filename)//'/work/T2','r')
+    do b = 1, nV
+      do a = 1, nV
+        do j = 1, nO
+          do i = 1, nO
+             read(iunit,'(F20.12)') t2(i,j,a,b)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  close(11)
+    close(iunit)
+  endif
   
 end

From 2f246780eb0a08ee87ab5d98fe9c0e2f17685594 Mon Sep 17 00:00:00 2001
From: ydamour <yann.damour@hotmail.fr>
Date: Tue, 13 Jun 2023 14:05:13 +0200
Subject: [PATCH 75/79] fix bug in get_excitation_general

---
 src/utils_cc/org/phase.org | 2 ++
 src/utils_cc/phase.irp.f   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/utils_cc/org/phase.org b/src/utils_cc/org/phase.org
index 5f67859c..2156a251 100644
--- a/src/utils_cc/org/phase.org
+++ b/src/utils_cc/org/phase.org
@@ -137,6 +137,7 @@ subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,N
   do j = 1, 2
     k = 1
     do i = 1, n1(j)
+       if (k > n_anni(j)) exit
        if (l1(i,j) /= list_anni(k,j)) cycle
        pos_anni(k,j) = i
        k = k + 1
@@ -147,6 +148,7 @@ subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,N
   do j = 1, 2
     k = 1
     do i = 1, n2(j)
+       if (k > n_crea(j)) exit
        if (l2(i,j) /= list_crea(k,j)) cycle
        pos_crea(k,j) = i
        k = k + 1
diff --git a/src/utils_cc/phase.irp.f b/src/utils_cc/phase.irp.f
index 01b41f49..e0703fb8 100644
--- a/src/utils_cc/phase.irp.f
+++ b/src/utils_cc/phase.irp.f
@@ -96,6 +96,7 @@ subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,N
   do j = 1, 2
     k = 1
     do i = 1, n1(j)
+       if (k > n_anni(j)) exit
        if (l1(i,j) /= list_anni(k,j)) cycle
        pos_anni(k,j) = i
        k = k + 1
@@ -106,6 +107,7 @@ subroutine get_excitation_general(det1,det2,degree,n,list_anni,list_crea,phase,N
   do j = 1, 2
     k = 1
     do i = 1, n2(j)
+       if (k > n_crea(j)) exit
        if (l2(i,j) /= list_crea(k,j)) cycle
        pos_crea(k,j) = i
        k = k + 1

From a56644a808e0aea4c16d72c61b717ac4b2e0cabc Mon Sep 17 00:00:00 2001
From: ydamour <yann.damour@hotmail.fr>
Date: Tue, 13 Jun 2023 14:24:39 +0200
Subject: [PATCH 76/79] remove old stuffs

---
 src/mo_optimization/my_providers.irp.f | 141 -------------------------
 1 file changed, 141 deletions(-)
 delete mode 100644 src/mo_optimization/my_providers.irp.f

diff --git a/src/mo_optimization/my_providers.irp.f b/src/mo_optimization/my_providers.irp.f
deleted file mode 100644
index 7469ffd5..00000000
--- a/src/mo_optimization/my_providers.irp.f
+++ /dev/null
@@ -1,141 +0,0 @@
-! Dimensions of MOs
-
-
-BEGIN_PROVIDER [ integer, n_mo_dim ]
-  implicit none
-  BEGIN_DOC
-  ! Number of different pairs (i,j) of MOs we can build,
-  ! with i>j
-  END_DOC
-
-  n_mo_dim = mo_num*(mo_num-1)/2
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ integer, n_mo_dim_core ]
-  implicit none 
-  BEGIN_DOC
-  ! Number of different pairs (i,j) of core MOs we can build,
-  ! with i>j
-  END_DOC
-
-  n_mo_dim_core = dim_list_core_orb*(dim_list_core_orb-1)/2
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ integer, n_mo_dim_act ]
-  implicit none
-  BEGIN_DOC
-  ! Number of different pairs (i,j) of active MOs we can build,
-  ! with i>j
-  END_DOC
-
-  n_mo_dim_act = dim_list_act_orb*(dim_list_act_orb-1)/2
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ integer, n_mo_dim_inact ]
-  implicit none 
-  BEGIN_DOC
-  ! Number of different pairs (i,j) of inactive MOs we can build,
-  ! with i>j
-  END_DOC
-
-  n_mo_dim_inact = dim_list_inact_orb*(dim_list_inact_orb-1)/2
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ integer, n_mo_dim_virt ]
-  implicit none 
-  BEGIN_DOC
-  ! Number of different pairs (i,j) of virtual MOs we can build,
-  ! with i>j
-  END_DOC
-
-  n_mo_dim_virt = dim_list_virt_orb*(dim_list_virt_orb-1)/2
-
-END_PROVIDER
-
-! Energies/criterions
-
-BEGIN_PROVIDER [ double precision, my_st_av_energy ]
-  implicit none
-  BEGIN_DOC
-  ! State average CI energy
-  END_DOC
-
-  !call update_st_av_ci_energy(my_st_av_energy)
-  call state_average_energy(my_st_av_energy)
-
-END_PROVIDER
-
-! With all the MOs
-
-BEGIN_PROVIDER [ double precision, my_gradient_opt, (n_mo_dim) ]
-&BEGIN_PROVIDER [ double precision, my_CC1_opt ]
-  implicit none
-  BEGIN_DOC
-  ! - Gradient of the energy with respect to the MO rotations, for all the MOs.
-  ! - Maximal element of the gradient in absolute value 
-  END_DOC
-
-  double precision :: norm_grad
-
-  PROVIDE mo_two_e_integrals_in_map
-
-  call gradient_opt(n_mo_dim, my_gradient_opt, my_CC1_opt, norm_grad)
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ double precision, my_hessian_opt, (n_mo_dim, n_mo_dim) ]
-  implicit none
-  BEGIN_DOC
-  ! - Gradient of the energy with respect to the MO rotations, for all the MOs.
-  ! - Maximal element of the gradient in absolute value 
-  END_DOC
-
-  double precision, allocatable :: h_f(:,:,:,:)
-
-  PROVIDE mo_two_e_integrals_in_map
-
-  allocate(h_f(mo_num, mo_num, mo_num, mo_num))
-
-  call hessian_list_opt(n_mo_dim, my_hessian_opt, h_f)
-
-END_PROVIDER
-
-! With the list of active MOs
-! Can be generalized to any mo_class by changing the list/dimension
-
-BEGIN_PROVIDER [ double precision, my_gradient_list_opt, (n_mo_dim_act) ]
-&BEGIN_PROVIDER [ double precision, my_CC2_opt ]
-  implicit none
-  BEGIN_DOC
-  ! - Gradient of the energy with respect to the MO rotations, only for the active MOs !
-  ! - Maximal element of the gradient in absolute value 
-  END_DOC
-
-  double precision :: norm_grad
-
-  PROVIDE mo_two_e_integrals_in_map !one_e_dm_mo two_e_dm_mo mo_one_e_integrals 
-
-  call gradient_list_opt(n_mo_dim_act, dim_list_act_orb, list_act, my_gradient_list_opt, my_CC2_opt, norm_grad)
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ double precision, my_hessian_list_opt, (n_mo_dim_act, n_mo_dim_act) ]
-  implicit none
-  BEGIN_DOC
-  ! - Gradient of the energy with respect to the MO rotations, only for the active MOs !
-  ! - Maximal element of the gradient in absolute value 
-  END_DOC
-
-  double precision, allocatable :: h_f(:,:,:,:)
-
-  PROVIDE mo_two_e_integrals_in_map
-
-  allocate(h_f(dim_list_act_orb, dim_list_act_orb, dim_list_act_orb, dim_list_act_orb))
-
-  call hessian_list_opt(n_mo_dim_act, dim_list_act_orb, list_act, my_hessian_list_opt, h_f)
-
-END_PROVIDER

From 88f168724e65038b4d9ce9d4a566252de62f1fb5 Mon Sep 17 00:00:00 2001
From: ydamour <yann.damour@hotmail.fr>
Date: Thu, 15 Jun 2023 14:46:17 +0200
Subject: [PATCH 77/79] fix binary search (T)

---
 src/ccsd/ccsd_t_space_orb_stoch.irp.f | 82 +++++++++++++++------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index b669025e..31fe67ce 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -104,17 +104,17 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   integer*8, allocatable :: sampled(:)
 !  integer(omp_lock_kind), allocatable :: lock(:)
   integer*2       , allocatable :: abc(:,:)
-  integer*8                     :: Nabc, i8
+  integer*8                     :: Nabc, i8,kiter
   integer*8, allocatable :: iorder(:)
   double precision :: eocc
   double precision :: norm
-  integer :: kiter, isample
+  integer :: isample
 
 
   ! Prepare table of triplets (a,b,c)
 
   Nabc = (int(nV,8) * int(nV+1,8) * int(nV+2,8))/6_8 - nV
-  allocate (memo(Nabc), sampled(Nabc), Pabc(Nabc), waccu(Nabc))
+  allocate (memo(Nabc), sampled(Nabc), Pabc(Nabc), waccu(0:Nabc))
   allocate (abc(4,Nabc), iorder(Nabc)) !, lock(Nabc))
 
 !  eocc = 3.d0/dble(nO) * sum(f_o(1:nO))
@@ -124,21 +124,21 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
       do c = b+1, nV
         Nabc = Nabc + 1_8
         Pabc(Nabc) = -1.d0/(f_v(a) + f_v(b) + f_v(c))
-        abc(1,Nabc) = a
-        abc(2,Nabc) = b
-        abc(3,Nabc) = c
+        abc(1,Nabc) = int(a,2)
+        abc(2,Nabc) = int(b,2)
+        abc(3,Nabc) = int(c,2)
       enddo
 
       Nabc = Nabc + 1_8
-      abc(1,Nabc) = a
-      abc(2,Nabc) = b
-      abc(3,Nabc) = a
+      abc(1,Nabc) = int(a,2)
+      abc(2,Nabc) = int(b,2)
+      abc(3,Nabc) = int(a,2)
       Pabc(Nabc) = -1.d0/(2.d0*f_v(a) + f_v(b))
 
       Nabc = Nabc + 1_8
-      abc(1,Nabc) = b
-      abc(2,Nabc) = a
-      abc(3,Nabc) = b
+      abc(1,Nabc) = int(b,2)
+      abc(2,Nabc) = int(a,2)
+      abc(3,Nabc) = int(b,2)
       Pabc(Nabc) = -1.d0/(f_v(a) + 2.d0*f_v(b))
     enddo
   enddo
@@ -169,6 +169,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
    waccu(i8) = waccu(i8+1) - Pabc(i8+1)
   enddo
   waccu(:) = waccu(:) + 1.d0
+  waccu(0) = 0.d0
 
   logical :: converged, do_comp
   double precision :: eta, variance, error, sample
@@ -222,8 +223,12 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   do kiter=1,Nabc
 
     !$OMP MASTER
-    do while ((imin <= Nabc).and.(sampled(imin)>-1_8))
-      imin = imin+1
+    do while (imin <= Nabc)
+      if (sampled(imin)>-1_8) then
+        imin = imin+1
+      else
+        exit
+      endif
     enddo
 
     ! Deterministic part
@@ -301,6 +306,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
         endif
       enddo
 
+      isample = min(isample,nbuckets)
       do ieta=bounds(1,isample), Nabc
           w = dble(max(sampled(ieta),0_8))
           tmp = w * memo(ieta) * Pabc(ieta)
@@ -331,33 +337,39 @@ end
 
 
 
-integer*8 function binary_search(arr, key, size)
+integer*8 function binary_search(arr, key, sze)
     implicit none
     BEGIN_DOC
-! Searches the key in array arr(1:size) between l_in and r_in, and returns its index
+! Returns 0 if key < arr(1); otherwise the largest i in 1..sze with arr(i) <= key (arr assumed sorted ascending)
     END_DOC
-    integer*8 :: size, i, j, mid, l_in, r_in
-    double precision, dimension(size) :: arr(1:size)
+    integer*8 :: sze, i, j, mid
+    double precision :: arr(0:sze)
     double precision :: key
 
-    i = 1_8
-    j = size
+    if ( key < arr(1) ) then
+      binary_search = 0_8
+      return 
+    end if
 
-    do while (j >= i)
-        mid = i + (j - i) / 2
-        if (arr(mid) >= key) then
-            if (mid > 1 .and. arr(mid - 1) < key) then
-                binary_search = mid
-                return
-            end if
-            j = mid - 1
-        else if (arr(mid) < key) then
-            i = mid + 1
-        else
-            binary_search = mid + 1
-            return
-        end if
+    if ( key >= arr(sze) ) then
+      binary_search = sze
+      return 
+    end if
+
+    i = 0_8
+    j = sze + 1_8
+
+    do while (.True.)
+      mid = (i + j) / 2_8
+      if ( key >= arr(mid) ) then
+        i = mid
+      else
+        j = mid
+      end if
+      if (j-i <= 1_8) then
+        binary_search = i
+        return
+      endif
     end do
-    binary_search = i
 end function binary_search
 

From 2ef05d01c9033abccc5f8f7166b33418741b9420 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 19 Jun 2023 23:39:53 +0200
Subject: [PATCH 78/79] j1b_type 4/104 modif

---
 src/ao_many_one_e_ints/listj1b.irp.f   | 14 +++---
 src/non_h_ints_mu/j12_nucl_utils.irp.f |  8 ++--
 src/non_h_ints_mu/jast_deriv.irp.f     |  6 +--
 src/tc_keywords/EZFIO.cfg              |  6 +++
 src/tc_keywords/j1b_pen.irp.f          | 64 +++++++++++++++++++++-----
 5 files changed, 73 insertions(+), 25 deletions(-)

diff --git a/src/ao_many_one_e_ints/listj1b.irp.f b/src/ao_many_one_e_ints/listj1b.irp.f
index 02963605..33ca8085 100644
--- a/src/ao_many_one_e_ints/listj1b.irp.f
+++ b/src/ao_many_one_e_ints/listj1b.irp.f
@@ -62,6 +62,7 @@ END_PROVIDER
   double precision :: tmp_cent_x, tmp_cent_y, tmp_cent_z
 
   provide j1b_pen
+  provide j1b_pen_coef
 
   List_all_comb_b2_coef = 0.d0
   List_all_comb_b2_expo = 0.d0
@@ -127,8 +128,8 @@ END_PROVIDER
     List_all_comb_b2_expo(    1) = 0.d0
     List_all_comb_b2_cent(1:3,1) = 0.d0
     do i = 1, nucl_num
-      List_all_comb_b2_coef(  i+1) = -1.d0
-      List_all_comb_b2_expo(  i+1) = j1b_pen(   i)
+      List_all_comb_b2_coef(  i+1) = -1.d0 * j1b_pen_coef(i)
+      List_all_comb_b2_expo(  i+1) = j1b_pen(i)
       List_all_comb_b2_cent(1,i+1) = nucl_coord(i,1)
       List_all_comb_b2_cent(2,i+1) = nucl_coord(i,2)
       List_all_comb_b2_cent(3,i+1) = nucl_coord(i,3)
@@ -225,6 +226,7 @@ END_PROVIDER
   double precision :: dx, dy, dz, r2
 
   provide j1b_pen
+  provide j1b_pen_coef
 
   List_all_comb_b3_coef = 0.d0
   List_all_comb_b3_expo = 0.d0
@@ -296,8 +298,8 @@ END_PROVIDER
 
     do i = 1, nucl_num
       ii = ii + 1
-      List_all_comb_b3_coef(  ii) = -2.d0
-      List_all_comb_b3_expo(  ii) = j1b_pen(   i)
+      List_all_comb_b3_coef(  ii) = -2.d0 * j1b_pen_coef(i)
+      List_all_comb_b3_expo(  ii) = j1b_pen(i)
       List_all_comb_b3_cent(1,ii) = nucl_coord(i,1)
       List_all_comb_b3_cent(2,ii) = nucl_coord(i,2)
       List_all_comb_b3_cent(3,ii) = nucl_coord(i,3)
@@ -305,7 +307,7 @@ END_PROVIDER
 
     do i = 1, nucl_num
       ii = ii + 1
-      List_all_comb_b3_coef(  ii) = 1.d0
+      List_all_comb_b3_coef(  ii) = 1.d0 * j1b_pen_coef(i) * j1b_pen_coef(i)
       List_all_comb_b3_expo(  ii) = 2.d0 * j1b_pen(i)
       List_all_comb_b3_cent(1,ii) = nucl_coord(i,1)
       List_all_comb_b3_cent(2,ii) = nucl_coord(i,2)
@@ -337,7 +339,7 @@ END_PROVIDER
         
         ii = ii + 1
         ! x 2 to avoid doing integrals twice
-        List_all_comb_b3_coef(  ii) = 2.d0 * dexp(-tmp1*tmp2*tmp4*r2)
+        List_all_comb_b3_coef(  ii) = 2.d0 * dexp(-tmp1*tmp2*tmp4*r2) * j1b_pen_coef(i) * j1b_pen_coef(j)
         List_all_comb_b3_expo(  ii) = tmp3
         List_all_comb_b3_cent(1,ii) = tmp4 * (tmp1 * xi + tmp2 * xj)
         List_all_comb_b3_cent(2,ii) = tmp4 * (tmp1 * yi + tmp2 * yj)
diff --git a/src/non_h_ints_mu/j12_nucl_utils.irp.f b/src/non_h_ints_mu/j12_nucl_utils.irp.f
index 9b91a8ed..ac077fe0 100644
--- a/src/non_h_ints_mu/j12_nucl_utils.irp.f
+++ b/src/non_h_ints_mu/j12_nucl_utils.irp.f
@@ -35,7 +35,7 @@ BEGIN_PROVIDER [ double precision, v_1b, (n_points_final_grid)]
 
   elseif(j1b_type .eq. 4) then
 
-    ! v(r) = 1 - \sum_{a} \exp(-\alpha_a (r - r_a)^2)
+    ! v(r) = 1 - \sum_{a} \beta_a \exp(-\alpha_a (r - r_a)^2)
 
     do ipoint = 1, n_points_final_grid
 
@@ -51,7 +51,7 @@ BEGIN_PROVIDER [ double precision, v_1b, (n_points_final_grid)]
         dz = z - nucl_coord(j,3)
         d  = dx*dx + dy*dy + dz*dz
 
-        fact_r = fact_r - dexp(-a*d)
+        fact_r = fact_r - j1b_pen_coef(j) * dexp(-a*d)
       enddo
 
       v_1b(ipoint) = fact_r
@@ -125,7 +125,7 @@ BEGIN_PROVIDER [double precision, v_1b_grad, (3, n_points_final_grid)]
 
   elseif(j1b_type .eq. 4) then
 
-    ! v(r) = 1 - \sum_{a} \exp(-\alpha_a (r - r_a)^2)
+    ! v(r) = 1 - \sum_{a} \beta_a \exp(-\alpha_a (r - r_a)^2)
 
     do ipoint = 1, n_points_final_grid
 
@@ -144,7 +144,7 @@ BEGIN_PROVIDER [double precision, v_1b_grad, (3, n_points_final_grid)]
         r2 = dx*dx + dy*dy + dz*dz
 
         a = j1b_pen(j)
-        e = a * dexp(-a * r2)
+        e = a * j1b_pen_coef(j) * dexp(-a * r2)
 
         ax_der += e * dx
         ay_der += e * dy
diff --git a/src/non_h_ints_mu/jast_deriv.irp.f b/src/non_h_ints_mu/jast_deriv.irp.f
index 5e99600e..bd7ff6b7 100644
--- a/src/non_h_ints_mu/jast_deriv.irp.f
+++ b/src/non_h_ints_mu/jast_deriv.irp.f
@@ -296,7 +296,7 @@ double precision function j1b_nucl(r)
       d = ( (r(1) - nucl_coord(i,1)) * (r(1) - nucl_coord(i,1)) &
           + (r(2) - nucl_coord(i,2)) * (r(2) - nucl_coord(i,2)) &
           + (r(3) - nucl_coord(i,3)) * (r(3) - nucl_coord(i,3)) )
-      j1b_nucl = j1b_nucl - dexp(-a*d)
+      j1b_nucl = j1b_nucl - j1b_pen_coef(i) * dexp(-a*d)
     enddo
 
   elseif((j1b_type .eq. 5) .or. (j1b_type .eq. 105)) then
@@ -363,7 +363,7 @@ double precision function j1b_nucl_square(r)
       d = ( (r(1) - nucl_coord(i,1)) * (r(1) - nucl_coord(i,1)) &
           + (r(2) - nucl_coord(i,2)) * (r(2) - nucl_coord(i,2)) &
           + (r(3) - nucl_coord(i,3)) * (r(3) - nucl_coord(i,3)) )
-      j1b_nucl_square = j1b_nucl_square - dexp(-a*d)
+      j1b_nucl_square = j1b_nucl_square - j1b_pen_coef(i) * dexp(-a*d)
     enddo
     j1b_nucl_square = j1b_nucl_square * j1b_nucl_square
 
@@ -475,7 +475,7 @@ subroutine grad1_j1b_nucl(r, grad)
       y = r(2) - nucl_coord(i,2)
       z = r(3) - nucl_coord(i,3)
       d = x*x + y*y + z*z
-      e = a * dexp(-a*d)
+      e = a * j1b_pen_coef(i) * dexp(-a*d)
 
       fact_x += e * x
       fact_y += e * y
diff --git a/src/tc_keywords/EZFIO.cfg b/src/tc_keywords/EZFIO.cfg
index a69f5bac..ea1503c3 100644
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@@ -130,6 +130,12 @@ doc: exponents of the 1-body Jastrow
 interface: ezfio
 size: (nuclei.nucl_num)
 
+[j1b_pen_coef]
+type: double precision
+doc: coefficients of the 1-body Jastrow
+interface: ezfio
+size: (nuclei.nucl_num)
+
 [j1b_coeff]
 type: double precision
 doc: coeff of the 1-body Jastrow
diff --git a/src/tc_keywords/j1b_pen.irp.f b/src/tc_keywords/j1b_pen.irp.f
index 57250b52..3f1eb8ac 100644
--- a/src/tc_keywords/j1b_pen.irp.f
+++ b/src/tc_keywords/j1b_pen.irp.f
@@ -1,17 +1,22 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
+ BEGIN_PROVIDER [ double precision, j1b_pen     , (nucl_num) ]
+&BEGIN_PROVIDER [ double precision, j1b_pen_coef, (nucl_num) ]
 
   BEGIN_DOC
-  ! exponents of the 1-body Jastrow
+  ! parameters of the 1-body Jastrow
   END_DOC
 
   implicit none
   logical :: exists
+  integer :: i
+  integer :: ierr
 
   PROVIDE ezfio_filename
 
+  ! ---
+
   if (mpi_master) then
     call ezfio_has_tc_keywords_j1b_pen(exists)
   endif
@@ -23,7 +28,6 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
 
   IRP_IF MPI
     include 'mpif.h'
-    integer :: ierr
     call MPI_BCAST(j1b_pen, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
     if (ierr /= MPI_SUCCESS) then
       stop 'Unable to read j1b_pen with MPI'
@@ -31,7 +35,6 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
   IRP_ENDIF
 
   if (exists) then
-
     if (mpi_master) then
       write(6,'(A)') '.. >>>>> [ IO READ: j1b_pen ] <<<<< ..'
       call ezfio_get_tc_keywords_j1b_pen(j1b_pen)
@@ -42,19 +45,55 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
         endif
       IRP_ENDIF
     endif
-
   else
- 
-    integer :: i
     do i = 1, nucl_num
       j1b_pen(i) = 1d5
     enddo
-
   endif
- print*,'parameters for nuclei jastrow'
- do i = 1, nucl_num
-  print*,'i,Z,j1b_pen(i)',i,nucl_charge(i),j1b_pen(i)
- enddo
+
+  ! ---
+
+  if (mpi_master) then
+    call ezfio_has_tc_keywords_j1b_pen_coef(exists)
+  endif
+
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+
+  IRP_IF MPI
+    include 'mpif.h'
+    call MPI_BCAST(j1b_pen_coef, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      stop 'Unable to read j1b_pen_coef with MPI'
+    endif
+  IRP_ENDIF
+
+  if (exists) then
+    if (mpi_master) then
+      write(6,'(A)') '.. >>>>> [ IO READ: j1b_pen_coef ] <<<<< ..'
+      call ezfio_get_tc_keywords_j1b_pen_coef(j1b_pen_coef)
+      IRP_IF MPI
+        call MPI_BCAST(j1b_pen_coef, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          stop 'Unable to read j1b_pen_coef with MPI'
+        endif
+      IRP_ENDIF
+    endif
+  else
+    do i = 1, nucl_num
+      j1b_pen_coef(i) = 1d0
+    enddo
+  endif
+
+  ! ---
+
+  print *, ' parameters for nuclei jastrow'
+  print *, ' i, Z, j1b_pen, j1b_pen_coef'
+  do i = 1, nucl_num
+    print *, i, nucl_charge(i), j1b_pen(i), j1b_pen_coef(i)
+  enddo
 
 END_PROVIDER
 
@@ -114,3 +153,4 @@ BEGIN_PROVIDER [ double precision, j1b_coeff, (nucl_num) ]
 END_PROVIDER
 
 ! ---
+

From 8c2f6c9485d995f82b5fbfa37e58baa77cef8b6c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 19 Jun 2023 23:57:39 +0200
Subject: [PATCH 79/79] minor modif

---
 src/tc_keywords/j1b_pen.irp.f | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/tc_keywords/j1b_pen.irp.f b/src/tc_keywords/j1b_pen.irp.f
index 3f1eb8ac..ebcd5107 100644
--- a/src/tc_keywords/j1b_pen.irp.f
+++ b/src/tc_keywords/j1b_pen.irp.f
@@ -63,7 +63,6 @@
   IRP_ENDIF
 
   IRP_IF MPI
-    include 'mpif.h'
     call MPI_BCAST(j1b_pen_coef, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
     if (ierr /= MPI_SUCCESS) then
       stop 'Unable to read j1b_pen_coef with MPI'