From 86b48454127d011a0d361dd651a6ef6ef2b798d5 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 26 May 2023 08:10:18 +0200
Subject: [PATCH 01/18] IO TC normal ordering added

---
 src/tc_bi_ortho/normal_ordered.irp.f | 147 +++++++++++++++------------
 src/tc_keywords/EZFIO.cfg            |   6 ++
 2 files changed, 90 insertions(+), 63 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 8adc7a63..f9728d05 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -1,3 +1,6 @@
+
+! ---
+
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC 
@@ -15,79 +18,97 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   integer(bit_kind), allocatable :: key_i_core(:,:)
   double precision :: hthree_aba,hthree_aaa,hthree_aab
   double precision :: wall0,wall1
- 
-  PROVIDE N_int
 
-  allocate( occ(N_int*bit_kind_size,2) )
-  allocate( key_i_core(N_int,2) )
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
-  endif
-
-  normal_two_body_bi_orth = 0.d0
-  print*,'Providing normal_two_body_bi_orth ...'
+  print*,' Providing normal_two_body_bi_orth ...'
   call wall_time(wall0)
 
- !$OMP PARALLEL                                                                         &
- !$OMP DEFAULT (NONE)                                                                   &
- !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
- !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
- !$OMP DO SCHEDULE (static) 
-  do hh1 = 1, n_act_orb
-    h1 = list_act(hh1) 
-    do pp1 = 1, n_act_orb
-      p1 = list_act(pp1)
-      do hh2 = 1, n_act_orb
-        h2 = list_act(hh2) 
-        do pp2 = 1, n_act_orb
-          p2 = list_act(pp2)
-          ! all contributions from the 3-e terms to the double excitations 
-          ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
-    
+  if(read_tc_norm_ord) then
 
-          ! opposite spin double excitations : s1 /= s2
-          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
+      read(11) normal_two_body_bi_orth
+    close(11)
 
-          ! same spin double excitations : s1 == s2 
-          if(h1<h2.and.p1.gt.p2)then
-           ! with opposite spin contributions 
-           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-           ! same spin double excitations with same spin contributions 
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-           else
-             hthree_aaa = 0.d0
-           endif
-          else
-           ! with opposite spin contributions 
-           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-           if(Ne(2).ge.3)then
-            ! same spin double excitations with same spin contributions 
-             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-           else
-             hthree_aaa = 0.d0
-           endif
-          endif
-          normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+  else
+
+    PROVIDE N_int
+
+    allocate( occ(N_int*bit_kind_size,2) )
+    allocate( key_i_core(N_int,2) )
+
+    if(core_tc_op) then
+      do i = 1, N_int
+        key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+      enddo
+      call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+    else
+      call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    endif
+
+    normal_two_body_bi_orth = 0.d0
+
+    !$OMP PARALLEL                                                                         &
+    !$OMP DEFAULT (NONE)                                                                   &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
+    !$OMP DO SCHEDULE (static) 
+    do hh1 = 1, n_act_orb
+      h1 = list_act(hh1) 
+      do pp1 = 1, n_act_orb
+        p1 = list_act(pp1)
+        do hh2 = 1, n_act_orb
+          h2 = list_act(hh2) 
+          do pp2 = 1, n_act_orb
+            p2 = list_act(pp2)
+            ! all contributions from the 3-e terms to the double excitations 
+            ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
+      
+
+            ! opposite spin double excitations : s1 /= s2
+            call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
+
+            ! same spin double excitations : s1 == s2 
+            if(h1<h2.and.p1.gt.p2)then
+             ! with opposite spin contributions 
+             call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+             ! same spin double excitations with same spin contributions 
+             if(Ne(2).ge.3)then
+               call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+             else
+               hthree_aaa = 0.d0
+             endif
+            else
+             ! with opposite spin contributions 
+             call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+             if(Ne(2).ge.3)then
+              ! same spin double excitations with same spin contributions 
+               call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+             else
+               hthree_aaa = 0.d0
+             endif
+            endif
+            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+          enddo
         enddo
       enddo
     enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    deallocate( occ )
+    deallocate( key_i_core )
+  endif
+
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read')
+  endif
 
   call wall_time(wall1)
-  print*,'Wall time for normal_two_body_bi_orth ',wall1-wall0
-
-  deallocate( occ )
-  deallocate( key_i_core )
+  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
 
 END_PROVIDER 
 
diff --git a/src/tc_keywords/EZFIO.cfg b/src/tc_keywords/EZFIO.cfg
index 3a26a6eb..de638da9 100644
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@@ -202,6 +202,12 @@ doc: Read/Write integrals int2_grad1_u12_ao, tc_grad_square_ao and tc_grad_and_l
 interface: ezfio,provider,ocaml
 default: None
 
+[io_tc_norm_ord]
+type: Disk_access
+doc: Read/Write normal_two_body_bi_orth from/to disk [ Write | Read | None ]
+interface: ezfio,provider,ocaml
+default: None
+
 [debug_tc_pt2]
 type: integer
 doc: If :: 1 then you compute the TC-PT2 the old way, :: 2 then you check with the new version but without three-body

From b8bfab1d7cd8576c9597d92f70822d903628a6a6 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 27 May 2023 22:34:40 +0200
Subject: [PATCH 02/18] start working on NO

---
 src/tc_bi_ortho/normal_ordered.irp.f | 252 ++++++++++++++++-----------
 1 file changed, 150 insertions(+), 102 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 8adc7a63..c30cd1ef 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -1,3 +1,6 @@
+
+! ---
+
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC 
@@ -8,13 +11,16 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   implicit none
 
-  integer :: i,h1,p1,h2,p2
-  integer :: hh1,hh2,pp1,pp2
+  integer                        :: i, h1, p1, h2, p2
+  integer                        :: hh1, hh2, pp1, pp2
   integer                        :: Ne(2)
+  double precision               :: hthree_aba, hthree_aaa, hthree_aab
+  double precision               :: wall0, wall1
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision :: hthree_aba,hthree_aaa,hthree_aab
-  double precision :: wall0,wall1
+
+  print*,' Providing normal_two_body_bi_orth ...'
+  call wall_time(wall0)
  
   PROVIDE N_int
 
@@ -23,23 +29,21 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   if(core_tc_op) then
     do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
     enddo
-    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
   else
-    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
   endif
 
-  normal_two_body_bi_orth = 0.d0
-  print*,'Providing normal_two_body_bi_orth ...'
-  call wall_time(wall0)
+  normal_two_body_bi_orth(1:mo_num,1:mo_num,1:mo_num,1:mo_num) = 0.d0
 
- !$OMP PARALLEL                                                                         &
- !$OMP DEFAULT (NONE)                                                                   &
- !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
- !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
- !$OMP DO SCHEDULE (static) 
+  !$OMP PARALLEL                                                                         &
+  !$OMP DEFAULT (NONE)                                                                   &
+  !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+  !$OMP SHARED  (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
+  !$OMP DO SCHEDULE (static) 
   do hh1 = 1, n_act_orb
     h1 = list_act(hh1) 
     do pp1 = 1, n_act_orb
@@ -48,50 +52,57 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
         h2 = list_act(hh2) 
         do pp2 = 1, n_act_orb
           p2 = list_act(pp2)
+
           ! all contributions from the 3-e terms to the double excitations 
           ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
     
-
           ! opposite spin double excitations : s1 /= s2
           call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
 
           ! same spin double excitations : s1 == s2 
-          if(h1<h2.and.p1.gt.p2)then
-           ! with opposite spin contributions 
-           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-           ! same spin double excitations with same spin contributions 
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-           else
-             hthree_aaa = 0.d0
-           endif
-          else
-           ! with opposite spin contributions 
-           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-           if(Ne(2).ge.3)then
+          if((h1 .lt. h2) .and. (p1 .gt. p2)) then
+
+            ! with opposite spin contributions 
+            call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+
             ! same spin double excitations with same spin contributions 
-             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-           else
-             hthree_aaa = 0.d0
-           endif
+            if(Ne(2) .ge. 3) then
+              call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+            else
+              hthree_aaa = 0.d0
+            endif
+
+          else
+
+            ! with opposite spin contributions 
+            call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+
+            if(Ne(2) .ge. 3) then
+              ! same spin double excitations with same spin contributions 
+              call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+            else
+              hthree_aaa = 0.d0
+            endif
+
           endif
-          normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+
+          normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0 * (hthree_aba + hthree_aab + hthree_aaa)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print*,'Wall time for normal_two_body_bi_orth ',wall1-wall0
+  !$OMP END DO
+  !$OMP END PARALLEL
 
   deallocate( occ )
   deallocate( key_i_core )
 
+  call wall_time(wall1)
+  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
+
 END_PROVIDER 
 
-
+! ---
 
 subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
@@ -106,30 +117,41 @@ subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
   !!!! double alpha/beta
   hthree = 0.d0
+
   do ii = 1, Ne(2) ! purely closed shell part 
     i = occ(ii,2)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
     int_exc_13 = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2, i,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
     int_exc_12 = -1.d0 * integral
-    hthree += 2.d0 * int_direct - 1.d0 * ( int_exc_13 + int_exc_12)
+
+    hthree += 2.d0 * int_direct - 1.d0 * (int_exc_13 + int_exc_12)
   enddo
+
   do ii = Ne(2) + 1, Ne(1) ! purely open-shell part 
-   i = occ(ii,1)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
+    i = occ(ii,1)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
     int_exc_13 = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2, i,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
     int_exc_12 = -1.d0 * integral
-    hthree += 1.d0 * int_direct - 0.5d0* ( int_exc_13 + int_exc_12)
+
+    hthree += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
   enddo
 
-end subroutine give_aba_contraction
-
+  return
+end
 
+! ---
 
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num, mo_num, mo_num)]
 
@@ -152,29 +174,31 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num,
   allocate( key_i_core(N_int,2) )
   allocate( occ(N_int*bit_kind_size,2) )
 
-  if(core_tc_op)then
-   do i = 1, N_int
-    key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-    key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-   enddo
-   call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
   else
-   call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
   endif
+
   normal_two_body_bi_orth_ab = 0.d0
   do hh1 = 1, n_act_orb
-   h1 = list_act(hh1) 
-   do pp1 = 1, n_act_orb
-    p1 = list_act(pp1)
-    do hh2 = 1, n_act_orb
-     h2 = list_act(hh2) 
-     do pp2 = 1, n_act_orb
-      p2 = list_act(pp2)
-      call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
-      normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
-     enddo
+    h1 = list_act(hh1) 
+    do pp1 = 1, n_act_orb
+      p1 = list_act(pp1)
+      do hh2 = 1, n_act_orb
+        h2 = list_act(hh2) 
+        do pp2 = 1, n_act_orb
+          p2 = list_act(pp2)
+          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
+
+          normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
+        enddo
+      enddo
     enddo
-   enddo
   enddo
 
   deallocate( key_i_core )
@@ -182,7 +206,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num,
 
 END_PROVIDER 
 
-
+! ---
 
 BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_act_orb, n_act_orb, n_act_orb)]
 
@@ -250,13 +274,14 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_
 
 END_PROVIDER 
 
-
+! ---
 
 subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
   BEGIN_DOC
-! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
+  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
   END_DOC
+
   use bitmasks ! you need to include the bitmasks_module.f90 features
 
   implicit none
@@ -270,48 +295,64 @@ subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
   hthree = 0.d0
   do ii = 1, Ne(2) ! purely closed shell part 
     i = occ(ii,2)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,p1,i ,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
     int_exc_l = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,i ,p2,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
     int_exc_ll= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,i ,p1,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
     int_exc_12= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
     int_exc_13= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(i ,p1,p2,i,h2,h1,integral)
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
     int_exc_23= -1.d0 * integral
 
-    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll -( int_exc_12+ int_exc_13+ int_exc_23  )
+    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
   enddo
+
   do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
     i = occ(ii,1)
-    call give_integrals_3_body_bi_ort(i ,p2,p1,i,h2,h1,integral)
-    int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,p1,i ,i,h2,h1,integral)
-    int_exc_l = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,i ,p2,i,h2,h1,integral)
-    int_exc_ll= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p2,i ,p1,i,h2,h1,integral)
-    int_exc_12= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2, i,i,h2,h1,integral)
-    int_exc_13= -1.d0 * integral
-    call give_integrals_3_body_bi_ort(i ,p1,p2,i,h2,h1,integral)
-    int_exc_23= -1.d0 * integral
 
-    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll -( int_exc_12+ int_exc_13+ int_exc_23  ))
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
+    int_exc_l = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+    int_exc_ll = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+    int_exc_23 = -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
   enddo
 
-end subroutine give_aaa_contraction
-
+  return
+end
 
+! ---
 
 subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-  implicit none
+
   use bitmasks ! you need to include the bitmasks_module.f90 features
-  integer, intent(in)           :: Nint, h1, h2, p1, p2
-  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+
+  implicit none
+  integer,          intent(in)  :: Nint, h1, h2, p1, p2
+  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
   double precision, intent(out) :: hthree
   integer                       :: ii, i
   double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
@@ -320,11 +361,18 @@ subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
   hthree = 0.d0
   do ii = 1, Ne(2) ! purely closed shell part 
     i = occ(ii,2)
-    call give_integrals_3_body_bi_ort(p2,p1,i,h2,h1,i,integral)
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
     int_direct = -1.d0 * integral
-    call give_integrals_3_body_bi_ort(p1,p2,i,h2,h1,i,integral)
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
     int_exc_23= -1.d0 * integral
-    hthree  +=  1.d0 * int_direct - int_exc_23
+
+    hthree +=  1.d0 * int_direct - int_exc_23
   enddo
 
-end subroutine give_aab_contraction
+  return
+end
+
+! ---
+

From 5ab6a1d7fba6fbff88ac858747783bb9292b9a89 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 1 Jun 2023 19:59:25 +0200
Subject: [PATCH 03/18] few modifs

---
 src/bi_ort_ints/semi_num_ints_mo.irp.f       |  5 +++
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f |  2 +-
 src/non_h_ints_mu/grad_squared.irp.f         | 21 ++++++++++-
 src/non_h_ints_mu/new_grad_tc.irp.f          |  1 +
 src/non_h_ints_mu/tc_integ.irp.f             |  6 +++
 src/non_h_ints_mu/total_tc_int.irp.f         |  5 +++
 src/tc_scf/rh_tcscf_diis.irp.f               | 39 +++++++++++++++++++-
 src/tc_scf/tc_scf.irp.f                      |  4 ++
 8 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 0d727785..771d3274 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -138,10 +138,13 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
       enddo
     enddo
 
+    FREE int2_grad1_u12_ao
+
   endif
 
   call wall_time(wall1)
   print *, ' wall time for int2_grad1_u12_ao_transp ', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -200,6 +203,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
     enddo
   enddo
 
+  FREE int2_grad1_u12_bimo_transp
+
 END_PROVIDER 
 
 ! ---
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index e8b56307..5a3730b3 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -79,7 +79,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
   integer,          intent(in)  :: n, l, k, m, j, i
   double precision, intent(out) :: integral
   integer                       :: ipoint
-  double precision              :: weight
+  double precision              :: weight, tmp
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_t
diff --git a/src/non_h_ints_mu/grad_squared.irp.f b/src/non_h_ints_mu/grad_squared.irp.f
index 3f1a9bf5..44a6ae65 100644
--- a/src/non_h_ints_mu/grad_squared.irp.f
+++ b/src/non_h_ints_mu/grad_squared.irp.f
@@ -231,6 +231,7 @@ BEGIN_PROVIDER [ double precision, grad12_j12, (ao_num, ao_num, n_points_final_g
   call wall_time(time0)
 
   PROVIDE j1b_type
+  PROVIDE int2_grad1u2_grad2u2_j1b2
 
   do ipoint = 1, n_points_final_grid
     tmp1 = v_1b(ipoint)
@@ -242,6 +243,8 @@ BEGIN_PROVIDER [ double precision, grad12_j12, (ao_num, ao_num, n_points_final_g
     enddo
   enddo
 
+  FREE int2_grad1u2_grad2u2_j1b2
+
   !if(j1b_type .eq. 0) then 
   !  grad12_j12 = 0.d0
   !  do ipoint = 1, n_points_final_grid
@@ -262,6 +265,7 @@ BEGIN_PROVIDER [ double precision, grad12_j12, (ao_num, ao_num, n_points_final_g
 
   call wall_time(time1)
   print*, ' Wall time for grad12_j12 = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -278,6 +282,9 @@ BEGIN_PROVIDER [double precision, u12sq_j1bsq, (ao_num, ao_num, n_points_final_g
   print*, ' providing u12sq_j1bsq ...'
   call wall_time(time0)
 
+  ! do not free here
+  PROVIDE int2_u2_j1b2
+
   do ipoint = 1, n_points_final_grid
     tmp_x = v_1b_grad(1,ipoint)
     tmp_y = v_1b_grad(2,ipoint)
@@ -292,6 +299,7 @@ BEGIN_PROVIDER [double precision, u12sq_j1bsq, (ao_num, ao_num, n_points_final_g
 
   call wall_time(time1)
   print*, ' Wall time for u12sq_j1bsq = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -310,6 +318,9 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b, (ao_num, ao_num,
   print*, ' providing u12_grad1_u12_j1b_grad1_j1b ...'
   call wall_time(time0)
 
+  PROVIDE int2_u_grad1u_j1b2
+  PROVIDE int2_u_grad1u_x_j1b2
+
   do ipoint = 1, n_points_final_grid
 
     x     = final_grid_points(1,ipoint)
@@ -340,14 +351,17 @@ BEGIN_PROVIDER [ double precision, u12_grad1_u12_j1b_grad1_j1b, (ao_num, ao_num,
     enddo
   enddo
 
+  FREE int2_u_grad1u_j1b2
+  FREE int2_u_grad1u_x_j1b2
+
   call wall_time(time1)
   print*, ' Wall time for u12_grad1_u12_j1b_grad1_j1b = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
 ! ---
 
-
 BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]
 
   BEGIN_DOC
@@ -401,6 +415,8 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
               , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid &
               , 0.d0, tc_grad_square_ao, ao_num*ao_num)
 
+    FREE int2_grad1_u12_square_ao
+
     ! ---
 
     if(((j1b_type .eq. 3) .or. (j1b_type .eq. 4)) .and. use_ipp) then
@@ -442,6 +458,8 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
       call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0     &
                 , int2_u2_j1b2(1,1,1), ao_num*ao_num, b_mat(1,1,1), n_points_final_grid &
                 , 1.d0, tc_grad_square_ao, ao_num*ao_num)
+
+      FREE int2_u2_j1b2
     endif
 
     ! ---
@@ -478,6 +496,7 @@ BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao
 
   call wall_time(time1)
   print*, ' Wall time for tc_grad_square_ao = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER
 
diff --git a/src/non_h_ints_mu/new_grad_tc.irp.f b/src/non_h_ints_mu/new_grad_tc.irp.f
index 24e7e743..499ffe9d 100644
--- a/src/non_h_ints_mu/new_grad_tc.irp.f
+++ b/src/non_h_ints_mu/new_grad_tc.irp.f
@@ -284,6 +284,7 @@ BEGIN_PROVIDER [double precision, tc_grad_and_lapl_ao, (ao_num, ao_num, ao_num,
 
   call wall_time(time1)
   print*, ' Wall time for tc_grad_and_lapl_ao = ', time1 - time0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/non_h_ints_mu/tc_integ.irp.f b/src/non_h_ints_mu/tc_integ.irp.f
index d5995ae5..8251fc71 100644
--- a/src/non_h_ints_mu/tc_integ.irp.f
+++ b/src/non_h_ints_mu/tc_integ.irp.f
@@ -176,6 +176,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
 
   call wall_time(time1)
   print*, ' wall time for int2_grad1_u12_ao =', time1-time0 
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -242,6 +243,8 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
       !$OMP END DO
       !$OMP END PARALLEL
 
+      FREE u12sq_j1bsq grad12_j12
+
     else
 
       PROVIDE u12sq_j1bsq u12_grad1_u12_j1b_grad1_j1b grad12_j12
@@ -262,6 +265,8 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
       !$OMP END DO
       !$OMP END PARALLEL
 
+      FREE u12sq_j1bsq u12_grad1_u12_j1b_grad1_j1b grad12_j12
+
     endif
 
   elseif(j1b_type .ge. 100) then
@@ -324,6 +329,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
 
   call wall_time(time1)
   print*, ' wall time for int2_grad1_u12_square_ao =', time1-time0 
+  call print_memory_usage()
 
 END_PROVIDER
 
diff --git a/src/non_h_ints_mu/total_tc_int.irp.f b/src/non_h_ints_mu/total_tc_int.irp.f
index 450bbef0..2034872a 100644
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@@ -84,8 +84,13 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist, (ao_num, ao_num, ao_num, ao
     enddo
   endif
 
+  FREE tc_grad_square_ao
+  FREE tc_grad_and_lapl_ao
+  FREE ao_two_e_coul
+
   call wall_time(wall1)
   print *, ' wall time for ao_tc_int_chemist ', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/tc_scf/rh_tcscf_diis.irp.f b/src/tc_scf/rh_tcscf_diis.irp.f
index 20260a95..0504373c 100644
--- a/src/tc_scf/rh_tcscf_diis.irp.f
+++ b/src/tc_scf/rh_tcscf_diis.irp.f
@@ -11,6 +11,7 @@ subroutine rh_tcscf_diis()
 
   integer                       :: i, j, it
   integer                       :: dim_DIIS, index_dim_DIIS
+  logical                       :: converged
   double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, e_save, e_delta
   double precision              :: tc_grad, g_save, g_delta, g_delta_th
   double precision              :: level_shift_save, rate_th
@@ -92,8 +93,9 @@ subroutine rh_tcscf_diis()
 
   PROVIDE FQS_SQF_ao Fock_matrix_tc_ao_tot
 
+  converged = .false.
   !do while((tc_grad .gt. dsqrt(thresh_tcscf)) .and. (er_DIIS .gt. dsqrt(thresh_tcscf)))
-  do while(er_DIIS .gt. dsqrt(thresh_tcscf))
+  do while(.not. converged)
 
     call wall_time(t0)
 
@@ -218,21 +220,56 @@ subroutine rh_tcscf_diis()
     !g_delta_th  = dabs(tc_grad) ! g_delta)
     er_delta_th = dabs(er_DIIS) !er_delta)
 
+    converged = er_DIIS .lt. dsqrt(thresh_tcscf)
+
     call wall_time(t1)
     !write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
     !  it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
     write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
       it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
 
+
+!   Write data in JSON file
+
+    call lock_io
+    if (it == 1) then
+      write(json_unit, json_dict_uopen_fmt)
+    else
+      write(json_unit, json_dict_close_uopen_fmt)
+    endif
+    write(json_unit, json_int_fmt)  ' iteration      ', it
+    write(json_unit, json_real_fmt) ' SCF TC Energy  ', etc_tot
+    write(json_unit, json_real_fmt) ' E(1e)          ', etc_1e
+    write(json_unit, json_real_fmt) ' E(2e)          ', etc_2e
+    write(json_unit, json_real_fmt) ' E(3e)          ', etc_3e
+    write(json_unit, json_real_fmt) ' delta Energy   ', e_delta
+    write(json_unit, json_real_fmt) ' DIIS error     ', er_DIIS
+    write(json_unit, json_real_fmt) ' level_shift    ', level_shift_tcscf
+    write(json_unit, json_real_fmt) ' DIIS           ', dim_DIIS
+    write(json_unit, json_real_fmt) ' Wall time (min)', (t1-t0)/60.d0
+    call unlock_io
+
     if(er_delta .lt. 0.d0) then
       call ezfio_set_tc_scf_bitc_energy(etc_tot)
       call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
       call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
+      write(json_unit, json_true_fmt) 'saved'
+    else
+      write(json_unit, json_false_fmt) 'saved'
     endif
+    call lock_io
 
+    if (converged) then
+      write(json_unit, json_true_fmtx) 'converged'
+    else
+      write(json_unit, json_false_fmtx) 'converged'
+    endif
+    call unlock_io
     if(qp_stop()) exit
   enddo
 
+  write(json_unit, json_dict_close_fmtx)
+
   ! ---
 
   print *, ' TCSCF DIIS converged !'
diff --git a/src/tc_scf/tc_scf.irp.f b/src/tc_scf/tc_scf.irp.f
index 88ddd26c..04c4f92d 100644
--- a/src/tc_scf/tc_scf.irp.f
+++ b/src/tc_scf/tc_scf.irp.f
@@ -8,6 +8,8 @@ program tc_scf
 
   implicit none
 
+  write(json_unit,json_array_open_fmt) 'tc-scf'
+
   print *, ' starting ...'
 
   my_grid_becke  = .True.
@@ -57,6 +59,8 @@ program tc_scf
 
   endif
 
+  write(json_unit,json_array_close_fmtx)
+  call json_close
 
 end
 

From 6d01eb42ca24a4265710b20913d64c9fb3117298 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:16:39 +0200
Subject: [PATCH 04/18] print mem details

---
 src/bi_ort_ints/three_body_ijm.irp.f         | 7 +++++++
 src/bi_ort_ints/three_body_ijmk.irp.f        | 6 ++++++
 src/bi_ort_ints/three_body_ijmkl.irp.f       | 6 ++++++
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f | 1 +
 src/non_h_ints_mu/tc_integ.irp.f             | 2 ++
 src/non_h_ints_mu/total_tc_int.irp.f         | 4 +---
 6 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/bi_ort_ints/three_body_ijm.irp.f b/src/bi_ort_ints/three_body_ijm.irp.f
index 4d21cb93..b34638b8 100644
--- a/src/bi_ort_ints/three_body_ijm.irp.f
+++ b/src/bi_ort_ints/three_body_ijm.irp.f
@@ -49,6 +49,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_direct_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -102,6 +103,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_cycle_1_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_cycle_1_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -155,6 +157,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_cycle_2_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_cycle_2_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -208,6 +211,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch23_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch23_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -261,6 +265,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch13_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch13_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -306,6 +311,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch12_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch12_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -359,6 +365,7 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_exch12_bi_ort_new, (mo_num, mo_
 
   call wall_time(wall1)
   print *, ' wall time for three_e_3_idx_exch12_bi_ort_new', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 5afd49ab..95b57e37 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -43,6 +43,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_direct_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -90,6 +91,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_cycle_1_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -137,6 +139,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_cycle_2_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -184,6 +187,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER
 
@@ -230,6 +234,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_exch13_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -277,6 +282,7 @@ BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_4_idx_exch12_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index ae4c9bd5..507408e5 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -44,6 +44,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -93,6 +94,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -142,6 +144,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -191,6 +194,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -240,6 +244,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -289,6 +294,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index 5a3730b3..f82e8725 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -57,6 +57,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
 
   call wall_time(wall1)
   print *, ' wall time for three_body_ints_bi_ort', wall1 - wall0
+  call print_memory_usage()
 ! if(write_three_body_ints_bi_ort)then
 !  print*,'Writing three_body_ints_bi_ort on disk ...'
 !  call write_array_6_index_tensor(mo_num,three_body_ints_bi_ort,name_file)
diff --git a/src/non_h_ints_mu/tc_integ.irp.f b/src/non_h_ints_mu/tc_integ.irp.f
index 8251fc71..b2c0df31 100644
--- a/src/non_h_ints_mu/tc_integ.irp.f
+++ b/src/non_h_ints_mu/tc_integ.irp.f
@@ -100,6 +100,8 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
       !$OMP END DO
       !$OMP END PARALLEL
 
+      FREE v_ij_erf_rk_cst_mu_j1b v_ij_u_cst_mu_j1b x_v_ij_erf_rk_cst_mu_j1b
+
     elseif(j1b_type .ge. 100) then
 
       PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
diff --git a/src/non_h_ints_mu/total_tc_int.irp.f b/src/non_h_ints_mu/total_tc_int.irp.f
index 2034872a..afa10305 100644
--- a/src/non_h_ints_mu/total_tc_int.irp.f
+++ b/src/non_h_ints_mu/total_tc_int.irp.f
@@ -84,9 +84,7 @@ BEGIN_PROVIDER [double precision, ao_tc_int_chemist, (ao_num, ao_num, ao_num, ao
     enddo
   endif
 
-  FREE tc_grad_square_ao
-  FREE tc_grad_and_lapl_ao
-  FREE ao_two_e_coul
+  FREE tc_grad_square_ao tc_grad_and_lapl_ao ao_two_e_coul
 
   call wall_time(wall1)
   print *, ' wall time for ao_tc_int_chemist ', wall1 - wall0

From 107cc3f2fbfb13fdea4bad734c8bc5d11d9d8df8 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:19:25 +0200
Subject: [PATCH 05/18] fixed bug in TC-VAR

---
 src/tc_bi_ortho/tc_utils.irp.f | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/tc_bi_ortho/tc_utils.irp.f b/src/tc_bi_ortho/tc_utils.irp.f
index f8f648e8..737c393b 100644
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@@ -38,15 +38,16 @@ subroutine write_tc_var()
 
   implicit none
   integer          :: i, j, k
-  double precision :: hmono, htwoe, hthree, htot
+  double precision :: hmono, htwoe, hthree, htot_1j, htot_j1
   double precision :: SIGMA_TC
 
   do k = 1, n_states
 
     SIGMA_TC = 0.d0
     do j = 2, N_det
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
-      SIGMA_TC = SIGMA_TC + htot * htot
+      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
+      call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
+      SIGMA_TC = SIGMA_TC + htot_1j * htot_j1
     enddo
 
     print *, " state    : ", k

From 82b2d8bd98e9f3d543b74f766553d28166486094 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:48:23 +0200
Subject: [PATCH 06/18] avoid long name in cosgtos

---
 .../two_e_Coul_integrals_cosgtos.irp.f        | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
index 527a98d5..ea9ff009 100644
--- a/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
+++ b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
@@ -29,14 +29,14 @@ double precision function ao_two_e_integral_cosgtos(i, j, k, l)
   complex*16          :: integral5, integral6, integral7, integral8
   complex*16          :: integral_tot
 
-  double precision    :: ao_two_e_integral_cosgtos_schwartz_accel
+  double precision    :: ao_2e_cosgtos_schwartz_accel
   complex*16          :: ERI_cosgtos
   complex*16          :: general_primitive_integral_cosgtos
 
   if(ao_prim_num(i) * ao_prim_num(j) * ao_prim_num(k) * ao_prim_num(l) > 1024) then
 
     !print *, ' with shwartz acc '
-    ao_two_e_integral_cosgtos = ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
+    ao_two_e_integral_cosgtos = ao_2e_cosgtos_schwartz_accel(i, j, k, l)
 
   else
     !print *, ' without shwartz acc '
@@ -294,7 +294,7 @@ end function ao_two_e_integral_cosgtos
 
 ! ---
 
-double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
+double precision function ao_2e_cosgtos_schwartz_accel(i, j, k, l)
 
   BEGIN_DOC
   !  integral of the AO basis <ik|jl> or (ij|kl)
@@ -329,7 +329,7 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
   complex*16                    :: ERI_cosgtos
   complex*16                    :: general_primitive_integral_cosgtos
 
-  ao_two_e_integral_cosgtos_schwartz_accel = 0.d0
+  ao_2e_cosgtos_schwartz_accel = 0.d0
 
   dim1 = n_pt_max_integrals
 
@@ -519,8 +519,7 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
 
             integral_tot = integral1 + integral2 + integral3 + integral4 + integral5 + integral6 + integral7 + integral8
 
-            ao_two_e_integral_cosgtos_schwartz_accel = ao_two_e_integral_cosgtos_schwartz_accel &
-                                                     + coef4 * 2.d0 * real(integral_tot)
+            ao_2e_cosgtos_schwartz_accel = ao_2e_cosgtos_schwartz_accel + coef4 * 2.d0 * real(integral_tot)
           enddo ! s
         enddo  ! r
       enddo   ! q
@@ -698,8 +697,7 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
 
             integral_tot = integral1 + integral2 + integral3 + integral4 + integral5 + integral6 + integral7 + integral8
 
-            ao_two_e_integral_cosgtos_schwartz_accel = ao_two_e_integral_cosgtos_schwartz_accel &
-                                      + coef4 * 2.d0 * real(integral_tot)
+            ao_2e_cosgtos_schwartz_accel = ao_2e_cosgtos_schwartz_accel + coef4 * 2.d0 * real(integral_tot)
           enddo ! s
         enddo  ! r
       enddo   ! q
@@ -709,11 +707,11 @@ double precision function ao_two_e_integral_cosgtos_schwartz_accel(i, j, k, l)
 
   deallocate(schwartz_kl)
 
-end function ao_two_e_integral_cosgtos_schwartz_accel
+end function ao_2e_cosgtos_schwartz_accel
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, ao_two_e_integral_cosgtos_schwartz, (ao_num,ao_num)  ]
+BEGIN_PROVIDER [ double precision, ao_2e_cosgtos_schwartz, (ao_num,ao_num)]
 
   BEGIN_DOC
   !  Needed to compute Schwartz inequalities
@@ -723,16 +721,16 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_cosgtos_schwartz, (ao_num,a
   integer          :: i, k
   double precision :: ao_two_e_integral_cosgtos
 
-  ao_two_e_integral_cosgtos_schwartz(1,1) = ao_two_e_integral_cosgtos(1, 1, 1, 1)
+  ao_2e_cosgtos_schwartz(1,1) = ao_two_e_integral_cosgtos(1, 1, 1, 1)
 
- !$OMP PARALLEL DO PRIVATE(i,k)                                       &
- !$OMP             DEFAULT(NONE)                                      &
- !$OMP             SHARED(ao_num, ao_two_e_integral_cosgtos_schwartz) &
+ !$OMP PARALLEL DO PRIVATE(i,k)                           &
+ !$OMP             DEFAULT(NONE)                          &
+ !$OMP             SHARED(ao_num, ao_2e_cosgtos_schwartz) &
  !$OMP             SCHEDULE(dynamic)
   do i = 1, ao_num
     do k = 1, i
-      ao_two_e_integral_cosgtos_schwartz(i,k) = dsqrt(ao_two_e_integral_cosgtos(i, i, k, k))
-      ao_two_e_integral_cosgtos_schwartz(k,i) = ao_two_e_integral_cosgtos_schwartz(i,k)
+      ao_2e_cosgtos_schwartz(i,k) = dsqrt(ao_two_e_integral_cosgtos(i, i, k, k))
+      ao_2e_cosgtos_schwartz(k,i) = ao_2e_cosgtos_schwartz(i,k)
     enddo
   enddo
  !$OMP END PARALLEL DO

From cab3b12b9b397933ca438717846d28d3164d4804 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Fri, 2 Jun 2023 20:55:51 +0200
Subject: [PATCH 07/18] minor modif in names

---
 src/tc_bi_ortho/tc_utils.irp.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tc_bi_ortho/tc_utils.irp.f b/src/tc_bi_ortho/tc_utils.irp.f
index e0f29eb8..9023e2f0 100644
--- a/src/tc_bi_ortho/tc_utils.irp.f
+++ b/src/tc_bi_ortho/tc_utils.irp.f
@@ -45,8 +45,8 @@ subroutine write_tc_var()
 
     SIGMA_TC = 0.d0
     do j = 2, N_det
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
-      call htilde_mu_mat_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
       SIGMA_TC = SIGMA_TC + htot_1j * htot_j1
     enddo
 

From 072bea8041a5414da00bc8ddc001186c3c9ff269 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 3 Jun 2023 22:12:30 +0200
Subject: [PATCH 08/18] Improve 4idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f          | 186 +++++-
 src/bi_ort_ints/three_body_ijm.irp.f       |  14 +-
 src/bi_ort_ints/three_body_ijmk.irp.f      | 686 +++++++++++++--------
 src/bi_ort_ints/three_body_ijmk_old.irp.f  | 290 +++++++++
 src/bi_ort_ints/three_body_ijmkl.irp.f     | 299 +++++----
 src/tc_bi_ortho/slater_tc_opt.irp.f        |   3 +-
 src/tc_bi_ortho/slater_tc_opt_single.irp.f |   7 +-
 src/tc_bi_ortho/symmetrized_3_e_int.irp.f  |   6 +-
 8 files changed, 1071 insertions(+), 420 deletions(-)
 create mode 100644 src/bi_ort_ints/three_body_ijmk_old.irp.f

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 42bbe315..bb0424cd 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -8,8 +8,9 @@ program bi_ort_ints
   my_n_pt_a_grid = 14
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 ! call test_3e
- call test_5idx
-!  call test_5idx2
+! call test_5idx
+! call test_5idx2
+ call test_4idx
 end
 
 subroutine test_5idx2
@@ -145,3 +146,184 @@ subroutine test_5idx
 
 
 end
+
+! ---
+
+subroutine test_4idx()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  thr = 1d-5
+
+  PROVIDE three_e_4_idx_direct_bi_ort_old
+  PROVIDE three_e_4_idx_direct_bi_ort 
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_direct_bi_ort    (l,k,j,i)
+          ref = three_e_4_idx_direct_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_direct_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_direct_bi_ort = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  PROVIDE three_e_4_idx_exch13_bi_ort_old
+  PROVIDE three_e_4_idx_exch13_bi_ort 
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch13_bi_ort   (l,k,j,i)
+          ref = three_e_4_idx_exch13_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch13_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch13_bi_ort = ', accu / dble(mo_num)**4
+
+  ! ---
+
+!  PROVIDE three_e_4_idx_exch12_bi_ort_old
+!  PROVIDE three_e_4_idx_exch12_bi_ort 
+!
+!  accu = 0.d0
+!  do i = 1, mo_num
+!    do j = 1, mo_num
+!      do k = 1, mo_num
+!        do l = 1, mo_num
+!
+!          new = three_e_4_idx_exch12_bi_ort    (l,k,j,i)
+!          ref = three_e_4_idx_exch12_bi_ort_old(l,k,j,i)
+!          contrib = dabs(new - ref)
+!          accu += contrib
+!          if(contrib .gt. thr) then
+!            print*, ' problem in three_e_4_idx_exch12_bi_ort'
+!            print*, l, k, j, i
+!            print*, ref, new, contrib
+!            stop
+!          endif
+!
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  print*, ' accu on three_e_4_idx_exch12_bi_ort = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  PROVIDE three_e_4_idx_cycle_1_bi_ort_old
+  PROVIDE three_e_4_idx_cycle_1_bi_ort
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_cycle_1_bi_ort    (l,k,j,i)
+          ref = three_e_4_idx_cycle_1_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_cycle_1_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_cycle_1_bi_ort = ', accu / dble(mo_num)**4
+
+  ! ---
+
+!  PROVIDE three_e_4_idx_cycle_2_bi_ort_old
+!  PROVIDE three_e_4_idx_cycle_2_bi_ort
+!
+!  accu = 0.d0
+!  do i = 1, mo_num
+!    do j = 1, mo_num
+!      do k = 1, mo_num
+!        do l = 1, mo_num
+!
+!          new = three_e_4_idx_cycle_2_bi_ort    (l,k,j,i)
+!          ref = three_e_4_idx_cycle_2_bi_ort_old(l,k,j,i)
+!          contrib = dabs(new - ref)
+!          accu += contrib
+!          if(contrib .gt. thr) then
+!            print*, ' problem in three_e_4_idx_cycle_2_bi_ort'
+!            print*, l, k, j, i
+!            print*, ref, new, contrib
+!            stop
+!          endif
+!
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  print*, ' accu on three_e_4_idx_cycle_2_bi_ort = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  PROVIDE three_e_4_idx_exch23_bi_ort_old
+  PROVIDE three_e_4_idx_exch23_bi_ort
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = three_e_4_idx_exch23_bi_ort    (l,k,j,i)
+          ref = three_e_4_idx_exch23_bi_ort_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem in three_e_4_idx_exch23_bi_ort'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on three_e_4_idx_exch23_bi_ort = ', accu / dble(mo_num)**4
+
+  ! ---
+
+  return
+end
diff --git a/src/bi_ort_ints/three_body_ijm.irp.f b/src/bi_ort_ints/three_body_ijm.irp.f
index b34638b8..ae100fb5 100644
--- a/src/bi_ort_ints/three_body_ijm.irp.f
+++ b/src/bi_ort_ints/three_body_ijm.irp.f
@@ -23,11 +23,11 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
- !$OMP PARALLEL                 &
- !$OMP DEFAULT (NONE)           &
- !$OMP PRIVATE (i,j,m,integral) & 
- !$OMP SHARED (mo_num,three_e_3_idx_direct_bi_ort)
- !$OMP DO SCHEDULE (dynamic)
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (i,j,m,integral) & 
+  !$OMP SHARED (mo_num,three_e_3_idx_direct_bi_ort)
+  !$OMP DO SCHEDULE (dynamic)
   do i = 1, mo_num
     do j = 1, mo_num
       do m = j, mo_num
@@ -36,8 +36,8 @@ BEGIN_PROVIDER [ double precision, three_e_3_idx_direct_bi_ort, (mo_num, mo_num,
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END DO
+  !$OMP END PARALLEL
 
   do i = 1, mo_num
     do j = 1, mo_num
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 95b57e37..39a31751 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -1,287 +1,467 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+!&BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
   ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
   !
-  ! three_e_4_idx_direct_bi_ort(m,j,k,i) = <mjk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_direct_bi_ort (m,j,k,i) = < m j k | -L | m j i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch13_bi_ort (m,j,k,i) = < m j k | -L | i j m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_exch12_bi_ort (m,j,k,i) = < m j k | -L | m i j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                       = three_e_4_idx_exch13_bi_ort (j,m,k,i) 
+  ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = < m j k | -L | j i m > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = < m j k | -L | i m j > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !                                       = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
   !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
-  END_DOC
-
- implicit none
- integer          :: i, j, k, m
- double precision :: integral, wall1, wall0
-
-  three_e_4_idx_direct_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_direct_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_direct_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, m, j, i, integral)
-          three_e_4_idx_direct_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_direct_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = <mjk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  ! three_e_4_idx_direct_bi_ort (m,j,k,i) : Lk Ri Imm Ijj + Lj Rj Imm Iki + Lm Rm Ijj Iki 
+  ! three_e_4_idx_exch13_bi_ort (m,j,k,i) : Lk Rm Imi Ijj + Lj Rj Imi Ikm + Lm Ri Ijj Ikm 
+  ! three_e_4_idx_cycle_1_bi_ort(m,j,k,i) : Lk Rm Imj Iji + Lj Ri Imj Ikm + Lm Rj Iji Ikm 
   !
   END_DOC
 
   implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
+  integer                       :: ipoint, i, j, k, l, m
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:), tmp4(:,:,:,:)
+  double precision, allocatable :: tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp5(:,:,:)
+  double precision, allocatable :: tmp7(:,:)
+  double precision, allocatable :: tmp_3d(:,:,:)
 
-  three_e_4_idx_cycle_1_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_cycle_1_bi_ort ...'
+  print *, ' Providing the three_e_4_idx_bi_ort ...'
   call wall_time(wall0)
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_cycle_1_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp4(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1, tmp2, tmp3, tmp4)
+  !$OMP DO COLLAPSE(2)
   do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, j, i, m, integral)
-          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
 
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_cycle_1_bi_ort', wall1 - wall0
-  call print_memory_usage()
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * final_weight_at_r_vector(ipoint)
 
-END_PROVIDER 
+        tmp2(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
+        tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
 
-! --
+        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
 
-BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = <mjk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_cycle_2_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_cycle_2_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_cycle_2_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, i, m, j, integral)
-          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_cycle_2_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch23_bi_ort(m,j,k,i) = <mjk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_exch23_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_exch23_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_exch23_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, j, m, i, integral)
-          three_e_4_idx_exch23_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch13_bi_ort(m,j,k,i) = <mjk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_exch13_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_exch13_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_exch13_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, i, j, m, integral)
-          three_e_4_idx_exch13_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch13_bi_ort', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  ! 
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_4_idx_exch12_bi_ort(m,j,k,i) = <mjk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  ! 
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m
-  double precision :: integral, wall1, wall0
-
-  three_e_4_idx_exch12_bi_ort = 0.d0
-  print *, ' Providing the three_e_4_idx_exch12_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_4_idx_exch12_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          call give_integrals_3_body_bi_ort(m, j, k, m, i, j, integral)
-          three_e_4_idx_exch12_bi_ort(m,j,k,i) = -1.d0 * integral 
-        enddo
+        tmp4(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp4(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp4(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
       enddo
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp4, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort(m,j,k,i) = -tmp_4d(m,i,j,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp3, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp2)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+            , tmp3, 3*n_points_final_grid, tmp4, 3*n_points_final_grid            &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp3)
+  deallocate(tmp4)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = -tmp_4d(m,k,j,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+
+
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
+            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_direct_bi_ort(m,j,k,i) = three_e_4_idx_direct_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp7(n_points_final_grid,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, ipoint)                                       &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector,  &
+  !$OMP         tmp7)
+  !$OMP DO
+  do i = 1, mo_num
+    do ipoint = 1, n_points_final_grid
+      tmp7(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+
+  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+
+  do m = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (i, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         int2_grad1_u12_bimo_t,                           &
+    !$OMP         tmp5)
+    !$OMP DO COLLAPSE(2)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp5(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
+                           + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0 &
+              , tmp7, n_points_final_grid, tmp5, n_points_final_grid       &
+              , 0.d0, tmp_3d, mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_exch13_bi_ort(m,j,k,i) = three_e_4_idx_exch13_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (j, k, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
+    !$OMP         mos_l_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp5)
+    !$OMP DO COLLAPSE(2)
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp5(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
+              , tmp5, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
+              , 0.d0, tmp_3d, mo_num*mo_num)
+
+    !$OMP PARALLEL DO PRIVATE(i,j,k)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          three_e_4_idx_cycle_1_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(m,j,k,i) - tmp_3d(j,k,i)
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+  enddo
+
+  deallocate(tmp7)
+  deallocate(tmp_3d)
+
+
+
+  do i = 1, mo_num
+
+    !$OMP PARALLEL                                                 &
+    !$OMP DEFAULT (NONE)                                           &
+    !$OMP PRIVATE (m, j, ipoint)                                   &
+    !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
+    !$OMP         mos_r_in_r_array_transp,                         &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         tmp5)
+    !$OMP DO COLLAPSE(2)
+    do j = 1, mo_num
+      do m = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+
+          tmp5(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
+                           * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
+              , tmp5, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
+              , 1.d0, three_e_4_idx_cycle_1_bi_ort(1,1,1,i), mo_num*mo_num)
+
+  enddo
+
+  deallocate(tmp5)
+
+
+!  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+!  do i = 1, mo_num
+!    do k = 1, mo_num
+!      do j = 1, mo_num
+!        do m = 1, mo_num
+!          three_e_4_idx_exch12_bi_ort (m,j,k,i) = three_e_4_idx_exch13_bi_ort (j,m,k,i)
+!          three_e_4_idx_cycle_2_bi_ort(m,j,k,i) = three_e_4_idx_cycle_1_bi_ort(j,m,k,i)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END PARALLEL DO
+
+
   call wall_time(wall1)
-  print *, ' wall time for three_e_4_idx_exch12_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_4_idx_bi_ort', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) = < m j k | -L | j m i > ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_4_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  ! three_e_4_idx_exch23_bi_ort (m,j,k,i) : Lk Ri Imj Ijm + Lj Rm Imj Iki + Lm Rj Ijm Iki
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: i, j, k, l, m, ipoint
+  double precision              :: wall1, wall0
+  double precision, allocatable :: tmp1(:,:,:,:), tmp_4d(:,:,:,:)
+  double precision, allocatable :: tmp5(:,:,:), tmp6(:,:,:)
+
+  print *, ' Providing the three_e_4_idx_exch23_bi_ort ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+
+  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp6(n_points_final_grid,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp5, tmp6)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+
+        tmp5(ipoint,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * int2_grad1_u12_bimo_t(ipoint,1,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,2,l,i) * int2_grad1_u12_bimo_t(ipoint,2,i,l) &
+                         + int2_grad1_u12_bimo_t(ipoint,3,l,i) * int2_grad1_u12_bimo_t(ipoint,3,i,l) 
+
+        tmp6(ipoint,l,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, -1.d0 &
+            , tmp5, n_points_final_grid, tmp6, n_points_final_grid               &
+            , 0.d0, three_e_4_idx_exch23_bi_ort, mo_num*mo_num)
+
+  deallocate(tmp5)
+  deallocate(tmp6)
+
+
+  allocate(tmp_4d(mo_num,mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,2,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp1(ipoint,3,l,i) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,l)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0       &
+            , tmp1, 3*n_points_final_grid, int2_grad1_u12_bimo_t, 3*n_points_final_grid &
+            , 0.d0, tmp_4d, mo_num*mo_num)
+
+  deallocate(tmp1)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_4_idx_exch23_bi_ort(m,j,k,i) = three_e_4_idx_exch23_bi_ort(m,j,k,i) - tmp_4d(m,j,k,i) - tmp_4d(j,m,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(tmp_4d)
+
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch23_bi_ort', wall1 - wall0
   call print_memory_usage()
 
 END_PROVIDER 
diff --git a/src/bi_ort_ints/three_body_ijmk_old.irp.f b/src/bi_ort_ints/three_body_ijmk_old.irp.f
new file mode 100644
index 00000000..1a67f35b
--- /dev/null
+++ b/src/bi_ort_ints/three_body_ijmk_old.irp.f
@@ -0,0 +1,290 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_direct_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_direct_bi_ort_old(m,j,k,i) = <mjk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+ implicit none
+ integer          :: i, j, k, m
+ double precision :: integral, wall1, wall0
+
+  three_e_4_idx_direct_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_direct_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_direct_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, m, j, i, integral)
+          three_e_4_idx_direct_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_direct_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_1_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_cycle_1_bi_ort_old(m,j,k,i) = <mjk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_cycle_1_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_cycle_1_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_cycle_1_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, j, i, m, integral)
+          three_e_4_idx_cycle_1_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_cycle_1_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! --
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_cycle_2_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_cycle_2_bi_ort_old(m,j,k,i) = <mjk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_cycle_2_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_cycle_2_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_cycle_2_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, i, m, j, integral)
+          three_e_4_idx_cycle_2_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_cycle_2_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch23_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch23_bi_ort_old(m,j,k,i) = <mjk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_exch23_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_exch23_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_exch23_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, j, m, i, integral)
+          three_e_4_idx_exch23_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch23_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch13_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch13_bi_ort_old(m,j,k,i) = <mjk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_exch13_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_exch13_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_exch13_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, i, j, m, integral)
+          three_e_4_idx_exch13_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch13_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_4_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  ! 
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_4_idx_exch12_bi_ort_old(m,j,k,i) = <mjk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  ! 
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m
+  double precision :: integral, wall1, wall0
+
+  three_e_4_idx_exch12_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_4_idx_exch12_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_4_idx_exch12_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          call give_integrals_3_body_bi_ort(m, j, k, m, i, j, integral)
+          three_e_4_idx_exch12_bi_ort_old(m,j,k,i) = -1.d0 * integral 
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_4_idx_exch12_bi_ort_old', wall1 - wall0
+  call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index d67e1434..3e4412a3 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -19,17 +19,17 @@ end
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
-
   END_DOC
 
   implicit none
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
+  integer                       :: i, j, k, m, l
+  integer                       :: ipoint
+  double precision              :: wall1, wall0
   double precision, allocatable :: grad_mli(:,:,:), orb_mat(:,:,:)
   double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
   double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:)
+
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
@@ -38,201 +38,196 @@ end
   print *, ' Providing the three_e_5_idx_bi_ort ...'
   call wall_time(wall0)
 
- do m = 1, mo_num
+  do m = 1, mo_num
 
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         grad_mli, orb_mat)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
+    allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+    allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 
-         grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
-               int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
-               int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
-               int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (i,l,ipoint)                                      &
+    !$OMP SHARED (m,mo_num,n_points_final_grid,                     &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         grad_mli, orb_mat)
+    !$OMP DO COLLAPSE(2)
+    do i = 1, mo_num
+      do l = 1, mo_num
+         do ipoint = 1, n_points_final_grid
 
-         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+           grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
+                 int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
+                 int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
+                 int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
 
-       enddo
+           orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+
+         enddo
+      enddo
     enddo
-  enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
 
-  !$OMP END DO
-  !$OMP END PARALLEL
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
+        orb_mat, n_points_final_grid,  &
+        grad_mli, n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
 
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
-      orb_mat, n_points_final_grid,  &
-      grad_mli, n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
+    !$OMP END PARALLEL DO
 
-  deallocate(orb_mat,grad_mli)
+    deallocate(orb_mat,grad_mli)
 
+    allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+    allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+    allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
 
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (i,l,ipoint)                                      &
+    !$OMP SHARED (m,mo_num,n_points_final_grid,                     &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+    !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
+    !$OMP DO COLLAPSE(2)
+    do i=1,mo_num
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
 
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
+          lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+          lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+          lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
 
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+          rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+          rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+          rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
 
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
-
-       enddo
+          rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+          rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+          rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+        enddo
+      enddo
     enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lm_grad_ik, 3*n_points_final_grid,        &
+        rm_grad_ik, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
 
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP END PARALLEL DO
+  
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lm_grad_ik, 3*n_points_final_grid,        &
+        rk_grad_im, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
+  
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,j,k)
             three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = - tmp_mat(k,j,i,l)
             three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = - tmp_mat(k,i,j,l)
             three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = - tmp_mat(l,j,i,k)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-  deallocate(lm_grad_ik)
-
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         lk_grad_mi)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
-
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-       enddo
+    !$OMP END PARALLEL DO
+  
+    deallocate(lm_grad_ik)
+  
+    allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+  
+   !$OMP PARALLEL                                                  &
+   !$OMP DEFAULT (NONE)                                            &
+   !$OMP PRIVATE (i,l,ipoint)                                      &
+   !$OMP SHARED (m,mo_num,n_points_final_grid,                     &
+   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+   !$OMP         lk_grad_mi)
+   !$OMP DO COLLAPSE(2)
+    do i=1,mo_num
+      do l=1,mo_num
+        do ipoint=1, n_points_final_grid
+  
+          lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+          lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+          lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+  
+        enddo
+      enddo
     enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lk_grad_mi, 3*n_points_final_grid,  &
-      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP END DO
+    !$OMP END PARALLEL
+  
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lk_grad_mi, 3*n_points_final_grid,  &
+        rm_grad_ik, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
+  
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l,i)
             three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
             three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k,i)
             three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(k,i,l,j)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lk_grad_mi, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
+    !$OMP END PARALLEL DO
+  
+    call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+        lk_grad_mi, 3*n_points_final_grid,  &
+        rk_grad_im, 3*n_points_final_grid,  0.d0, &
+        tmp_mat, mo_num*mo_num)
+  
+    !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+    do i = 1, mo_num
+      do k = 1, mo_num
+        do j = 1, mo_num
+          do l = 1, mo_num
             three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,j,i,k)
             three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,i,j,l)
             three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,j,i,l)
             three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,i,j,k)
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-  deallocate(lk_grad_mi)
-  deallocate(rm_grad_ik)
-  deallocate(rk_grad_im)
+    !$OMP END PARALLEL DO
+  
+    deallocate(lk_grad_mi)
+    deallocate(rm_grad_ik)
+    deallocate(rk_grad_im)
   enddo
 
+  deallocate(tmp_mat)
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_bi_ort', wall1 - wall0
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index 882470ed..a2077f0f 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -9,8 +9,7 @@ subroutine provide_all_three_ints_bi_ortho
    PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
   endif
   if(three_e_4_idx_term)then
-   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_cycle_2_bi_ort
-   PROVIDE three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort three_e_4_idx_exch12_bi_ort
+   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
   endif
   if(.not.double_normal_ord.and.three_e_5_idx_term)then
    PROVIDE three_e_5_idx_direct_bi_ort 
diff --git a/src/tc_bi_ortho/slater_tc_opt_single.irp.f b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
index 7178d6d9..9719a6e7 100644
--- a/src/tc_bi_ortho/slater_tc_opt_single.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt_single.irp.f
@@ -243,7 +243,9 @@ subroutine fock_ac_tc_operator(iorb,ispin,key, h_fock,p_fock, ispin_fock,hthree,
    do j = 1, nb
     jj = occ(j,other_spin) 
     direct_int = three_e_4_idx_direct_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
-    exchange_int = three_e_4_idx_exch12_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
+    ! TODO
+    ! use transpose
+    exchange_int = three_e_4_idx_exch13_bi_ort(iorb,jj,p_fock,h_fock) ! USES 4-IDX TENSOR 
     hthree += direct_int - exchange_int
    enddo
   else !! ispin NE to ispin_fock
@@ -322,7 +324,8 @@ subroutine fock_a_tc_operator(iorb,ispin,key, h_fock,p_fock, ispin_fock,hthree,N
    do j = 1, nb
     jj = occ(j,other_spin) 
     direct_int = three_e_4_idx_direct_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
-    exchange_int = three_e_4_idx_exch12_bi_ort(jj,iorb,p_fock,h_fock) ! USES 4-IDX TENSOR 
+    ! TODO use transpose 
+    exchange_int = three_e_4_idx_exch13_bi_ort(iorb,jj,p_fock,h_fock) ! USES 4-IDX TENSOR 
     hthree -= direct_int - exchange_int
    enddo
   else !! ispin NE to ispin_fock
diff --git a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
index e725d8e5..3180d946 100644
--- a/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
+++ b/src/tc_bi_ortho/symmetrized_3_e_int.irp.f
@@ -96,9 +96,11 @@ double precision function three_e_single_parrallel_spin(m,j,k,i)
  implicit none
  integer, intent(in) :: i,k,j,m
   three_e_single_parrallel_spin = three_e_4_idx_direct_bi_ort(m,j,k,i)  ! direct
-  three_e_single_parrallel_spin += three_e_4_idx_cycle_1_bi_ort(m,j,k,i) + three_e_4_idx_cycle_2_bi_ort(m,j,k,i) & ! two cyclic permutations 
+  three_e_single_parrallel_spin += three_e_4_idx_cycle_1_bi_ort(m,j,k,i) + three_e_4_idx_cycle_1_bi_ort(j,m,k,i) & ! two cyclic permutations 
   - three_e_4_idx_exch23_bi_ort(m,j,k,i) - three_e_4_idx_exch13_bi_ort(m,j,k,i)  & ! two first exchange 
-  - three_e_4_idx_exch12_bi_ort(m,j,k,i) ! last exchange 
+  - three_e_4_idx_exch13_bi_ort(j,m,k,i) ! last exchange 
+  ! TODO
+  ! use transpose
 end
 
 double precision function three_e_double_parrallel_spin(m,l,j,k,i)

From a791a28523b787618d571947452fffbc4e7340c6 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Jun 2023 09:19:34 +0200
Subject: [PATCH 09/18] working on memory footprint

---
 .../grid_becke_vector.irp.f                   | 18 +++-
 src/bi_ort_ints/semi_num_ints_mo.irp.f        | 23 ++++--
 src/bi_ort_ints/three_body_ijmk.irp.f         | 82 +++++++++++--------
 src/bi_ortho_mos/bi_ort_mos_in_r.irp.f        |  6 +-
 src/fci_tc_bi/fci_tc_bi_ortho.irp.f           |  8 +-
 src/tc_bi_ortho/normal_ordered.irp.f          |  6 --
 src/tc_bi_ortho/slater_tc_opt.irp.f           | 50 +++++++----
 7 files changed, 121 insertions(+), 72 deletions(-)

diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index fd185641..8982fe83 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -62,20 +62,30 @@ END_PROVIDER
     enddo
   enddo
 
+  FREE grid_points_per_atom
+  FREE final_weight_at_r
+
 END_PROVIDER
 
 ! ---
 
 BEGIN_PROVIDER [double precision, final_grid_points_transp, (n_points_final_grid,3)]
-  implicit none
+
   BEGIN_DOC
-! Transposed final_grid_points
+  ! Transposed final_grid_points
   END_DOC
 
+  implicit none
   integer :: i,j
-  do j=1,3
-    do i=1,n_points_final_grid
+
+  do j = 1, 3
+    do i = 1, n_points_final_grid
       final_grid_points_transp(i,j) = final_grid_points(j,i)
     enddo
   enddo
+
 END_PROVIDER
+
+! ---
+
+
diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 771d3274..6354b393 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -124,6 +124,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
       enddo
     enddo
 
+    FREE int2_grad1_u12_ao_test
+
   else
 
     PROVIDE int2_grad1_u12_ao
@@ -153,14 +155,14 @@ END_PROVIDER
 BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)]
 
   implicit none
-  integer :: ipoint
+  integer          :: ipoint
   double precision :: wall0, wall1
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_ao_transp
 
-  !print *, ' providing int2_grad1_u12_bimo_transp'
-  !call wall_time(wall0)
+  print *, ' providing int2_grad1_u12_bimo_transp'
+  call wall_time(wall0)
 
   !$OMP PARALLEL         &
   !$OMP DEFAULT (NONE)   &
@@ -178,8 +180,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   !$OMP END DO
   !$OMP END PARALLEL
 
-  !call wall_time(wall1)
-  !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  call wall_time(wall1)
+  print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  call print_memory_usage()
 
 END_PROVIDER 
 
@@ -188,7 +191,11 @@ END_PROVIDER
 BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)]
 
   implicit none
-  integer :: i, j, ipoint
+  integer          :: i, j, ipoint
+  double precision :: wall0, wall1
+
+  call wall_time(wall0)
+  print *, ' Providing int2_grad1_u12_bimo_t ...'
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_transp
@@ -205,6 +212,10 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
 
   FREE int2_grad1_u12_bimo_transp
 
+  call wall_time(wall1)
+  print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
+  call print_memory_usage()
+
 END_PROVIDER 
 
 ! ---
diff --git a/src/bi_ort_ints/three_body_ijmk.irp.f b/src/bi_ort_ints/three_body_ijmk.irp.f
index 39a31751..ee7e88ef 100644
--- a/src/bi_ort_ints/three_body_ijmk.irp.f
+++ b/src/bi_ort_ints/three_body_ijmk.irp.f
@@ -30,10 +30,10 @@
   implicit none
   integer                       :: ipoint, i, j, k, l, m
   double precision              :: wall1, wall0
-  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:), tmp4(:,:,:,:)
+  double precision, allocatable :: tmp1(:,:,:,:), tmp2(:,:,:,:), tmp3(:,:,:,:)
   double precision, allocatable :: tmp_4d(:,:,:,:)
-  double precision, allocatable :: tmp5(:,:,:)
-  double precision, allocatable :: tmp7(:,:)
+  double precision, allocatable :: tmp4(:,:,:)
+  double precision, allocatable :: tmp5(:,:)
   double precision, allocatable :: tmp_3d(:,:,:)
 
   print *, ' Providing the three_e_4_idx_bi_ort ...'
@@ -47,7 +47,6 @@
   allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp2(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp3(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp4(n_points_final_grid,3,mo_num,mo_num))
 
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
@@ -55,7 +54,7 @@
   !$OMP SHARED (mo_num, n_points_final_grid,                      &
   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
   !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp1, tmp2, tmp3, tmp4)
+  !$OMP         tmp1, tmp2, tmp3)
   !$OMP DO COLLAPSE(2)
   do i = 1, mo_num
     do l = 1, mo_num
@@ -69,13 +68,9 @@
         tmp2(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,l) * mos_r_in_r_array_transp(ipoint,i)
         tmp2(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,l) * mos_r_in_r_array_transp(ipoint,i)
 
-        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
-
-        tmp4(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp4(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
-        tmp4(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,l,i) * mos_r_in_r_array_transp(ipoint,l)
+        tmp3(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,l,i) * mos_r_in_r_array_transp(ipoint,l)
       enddo
     enddo
   enddo
@@ -99,7 +94,7 @@
   !$OMP END PARALLEL DO
 
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp4, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
+            , tmp3, 3*n_points_final_grid, tmp1, 3*n_points_final_grid            &
             , 0.d0, tmp_4d, mo_num*mo_num)
 
   deallocate(tmp1)
@@ -116,8 +111,30 @@
   enddo
   !$OMP END PARALLEL DO
 
+
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (i, l, ipoint)                                    &
+  !$OMP SHARED (mo_num, n_points_final_grid,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp1)
+  !$OMP DO COLLAPSE(2)
+  do i = 1, mo_num
+    do l = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp1(ipoint,1,l,i) = int2_grad1_u12_bimo_t(ipoint,1,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,2,l,i) = int2_grad1_u12_bimo_t(ipoint,2,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+        tmp1(ipoint,3,l,i) = int2_grad1_u12_bimo_t(ipoint,3,i,l) * mos_l_in_r_array_transp(ipoint,l) * final_weight_at_r_vector(ipoint)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp3, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
+            , tmp1, 3*n_points_final_grid, tmp2, 3*n_points_final_grid            &
             , 0.d0, tmp_4d, mo_num*mo_num)
 
   deallocate(tmp2)
@@ -135,11 +152,10 @@
   !$OMP END PARALLEL DO
 
   call dgemm( 'T', 'N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-            , tmp3, 3*n_points_final_grid, tmp4, 3*n_points_final_grid            &
+            , tmp1, 3*n_points_final_grid, tmp3, 3*n_points_final_grid            &
             , 0.d0, tmp_4d, mo_num*mo_num)
 
   deallocate(tmp3)
-  deallocate(tmp4)
 
   !$OMP PARALLEL DO PRIVATE(i,j,k,m)
   do i = 1, mo_num
@@ -155,8 +171,6 @@
 
 
 
-  allocate(tmp1(n_points_final_grid,3,mo_num,mo_num))
-
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
   !$OMP PRIVATE (i, l, ipoint)                                    &
@@ -199,26 +213,26 @@
 
 
   allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp7(n_points_final_grid,mo_num))
+  allocate(tmp5(n_points_final_grid,mo_num))
 
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
   !$OMP PRIVATE (i, ipoint)                                       &
   !$OMP SHARED (mo_num, n_points_final_grid,                      &
   !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector,  &
-  !$OMP         tmp7)
+  !$OMP         final_weight_at_r_vector,                         &
+  !$OMP         tmp5)
   !$OMP DO
   do i = 1, mo_num
     do ipoint = 1, n_points_final_grid
-      tmp7(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+      tmp5(ipoint,i) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
 
-  allocate(tmp5(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp4(n_points_final_grid,mo_num,mo_num))
 
   do m = 1, mo_num
 
@@ -227,13 +241,13 @@
     !$OMP PRIVATE (i, k, ipoint)                                   &
     !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
     !$OMP         int2_grad1_u12_bimo_t,                           &
-    !$OMP         tmp5)
+    !$OMP         tmp4)
     !$OMP DO COLLAPSE(2)
     do i = 1, mo_num
       do k = 1, mo_num
         do ipoint = 1, n_points_final_grid
 
-          tmp5(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
+          tmp4(ipoint,k,i) = int2_grad1_u12_bimo_t(ipoint,1,k,m) * int2_grad1_u12_bimo_t(ipoint,1,m,i) &
                            + int2_grad1_u12_bimo_t(ipoint,2,k,m) * int2_grad1_u12_bimo_t(ipoint,2,m,i) &
                            + int2_grad1_u12_bimo_t(ipoint,3,k,m) * int2_grad1_u12_bimo_t(ipoint,3,m,i)
         enddo
@@ -243,7 +257,7 @@
     !$OMP END PARALLEL
 
     call dgemm( 'T', 'N', mo_num, mo_num*mo_num, n_points_final_grid, 1.d0 &
-              , tmp7, n_points_final_grid, tmp5, n_points_final_grid       &
+              , tmp5, n_points_final_grid, tmp4, n_points_final_grid       &
               , 0.d0, tmp_3d, mo_num)
 
     !$OMP PARALLEL DO PRIVATE(i,j,k)
@@ -264,13 +278,13 @@
     !$OMP SHARED (mo_num, n_points_final_grid, m,                  &
     !$OMP         mos_l_in_r_array_transp,                         &
     !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp5)
+    !$OMP         tmp4)
     !$OMP DO COLLAPSE(2)
     do k = 1, mo_num
       do j = 1, mo_num
         do ipoint = 1, n_points_final_grid
 
-          tmp5(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
+          tmp4(ipoint,j,k) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)        &
                            * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,k,m) &
                              + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,k,m) &
                              + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,k,m) )
@@ -281,7 +295,7 @@
     !$OMP END PARALLEL
 
     call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, 1.d0              &
-              , tmp5, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
+              , tmp4, n_points_final_grid, mos_r_in_r_array_transp, n_points_final_grid &
               , 0.d0, tmp_3d, mo_num*mo_num)
 
     !$OMP PARALLEL DO PRIVATE(i,j,k)
@@ -296,7 +310,7 @@
 
   enddo
 
-  deallocate(tmp7)
+  deallocate(tmp5)
   deallocate(tmp_3d)
 
 
@@ -309,13 +323,13 @@
     !$OMP SHARED (mo_num, n_points_final_grid, i,                  &
     !$OMP         mos_r_in_r_array_transp,                         &
     !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-    !$OMP         tmp5)
+    !$OMP         tmp4)
     !$OMP DO COLLAPSE(2)
     do j = 1, mo_num
       do m = 1, mo_num
         do ipoint = 1, n_points_final_grid
 
-          tmp5(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
+          tmp4(ipoint,m,j) = final_weight_at_r_vector(ipoint) * mos_r_in_r_array_transp(ipoint,m)        &
                            * ( int2_grad1_u12_bimo_t(ipoint,1,m,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
                              + int2_grad1_u12_bimo_t(ipoint,2,m,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
                              + int2_grad1_u12_bimo_t(ipoint,3,m,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
@@ -326,12 +340,12 @@
     !$OMP END PARALLEL
 
     call dgemm( 'T', 'N', mo_num*mo_num, mo_num, n_points_final_grid, -1.d0             &
-              , tmp5, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
+              , tmp4, n_points_final_grid, mos_l_in_r_array_transp, n_points_final_grid &
               , 1.d0, three_e_4_idx_cycle_1_bi_ort(1,1,1,i), mo_num*mo_num)
 
   enddo
 
-  deallocate(tmp5)
+  deallocate(tmp4)
 
 
 !  !$OMP PARALLEL DO PRIVATE(i,j,k,m)
diff --git a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
index 42130575..8667683e 100644
--- a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
+++ b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
@@ -46,6 +46,8 @@ BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid,
       mos_r_in_r_array_transp(i,j) = mos_r_in_r_array(j,i) 
     enddo
   enddo
+
+  FREE mos_r_in_r_array
  
 END_PROVIDER
 
@@ -116,7 +118,7 @@ end subroutine give_all_mos_l_at_r
 
 ! ---
 
-BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp,(n_points_final_grid,mo_num)]
+BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid,mo_num)]
 
   BEGIN_DOC
   ! mos_l_in_r_array_transp(i,j) = value of the jth mo on the ith grid point
@@ -130,6 +132,8 @@ BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp,(n_points_final_grid,mo
       mos_l_in_r_array_transp(i,j) = mos_l_in_r_array(j,i) 
     enddo
   enddo
+
+  FREE mos_l_in_r_array
  
 END_PROVIDER
 
diff --git a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
index ed75c882..f9bda058 100644
--- a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -54,11 +54,13 @@ subroutine run_cipsi_tc
 
   implicit none
 
-  if (.not.is_zmq_slave) then
+  if (.not. is_zmq_slave) then
+
     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
-    if(elec_alpha_num+elec_beta_num.ge.3)then
+
+    if(elec_alpha_num+elec_beta_num .ge. 3) then
       if(three_body_h_tc)then
-        call provide_all_three_ints_bi_ortho
+        call provide_all_three_ints_bi_ortho()
       endif
     endif
     ! ---
diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index cc01d144..f8e310df 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -24,9 +24,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
  
   PROVIDE N_int
 
-  print*,' Providing normal_two_body_bi_orth ...'
-  call wall_time(wall0)
-
   if(read_tc_norm_ord) then
 
     open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
@@ -115,9 +112,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   call wall_time(wall1)
   print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
 
-  call wall_time(wall1)
-  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
-
 END_PROVIDER 
 
 ! ---
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index a2077f0f..42c59308 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -1,24 +1,38 @@
-subroutine provide_all_three_ints_bi_ortho
- implicit none
- BEGIN_DOC
-! routine that provides all necessary three-electron integrals 
- END_DOC
- if(three_body_h_tc)then
-  if(three_e_3_idx_term)then
-   PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
-   PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
-  endif
-  if(three_e_4_idx_term)then
-   PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
-  endif
-  if(.not.double_normal_ord.and.three_e_5_idx_term)then
-   PROVIDE three_e_5_idx_direct_bi_ort 
-  elseif (double_normal_ord .and. (.not. three_e_5_idx_term))then
-   PROVIDE normal_two_body_bi_orth
-  endif
+
+! ---
+
+subroutine provide_all_three_ints_bi_ortho()
+
+  BEGIN_DOC
+  ! routine that provides all necessary three-electron integrals 
+  END_DOC
+
+  implicit none
+
+  if(three_body_h_tc) then
+
+    if(three_e_3_idx_term) then
+      PROVIDE three_e_3_idx_direct_bi_ort three_e_3_idx_cycle_1_bi_ort three_e_3_idx_cycle_2_bi_ort
+      PROVIDE three_e_3_idx_exch23_bi_ort three_e_3_idx_exch13_bi_ort three_e_3_idx_exch12_bi_ort
+    endif
+
+    if(three_e_4_idx_term) then
+      PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
+    endif
+
+    if(.not. double_normal_ord. and. three_e_5_idx_term) then
+      PROVIDE three_e_5_idx_direct_bi_ort 
+    elseif(double_normal_ord .and. (.not. three_e_5_idx_term)) then
+      PROVIDE normal_two_body_bi_orth
+    endif
+
  endif
+
+ return
 end
 
+! ---
+
 subroutine htilde_mu_mat_opt_bi_ortho_tot(key_j, key_i, Nint, htot)
  implicit none
   BEGIN_DOC

From 501b9d648702c0f5a2ba0f684ef40ce69d0cb6ce Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Jun 2023 09:58:29 +0200
Subject: [PATCH 10/18] minor modifs

---
 .../grid_becke_vector.irp.f                   | 21 +++++++++++++---
 src/bi_ort_ints/bi_ort_ints.irp.f             | 25 ++++++++++++++-----
 src/bi_ort_ints/semi_num_ints_mo.irp.f        | 22 ++++++++--------
 src/bi_ortho_mos/bi_ort_mos_in_r.irp.f        |  4 ---
 src/tc_bi_ortho/slater_tc_opt.irp.f           |  2 +-
 5 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index 8982fe83..0386f3c6 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -1,10 +1,13 @@
 
 BEGIN_PROVIDER [integer, n_points_final_grid]
-  implicit none
+
   BEGIN_DOC
   ! Number of points which are non zero
   END_DOC
-  integer                        :: i,j,k,l
+
+  implicit none
+  integer :: i, j, k, l
+
   n_points_final_grid = 0
   do j = 1, nucl_num
     do i = 1, n_points_radial_grid -1
@@ -16,9 +19,11 @@ BEGIN_PROVIDER [integer, n_points_final_grid]
       enddo
     enddo
   enddo
-  print*,'n_points_final_grid = ',n_points_final_grid
-  print*,'n max point         = ',n_points_integration_angular*(n_points_radial_grid*nucl_num - 1)
+
+  print*,' n_points_final_grid = ', n_points_final_grid
+  print*,' n max point         = ', n_points_integration_angular*(n_points_radial_grid*nucl_num - 1)
   call ezfio_set_becke_numerical_grid_n_points_final_grid(n_points_final_grid)
+
 END_PROVIDER
 
 ! ---
@@ -41,6 +46,10 @@ END_PROVIDER
   implicit none
   integer          :: i, j, k, l, i_count
   double precision :: r(3)
+  double precision :: wall0, wall1
+
+  call wall_time(wall0)
+  print *, ' Providing final_grid_points ...'
 
   i_count = 0
   do j = 1, nucl_num
@@ -65,6 +74,10 @@ END_PROVIDER
   FREE grid_points_per_atom
   FREE final_weight_at_r
 
+  call wall_time(wall1)
+  print *, ' wall time for final_grid_points,', wall1 - wall0
+  call print_memory_usage()
+
 END_PROVIDER
 
 ! ---
diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index bb0424cd..e64892d7 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -1,22 +1,35 @@
+! ---
+
 program bi_ort_ints
-  implicit none
+
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+  ! TODO : Put the documentation of the program here
   END_DOC
+
+  implicit none
+
   my_grid_becke = .True.
-  my_n_pt_r_grid = 10
-  my_n_pt_a_grid = 14
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+  !my_n_pt_r_grid = 10
+  !my_n_pt_a_grid = 14
+  my_n_pt_r_grid = 30
+  my_n_pt_a_grid = 50
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
 ! call test_3e
 ! call test_5idx
 ! call test_5idx2
- call test_4idx
+ !call test_4idx
+  call test_4idx2()
 end
 
 subroutine test_5idx2
   PROVIDE three_e_5_idx_cycle_2_bi_ort
 end
 
+subroutine test_4idx2()
+  PROVIDE three_e_4_idx_direct_bi_ort 
+end
+
 subroutine test_3e
  implicit none
  integer :: i,k,j,l,m,n,ipoint
diff --git a/src/bi_ort_ints/semi_num_ints_mo.irp.f b/src/bi_ort_ints/semi_num_ints_mo.irp.f
index 6354b393..355fa38f 100644
--- a/src/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/src/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -54,7 +54,7 @@ BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_poi
     enddo
   enddo
 
-! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu
+  !FREE mo_v_ki_bi_ortho_erf_rk_cst_mu
 
 END_PROVIDER 
 
@@ -161,8 +161,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_ao_transp
 
-  print *, ' providing int2_grad1_u12_bimo_transp'
-  call wall_time(wall0)
+  !print *, ' providing int2_grad1_u12_bimo_transp'
+  !call wall_time(wall0)
 
   !$OMP PARALLEL         &
   !$OMP DEFAULT (NONE)   &
@@ -180,9 +180,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call wall_time(wall1)
-  print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
-  call print_memory_usage()
+  !call wall_time(wall1)
+  !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  !call print_memory_usage()
 
 END_PROVIDER 
 
@@ -194,8 +194,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
   integer          :: i, j, ipoint
   double precision :: wall0, wall1
 
-  call wall_time(wall0)
-  print *, ' Providing int2_grad1_u12_bimo_t ...'
+  !call wall_time(wall0)
+  !print *, ' Providing int2_grad1_u12_bimo_t ...'
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_transp
@@ -212,9 +212,9 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
 
   FREE int2_grad1_u12_bimo_transp
 
-  call wall_time(wall1)
-  print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
-  call print_memory_usage()
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
+  !call print_memory_usage()
 
 END_PROVIDER 
 
diff --git a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
index 8667683e..25572854 100644
--- a/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
+++ b/src/bi_ortho_mos/bi_ort_mos_in_r.irp.f
@@ -47,8 +47,6 @@ BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid,
     enddo
   enddo
 
-  FREE mos_r_in_r_array
- 
 END_PROVIDER
 
 ! ---
@@ -133,8 +131,6 @@ BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid,m
     enddo
   enddo
 
-  FREE mos_l_in_r_array
- 
 END_PROVIDER
 
 ! ---
diff --git a/src/tc_bi_ortho/slater_tc_opt.irp.f b/src/tc_bi_ortho/slater_tc_opt.irp.f
index 42c59308..7acb0d0f 100644
--- a/src/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/src/tc_bi_ortho/slater_tc_opt.irp.f
@@ -20,7 +20,7 @@ subroutine provide_all_three_ints_bi_ortho()
       PROVIDE three_e_4_idx_direct_bi_ort three_e_4_idx_cycle_1_bi_ort three_e_4_idx_exch23_bi_ort three_e_4_idx_exch13_bi_ort 
     endif
 
-    if(.not. double_normal_ord. and. three_e_5_idx_term) then
+    if(.not. double_normal_ord .and. three_e_5_idx_term) then
       PROVIDE three_e_5_idx_direct_bi_ort 
     elseif(double_normal_ord .and. (.not. three_e_5_idx_term)) then
       PROVIDE normal_two_body_bi_orth

From b984d7a1f4a734ae459a3c91e2ca9ee2ea26bc50 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sun, 4 Jun 2023 15:27:07 +0200
Subject: [PATCH 11/18] minor modif

---
 src/bi_ort_ints/bi_ort_ints.irp.f | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index e64892d7..5e465d0f 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -27,7 +27,8 @@ subroutine test_5idx2
 end
 
 subroutine test_4idx2()
-  PROVIDE three_e_4_idx_direct_bi_ort 
+  !PROVIDE three_e_4_idx_direct_bi_ort 
+  PROVIDE three_e_4_idx_exch23_bi_ort
 end
 
 subroutine test_3e

From b48e6b269d624ecfabcbb4895c75bd397646c0d8 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 01:33:55 +0200
Subject: [PATCH 12/18] start optim normal ordering

---
 src/tc_bi_ortho/normal_ordered.irp.f     | 438 ++++++++++++-----------
 src/tc_bi_ortho/normal_ordered_old.irp.f | 390 ++++++++++++++++++++
 2 files changed, 625 insertions(+), 203 deletions(-)
 create mode 100644 src/tc_bi_ortho/normal_ordered_old.irp.f

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index f8e310df..a092762b 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -14,7 +14,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   integer                        :: i, h1, p1, h2, p2
   integer                        :: hh1, hh2, pp1, pp2
   integer                        :: Ne(2)
-  double precision               :: hthree_aba, hthree_aaa, hthree_aab
+  double precision               :: hthree_aaa, hthree_aab
   double precision               :: wall0, wall1
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
@@ -39,57 +39,65 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
     if(core_tc_op) then
       do i = 1, N_int
-        key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-        key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+        key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
       enddo
-      call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+      call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
     else
-      call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+      call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
     endif
 
-    normal_two_body_bi_orth = 0.d0
+    ! opposite spin double excitations : s1 /= s2
+    normal_two_body_bi_orth(:,:,:,:) = no_aba_contraction(:,:,:,:)
 
-    !$OMP PARALLEL                                                                         &
-    !$OMP DEFAULT (NONE)                                                                   &
-    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+    !$OMP PARALLEL                                                             &
+    !$OMP DEFAULT (NONE)                                                       &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa) & 
     !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
+
       do pp1 = 1, n_act_orb
         p1 = list_act(pp1)
+
         do hh2 = 1, n_act_orb
           h2 = list_act(hh2) 
+
           do pp2 = 1, n_act_orb
             p2 = list_act(pp2)
+
             ! all contributions from the 3-e terms to the double excitations 
             ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
-      
-
-            ! opposite spin double excitations : s1 /= s2
-            call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
 
             ! same spin double excitations : s1 == s2 
-            if(h1<h2.and.p1.gt.p2)then
-             ! with opposite spin contributions 
-             call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-             ! same spin double excitations with same spin contributions 
-             if(Ne(2).ge.3)then
-               call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-             else
-               hthree_aaa = 0.d0
-             endif
-            else
-             ! with opposite spin contributions 
-             call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-             if(Ne(2).ge.3)then
+            if((h1 < h2) .and. (p1 > p2)) then
+
+              ! with opposite spin contributions 
+              call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+
               ! same spin double excitations with same spin contributions 
-               call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-             else
-               hthree_aaa = 0.d0
-             endif
+              if(Ne(2) .ge. 3) then
+                call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+              else
+                hthree_aaa = 0.d0
+              endif
+
+            else
+
+              ! with opposite spin contributions 
+              call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+
+              if(Ne(2) .ge. 3) then
+                ! same spin double excitations with same spin contributions 
+                call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+              else
+                hthree_aaa = 0.d0
+              endif
+
             endif
-            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+
+            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aab + hthree_aaa)
           enddo
         enddo
       enddo
@@ -116,178 +124,6 @@ END_PROVIDER
 
 ! ---
 
-subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer, intent(in)           :: Nint, h1, h2, p1, p2
-  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
-  double precision, intent(out) :: hthree
-  integer                       :: ii, i
-  double precision              :: int_direct, int_exc_12, int_exc_13, integral
-
-  !!!! double alpha/beta
-  hthree = 0.d0
-
-  do ii = 1, Ne(2) ! purely closed shell part 
-    i = occ(ii,2)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12 = -1.d0 * integral
-
-    hthree += 2.d0 * int_direct - 1.d0 * (int_exc_13 + int_exc_12)
-  enddo
-
-  do ii = Ne(2) + 1, Ne(1) ! purely open-shell part 
-    i = occ(ii,1)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12 = -1.d0 * integral
-
-    hthree += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
-  enddo
-
-  return
-end
-
-! ---
-
-BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  ! Normal ordered two-body sector of the three-body terms for opposite spin double excitations 
-  END_DOC
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: h1, p1, h2, p2, i
-  integer                        :: hh1, hh2, pp1, pp2
-  integer                        :: Ne(2)
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision               :: hthree
-
-  PROVIDE N_int
-
-  allocate( key_i_core(N_int,2) )
-  allocate( occ(N_int*bit_kind_size,2) )
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
-  endif
-
-  normal_two_body_bi_orth_ab = 0.d0
-  do hh1 = 1, n_act_orb
-    h1 = list_act(hh1) 
-    do pp1 = 1, n_act_orb
-      p1 = list_act(pp1)
-      do hh2 = 1, n_act_orb
-        h2 = list_act(hh2) 
-        do pp2 = 1, n_act_orb
-          p2 = list_act(pp2)
-          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
-
-          normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
-        enddo
-      enddo
-    enddo
-  enddo
-
-  deallocate( key_i_core )
-  deallocate( occ )
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_act_orb, n_act_orb, n_act_orb)]
-
-  BEGIN_DOC
-  ! Normal ordered two-body sector of the three-body terms for same spin double excitations 
-  END_DOC
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i,ii,j,h1,p1,h2,p2
-  integer                        :: hh1,hh2,pp1,pp2
-  integer                        :: Ne(2)
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision               :: hthree_aab, hthree_aaa
-
-  PROVIDE N_int
-
-  allocate( key_i_core(N_int,2) )
-  allocate( occ(N_int*bit_kind_size,2) )
-
-  if(core_tc_op)then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  normal_two_body_bi_orth_aa_bb = 0.d0
-  do hh1 = 1, n_act_orb
-    h1 = list_act(hh1) 
-    do pp1 = 1 , n_act_orb
-      p1 = list_act(pp1)
-      do hh2 = 1, n_act_orb
-        h2 = list_act(hh2) 
-        do pp2 = 1 , n_act_orb
-          p2 = list_act(pp2)
-          if(h1<h2.and.p1.gt.p2)then
-           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-           else
-             hthree_aaa = 0.d0
-           endif
-          else
-           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-           if(Ne(2).ge.3)then
-             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-           else
-             hthree_aaa = 0.d0
-           endif
-          endif
-          normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1) = hthree_aab + hthree_aaa
-        enddo
-      enddo
-    enddo
-  enddo
-
-  deallocate( key_i_core )
-  deallocate( occ )
-
-END_PROVIDER 
-
-! ---
-
 subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
 
   BEGIN_DOC
@@ -388,3 +224,199 @@ end
 
 ! ---
 
+BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+
+  print*,' Providing no_aba_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+  allocate(tmpvec_2(n_points_final_grid,3))
+
+  ! purely closed shell part 
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    ! to avoid tmp(N^4)
+    do h1 = 1, mo_num
+
+      ! to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0              &
+                , int2_grad1_u12_bimo_t, 3*n_points_final_grid, tmp1, 3*n_points_final_grid &
+                , 0.d0, tmp_3d, mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4)
+      do p1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0                     &
+                  , mos_l_in_r_array_transp, n_points_final_grid, tmp2, n_points_final_grid &
+                  , 1.d0, no_aba_contraction(p2,h2,1,1), mo_num*mo_num)
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+
+  double precision :: integral, int_direct, int_exc_13, int_exc_12
+
+  ! TODO
+  ! purely open-shell part 
+  if(Ne(2) < Ne(1)) then
+
+    do ii = Ne(2) + 1, Ne(1)
+      i = occ(ii,1)
+
+      call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+      int_direct = -1.d0 * integral
+
+      call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+      int_exc_13 = -1.d0 * integral
+
+      call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+      int_exc_12 = -1.d0 * integral
+
+      no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
+    enddo
+  endif
+
+  ! ---
+
+  deallocate(tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+
+
+  !$OMP PARALLEL DO PRIVATE(h1,h2,p1,p2)
+  do h1 = 1, mo_num
+    do p1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p2 = 1, mo_num
+          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (no_aba_contraction(p2,h2,p1,h1) + no_aba_contraction(p1,h1,p2,h2))
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+END_PROVIDER
+
+! ---
+
+
diff --git a/src/tc_bi_ortho/normal_ordered_old.irp.f b/src/tc_bi_ortho/normal_ordered_old.irp.f
new file mode 100644
index 00000000..553cafdb
--- /dev/null
+++ b/src/tc_bi_ortho/normal_ordered_old.irp.f
@@ -0,0 +1,390 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_old, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC 
+  ! Normal ordering of the three body interaction on the HF density
+  END_DOC 
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+
+  integer                        :: i, h1, p1, h2, p2
+  integer                        :: hh1, hh2, pp1, pp2
+  integer                        :: Ne(2)
+  double precision               :: hthree_aba, hthree_aaa, hthree_aab
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+
+  print*,' Providing normal_two_body_bi_orth_old ...'
+  call wall_time(wall0)
+ 
+  PROVIDE N_int
+
+  if(read_tc_norm_ord) then
+
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_old', action="read")
+      read(11) normal_two_body_bi_orth_old
+    close(11)
+
+  else
+
+    PROVIDE N_int
+
+    allocate( occ(N_int*bit_kind_size,2) )
+    allocate( key_i_core(N_int,2) )
+
+    if(core_tc_op) then
+      do i = 1, N_int
+        key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+        key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+      enddo
+      call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+    else
+      call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+    endif
+
+    normal_two_body_bi_orth_old = 0.d0
+
+    !$OMP PARALLEL                                                                         &
+    !$OMP DEFAULT (NONE)                                                                   &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aba, hthree_aab, hthree_aaa) & 
+    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth_old)
+    !$OMP DO SCHEDULE (static) 
+    do hh1 = 1, n_act_orb
+      h1 = list_act(hh1) 
+      do pp1 = 1, n_act_orb
+        p1 = list_act(pp1)
+        do hh2 = 1, n_act_orb
+          h2 = list_act(hh2) 
+          do pp2 = 1, n_act_orb
+            p2 = list_act(pp2)
+            ! all contributions from the 3-e terms to the double excitations 
+            ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
+      
+
+            ! opposite spin double excitations : s1 /= s2
+            call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aba)
+
+            ! same spin double excitations : s1 == s2 
+            if(h1<h2.and.p1.gt.p2)then
+             ! with opposite spin contributions 
+             call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+             ! same spin double excitations with same spin contributions 
+             if(Ne(2).ge.3)then
+               call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+             else
+               hthree_aaa = 0.d0
+             endif
+            else
+             ! with opposite spin contributions 
+             call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+             if(Ne(2).ge.3)then
+              ! same spin double excitations with same spin contributions 
+               call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+             else
+               hthree_aaa = 0.d0
+             endif
+            endif
+            normal_two_body_bi_orth_old(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    deallocate( occ )
+    deallocate( key_i_core )
+  endif
+
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth_old', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth_old
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read')
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for normal_two_body_bi_orth_old ', wall1-wall0
+
+END_PROVIDER 
+
+! ---
+
+subroutine give_aba_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer, intent(in)           :: Nint, h1, h2, p1, p2
+  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+  double precision, intent(out) :: hthree
+  integer                       :: ii, i
+  double precision              :: int_direct, int_exc_12, int_exc_13, integral
+
+  !!!! double alpha/beta
+  hthree = 0.d0
+
+  do ii = 1, Ne(2) ! purely closed shell part 
+    i = occ(ii,2)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    hthree += 2.d0 * int_direct - 1.d0 * (int_exc_13 + int_exc_12)
+  enddo
+
+  do ii = Ne(2) + 1, Ne(1) ! purely open-shell part 
+    i = occ(ii,1)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    hthree += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
+  enddo
+
+  return
+end
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_ab, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  ! Normal ordered two-body sector of the three-body terms for opposite spin double excitations 
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: h1, p1, h2, p2, i
+  integer                        :: hh1, hh2, pp1, pp2
+  integer                        :: Ne(2)
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision               :: hthree
+
+  PROVIDE N_int
+
+  allocate( key_i_core(N_int,2) )
+  allocate( occ(N_int*bit_kind_size,2) )
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core,occ,Ne,N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask,occ,Ne,N_int)
+  endif
+
+  normal_two_body_bi_orth_ab = 0.d0
+  do hh1 = 1, n_act_orb
+    h1 = list_act(hh1) 
+    do pp1 = 1, n_act_orb
+      p1 = list_act(pp1)
+      do hh2 = 1, n_act_orb
+        h2 = list_act(hh2) 
+        do pp2 = 1, n_act_orb
+          p2 = list_act(pp2)
+          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree)
+
+          normal_two_body_bi_orth_ab(p2,h2,p1,h1) = hthree    
+        enddo
+      enddo
+    enddo
+  enddo
+
+  deallocate( key_i_core )
+  deallocate( occ )
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_aa_bb, (n_act_orb, n_act_orb, n_act_orb, n_act_orb)]
+
+  BEGIN_DOC
+  ! Normal ordered two-body sector of the three-body terms for same spin double excitations 
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i,ii,j,h1,p1,h2,p2
+  integer                        :: hh1,hh2,pp1,pp2
+  integer                        :: Ne(2)
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision               :: hthree_aab, hthree_aaa
+
+  PROVIDE N_int
+
+  allocate( key_i_core(N_int,2) )
+  allocate( occ(N_int*bit_kind_size,2) )
+
+  if(core_tc_op)then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1),core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2),core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  normal_two_body_bi_orth_aa_bb = 0.d0
+  do hh1 = 1, n_act_orb
+    h1 = list_act(hh1) 
+    do pp1 = 1 , n_act_orb
+      p1 = list_act(pp1)
+      do hh2 = 1, n_act_orb
+        h2 = list_act(hh2) 
+        do pp2 = 1 , n_act_orb
+          p2 = list_act(pp2)
+          if(h1<h2.and.p1.gt.p2)then
+           call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
+           if(Ne(2).ge.3)then
+             call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
+           else
+             hthree_aaa = 0.d0
+           endif
+          else
+           call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
+           if(Ne(2).ge.3)then
+             call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
+           else
+             hthree_aaa = 0.d0
+           endif
+          endif
+          normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1) = hthree_aab + hthree_aaa
+        enddo
+      enddo
+    enddo
+  enddo
+
+  deallocate( key_i_core )
+  deallocate( occ )
+
+END_PROVIDER 
+
+! ---
+
+subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+
+  BEGIN_DOC
+  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer, intent(in)           :: Nint, h1, h2, p1, p2
+  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+  double precision, intent(out) :: hthree
+  integer                       :: ii,i
+  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
+  double precision              :: integral,int_exc_l,int_exc_ll
+
+  hthree = 0.d0
+  do ii = 1, Ne(2) ! purely closed shell part 
+    i = occ(ii,2)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
+    int_exc_l = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+    int_exc_ll= -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12= -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13= -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+    int_exc_23= -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
+  enddo
+
+  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
+    i = occ(ii,1)
+
+    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
+    int_exc_l = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+    int_exc_ll = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+    int_exc_12 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+    int_exc_13 = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+    int_exc_23 = -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+  enddo
+
+  return
+end
+
+! ---
+
+subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer,          intent(in)  :: Nint, h1, h2, p1, p2
+  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
+  double precision, intent(out) :: hthree
+  integer                       :: ii, i
+  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
+  double precision              :: integral, int_exc_l, int_exc_ll
+
+  hthree = 0.d0
+  do ii = 1, Ne(2) ! purely closed shell part 
+    i = occ(ii,2)
+
+    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
+    int_direct = -1.d0 * integral
+
+    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
+    int_exc_23= -1.d0 * integral
+
+    hthree +=  1.d0 * int_direct - int_exc_23
+  enddo
+
+  return
+end
+
+! ---
+

From 471283634919dd134e294aa71c0bac0a37d4872c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 11:17:08 +0200
Subject: [PATCH 13/18]  normal ordering: aba-DGEMM OK

---
 src/tc_bi_ortho/normal_ordered.irp.f   | 495 ++++++++++++++++++-------
 src/tc_bi_ortho/test_tc_bi_ortho.irp.f |  48 ++-
 2 files changed, 411 insertions(+), 132 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index a092762b..59e78b92 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -22,8 +22,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
   print*,' Providing normal_two_body_bi_orth ...'
   call wall_time(wall0)
  
-  PROVIDE N_int
-
   if(read_tc_norm_ord) then
 
     open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="read")
@@ -48,12 +46,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
     endif
 
     ! opposite spin double excitations : s1 /= s2
-    normal_two_body_bi_orth(:,:,:,:) = no_aba_contraction(:,:,:,:)
+    PROVIDE no_aba_contraction
 
-    !$OMP PARALLEL                                                             &
-    !$OMP DEFAULT (NONE)                                                       &
-    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa) & 
-    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth)
+    !$OMP PARALLEL                                                              &
+    !$OMP DEFAULT (NONE)                                                        &
+    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
+    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
+    !$OMP         no_aba_contraction)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
@@ -97,7 +96,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
             endif
 
-            normal_two_body_bi_orth(p2,h2,p1,h1) = 0.5d0*(hthree_aab + hthree_aaa)
+            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + 0.5d0*(hthree_aab + hthree_aaa)
           enddo
         enddo
       enddo
@@ -124,103 +123,103 @@ END_PROVIDER
 
 ! ---
 
-subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-
-  BEGIN_DOC
-  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
-  END_DOC
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer, intent(in)           :: Nint, h1, h2, p1, p2
-  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
-  double precision, intent(out) :: hthree
-  integer                       :: ii,i
-  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
-  double precision              :: integral,int_exc_l,int_exc_ll
-
-  hthree = 0.d0
-  do ii = 1, Ne(2) ! purely closed shell part 
-    i = occ(ii,2)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
-    int_exc_l = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-    int_exc_ll= -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12= -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13= -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-    int_exc_23= -1.d0 * integral
-
-    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
-  enddo
-
-  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
-    i = occ(ii,1)
-
-    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
-    int_exc_l = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-    int_exc_ll = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-    int_exc_12 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-    int_exc_13 = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-    int_exc_23 = -1.d0 * integral
-
-    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
-  enddo
-
-  return
-end
+!subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+!
+!  BEGIN_DOC
+!  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
+!  END_DOC
+!
+!  use bitmasks ! you need to include the bitmasks_module.f90 features
+!
+!  implicit none
+!  integer, intent(in)           :: Nint, h1, h2, p1, p2
+!  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
+!  double precision, intent(out) :: hthree
+!  integer                       :: ii,i
+!  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
+!  double precision              :: integral,int_exc_l,int_exc_ll
+!
+!  hthree = 0.d0
+!  do ii = 1, Ne(2) ! purely closed shell part 
+!    i = occ(ii,2)
+!
+!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!    int_direct = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
+!    int_exc_l = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+!    int_exc_ll= -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!    int_exc_12= -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!    int_exc_13= -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+!    int_exc_23= -1.d0 * integral
+!
+!    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
+!  enddo
+!
+!  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
+!    i = occ(ii,1)
+!
+!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!    int_direct = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
+!    int_exc_l = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
+!    int_exc_ll = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!    int_exc_12 = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!    int_exc_13 = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
+!    int_exc_23 = -1.d0 * integral
+!
+!    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+!  enddo
+!
+!  return
+!end
 
 ! ---
 
-subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer,          intent(in)  :: Nint, h1, h2, p1, p2
-  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
-  double precision, intent(out) :: hthree
-  integer                       :: ii, i
-  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
-  double precision              :: integral, int_exc_l, int_exc_ll
-
-  hthree = 0.d0
-  do ii = 1, Ne(2) ! purely closed shell part 
-    i = occ(ii,2)
-
-    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
-    int_direct = -1.d0 * integral
-
-    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
-    int_exc_23= -1.d0 * integral
-
-    hthree +=  1.d0 * int_direct - int_exc_23
-  enddo
-
-  return
-end
+!subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
+!
+!  use bitmasks ! you need to include the bitmasks_module.f90 features
+!
+!  implicit none
+!  integer,          intent(in)  :: Nint, h1, h2, p1, p2
+!  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
+!  double precision, intent(out) :: hthree
+!  integer                       :: ii, i
+!  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
+!  double precision              :: integral, int_exc_l, int_exc_ll
+!
+!  hthree = 0.d0
+!  do ii = 1, Ne(2) ! purely closed shell part 
+!    i = occ(ii,2)
+!
+!    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
+!    int_direct = -1.d0 * integral
+!
+!    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
+!    int_exc_23= -1.d0 * integral
+!
+!    hthree +=  1.d0 * int_direct - int_exc_23
+!  enddo
+!
+!  return
+!end
 
 ! ---
 
@@ -264,6 +263,10 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   allocate(tmpvec_1(n_points_final_grid,3))
   allocate(tmpvec_2(n_points_final_grid,3))
 
+  double precision, allocatable :: tmp_2d(:,:)
+  allocate(tmp_2d(mo_num,mo_num))
+
+
   ! purely closed shell part 
   do ii = 1, Ne(2)
     i = occ(ii,2)
@@ -313,9 +316,10 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       !$OMP END DO
       !$OMP END PARALLEL
 
-      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0              &
-                , int2_grad1_u12_bimo_t, 3*n_points_final_grid, tmp1, 3*n_points_final_grid &
-                , 0.d0, tmp_3d, mo_num)
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
       !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
       do p1 = 1, mo_num
@@ -364,38 +368,163 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP END DO
         !$OMP END PARALLEL
 
-        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0                     &
-                  , mos_l_in_r_array_transp, n_points_final_grid, tmp2, n_points_final_grid &
-                  , 1.d0, no_aba_contraction(p2,h2,1,1), mo_num*mo_num)
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
 
       enddo ! p1
     enddo ! h1
   enddo ! i
 
 
-  double precision :: integral, int_direct, int_exc_13, int_exc_12
 
-  ! TODO
+
+
+
+
+
+
   ! purely open-shell part 
   if(Ne(2) < Ne(1)) then
-
     do ii = Ne(2) + 1, Ne(1)
       i = occ(ii,1)
 
-      call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-      int_direct = -1.d0 * integral
+      do h1 = 1, mo_num
 
-      call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-      int_exc_13 = -1.d0 * integral
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
 
-      call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-      int_exc_12 = -1.d0 * integral
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
 
-      no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
-    enddo
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        do p1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                    , tmp2(1,1), n_points_final_grid                       &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo !i
   endif
 
-  ! ---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
   deallocate(tmp_3d)
   deallocate(tmp1, tmp2)
@@ -403,17 +532,121 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpvec_1, tmpvec_2)
 
 
-  !$OMP PARALLEL DO PRIVATE(h1,h2,p1,p2)
-  do h1 = 1, mo_num
-    do p1 = 1, mo_num
-      do h2 = 1, mo_num
-        do p2 = 1, mo_num
-          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (no_aba_contraction(p2,h2,p1,h1) + no_aba_contraction(p1,h1,p2,h2))
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
+
+
+
+
+
+
+
+  no_aba_contraction = -0.5d0 * no_aba_contraction
+  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+
+!  do h1 = 1, mo_num
+!    do p1 = 1, mo_num
+!      do h2 = 1, mo_num
+!        do p2 = 1, mo_num
+!          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (tmp_4d(p2,h2,p1,h1) + tmp_4d(p1,h1,p2,h2))
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+
+
+  ! ---
+
+  double precision :: integral, int_direct, int_exc_13, int_exc_12
+
+!  no_aba_contraction = 0.d0
+!
+!  ! purely closed shell part 
+!  do ii = 1, Ne(2)
+!    i = occ(ii,1)
+!
+!    !$OMP PARALLEL                                                               &
+!    !$OMP DEFAULT (NONE)                                                         &
+!    !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
+!    !$OMP SHARED (mo_num, i, no_aba_contraction)
+!    !$OMP DO SCHEDULE (static) 
+!    do h1 = 1, mo_num
+!      do p1 = 1, mo_num
+!        do h2 = 1, mo_num
+!          do p2 = 1, mo_num
+!
+!            call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!            int_direct = -1.d0 * integral
+!
+!            call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!            int_exc_13 = -1.d0 * integral
+!
+!            call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!            int_exc_12 = -1.d0 * integral
+!
+!            !no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
+!          enddo
+!        enddo
+!      enddo
+!    enddo
+!    !$OMP END DO
+!    !$OMP END PARALLEL
+!  enddo
+
+!  ! purely open-shell part 
+!  if(Ne(2) < Ne(1)) then
+!
+!    do ii = Ne(2) + 1, Ne(1)
+!      i = occ(ii,1)
+!
+!      !$OMP PARALLEL                                                               &
+!      !$OMP DEFAULT (NONE)                                                         &
+!      !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
+!      !$OMP SHARED (mo_num, i, no_aba_contraction)
+!      !$OMP DO SCHEDULE (static) 
+!      do h1 = 1, mo_num
+!        do p1 = 1, mo_num
+!          do h2 = 1, mo_num
+!            do p2 = 1, mo_num
+!
+!              call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
+!              int_direct = -1.d0 * integral
+!           
+!              call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
+!              int_exc_13 = -1.d0 * integral
+!           
+!              call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
+!              int_exc_12 = -1.d0 * integral
+!           
+!              no_aba_contraction(p2,h2,p1,h1) += 0.5d0 * int_direct - 0.25d0 * (int_exc_13 + int_exc_12)
+!            enddo
+!          enddo
+!        enddo
+!      enddo
+!      !$OMP END DO
+!      !$OMP END PARALLEL
+!    enddo
+!  endif
+
+  ! ---
+
+!  !$OMP PARALLEL                                         &
+!  !$OMP DEFAULT (NONE)                                   &
+!  !$OMP PRIVATE (h1, h2, p1, p2, integral)               & 
+!  !$OMP SHARED (mo_num, N_int,Ne, occ, no_aba_contraction)
+!  !$OMP DO SCHEDULE (static) 
+!  do h1 = 1, mo_num
+!    do p1 = 1, mo_num
+!      do h2 = 1, mo_num
+!        do p2 = 1, mo_num
+!          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, integral)
+!          no_aba_contraction(p2,h2,p1,h1) = 0.5d0 * integral
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END DO
+!  !$OMP END PARALLEL
+
+
 
 END_PROVIDER
 
diff --git a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
index df86ea65..33b5c5aa 100644
--- a/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -11,12 +11,14 @@ program tc_bi_ortho
   touch read_wf
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
- call test_h_u0
+! call test_h_u0
 ! call test_slater_tc_opt
 ! call timing_tot
 ! call timing_diag
 ! call timing_single
 ! call timing_double
+
+  call test_no()
 end
 
 subroutine test_h_u0
@@ -252,3 +254,47 @@ subroutine timing_double
 
 end
 
+! ---
+
+subroutine test_no()
+
+  implicit none
+  integer          :: i, j, k, l
+  double precision :: accu, contrib, new, ref, thr
+
+  print*, ' testing normal_two_body_bi_orth ...'
+
+  thr = 1d-8
+
+  PROVIDE normal_two_body_bi_orth_old
+  PROVIDE normal_two_body_bi_orth
+
+  accu = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do k = 1, mo_num
+        do l = 1, mo_num
+
+          new = normal_two_body_bi_orth    (l,k,j,i)
+          ref = normal_two_body_bi_orth_old(l,k,j,i)
+          contrib = dabs(new - ref)
+          accu += contrib
+          if(contrib .gt. thr) then
+            print*, ' problem on normal_two_body_bi_orth'
+            print*, l, k, j, i
+            print*, ref, new, contrib
+            stop
+          endif
+
+        enddo
+      enddo
+    enddo
+  enddo
+  print*, ' accu on normal_two_body_bi_orth = ', accu / dble(mo_num)**4
+
+ return
+end
+
+! ---
+
+

From 3a5dd05d7eb61dc21b1ec16eba330e3687b54001 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 15:13:18 +0200
Subject: [PATCH 14/18] NO: working on AAB contractions

---
 src/tc_bi_ortho/normal_ordered.irp.f | 428 +++++++++++----------------
 src/utils/util.irp.f                 |  37 ++-
 2 files changed, 217 insertions(+), 248 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 59e78b92..b3c413d3 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -45,14 +45,14 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
       call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
     endif
 
-    ! opposite spin double excitations : s1 /= s2
     PROVIDE no_aba_contraction
+    PROVIDE no_aab_contraction
 
     !$OMP PARALLEL                                                              &
     !$OMP DEFAULT (NONE)                                                        &
     !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
     !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
-    !$OMP         no_aba_contraction)
+    !$OMP         no_aba_contraction,no_aab_contraction)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
@@ -72,9 +72,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
             ! same spin double excitations : s1 == s2 
             if((h1 < h2) .and. (p1 > p2)) then
 
-              ! with opposite spin contributions 
-              call give_aab_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aab) ! exchange h1<->h2
-
               ! same spin double excitations with same spin contributions 
               if(Ne(2) .ge. 3) then
                 call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
@@ -84,9 +81,6 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
             else
 
-              ! with opposite spin contributions 
-              call give_aab_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aab)
-
               if(Ne(2) .ge. 3) then
                 ! same spin double excitations with same spin contributions 
                 call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
@@ -96,7 +90,9 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
             endif
 
-            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + 0.5d0*(hthree_aab + hthree_aaa)
+            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) &
+                                                 + no_aab_contraction(p2,h2,p1,h1) &
+                                                 + 0.5d0 * hthree_aaa
           enddo
         enddo
       enddo
@@ -123,106 +119,6 @@ END_PROVIDER
 
 ! ---
 
-!subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-!
-!  BEGIN_DOC
-!  ! pure same spin contribution to same spin double excitation s1=h1,p1, s2=h2,p2, with s1==s2
-!  END_DOC
-!
-!  use bitmasks ! you need to include the bitmasks_module.f90 features
-!
-!  implicit none
-!  integer, intent(in)           :: Nint, h1, h2, p1, p2
-!  integer, intent(in)           :: Ne(2), occ(Nint*bit_kind_size,2)
-!  double precision, intent(out) :: hthree
-!  integer                       :: ii,i
-!  double precision              :: int_direct,int_exc_12,int_exc_13,int_exc_23
-!  double precision              :: integral,int_exc_l,int_exc_ll
-!
-!  hthree = 0.d0
-!  do ii = 1, Ne(2) ! purely closed shell part 
-!    i = occ(ii,2)
-!
-!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!    int_direct = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, p1, i, i, h2, h1, integral)
-!    int_exc_l = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-!    int_exc_ll= -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!    int_exc_12= -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!    int_exc_13= -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-!    int_exc_23= -1.d0 * integral
-!
-!    hthree +=  1.d0 * int_direct + int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23)
-!  enddo
-!
-!  do ii = Ne(2)+1,Ne(1) ! purely open-shell part 
-!    i = occ(ii,1)
-!
-!    call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!    int_direct = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, p1, i , i, h2, h1, integral)
-!    int_exc_l = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, i, p2, i, h2, h1, integral)
-!    int_exc_ll = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!    int_exc_12 = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!    int_exc_13 = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
-!    int_exc_23 = -1.d0 * integral
-!
-!    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
-!  enddo
-!
-!  return
-!end
-
-! ---
-
-!subroutine give_aab_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
-!
-!  use bitmasks ! you need to include the bitmasks_module.f90 features
-!
-!  implicit none
-!  integer,          intent(in)  :: Nint, h1, h2, p1, p2
-!  integer,          intent(in)  :: Ne(2), occ(Nint*bit_kind_size,2)
-!  double precision, intent(out) :: hthree
-!  integer                       :: ii, i
-!  double precision              :: int_direct, int_exc_12, int_exc_13, int_exc_23
-!  double precision              :: integral, int_exc_l, int_exc_ll
-!
-!  hthree = 0.d0
-!  do ii = 1, Ne(2) ! purely closed shell part 
-!    i = occ(ii,2)
-!
-!    call give_integrals_3_body_bi_ort(p2, p1, i, h2, h1, i, integral)
-!    int_direct = -1.d0 * integral
-!
-!    call give_integrals_3_body_bi_ort(p1, p2, i, h2, h1, i, integral)
-!    int_exc_23= -1.d0 * integral
-!
-!    hthree +=  1.d0 * int_direct - int_exc_23
-!  enddo
-!
-!  return
-!end
-
-! ---
-
 BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
@@ -236,6 +132,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   double precision,  allocatable :: tmp_3d(:,:,:)
   double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
   double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
 
   print*,' Providing no_aba_contraction ...'
   call wall_time(wall0)
@@ -262,8 +159,6 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   allocate(tmpval_2(n_points_final_grid))
   allocate(tmpvec_1(n_points_final_grid,3))
   allocate(tmpvec_2(n_points_final_grid,3))
-
-  double precision, allocatable :: tmp_2d(:,:)
   allocate(tmp_2d(mo_num,mo_num))
 
 
@@ -386,13 +281,6 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
   enddo ! i
 
 
-
-
-
-
-
-
-
   ! purely open-shell part 
   if(Ne(2) < Ne(1)) then
     do ii = Ne(2) + 1, Ne(1)
@@ -510,146 +398,192 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
     enddo !i
   endif
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
   deallocate(tmp_3d)
   deallocate(tmp1, tmp2)
   deallocate(tmpval_1, tmpval_2)
   deallocate(tmpvec_1, tmpvec_2)
 
-
-
-
-
-
-
-
-
   no_aba_contraction = -0.5d0 * no_aba_contraction
   call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
 
-!  do h1 = 1, mo_num
-!    do p1 = 1, mo_num
-!      do h2 = 1, mo_num
-!        do p2 = 1, mo_num
-!          no_aba_contraction(p2,h2,p1,h1) = -0.5d0 * (tmp_4d(p2,h2,p1,h1) + tmp_4d(p1,h1,p2,h2))
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-
-
-  ! ---
-
-  double precision :: integral, int_direct, int_exc_13, int_exc_12
-
-!  no_aba_contraction = 0.d0
-!
-!  ! purely closed shell part 
-!  do ii = 1, Ne(2)
-!    i = occ(ii,1)
-!
-!    !$OMP PARALLEL                                                               &
-!    !$OMP DEFAULT (NONE)                                                         &
-!    !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
-!    !$OMP SHARED (mo_num, i, no_aba_contraction)
-!    !$OMP DO SCHEDULE (static) 
-!    do h1 = 1, mo_num
-!      do p1 = 1, mo_num
-!        do h2 = 1, mo_num
-!          do p2 = 1, mo_num
-!
-!            call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!            int_direct = -1.d0 * integral
-!
-!            call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!            int_exc_13 = -1.d0 * integral
-!
-!            call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!            int_exc_12 = -1.d0 * integral
-!
-!            !no_aba_contraction(p2,h2,p1,h1) += 1.d0 * int_direct - 0.5d0 * (int_exc_13 + int_exc_12)
-!          enddo
-!        enddo
-!      enddo
-!    enddo
-!    !$OMP END DO
-!    !$OMP END PARALLEL
-!  enddo
-
-!  ! purely open-shell part 
-!  if(Ne(2) < Ne(1)) then
-!
-!    do ii = Ne(2) + 1, Ne(1)
-!      i = occ(ii,1)
-!
-!      !$OMP PARALLEL                                                               &
-!      !$OMP DEFAULT (NONE)                                                         &
-!      !$OMP PRIVATE (h1, h2, p1, p2, int_direct, int_exc_13, int_exc_12, integral) & 
-!      !$OMP SHARED (mo_num, i, no_aba_contraction)
-!      !$OMP DO SCHEDULE (static) 
-!      do h1 = 1, mo_num
-!        do p1 = 1, mo_num
-!          do h2 = 1, mo_num
-!            do p2 = 1, mo_num
-!
-!              call give_integrals_3_body_bi_ort(i, p2, p1, i, h2, h1, integral)
-!              int_direct = -1.d0 * integral
-!           
-!              call give_integrals_3_body_bi_ort(p1, p2, i, i, h2, h1, integral)
-!              int_exc_13 = -1.d0 * integral
-!           
-!              call give_integrals_3_body_bi_ort(p2, i, p1, i, h2, h1, integral)
-!              int_exc_12 = -1.d0 * integral
-!           
-!              no_aba_contraction(p2,h2,p1,h1) += 0.5d0 * int_direct - 0.25d0 * (int_exc_13 + int_exc_12)
-!            enddo
-!          enddo
-!        enddo
-!      enddo
-!      !$OMP END DO
-!      !$OMP END PARALLEL
-!    enddo
-!  endif
-
-  ! ---
-
-!  !$OMP PARALLEL                                         &
-!  !$OMP DEFAULT (NONE)                                   &
-!  !$OMP PRIVATE (h1, h2, p1, p2, integral)               & 
-!  !$OMP SHARED (mo_num, N_int,Ne, occ, no_aba_contraction)
-!  !$OMP DO SCHEDULE (static) 
-!  do h1 = 1, mo_num
-!    do p1 = 1, mo_num
-!      do h2 = 1, mo_num
-!        do p2 = 1, mo_num
-!          call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, integral)
-!          no_aba_contraction(p2,h2,p1,h1) = 0.5d0 * integral
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
-
+  call wall_time(wall1)
+  print*,' Wall time for no_aba_contraction', wall1-wall0
 
 END_PROVIDER
 
 ! ---
 
+BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
 
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aab_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+  allocate(tmp_2d(mo_num,mo_num))
+
+
+  ! purely closed shell part 
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    ! to avoid tmp(N^4)
+    do h1 = 1, mo_num
+
+      ! to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpvec_1)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpvec_1, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4)
+      do p1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  deallocate(tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1)
+  deallocate(tmpvec_1)
+
+  no_aab_contraction = 0.5d0 * no_aab_contraction
+  call sub_A_At(no_aab_contraction(1,1,1,1), mo_num*mo_num)
+
+  do h1 = 1, mo_num-1
+    do h2 = h1+1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+        enddo
+      enddo
+    enddo
+  enddo
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aab_contraction', wall1-wall0
+
+
+END_PROVIDER
+
+! ---
diff --git a/src/utils/util.irp.f b/src/utils/util.irp.f
index aba99c2b..a9f1a438 100644
--- a/src/utils/util.irp.f
+++ b/src/utils/util.irp.f
@@ -490,7 +490,7 @@ end subroutine check_sym
 subroutine sum_A_At(A, N)
 
   !BEGIN_DOC
-  ! useful for symmetrizing a tensor without a temporary tensor
+  ! add a tensor with its transpose without a temporary tensor
   !END_DOC
 
   implicit none
@@ -521,3 +521,38 @@ subroutine sum_A_At(A, N)
 
 end
 
+! ---
+
+subroutine sub_A_At(A, N)
+
+  !BEGIN_DOC
+  ! substruct a tensor with its transpose without a temporary tensor
+  !END_DOC
+
+  implicit none
+  integer,          intent(in)    :: N
+  double precision, intent(inout) :: A(N,N)
+  integer                         :: i, j
+
+ !$OMP PARALLEL       &
+ !$OMP DEFAULT (NONE) &
+ !$OMP PRIVATE (i, j) & 
+ !$OMP SHARED (A, N)
+ !$OMP DO 
+  do j = 1, N
+    do i = j, N
+      A(i,j) -= A(j,i)
+    enddo
+  enddo
+ !$OMP END DO
+
+ !$OMP DO 
+  do j = 2, N
+    do i = 1, j-1
+      A(i,j) = -A(j,i)
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+end

From b0da0ac04d49b3fbbbe0eb9649b0b6da87cce6d2 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 16:08:46 +0200
Subject: [PATCH 15/18] normal ordering: aab-DGEMM OK

---
 src/tc_bi_ortho/normal_ordered.irp.f | 73 +++++++++++++++++++---------
 1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index b3c413d3..3a1e79fd 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -230,11 +230,11 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
 
         ! to minimize the number of operations
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT (NONE)                                            &
-        !$OMP PRIVATE (ipoint)                                          &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
         !$OMP         tmpval_1)
         !$OMP DO
         do ipoint = 1, n_points_final_grid
@@ -398,7 +398,7 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
     enddo !i
   endif
 
-  deallocate(tmp_3d)
+  deallocate(tmp_2d, tmp_3d)
   deallocate(tmp1, tmp2)
   deallocate(tmpval_1, tmpval_2)
   deallocate(tmpvec_1, tmpvec_2)
@@ -446,12 +446,12 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
     call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
   endif
 
+  allocate(tmp_2d(mo_num,mo_num))
   allocate(tmp_3d(mo_num,mo_num,mo_num))
   allocate(tmp1(n_points_final_grid,3,mo_num))
   allocate(tmp2(n_points_final_grid,mo_num))
   allocate(tmpval_1(n_points_final_grid))
   allocate(tmpvec_1(n_points_final_grid,3))
-  allocate(tmp_2d(mo_num,mo_num))
 
 
   ! purely closed shell part 
@@ -471,10 +471,10 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       !$OMP         tmpval_1, tmpvec_1)
       !$OMP DO
       do ipoint = 1, n_points_final_grid
-        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
-        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
       enddo
       !$OMP END DO
       !$OMP END PARALLEL
@@ -515,17 +515,17 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       do p1 = 1, mo_num
 
         ! to minimize the number of operations
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT (NONE)                                            &
-        !$OMP PRIVATE (ipoint)                                          &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
         !$OMP         tmpval_1)
         !$OMP DO
         do ipoint = 1, n_points_final_grid
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
         enddo
         !$OMP END DO
         !$OMP END PARALLEL
@@ -567,9 +567,38 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
   deallocate(tmpval_1)
   deallocate(tmpvec_1)
 
-  no_aab_contraction = 0.5d0 * no_aab_contraction
-  call sub_A_At(no_aab_contraction(1,1,1,1), mo_num*mo_num)
+  no_aab_contraction = -0.5d0 * no_aab_contraction
 
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (h1, h2, p1, p2) & 
+  !$OMP SHARED (no_aab_contraction, mo_num)
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 1, mo_num
+        do p2 = p1, mo_num
+          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
   do h1 = 1, mo_num-1
     do h2 = h1+1, mo_num
       do p1 = 2, mo_num
@@ -579,11 +608,11 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
       enddo
     enddo
   enddo
+  !$OMP END PARALLEL
 
   call wall_time(wall1)
   print*,' Wall time for no_aab_contraction', wall1-wall0
 
-
 END_PROVIDER
 
 ! ---

From aafca191f1fe271575f80f5d16eb80587290213c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Mon, 5 Jun 2023 20:59:34 +0200
Subject: [PATCH 16/18] normal ordering: aaa-DGEMM OK

---
 src/tc_bi_ortho/normal_ordered.irp.f     | 593 +++++++++++++++++++++--
 src/tc_bi_ortho/normal_ordered_old.irp.f |   4 +-
 2 files changed, 568 insertions(+), 29 deletions(-)

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index 3a1e79fd..fea229c9 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -47,12 +47,13 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
     PROVIDE no_aba_contraction
     PROVIDE no_aab_contraction
+    PROVIDE no_aaa_contraction
 
     !$OMP PARALLEL                                                              &
     !$OMP DEFAULT (NONE)                                                        &
     !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
     !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
-    !$OMP         no_aba_contraction,no_aab_contraction)
+    !$OMP         no_aba_contraction, no_aab_contraction, no_aaa_contraction)
     !$OMP DO SCHEDULE (static) 
     do hh1 = 1, n_act_orb
       h1 = list_act(hh1) 
@@ -66,33 +67,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
           do pp2 = 1, n_act_orb
             p2 = list_act(pp2)
 
-            ! all contributions from the 3-e terms to the double excitations 
-            ! s1:(h1-->p1), s2:(h2-->p2) from the HF reference determinant 
-
-            ! same spin double excitations : s1 == s2 
-            if((h1 < h2) .and. (p1 > p2)) then
-
-              ! same spin double excitations with same spin contributions 
-              if(Ne(2) .ge. 3) then
-                call give_aaa_contraction(N_int, h2, h1, p1, p2, Ne, occ, hthree_aaa) ! exchange h1<->h2
-              else
-                hthree_aaa = 0.d0
-              endif
-
-            else
-
-              if(Ne(2) .ge. 3) then
-                ! same spin double excitations with same spin contributions 
-                call give_aaa_contraction(N_int, h1, h2, p1, p2, Ne, occ, hthree_aaa)
-              else
-                hthree_aaa = 0.d0
-              endif
-
-            endif
-
-            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) &
-                                                 + no_aab_contraction(p2,h2,p1,h1) &
-                                                 + 0.5d0 * hthree_aaa
+            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + no_aab_contraction(p2,h2,p1,h1) + no_aaa_contraction(p2,h2,p1,h1)
           enddo
         enddo
       enddo
@@ -616,3 +591,565 @@ BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_
 END_PROVIDER
 
 ! ---
+
+BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  !
+  ! if:
+  !    h1 < h2
+  !    p1 > p2
+  !
+  !   no_aaa_contraction(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
+  !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! else:
+  !
+  !   no_aaa_contraction(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! 
+  ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
+  ! J(p2,h2,p1,h1) = \sum_i [ <  i p2 p1 | i h2 h1 >
+  !                         + < p2 p1  i | i h2 h1 >
+  !                         + < p1  i p2 | i h2 h1 > ]
+  !
+  !
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+
+  print*,' Providing no_aaa_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  if(Ne(2) .lt. 3) then
+
+    no_aaa_contraction = 0.d0
+
+  else
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmp3(n_points_final_grid,3,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmpvec_3(n_points_final_grid,3))
+
+    ! purely closed shell part 
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      ! to avoid tmp(N^4)
+      do h1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpvec_1, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p2, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_2, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , 0.d0, tmp_3d(1,1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        ! to avoid tmp(N^4)
+        do p1 = 1, mo_num
+
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                &
+          !$OMP DEFAULT (NONE)                          &
+          !$OMP PRIVATE (h2, ipoint)                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, &
+          !$OMP         mos_r_in_r_array_transp,        &
+          !$OMP         int2_grad1_u12_bimo_t,          &
+          !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                    &
+          !$OMP DEFAULT (NONE)                              &
+          !$OMP PRIVATE (p2, ipoint)                        &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+          !$OMP         int2_grad1_u12_bimo_t,              &
+          !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp3(1,1,1), 3*n_points_final_grid                    &
+                    , tmp1(1,1,1), 3*n_points_final_grid                    &
+                    , 1.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+
+        ! to avoid tmp(N^4)
+        do h1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1,                       &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p1, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_1, tmpvec_1, tmp1)
+          !$OMP DO 
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p2, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_2, tmpvec_2, tmp1)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          ! to avoid tmp(N^4)
+          do p1 = 1, mo_num
+
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+            !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            !$OMP PARALLEL                                &
+            !$OMP DEFAULT (NONE)                          &
+            !$OMP PRIVATE (h2, ipoint)                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, &
+            !$OMP         mos_r_in_r_array_transp,        &
+            !$OMP         int2_grad1_u12_bimo_t,          &
+            !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+            !$OMP DO 
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            !$OMP PARALLEL                                    &
+            !$OMP DEFAULT (NONE)                              &
+            !$OMP PRIVATE (p2, ipoint)                        &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+            !$OMP         int2_grad1_u12_bimo_t,              &
+            !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+            !$OMP DO 
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                     &
+                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+    endif
+
+    deallocate(tmp_2d, tmp_3d)
+    deallocate(tmp1, tmp2, tmp3)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
+
+    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (no_aaa_contraction, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aaa_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
diff --git a/src/tc_bi_ortho/normal_ordered_old.irp.f b/src/tc_bi_ortho/normal_ordered_old.irp.f
index 553cafdb..417580dd 100644
--- a/src/tc_bi_ortho/normal_ordered_old.irp.f
+++ b/src/tc_bi_ortho/normal_ordered_old.irp.f
@@ -89,6 +89,7 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth_old, (mo_num, mo_num,
                hthree_aaa = 0.d0
              endif
             endif
+
             normal_two_body_bi_orth_old(p2,h2,p1,h1) = 0.5d0*(hthree_aba + hthree_aab + hthree_aaa)
           enddo
         enddo
@@ -350,7 +351,8 @@ subroutine give_aaa_contraction(Nint, h1, h2, p1, p2, Ne, occ, hthree)
     call give_integrals_3_body_bi_ort(i, p1, p2, i, h2, h1, integral)
     int_exc_23 = -1.d0 * integral
 
-    hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+    !hthree +=  1.d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
+    hthree +=  0.5d0 * int_direct + 0.5d0 * (int_exc_l + int_exc_ll - (int_exc_12 + int_exc_13 + int_exc_23))
   enddo
 
   return

From 374a88bc624396370660182f6da3d876934b35b9 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 8 Jun 2023 15:51:52 +0200
Subject: [PATCH 17/18] normal ordering with DGEMM: OK

---
 src/tc_bi_ortho/normal_ordered.irp.f          | 1230 ++++++++---------
 .../normal_ordered_contractions.irp.f         | 1062 ++++++++++++++
 2 files changed, 1615 insertions(+), 677 deletions(-)
 create mode 100644 src/tc_bi_ortho/normal_ordered_contractions.irp.f

diff --git a/src/tc_bi_ortho/normal_ordered.irp.f b/src/tc_bi_ortho/normal_ordered.irp.f
index fea229c9..7259c270 100644
--- a/src/tc_bi_ortho/normal_ordered.irp.f
+++ b/src/tc_bi_ortho/normal_ordered.irp.f
@@ -11,16 +11,15 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   implicit none
 
-  integer                        :: i, h1, p1, h2, p2
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
   integer                        :: hh1, hh2, pp1, pp2
   integer                        :: Ne(2)
-  double precision               :: hthree_aaa, hthree_aab
-  double precision               :: wall0, wall1
+  double precision               :: wall0, wall1, walli, wallf
   integer,           allocatable :: occ(:,:)
   integer(bit_kind), allocatable :: key_i_core(:,:)
 
   print*,' Providing normal_two_body_bi_orth ...'
-  call wall_time(wall0)
+  call wall_time(walli)
  
   if(read_tc_norm_ord) then
 
@@ -30,6 +29,11 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
 
   else
 
+    double precision, allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+    double precision, allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+    double precision, allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+    double precision, allocatable :: tmp(:,:,:,:)
+
     PROVIDE N_int
 
     allocate( occ(N_int*bit_kind_size,2) )
@@ -45,224 +49,33 @@ BEGIN_PROVIDER [ double precision, normal_two_body_bi_orth, (mo_num, mo_num, mo_
       call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
     endif
 
-    PROVIDE no_aba_contraction
-    PROVIDE no_aab_contraction
-    PROVIDE no_aaa_contraction
+    allocate(tmp(mo_num,mo_num,mo_num,mo_num))
 
-    !$OMP PARALLEL                                                              &
-    !$OMP DEFAULT (NONE)                                                        &
-    !$OMP PRIVATE (hh1, h1, hh2, h2, pp1, p1, pp2, p2, hthree_aab, hthree_aaa)  & 
-    !$OMP SHARED (N_int, n_act_orb, list_act, Ne, occ, normal_two_body_bi_orth, &
-    !$OMP         no_aba_contraction, no_aab_contraction, no_aaa_contraction)
-    !$OMP DO SCHEDULE (static) 
-    do hh1 = 1, n_act_orb
-      h1 = list_act(hh1) 
+    ! ---
+    ! aba contraction
 
-      do pp1 = 1, n_act_orb
-        p1 = list_act(pp1)
+    print*,' Providing aba_contraction ...'
+    call wall_time(wall0)
 
-        do hh2 = 1, n_act_orb
-          h2 = list_act(hh2) 
+    tmp = 0.d0
 
-          do pp2 = 1, n_act_orb
-            p2 = list_act(pp2)
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmp_2d(mo_num,mo_num))
 
-            normal_two_body_bi_orth(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + no_aab_contraction(p2,h2,p1,h1) + no_aaa_contraction(p2,h2,p1,h1)
-          enddo
-        enddo
-      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    deallocate( occ )
-    deallocate( key_i_core )
-  endif
-
-  if(write_tc_norm_ord.and.mpi_master) then
-    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="write")
-      call ezfio_set_work_empty(.False.)
-      write(11) normal_two_body_bi_orth
-      close(11)
-      call ezfio_set_tc_keywords_io_tc_integ('Read')
-  endif
-
-  call wall_time(wall1)
-  print*,' Wall time for normal_two_body_bi_orth ', wall1-wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i, ii, h1, p1, h2, p2, ipoint
-  integer                        :: Ne(2)
-  double precision               :: wall0, wall1
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision,  allocatable :: tmp_3d(:,:,:)
-  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
-  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
-  double precision,  allocatable :: tmp_2d(:,:)
-
-  print*,' Providing no_aba_contraction ...'
-  call wall_time(wall0)
-
-  PROVIDE N_int
-
-  allocate(occ(N_int*bit_kind_size,2))
-  allocate(key_i_core(N_int,2))
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num))
-  allocate(tmp2(n_points_final_grid,mo_num))
-  allocate(tmpval_1(n_points_final_grid))
-  allocate(tmpval_2(n_points_final_grid))
-  allocate(tmpvec_1(n_points_final_grid,3))
-  allocate(tmpvec_2(n_points_final_grid,3))
-  allocate(tmp_2d(mo_num,mo_num))
-
-
-  ! purely closed shell part 
-  do ii = 1, Ne(2)
-    i = occ(ii,2)
-
-    ! to avoid tmp(N^4)
-    do h1 = 1, mo_num
-
-      ! to minimize the number of operations
-      !$OMP PARALLEL                                                  &
-      !$OMP DEFAULT (NONE)                                            &
-      !$OMP PRIVATE (ipoint)                                          &
-      !$OMP SHARED (n_points_final_grid, i, h1,                       &
-      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
-      !$OMP DO
-      do ipoint = 1, n_points_final_grid
-        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
-        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
-        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
-        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      !$OMP PARALLEL                                                &
-      !$OMP DEFAULT (NONE)                                          &
-      !$OMP PRIVATE (p1, ipoint)                                    &
-      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
-      !$OMP DO 
-      do p1 = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
-                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
-          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
-                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
-          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
-                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
-        enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
-                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
-                , tmp1(1,1,1), 3*n_points_final_grid                           &
-                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
-
-      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
-      do p1 = 1, mo_num
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
-          enddo
-        enddo
-      enddo
-      !$OMP END PARALLEL DO
+    ! purely closed shell part 
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
 
       ! to avoid tmp(N^4)
-      do p1 = 1, mo_num
-
-        ! to minimize the number of operations
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT (NONE)                                           &
-        !$OMP PRIVATE (ipoint)                                         &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-        !$OMP         tmpval_1)
-        !$OMP DO
-        do ipoint = 1, n_points_final_grid
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
-                                                                - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
-                                                                - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
-                                                                - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        !$OMP PARALLEL                             &
-        !$OMP DEFAULT (NONE)                       &
-        !$OMP PRIVATE (h2, ipoint)                 &
-        !$OMP SHARED (mo_num, n_points_final_grid, &
-        !$OMP         mos_r_in_r_array_transp,     &
-        !$OMP         tmpval_1, tmp2)
-        !$OMP DO 
-        do h2 = 1, mo_num
-          do ipoint = 1, n_points_final_grid
-            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
-          enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
-                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
-                  , tmp2(1,1), n_points_final_grid                      &
-                  , 0.d0, tmp_2d(1,1), mo_num)
-
-        !$OMP PARALLEL DO PRIVATE(h2,p2)
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
-          enddo
-        enddo
-        !$OMP END PARALLEL DO
-
-      enddo ! p1
-    enddo ! h1
-  enddo ! i
-
-
-  ! purely open-shell part 
-  if(Ne(2) < Ne(1)) then
-    do ii = Ne(2) + 1, Ne(1)
-      i = occ(ii,1)
-
       do h1 = 1, mo_num
 
+        ! to minimize the number of operations
         !$OMP PARALLEL                                                  &
         !$OMP DEFAULT (NONE)                                            &
         !$OMP PRIVATE (ipoint)                                          &
@@ -304,29 +117,30 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP END DO
         !$OMP END PARALLEL
 
-        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
-                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
-                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
                   , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
         !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
         !$OMP END PARALLEL DO
 
+        ! to avoid tmp(N^4)
         do p1 = 1, mo_num
 
           ! to minimize the number of operations
-          !$OMP PARALLEL                                                  &
-          !$OMP DEFAULT (NONE)                                            &
-          !$OMP PRIVATE (ipoint)                                          &
-          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP PARALLEL                                                 &
+          !$OMP DEFAULT (NONE)                                           &
+          !$OMP PRIVATE (ipoint)                                         &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
           !$OMP         tmpval_1)
           !$OMP DO
           do ipoint = 1, n_points_final_grid
@@ -355,313 +169,171 @@ BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP END DO
           !$OMP END PARALLEL
 
-          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
-                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
-                    , tmp2(1,1), n_points_final_grid                       &
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
                     , 0.d0, tmp_2d(1,1), mo_num)
 
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
 
         enddo ! p1
       enddo ! h1
-    enddo !i
-  endif
+    enddo ! i
 
-  deallocate(tmp_2d, tmp_3d)
-  deallocate(tmp1, tmp2)
-  deallocate(tmpval_1, tmpval_2)
-  deallocate(tmpvec_1, tmpvec_2)
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
 
-  no_aba_contraction = -0.5d0 * no_aba_contraction
-  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+        do h1 = 1, mo_num
 
-  call wall_time(wall1)
-  print*,' Wall time for no_aba_contraction', wall1-wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i, ii, h1, p1, h2, p2, ipoint
-  integer                        :: Ne(2)
-  double precision               :: wall0, wall1
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision,  allocatable :: tmp_3d(:,:,:)
-  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
-  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
-  double precision,  allocatable :: tmp_2d(:,:)
-
-  print*,' Providing no_aab_contraction ...'
-  call wall_time(wall0)
-
-  PROVIDE N_int
-
-  allocate(occ(N_int*bit_kind_size,2))
-  allocate(key_i_core(N_int,2))
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  allocate(tmp_2d(mo_num,mo_num))
-  allocate(tmp_3d(mo_num,mo_num,mo_num))
-  allocate(tmp1(n_points_final_grid,3,mo_num))
-  allocate(tmp2(n_points_final_grid,mo_num))
-  allocate(tmpval_1(n_points_final_grid))
-  allocate(tmpvec_1(n_points_final_grid,3))
-
-
-  ! purely closed shell part 
-  do ii = 1, Ne(2)
-    i = occ(ii,2)
-
-    ! to avoid tmp(N^4)
-    do h1 = 1, mo_num
-
-      ! to minimize the number of operations
-      !$OMP PARALLEL                                                  &
-      !$OMP DEFAULT (NONE)                                            &
-      !$OMP PRIVATE (ipoint)                                          &
-      !$OMP SHARED (n_points_final_grid, i, h1,                       &
-      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-      !$OMP         tmpval_1, tmpvec_1)
-      !$OMP DO
-      do ipoint = 1, n_points_final_grid
-        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      !$OMP PARALLEL                                                &
-      !$OMP DEFAULT (NONE)                                          &
-      !$OMP PRIVATE (p1, ipoint)                                    &
-      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-      !$OMP         tmpval_1, tmpvec_1, tmp1)
-      !$OMP DO 
-      do p1 = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
-          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
-          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
-        enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-
-      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
-                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
-                , tmp1(1,1,1), 3*n_points_final_grid                           &
-                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
-
-      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
-      do p1 = 1, mo_num
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
-          enddo
-        enddo
-      enddo
-      !$OMP END PARALLEL DO
-
-      ! to avoid tmp(N^4)
-      do p1 = 1, mo_num
-
-        ! to minimize the number of operations
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT (NONE)                                           &
-        !$OMP PRIVATE (ipoint)                                         &
-        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
-        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
-        !$OMP         tmpval_1)
-        !$OMP DO
-        do ipoint = 1, n_points_final_grid
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        !$OMP PARALLEL                             &
-        !$OMP DEFAULT (NONE)                       &
-        !$OMP PRIVATE (h2, ipoint)                 &
-        !$OMP SHARED (mo_num, n_points_final_grid, &
-        !$OMP         mos_r_in_r_array_transp,     &
-        !$OMP         tmpval_1, tmp2)
-        !$OMP DO 
-        do h2 = 1, mo_num
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1,                       &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+          !$OMP DO
           do ipoint = 1, n_points_final_grid
-            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+            tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
           enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
+          !$OMP END DO
+          !$OMP END PARALLEL
 
-        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
-                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
-                  , tmp2(1,1), n_points_final_grid                      &
-                  , 0.d0, tmp_2d(1,1), mo_num)
-
-        !$OMP PARALLEL DO PRIVATE(h2,p2)
-        do h2 = 1, mo_num
-          do p2 = 1, mo_num
-            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p1, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+          !$OMP DO 
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                                + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+            enddo
           enddo
-        enddo
-        !$OMP END PARALLEL DO
+          !$OMP END DO
+          !$OMP END PARALLEL
 
-      enddo ! p1
-    enddo ! h1
-  enddo ! i
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
-  deallocate(tmp_3d)
-  deallocate(tmp1, tmp2)
-  deallocate(tmpval_1)
-  deallocate(tmpvec_1)
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
 
-  no_aab_contraction = -0.5d0 * no_aab_contraction
+          do p1 = 1, mo_num
 
-  !$OMP PARALLEL                 &
-  !$OMP DEFAULT (NONE)           &
-  !$OMP PRIVATE (h1, h2, p1, p2) & 
-  !$OMP SHARED (no_aab_contraction, mo_num)
+            ! to minimize the number of operations
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1)
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                    - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                    - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                    - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
 
-  !$OMP DO 
-  do h1 = 1, mo_num
-    do h2 = 1, mo_num
-      do p1 = 1, mo_num
-        do p2 = p1, mo_num
-          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
+            !$OMP PARALLEL                             &
+            !$OMP DEFAULT (NONE)                       &
+            !$OMP PRIVATE (h2, ipoint)                 &
+            !$OMP SHARED (mo_num, n_points_final_grid, &
+            !$OMP         mos_r_in_r_array_transp,     &
+            !$OMP         tmpval_1, tmp2)
+            !$OMP DO 
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
 
-  !$OMP DO 
-  do h1 = 1, mo_num
-    do h2 = 1, mo_num
-      do p1 = 2, mo_num
-        do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
 
-  !$OMP DO 
-  do h1 = 1, mo_num-1
-    do h2 = h1+1, mo_num
-      do p1 = 2, mo_num
-        do p2 = 1, p1-1
-          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
 
-  call wall_time(wall1)
-  print*,' Wall time for no_aab_contraction', wall1-wall0
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+    endif
 
-END_PROVIDER
+    deallocate(tmp_3d)
+    deallocate(tmp1)
+    deallocate(tmp2)
+    deallocate(tmpval_1)
+    deallocate(tmpval_2)
+    deallocate(tmpvec_1)
+    deallocate(tmpvec_2)
+    deallocate(tmp_2d)
 
-! ---
+    tmp = -0.5d0 * tmp
+    call sum_A_At(tmp(1,1,1,1), mo_num*mo_num)
 
-BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+    call wall_time(wall1)
+    print*,' Wall time for aba_contraction', wall1-wall0
 
-  BEGIN_DOC
-  !
-  ! if:
-  !    h1 < h2
-  !    p1 > p2
-  !
-  !   no_aaa_contraction(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
-  !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
-  !
-  ! else:
-  !
-  !   no_aaa_contraction(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
-  !
-  ! 
-  ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
-  ! J(p2,h2,p1,h1) = \sum_i [ <  i p2 p1 | i h2 h1 >
-  !                         + < p2 p1  i | i h2 h1 >
-  !                         + < p1  i p2 | i h2 h1 > ]
-  !
-  !
-  END_DOC
+    normal_two_body_bi_orth = tmp
 
-  use bitmasks ! you need to include the bitmasks_module.f90 features
+    ! ---
+    ! aab contraction
 
-  implicit none
-  integer                        :: i, ii, h1, p1, h2, p2, ipoint
-  integer                        :: Ne(2)
-  double precision               :: wall0, wall1
-  integer,           allocatable :: occ(:,:)
-  integer(bit_kind), allocatable :: key_i_core(:,:)
-  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
-  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
-  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+    print*,' Providing aab_contraction ...'
+    call wall_time(wall0)
 
-  print*,' Providing no_aaa_contraction ...'
-  call wall_time(wall0)
-
-  PROVIDE N_int
-
-  allocate(occ(N_int*bit_kind_size,2))
-  allocate(key_i_core(N_int,2))
-
-  if(core_tc_op) then
-    do i = 1, N_int
-      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
-      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
-    enddo
-    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
-  else
-    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
-  endif
-
-  if(Ne(2) .lt. 3) then
-
-    no_aaa_contraction = 0.d0
-
-  else
+    tmp = 0.d0
 
     allocate(tmp_2d(mo_num,mo_num))
     allocate(tmp_3d(mo_num,mo_num,mo_num))
     allocate(tmp1(n_points_final_grid,3,mo_num))
     allocate(tmp2(n_points_final_grid,mo_num))
-    allocate(tmp3(n_points_final_grid,3,mo_num))
     allocate(tmpval_1(n_points_final_grid))
-    allocate(tmpval_2(n_points_final_grid))
     allocate(tmpvec_1(n_points_final_grid,3))
-    allocate(tmpvec_2(n_points_final_grid,3))
-    allocate(tmpvec_3(n_points_final_grid,3))
 
     ! purely closed shell part 
     do ii = 1, Ne(2)
@@ -677,21 +349,13 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         !$OMP SHARED (n_points_final_grid, i, h1,                       &
         !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
         !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+        !$OMP         tmpval_1, tmpvec_1)
         !$OMP DO
         do ipoint = 1, n_points_final_grid
-
-          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-
-          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
-
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
           tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
           tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
           tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
-
-          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
-          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
-          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
         enddo
         !$OMP END DO
         !$OMP END PARALLEL
@@ -722,39 +386,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         do p1 = 1, mo_num
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
-            enddo
-          enddo
-        enddo
-        !$OMP END PARALLEL DO
-
-        !$OMP PARALLEL                                                &
-        !$OMP DEFAULT (NONE)                                          &
-        !$OMP PRIVATE (p2, ipoint)                                    &
-        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
-        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
-        !$OMP         tmpval_2, tmpvec_2, tmp1)
-        !$OMP DO 
-        do p2 = 1, mo_num
-          do ipoint = 1, n_points_final_grid
-            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
-            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
-            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
-          enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-
-        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
-                  , tmp1(1,1,1), 3*n_points_final_grid                           &
-                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
-                  , 0.d0, tmp_3d(1,1,1), mo_num)
-
-        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
-        do p1 = 1, mo_num
-          do h2 = 1, mo_num
-            do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
             enddo
           enddo
         enddo
@@ -763,58 +395,32 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
         ! to avoid tmp(N^4)
         do p1 = 1, mo_num
 
-          !$OMP PARALLEL                                                  &
-          !$OMP DEFAULT (NONE)                                            &
-          !$OMP PRIVATE (ipoint)                                          &
-          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
-          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                 &
+          !$OMP DEFAULT (NONE)                                           &
+          !$OMP PRIVATE (ipoint)                                         &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+          !$OMP         tmpval_1)
           !$OMP DO
           do ipoint = 1, n_points_final_grid
-
-            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
-                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
-                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
-                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
-
-            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
-
-            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
-            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
-            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
-
-            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
-            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
-            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
-
-            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
           enddo
           !$OMP END DO
           !$OMP END PARALLEL
 
-          !$OMP PARALLEL                                &
-          !$OMP DEFAULT (NONE)                          &
-          !$OMP PRIVATE (h2, ipoint)                    &
-          !$OMP SHARED (mo_num, n_points_final_grid, i, &
-          !$OMP         mos_r_in_r_array_transp,        &
-          !$OMP         int2_grad1_u12_bimo_t,          &
-          !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
           !$OMP DO 
           do h2 = 1, mo_num
             do ipoint = 1, n_points_final_grid
-
-              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
-                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
-                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
-                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
-
-              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
-              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
-              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
-
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
             enddo
           enddo
           !$OMP END DO
@@ -828,47 +434,7 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP PARALLEL DO PRIVATE(h2,p2)
           do h2 = 1, mo_num
             do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
-            enddo
-          enddo
-          !$OMP END PARALLEL DO
-
-          !$OMP PARALLEL                                    &
-          !$OMP DEFAULT (NONE)                              &
-          !$OMP PRIVATE (p2, ipoint)                        &
-          !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
-          !$OMP         int2_grad1_u12_bimo_t,              &
-          !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
-          !$OMP DO 
-          do p2 = 1, mo_num
-            do ipoint = 1, n_points_final_grid
-
-              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
-                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
-                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
-
-              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
-              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
-              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
-            enddo
-          enddo
-          !$OMP END DO
-          !$OMP END PARALLEL
-
-          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
-                    , tmp2(1,1), n_points_final_grid                      &
-                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
-                    , 0.d0, tmp_2d(1,1), mo_num)
-
-          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
-                    , tmp3(1,1,1), 3*n_points_final_grid                    &
-                    , tmp1(1,1,1), 3*n_points_final_grid                    &
-                    , 1.d0, tmp_2d(1,1), mo_num)
-
-          !$OMP PARALLEL DO PRIVATE(h2,p2)
-          do h2 = 1, mo_num
-            do p2 = 1, mo_num
-              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
             enddo
           enddo
           !$OMP END PARALLEL DO
@@ -877,14 +443,85 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
       enddo ! h1
     enddo ! i
 
+    deallocate(tmp_2d)
+    deallocate(tmp_3d)
+    deallocate(tmp1)
+    deallocate(tmp2)
+    deallocate(tmpval_1)
+    deallocate(tmpvec_1)
 
+    tmp = -0.5d0 * tmp
 
-    ! purely open-shell part 
-    if(Ne(2) < Ne(1)) then
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (tmp, mo_num)
 
-      do ii = Ne(2) + 1, Ne(1)
-        i = occ(ii,1)
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
 
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            tmp(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL
+
+    call wall_time(wall1)
+    print*,' Wall time for aab_contraction', wall1-wall0
+
+    normal_two_body_bi_orth += tmp
+
+    ! ---
+    ! aaa contraction
+
+    if(Ne(2) .ge. 3) then
+
+      print*,' Providing aaa_contraction ...'
+      call wall_time(wall0)
+
+      tmp = 0.d0
+
+      allocate(tmp_2d(mo_num,mo_num))
+      allocate(tmp_3d(mo_num,mo_num,mo_num))
+      allocate(tmp1(n_points_final_grid,3,mo_num))
+      allocate(tmp2(n_points_final_grid,mo_num))
+      allocate(tmp3(n_points_final_grid,3,mo_num))
+      allocate(tmpval_1(n_points_final_grid))
+      allocate(tmpval_2(n_points_final_grid))
+      allocate(tmpvec_1(n_points_final_grid,3))
+      allocate(tmpvec_2(n_points_final_grid,3))
+      allocate(tmpvec_3(n_points_final_grid,3))
+
+      ! purely closed shell part 
+      do ii = 1, Ne(2)
+        i = occ(ii,2)
 
         ! to avoid tmp(N^4)
         do h1 = 1, mo_num
@@ -932,16 +569,16 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP END DO
           !$OMP END PARALLEL
 
-          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
-                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
-                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
                     , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
 
           !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
               enddo
             enddo
           enddo
@@ -964,16 +601,16 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
           !$OMP END DO
           !$OMP END PARALLEL
 
-          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
-                    , tmp1(1,1,1), 3*n_points_final_grid                            &
-                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                           &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
                     , 0.d0, tmp_3d(1,1,1), mo_num)
 
           !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
           do p1 = 1, mo_num
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
               enddo
             enddo
           enddo
@@ -1039,15 +676,15 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP END DO
             !$OMP END PARALLEL
 
-            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
-                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
-                      , tmp2(1,1), n_points_final_grid                       &
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                      , tmp2(1,1), n_points_final_grid                      &
                       , 0.d0, tmp_2d(1,1), mo_num)
 
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
@@ -1074,82 +711,321 @@ BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_
             !$OMP END DO
             !$OMP END PARALLEL
 
-            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
-                      , tmp2(1,1), n_points_final_grid                       &
-                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                      , tmp2(1,1), n_points_final_grid                      &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
                       , 0.d0, tmp_2d(1,1), mo_num)
 
-            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
-                      , tmp3(1,1,1), 3*n_points_final_grid                     &
-                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                    &
+                      , tmp1(1,1,1), 3*n_points_final_grid                    &
                       , 1.d0, tmp_2d(1,1), mo_num)
 
             !$OMP PARALLEL DO PRIVATE(h2,p2)
             do h2 = 1, mo_num
               do p2 = 1, mo_num
-                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
               enddo
             enddo
             !$OMP END PARALLEL DO
 
           enddo ! p1
         enddo ! h1
-      enddo !i
-    endif
+      enddo ! i
 
-    deallocate(tmp_2d, tmp_3d)
-    deallocate(tmp1, tmp2, tmp3)
-    deallocate(tmpval_1, tmpval_2)
-    deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
+      ! purely open-shell part 
+      if(Ne(2) < Ne(1)) then
 
-    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+        do ii = Ne(2) + 1, Ne(1)
+          i = occ(ii,1)
 
-    !$OMP PARALLEL                 &
-    !$OMP DEFAULT (NONE)           &
-    !$OMP PRIVATE (h1, h2, p1, p2) & 
-    !$OMP SHARED (no_aaa_contraction, mo_num)
+          ! to avoid tmp(N^4)
+          do h1 = 1, mo_num
 
-    !$OMP DO 
-    do h1 = 1, mo_num
-      do h2 = 1, mo_num
-        do p1 = 1, mo_num
-          do p2 = p1, mo_num
-            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+            ! to minimize the number of operations
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1,                       &
+            !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            !$OMP PARALLEL                                                &
+            !$OMP DEFAULT (NONE)                                          &
+            !$OMP PRIVATE (p1, ipoint)                                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+            !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+            !$OMP         tmpval_1, tmpvec_1, tmp1)
+            !$OMP DO 
+            do p1 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+                tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+                tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+            do p1 = 1, mo_num
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+                enddo
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            !$OMP PARALLEL                                                &
+            !$OMP DEFAULT (NONE)                                          &
+            !$OMP PRIVATE (p2, ipoint)                                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+            !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+            !$OMP         tmpval_2, tmpvec_2, tmp1)
+            !$OMP DO 
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+                tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+                tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+                tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp1(1,1,1), 3*n_points_final_grid                            &
+                      , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                      , 0.d0, tmp_3d(1,1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+            do p1 = 1, mo_num
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+                enddo
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            ! to avoid tmp(N^4)
+            do p1 = 1, mo_num
+
+              !$OMP PARALLEL                                                  &
+              !$OMP DEFAULT (NONE)                                            &
+              !$OMP PRIVATE (ipoint)                                          &
+              !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+              !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+              !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+              !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+              !$OMP DO
+              do ipoint = 1, n_points_final_grid
+
+                tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                                 ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                 + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+                tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+                tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+                tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+                tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+                tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+                tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+                tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              enddo
+              !$OMP END DO
+              !$OMP END PARALLEL
+
+              !$OMP PARALLEL                                &
+              !$OMP DEFAULT (NONE)                          &
+              !$OMP PRIVATE (h2, ipoint)                    &
+              !$OMP SHARED (mo_num, n_points_final_grid, i, &
+              !$OMP         mos_r_in_r_array_transp,        &
+              !$OMP         int2_grad1_u12_bimo_t,          &
+              !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+              !$OMP DO 
+              do h2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                  + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                  tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                  tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                  tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+                enddo
+              enddo
+              !$OMP END DO
+              !$OMP END PARALLEL
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              !$OMP PARALLEL DO PRIVATE(h2,p2)
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                enddo
+              enddo
+              !$OMP END PARALLEL DO
+
+              !$OMP PARALLEL                                    &
+              !$OMP DEFAULT (NONE)                              &
+              !$OMP PRIVATE (p2, ipoint)                        &
+              !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+              !$OMP         int2_grad1_u12_bimo_t,              &
+              !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+              !$OMP DO 
+              do p2 = 1, mo_num
+                do ipoint = 1, n_points_final_grid
+
+                  tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                  + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                  + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                  tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                  tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                  tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+                enddo
+              enddo
+              !$OMP END DO
+              !$OMP END PARALLEL
+
+              call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                        , tmp2(1,1), n_points_final_grid                       &
+                        , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                        , 0.d0, tmp_2d(1,1), mo_num)
+
+              call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                        , tmp3(1,1,1), 3*n_points_final_grid                     &
+                        , tmp1(1,1,1), 3*n_points_final_grid                     &
+                        , 1.d0, tmp_2d(1,1), mo_num)
+
+              !$OMP PARALLEL DO PRIVATE(h2,p2)
+              do h2 = 1, mo_num
+                do p2 = 1, mo_num
+                  tmp(p2,h2,p1,h1) = tmp(p2,h2,p1,h1) + tmp_2d(p2,h2)
+                enddo
+              enddo
+              !$OMP END PARALLEL DO
+
+            enddo ! p1
+          enddo ! h1
+        enddo !i
+      endif
+
+      deallocate(tmp_2d)
+      deallocate(tmp_3d)
+      deallocate(tmp1)
+      deallocate(tmp2)
+      deallocate(tmp3)
+      deallocate(tmpval_1)
+      deallocate(tmpval_2)
+      deallocate(tmpvec_1)
+      deallocate(tmpvec_2)
+      deallocate(tmpvec_3)
+
+      tmp = -0.5d0 * tmp
+
+      !$OMP PARALLEL                 &
+      !$OMP DEFAULT (NONE)           &
+      !$OMP PRIVATE (h1, h2, p1, p2) & 
+      !$OMP SHARED (tmp, mo_num)
+
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 1, mo_num
+            do p2 = p1, mo_num
+              tmp(p2,h2,p1,h1) -= tmp(p1,h2,p2,h1)
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END DO
+      !$OMP END DO
 
-    !$OMP DO 
-    do h1 = 1, mo_num
-      do h2 = 1, mo_num
-        do p1 = 2, mo_num
-          do p2 = 1, p1-1
-            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+      !$OMP DO 
+      do h1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) = -tmp(p1,h2,p2,h1)
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END DO
+      !$OMP END DO
 
-    !$OMP DO 
-    do h1 = 1, mo_num-1
-      do h2 = h1+1, mo_num
-        do p1 = 2, mo_num
-          do p2 = 1, p1-1
-            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+      !$OMP DO 
+      do h1 = 1, mo_num-1
+        do h2 = h1+1, mo_num
+          do p1 = 2, mo_num
+            do p2 = 1, p1-1
+              tmp(p2,h2,p1,h1) *= -1.d0
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END PARALLEL
+      !$OMP END PARALLEL
 
+      call wall_time(wallf)
+      print*,' Wall time for aaa_contraction', wall1-wall0
+
+      normal_two_body_bi_orth += tmp
+    endif ! Ne(2) .ge. 3
+
+    deallocate(tmp)
+
+  endif ! read_tc_norm_ord
+
+  if(write_tc_norm_ord.and.mpi_master) then
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/normal_two_body_bi_orth', action="write")
+      call ezfio_set_work_empty(.False.)
+      write(11) normal_two_body_bi_orth
+      close(11)
+      call ezfio_set_tc_keywords_io_tc_integ('Read')
   endif
 
-  call wall_time(wall1)
-  print*,' Wall time for no_aaa_contraction', wall1-wall0
+  call wall_time(wallf)
+  print*,' Wall time for normal_two_body_bi_orth ', wallf-walli
 
-END_PROVIDER
+END_PROVIDER 
 
 ! ---
+
diff --git a/src/tc_bi_ortho/normal_ordered_contractions.irp.f b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
new file mode 100644
index 00000000..855cfd17
--- /dev/null
+++ b/src/tc_bi_ortho/normal_ordered_contractions.irp.f
@@ -0,0 +1,1062 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aba_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aba_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpval_2(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+  allocate(tmpvec_2(n_points_final_grid,3))
+  allocate(tmp_2d(mo_num,mo_num))
+
+
+  ! purely closed shell part 
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    ! to avoid tmp(N^4)
+    do h1 = 1, mo_num
+
+      ! to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+        tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                            + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4)
+      do p1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+
+  ! purely open-shell part 
+  if(Ne(2) < Ne(1)) then
+    do ii = Ne(2) + 1, Ne(1)
+      i = occ(ii,1)
+
+      do h1 = 1, mo_num
+
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint, i)
+          tmpval_2(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i, i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint, i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,1) - tmpvec_2(ipoint,1)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,2) - tmpvec_2(ipoint,2)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * (tmpvec_1(ipoint,3) - tmpvec_2(ipoint,3)) &
+                              + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) - tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                  , tmp1(1,1,1), 3*n_points_final_grid                            &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        do p1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1, i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,2, i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                  + int2_grad1_u12_bimo_t(ipoint,3, i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,1,p1,i) * int2_grad1_u12_bimo_t(ipoint,1, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,2,p1,i) * int2_grad1_u12_bimo_t(ipoint,2, i,h1) &
+                                                                  - int2_grad1_u12_bimo_t(ipoint,3,p1,i) * int2_grad1_u12_bimo_t(ipoint,3, i,h1) )
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                             &
+          !$OMP DEFAULT (NONE)                       &
+          !$OMP PRIVATE (h2, ipoint)                 &
+          !$OMP SHARED (mo_num, n_points_final_grid, &
+          !$OMP         mos_r_in_r_array_transp,     &
+          !$OMP         tmpval_1, tmp2)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                    , tmp2(1,1), n_points_final_grid                       &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aba_contraction(p2,h2,p1,h1) = no_aba_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo !i
+  endif
+
+  deallocate(tmp_2d, tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1, tmpval_2)
+  deallocate(tmpvec_1, tmpvec_2)
+
+  no_aba_contraction = -0.5d0 * no_aba_contraction
+  call sum_A_At(no_aba_contraction(1,1,1,1), mo_num*mo_num)
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aba_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aab_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpvec_1(:,:)
+  double precision,  allocatable :: tmp_2d(:,:)
+
+  print*,' Providing no_aab_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  allocate(tmp_2d(mo_num,mo_num))
+  allocate(tmp_3d(mo_num,mo_num,mo_num))
+  allocate(tmp1(n_points_final_grid,3,mo_num))
+  allocate(tmp2(n_points_final_grid,mo_num))
+  allocate(tmpval_1(n_points_final_grid))
+  allocate(tmpvec_1(n_points_final_grid,3))
+
+
+  ! purely closed shell part 
+  do ii = 1, Ne(2)
+    i = occ(ii,2)
+
+    ! to avoid tmp(N^4)
+    do h1 = 1, mo_num
+
+      ! to minimize the number of operations
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint)                                          &
+      !$OMP SHARED (n_points_final_grid, i, h1,                       &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmpval_1, tmpvec_1)
+      !$OMP DO
+      do ipoint = 1, n_points_final_grid
+        tmpval_1(ipoint)   = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+        tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+        tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      !$OMP PARALLEL                                                &
+      !$OMP DEFAULT (NONE)                                          &
+      !$OMP PRIVATE (p1, ipoint)                                    &
+      !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+      !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+      !$OMP         tmpval_1, tmpvec_1, tmp1)
+      !$OMP DO 
+      do p1 = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+          tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+          tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+
+      call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                , tmp1(1,1,1), 3*n_points_final_grid                           &
+                , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+      !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+      do p1 = 1, mo_num
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+          enddo
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! to avoid tmp(N^4)
+      do p1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (ipoint)                                         &
+        !$OMP SHARED (n_points_final_grid, i, h1, p1,                  &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+        !$OMP         tmpval_1)
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                                                                + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                             &
+        !$OMP DEFAULT (NONE)                       &
+        !$OMP PRIVATE (h2, ipoint)                 &
+        !$OMP SHARED (mo_num, n_points_final_grid, &
+        !$OMP         mos_r_in_r_array_transp,     &
+        !$OMP         tmpval_1, tmp2)
+        !$OMP DO 
+        do h2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint) 
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                  , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                  , tmp2(1,1), n_points_final_grid                      &
+                  , 0.d0, tmp_2d(1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(h2,p2)
+        do h2 = 1, mo_num
+          do p2 = 1, mo_num
+            no_aab_contraction(p2,h2,p1,h1) = no_aab_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+      enddo ! p1
+    enddo ! h1
+  enddo ! i
+
+  deallocate(tmp_3d)
+  deallocate(tmp1, tmp2)
+  deallocate(tmpval_1)
+  deallocate(tmpvec_1)
+
+  no_aab_contraction = -0.5d0 * no_aab_contraction
+
+  !$OMP PARALLEL                 &
+  !$OMP DEFAULT (NONE)           &
+  !$OMP PRIVATE (h1, h2, p1, p2) & 
+  !$OMP SHARED (no_aab_contraction, mo_num)
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 1, mo_num
+        do p2 = p1, mo_num
+          no_aab_contraction(p2,h2,p1,h1) -= no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
+  do h1 = 1, mo_num
+    do h2 = 1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) = -no_aab_contraction(p1,h2,p2,h1)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+
+  !$OMP DO 
+  do h1 = 1, mo_num-1
+    do h2 = h1+1, mo_num
+      do p1 = 2, mo_num
+        do p2 = 1, p1-1
+          no_aab_contraction(p2,h2,p1,h1) *= -1.d0
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aab_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, no_aaa_contraction, (mo_num,mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  !
+  ! if:
+  !    h1 < h2
+  !    p1 > p2
+  !
+  !   no_aaa_contraction(p2,h2.p1,h1) =  0.5 [Ialpha(p2,h1,p1,h2) + Ibeta(p2,h1,p1,h2)]
+  !                                   = -0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! else:
+  !
+  !   no_aaa_contraction(p2,h2.p1,h1) = 0.5 [Ialpha(p2,h2,p1,h1) + Ibeta(p2,h2,p1,h1)]
+  !
+  ! 
+  ! I(p2,h2,p1,h1) = J(p2,h2,p1,h1) - J(p1,h2,p2,h1)
+  ! J(p2,h2,p1,h1) = \sum_i [ <  i p2 p1 | i h2 h1 >
+  !                         + < p2 p1  i | i h2 h1 >
+  !                         + < p1  i p2 | i h2 h1 > ]
+  !
+  !
+  END_DOC
+
+  use bitmasks ! you need to include the bitmasks_module.f90 features
+
+  implicit none
+  integer                        :: i, ii, h1, p1, h2, p2, ipoint
+  integer                        :: Ne(2)
+  double precision               :: wall0, wall1
+  integer,           allocatable :: occ(:,:)
+  integer(bit_kind), allocatable :: key_i_core(:,:)
+  double precision,  allocatable :: tmp_2d(:,:), tmp_3d(:,:,:)
+  double precision,  allocatable :: tmp1(:,:,:), tmp2(:,:), tmp3(:,:,:)
+  double precision,  allocatable :: tmpval_1(:), tmpval_2(:), tmpvec_1(:,:), tmpvec_2(:,:), tmpvec_3(:,:)
+
+  print*,' Providing no_aaa_contraction ...'
+  call wall_time(wall0)
+
+  PROVIDE N_int
+
+  allocate(occ(N_int*bit_kind_size,2))
+  allocate(key_i_core(N_int,2))
+
+  if(core_tc_op) then
+    do i = 1, N_int
+      key_i_core(i,1) = xor(ref_bitmask(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(ref_bitmask(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, N_int)
+  else
+    call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
+  endif
+
+  if(Ne(2) .lt. 3) then
+
+    no_aaa_contraction = 0.d0
+
+  else
+
+    allocate(tmp_2d(mo_num,mo_num))
+    allocate(tmp_3d(mo_num,mo_num,mo_num))
+    allocate(tmp1(n_points_final_grid,3,mo_num))
+    allocate(tmp2(n_points_final_grid,mo_num))
+    allocate(tmp3(n_points_final_grid,3,mo_num))
+    allocate(tmpval_1(n_points_final_grid))
+    allocate(tmpval_2(n_points_final_grid))
+    allocate(tmpvec_1(n_points_final_grid,3))
+    allocate(tmpvec_2(n_points_final_grid,3))
+    allocate(tmpvec_3(n_points_final_grid,3))
+
+    ! purely closed shell part 
+    do ii = 1, Ne(2)
+      i = occ(ii,2)
+
+      ! to avoid tmp(N^4)
+      do h1 = 1, mo_num
+
+        ! to minimize the number of operations
+        !$OMP PARALLEL                                                  &
+        !$OMP DEFAULT (NONE)                                            &
+        !$OMP PRIVATE (ipoint)                                          &
+        !$OMP SHARED (n_points_final_grid, i, h1,                       &
+        !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+        !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+        !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+        !$OMP DO
+        do ipoint = 1, n_points_final_grid
+
+          tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+          tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+          tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+          tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p1, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_1, tmpvec_1, tmp1)
+        !$OMP DO 
+        do p1 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+            tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+            tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        !$OMP PARALLEL                                                &
+        !$OMP DEFAULT (NONE)                                          &
+        !$OMP PRIVATE (p2, ipoint)                                    &
+        !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+        !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+        !$OMP         tmpval_2, tmpvec_2, tmp1)
+        !$OMP DO 
+        do p2 = 1, mo_num
+          do ipoint = 1, n_points_final_grid
+            tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+            tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+            tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+
+        call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0 &
+                  , tmp1(1,1,1), 3*n_points_final_grid                           &
+                  , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid        &
+                  , 0.d0, tmp_3d(1,1,1), mo_num)
+
+        !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+        do p1 = 1, mo_num
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+            enddo
+          enddo
+        enddo
+        !$OMP END PARALLEL DO
+
+        ! to avoid tmp(N^4)
+        do p1 = 1, mo_num
+
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                             ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                             + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+            tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                &
+          !$OMP DEFAULT (NONE)                          &
+          !$OMP PRIVATE (h2, ipoint)                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, &
+          !$OMP         mos_r_in_r_array_transp,        &
+          !$OMP         int2_grad1_u12_bimo_t,          &
+          !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+          !$OMP DO 
+          do h2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                              + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+              tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+              tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+              tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , mos_l_in_r_array_transp(1,1), n_points_final_grid   &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                    &
+          !$OMP DEFAULT (NONE)                              &
+          !$OMP PRIVATE (p2, ipoint)                        &
+          !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+          !$OMP         int2_grad1_u12_bimo_t,              &
+          !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+
+              tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                              + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                              + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+              tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+              tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+              tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 1.d0 &
+                    , tmp2(1,1), n_points_final_grid                      &
+                    , mos_r_in_r_array_transp(1,1), n_points_final_grid   &
+                    , 0.d0, tmp_2d(1,1), mo_num)
+
+          call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 1.d0 &
+                    , tmp3(1,1,1), 3*n_points_final_grid                    &
+                    , tmp1(1,1,1), 3*n_points_final_grid                    &
+                    , 1.d0, tmp_2d(1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(h2,p2)
+          do h2 = 1, mo_num
+            do p2 = 1, mo_num
+              no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+        enddo ! p1
+      enddo ! h1
+    enddo ! i
+
+
+
+    ! purely open-shell part 
+    if(Ne(2) < Ne(1)) then
+
+      do ii = Ne(2) + 1, Ne(1)
+        i = occ(ii,1)
+
+
+        ! to avoid tmp(N^4)
+        do h1 = 1, mo_num
+
+          ! to minimize the number of operations
+          !$OMP PARALLEL                                                  &
+          !$OMP DEFAULT (NONE)                                            &
+          !$OMP PRIVATE (ipoint)                                          &
+          !$OMP SHARED (n_points_final_grid, i, h1,                       &
+          !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+          !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+          !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2 )
+          !$OMP DO
+          do ipoint = 1, n_points_final_grid
+
+            tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+
+            tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+            tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+            tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+            tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p1, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_1, tmpvec_1, tmp1)
+          !$OMP DO 
+          do p1 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,1) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1)
+              tmp1(ipoint,2,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,2) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1)
+              tmp1(ipoint,3,p1) = mos_l_in_r_array_transp(ipoint,p1) * tmpvec_1(ipoint,3) + tmpval_1(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num*mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , 0.d0, tmp_3d(1,1,1), mo_num*mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,h2,p1)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          !$OMP PARALLEL                                                &
+          !$OMP DEFAULT (NONE)                                          &
+          !$OMP PRIVATE (p2, ipoint)                                    &
+          !$OMP SHARED (mo_num, n_points_final_grid, h1, i,             &
+          !$OMP         mos_l_in_r_array_transp, int2_grad1_u12_bimo_t, &
+          !$OMP         tmpval_2, tmpvec_2, tmp1)
+          !$OMP DO 
+          do p2 = 1, mo_num
+            do ipoint = 1, n_points_final_grid
+              tmp1(ipoint,1,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,1)
+              tmp1(ipoint,2,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,2)
+              tmp1(ipoint,3,p2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p2,i) + mos_l_in_r_array_transp(ipoint,p2) * tmpvec_2(ipoint,3)
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( 'T', 'N', mo_num, mo_num*mo_num, 3*n_points_final_grid, 0.5d0 &
+                    , tmp1(1,1,1), 3*n_points_final_grid                            &
+                    , int2_grad1_u12_bimo_t(1,1,1,1), 3*n_points_final_grid         &
+                    , 0.d0, tmp_3d(1,1,1), mo_num)
+
+          !$OMP PARALLEL DO PRIVATE(p1,h2,p2)
+          do p1 = 1, mo_num
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_3d(p2,p1,h2)
+              enddo
+            enddo
+          enddo
+          !$OMP END PARALLEL DO
+
+          ! to avoid tmp(N^4)
+          do p1 = 1, mo_num
+
+            !$OMP PARALLEL                                                  &
+            !$OMP DEFAULT (NONE)                                            &
+            !$OMP PRIVATE (ipoint)                                          &
+            !$OMP SHARED (n_points_final_grid, i, h1, p1,                   &
+            !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+            !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+            !$OMP         tmpval_1, tmpval_2, tmpvec_1, tmpvec_2, tmpvec_3)
+            !$OMP DO
+            do ipoint = 1, n_points_final_grid
+
+              tmpval_1(ipoint) = final_weight_at_r_vector(ipoint) *                                          &
+                               ( int2_grad1_u12_bimo_t(ipoint,1,i,i) * int2_grad1_u12_bimo_t(ipoint,1,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,2,i,i) * int2_grad1_u12_bimo_t(ipoint,2,p1,h1) &
+                               + int2_grad1_u12_bimo_t(ipoint,3,i,i) * int2_grad1_u12_bimo_t(ipoint,3,p1,h1) )
+
+              tmpval_2(ipoint) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,p1) * mos_r_in_r_array_transp(ipoint,i)
+
+              tmpvec_1(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+              tmpvec_1(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_r_in_r_array_transp(ipoint,h1)
+
+              tmpvec_2(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+              tmpvec_2(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h1) * mos_l_in_r_array_transp(ipoint,p1)
+
+              tmpvec_3(ipoint,1) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,2) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+              tmpvec_3(ipoint,3) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,p1,i) * mos_l_in_r_array_transp(ipoint,i)
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            !$OMP PARALLEL                                &
+            !$OMP DEFAULT (NONE)                          &
+            !$OMP PRIVATE (h2, ipoint)                    &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, &
+            !$OMP         mos_r_in_r_array_transp,        &
+            !$OMP         int2_grad1_u12_bimo_t,          &
+            !$OMP         tmp1, tmp2, tmpval_1, tmpval_2, tmpvec_1)
+            !$OMP DO 
+            do h2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,h2) = mos_r_in_r_array_transp(ipoint,h2) * tmpval_1(ipoint)     & 
+                                + int2_grad1_u12_bimo_t(ipoint,1,i,h2) * tmpvec_1(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,i,h2) * tmpvec_1(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,i,h2) * tmpvec_1(ipoint,3)
+
+                tmp1(ipoint,1,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,i,h2)
+                tmp1(ipoint,2,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,i,h2)
+                tmp1(ipoint,3,h2) = tmpval_2(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,i,h2)
+
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , mos_l_in_r_array_transp(1,1), n_points_final_grid    &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+            !$OMP PARALLEL                                    &
+            !$OMP DEFAULT (NONE)                              &
+            !$OMP PRIVATE (p2, ipoint)                        &
+            !$OMP SHARED (mo_num, n_points_final_grid, i, h1, &
+            !$OMP         int2_grad1_u12_bimo_t,              &
+            !$OMP         tmpvec_2, tmpvec_3, tmp2, tmp3)
+            !$OMP DO 
+            do p2 = 1, mo_num
+              do ipoint = 1, n_points_final_grid
+
+                tmp2(ipoint,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,i) * tmpvec_2(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,p2,h1) * tmpvec_3(ipoint,1) &
+                                + int2_grad1_u12_bimo_t(ipoint,2,p2,i) * tmpvec_2(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,p2,h1) * tmpvec_3(ipoint,2) &
+                                + int2_grad1_u12_bimo_t(ipoint,3,p2,i) * tmpvec_2(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,p2,h1) * tmpvec_3(ipoint,3) 
+
+                tmp3(ipoint,1,p2) = int2_grad1_u12_bimo_t(ipoint,1,p2,h1) 
+                tmp3(ipoint,2,p2) = int2_grad1_u12_bimo_t(ipoint,2,p2,h1) 
+                tmp3(ipoint,3,p2) = int2_grad1_u12_bimo_t(ipoint,3,p2,h1) 
+              enddo
+            enddo
+            !$OMP END DO
+            !$OMP END PARALLEL
+
+            call dgemm( 'T', 'N', mo_num, mo_num, n_points_final_grid, 0.5d0 &
+                      , tmp2(1,1), n_points_final_grid                       &
+                      , mos_r_in_r_array_transp(1,1), n_points_final_grid    &
+                      , 0.d0, tmp_2d(1,1), mo_num)
+
+            call dgemm( 'T', 'N', mo_num, mo_num, 3*n_points_final_grid, 0.5d0 &
+                      , tmp3(1,1,1), 3*n_points_final_grid                     &
+                      , tmp1(1,1,1), 3*n_points_final_grid                     &
+                      , 1.d0, tmp_2d(1,1), mo_num)
+
+            !$OMP PARALLEL DO PRIVATE(h2,p2)
+            do h2 = 1, mo_num
+              do p2 = 1, mo_num
+                no_aaa_contraction(p2,h2,p1,h1) = no_aaa_contraction(p2,h2,p1,h1) + tmp_2d(p2,h2)
+              enddo
+            enddo
+            !$OMP END PARALLEL DO
+
+          enddo ! p1
+        enddo ! h1
+      enddo !i
+    endif
+
+    deallocate(tmp_2d, tmp_3d)
+    deallocate(tmp1, tmp2, tmp3)
+    deallocate(tmpval_1, tmpval_2)
+    deallocate(tmpvec_1, tmpvec_2, tmpvec_3)
+
+    no_aaa_contraction = -0.5d0 * no_aaa_contraction
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (h1, h2, p1, p2) & 
+    !$OMP SHARED (no_aaa_contraction, mo_num)
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 1, mo_num
+          do p2 = p1, mo_num
+            no_aaa_contraction(p2,h2,p1,h1) -= no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num
+      do h2 = 1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) = -no_aaa_contraction(p1,h2,p2,h1)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+
+    !$OMP DO 
+    do h1 = 1, mo_num-1
+      do h2 = h1+1, mo_num
+        do p1 = 2, mo_num
+          do p2 = 1, p1-1
+            no_aaa_contraction(p2,h2,p1,h1) *= -1.d0
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(wall1)
+  print*,' Wall time for no_aaa_contraction', wall1-wall0
+
+END_PROVIDER
+
+! ---

From ee06ddf85e2b3fc83faa25515e80b262a2932aa7 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 8 Jun 2023 15:59:14 +0200
Subject: [PATCH 18/18] free two (3xN_gridxMOxMO) tables in TC-CIPSI

---
 src/fci_tc_bi/fci_tc_bi_ortho.irp.f | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
index f9bda058..3e6f229b 100644
--- a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -63,7 +63,9 @@ subroutine run_cipsi_tc
         call provide_all_three_ints_bi_ortho()
       endif
     endif
-    ! ---
+
+    FREE int2_grad1_u12_bimo_transp int2_grad1_u12_ao_transp
+
     write(json_unit,json_array_open_fmt) 'fci_tc'
 
     if (do_pt2) then
@@ -78,13 +80,16 @@ subroutine run_cipsi_tc
     call json_close
 
   else
+
     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+
     if(elec_alpha_num+elec_beta_num.ge.3)then
       if(three_body_h_tc)then
         call provide_all_three_ints_bi_ortho
       endif
     endif
-    ! ---
+
+    FREE int2_grad1_u12_bimo_transp int2_grad1_u12_ao_transp
 
     call run_slave_cipsi