From 2d06e8fdaefff2905933a0b5ff52e93f8193f0a7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 May 2023 10:52:36 +0200
Subject: [PATCH 01/13] Cholesky false by default

---
 external/qp2-dependencies   | 2 +-
 src/ao_two_e_ints/EZFIO.cfg | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/external/qp2-dependencies b/external/qp2-dependencies
index e0d0e02e..6e23ebac 160000
--- a/external/qp2-dependencies
+++ b/external/qp2-dependencies
@@ -1 +1 @@
-Subproject commit e0d0e02e9f5ece138d1520106954a881ab0b8db2
+Subproject commit 6e23ebac001acae91d1c762ca934e09a9b7d614a
diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg
index caed4698..4ab080ec 100644
--- a/src/ao_two_e_ints/EZFIO.cfg
+++ b/src/ao_two_e_ints/EZFIO.cfg
@@ -22,4 +22,4 @@ ezfio_name: direct
 type: logical
 doc: Perform Cholesky decomposition of AO integrals
 interface: ezfio,provider,ocaml
-default: True
+default: False

From f2ca86ef604b906c3c10032691ac5e0e0ff83b53 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 26 May 2023 11:48:08 +0200
Subject: [PATCH 02/13] Improved cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 25 +++++++++++++++++++++++--
 src/cipsi/selection.irp.f        |  3 +++
 src/mo_two_e_ints/cholesky.irp.f |  1 +
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index d4c201aa..5aab12d9 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -51,8 +51,9 @@ END_PROVIDER
 
  double precision :: integral
  logical, external :: ao_two_e_integral_zero
- !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,k,l, integral) SCHEDULE(dynamic)
+ !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral)
  do l=1,ao_num
+  !$OMP DO SCHEDULE(dynamic)
   do j=1,l
    do k=1,ao_num
     do i=1,k
@@ -65,8 +66,28 @@ END_PROVIDER
     enddo
    enddo
   enddo
+  !$OMP END DO NOWAIT
  enddo
- !$OMP END PARALLEL DO
+ !$OMP END PARALLEL
+
+ !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,l, integral)
+ do l=1,ao_num
+  !$OMP DO SCHEDULE(dynamic)
+  do j=1,l
+   do k=1,ao_num
+    do i=1,k
+     if (ao_two_e_integral_zero(i,j,k,l)) cycle
+     integral = ao_two_e_integral(i,k,j,l)
+     ao_integrals(i,k,j,l) = integral
+     ao_integrals(k,i,j,l) = integral
+     ao_integrals(i,k,l,j) = integral
+     ao_integrals(k,i,l,j) = integral
+    enddo
+   enddo
+  enddo
+  !$OMP END DO NOWAIT
+ enddo
+ !$OMP END PARALLEL
 
  ! Call Lapack
  cholesky_ao_num = cholesky_ao_num_guess
diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 6f40a809..0705d103 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -76,6 +76,8 @@ subroutine select_connected(i_generator,E0,pt2_data,b,subset,csubset)
 
   double precision, allocatable  :: fock_diag_tmp(:,:)
 
+  if (csubset == 0) return
+
   allocate(fock_diag_tmp(2,mo_num+1))
 
   call build_fock_tmp(fock_diag_tmp,psi_det_generators(1,1,i_generator),N_int)
@@ -177,6 +179,7 @@ subroutine select_singles_and_doubles(i_generator,hole_mask,particle_mask,fock_d
   monoAdo = .true.
   monoBdo = .true.
 
+  if (csubset == 0) return
 
   do k=1,N_int
     hole    (k,1) = iand(psi_det_generators(k,1,i_generator), hole_mask(k,1))
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 14d3c696..1706b2ec 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -6,6 +6,7 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_ao_num
 
  integer :: k
 
+ call set_multiple_levels_omp(.False.)
  !$OMP PARALLEL DO PRIVATE(k)
  do k=1,cholesky_ao_num
   call ao_to_mo(cholesky_ao(1,1,k),ao_num,cholesky_mo(1,1,k),mo_num)

From 00be08932321f73b425987f99e4e5fcc685425f9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 09:56:06 +0200
Subject: [PATCH 03/13] Removed duplicate provider in cosgto

---
 src/cosgtos_ao_int/EZFIO.cfg | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/cosgtos_ao_int/EZFIO.cfg b/src/cosgtos_ao_int/EZFIO.cfg
index 8edeecd0..fe57b652 100644
--- a/src/cosgtos_ao_int/EZFIO.cfg
+++ b/src/cosgtos_ao_int/EZFIO.cfg
@@ -10,10 +10,3 @@ doc: If true, use cosgtos for AO integrals
 interface: ezfio,provider,ocaml
 default: False
 
-[ao_integrals_threshold]
-type: Threshold
-doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero
-interface: ezfio,provider,ocaml
-default: 1.e-15
-ezfio_name: threshold_ao
-

From 77186e0560d7febb0cf0f0ce7c6c03f98e1b9d9c Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 11:11:29 +0200
Subject: [PATCH 04/13] Cleaned cosgtos

---
 src/ao_basis/EZFIO.cfg                               | 11 +++++++++++
 src/ao_one_e_ints/NEED                               |  1 -
 .../aos_cosgtos.irp.f                                |  0
 .../one_e_Coul_integrals_cosgtos.irp.f}              |  0
 .../one_e_kin_integrals_cosgtos.irp.f}               |  0
 .../gauss_legendre.irp.f                             |  0
 .../two_e_Coul_integrals_cosgtos.irp.f}              |  0
 src/cosgtos_ao_int/EZFIO.cfg                         | 12 ------------
 src/cosgtos_ao_int/NEED                              |  2 --
 src/cosgtos_ao_int/README.rst                        |  4 ----
 src/cosgtos_ao_int/cosgtos_ao_int.irp.f              |  7 -------
 11 files changed, 11 insertions(+), 26 deletions(-)
 rename src/{cosgtos_ao_int => ao_one_e_ints}/aos_cosgtos.irp.f (100%)
 rename src/{cosgtos_ao_int/one_e_Coul_integrals.irp.f => ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f} (100%)
 rename src/{cosgtos_ao_int/one_e_kin_integrals.irp.f => ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f} (100%)
 rename src/{cosgtos_ao_int => ao_two_e_ints}/gauss_legendre.irp.f (100%)
 rename src/{cosgtos_ao_int/two_e_Coul_integrals.irp.f => ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f} (100%)
 delete mode 100644 src/cosgtos_ao_int/EZFIO.cfg
 delete mode 100644 src/cosgtos_ao_int/NEED
 delete mode 100644 src/cosgtos_ao_int/README.rst
 delete mode 100644 src/cosgtos_ao_int/cosgtos_ao_int.irp.f

diff --git a/src/ao_basis/EZFIO.cfg b/src/ao_basis/EZFIO.cfg
index 51d726da..a203e3f0 100644
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@@ -67,3 +67,14 @@ doc: Use normalized primitive functions
 interface: ezfio, provider
 default: true
 
+[ao_expoim_cosgtos]
+type: double precision
+doc: imag part for Exponents for each primitive of each cosGTOs |AO|
+size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
+interface: ezfio, provider
+
+[use_cosgtos]
+type: logical
+doc: If true, use cosgtos for AO integrals
+interface: ezfio,provider
+default: False
diff --git a/src/ao_one_e_ints/NEED b/src/ao_one_e_ints/NEED
index b9caaf5d..61d23b1e 100644
--- a/src/ao_one_e_ints/NEED
+++ b/src/ao_one_e_ints/NEED
@@ -1,3 +1,2 @@
 ao_basis
 pseudo
-cosgtos_ao_int
diff --git a/src/cosgtos_ao_int/aos_cosgtos.irp.f b/src/ao_one_e_ints/aos_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/aos_cosgtos.irp.f
rename to src/ao_one_e_ints/aos_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/one_e_Coul_integrals.irp.f b/src/ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/one_e_Coul_integrals.irp.f
rename to src/ao_one_e_ints/one_e_Coul_integrals_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/one_e_kin_integrals.irp.f b/src/ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/one_e_kin_integrals.irp.f
rename to src/ao_one_e_ints/one_e_kin_integrals_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/gauss_legendre.irp.f b/src/ao_two_e_ints/gauss_legendre.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/gauss_legendre.irp.f
rename to src/ao_two_e_ints/gauss_legendre.irp.f
diff --git a/src/cosgtos_ao_int/two_e_Coul_integrals.irp.f b/src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
similarity index 100%
rename from src/cosgtos_ao_int/two_e_Coul_integrals.irp.f
rename to src/ao_two_e_ints/two_e_Coul_integrals_cosgtos.irp.f
diff --git a/src/cosgtos_ao_int/EZFIO.cfg b/src/cosgtos_ao_int/EZFIO.cfg
deleted file mode 100644
index fe57b652..00000000
--- a/src/cosgtos_ao_int/EZFIO.cfg
+++ /dev/null
@@ -1,12 +0,0 @@
-[ao_expoim_cosgtos]
-type: double precision
-doc: imag part for Exponents for each primitive of each cosGTOs |AO|
-size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
-interface: ezfio, provider
-
-[use_cosgtos]
-type: logical
-doc: If true, use cosgtos for AO integrals
-interface: ezfio,provider,ocaml
-default: False
-
diff --git a/src/cosgtos_ao_int/NEED b/src/cosgtos_ao_int/NEED
deleted file mode 100644
index 932f88a3..00000000
--- a/src/cosgtos_ao_int/NEED
+++ /dev/null
@@ -1,2 +0,0 @@
-ezfio_files
-ao_basis
diff --git a/src/cosgtos_ao_int/README.rst b/src/cosgtos_ao_int/README.rst
deleted file mode 100644
index 01f25d6d..00000000
--- a/src/cosgtos_ao_int/README.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-==============
-cosgtos_ao_int
-==============
-
diff --git a/src/cosgtos_ao_int/cosgtos_ao_int.irp.f b/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
deleted file mode 100644
index d65dfba5..00000000
--- a/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
+++ /dev/null
@@ -1,7 +0,0 @@
-program cosgtos_ao_int
-  implicit none
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-  print *, 'Hello world'
-end

From d05e4ed0b310fb083bf4318e8c7dee481dda302f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 17:46:07 +0200
Subject: [PATCH 05/13] Fix ao_basis/use_cosgtos not found in EZFIO file

---
 src/ao_basis/EZFIO.cfg     |  3 ++-
 src/ao_basis/cosgtos.irp.f | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 src/ao_basis/cosgtos.irp.f

diff --git a/src/ao_basis/EZFIO.cfg b/src/ao_basis/EZFIO.cfg
index a203e3f0..6ad9b998 100644
--- a/src/ao_basis/EZFIO.cfg
+++ b/src/ao_basis/EZFIO.cfg
@@ -76,5 +76,6 @@ interface: ezfio, provider
 [use_cosgtos]
 type: logical
 doc: If true, use cosgtos for AO integrals
-interface: ezfio,provider
+interface: ezfio
 default: False
+
diff --git a/src/ao_basis/cosgtos.irp.f b/src/ao_basis/cosgtos.irp.f
new file mode 100644
index 00000000..721a3e57
--- /dev/null
+++ b/src/ao_basis/cosgtos.irp.f
@@ -0,0 +1,33 @@
+BEGIN_PROVIDER [ logical, use_cosgtos  ]
+  implicit none
+  BEGIN_DOC
+! If true, use cosgtos for AO integrals
+  END_DOC
+
+  logical                        :: has
+  PROVIDE ezfio_filename
+  if (mpi_master) then
+    call ezfio_has_ao_basis_use_cosgtos(has)
+    if (has) then
+!      write(6,'(A)') '.. >>>>> [ IO READ: use_cosgtos ] <<<<< ..'
+      call ezfio_get_ao_basis_use_cosgtos(use_cosgtos)
+    else
+      use_cosgtos = .False.
+    endif
+  endif
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    call MPI_BCAST( use_cosgtos, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      stop 'Unable to read use_cosgtos with MPI'
+    endif
+  IRP_ENDIF
+
+!  call write_time(6)
+
+END_PROVIDER

From 6971bf186cf020ce66d0bac091d06ae850bd803f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 1 Jun 2023 21:42:02 +0200
Subject: [PATCH 06/13] Accelerated multiply_poly

---
 src/ao_one_e_ints/pot_ao_ints.irp.f          |  21 +-
 src/ao_two_e_ints/two_e_integrals.irp.f      | 136 +------
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f |  52 +--
 src/utils/integration.irp.f                  | 366 +++++++++++++------
 4 files changed, 305 insertions(+), 270 deletions(-)

diff --git a/src/ao_one_e_ints/pot_ao_ints.irp.f b/src/ao_one_e_ints/pot_ao_ints.irp.f
index 928053ad..446bf730 100644
--- a/src/ao_one_e_ints/pot_ao_ints.irp.f
+++ b/src/ao_one_e_ints/pot_ao_ints.irp.f
@@ -455,10 +455,12 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
     do ix=0,nx
       X(ix) *= dble(c)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
     ny=0
     call I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,Y,ny,n_pt_in)
-    call multiply_poly(Y,ny,R1x,2,d,nd)
+!    call multiply_poly(Y,ny,R1x,2,d,nd)
+    call multiply_poly_c2(Y,ny,R1x,d,nd)
   else
     do ix=0,n_pt_in
       X(ix) = 0.d0
@@ -469,7 +471,8 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
     do ix=0,nx
       X(ix) *= dble(a-1)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
 
     nx = nd
     do ix=0,n_pt_in
@@ -479,10 +482,12 @@ recursive subroutine I_x1_pol_mult_one_e(a,c,R1x,R1xp,R2x,d,nd,n_pt_in)
     do ix=0,nx
       X(ix) *= dble(c)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
     ny=0
     call I_x1_pol_mult_one_e(a-1,c,R1x,R1xp,R2x,Y,ny,n_pt_in)
-    call multiply_poly(Y,ny,R1x,2,d,nd)
+!    call multiply_poly(Y,ny,R1x,2,d,nd)
+    call multiply_poly_c2(Y,ny,R1x,d,nd)
   endif
 end
 
@@ -519,7 +524,8 @@ recursive subroutine I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,d,nd,dim)
     do ix=0,nx
       X(ix) *= dble(c-1)
     enddo
-    call multiply_poly(X,nx,R2x,2,d,nd)
+!    call multiply_poly(X,nx,R2x,2,d,nd)
+    call multiply_poly_c2(X,nx,R2x,d,nd)
     ny = 0
     do ix=0,dim
       Y(ix) = 0.d0
@@ -527,7 +533,8 @@ recursive subroutine I_x2_pol_mult_one_e(c,R1x,R1xp,R2x,d,nd,dim)
 
     call I_x1_pol_mult_one_e(0,c-1,R1x,R1xp,R2x,Y,ny,dim)
     if(ny.ge.0)then
-      call multiply_poly(Y,ny,R1xp,2,d,nd)
+!      call multiply_poly(Y,ny,R1xp,2,d,nd)
+      call multiply_poly_c2(Y,ny,R1xp,d,nd)
     endif
   endif
 end
diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f
index 835dc89a..85ff5bcf 100644
--- a/src/ao_two_e_ints/two_e_integrals.irp.f
+++ b/src/ao_two_e_ints/two_e_integrals.irp.f
@@ -975,18 +975,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_10,2,d,nd)
-  if (nx >= 0) then
-    integer :: ib
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_10(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_10(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_10(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_10,d,nd)
 
   nx = nd
   !DIR$ LOOP COUNT(8)
@@ -1009,17 +998,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
     endif
 !    !DIR$ FORCEINLINE
 !    call multiply_poly(X,nx,B_00,2,d,nd)
-    if (nx >= 0) then
-       do ib=0,nx
-           d(ib  ) = d(ib  ) + B_00(0) * X(ib)
-           d(ib+1) = d(ib+1) + B_00(1) * X(ib)
-           d(ib+2) = d(ib+2) + B_00(2) * X(ib)
-       enddo
-
-       do nd = nx+2,0,-1
-         if (d(nd) /= 0.d0) exit
-       enddo
-    endif
+    call multiply_poly_c2(X,nx,B_00,d,nd)
   endif
 
   ny=0
@@ -1038,17 +1017,7 @@ recursive subroutine I_x1_pol_mult_recurs(a,c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(Y,ny,C_00,2,d,nd)
-   if (ny >= 0) then
-     do ib=0,ny
-         d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
-         d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
-         d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
-     enddo
-
-   do nd = ny+2,0,-1
-     if (d(nd) /= 0.d0) exit
-   enddo
-  endif
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
 
 recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
@@ -1088,18 +1057,7 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_00,2,d,nd)
-  if (nx >= 0) then
-    integer                        :: ib
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_00(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_00(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_00(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_00,d,nd)
 
   ny=0
 
@@ -1111,17 +1069,7 @@ recursive subroutine I_x1_pol_mult_a1(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(Y,ny,C_00,2,d,nd)
-  if (ny >= 0) then
-    do ib=0,ny
-      d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
-      d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
-      d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
-    enddo
-
-    do nd = ny+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 
 end
 
@@ -1150,18 +1098,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_10,2,d,nd)
-  if (nx >= 0) then
-    integer :: ib
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_10(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_10(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_10(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_10,d,nd)
 
   nx = nd
   !DIR$ LOOP COUNT(8)
@@ -1181,17 +1118,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(X,nx,B_00,2,d,nd)
-  if (nx >= 0) then
-    do ib=0,nx
-      d(ib  ) = d(ib  ) + B_00(0) * X(ib)
-      d(ib+1) = d(ib+1) + B_00(1) * X(ib)
-      d(ib+2) = d(ib+2) + B_00(2) * X(ib)
-    enddo
-
-    do nd = nx+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(X,nx,B_00,d,nd)
 
   ny=0
   !DIR$ LOOP COUNT(8)
@@ -1203,17 +1130,7 @@ recursive subroutine I_x1_pol_mult_a2(c,B_10,B_01,B_00,C_00,D_00,d,nd,n_pt_in)
 
 !  !DIR$ FORCEINLINE
 !  call multiply_poly(Y,ny,C_00,2,d,nd)
-  if (ny >= 0) then
-    do ib=0,ny
-        d(ib  ) = d(ib  ) + C_00(0) * Y(ib)
-        d(ib+1) = d(ib+1) + C_00(1) * Y(ib)
-        d(ib+2) = d(ib+2) + C_00(2) * Y(ib)
-    enddo
-
-    do nd = ny+2,0,-1
-      if (d(nd) /= 0.d0) exit
-    enddo
-  endif
+  call multiply_poly_c2(Y,ny,C_00,d,nd)
 end
 
 recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
@@ -1262,18 +1179,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
 
 !      !DIR$ FORCEINLINE
 !      call multiply_poly(Y,ny,D_00,2,d,nd)
-      if (ny >= 0) then
-        integer :: ib
-        do ib=0,ny
-            d(ib  ) = d(ib  ) + D_00(0) * Y(ib)
-            d(ib+1) = d(ib+1) + D_00(1) * Y(ib)
-            d(ib+2) = d(ib+2) + D_00(2) * Y(ib)
-        enddo
-
-        do nd = ny+2,0,-1
-          if (d(nd) /= 0.d0) exit
-        enddo
-      endif
+      call multiply_poly_c2(Y,ny,D_00,d,nd)
 
       return
 
@@ -1293,17 +1199,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
 
 !      !DIR$ FORCEINLINE
 !      call multiply_poly(X,nx,B_01,2,d,nd)
-      if (nx >= 0) then
-        do ib=0,nx
-          d(ib  ) = d(ib  ) + B_01(0) * X(ib)
-          d(ib+1) = d(ib+1) + B_01(1) * X(ib)
-          d(ib+2) = d(ib+2) + B_01(2) * X(ib)
-        enddo
-
-        do nd = nx+2,0,-1
-          if (d(nd) /= 0.d0) exit
-        enddo
-      endif
+      call multiply_poly_c2(X,nx,B_01,d,nd)
 
       ny = 0
       !DIR$ LOOP COUNT(6)
@@ -1314,17 +1210,7 @@ recursive subroutine I_x2_pol_mult(c,B_10,B_01,B_00,C_00,D_00,d,nd,dim)
 
 !      !DIR$ FORCEINLINE
 !      call multiply_poly(Y,ny,D_00,2,d,nd)
-      if (ny >= 0) then
-        do ib=0,ny
-            d(ib  ) = d(ib  ) + D_00(0) * Y(ib)
-            d(ib+1) = d(ib+1) + D_00(1) * Y(ib)
-            d(ib+2) = d(ib+2) + D_00(2) * Y(ib)
-        enddo
-
-        do nd = ny+2,0,-1
-          if (d(nd) /= 0.d0) exit
-        enddo
-      endif
+      call multiply_poly_c2(Y,ny,D_00,d,nd)
 
   end select
 end
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index e8b56307..a72cd682 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -4,7 +4,7 @@
 BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num, mo_num)]
 
  BEGIN_DOC
-! matrix element of the -L  three-body operator 
+! matrix element of the -L  three-body operator
 !
 ! notice the -1 sign: in this way three_body_ints_bi_ort can be directly used to compute Slater rules :)
  END_DOC
@@ -12,7 +12,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
  implicit none
  integer          :: i, j, k, l, m, n
  double precision :: integral, wall1, wall0
- character*(128)  :: name_file 
+ character*(128)  :: name_file
 
   three_body_ints_bi_ort = 0.d0
   print *, ' Providing the three_body_ints_bi_ort ...'
@@ -27,12 +27,12 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
 !   call read_array_6_index_tensor(mo_num,three_body_ints_bi_ort,name_file)
 !  else
 
-  !provide x_W_ki_bi_ortho_erf_rk 
+  !provide x_W_ki_bi_ortho_erf_rk
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
 
  !$OMP PARALLEL                       &
  !$OMP DEFAULT (NONE)                 &
- !$OMP PRIVATE (i,j,k,l,m,n,integral) & 
+ !$OMP PRIVATE (i,j,k,l,m,n,integral) &
  !$OMP SHARED (mo_num,three_body_ints_bi_ort)
  !$OMP DO SCHEDULE (dynamic)
   do i = 1, mo_num
@@ -43,7 +43,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
             do n = 1, mo_num
               call give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
 
-              three_body_ints_bi_ort(n,l,k,m,j,i) = -1.d0 * integral 
+              three_body_ints_bi_ort(n,l,k,m,j,i) = -1.d0 * integral
             enddo
           enddo
         enddo
@@ -63,7 +63,7 @@ BEGIN_PROVIDER [ double precision, three_body_ints_bi_ort, (mo_num, mo_num, mo_n
 !  call ezfio_set_three_body_ints_bi_ort_io_three_body_ints_bi_ort("Read")
 ! endif
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -71,7 +71,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
 
   BEGIN_DOC
   !
-  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS 
+  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS
   !
   END_DOC
 
@@ -79,28 +79,30 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
   integer,          intent(in)  :: n, l, k, m, j, i
   double precision, intent(out) :: integral
   integer                       :: ipoint
-  double precision              :: weight
+  double precision              :: weight, tmp
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_t
 
   integral = 0.d0
   do ipoint = 1, n_points_final_grid
-    weight = final_weight_at_r_vector(ipoint)                                                                          
 
-    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+    tmp =     mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
               * ( int2_grad1_u12_bimo_t(ipoint,1,n,m) * int2_grad1_u12_bimo_t(ipoint,1,l,j)    &
                 + int2_grad1_u12_bimo_t(ipoint,2,n,m) * int2_grad1_u12_bimo_t(ipoint,2,l,j)    &
                 + int2_grad1_u12_bimo_t(ipoint,3,n,m) * int2_grad1_u12_bimo_t(ipoint,3,l,j) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+
+    tmp = tmp + mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
               * ( int2_grad1_u12_bimo_t(ipoint,1,n,m) * int2_grad1_u12_bimo_t(ipoint,1,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,2,n,m) * int2_grad1_u12_bimo_t(ipoint,2,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,3,n,m) * int2_grad1_u12_bimo_t(ipoint,3,k,i) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,m) &
+
+    tmp = tmp + mos_l_in_r_array_transp(ipoint,n) * mos_r_in_r_array_transp(ipoint,m) &
               * ( int2_grad1_u12_bimo_t(ipoint,1,l,j) * int2_grad1_u12_bimo_t(ipoint,1,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,2,l,j) * int2_grad1_u12_bimo_t(ipoint,2,k,i)    &
                 + int2_grad1_u12_bimo_t(ipoint,3,l,j) * int2_grad1_u12_bimo_t(ipoint,3,k,i) )
 
+    integral = integral + tmp * final_weight_at_r_vector(ipoint)
   enddo
 
 end subroutine give_integrals_3_body_bi_ort
@@ -111,7 +113,7 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 
   BEGIN_DOC
   !
-  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS 
+  ! < n l k | -L | m j i > with a BI-ORTHONORMAL MOLECULAR ORBITALS
   !
   END_DOC
 
@@ -123,13 +125,13 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 
   integral = 0.d0
   do ipoint = 1, n_points_final_grid
-    weight = final_weight_at_r_vector(ipoint)                                                                          
+    weight = final_weight_at_r_vector(ipoint)
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
 !              * ( x_W_ki_bi_ortho_erf_rk(ipoint,1,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,1,l,j)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,2,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,2,l,j)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,3,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,3,l,j)  )
-!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
 !              * ( x_W_ki_bi_ortho_erf_rk(ipoint,1,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,1,k,i)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,2,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,2,k,i)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,3,n,m) * x_W_ki_bi_ortho_erf_rk(ipoint,3,k,i)  )
@@ -138,11 +140,11 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,2,l,j) * x_W_ki_bi_ortho_erf_rk(ipoint,2,k,i)  &
 !                + x_W_ki_bi_ortho_erf_rk(ipoint,3,l,j) * x_W_ki_bi_ortho_erf_rk(ipoint,3,k,i)  )
 
-!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &
 !              * ( int2_grad1_u12_bimo(1,n,m,ipoint) * int2_grad1_u12_bimo(1,l,j,ipoint)        &
 !                + int2_grad1_u12_bimo(2,n,m,ipoint) * int2_grad1_u12_bimo(2,l,j,ipoint)        &
 !                + int2_grad1_u12_bimo(3,n,m,ipoint) * int2_grad1_u12_bimo(3,l,j,ipoint)        )
-!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) & 
+!    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j) &
 !              * ( int2_grad1_u12_bimo(1,n,m,ipoint) * int2_grad1_u12_bimo(1,k,i,ipoint)        &
 !                + int2_grad1_u12_bimo(2,n,m,ipoint) * int2_grad1_u12_bimo(2,k,i,ipoint)        &
 !                + int2_grad1_u12_bimo(3,n,m,ipoint) * int2_grad1_u12_bimo(3,k,i,ipoint)        )
@@ -151,13 +153,13 @@ subroutine give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, integral)
 !                + int2_grad1_u12_bimo(2,l,j,ipoint) * int2_grad1_u12_bimo(2,k,i,ipoint)        &
 !                + int2_grad1_u12_bimo(3,l,j,ipoint) * int2_grad1_u12_bimo(3,k,i,ipoint)        )
 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i)        & 
+    integral += weight * mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i)        &
               * ( int2_grad1_u12_bimo_transp(n,m,1,ipoint) * int2_grad1_u12_bimo_transp(l,j,1,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,2,ipoint) * int2_grad1_u12_bimo_transp(l,j,2,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,3,ipoint) * int2_grad1_u12_bimo_transp(l,j,3,ipoint) )
-    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j)        & 
+    integral += weight * mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,j)        &
               * ( int2_grad1_u12_bimo_transp(n,m,1,ipoint) * int2_grad1_u12_bimo_transp(k,i,1,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,2,ipoint) * int2_grad1_u12_bimo_transp(k,i,2,ipoint) &
                 + int2_grad1_u12_bimo_transp(n,m,3,ipoint) * int2_grad1_u12_bimo_transp(k,i,3,ipoint) )
@@ -176,7 +178,7 @@ subroutine give_integrals_3_body_bi_ort_ao(n, l, k, m, j, i, integral)
 
   BEGIN_DOC
   !
-  ! < n l k | -L | m j i > with a BI-ORTHONORMAL ATOMIC ORBITALS 
+  ! < n l k | -L | m j i > with a BI-ORTHONORMAL ATOMIC ORBITALS
   !
   END_DOC
 
@@ -188,13 +190,13 @@ subroutine give_integrals_3_body_bi_ort_ao(n, l, k, m, j, i, integral)
 
   integral = 0.d0
   do ipoint = 1, n_points_final_grid
-    weight = final_weight_at_r_vector(ipoint)                                                                          
+    weight = final_weight_at_r_vector(ipoint)
 
-    integral += weight * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,i) & 
+    integral += weight * aos_in_r_array_transp(ipoint,k) * aos_in_r_array_transp(ipoint,i) &
               * ( int2_grad1_u12_ao_t(ipoint,1,n,m) * int2_grad1_u12_ao_t(ipoint,1,l,j)    &
                 + int2_grad1_u12_ao_t(ipoint,2,n,m) * int2_grad1_u12_ao_t(ipoint,2,l,j)    &
                 + int2_grad1_u12_ao_t(ipoint,3,n,m) * int2_grad1_u12_ao_t(ipoint,3,l,j) )
-    integral += weight * aos_in_r_array_transp(ipoint,l) * aos_in_r_array_transp(ipoint,j) & 
+    integral += weight * aos_in_r_array_transp(ipoint,l) * aos_in_r_array_transp(ipoint,j) &
               * ( int2_grad1_u12_ao_t(ipoint,1,n,m) * int2_grad1_u12_ao_t(ipoint,1,k,i)    &
                 + int2_grad1_u12_ao_t(ipoint,2,n,m) * int2_grad1_u12_ao_t(ipoint,2,k,i)    &
                 + int2_grad1_u12_ao_t(ipoint,3,n,m) * int2_grad1_u12_ao_t(ipoint,3,k,i) )
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index b60e3bc1..21179dac 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -56,7 +56,7 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
   !               * [ sum (l_y = 0,i_order(2)) P_new(l_y,2) * (y-P_center(2))^l_y ] exp (- p (y-P_center(2))^2 )
   !               * [ sum (l_z = 0,i_order(3)) P_new(l_z,3) * (z-P_center(3))^l_z ] exp (- p (z-P_center(3))^2 )
   !
-  ! WARNING ::: IF fact_k is too smal then: 
+  ! WARNING ::: IF fact_k is too smal then:
   ! returns a "s" function centered in zero
   ! with an inifinite exponent and a zero polynom coef
   END_DOC
@@ -86,7 +86,7 @@ subroutine give_explicit_poly_and_gaussian(P_new,P_center,p,fact_k,iorder,alpha,
   !DIR$ FORCEINLINE
   call gaussian_product(alpha,A_center,beta,B_center,fact_k,p,P_center)
   if (fact_k < thresh) then
-    ! IF fact_k is too smal then: 
+    ! IF fact_k is too smal then:
     ! returns a "s" function centered in zero
     ! with an inifinite exponent and a zero polynom coef
     P_center = 0.d0
@@ -468,114 +468,6 @@ end subroutine
 
 
 
-subroutine multiply_poly_0c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:0), c(0:nc)
-  double precision, intent(inout) :: d(0:0+nc)
-
-  integer                        :: ic
-
-  do ic = 0,nc
-    d(ic) = d(ic) + c(ic) * b(0)
-  enddo
-
-  do nd = nc,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-subroutine multiply_poly_1c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:1), c(0:nc)
-  double precision, intent(inout) :: d(0:1+nc)
-
-  integer                        :: ic, id
-  if(nc < 0) return
-
-  do ic = 0,nc
-    d(  ic) = d(  ic) + c(ic) * b(0)
-    d(1+ic) = d(1+ic) + c(ic) * b(1)
-  enddo
-
-  do nd = nc+1,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-
-subroutine multiply_poly_2c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:2), c(0:nc)
-  double precision, intent(inout) :: d(0:2+nc)
-
-  integer                        :: ic, id, k
-  if (nc <0) return
-
-  do ic = 0,nc
-    d(  ic) = d(  ic) + c(ic) * b(0)
-    d(1+ic) = d(1+ic) + c(ic) * b(1)
-    d(2+ic) = d(2+ic) + c(ic) * b(2)
-  enddo
-
-  do nd = nc+2,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-subroutine multiply_poly_3c(b,c,nc,d,nd)
-  implicit none
-  BEGIN_DOC
-  ! Multiply two polynomials
-  ! D(t) += B(t)*C(t)
-  END_DOC
-
-  integer, intent(in)            :: nc
-  integer, intent(out)           :: nd
-  double precision, intent(in)   :: b(0:3), c(0:nc)
-  double precision, intent(inout) :: d(0:3+nc)
-
-  integer                        :: ic, id
-  if (nc <0) return
-
-  do ic = 1,nc
-    d(  ic) = d(1+ic) + c(ic) * b(0)
-    d(1+ic) = d(1+ic) + c(ic) * b(1)
-    d(2+ic) = d(1+ic) + c(ic) * b(2)
-    d(3+ic) = d(1+ic) + c(ic) * b(3)
-  enddo
-
-  do nd = nc+3,0,-1
-    if (d(nd) /= 0.d0) exit
-  enddo
-
-end
-
-
-
 subroutine multiply_poly(b,nb,c,nc,d,nd)
   implicit none
   BEGIN_DOC
@@ -604,6 +496,254 @@ subroutine multiply_poly(b,nb,c,nc,d,nd)
 
 end
 
+
+subroutine multiply_poly_b0(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:0), c(0:nc)
+  double precision, intent(inout) :: d(0:nc)
+
+  integer                        :: ndtmp
+  integer                        :: ic, id, k
+  if(nc < 0) return !False if nc>=0
+
+  do ic = 0,nc
+    d(ic) = d(ic) + c(ic) * b(0)
+  enddo
+
+  do nd = nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+subroutine multiply_poly_b1(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:1), c(0:nc)
+  double precision, intent(inout) :: d(0:1+nc)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nc < 0) return !False if nc>=0
+
+
+  select case (nc)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(1) * b(1)
+
+    case default
+      d(0) = d(0) + c(0) * b(0)
+      do ic = 1,nc
+        d(ic) = d(ic) + c(ic) * b(0) + c(ic-1) * b(1)
+      enddo
+      d(nc+1) = d(nc+1) + c(nc) * b(1)
+
+  end select
+
+  do nd = 1+nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_b2(b,c,nc,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nc
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:2), c(0:nc)
+  double precision, intent(inout) :: d(0:2+nc)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nc < 0) return !False if nc>=0
+
+  select case (nc)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1)
+      d(2) = d(2) + c(0) * b(2)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(0) * b(2) + c(1) * b(1)
+      d(3) = d(3) + c(1) * b(2)
+
+    case (2)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
+      d(3) = d(3) + c(2) * b(1) + c(1) * b(2)
+      d(4) = d(4) + c(2) * b(2)
+
+    case default
+
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      do ic = 2,nc
+        d(ic) = d(ic) + c(ic) * b(0) + c(ic-1) * b(1) + c(ic-2) * b(2)
+      enddo
+      d(nc+1) = d(nc+1) + c(nc) * b(1) + c(nc-1) * b(2)
+      d(nc+2) = d(nc+2) + c(nc) * b(2)
+
+  end select
+
+  do nd = 2+nc,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_c0(b,nb,c,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nb
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:0)
+  double precision, intent(inout) :: d(0:nb)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nb < 0) return !False if nb>=0
+
+  do ib=0,nb
+      d(ib) = d(ib) + c(0) * b(ib)
+  enddo
+
+  do nd = nb,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_c1(b,nb,c,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nb
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:1)
+  double precision, intent(inout) :: d(0:nb+1)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nb < 0) return !False if nb>=0
+
+  select case (nb)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(1) * b(0)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(1) * b(1)
+
+    case default
+      d(0) = d(0) + c(0) * b(0)
+      do ib=1,nb
+        d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1)
+      enddo
+      d(nb+1) = d(nb+1) + c(1) * b(nb)
+
+  end select
+
+  do nd = nb+1,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+subroutine multiply_poly_c2(b,nb,c,d,nd)
+  implicit none
+  BEGIN_DOC
+  ! Multiply two polynomials
+  ! D(t) += B(t)*C(t)
+  END_DOC
+
+  integer, intent(in)            :: nb
+  integer, intent(out)           :: nd
+  double precision, intent(in)   :: b(0:nb), c(0:2)
+  double precision, intent(inout) :: d(0:nb+2)
+
+  integer                        :: ndtmp
+  integer                        :: ib, ic, id, k
+  if(nb < 0) return !False if nb>=0
+
+  select case (nb)
+    case (0)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(1) * b(0)
+      d(2) = d(2) + c(2) * b(0)
+
+    case (1)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(1) * b(1) + c(2) * b(0)
+      d(3) = d(3) + c(2) * b(1)
+
+    case (2)
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      d(2) = d(2) + c(0) * b(2) + c(1) * b(1) + c(2) * b(0)
+      d(3) = d(3) + c(1) * b(2) + c(2) * b(1)
+      d(4) = d(4) + c(2) * b(2)
+
+    case default
+      d(0) = d(0) + c(0) * b(0)
+      d(1) = d(1) + c(0) * b(1) + c(1) * b(0)
+      do ib=2,nb
+        d(ib) = d(ib) + c(0) * b(ib) + c(1) * b(ib-1) + c(2) * b(ib-2)
+      enddo
+      d(nb+1) = d(nb+1) + c(1) * b(nb) + c(2) * b(nb-1)
+      d(nb+2) = d(nb+2) + c(2) * b(nb)
+
+  end select
+
+  do nd = nb+2,0,-1
+    if (d(nd) /= 0.d0) exit
+  enddo
+
+end
+
+
+
+
 subroutine multiply_poly_v(b,nb,c,nc,d,nd,n_points)
   implicit none
   BEGIN_DOC
@@ -778,11 +918,11 @@ end subroutine recentered_poly2_v
 subroutine recentered_poly2_v0(P_new, lda, x_A, LD_xA, x_P, a, n_points)
 
   BEGIN_DOC
-  ! 
+  !
   ! Recenter two polynomials. Special case for b=(0,0,0)
-  ! 
+  !
   ! (x - A)^a (x - B)^0 = (x - P + P - A)^a  (x - Q + Q - B)^0
-  !                     = (x - P + P - A)^a 
+  !                     = (x - P + P - A)^a
   !
   END_DOC
 

From b9c18338960064f01bdd3e7bf6427ab3510575c9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 00:33:37 +0200
Subject: [PATCH 07/13] Optimized three_e_5_idx_exch12_bi_ort

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |  12 +-
 src/bi_ort_ints/three_body_ijmkl.irp.f | 162 ++++++++++++++++++++-----
 src/utils/integration.irp.f            |  24 ++++
 3 files changed, 166 insertions(+), 32 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index ca50dd56..63b2aa8c 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -16,23 +16,27 @@ subroutine test_3e
  double precision :: accu, contrib,new,ref
  i = 1
  k = 1
+ n = 0
  accu = 0.d0
  do i = 1, mo_num
   do k = 1, mo_num 
    do j = 1, mo_num
     do l = 1, mo_num 
      do m = 1, mo_num
-      do n = 1, mo_num
-        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
-        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
+      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
+!      do n = 1, mo_num
+!        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
+!        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
         contrib = dabs(new - ref)
         accu += contrib
         if(contrib .gt. 1.d-10)then
          print*,'pb !!'
          print*,i,k,j,l,m,n
          print*,ref,new,contrib
+         stop
         endif
-      enddo
+!      enddo
      enddo
     enddo
    enddo
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index ae4c9bd5..af2cb353 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -24,7 +24,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -33,7 +33,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
-            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -45,7 +45,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -73,7 +73,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -82,7 +82,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -94,7 +94,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -122,7 +122,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -131,7 +131,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
         do m = 1, mo_num
           do l = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -143,7 +143,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -171,7 +171,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -180,7 +180,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -192,7 +192,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num,
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
@@ -220,7 +220,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
  !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
  !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
   do i = 1, mo_num
@@ -229,7 +229,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
         do l = 1, mo_num
           do m = 1, mo_num
             call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
-            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral 
+            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
           enddo
         enddo
       enddo
@@ -241,7 +241,57 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num,
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  three_e_5_idx_exch12_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
+            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
 
 ! ---
 
@@ -259,38 +309,94 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
 
   implicit none
   integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  three_e_5_idx_exch12_bi_ort = 0.d0
   print *, ' Providing the three_e_5_idx_exch12_bi_ort ...'
   call wall_time(wall0)
 
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+ do m = 1, mo_num
 
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
- !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         m2grad_r, m2grad_l, grad_mli, tmp_mat, orb_mat)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+         grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
+               int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
+               int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
+               int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+         m2grad_l(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         m2grad_l(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         m2grad_l(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+         m2grad_r(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         m2grad_r(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         m2grad_r(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, n_points_final_grid, 1.d0, &
+      orb_mat, n_points_final_grid,  &
+      grad_mli, n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
   do i = 1, mo_num
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
-            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = -1.d0 * integral 
-          enddo
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,k,j) - tmp_mat(k,j,l,i)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      m2grad_l, 3*n_points_final_grid,  &
+      m2grad_r, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = &
+                three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  enddo
 
   call wall_time(wall1)
   print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
 
-END_PROVIDER 
+END_PROVIDER
 
 ! ---
 
diff --git a/src/utils/integration.irp.f b/src/utils/integration.irp.f
index 21179dac..b548b18a 100644
--- a/src/utils/integration.irp.f
+++ b/src/utils/integration.irp.f
@@ -484,6 +484,30 @@ subroutine multiply_poly(b,nb,c,nc,d,nd)
   integer                        :: ib, ic, id, k
   if(ior(nc,nb) < 0) return !False if nc>=0 and nb>=0
 
+  select case (nb)
+    case (0)
+      call multiply_poly_b0(b,c,nc,d,nd)
+      return
+    case (1)
+      call multiply_poly_b1(b,c,nc,d,nd)
+      return
+    case (2)
+      call multiply_poly_b2(b,c,nc,d,nd)
+      return
+  end select
+
+  select case (nc)
+    case (0)
+      call multiply_poly_c0(b,nb,c,d,nd)
+      return
+    case (1)
+      call multiply_poly_c1(b,nb,c,d,nd)
+      return
+    case (2)
+      call multiply_poly_c2(b,nb,c,d,nd)
+      return
+  end select
+
   do ib=0,nb
     do ic = 0,nc
       d(ib+ic) = d(ib+ic) + c(ic) * b(ib)

From fb5300a8e59d4dc08c4ce118317f4deffef7daba Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 08:51:04 +0200
Subject: [PATCH 08/13] Preparing for optimization of 5idx in TC

---
 external/qp2-dependencies                  |   2 +-
 src/bi_ort_ints/three_body_ijmkl.irp.f     |  58 +---
 src/bi_ort_ints/three_body_ijmkl_old.irp.f | 295 +++++++++++++++++++++
 3 files changed, 303 insertions(+), 52 deletions(-)
 create mode 100644 src/bi_ort_ints/three_body_ijmkl_old.irp.f

diff --git a/external/qp2-dependencies b/external/qp2-dependencies
index 6e23ebac..e0d0e02e 160000
--- a/external/qp2-dependencies
+++ b/external/qp2-dependencies
@@ -1 +1 @@
-Subproject commit 6e23ebac001acae91d1c762ca934e09a9b7d614a
+Subproject commit e0d0e02e9f5ece138d1520106954a881ab0b8db2
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index af2cb353..5220d8c7 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -245,56 +245,6 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  three_e_5_idx_exch12_bi_ort_old = 0.d0
-  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
-            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
 BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
@@ -305,6 +255,12 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   !
+  ! Equivalent to:
+  !
+  !    call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
+  !
+  !    three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+  !
   END_DOC
 
   implicit none
@@ -314,10 +270,10 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   double precision :: weight
   double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
   allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
   allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
   allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
diff --git a/src/bi_ort_ints/three_body_ijmkl_old.irp.f b/src/bi_ort_ints/three_body_ijmkl_old.irp.f
new file mode 100644
index 00000000..105cd179
--- /dev/null
+++ b/src/bi_ort_ints/three_body_ijmkl_old.irp.f
@@ -0,0 +1,295 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_direct_bi_ort_old(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_direct_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_direct_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
+            three_e_5_idx_direct_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_direct_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_cycle_1_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
+            three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_cycle_2_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          do l = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
+            three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_exch23_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch23_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
+            three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch23_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  three_e_5_idx_exch13_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch13_bi_ort_old ...'
+  call wall_time(wall0)
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
+            three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch13_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort_old, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  three_e_5_idx_exch12_bi_ort_old = 0.d0
+  print *, ' Providing the three_e_5_idx_exch12_bi_ort_old ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch12_bi_ort_old)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
+            three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch12_bi_ort_old', wall1 - wall0
+
+END_PROVIDER
+

From c4612318ae9cce73c3cf668703827eb9c7bfd093 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 09:11:32 +0200
Subject: [PATCH 09/13] Optimized direct 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |  62 ++-
 src/bi_ort_ints/three_body_ijmkl.irp.f | 500 ++++++++++++-------------
 2 files changed, 297 insertions(+), 265 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 63b2aa8c..d0367f6f 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -7,7 +7,8 @@ program bi_ort_ints
   my_n_pt_r_grid = 10
   my_n_pt_a_grid = 14
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
- call test_3e
+! call test_3e
+ call test_5idx
 end
 
 subroutine test_3e
@@ -19,15 +20,13 @@ subroutine test_3e
  n = 0
  accu = 0.d0
  do i = 1, mo_num
-  do k = 1, mo_num 
+  do k = 1, mo_num
    do j = 1, mo_num
-    do l = 1, mo_num 
+    do l = 1, mo_num
      do m = 1, mo_num
-      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
-!      do n = 1, mo_num
-!        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
-!        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
+      do n = 1, mo_num
+        call give_integrals_3_body_bi_ort(n, l, k, m, j, i, new)
+        call give_integrals_3_body_bi_ort_old(n, l, k, m, j, i, ref)
         contrib = dabs(new - ref)
         accu += contrib
         if(contrib .gt. 1.d-10)then
@@ -36,7 +35,7 @@ subroutine test_3e
          print*,ref,new,contrib
          stop
         endif
-!      enddo
+      enddo
      enddo
     enddo
    enddo
@@ -46,3 +45,48 @@ subroutine test_3e
 
 
 end
+
+subroutine test_5idx
+ implicit none
+ integer :: i,k,j,l,m,n,ipoint
+ double precision :: accu, contrib,new,ref
+ i = 1
+ k = 1
+ n = 0
+ accu = 0.d0
+ do i = 1, mo_num
+  do k = 1, mo_num
+   do j = 1, mo_num
+    do l = 1, mo_num
+     do m = 1, mo_num
+      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'direct'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+
+!      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'exch12'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+     enddo
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/dble(mo_num)**5
+
+
+end
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 5220d8c7..1db773f1 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,7 +1,8 @@
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -12,257 +13,6 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num,
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
   END_DOC
 
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_direct_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_direct_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, m, j, i, integral)
-            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_cycle_1_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_cycle_2_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          do l = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_exch23_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0
-
-  three_e_5_idx_exch13_bi_ort = 0.d0
-  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
-  call wall_time(wall0)
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
-            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = <mlk|-L|mij> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  ! Equivalent to:
-  !
-  !    call give_integrals_3_body_bi_ort(m, l, k, m, i, j, integral)
-  !
-  !    three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i) = -1.d0 * integral
-  !
-  END_DOC
-
   implicit none
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
@@ -279,7 +29,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  print *, ' Providing the three_e_5_idx_exch12_bi_ort ...'
+  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
   call wall_time(wall0)
 
  do m = 1, mo_num
@@ -322,6 +72,7 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = - tmp_mat(l,j,k,i) - tmp_mat(k,i,l,j)
             three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,k,j) - tmp_mat(k,j,l,i)
         enddo
       enddo
@@ -339,8 +90,8 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = &
-                three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
+            three_e_5_idx_direct_bi_ort(m,l,j,k,i) = three_e_5_idx_direct_bi_ort(m,l,j,k,i) - tmp_mat(l,j,k,i)
+            three_e_5_idx_exch12_bi_ort(m,l,j,k,i) = three_e_5_idx_exch12_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
         enddo
       enddo
     enddo
@@ -350,9 +101,246 @@ BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num,
   enddo
 
   call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch12_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
 
 END_PROVIDER
 
 ! ---
 
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          do l = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
+            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  double precision :: integral
+  integer          :: i, j, k, m, l
+  double precision :: wall1, wall0
+  integer          :: ipoint
+  double precision :: weight
+  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
+  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
+  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+
+  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
+  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
+
+  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
+  call wall_time(wall0)
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) &
+ !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
+ !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
+            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
+
+END_PROVIDER
+
+! ---
+
+

From 00bd8e2fcc8d435a1484af065a443efee3ca3c9f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 10:34:05 +0200
Subject: [PATCH 10/13] Optimized cyclic 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f            |  65 ++++-
 src/bi_ort_ints/three_body_ijmkl.irp.f       | 288 +++++++------------
 src/bi_ort_ints/three_body_ints_bi_ort.irp.f |   1 +
 3 files changed, 152 insertions(+), 202 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index d0367f6f..eae0affe 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -59,17 +59,18 @@ subroutine test_5idx
    do j = 1, mo_num
     do l = 1, mo_num
      do m = 1, mo_num
-      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
-      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
-      contrib = dabs(new - ref)
-      accu += contrib
-      if(contrib .gt. 1.d-10)then
-       print*,'direct'
-       print*,i,k,j,l,m
-       print*,ref,new,contrib
-       stop
-      endif
 
+!      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'direct'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+!
 !      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
 !      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
 !      contrib = dabs(new - ref)
@@ -81,6 +82,50 @@ subroutine test_5idx
 !       stop
 !      endif
 
+!      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'cycle1'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+!      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'cycle2'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+!      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
+!      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
+!      contrib = dabs(new - ref)
+!      accu += contrib
+!      if(contrib .gt. 1.d-10)then
+!       print*,'exch23'
+!       print*,i,k,j,l,m
+!       print*,ref,new,contrib
+!       stop
+!      endif
+
+      new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'exch13'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+
      enddo
     enddo
    enddo
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 1db773f1..9f316771 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,4 +1,3 @@
-
 ! ---
 
  BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
@@ -17,7 +16,6 @@
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
   integer          :: ipoint
-  double precision :: weight
   double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
   allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
@@ -45,17 +43,22 @@
   do i=1,mo_num
     do l=1,mo_num
        do ipoint=1, n_points_final_grid
+
          grad_mli(ipoint,l,i) = final_weight_at_r_vector(ipoint) * ( &
                int2_grad1_u12_bimo_t(ipoint,1,m,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) + &
                int2_grad1_u12_bimo_t(ipoint,2,m,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) + &
                int2_grad1_u12_bimo_t(ipoint,3,m,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) )
+
+         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+
          m2grad_l(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
          m2grad_l(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
          m2grad_l(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
          m2grad_r(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
          m2grad_r(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
          m2grad_r(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-         orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
+
        enddo
     enddo
   enddo
@@ -107,240 +110,141 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = <mlk|-L|jim> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  double precision :: integral
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_cycle_1_bi_ort ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_1_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, i, m, integral)
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_1_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE FIRST CYCLIC PERMUTATION TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = <mlk|-L|imj> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  double precision :: integral
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_cycle_2_bi_ort ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_cycle_2_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          do l = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, m, j, integral)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_2_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
   ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
   !
-  ! three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = <mlk|-L|jmi> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  ! three_e_5_idx_direct_bi_ort(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
   !
   ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
   END_DOC
 
   implicit none
-  double precision :: integral
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
   integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
+  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
+  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:)
+  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  print *, ' Providing the three_e_5_idx_exch23_bi_ort ...'
+  print *, ' Providing the three_e_5_idx_cycle_bi_ort ...'
   call wall_time(wall0)
 
+ do m = 1, mo_num
+
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch23_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         rk_grad_im, rm_grad_ik, lk_grad_mi, lm_grad_ik, tmp_mat)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
+         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
+         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lk_grad_mi, 3*n_points_final_grid,  &
+      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
   do i = 1, mo_num
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, j, m, i, integral)
-            three_e_5_idx_exch23_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -tmp_mat(k,j,l,i)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -tmp_mat(l,i,k,j)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = -tmp_mat(l,j,k,i)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = -tmp_mat(k,i,l,j)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END PARALLEL DO
 
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch23_bi_ort', wall1 - wall0
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lk_grad_mi, 3*n_points_final_grid,  &
+      rk_grad_im, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
 
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = <mlk|-L|ijm> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  double precision :: integral
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision :: weight
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_exch13_bi_ort ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) &
- !$OMP SHARED (mo_num,three_e_5_idx_exch13_bi_ort)
- !$OMP DO SCHEDULE (dynamic) COLLAPSE(2)
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
   do i = 1, mo_num
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-          do m = 1, mo_num
-            call give_integrals_3_body_bi_ort(m, l, k, i, j, m, integral)
-            three_e_5_idx_exch13_bi_ort(m,l,j,k,i) = -1.d0 * integral
-          enddo
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,j,i,k)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,i,j,l)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,j,i,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,i,j,k)
         enddo
       enddo
     enddo
   enddo
- !$OMP END DO
- !$OMP END PARALLEL
+  !$OMP END PARALLEL DO
+
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lm_grad_ik, 3*n_points_final_grid,  &
+      rk_grad_im, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,i,j,k)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,j,i,l)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,i,j,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,j,i,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+  enddo
 
   call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_exch13_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_cycle_bi_ort', wall1 - wall0
 
 END_PROVIDER
 
 ! ---
 
 
+
diff --git a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
index a72cd682..1962c8d6 100644
--- a/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/src/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -85,6 +85,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
   PROVIDE int2_grad1_u12_bimo_t
 
   integral = 0.d0
+  ! (n, l, k, m, j, i)
   do ipoint = 1, n_points_final_grid
 
     tmp =     mos_l_in_r_array_transp(ipoint,k) * mos_r_in_r_array_transp(ipoint,i) &

From b2c005eccb7e05eaebb59e9dcbc3c0e771c9a87f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 11:08:30 +0200
Subject: [PATCH 11/13] Finished optimizing 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f      | 102 +++++++++----------
 src/bi_ort_ints/three_body_ijmkl.irp.f | 135 +++++++------------------
 2 files changed, 87 insertions(+), 150 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index eae0affe..5653a2e2 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -60,60 +60,60 @@ subroutine test_5idx
     do l = 1, mo_num
      do m = 1, mo_num
 
-!      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'direct'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_direct_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_direct_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'direct'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
+
+      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'exch12'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 !
-!      new = three_e_5_idx_exch12_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_exch12_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'exch12'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'cycle1'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 
-!      new = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_cycle_1_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'cycle1'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'cycle2'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 
-!      new = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_cycle_2_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'cycle2'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
-
-!      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
-!      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
-!      contrib = dabs(new - ref)
-!      accu += contrib
-!      if(contrib .gt. 1.d-10)then
-!       print*,'exch23'
-!       print*,i,k,j,l,m
-!       print*,ref,new,contrib
-!       stop
-!      endif
+      new = three_e_5_idx_exch23_bi_ort(m,l,j,k,i)
+      ref = three_e_5_idx_exch23_bi_ort_old(m,l,j,k,i)
+      contrib = dabs(new - ref)
+      accu += contrib
+      if(contrib .gt. 1.d-10)then
+       print*,'exch23'
+       print*,i,k,j,l,m
+       print*,ref,new,contrib
+       stop
+      endif
 
       new = three_e_5_idx_exch13_bi_ort(m,l,j,k,i)
       ref = three_e_5_idx_exch13_bi_ort_old(m,l,j,k,i)
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index 9f316771..c9e88ab9 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -1,7 +1,11 @@
 ! ---
 
- BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+ BEGIN_PROVIDER [ double precision, three_e_5_idx_direct_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch12_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
 
   BEGIN_DOC
   !
@@ -16,18 +20,22 @@
   integer          :: i, j, k, m, l
   double precision :: wall1, wall0
   integer          :: ipoint
-  double precision, allocatable :: grad_mli(:,:,:), m2grad_r(:,:,:,:), m2grad_l(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:), orb_mat(:,:,:)
-  allocate(m2grad_r(n_points_final_grid,3,mo_num,mo_num))
-  allocate(m2grad_l(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
+  double precision, allocatable :: grad_mli(:,:,:), orb_mat(:,:,:)
+  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
+  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
+  double precision, allocatable :: tmp_mat(:,:,:,:)
+  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
   allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
   allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
+  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
   PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
 
-  print *, ' Providing the three_e_5_idx_direct_bi_ort ...'
+  print *, ' Providing the three_e_5_idx_bi_ort ...'
   call wall_time(wall0)
 
  do m = 1, mo_num
@@ -38,7 +46,8 @@
  !$OMP SHARED (m,mo_num,n_points_final_grid, &
  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         m2grad_r, m2grad_l, grad_mli, tmp_mat, orb_mat)
+ !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi, &
+ !$OMP         grad_mli, tmp_mat, orb_mat)
  !$OMP DO COLLAPSE(2)
   do i=1,mo_num
     do l=1,mo_num
@@ -51,13 +60,21 @@
 
          orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
 
-         m2grad_l(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         m2grad_l(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         m2grad_l(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
 
-         m2grad_r(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         m2grad_r(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         m2grad_r(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
+         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
 
        enddo
     enddo
@@ -84,8 +101,8 @@
   !$OMP END PARALLEL DO
 
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      m2grad_l, 3*n_points_final_grid,  &
-      m2grad_r, 3*n_points_final_grid,  0.d0, &
+      lm_grad_ik, 3*n_points_final_grid,  &
+      rm_grad_ik, 3*n_points_final_grid,  0.d0, &
       tmp_mat, mo_num*mo_num)
 
   !$OMP PARALLEL DO PRIVATE(i,j,k,l)
@@ -101,83 +118,6 @@
   enddo
   !$OMP END PARALLEL DO
 
-  enddo
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_direct_bi_ort', wall1 - wall0
-
-END_PROVIDER
-
-! ---
-
- BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_1_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_cycle_2_bi_ort, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch23_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
-&BEGIN_PROVIDER [ double precision, three_e_5_idx_exch13_bi_ort , (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_5_idx_direct_bi_ort(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: wall1, wall0
-  integer          :: ipoint
-  double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
-  double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
-  double precision, allocatable :: tmp_mat(:,:,:,:)
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
-
-  provide mos_r_in_r_array_transp mos_l_in_r_array_transp
-  PROVIDE mo_l_coef mo_r_coef int2_grad1_u12_bimo_t
-
-  print *, ' Providing the three_e_5_idx_cycle_bi_ort ...'
-  call wall_time(wall0)
-
- do m = 1, mo_num
-
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,l,ipoint) &
- !$OMP SHARED (m,mo_num,n_points_final_grid, &
- !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
- !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rk_grad_im, rm_grad_ik, lk_grad_mi, lm_grad_ik, tmp_mat)
- !$OMP DO COLLAPSE(2)
-  do i=1,mo_num
-    do l=1,mo_num
-       do ipoint=1, n_points_final_grid
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
-
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
-
-       enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
       lk_grad_mi, 3*n_points_final_grid,  &
       rm_grad_ik, 3*n_points_final_grid,  0.d0, &
@@ -237,14 +177,11 @@ END_PROVIDER
     enddo
   enddo
   !$OMP END PARALLEL DO
+
   enddo
 
   call wall_time(wall1)
-  print *, ' wall time for three_e_5_idx_cycle_bi_ort', wall1 - wall0
+  print *, ' wall time for three_e_5_idx_bi_ort', wall1 - wall0
 
 END_PROVIDER
 
-! ---
-
-
-

From 896ac96e7e7339c710b6325972e878d94003b9e5 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 11:40:21 +0200
Subject: [PATCH 12/13] Reduced memory in 5idx

---
 src/bi_ort_ints/bi_ort_ints.irp.f      |   5 +
 src/bi_ort_ints/three_body_ijmkl.irp.f | 139 ++++++++++++++++---------
 2 files changed, 96 insertions(+), 48 deletions(-)

diff --git a/src/bi_ort_ints/bi_ort_ints.irp.f b/src/bi_ort_ints/bi_ort_ints.irp.f
index 5653a2e2..f7a42f37 100644
--- a/src/bi_ort_ints/bi_ort_ints.irp.f
+++ b/src/bi_ort_ints/bi_ort_ints.irp.f
@@ -9,6 +9,11 @@ program bi_ort_ints
   touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 ! call test_3e
  call test_5idx
+!  call test_5idx2
+end
+
+subroutine test_5idx2
+  PROVIDE three_e_5_idx_cycle_2_bi_ort
 end
 
 subroutine test_3e
diff --git a/src/bi_ort_ints/three_body_ijmkl.irp.f b/src/bi_ort_ints/three_body_ijmkl.irp.f
index c9e88ab9..bd669163 100644
--- a/src/bi_ort_ints/three_body_ijmkl.irp.f
+++ b/src/bi_ort_ints/three_body_ijmkl.irp.f
@@ -24,12 +24,6 @@
   double precision, allocatable :: lk_grad_mi(:,:,:,:), rk_grad_im(:,:,:,:)
   double precision, allocatable :: lm_grad_ik(:,:,:,:), rm_grad_ik(:,:,:,:)
   double precision, allocatable :: tmp_mat(:,:,:,:)
-  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
-  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
-  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
-  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
-  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
   allocate(tmp_mat(mo_num,mo_num,mo_num,mo_num))
 
   provide mos_r_in_r_array_transp mos_l_in_r_array_transp
@@ -40,14 +34,15 @@
 
  do m = 1, mo_num
 
+  allocate(grad_mli(n_points_final_grid,mo_num,mo_num))
+  allocate(orb_mat(n_points_final_grid,mo_num,mo_num))
  !$OMP PARALLEL                     &
  !$OMP DEFAULT (NONE)               &
  !$OMP PRIVATE (i,l,ipoint) &
  !$OMP SHARED (m,mo_num,n_points_final_grid, &
  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
- !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi, &
- !$OMP         grad_mli, tmp_mat, orb_mat)
+ !$OMP         grad_mli, orb_mat)
  !$OMP DO COLLAPSE(2)
   do i=1,mo_num
     do l=1,mo_num
@@ -60,22 +55,6 @@
 
          orb_mat(ipoint,l,i) = mos_l_in_r_array_transp(ipoint,l) * mos_r_in_r_array_transp(ipoint,i)
 
-         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
-         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
-
-         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
-         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
-         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
-
-         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
-         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
-
-         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
-         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
-         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
-
        enddo
     enddo
   enddo
@@ -100,6 +79,41 @@
   enddo
   !$OMP END PARALLEL DO
 
+  deallocate(orb_mat,grad_mli)
+
+  allocate(lm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rm_grad_ik(n_points_final_grid,3,mo_num,mo_num))
+  allocate(rk_grad_im(n_points_final_grid,3,mo_num,mo_num))
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         rm_grad_ik, lm_grad_ik, rk_grad_im, lk_grad_mi)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+
+         lm_grad_ik(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i) * final_weight_at_r_vector(ipoint)
+         lm_grad_ik(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i) * final_weight_at_r_vector(ipoint)
+
+         rm_grad_ik(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,1,l,i)
+         rm_grad_ik(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,2,l,i)
+         rm_grad_ik(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,m) * int2_grad1_u12_bimo_t(ipoint,3,l,i)
+
+         rk_grad_im(ipoint,1,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,i,m)
+         rk_grad_im(ipoint,2,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,i,m)
+         rk_grad_im(ipoint,3,l,i) = mos_r_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,i,m)
+
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
       lm_grad_ik, 3*n_points_final_grid,  &
       rm_grad_ik, 3*n_points_final_grid,  0.d0, &
@@ -118,6 +132,52 @@
   enddo
   !$OMP END PARALLEL DO
 
+  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
+      lm_grad_ik, 3*n_points_final_grid,  &
+      rk_grad_im, 3*n_points_final_grid,  0.d0, &
+      tmp_mat, mo_num*mo_num)
+
+  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = - tmp_mat(l,i,j,k)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = - tmp_mat(k,j,i,l)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = - tmp_mat(k,i,j,l)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = - tmp_mat(l,j,i,k)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  deallocate(lm_grad_ik)
+
+  allocate(lk_grad_mi(n_points_final_grid,3,mo_num,mo_num))
+
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,l,ipoint) &
+ !$OMP SHARED (m,mo_num,n_points_final_grid, &
+ !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+ !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector, &
+ !$OMP         lk_grad_mi)
+ !$OMP DO COLLAPSE(2)
+  do i=1,mo_num
+    do l=1,mo_num
+       do ipoint=1, n_points_final_grid
+
+         lk_grad_mi(ipoint,1,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,1,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,2,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,2,m,i) * final_weight_at_r_vector(ipoint)
+         lk_grad_mi(ipoint,3,l,i) = mos_l_in_r_array_transp(ipoint,l) * int2_grad1_u12_bimo_t(ipoint,3,m,i) * final_weight_at_r_vector(ipoint)
+
+       enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
   call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
       lk_grad_mi, 3*n_points_final_grid,  &
       rm_grad_ik, 3*n_points_final_grid,  0.d0, &
@@ -128,10 +188,10 @@
     do k = 1, mo_num
       do j = 1, mo_num
         do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = -tmp_mat(k,j,l,i)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = -tmp_mat(l,i,k,j)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = -tmp_mat(l,j,k,i)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = -tmp_mat(k,i,l,j)
+            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(k,j,l,i)
+            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(l,i,k,j)
+            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(l,j,k,i)
+            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(k,i,l,j)
         enddo
       enddo
     enddo
@@ -158,26 +218,9 @@
   enddo
   !$OMP END PARALLEL DO
 
-  call dgemm('T','N', mo_num*mo_num, mo_num*mo_num, 3*n_points_final_grid, 1.d0, &
-      lm_grad_ik, 3*n_points_final_grid,  &
-      rk_grad_im, 3*n_points_final_grid,  0.d0, &
-      tmp_mat, mo_num*mo_num)
-
-  !$OMP PARALLEL DO PRIVATE(i,j,k,l)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-            three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_1_bi_ort(m,l,j,k,i) - tmp_mat(l,i,j,k)
-            three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) = three_e_5_idx_cycle_2_bi_ort(m,l,j,k,i) - tmp_mat(k,j,i,l)
-            three_e_5_idx_exch23_bi_ort (m,l,j,k,i) = three_e_5_idx_exch23_bi_ort (m,l,j,k,i) - tmp_mat(k,i,j,l)
-            three_e_5_idx_exch13_bi_ort (m,l,j,k,i) = three_e_5_idx_exch13_bi_ort (m,l,j,k,i) - tmp_mat(l,j,i,k)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END PARALLEL DO
-
+  deallocate(lk_grad_mi)
+  deallocate(rm_grad_ik)
+  deallocate(rk_grad_im)
   enddo
 
   call wall_time(wall1)

From 81b7751b00f54a988e2df30fb92edc98f0e49474 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 2 Jun 2023 16:10:04 +0200
Subject: [PATCH 13/13] Fix bug in number of args

---
 src/ccsd/ccsd_t_space_orb_stoch.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index 1f3bebc2..b669025e 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -198,7 +198,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   allocate (bounds(2,nbuckets))
   do isample=1,nbuckets
     eta = 1.d0/dble(nbuckets) * dble(isample)
-    ieta = binary_search(waccu,eta,Nabc,ileft,iright)
+    ieta = binary_search(waccu,eta,Nabc)
     bounds(1,isample) = ileft
     bounds(2,isample) = ieta
     ileft = ieta+1