From 63650890e2e936934ba794b30961cef09d9792e3 Mon Sep 17 00:00:00 2001
From: Emmanuel Giner LCT <eginer@obelixdev.local>
Date: Fri, 22 Sep 2023 16:40:12 +0200
Subject: [PATCH 001/131] trying to fix bug

---
 src/tc_bi_ortho/psi_r_l_prov.irp.f    |  6 ++++--
 src/tc_bi_ortho/tc_h_eigvectors.irp.f | 16 +++++++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/tc_bi_ortho/psi_r_l_prov.irp.f b/src/tc_bi_ortho/psi_r_l_prov.irp.f
index b28c417f..1d233b0b 100644
--- a/src/tc_bi_ortho/psi_r_l_prov.irp.f
+++ b/src/tc_bi_ortho/psi_r_l_prov.irp.f
@@ -1,6 +1,7 @@
 use bitmasks
 
-BEGIN_PROVIDER [ double precision, psi_l_coef_bi_ortho, (psi_det_size,N_states) ]
+!BEGIN_PROVIDER [ double precision, psi_l_coef_bi_ortho, (psi_det_size,N_states) ]
+BEGIN_PROVIDER [ double precision, psi_l_coef_bi_ortho, (N_det,N_states) ]
   implicit none
   BEGIN_DOC
   ! The wave function coefficients. Initialized with Hartree-Fock if the |EZFIO| file
@@ -68,7 +69,8 @@ BEGIN_PROVIDER [ double precision, psi_l_coef_bi_ortho, (psi_det_size,N_states)
 END_PROVIDER
 
 
-BEGIN_PROVIDER [ double precision, psi_r_coef_bi_ortho, (psi_det_size,N_states) ]
+!BEGIN_PROVIDER [ double precision, psi_r_coef_bi_ortho, (psi_det_size,N_states) ]
+BEGIN_PROVIDER [ double precision, psi_r_coef_bi_ortho, (N_det,N_states) ]
   implicit none
   BEGIN_DOC
   ! The wave function coefficients. Initialized with Hartree-Fock if the |EZFIO| file
diff --git a/src/tc_bi_ortho/tc_h_eigvectors.irp.f b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
index a9e22e03..a636e8d6 100644
--- a/src/tc_bi_ortho/tc_h_eigvectors.irp.f
+++ b/src/tc_bi_ortho/tc_h_eigvectors.irp.f
@@ -320,24 +320,38 @@ end
   enddo
 
   double precision, allocatable :: buffer(:,:)
-  allocate(buffer(N_det,N_states))
+  allocate(buffer(psi_det_size,N_states))
+  print*,'passed the allocate'
+!  print*,N_det,N_states
+!  print*,size(psi_l_coef_bi_ortho,1),size(psi_l_coef_bi_ortho,2)
+!  print*,size(leigvec_tc_bi_orth,1),size(leigvec_tc_bi_orth,2)
+!  print*,size(reigvec_tc_bi_orth,1),size(reigvec_tc_bi_orth,2)
+!  print*,size(psi_r_coef_bi_ortho,1),size(psi_r_coef_bi_ortho,2)
+  buffer = 0.d0
   do k = 1, N_states
     do i = 1, N_det
       psi_l_coef_bi_ortho(i,k) = leigvec_tc_bi_orth(i,k)
       buffer(i,k) = leigvec_tc_bi_orth(i,k)
     enddo
   enddo
+  print*,'passed the first loop'
   TOUCH psi_l_coef_bi_ortho
+  print*,'passed the TOUCH psi_l_coef_bi_ortho'
   call ezfio_set_tc_bi_ortho_psi_l_coef_bi_ortho(buffer)
+  print*,'passed the ezfio_set_tc_bi_ortho_psi_l_coef_bi_ortho'
   do k = 1, N_states
     do i = 1, N_det
       psi_r_coef_bi_ortho(i,k) = reigvec_tc_bi_orth(i,k)
       buffer(i,k) = reigvec_tc_bi_orth(i,k)
     enddo
   enddo
+  print*,'passed the second loop'
   TOUCH psi_r_coef_bi_ortho
+  print*,'passed the TOUCH psi_r_coef_bi_ortho'
   call ezfio_set_tc_bi_ortho_psi_r_coef_bi_ortho(buffer)
+  print*,'passed the ezfio_set_tc_bi_ortho_psi_r_coef_bi_ortho'
   deallocate(buffer)
+  print*,'passed saving the wf'
 !  print*,'After diag'
 !  do i = 1, N_det! old version
 !   print*,'i',i,psi_l_coef_bi_ortho(i,1),psi_r_coef_bi_ortho(i,1)

From eaab1b80648bd2b4ec38aa17722a751fccf42ce0 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 7 Mar 2024 07:34:59 +0100
Subject: [PATCH 002/131] few modif for HHG

---
 .../multi_s_dipole_moment.irp.f               | 23 ++++++++++++++++
 src/utils/constants.include.F                 | 27 +++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f
index 913ae2f3..f21e08cd 100644
--- a/src/mol_properties/multi_s_dipole_moment.irp.f
+++ b/src/mol_properties/multi_s_dipole_moment.irp.f
@@ -91,3 +91,26 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
   enddo
 
 END_PROVIDER
+
+! ---
+
+ BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment_eigenvec, (N_states, N_states)]
+&BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment_eigenvec, (N_states, N_states)]
+&BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment_eigenvec, (N_states, N_states)]
+&BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment_eigenval,           (N_states)]
+&BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment_eigenval,           (N_states)]
+&BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment_eigenval,           (N_states)]
+
+  implicit none
+
+  PROVIDE multi_s_x_dipole_moment multi_s_y_dipole_moment multi_s_z_dipole_moment
+
+  call lapack_diag(multi_s_x_dipole_moment_eigenval(1), multi_s_x_dipole_moment_eigenvec(1,1), multi_s_x_dipole_moment(1,1), N_states, N_states)
+  call lapack_diag(multi_s_y_dipole_moment_eigenval(1), multi_s_y_dipole_moment_eigenvec(1,1), multi_s_y_dipole_moment(1,1), N_states, N_states)
+  call lapack_diag(multi_s_z_dipole_moment_eigenval(1), multi_s_z_dipole_moment_eigenvec(1,1), multi_s_z_dipole_moment(1,1), N_states, N_states)
+
+END_PROVIDER
+
+! ---
+
+
diff --git a/src/utils/constants.include.F b/src/utils/constants.include.F
index 422eff95..7b01f888 100644
--- a/src/utils/constants.include.F
+++ b/src/utils/constants.include.F
@@ -18,3 +18,30 @@ double precision, parameter :: c_4_3 = 4.d0/3.d0
 double precision, parameter :: c_1_3 = 1.d0/3.d0
 double precision, parameter :: sq_op5 = dsqrt(0.5d0)
 double precision, parameter :: dlog_2pi =  dlog(2.d0*dacos(-1.d0))
+
+! physical constants and units conversion factors
+double precision, parameter :: k_boltzman_si    = 1.38066d-23       ! K k^-1 
+double precision, parameter :: k_boltzman_au    = 3.1667d-6         ! Hartree k^-1 
+double precision, parameter :: k_boltzman_m1_au = 315795.26d0       ! Hartree^-1 k 
+double precision, parameter :: bohr_radius_si   = 0.529177d-10      ! m
+double precision, parameter :: bohr_radius_cm   = 0.529177d-8       ! cm
+double precision, parameter :: bohr_radius_angs = 0.529177d0        ! Angstrom
+double precision, parameter :: electronmass_si  = 9.10953d-31       ! Kg
+double precision, parameter :: electronmass_uma = 5.4858d-4         ! uma
+double precision, parameter :: electronvolt_si  = 1.6021892d-19     ! J  
+double precision, parameter :: uma_si           = 1.66057d-27       ! Kg
+double precision, parameter :: debye_si         = 3.33564d-30       ! coulomb meter
+double precision, parameter :: debye_au         = 0.393427228d0     ! e * Bohr
+double precision, parameter :: angstrom_to_au   = 1.889727d0        ! au
+double precision, parameter :: au_to_ohmcmm1    = 46000.0d0         ! (ohm cm)^-1
+double precision, parameter :: au_to_kb         = 294210.0d0        ! kbar
+double precision, parameter :: au_to_eV         = 27.211652d0
+double precision, parameter :: uma_to_au        = 1822.89d0
+double precision, parameter :: au_to_terahertz  = 2.4189d-5
+double precision, parameter :: au_to_sec        = 2.4189d-17
+double precision, parameter :: au_to_fsec       = 2.4189d-2
+double precision, parameter :: Wcm2             = 3.5d16
+double precision, parameter :: amconv           = 1.66042d-24/9.1095d-28*0.5d0  !  mass conversion: a.m.u to a.u. (ry)
+double precision, parameter :: uakbar           = 147105.d0         !  pressure conversion from ry/(a.u)^3 to k
+
+

From 3f861a41b5438d1722fa003da233642c79d96a47 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 8 Mar 2024 17:27:18 +0100
Subject: [PATCH 003/131] added the thresh_de_tc_angles keyword in minimize tc
 angles

---
 bin/zcat                                    | 23 ---------------------
 plugins/local/tc_keywords/EZFIO.cfg         |  5 +++++
 plugins/local/tc_scf/routines_rotates.irp.f |  1 +
 scripts/PYSCF_EOMCC.py                      |  1 +
 4 files changed, 7 insertions(+), 23 deletions(-)
 delete mode 100755 bin/zcat
 create mode 120000 scripts/PYSCF_EOMCC.py

diff --git a/bin/zcat b/bin/zcat
deleted file mode 100755
index 7ccecf07..00000000
--- a/bin/zcat
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# On Darwin: try gzcat if available, otherwise use Python
-
-if [[ $(uname -s) = Darwin ]] ; then
-   which gzcat &> /dev/null
-   if [[ $? -eq 0 ]] ; then 
-      exec gzcat $@
-   else
-
-      exec python3 << EOF
-import sys
-import gzip
-with gzip.open("$1", "rt") as f:
-    print(f.read())
-EOF
-   fi
-else
-   SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-   command=$(which -a zcat | grep -v "$SCRIPTPATH/" | head -1)
-   exec $command $@
-fi
-
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index 93ff790f..68fe9c94 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -280,3 +280,8 @@ doc: approach used to evaluate TC integrals [ analytic | numeric | semi-analytic
 interface: ezfio,ocaml,provider
 default: semi-analytic
 
+[thresh_de_tc_angles]
+type: Threshold
+doc: Thresholds on delta E for changing angles between orbitals 
+interface: ezfio,provider,ocaml
+default: 1.e-03
diff --git a/plugins/local/tc_scf/routines_rotates.irp.f b/plugins/local/tc_scf/routines_rotates.irp.f
index c42e846e..92abfa44 100644
--- a/plugins/local/tc_scf/routines_rotates.irp.f
+++ b/plugins/local/tc_scf/routines_rotates.irp.f
@@ -301,6 +301,7 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles)
 
   ! check if TC energy has changed
   E_new = TC_HF_energy
+  E_thr = thresh_de_tc_angles
   if(dabs(E_new - E_old) .gt. E_thr) then
     mo_r_coef = mo_r_coef_old
     mo_l_coef = mo_l_coef_old
diff --git a/scripts/PYSCF_EOMCC.py b/scripts/PYSCF_EOMCC.py
new file mode 120000
index 00000000..8ad341da
--- /dev/null
+++ b/scripts/PYSCF_EOMCC.py
@@ -0,0 +1 @@
+/home_lct/eginer/qp2/plugins/qp_plugins_lct/dev/fcidump_for_vbarb/PYSCF_EOMCC.py
\ No newline at end of file

From d405aea95785060f7550be7901c90d133b287a65 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Mon, 11 Mar 2024 10:21:59 +0100
Subject: [PATCH 004/131] few mom opt

---
 .../local/bi_ort_ints/total_twoe_pot.irp.f    | 99 +++++++++++++++----
 .../local/non_h_ints_mu/total_tc_int.irp.f    |  2 +
 src/tools/print_detweights.irp.f              | 66 +++++++++++++
 3 files changed, 148 insertions(+), 19 deletions(-)
 create mode 100644 src/tools/print_detweights.irp.f

diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index 5e6a24e9..bf5cc36f 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -40,38 +40,95 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
   END_DOC
 
   implicit none
-  integer                       :: i, j, k, l, m, n, p, q
+  integer                       :: i, j, k, l, m, n, p, q, s, r
+  double precision              :: t1, t2
   double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:)
+  double precision, allocatable :: a_jkp(:,:,:), a_kpq(:,:,:), a_pqr(:,:,:)
+
+  print *, ' PROVIDING mo_bi_ortho_tc_two_e_chemist ...'
+  call wall_time(t1)
+  call print_memory_usage()
 
   PROVIDE mo_r_coef mo_l_coef
+  PROVIDe ao_two_e_tc_tot
 
-  allocate(a2(ao_num,ao_num,ao_num,mo_num))
+  if(ao_to_mo_tc_n3) then
 
-  call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0     &
-            , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
-            , 0.d0 , a2(1,1,1,1), ao_num*ao_num*ao_num)
+    print*, ' memory scale of TC ao -> mo: O(N3) '
 
-  allocate(a1(ao_num,ao_num,mo_num,mo_num))
+    allocate(a_jkp(ao_num,ao_num,mo_num))
+    allocate(a_kpq(ao_num,mo_num,mo_num))
+    allocate(a_pqr(mo_num,mo_num,mo_num))
 
-  call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
-            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
-            , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num)
+    do s = 1, mo_num
+      mo_bi_ortho_tc_two_e_chemist(:,:,:,s) = 0.d0
 
-  deallocate(a2)
-  allocate(a2(ao_num,mo_num,mo_num,mo_num))
+      do l = 1, ao_num
 
-  call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
-            , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num          &
-            , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num)
+        call dgemm( 'T', 'N', ao_num*ao_num, mo_num, ao_num, 1.d0 &
+                  , ao_two_e_tc_tot(1,1,1,l), ao_num, mo_l_coef(1,1), ao_num  &
+                  , 0.d0, a_jkp(1,1,1), ao_num*ao_num)
+  
+        call dgemm( 'T', 'N', ao_num*mo_num, mo_num, ao_num, 1.d0 &
+                  , a_jkp(1,1,1), ao_num, mo_r_coef(1,1), ao_num  &
+                  , 0.d0, a_kpq(1,1,1), ao_num*mo_num)
+  
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, ao_num, 1.d0 &
+                  , a_kpq(1,1,1), ao_num, mo_l_coef(1,1), ao_num  &
+                  , 0.d0, a_pqr(1,1,1), mo_num*mo_num)
 
-  deallocate(a1)
+        !$OMP PARALLEL         &
+        !$OMP DEFAULT(NONE)    &
+        !$OMP PRIVATE(p, q, r) &
+        !$OMP SHARED(s, l, mo_num, mo_bi_ortho_tc_two_e_chemist, mo_r_coef, a_pqr)
+        !$OMP DO COLLAPSE(2)
+        do p = 1, mo_num
+          do q = 1, mo_num
+            do r = 1, mo_num
+              mo_bi_ortho_tc_two_e_chemist(p,q,r,s) = mo_bi_ortho_tc_two_e_chemist(p,q,r,s) + mo_r_coef(l,s) * a_pqr(p,q,r)
+            enddo
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
 
-  call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
-            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
-            , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num)
+      enddo ! l
+    enddo ! s
 
-  deallocate(a2)
+    deallocate(a_jkp, a_kpq, a_pqr)
 
+  else
+
+    print*, ' memory scale of TC ao -> mo: O(N4) '
+
+    allocate(a2(ao_num,ao_num,ao_num,mo_num))
+  
+    call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0     &
+              , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
+              , 0.d0, a2(1,1,1,1), ao_num*ao_num*ao_num)
+  
+    allocate(a1(ao_num,ao_num,mo_num,mo_num))
+  
+    call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
+              , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+              , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num)
+  
+    deallocate(a2)
+    allocate(a2(ao_num,mo_num,mo_num,mo_num))
+  
+    call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+              , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num          &
+              , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num)
+  
+    deallocate(a1)
+  
+    call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+              , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+              , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num)
+  
+    deallocate(a2)
+  
+  endif
 
   !allocate(a1(mo_num,ao_num,ao_num,ao_num))
   !a1 = 0.d0
@@ -135,6 +192,10 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
   !enddo
   !deallocate(a1)
 
+  call wall_time(t2)
+  print *, ' WALL TIME for PROVIDING mo_bi_ortho_tc_two_e_chemist (min)', (t2-t1)/60.d0
+  call print_memory_usage()
+
 END_PROVIDER 
 
 ! ---
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index 9d3cf565..ba078d9b 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -201,6 +201,8 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
     !$OMP END DO
     !$OMP END PARALLEL
 
+    call clear_ao_map()
+
     if(tc_integ_type .eq. "numeric") then
       FREE int2_grad1_u12_ao_num int2_grad1_u12_square_ao_num
     endif
diff --git a/src/tools/print_detweights.irp.f b/src/tools/print_detweights.irp.f
new file mode 100644
index 00000000..d5b0f2c9
--- /dev/null
+++ b/src/tools/print_detweights.irp.f
@@ -0,0 +1,66 @@
+program print_detweights
+
+  implicit none
+
+  read_wf = .True.
+  touch read_wf
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+  integer                       :: i
+  integer                       :: degree
+  integer                       :: ios
+  integer,          allocatable :: deg(:), ii(:), deg_sorted(:)
+  double precision, allocatable :: c(:)
+
+  PROVIDE N_int
+  PROVIDE N_det
+  PROVIDE psi_det
+  PROVIDe psi_coef
+
+  allocate(deg(N_det), ii(N_det), deg_sorted(N_det), c(N_det))
+
+  do i = 1, N_det
+
+    call debug_det(psi_det(1,1,i), N_int)
+    call get_excitation_degree(psi_det(1,1,i), psi_det(1,1,1), degree, N_int)
+
+    ii (i) = i
+    deg(i) = degree
+    c  (i) = dabs(psi_coef(i,1))
+  enddo
+
+  call dsort(c, ii, N_det)
+
+  do i = 1, N_det
+    deg_sorted(i) = deg(ii(i))
+  enddo
+
+  print *, ' saving psi'
+
+  ! Writing output in binary format
+  open(unit=10, file="coef.bin", status="replace", action="write", iostat=ios, form="unformatted")
+
+    if(ios /= 0) then
+      print *, ' Error opening file!'
+      stop
+    endif
+    
+    write(10) N_det
+    write(10) deg_sorted
+    write(10) c
+
+  close(10)
+
+  deallocate(deg, ii, deg_sorted, c)
+
+end
+
+

From 1dbde5643920054cc16f148c50d84fed01a88b13 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Wed, 13 Mar 2024 07:04:54 +0100
Subject: [PATCH 005/131] O(N4) -> O(N3) transformations

---
 .../local/bi_ort_ints/total_twoe_pot.irp.f    | 19 ++++++++++++++++---
 plugins/local/tc_keywords/EZFIO.cfg           |  7 +++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index bf5cc36f..79bfd336 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -41,7 +41,7 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
 
   implicit none
   integer                       :: i, j, k, l, m, n, p, q, s, r
-  double precision              :: t1, t2
+  double precision              :: t1, t2, tt1, tt2
   double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:)
   double precision, allocatable :: a_jkp(:,:,:), a_kpq(:,:,:), a_pqr(:,:,:)
 
@@ -60,9 +60,11 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
     allocate(a_kpq(ao_num,mo_num,mo_num))
     allocate(a_pqr(mo_num,mo_num,mo_num))
 
-    do s = 1, mo_num
-      mo_bi_ortho_tc_two_e_chemist(:,:,:,s) = 0.d0
+    call wall_time(tt1)
 
+    do s = 1, mo_num
+
+      mo_bi_ortho_tc_two_e_chemist(:,:,:,s) = 0.d0
       do l = 1, ao_num
 
         call dgemm( 'T', 'N', ao_num*ao_num, mo_num, ao_num, 1.d0 &
@@ -93,6 +95,17 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
         !$OMP END PARALLEL
 
       enddo ! l
+
+      if(s == 2) then
+        call wall_time(tt2)
+        print*, ' 1 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-1)*(tt2-tt1)/60.d0
+      elseif(s == 11) then
+        call wall_time(tt2)
+        print*, ' 10 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-10)*(tt2-tt1)/600.d0
+      endif
+
     enddo ! s
 
     deallocate(a_jkp, a_kpq, a_pqr)
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index 93ff790f..a8491660 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -280,3 +280,10 @@ doc: approach used to evaluate TC integrals [ analytic | numeric | semi-analytic
 interface: ezfio,ocaml,provider
 default: semi-analytic
 
+[ao_to_mo_tc_n3]
+type: logical
+doc: If |true|, memory scale of TC ao -> mo: O(N3)
+interface: ezfio,provider,ocaml
+default: False
+
+

From 83ed57312d9bc86dc2ec4cbc486491ded16d7053 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Tue, 19 Mar 2024 17:23:41 +0100
Subject: [PATCH 006/131] few modif in ao tc integ

---
 .../local/bi_ort_ints/total_twoe_pot.irp.f    | 25 +++++++++++++++++--
 .../local/non_h_ints_mu/tc_integ_num.irp.f    |  2 +-
 .../local/non_h_ints_mu/total_tc_int.irp.f    |  3 ++-
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index 79bfd336..1e558038 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -50,7 +50,7 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
   call print_memory_usage()
 
   PROVIDE mo_r_coef mo_l_coef
-  PROVIDe ao_two_e_tc_tot
+  PROVIDE ao_two_e_tc_tot
 
   if(ao_to_mo_tc_n3) then
 
@@ -103,9 +103,30 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
       elseif(s == 11) then
         call wall_time(tt2)
         print*, ' 10 / mo_num done in (min)', (tt2-tt1)/60.d0
-        print*, ' estimated time required (min)', dble(mo_num-10)*(tt2-tt1)/600.d0
+        print*, ' estimated time required (min)', dble(mo_num-10)*(tt2-tt1)/(60.d0*10.d0)
+      elseif(s == 26) then
+        call wall_time(tt2)
+        print*, ' 25 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-25)*(tt2-tt1)/(60.d0*25.d0)
+      elseif(s == 51) then
+        call wall_time(tt2)
+        print*, ' 50 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-50)*(tt2-tt1)/(60.d0*50.d0)
+      elseif(s == 101) then
+        call wall_time(tt2)
+        print*, ' 100 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-100)*(tt2-tt1)/(60.d0*100.d0)
+      elseif(s == 201) then
+        call wall_time(tt2)
+        print*, ' 200 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-200)*(tt2-tt1)/(60.d0*200.d0)
+      elseif(s == 501) then
+        call wall_time(tt2)
+        print*, ' 500 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-500)*(tt2-tt1)/(60.d0*500.d0)
       endif
 
+
     enddo ! s
 
     deallocate(a_jkp, a_kpq, a_pqr)
diff --git a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
index e5d75c3d..6d446037 100644
--- a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
+++ b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
@@ -131,7 +131,7 @@
   deallocate(tmp)
 
   call wall_time(time1)
-  print*, ' wall time for int2_grad1_u12_ao_num & int2_grad1_u12_square_ao_num =', time1-time0 
+  print*, ' wall time for int2_grad1_u12_ao_num & int2_grad1_u12_square_ao_num = (min)', (time1-time0) / 60.d0
   call print_memory_usage()
 
 END_PROVIDER
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index ba078d9b..c7230dc3 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -201,7 +201,8 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
     !$OMP END DO
     !$OMP END PARALLEL
 
-    call clear_ao_map()
+    !call clear_ao_map()
+    FREE ao_integrals_map
 
     if(tc_integ_type .eq. "numeric") then
       FREE int2_grad1_u12_ao_num int2_grad1_u12_square_ao_num

From d872d60e70f8eedb3913f5566d4f35d198d4aad5 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Sun, 7 Apr 2024 00:29:40 +0200
Subject: [PATCH 007/131] saving olympe2 modif

---
 plugins/local/bi_ortho_mos/overlap.irp.f      |  8 +-
 .../lapack_diag_non_hermit.irp.f              |  1 +
 plugins/local/tc_scf/minimize_tc_angles.irp.f |  2 +-
 plugins/local/tc_scf/routines_rotates.irp.f   | 79 ++++++++++---------
 src/tools/print_detweights.irp.f              | 35 +++++++-
 src/utils/block_diag_degen.irp.f              |  2 +-
 6 files changed, 81 insertions(+), 46 deletions(-)

diff --git a/plugins/local/bi_ortho_mos/overlap.irp.f b/plugins/local/bi_ortho_mos/overlap.irp.f
index ff5d5c84..7f07929f 100644
--- a/plugins/local/bi_ortho_mos/overlap.irp.f
+++ b/plugins/local/bi_ortho_mos/overlap.irp.f
@@ -56,10 +56,10 @@
     print*,'Average trace of overlap_bi_ortho is different from 1 by ', dabs(accu_d-1.d0)
     print*,'And bi orthogonality is off by an average of ',accu_nd
     print*,'****************'
-    print*,'Overlap matrix betwee mo_l_coef and mo_r_coef  '
-    do i = 1, mo_num
-      write(*,'(100(F16.10,X))')overlap_bi_ortho(i,:)
-    enddo
+    !print*,'Overlap matrix betwee mo_l_coef and mo_r_coef  '
+    !do i = 1, mo_num
+    !  write(*,'(100(F16.10,X))')overlap_bi_ortho(i,:)
+    !enddo
   endif
   print*,'Average trace of overlap_bi_ortho (should be 1.)'
   print*,'accu_d  = ',accu_d
diff --git a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
index cb38347e..4d4bc047 100644
--- a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
+++ b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
@@ -2144,6 +2144,7 @@ subroutine impose_biorthog_degen_eigvec(n, deg_num, e0, L0, R0)
       enddo
       !print*,' accu_nd after = ', accu_nd
       if(accu_nd .gt. 1d-12) then
+        print*, ' accu_nd =', accu_nd
         print*, ' your strategy for degenerates orbitals failed !'
         print*, m, 'deg on', i
         stop
diff --git a/plugins/local/tc_scf/minimize_tc_angles.irp.f b/plugins/local/tc_scf/minimize_tc_angles.irp.f
index c7752930..e5f6cf87 100644
--- a/plugins/local/tc_scf/minimize_tc_angles.irp.f
+++ b/plugins/local/tc_scf/minimize_tc_angles.irp.f
@@ -20,7 +20,7 @@ program minimize_tc_angles
   ! TODO
   ! check if rotations of orbitals affect the TC energy
   ! and refuse the step
-  call minimize_tc_orb_angles
+  call minimize_tc_orb_angles()
 
 end
 
diff --git a/plugins/local/tc_scf/routines_rotates.irp.f b/plugins/local/tc_scf/routines_rotates.irp.f
index c42e846e..2c5510f2 100644
--- a/plugins/local/tc_scf/routines_rotates.irp.f
+++ b/plugins/local/tc_scf/routines_rotates.irp.f
@@ -40,9 +40,6 @@ subroutine LTxSxR(n, m, L, S, R, C)
 
 end subroutine LTxR
 
-! ---
-
-
 ! ---
 
 subroutine minimize_tc_orb_angles()
@@ -103,7 +100,10 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles)
   double precision, allocatable :: stmp(:,:), T(:,:), Snew(:,:), smat2(:,:)
   double precision, allocatable :: mo_l_coef_tmp(:,:), mo_r_coef_tmp(:,:), mo_l_coef_new(:,:)
 
-  E_thr = 1d-04
+  PROVIDE TC_HF_energy
+  PROVIDE mo_r_coef mo_l_coef
+
+  E_thr = 1d-07
   E_old = TC_HF_energy
   allocate(mo_l_coef_old(ao_num,mo_num), mo_r_coef_old(ao_num,mo_num))
   mo_r_coef_old = mo_r_coef
@@ -111,7 +111,7 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles)
 
   good_angles = .False.
 
-  allocate(mo_l_coef_good(ao_num, mo_num), mo_r_coef_good(ao_num,mo_num))
+  allocate(mo_l_coef_good(ao_num,mo_num), mo_r_coef_good(ao_num,mo_num))
 
   print *, ' ***************************************'
   print *, ' ***************************************'
@@ -123,7 +123,7 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles)
   mo_r_coef_good = mo_r_coef
   mo_l_coef_good = mo_l_coef
 
-  allocate(mo_r_coef_new(ao_num, mo_num))
+  allocate(mo_r_coef_new(ao_num,mo_num))
   mo_r_coef_new = mo_r_coef
   do i = 1, mo_num
     norm = 1.d0/dsqrt(overlap_mo_r(i,i))
@@ -141,10 +141,11 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles)
   call build_s_matrix(ao_num, mo_num, mo_r_coef_new, mo_r_coef_new, ao_overlap, s_mat)
 ! call give_degen(fock_diag,mo_num,thr_deg,list_degen,n_degen_list)
   if(n_core_orb.ne.0)then
-   call give_degen_full_listcore(fock_diag, mo_num, list_core, n_core_orb, thr_deg, list_degen, n_degen_list)
+    call give_degen_full_listcore(fock_diag, mo_num, list_core, n_core_orb, thr_deg, list_degen, n_degen_list)
   else
-   call give_degen_full_list(fock_diag, mo_num, thr_deg, list_degen, n_degen_list)
+    call give_degen_full_list(fock_diag, mo_num, thr_deg, list_degen, n_degen_list)
   endif
+
   print *, ' fock_matrix_mo'
   do i = 1, mo_num
     print *, i, fock_diag(i), angle_left_right(i)
@@ -156,50 +157,52 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles)
 !  n_degen = ilast - ifirst +1
 
     n_degen = list_degen(i,0)
-    if(n_degen .ge. 1000)n_degen = 1 ! convention for core orbitals
+    if(n_degen .ge. 1000) n_degen = 1 ! convention for core orbitals
      
     if(n_degen .eq. 1) cycle
+    print*, ' working on orbital', i
+    print*, ' multiplicity =', n_degen
 
     allocate(stmp(n_degen,n_degen), smat2(n_degen,n_degen))
     allocate(mo_r_coef_tmp(ao_num,n_degen), mo_l_coef_tmp(ao_num,n_degen), mo_l_coef_new(ao_num,n_degen))
     allocate(T(n_degen,n_degen), Snew(n_degen,n_degen))
 
     print*,'Right orbitals before'
-     do j = 1, n_degen
-       write(*,'(100(F16.10,X))') mo_r_coef_new(1:ao_num,list_degen(i,j))
-     enddo
+    do j = 1, n_degen
+      write(*,'(1000(F16.10,X))') mo_r_coef_new(1:ao_num,list_degen(i,j))
+    enddo
     print*,'Left orbitals before'
-     do j = 1, n_degen
-       write(*,'(100(F16.10,X))')mo_l_coef(1:ao_num,list_degen(i,j)) 
-     enddo
+    do j = 1, n_degen
+      write(*,'(1000(F16.10,X))') mo_l_coef(1:ao_num,list_degen(i,j)) 
+    enddo
     if(angle_left_right(list_degen(i,1)).gt.80.d0.and.n_degen==2)then
-       integer :: i_list, j_list
-       i_list = list_degen(i,1)
-       j_list = list_degen(i,2)
-       print*,'Huge angle !!! == ',angle_left_right(list_degen(i,1)),angle_left_right(list_degen(i,2))
-       print*,'i_list = ',i_list
-       print*,'i_list = ',j_list
-       print*,'Swapping left/right orbitals'
-       call print_strong_overlap(i_list, j_list)
-       mo_r_coef_tmp(1:ao_num,1) = mo_r_coef_new(1:ao_num,i_list)
-       mo_r_coef_tmp(1:ao_num,2) = mo_l_coef(1:ao_num,i_list)
-       mo_l_coef_tmp(1:ao_num,1) = mo_l_coef(1:ao_num,j_list)
-       mo_l_coef_tmp(1:ao_num,2) = mo_r_coef_new(1:ao_num,j_list)
+      integer :: i_list, j_list
+      i_list = list_degen(i,1)
+      j_list = list_degen(i,2)
+      print*,'Huge angle !!! == ',angle_left_right(list_degen(i,1)),angle_left_right(list_degen(i,2))
+      print*,'i_list = ',i_list
+      print*,'i_list = ',j_list
+      print*,'Swapping left/right orbitals'
+      call print_strong_overlap(i_list, j_list)
+      mo_r_coef_tmp(1:ao_num,1) = mo_r_coef_new(1:ao_num,i_list)
+      mo_r_coef_tmp(1:ao_num,2) = mo_l_coef(1:ao_num,i_list)
+      mo_l_coef_tmp(1:ao_num,1) = mo_l_coef(1:ao_num,j_list)
+      mo_l_coef_tmp(1:ao_num,2) = mo_r_coef_new(1:ao_num,j_list)
     else
-     do j = 1, n_degen
-       print*,'i_list = ',list_degen(i,j)
-       mo_r_coef_tmp(1:ao_num,j) = mo_r_coef_new(1:ao_num,list_degen(i,j))
-       mo_l_coef_tmp(1:ao_num,j) = mo_l_coef(1:ao_num,list_degen(i,j))
-     enddo
+      do j = 1, n_degen
+        print*,'i_list = ',list_degen(i,j)
+        mo_r_coef_tmp(1:ao_num,j) = mo_r_coef_new(1:ao_num,list_degen(i,j))
+        mo_l_coef_tmp(1:ao_num,j) = mo_l_coef(1:ao_num,list_degen(i,j))
+      enddo
     endif
     print*,'Right orbitals '
-     do j = 1, n_degen
-       write(*,'(100(F16.10,X))')mo_r_coef_tmp(1:ao_num,j) 
-     enddo
+    do j = 1, n_degen
+      write(*,'(1000(F16.10,X))') mo_r_coef_tmp(1:ao_num,j) 
+    enddo
     print*,'Left orbitals '
-     do j = 1, n_degen
-       write(*,'(100(F16.10,X))')mo_l_coef_tmp(1:ao_num,j) 
-     enddo
+    do j = 1, n_degen
+      write(*,'(100(F16.10,X))') mo_l_coef_tmp(1:ao_num,j) 
+    enddo
     ! Orthogonalization of right functions
     print *, ' Orthogonalization of RIGHT functions'
     print *, ' ------------------------------------'
diff --git a/src/tools/print_detweights.irp.f b/src/tools/print_detweights.irp.f
index d5b0f2c9..5e5f2bb1 100644
--- a/src/tools/print_detweights.irp.f
+++ b/src/tools/print_detweights.irp.f
@@ -5,7 +5,8 @@ program print_detweights
   read_wf = .True.
   touch read_wf
 
-  call main()
+  call print_exc()
+  !call main()
 
 end
 
@@ -41,6 +42,7 @@ subroutine main()
 
   do i = 1, N_det
     deg_sorted(i) = deg(ii(i))
+    print *, deg_sorted(i), c(i)
   enddo
 
   print *, ' saving psi'
@@ -52,7 +54,7 @@ subroutine main()
       print *, ' Error opening file!'
       stop
     endif
-    
+
     write(10) N_det
     write(10) deg_sorted
     write(10) c
@@ -63,4 +65,33 @@ subroutine main()
 
 end
 
+! ---
+
+subroutine print_exc()
+
+  implicit none
+
+  integer              :: i
+  integer, allocatable :: deg(:)
+
+  PROVIDE N_int
+  PROVIDE N_det
+  PROVIDE psi_det
+
+  allocate(deg(N_det))
+
+  do i = 1, N_det
+    call get_excitation_degree(psi_det(1,1,1), psi_det(1,1,i), deg(i), N_int)
+  enddo
+
+  open(unit=10, file="exc.dat", action="write")
+    write(10,*) N_det
+    write(10,*) deg
+  close(10)
+
+  deallocate(deg)
+
+end
+
+
 
diff --git a/src/utils/block_diag_degen.irp.f b/src/utils/block_diag_degen.irp.f
index 188bfa58..1a9ca8d6 100644
--- a/src/utils/block_diag_degen.irp.f
+++ b/src/utils/block_diag_degen.irp.f
@@ -191,7 +191,7 @@ subroutine give_degen_full_list(A, n, thr, list_degen, n_degen_list)
     list_degen(n_degen_list,1) = i
     icount = 1
     do j = i+1, n
-      if(dabs(A(i)-A(j)).lt.thr.and.is_ok(j)) then
+      if(dabs(A(i)-A(j)).lt.thr .and. is_ok(j)) then
         is_ok(j) = .False.
         icount += 1
         list_degen(n_degen_list,icount) = j

From f8bff471222ac9cd2e6f23342f7d7a7aff5d62cd Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Thu, 28 Mar 2024 15:27:11 +0100
Subject: [PATCH 008/131] added loops

---
 .../local/non_h_ints_mu/total_tc_int.irp.f    | 165 +++++++++++++-----
 1 file changed, 121 insertions(+), 44 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index c7230dc3..72fd0f53 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -65,27 +65,59 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
     PROVIDE int2_grad1_u12_square_ao
 
-    allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+    if(tc_save_mem) then
 
-    !$OMP PARALLEL               &
-    !$OMP DEFAULT (NONE)         &
-    !$OMP PRIVATE (i, k, ipoint) &
-    !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
-    !$OMP DO SCHEDULE (static)
-    do i = 1, ao_num
-      do k = 1, ao_num
-        do ipoint = 1, n_points_final_grid
-          c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      print*, ' LOOPS are used to evaluate Hermitian part of ao_two_e_tc_tot ...'
+
+      !$OMP PARALLEL                                              &
+      !$OMP DEFAULT (NONE)                                        &
+      !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
+      !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
+      !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
+      !$OMP DO COLLAPSE(4)
+      do i = 1, ao_num
+        do k = 1, ao_num
+          do l = 1, ao_num
+            do j = 1, ao_num
+              ao_two_e_tc_tot(j,l,k,i) = 0.d0
+              do ipoint = 1, n_points_final_grid
+                weight1 = final_weight_at_r_vector(ipoint)
+                ao_i_r = aos_in_r_array_transp(ipoint,i)
+                ao_k_r = aos_in_r_array_transp(ipoint,k)
+                ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) + int2_grad1_u12_square_ao(j,l,ipoint) * weight1 * ao_i_r * ao_k_r
+              enddo
+            enddo
+          enddo
         enddo
       enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
+      !$OMP END DO
+      !$OMP END PARALLEL
 
-    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0                 &
-              , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
-              , 0.d0, ao_two_e_tc_tot, ao_num*ao_num)
+    else
 
+      print*, ' DGEMM are used to evaluate Hermitian part of ao_two_e_tc_tot ...'
+
+      allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+      !$OMP PARALLEL               &
+      !$OMP DEFAULT (NONE)         &
+      !$OMP PRIVATE (i, k, ipoint) &
+      !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+      !$OMP DO SCHEDULE (static)
+      do i = 1, ao_num
+        do k = 1, ao_num
+          do ipoint = 1, n_points_final_grid
+            c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0                 &
+                , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+                , 0.d0, ao_two_e_tc_tot, ao_num*ao_num)
+      deallocate(c_mat)
+    endif
+  
     FREE int2_grad1_u12_square_ao
 
     if( (tc_integ_type .eq. "semi-analytic")                            .and. &
@@ -96,6 +128,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       ! an additional term is added here directly instead of 
       ! being added in int2_grad1_u12_square_ao for performance
 
+      allocate(c_mat(n_points_final_grid,ao_num,ao_num))
       PROVIDE int2_u2_env2
 
       !$OMP PARALLEL                                                                                     &
@@ -127,10 +160,13 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
                 , int2_u2_env2(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
                 , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
 
+      deallocate(c_mat)
       FREE int2_u2_env2
     endif ! use_ipp
 
-    deallocate(c_mat)
+    call wall_time(time1)
+    print*, ' done with Hermitian part after (min) ', (time1 - time0) / 60.d0
+    call print_memory_usage()
 
     ! ---
 
@@ -138,38 +174,73 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
       PROVIDE int2_grad1_u12_ao
 
-      allocate(b_mat(n_points_final_grid,ao_num,ao_num,3))
+      if(tc_save_mem) then
 
-      !$OMP PARALLEL                                                              &
-      !$OMP DEFAULT (NONE)                                                        &
-      !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
-      !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
-      !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
-      !$OMP DO SCHEDULE (static)
-      do i = 1, ao_num
-        do k = 1, ao_num
-          do ipoint = 1, n_points_final_grid
+        print*, ' LOOPS are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...'
 
-            weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
-            ao_i_r  = aos_in_r_array_transp(ipoint,i)
-            ao_k_r  = aos_in_r_array_transp(ipoint,k)
-
-            b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1))
-            b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2))
-            b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1)    &
+        !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot,    &
+        !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
+        !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
+        !$OMP DO COLLAPSE(4)
+        do i = 1, ao_num
+          do k = 1, ao_num
+            do l = 1, ao_num
+              do j = 1, ao_num
+                do ipoint = 1, n_points_final_grid
+                  weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+                  ao_i_r  = aos_in_r_array_transp(ipoint,i)
+                  ao_k_r  = aos_in_r_array_transp(ipoint,k)
+                  ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) &
+                                           + weight1 * int2_grad1_u12_ao(j,l,ipoint,1) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) &
+                                           + weight1 * int2_grad1_u12_ao(j,l,ipoint,2) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) &
+                                           + weight1 * int2_grad1_u12_ao(j,l,ipoint,3) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+                enddo
+              enddo
+            enddo
           enddo
         enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
+        !$OMP END DO
+        !$OMP END PARALLEL
 
-      do m = 1, 3
-        call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0             &
-                  , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
-                  , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
-      enddo
-      deallocate(b_mat)
+      else
 
+        print*, ' DGEMM are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...'
+
+        allocate(b_mat(n_points_final_grid,ao_num,ao_num,3))
+        !$OMP PARALLEL                                                              &
+        !$OMP DEFAULT (NONE)                                                        &
+        !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+        !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
+        !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
+        !$OMP DO SCHEDULE (static)
+        do i = 1, ao_num
+          do k = 1, ao_num
+            do ipoint = 1, n_points_final_grid
+  
+              weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+              ao_i_r  = aos_in_r_array_transp(ipoint,i)
+              ao_k_r  = aos_in_r_array_transp(ipoint,k)
+  
+              b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1))
+              b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2))
+              b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+            enddo
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+  
+        do m = 1, 3
+          call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0             &
+                    , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
+                    , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
+        enddo
+        deallocate(b_mat)
+
+      end if
       FREE int2_grad1_u12_ao
 
       if(tc_integ_type .eq. "semi-analytic") then 
@@ -178,16 +249,22 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
     endif ! var_tc
 
+    call wall_time(time1)
+    print*, ' done with non-Hermitian part after (min) ', (time1 - time0) / 60.d0
+    call print_memory_usage()
+
     ! ---
 
     call sum_A_At(ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
 
+    ! ---
+
     PROVIDE ao_integrals_map
 
     !$OMP PARALLEL DEFAULT(NONE)                            &
     !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
     !$OMP PRIVATE(i, j, k, l)
-    !$OMP DO
+    !$OMP DO COLLAPSE(4)
     do j = 1, ao_num
       do l = 1, ao_num
         do i = 1, ao_num

From 002aff90f5e6ecd7a4929eb48e75608d94f9e3a8 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Thu, 28 Mar 2024 17:05:00 +0100
Subject: [PATCH 009/131] working on mem reduction

---
 .../local/non_h_ints_mu/total_tc_int.irp.f    | 98 ++++++++++++-------
 plugins/local/tc_keywords/EZFIO.cfg           |  6 ++
 2 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index 72fd0f53..b8379006 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -33,8 +33,10 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
   double precision              :: weight1, ao_k_r, ao_i_r
   double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
   double precision              :: time0, time1
-  double precision, allocatable :: b_mat(:,:,:,:), c_mat(:,:,:)
+  double precision, allocatable :: c_mat(:,:,:)
+  logical,          external    :: ao_two_e_integral_zero
   double precision, external    :: get_ao_two_e_integral
+  double precision, external    :: ao_two_e_integral
 
   PROVIDe tc_integ_type
   PROVIDE env_type
@@ -194,9 +196,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
                   ao_i_r  = aos_in_r_array_transp(ipoint,i)
                   ao_k_r  = aos_in_r_array_transp(ipoint,k)
                   ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) &
-                                           + weight1 * int2_grad1_u12_ao(j,l,ipoint,1) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) &
-                                           + weight1 * int2_grad1_u12_ao(j,l,ipoint,2) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) &
-                                           + weight1 * int2_grad1_u12_ao(j,l,ipoint,3) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+                                           - weight1 * int2_grad1_u12_ao(j,l,ipoint,1) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) &
+                                           - weight1 * int2_grad1_u12_ao(j,l,ipoint,2) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) &
+                                           - weight1 * int2_grad1_u12_ao(j,l,ipoint,3) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
                 enddo
               enddo
             enddo
@@ -209,39 +211,37 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
         print*, ' DGEMM are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...'
 
-        allocate(b_mat(n_points_final_grid,ao_num,ao_num,3))
-        !$OMP PARALLEL                                                              &
-        !$OMP DEFAULT (NONE)                                                        &
-        !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
-        !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
-        !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
-        !$OMP DO SCHEDULE (static)
-        do i = 1, ao_num
-          do k = 1, ao_num
-            do ipoint = 1, n_points_final_grid
+        allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+        do m = 1, 3
+          !$OMP PARALLEL                                                              &
+          !$OMP DEFAULT (NONE)                                                        &
+          !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+          !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & 
+          !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
+          !$OMP DO SCHEDULE (static)
+          do i = 1, ao_num
+            do k = 1, ao_num
+              do ipoint = 1, n_points_final_grid
   
-              weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
-              ao_i_r  = aos_in_r_array_transp(ipoint,i)
-              ao_k_r  = aos_in_r_array_transp(ipoint,k)
+                weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+                ao_i_r  = aos_in_r_array_transp(ipoint,i)
+                ao_k_r  = aos_in_r_array_transp(ipoint,k)
   
-              b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1))
-              b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2))
-              b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+                c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
+              enddo
             enddo
           enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
-  
-        do m = 1, 3
-          call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0             &
-                    , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
+                    , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
                     , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
         enddo
-        deallocate(b_mat)
+        deallocate(c_mat)
 
       end if
-      FREE int2_grad1_u12_ao
+      !FREE int2_grad1_u12_ao
 
       if(tc_integ_type .eq. "semi-analytic") then 
         FREE int2_grad1_u2e_ao
@@ -258,19 +258,26 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
     call sum_A_At(ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
 
     ! ---
+  
+    logical          :: integ_zero
+    double precision :: integ_val
 
-    PROVIDE ao_integrals_map
+    print*, ' adding ERI to ao_two_e_tc_tot ...'
 
-    !$OMP PARALLEL DEFAULT(NONE)                            &
-    !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
-    !$OMP PRIVATE(i, j, k, l)
+    !$OMP PARALLEL DEFAULT(NONE)                     &
+    !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
+    !$OMP SHARED(ao_num, ao_two_e_tc_tot)
     !$OMP DO COLLAPSE(4)
     do j = 1, ao_num
       do l = 1, ao_num
         do i = 1, ao_num
           do k = 1, ao_num
-            !                                                     < 1:i, 2:j | 1:k, 2:l > 
-            ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+            integ_zero = ao_two_e_integral_zero(i,j,k,l)
+            if(.not. integ_zero) then
+                          ! i,k : r1    j,l : r2
+              integ_val = ao_two_e_integral(i,k,j,l)
+              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + integ_val
+            endif
           enddo
         enddo
       enddo
@@ -278,8 +285,25 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
     !$OMP END DO
     !$OMP END PARALLEL
 
-    !call clear_ao_map()
-    FREE ao_integrals_map
+    !PROVIDE ao_integrals_map
+    !!$OMP PARALLEL DEFAULT(NONE)                            &
+    !!$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
+    !!$OMP PRIVATE(i, j, k, l)
+    !!$OMP DO COLLAPSE(4)
+    !do j = 1, ao_num
+    !  do l = 1, ao_num
+    !    do i = 1, ao_num
+    !      do k = 1, ao_num
+    !        !                                                     < 1:i, 2:j | 1:k, 2:l > 
+    !        ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+    !      enddo
+    !    enddo
+    !  enddo
+    !enddo
+    !!$OMP END DO
+    !!$OMP END PARALLEL
+    !!call clear_ao_map()
+    !FREE ao_integrals_map
 
     if(tc_integ_type .eq. "numeric") then
       FREE int2_grad1_u12_ao_num int2_grad1_u12_square_ao_num
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index a8491660..24362796 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -286,4 +286,10 @@ doc: If |true|, memory scale of TC ao -> mo: O(N3)
 interface: ezfio,provider,ocaml
 default: False
 
+[tc_save_mem]
+type: logical
+doc: If |true|, use loops to save memory TC
+interface: ezfio,provider,ocaml
+default: False
+
 

From 1a36d974b0bd5cd0c06453a15c96a8492c4baecc Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Sun, 7 Apr 2024 00:33:09 +0200
Subject: [PATCH 010/131] saving lcpq-ampere

---
 .../local/bi_ort_ints/semi_num_ints_mo.irp.f  |  68 ++++---
 .../bi_ort_ints/three_body_ints_bi_ort.irp.f  |   2 +-
 .../local/bi_ort_ints/total_twoe_pot.irp.f    |  87 ++++-----
 plugins/local/non_h_ints_mu/tc_integ.irp.f    | 175 ++++++++++++++++--
 .../local/non_h_ints_mu/total_tc_int.irp.f    |  93 +++++-----
 plugins/local/tc_keywords/EZFIO.cfg           |   8 +-
 .../local/tc_scf/fock_3e_bi_ortho_cs.irp.f    | 121 +++++++-----
 .../local/tc_scf/fock_3e_bi_ortho_uhf.irp.f   |   2 +-
 plugins/local/tc_scf/fock_tc.irp.f            |   2 +-
 .../local/tc_scf/write_ao_2e_tc_integ.irp.f   |  58 ++++++
 .../multi_s_dipole_moment.irp.f               |  22 ++-
 11 files changed, 442 insertions(+), 196 deletions(-)
 create mode 100644 plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f

diff --git a/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f b/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f
index 51f0cba4..77e4cb9b 100644
--- a/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f
@@ -107,8 +107,8 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
   integer          :: i, j, ipoint
   double precision :: wall0, wall1
 
-  print *, ' providing int2_grad1_u12_ao_transp ...'
-  call wall_time(wall0)
+  !print *, ' providing int2_grad1_u12_ao_transp ...'
+  !call wall_time(wall0)
 
   if(test_cycle_tc) then
 
@@ -142,15 +142,15 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3,
 
   endif
 
-  call wall_time(wall1)
-  print *, ' wall time for int2_grad1_u12_ao_transp ', wall1 - wall0
-  call print_memory_usage()
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_ao_transp (min) = ', (wall1 - wall0) / 60.d0
+  !call print_memory_usage()
 
 END_PROVIDER 
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)]
+BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)]
 
   implicit none
   integer          :: ipoint
@@ -159,7 +159,7 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_ao_transp
 
-  !print *, ' providing int2_grad1_u12_bimo_transp'
+  !print *, ' providing int2_grad1_u12_bimo_transp ...'
   !call wall_time(wall0)
 
   !$OMP PARALLEL         &
@@ -167,33 +167,35 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num,
   !$OMP PRIVATE (ipoint) & 
   !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao_transp,int2_grad1_u12_bimo_transp)
   !$OMP DO SCHEDULE (dynamic)
-   do ipoint = 1, n_points_final_grid
-     call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,1,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
-                           , int2_grad1_u12_bimo_transp(1,1,1,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
-     call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,2,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
-                           , int2_grad1_u12_bimo_transp(1,1,2,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
-     call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,3,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
-                           , int2_grad1_u12_bimo_transp(1,1,3,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
-   enddo
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,1,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
+                          , int2_grad1_u12_bimo_transp(1,1,1,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
+    call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,2,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
+                          , int2_grad1_u12_bimo_transp(1,1,2,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
+    call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,3,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
+                          , int2_grad1_u12_bimo_transp(1,1,3,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
+  enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  !FREE int2_grad1_u12_ao_transp
+
   !call wall_time(wall1)
-  !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
+  !print *, ' wall time for int2_grad1_u12_bimo_transp (min) =', (wall1 - wall0) / 60.d0
   !call print_memory_usage()
 
 END_PROVIDER 
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)]
+BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)]
 
   implicit none
   integer          :: i, j, ipoint
   double precision :: wall0, wall1
 
   !call wall_time(wall0)
-  !print *, ' Providing int2_grad1_u12_bimo_t ...'
+  !print *, ' providing int2_grad1_u12_bimo_t ...'
 
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE int2_grad1_u12_bimo_transp
@@ -211,17 +213,21 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid,
   FREE int2_grad1_u12_bimo_transp
 
   !call wall_time(wall1)
-  !print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
+  !print *, ' wall time for int2_grad1_u12_bimo_t (min) =', (wall1 - wall0) / 60.d0
   !call print_memory_usage()
 
 END_PROVIDER 
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3, ao_num, ao_num)]
+BEGIN_PROVIDER [double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3, ao_num, ao_num)]
 
   implicit none
-  integer :: i, j, ipoint
+  integer          :: i, j, ipoint
+  double precision :: wall0, wall1
+
+  !call wall_time(wall0)
+  !print *, ' providing int2_grad1_u12_ao_t ...'
 
   PROVIDE int2_grad1_u12_ao
 
@@ -235,6 +241,10 @@ BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3,
     enddo
   enddo
 
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_ao_t (min) =', (wall1 - wall0) / 60.d0
+  !call print_memory_usage()
+
 END_PROVIDER 
 
 ! ---
@@ -275,8 +285,8 @@ BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk, (n_points_final_grid,
   double precision :: xyz
   double precision :: wall0, wall1
  
-  print*, ' providing x_W_ki_bi_ortho_erf_rk ...'
-  call wall_time(wall0)
+  !print*, ' providing x_W_ki_bi_ortho_erf_rk ...'
+  !call wall_time(wall0)
 
  !$OMP PARALLEL                   &
  !$OMP DEFAULT (NONE)             &
@@ -300,8 +310,8 @@ BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk, (n_points_final_grid,
  ! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu_transp 
  ! FREE mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp
 
-  call wall_time(wall1)
-  print *, ' time to provide x_W_ki_bi_ortho_erf_rk = ', wall1 - wall0
+  !call wall_time(wall1)
+  !print *, ' time to provide x_W_ki_bi_ortho_erf_rk = ', wall1 - wall0
 
 END_PROVIDER 
 
@@ -323,8 +333,8 @@ BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk_diag, (n_points_final_
   double precision :: xyz
   double precision :: wall0, wall1
  
-  print*,'providing x_W_ki_bi_ortho_erf_rk_diag ...'
-  call wall_time(wall0)
+  !print*,'providing x_W_ki_bi_ortho_erf_rk_diag ...'
+  !call wall_time(wall0)
 
  !$OMP PARALLEL                 &
  !$OMP DEFAULT (NONE)           &
@@ -343,8 +353,8 @@ BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk_diag, (n_points_final_
  !$OMP END DO
  !$OMP END PARALLEL
 
-  call wall_time(wall1)
-  print*,'time to provide x_W_ki_bi_ortho_erf_rk_diag = ',wall1 - wall0
+  !call wall_time(wall1)
+  !print*,'time to provide x_W_ki_bi_ortho_erf_rk_diag = ',wall1 - wall0
 
 END_PROVIDER 
 
diff --git a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
index 726e48ba..fd4a162f 100644
--- a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -168,7 +168,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
     integral = integral + tmp * final_weight_at_r_vector(ipoint)
   enddo
 
-end subroutine give_integrals_3_body_bi_ort
+end
 
 ! ---
 
diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index 1e558038..e34a7b7b 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -16,10 +16,10 @@ double precision function bi_ortho_mo_ints(l, k, j, i)
   integer             :: m, n, p, q
 
   bi_ortho_mo_ints = 0.d0
-  do m = 1, ao_num
-    do p = 1, ao_num
-      do n = 1, ao_num
-        do q = 1, ao_num
+  do p = 1, ao_num
+    do m = 1, ao_num
+      do q = 1, ao_num
+        do n = 1, ao_num
           !                                   p1h1p2h2   l1                  l2              r1               r2
           bi_ortho_mo_ints += ao_two_e_tc_tot(n,q,m,p) * mo_l_coef(m,l) * mo_l_coef(n,k) * mo_r_coef(p,j) * mo_r_coef(q,i)
         enddo
@@ -27,7 +27,7 @@ double precision function bi_ortho_mo_ints(l, k, j, i)
     enddo
   enddo
 
-end function bi_ortho_mo_ints
+end
 
 ! ---
 
@@ -43,93 +43,68 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
   integer                       :: i, j, k, l, m, n, p, q, s, r
   double precision              :: t1, t2, tt1, tt2
   double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:)
-  double precision, allocatable :: a_jkp(:,:,:), a_kpq(:,:,:), a_pqr(:,:,:)
+  double precision, allocatable :: a_jkp(:,:,:), a_kpq(:,:,:), ao_two_e_tc_tot_tmp(:,:,:)
 
   print *, ' PROVIDING mo_bi_ortho_tc_two_e_chemist ...'
   call wall_time(t1)
   call print_memory_usage()
 
   PROVIDE mo_r_coef mo_l_coef
-  PROVIDE ao_two_e_tc_tot
 
   if(ao_to_mo_tc_n3) then
 
     print*, ' memory scale of TC ao -> mo: O(N3) '
 
+    if(.not.read_tc_integ) then
+       stop 'read_tc_integ needs to be set to true'
+    endif
+
     allocate(a_jkp(ao_num,ao_num,mo_num))
     allocate(a_kpq(ao_num,mo_num,mo_num))
-    allocate(a_pqr(mo_num,mo_num,mo_num))
+    allocate(ao_two_e_tc_tot_tmp(ao_num,ao_num,ao_num))
+
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read")
 
     call wall_time(tt1)
 
-    do s = 1, mo_num
+    mo_bi_ortho_tc_two_e_chemist(:,:,:,:) = 0.d0
+    do l = 1, ao_num
+      read(11) ao_two_e_tc_tot_tmp(:,:,:)
 
-      mo_bi_ortho_tc_two_e_chemist(:,:,:,s) = 0.d0
-      do l = 1, ao_num
+      do s = 1, mo_num
 
-        call dgemm( 'T', 'N', ao_num*ao_num, mo_num, ao_num, 1.d0 &
-                  , ao_two_e_tc_tot(1,1,1,l), ao_num, mo_l_coef(1,1), ao_num  &
+        call dgemm( 'T', 'N', ao_num*ao_num, mo_num, ao_num, 1.d0              &
+                  , ao_two_e_tc_tot_tmp(1,1,1), ao_num, mo_l_coef(1,1), ao_num &
                   , 0.d0, a_jkp(1,1,1), ao_num*ao_num)
-  
+
         call dgemm( 'T', 'N', ao_num*mo_num, mo_num, ao_num, 1.d0 &
                   , a_jkp(1,1,1), ao_num, mo_r_coef(1,1), ao_num  &
                   , 0.d0, a_kpq(1,1,1), ao_num*mo_num)
-  
-        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, ao_num, 1.d0 &
-                  , a_kpq(1,1,1), ao_num, mo_l_coef(1,1), ao_num  &
-                  , 0.d0, a_pqr(1,1,1), mo_num*mo_num)
 
-        !$OMP PARALLEL         &
-        !$OMP DEFAULT(NONE)    &
-        !$OMP PRIVATE(p, q, r) &
-        !$OMP SHARED(s, l, mo_num, mo_bi_ortho_tc_two_e_chemist, mo_r_coef, a_pqr)
-        !$OMP DO COLLAPSE(2)
-        do p = 1, mo_num
-          do q = 1, mo_num
-            do r = 1, mo_num
-              mo_bi_ortho_tc_two_e_chemist(p,q,r,s) = mo_bi_ortho_tc_two_e_chemist(p,q,r,s) + mo_r_coef(l,s) * a_pqr(p,q,r)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO
-        !$OMP END PARALLEL
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, ao_num, mo_r_coef(l,s) &
+                  , a_kpq(1,1,1), ao_num, mo_l_coef(1,1), ao_num            &
+                  , 1.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,s), mo_num*mo_num)
 
-      enddo ! l
+      enddo ! s
 
-      if(s == 2) then
+      if(l == 2) then
         call wall_time(tt2)
         print*, ' 1 / mo_num done in (min)', (tt2-tt1)/60.d0
         print*, ' estimated time required (min)', dble(mo_num-1)*(tt2-tt1)/60.d0
-      elseif(s == 11) then
+      elseif(l == 11) then
         call wall_time(tt2)
         print*, ' 10 / mo_num done in (min)', (tt2-tt1)/60.d0
         print*, ' estimated time required (min)', dble(mo_num-10)*(tt2-tt1)/(60.d0*10.d0)
-      elseif(s == 26) then
-        call wall_time(tt2)
-        print*, ' 25 / mo_num done in (min)', (tt2-tt1)/60.d0
-        print*, ' estimated time required (min)', dble(mo_num-25)*(tt2-tt1)/(60.d0*25.d0)
-      elseif(s == 51) then
-        call wall_time(tt2)
-        print*, ' 50 / mo_num done in (min)', (tt2-tt1)/60.d0
-        print*, ' estimated time required (min)', dble(mo_num-50)*(tt2-tt1)/(60.d0*50.d0)
-      elseif(s == 101) then
+      elseif(l == 101) then
         call wall_time(tt2)
         print*, ' 100 / mo_num done in (min)', (tt2-tt1)/60.d0
         print*, ' estimated time required (min)', dble(mo_num-100)*(tt2-tt1)/(60.d0*100.d0)
-      elseif(s == 201) then
-        call wall_time(tt2)
-        print*, ' 200 / mo_num done in (min)', (tt2-tt1)/60.d0
-        print*, ' estimated time required (min)', dble(mo_num-200)*(tt2-tt1)/(60.d0*200.d0)
-      elseif(s == 501) then
-        call wall_time(tt2)
-        print*, ' 500 / mo_num done in (min)', (tt2-tt1)/60.d0
-        print*, ' estimated time required (min)', dble(mo_num-500)*(tt2-tt1)/(60.d0*500.d0)
       endif
+    enddo ! l
 
+    close(11)
 
-    enddo ! s
-
-    deallocate(a_jkp, a_kpq, a_pqr)
+    deallocate(a_jkp, a_kpq, ao_two_e_tc_tot_tmp)
 
   else
 
@@ -141,6 +116,8 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
               , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
               , 0.d0, a2(1,1,1,1), ao_num*ao_num*ao_num)
   
+    FREE ao_two_e_tc_tot
+
     allocate(a1(ao_num,ao_num,mo_num,mo_num))
   
     call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
diff --git a/plugins/local/non_h_ints_mu/tc_integ.irp.f b/plugins/local/non_h_ints_mu/tc_integ.irp.f
index 775a9e4c..58e3db48 100644
--- a/plugins/local/non_h_ints_mu/tc_integ.irp.f
+++ b/plugins/local/non_h_ints_mu/tc_integ.irp.f
@@ -44,14 +44,92 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
     elseif(tc_integ_type .eq. "numeric") then
 
       print *, ' Numerical integration over r1 and r2 will be performed'
-  
-      ! TODO combine 1shot & int2_grad1_u12_ao_num
 
-      PROVIDE int2_grad1_u12_ao_num
-      int2_grad1_u12_ao = int2_grad1_u12_ao_num
+      if(tc_save_mem) then
 
-      !PROVIDE int2_grad1_u12_ao_num_1shot
-      !int2_grad1_u12_ao = int2_grad1_u12_ao_num_1shot
+        integer                       :: n_blocks, n_rest, n_pass
+        integer                       :: i_blocks, i_rest, i_pass, ii
+        double precision              :: mem, n_double
+        double precision, allocatable :: tmp(:,:,:), xx(:)
+        double precision, allocatable :: tmp_grad1_u12(:,:,:)
+
+        PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+
+        allocate(tmp(n_points_extra_final_grid,ao_num,ao_num), xx(n_points_extra_final_grid))
+        !$OMP PARALLEL               &
+        !$OMP DEFAULT (NONE)         &
+        !$OMP PRIVATE (j, i, jpoint) &
+        !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+        !$OMP DO COLLAPSE(2)
+        do j = 1, ao_num
+          do i = 1, ao_num
+            do jpoint = 1, n_points_extra_final_grid
+              tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+            enddo
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        call total_memory(mem)
+        mem      = max(1.d0, qp_max_mem - mem)
+        n_double = mem * 1.d8
+        n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+        n_rest   = int(mod(n_points_final_grid, n_blocks))
+        n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+        call write_int(6, n_pass, 'Number of passes')
+        call write_int(6, n_blocks, 'Size of the blocks')
+        call write_int(6, n_rest, 'Size of the last block')
+        allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,3))
+        do i_pass = 1, n_pass
+          ii = (i_pass-1)*n_blocks + 1
+          !$OMP PARALLEL                   &
+          !$OMP DEFAULT (NONE)             &
+          !$OMP PRIVATE (i_blocks, ipoint) &
+          !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, xx, tmp_grad1_u12)
+          !$OMP DO 
+          do i_blocks = 1, n_blocks
+            ipoint = ii - 1 + i_blocks ! r1
+            call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), xx(1))
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+          do m = 1, 3
+            call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                      , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                      , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+          enddo
+        enddo
+        deallocate(tmp_grad1_u12)
+        if(n_rest .gt. 0) then
+          allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,3))
+          ii = n_pass*n_blocks + 1
+          !$OMP PARALLEL                 &
+          !$OMP DEFAULT (NONE)           &
+          !$OMP PRIVATE (i_rest, ipoint) &
+          !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, xx, tmp_grad1_u12)
+          !$OMP DO 
+          do i_rest = 1, n_rest
+            ipoint = ii - 1 + i_rest ! r1
+            call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), xx(1))
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+          do m = 1, 3
+            call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                      , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                      , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+          enddo
+          deallocate(tmp_grad1_u12)
+        endif
+        deallocate(tmp,xx)
+
+      else
+        ! TODO combine 1shot & int2_grad1_u12_ao_num
+        PROVIDE int2_grad1_u12_ao_num
+        int2_grad1_u12_ao = int2_grad1_u12_ao_num
+        !PROVIDE int2_grad1_u12_ao_num_1shot
+        !int2_grad1_u12_ao = int2_grad1_u12_ao_num_1shot
+      endif
 
     elseif(tc_integ_type .eq. "semi-analytic") then
 
@@ -177,13 +255,88 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p
 
     print *, ' Numerical integration over r1 and r2 will be performed'
   
-    ! TODO combine 1shot & int2_grad1_u12_square_ao_num
+    if(tc_save_mem) then
 
-    PROVIDE int2_grad1_u12_square_ao_num
-    int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num
+      integer                       :: n_blocks, n_rest, n_pass
+      integer                       :: i_blocks, i_rest, i_pass, ii
+      double precision              :: mem, n_double
+      double precision, allocatable :: tmp(:,:,:), xx(:,:,:)
+      double precision, allocatable :: tmp_grad1_u12_squared(:,:)
 
-    !PROVIDE int2_grad1_u12_square_ao_num_1shot
-    !int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num_1shot
+      PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+
+      allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+      !$OMP PARALLEL               &
+      !$OMP DEFAULT (NONE)         &
+      !$OMP PRIVATE (j, i, jpoint) &
+      !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+      !$OMP DO COLLAPSE(2)
+      do j = 1, ao_num
+        do i = 1, ao_num
+          do jpoint = 1, n_points_extra_final_grid
+            tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      call total_memory(mem)
+      mem      = max(1.d0, qp_max_mem - mem)
+      n_double = mem * 1.d8
+      n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+      n_rest   = int(mod(n_points_final_grid, n_blocks))
+      n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+      call write_int(6, n_pass, 'Number of passes')
+      call write_int(6, n_blocks, 'Size of the blocks')
+      call write_int(6, n_rest, 'Size of the last block')
+      allocate(tmp_grad1_u12_squared(n_points_extra_final_grid,n_blocks), xx(n_points_extra_final_grid,n_blocks,3))
+      do i_pass = 1, n_pass
+        ii = (i_pass-1)*n_blocks + 1
+        !$OMP PARALLEL                   &
+        !$OMP DEFAULT (NONE)             &
+        !$OMP PRIVATE (i_blocks, ipoint) &
+        !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, xx, final_grid_points, tmp_grad1_u12_squared)
+        !$OMP DO 
+        do i_blocks = 1, n_blocks
+          ipoint = ii - 1 + i_blocks ! r1
+          call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, xx(1,i_blocks,1), xx(1,i_blocks,2), xx(1,i_blocks,3), tmp_grad1_u12_squared(1,i_blocks))
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, -0.5d0                         &
+                  , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12_squared(1,1), n_points_extra_final_grid &
+                  , 0.d0, int2_grad1_u12_square_ao(1,1,ii), ao_num*ao_num)
+      enddo
+      deallocate(tmp_grad1_u12_squared, xx)
+      if(n_rest .gt. 0) then
+        ii = n_pass*n_blocks + 1
+        allocate(tmp_grad1_u12_squared(n_points_extra_final_grid,n_rest), xx(n_points_extra_final_grid,n_rest,3))
+        !$OMP PARALLEL                 &
+        !$OMP DEFAULT (NONE)           &
+        !$OMP PRIVATE (i_rest, ipoint) &
+        !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, xx, final_grid_points, tmp_grad1_u12_squared)
+        !$OMP DO 
+        do i_rest = 1, n_rest
+          ipoint = ii - 1 + i_rest ! r1
+          call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, xx(1,i_rest,1), xx(1,i_rest,2), xx(1,i_rest,3), tmp_grad1_u12_squared(1,i_rest))
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, -0.5d0                           &
+                  , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12_squared(1,1), n_points_extra_final_grid &
+                  , 0.d0, int2_grad1_u12_square_ao(1,1,ii), ao_num*ao_num)
+        deallocate(tmp_grad1_u12_squared, xx)
+      endif
+      deallocate(tmp)
+
+    else
+
+      ! TODO combine 1shot & int2_grad1_u12_square_ao_num
+      PROVIDE int2_grad1_u12_square_ao_num
+      int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num
+      !PROVIDE int2_grad1_u12_square_ao_num_1shot
+      !int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num_1shot
+    endif
 
   elseif(tc_integ_type .eq. "semi-analytic") then
 
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index b8379006..a1bbd6e0 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -55,7 +55,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
     print*, ' Reading ao_two_e_tc_tot from ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
 
     open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read")
-      read(11) ao_two_e_tc_tot
+    do i = 1, ao_num
+      read(11) ao_two_e_tc_tot(:,:,:,i)
+    enddo
     close(11)
 
   else
@@ -67,7 +69,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
     PROVIDE int2_grad1_u12_square_ao
 
-    if(tc_save_mem) then
+    if(tc_save_mem_loops) then
 
       print*, ' LOOPS are used to evaluate Hermitian part of ao_two_e_tc_tot ...'
 
@@ -176,7 +178,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
       PROVIDE int2_grad1_u12_ao
 
-      if(tc_save_mem) then
+      if(tc_save_mem_loops) then
 
         print*, ' LOOPS are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...'
 
@@ -241,7 +243,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
         deallocate(c_mat)
 
       end if
-      !FREE int2_grad1_u12_ao
 
       if(tc_integ_type .eq. "semi-analytic") then 
         FREE int2_grad1_u2e_ao
@@ -264,48 +265,52 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
 
     print*, ' adding ERI to ao_two_e_tc_tot ...'
 
-    !$OMP PARALLEL DEFAULT(NONE)                     &
-    !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
-    !$OMP SHARED(ao_num, ao_two_e_tc_tot)
-    !$OMP DO COLLAPSE(4)
-    do j = 1, ao_num
-      do l = 1, ao_num
-        do i = 1, ao_num
-          do k = 1, ao_num
-            integ_zero = ao_two_e_integral_zero(i,j,k,l)
-            if(.not. integ_zero) then
-                          ! i,k : r1    j,l : r2
-              integ_val = ao_two_e_integral(i,k,j,l)
-              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + integ_val
-            endif
+    if(tc_save_mem) then
+      print*, ' ao_integrals_map will not be used'
+      !$OMP PARALLEL DEFAULT(NONE)                     &
+      !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot)
+      !$OMP DO COLLAPSE(4)
+      do j = 1, ao_num
+        do l = 1, ao_num
+          do i = 1, ao_num
+            do k = 1, ao_num
+              integ_zero = ao_two_e_integral_zero(i,j,k,l)
+              if(.not. integ_zero) then
+                            ! i,k : r1    j,l : r2
+                integ_val = ao_two_e_integral(i,k,j,l)
+                ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + integ_val
+              endif
+            enddo
           enddo
         enddo
       enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
+      !$OMP END DO
+      !$OMP END PARALLEL
+    else
+      print*, ' ao_integrals_map will be used'
+      PROVIDE ao_integrals_map
+      !$OMP PARALLEL DEFAULT(NONE)                            &
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
+      !$OMP PRIVATE(i, j, k, l)
+      !$OMP DO COLLAPSE(4)
+      do j = 1, ao_num
+        do l = 1, ao_num
+          do i = 1, ao_num
+            do k = 1, ao_num
+              !                                                     < 1:i, 2:j | 1:k, 2:l > 
+              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      !call clear_ao_map()
+      FREE ao_integrals_map
+    endif
 
-    !PROVIDE ao_integrals_map
-    !!$OMP PARALLEL DEFAULT(NONE)                            &
-    !!$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
-    !!$OMP PRIVATE(i, j, k, l)
-    !!$OMP DO COLLAPSE(4)
-    !do j = 1, ao_num
-    !  do l = 1, ao_num
-    !    do i = 1, ao_num
-    !      do k = 1, ao_num
-    !        !                                                     < 1:i, 2:j | 1:k, 2:l > 
-    !        ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
-    !      enddo
-    !    enddo
-    !  enddo
-    !enddo
-    !!$OMP END DO
-    !!$OMP END PARALLEL
-    !!call clear_ao_map()
-    !FREE ao_integrals_map
-
-    if(tc_integ_type .eq. "numeric") then
+    if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then
       FREE int2_grad1_u12_ao_num int2_grad1_u12_square_ao_num
     endif
 
@@ -315,7 +320,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
     print*, ' Saving ao_two_e_tc_tot in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
     open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
     call ezfio_set_work_empty(.False.)
-      write(11) ao_two_e_tc_tot
+    do i = 1, ao_num
+      write(11) ao_two_e_tc_tot(:,:,:,i)
+    enddo
     close(11)
     call ezfio_set_tc_keywords_io_tc_integ('Read')
   endif
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index 24362796..70169189 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -286,10 +286,16 @@ doc: If |true|, memory scale of TC ao -> mo: O(N3)
 interface: ezfio,provider,ocaml
 default: False
 
-[tc_save_mem]
+[tc_save_mem_loops]
 type: logical
 doc: If |true|, use loops to save memory TC
 interface: ezfio,provider,ocaml
 default: False
 
+[tc_save_mem]
+type: logical
+doc: If |true|, more calc but less mem
+interface: ezfio,provider,ocaml
+default: False
+
 
diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f
index 0b883865..8fd5e5b6 100644
--- a/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f
+++ b/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f
@@ -9,7 +9,7 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
   double precision              :: loc_1, loc_2, loc_3
   double precision, allocatable :: Okappa(:), Jkappa(:,:)
   double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:)
-  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:)
+  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:), tmp_22(:,:,:)
   double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:)
 
   PROVIDE mo_l_coef mo_r_coef
@@ -63,17 +63,13 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
   allocate(tmp_1(n_points_final_grid,4))
 
   do ipoint = 1, n_points_final_grid
-
     loc_1 = 2.d0 * Okappa(ipoint) 
-
     tmp_1(ipoint,1) = loc_1 * Jkappa(ipoint,1)
     tmp_1(ipoint,2) = loc_1 * Jkappa(ipoint,2)
     tmp_1(ipoint,3) = loc_1 * Jkappa(ipoint,3)
-
     tmp_1(ipoint,4) = Okappa(ipoint)
   enddo
 
-
   !$OMP PARALLEL                                                  &
   !$OMP DEFAULT (NONE)                                            &
   !$OMP PRIVATE (ipoint, i, j, loc_1, tmp_omp_d2)                 &
@@ -112,58 +108,81 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
 
   ! ---
 
-  allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num))
+  if(tc_save_mem) then
 
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, a, b)                                    &
-  !$OMP SHARED (n_points_final_grid, mo_num,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp_2)
-  !$OMP DO COLLAPSE(2)
-  do a = 1, mo_num
-    do b = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
-        tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
-        tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                 &
-  !$OMP DEFAULT (NONE)                                           &
-  !$OMP PRIVATE (ipoint, a, b, i)                                &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,      &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t, &
-  !$OMP         tmp_2)
-  !$OMP DO COLLAPSE(2)
-  do a = 1, mo_num
-    do b = 1, mo_num
-      tmp_2(:,4,b,a) = 0.d0
-      do i = 1, elec_beta_num
+    allocate(tmp_22(n_points_final_grid,4,mo_num))
+    do a = 1, mo_num
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, b, i)                                    &
+      !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, a,    &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp_22)
+      !$OMP DO
+      do b = 1, mo_num
         do ipoint = 1, n_points_final_grid
-          tmp_2(ipoint,4,b,a) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
-                                                                    + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
-                                                                    + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+          tmp_22(ipoint,1,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
+          tmp_22(ipoint,2,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
+          tmp_22(ipoint,3,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
+        enddo
+        tmp_22(:,4,b) = 0.d0
+        do i = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+            tmp_22(ipoint,4,b) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                     + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                     + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      call dgemv( 'T', 4*n_points_final_grid, mo_num, -2.d0        &
+                , tmp_22(1,1,1), size(tmp_22, 1) * size(tmp_22, 2) &
+                , tmp_1(1,1), 1                                    &
+                , 0.d0, fock_3e_uhf_mo_cs(1,a), 1)
+    enddo
+    deallocate(tmp_22)
+
+  else
+
+    allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num))
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, a, b, i)                                 &
+    !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp_2)
+    !$OMP DO COLLAPSE(2)
+    do a = 1, mo_num
+      do b = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
+          tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
+          tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
+        enddo
+        tmp_2(:,4,b,a) = 0.d0
+        do i = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+            tmp_2(ipoint,4,b,a) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                      + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                      + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+          enddo
         enddo
       enddo
     enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, -2.d0 &
+              , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2)  &
+              , tmp_1(1,1), 1                                    &
+              , 0.d0, fock_3e_uhf_mo_cs(1,1), 1)
+    deallocate(tmp_2)
 
-  ! ---
+  endif
 
-  call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, -2.d0 &
-            , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2)  &
-            , tmp_1(1,1), 1                                    &
-            , 0.d0, fock_3e_uhf_mo_cs(1,1), 1)
-
-  deallocate(tmp_1, tmp_2)
+  deallocate(tmp_1)
 
   ! ---
 
@@ -272,7 +291,7 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
   ! ---
 
   !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_mo_cs =', tf - ti
+  !print *, ' total Wall time for fock_3e_uhf_mo_cs =', (tf - ti) / 60.d0
 
 END_PROVIDER 
 
diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f
index 63a1e162..47ee5b48 100644
--- a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f
+++ b/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f
@@ -32,7 +32,7 @@ BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a, (mo_num, mo_num)]
   endif
 
   !call wall_time(tf)
-  !print *, ' Wall time for fock_3e_uhf_mo_a =', tf - ti
+  !print *, ' Wall time for fock_3e_uhf_mo_a (min) =', (tf - ti)/60.d0
 
 END_PROVIDER 
 
diff --git a/plugins/local/tc_scf/fock_tc.irp.f b/plugins/local/tc_scf/fock_tc.irp.f
index 282f9873..d3ddb8ad 100644
--- a/plugins/local/tc_scf/fock_tc.irp.f
+++ b/plugins/local/tc_scf/fock_tc.irp.f
@@ -175,7 +175,7 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num) ]
+BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)]
 
   BEGIN_DOC
   ! Total alpha TC Fock matrix : h_c + Two-e^TC terms on the MO basis
diff --git a/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
new file mode 100644
index 00000000..7ce57578
--- /dev/null
+++ b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
@@ -0,0 +1,58 @@
+! ---
+
+program write_ao_2e_tc_integ
+
+  implicit none
+
+  PROVIDE j1e_type
+  PROVIDE j2e_type
+
+  print *, ' j1e_type = ', j1e_type
+  print *, ' j2e_type = ', j2e_type
+
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
+  call write_int(6, my_n_pt_a_grid, 'angular external grid over')
+
+  if(tc_integ_type .eq. "numeric") then
+    my_extra_grid_becke  = .True.
+    PROVIDE tc_grid2_a tc_grid2_r
+    my_n_pt_r_extra_grid = tc_grid2_r
+    my_n_pt_a_extra_grid = tc_grid2_a
+    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
+    call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+    call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
+  endif
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+
+  PROVIDE io_tc_integ
+
+  print*, 'io_tc_integ = ', io_tc_integ
+
+  if(io_tc_integ .ne. "Write") then
+    print*, 'io_tc_integ != Write'
+    print*, io_tc_integ
+    stop
+  endif
+
+  PROVIDE ao_two_e_tc_tot
+
+end
+
+! ---
+
diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f
index f21e08cd..c7216a61 100644
--- a/src/mol_properties/multi_s_dipole_moment.irp.f
+++ b/src/mol_properties/multi_s_dipole_moment.irp.f
@@ -102,12 +102,28 @@ END_PROVIDER
 &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment_eigenval,           (N_states)]
 
   implicit none
+  double precision, allocatable :: eigval(:), eigvec(:,:), A(:,:)
 
   PROVIDE multi_s_x_dipole_moment multi_s_y_dipole_moment multi_s_z_dipole_moment
 
-  call lapack_diag(multi_s_x_dipole_moment_eigenval(1), multi_s_x_dipole_moment_eigenvec(1,1), multi_s_x_dipole_moment(1,1), N_states, N_states)
-  call lapack_diag(multi_s_y_dipole_moment_eigenval(1), multi_s_y_dipole_moment_eigenvec(1,1), multi_s_y_dipole_moment(1,1), N_states, N_states)
-  call lapack_diag(multi_s_z_dipole_moment_eigenval(1), multi_s_z_dipole_moment_eigenvec(1,1), multi_s_z_dipole_moment(1,1), N_states, N_states)
+  allocate(A(N_states,N_states), eigvec(N_states,N_states), eigval(N_states))
+
+  A = multi_s_x_dipole_moment
+  call lapack_diag(eigval(1), eigvec(1,1), A(1,1), N_states, N_states)
+  multi_s_x_dipole_moment_eigenval = eigval
+  multi_s_x_dipole_moment_eigenvec = eigvec
+
+  A = multi_s_y_dipole_moment
+  call lapack_diag(eigval(1), eigvec(1,1), A(1,1), N_states, N_states)
+  multi_s_y_dipole_moment_eigenval = eigval
+  multi_s_y_dipole_moment_eigenvec = eigvec
+
+  A = multi_s_z_dipole_moment
+  call lapack_diag(eigval(1), eigvec(1,1), A(1,1), N_states, N_states)
+  multi_s_z_dipole_moment_eigenval = eigval
+  multi_s_z_dipole_moment_eigenvec = eigvec
+
+  deallocate(A, eigvec, eigval)
 
 END_PROVIDER
 

From e65d7913bfdf83159ffd50eb39c76e63dea221d5 Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Sun, 7 Apr 2024 00:43:32 +0200
Subject: [PATCH 011/131] saving lcpq

---
 bin/qp_convert_output_to_ezfio | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/qp_convert_output_to_ezfio b/bin/qp_convert_output_to_ezfio
index 1b33f156..6f2d02d0 100755
--- a/bin/qp_convert_output_to_ezfio
+++ b/bin/qp_convert_output_to_ezfio
@@ -227,8 +227,8 @@ def write_ezfio(res, filename):
           shell_index += [nshell_tot] * len(b.prim)
 
     shell_num = len(ang_mom)
-    assert(shell_index[0] = 1)
-    assert(shell_index[-1] = shell_num)
+    assert(shell_index[0] == 1)
+    assert(shell_index[-1] == shell_num)
 
     # ~#~#~#~#~ #
     # W r i t e #

From 2c899e6dd71247ae26cd337ede2bb13ce9419489 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 20 Apr 2024 12:39:39 +0200
Subject: [PATCH 012/131] few modif in grids

---
 .../extra_grid_vector.irp.f                   | 23 +++++++++++++++++--
 .../grid_becke_vector.irp.f                   |  6 ++++-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/becke_numerical_grid/extra_grid_vector.irp.f b/src/becke_numerical_grid/extra_grid_vector.irp.f
index ae167282..16a52dc6 100644
--- a/src/becke_numerical_grid/extra_grid_vector.irp.f
+++ b/src/becke_numerical_grid/extra_grid_vector.irp.f
@@ -47,8 +47,12 @@ END_PROVIDER
   END_DOC
 
   implicit none
-  integer                        :: i,j,k,l,i_count
-  double precision               :: r(3)
+  integer          :: i, j, k, l, i_count
+  double precision :: r(3)
+  double precision :: wall0, wall1
+
+  call wall_time(wall0)
+  print *, ' Providing extra_final_grid_points ...'
 
   i_count = 0
   do j = 1, nucl_num
@@ -66,10 +70,25 @@ END_PROVIDER
         index_final_points_extra(2,i_count) = i
         index_final_points_extra(3,i_count) = j
         index_final_points_extra_reverse(k,i,j) = i_count
+
+        if(final_weight_at_r_vector_extra(i_count) .lt. 0.d0) then
+          print *, ' !!! WARNING !!!'
+          print *, ' negative weight !!!!'
+          print *, i_count, final_weight_at_r_vector_extra(i_count)
+          if(dabs(final_weight_at_r_vector_extra(i_count)) .lt. 1d-10) then
+            final_weight_at_r_vector_extra(i_count) = 0.d0
+          else
+            stop
+          endif
+        endif
       enddo
     enddo
   enddo
 
+  call wall_time(wall1)
+  print *, ' wall time for extra_final_grid_points,', wall1 - wall0
+  call print_memory_usage()
+
 END_PROVIDER
 
 
diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index 473096d0..c35918c3 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -72,7 +72,11 @@ END_PROVIDER
           print *, ' !!! WARNING !!!'
           print *, ' negative weight !!!!'
           print *, i_count, final_weight_at_r_vector(i_count)
-          stop
+          if(dabs(final_weight_at_r_vector(i_count)) .lt. 1d-10) then
+            final_weight_at_r_vector(i_count) = 0.d0
+          else
+            stop
+          endif
         endif 
       enddo
     enddo

From e9dccd2364f282397df9f3b5bc4e3373fe3bd7e6 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 25 Apr 2024 19:46:26 +0200
Subject: [PATCH 013/131] added spherical harmonics

---
 plugins/local/spher_harm/.gitignore           |  59 +++++
 plugins/local/spher_harm/NEED                 |   1 +
 plugins/local/spher_harm/README.rst           |   4 +
 plugins/local/spher_harm/assoc_gaus_pol.irp.f |  50 ++++
 plugins/local/spher_harm/spher_harm.irp.f     | 217 ++++++++++++++++++
 .../local/spher_harm/spher_harm_func.irp.f    | 151 ++++++++++++
 6 files changed, 482 insertions(+)
 create mode 100644 plugins/local/spher_harm/.gitignore
 create mode 100644 plugins/local/spher_harm/NEED
 create mode 100644 plugins/local/spher_harm/README.rst
 create mode 100644 plugins/local/spher_harm/assoc_gaus_pol.irp.f
 create mode 100644 plugins/local/spher_harm/spher_harm.irp.f
 create mode 100644 plugins/local/spher_harm/spher_harm_func.irp.f

diff --git a/plugins/local/spher_harm/.gitignore b/plugins/local/spher_harm/.gitignore
new file mode 100644
index 00000000..1561915b
--- /dev/null
+++ b/plugins/local/spher_harm/.gitignore
@@ -0,0 +1,59 @@
+IRPF90_temp/
+IRPF90_man/
+build.ninja
+irpf90.make
+ezfio_interface.irp.f
+irpf90_entities
+tags
+Makefile
+ao_basis
+ao_one_e_ints
+ao_two_e_erf_ints
+ao_two_e_ints
+aux_quantities
+becke_numerical_grid
+bitmask
+cis
+cisd
+cipsi
+davidson
+davidson_dressed
+davidson_undressed
+density_for_dft
+determinants
+dft_keywords
+dft_utils_in_r
+dft_utils_one_e
+dft_utils_two_body
+dressing
+dummy
+electrons
+ezfio_files
+fci
+generators_cas
+generators_full
+hartree_fock
+iterations
+kohn_sham
+kohn_sham_rs
+mo_basis
+mo_guess
+mo_one_e_ints
+mo_two_e_erf_ints
+mo_two_e_ints
+mpi
+mrpt_utils
+nuclei
+perturbation
+pseudo
+psiref_cas
+psiref_utils
+scf_utils
+selectors_cassd
+selectors_full
+selectors_utils
+single_ref_method
+slave
+tools
+utils
+zmq
diff --git a/plugins/local/spher_harm/NEED b/plugins/local/spher_harm/NEED
new file mode 100644
index 00000000..92df7f12
--- /dev/null
+++ b/plugins/local/spher_harm/NEED
@@ -0,0 +1 @@
+dft_utils_in_r
diff --git a/plugins/local/spher_harm/README.rst b/plugins/local/spher_harm/README.rst
new file mode 100644
index 00000000..bf897f73
--- /dev/null
+++ b/plugins/local/spher_harm/README.rst
@@ -0,0 +1,4 @@
+==========
+spher_harm
+==========
+
diff --git a/plugins/local/spher_harm/assoc_gaus_pol.irp.f b/plugins/local/spher_harm/assoc_gaus_pol.irp.f
new file mode 100644
index 00000000..fa790307
--- /dev/null
+++ b/plugins/local/spher_harm/assoc_gaus_pol.irp.f
@@ -0,0 +1,50 @@
+double precision function plgndr(l,m,x)
+ integer, intent(in) :: l,m
+ double precision, intent(in) :: x
+ BEGIN_DOC
+ ! associated Legenre polynom P_l,m(x). Used for the Y_lm(theta,phi)
+ ! Taken from https://iate.oac.uncor.edu/~mario/materia/nr/numrec/f6-8.pdf
+ END_DOC
+ integer :: i,ll
+ double precision :: fact,pll,pmm,pmmp1,somx2
+ if(m.lt.0.or.m.gt.l.or.dabs(x).gt.1.d0)then 
+  print*,'bad arguments in plgndr'
+  pause
+ endif
+ pmm=1.d0
+ if(m.gt.0) then
+  somx2=dsqrt((1.d0-x)*(1.d0+x))
+  fact=1.d0
+  do i=1,m
+   pmm=-pmm*fact*somx2
+   fact=fact+2.d0
+  enddo 
+ endif ! m > 0
+ if(l.eq.m) then
+  plgndr=pmm 
+ else
+  pmmp1=x*(2*m+1)*pmm ! Compute P_m+1^m
+   if(l.eq.m+1) then
+    plgndr=pmmp1
+   else ! Compute P_l^m, l> m+1
+    do ll=m+2,l
+     pll=(x*dble(2*ll-1)*pmmp1-dble(ll+m-1)*pmm)/(ll-m)
+     pmm=pmmp1
+     pmmp1=pll
+    enddo
+    plgndr=pll
+   endif ! l.eq.m+1
+ endif ! l.eq.m
+ return
+end
+
+double precision function ortho_assoc_gaus_pol(l1,m1,l2)
+ implicit none
+ integer, intent(in) :: l1,m1,l2
+ double precision :: fact
+ if(l1.ne.l2)then
+  ortho_assoc_gaus_pol= 0.d0
+ else
+  ortho_assoc_gaus_pol = 2.d0*fact(l1+m1) / (dble(2*l1+1)*fact(l1-m1))
+ endif
+end
diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f
new file mode 100644
index 00000000..40661db1
--- /dev/null
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@@ -0,0 +1,217 @@
+program spher_harm
+ implicit none
+ call test_spher_harm
+! call test_cart
+! call test_brutal_spheric
+end
+
+subroutine test_cart
+ implicit none
+ include 'constants.include.F'
+ double precision :: r(3),theta,phi,r_abs 
+ print*,''
+ r = 0.d0
+ r(1) = 1.d0
+ r(2) = 1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) =-1.d0
+ r(2) = 1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) =-1.d0
+ r(2) =-1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) = 1.d0
+ r(2) =-1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+end
+
+subroutine test_spher_harm
+ implicit none
+  include 'constants.include.F'
+ integer :: l1,m1,i,l2,m2,lmax
+ double precision :: r(3),weight,accu_re, accu_im,accu
+ double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
+ l1 = 0
+ m1 = 0
+ l2 = 0
+ m2 = 0
+ lmax = 5
+ do l1 = 0,lmax
+  do m1 =  -l1 ,l1
+   do l2 = 0,lmax
+    do m2 =  -l2 ,l2
+     accu_re = 0.d0
+     accu_im = 0.d0
+     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
+     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
+     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+     accu = 0.d0
+     do i = 1, n_points_integration_angular 
+      double precision :: theta,phi,r_abs
+      r(1:3) = angular_quadrature_points(i,1:3)
+      weight = weights_angular_points(i)
+      call cartesian_to_spherical(r,theta,phi,r_abs)
+      if(theta.gt.pi.or.theta.lt.0.d0)then
+       print*,'pb with theta',theta
+       print*,r
+      endif
+      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
+       print*,'pb with phi',phi/pi
+       print*,r
+      endif
+      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
+      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
+!      call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+!      call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+      accu += weight
+      write(33,'(10(F16.10,X))')phi/pi
+     enddo
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ double precision :: x,dx,xmax,xmin
+ integer:: nx
+ nx = 10000
+ xmin = -5.d0
+ xmax =  5.d0
+ dx = (xmax - xmin)/dble(nx)
+ x = xmin
+ do i = 1, nx
+  write(34,*)x,datan(x),dacos(x)
+  x += dx
+ enddo
+end
+
+subroutine test_brutal_spheric
+ implicit none
+  include 'constants.include.F'
+ integer :: itheta, iphi,ntheta,nphi
+ double precision :: theta_min, theta_max, dtheta,theta
+ double precision :: phi_min, phi_max, dphi,phi
+ double precision :: accu_re, accu_im,weight 
+ double precision :: re_ylm_1, im_ylm_1 ,re_ylm_2, im_ylm_2,accu
+ integer :: l1,m1,i,l2,m2,lmax
+ phi_min = 0.d0
+ phi_max = 2.D0 * pi
+ theta_min = 0.d0
+ theta_max = 1.D0 * pi
+ ntheta = 1000
+ nphi = 1000
+ dphi = (phi_max - phi_min)/dble(nphi)
+ dtheta = (theta_max - theta_min)/dble(ntheta)
+
+ lmax = 3
+ do l1 = 0,lmax
+  do m1 =  0 ,l1
+   do l2 = 0,lmax
+    do m2 =  0 ,l2
+     accu_re = 0.d0
+     accu_im = 0.d0
+     accu  = 0.d0
+     theta = theta_min
+     do itheta = 1, ntheta
+      phi = phi_min
+      do iphi = 1, nphi
+!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+       call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+       call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+       weight = dtheta * dphi * dsin(theta) 
+       accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+       accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+       accu += weight
+       phi += dphi
+      enddo
+      theta += dtheta
+     enddo
+     print*,'l1,m1,l2,m2',l1,m1,l2,m2
+     print*,'accu_re = ',accu_re
+     print*,'accu_im = ',accu_im
+     print*,'accu    = ',accu
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+      endif
+     endif
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ 
+
+end
+
+subroutine test_assoc_leg_pol
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+ integer :: l1,m1,ngrid,i,l2,m2
+ l1 = 0
+ m1 = 0
+ l2 = 2
+ m2 = 0
+ double precision :: x, dx,xmax,accu,xmin
+ double precision :: plgndr,func_1,func_2,ortho_assoc_gaus_pol
+ ngrid = 100000
+ xmax = 1.d0
+ xmin = -1.d0
+ dx = (xmax-xmin)/dble(ngrid)
+ do l2 = 0,10
+  x = xmin
+  accu = 0.d0
+  do i = 1, ngrid
+   func_1 = plgndr(l1,m1,x)
+   func_2 = plgndr(l2,m2,x)
+   write(33,*)x, func_1,func_2
+   accu += func_1 * func_2 * dx
+   x += dx
+  enddo
+  print*,'l2 = ',l2
+  print*,'accu = ',accu
+  print*,ortho_assoc_gaus_pol(l1,m1,l2)
+ enddo
+end
diff --git a/plugins/local/spher_harm/spher_harm_func.irp.f b/plugins/local/spher_harm/spher_harm_func.irp.f
new file mode 100644
index 00000000..825bd8ac
--- /dev/null
+++ b/plugins/local/spher_harm/spher_harm_func.irp.f
@@ -0,0 +1,151 @@
+subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm)
+ implicit none
+ integer, intent(in) :: l,m
+ double precision, intent(in)  :: r(3)
+ double precision, intent(out) :: re_ylm, im_ylm
+
+ double precision :: theta, phi,r_abs
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm)
+end
+
+
+subroutine spher_harm_func_m_pos(l,m,theta,phi,re_ylm, im_ylm)
+  include 'constants.include.F'
+ implicit none
+ BEGIN_DOC
+! Y_lm(theta,phi) with m >0
+!
+ END_DOC
+ double precision, intent(in) :: theta, phi
+ integer, intent(in)          :: l,m
+ double precision, intent(out):: re_ylm,im_ylm
+ double precision :: prefact,fact,cos_theta,plgndr,p_lm
+ double precision :: tmp
+ prefact = dble(2*l+1)*fact(l-m)/(dfour_pi * fact(l+m))
+ prefact = dsqrt(prefact)
+ cos_theta = dcos(theta)
+ p_lm = plgndr(l,m,cos_theta)
+ tmp  = prefact * p_lm
+ re_ylm = dcos(dble(m)*phi) * tmp
+ im_ylm = dsin(dble(m)*phi) * tmp
+end
+
+subroutine spher_harm_func(l,m,theta,phi,re_ylm, im_ylm)
+ implicit none 
+ BEGIN_DOC
+ ! Y_lm(theta,phi) with -l<m<+l
+ !
+ END_DOC
+ double precision, intent(in) :: theta, phi
+ integer, intent(in)          :: l,m
+ double precision, intent(out):: re_ylm,im_ylm
+ double precision :: re_ylm_pos,im_ylm_pos,tmp
+ integer :: minus_m
+ if(abs(m).gt.l)then
+  print*,'|m| > l in spher_harm_func !! stopping ...'
+  stop
+ endif
+ if(m.ge.0)then
+  call spher_harm_func_m_pos(l,m,theta,phi,re_ylm_pos, im_ylm_pos)
+  re_ylm = re_ylm_pos
+  im_ylm = im_ylm_pos
+ else
+  minus_m = -m !> 0
+  call spher_harm_func_m_pos(l,minus_m,theta,phi,re_ylm_pos, im_ylm_pos)
+  tmp = (-1)**minus_m
+  re_ylm =   tmp  * re_ylm_pos
+  im_ylm =  -tmp  * im_ylm_pos ! complex conjugate 
+ endif
+end
+
+subroutine cartesian_to_spherical(r,theta,phi,r_abs)
+ implicit none 
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: theta, phi,r_abs
+ double precision :: r_2,x_2_y_2,tmp
+ include 'constants.include.F'
+ x_2_y_2 = r(1)*r(1) + r(2)*r(2)
+ r_2 = x_2_y_2 + r(3)*r(3)
+ r_abs = dsqrt(r_2)
+
+ if(r_abs.gt.1.d-20)then
+  theta = dacos(r(3)/r_abs)
+ else
+  theta = 0.d0
+ endif
+
+ if(.true.)then
+  if(dabs(r(1)).gt.0.d0)then
+   tmp = datan(r(2)/r(1))
+!   phi = datan2(r(2),r(1))
+  endif
+  ! From Wikipedia on Spherical Harmonics
+  if(r(1).gt.0.d0)then
+   phi = tmp
+  else if(r(1).lt.0.d0.and.r(2).ge.0.d0)then
+   phi = tmp + pi
+  else if(r(1).lt.0.d0.and.r(2).lt.0.d0)then
+   phi = tmp - pi
+  else if(r(1)==0.d0.and.r(2).gt.0.d0)then
+   phi = 0.5d0*pi
+  else if(r(1)==0.d0.and.r(2).lt.0.d0)then
+   phi =-0.5d0*pi
+  else if(r(1)==0.d0.and.r(2)==0.d0)then
+   phi = 0.d0
+  endif
+  if(r(2).lt.0.d0.and.r(1).le.0.d0)then
+   tmp = pi - dabs(phi)
+   phi = pi + tmp
+  else if(r(2).lt.0.d0.and.r(1).gt.0.d0)then
+   phi = dtwo_pi + phi
+  endif
+ endif
+
+ if(.false.)then
+  x_2_y_2 = dsqrt(x_2_y_2)  
+  if(dabs(x_2_y_2).gt.1.d-20.and.dabs(r(2)).gt.1.d-20)then
+   phi = dabs(r(2))/r(2) * dacos(r(1)/x_2_y_2) 
+  else 
+   phi = 0.d0
+  endif
+ endif
+end
+
+
+subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
+ implicit none 
+ BEGIN_DOC
+ ! Y_lm(theta,phi) with -l<m<+l and 0<= l <=2
+ !
+ END_DOC
+ double precision, intent(in) :: theta, phi
+ integer, intent(in)          :: l,m
+ double precision, intent(out):: re_ylm,im_ylm
+ double precision :: tmp
+ include 'constants.include.F'
+ if(l==0.and.m==0)then
+  re_ylm = 0.5d0 * inv_sq_pi 
+  im_ylm = 0.d0
+ else if(l==1.and.m==1)then
+  tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
+  re_ylm = tmp * dcos(phi)
+  im_ylm = tmp * dsin(phi)
+ else if(l==1.and.m==0)then
+  tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) 
+  re_ylm = tmp 
+  im_ylm = 0.d0
+ else if(l==2.and.m==2)then
+  tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
+  re_ylm = tmp * dcos(2.d0*phi)
+  im_ylm = tmp * dsin(2.d0*phi)
+ else if(l==2.and.m==1)then
+  tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
+  re_ylm = tmp * dcos(phi)
+  im_ylm = tmp * dsin(phi)
+ else if(l==2.and.m==0)then
+  tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0)
+  re_ylm = tmp
+  im_ylm = 0.d0
+ endif
+end

From f35bc230368a954e351c5e64dd3e9e19d5978023 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 24 Apr 2024 14:48:23 +0200
Subject: [PATCH 014/131] Begining to make some cleaning in TC

---
 .../local/bi_ort_ints/total_twoe_pot.irp.f    | 22 +++++++++++++++++++
 plugins/local/mo_localization/README.md       |  2 +-
 .../normal_ordered.irp.f                      |  0
 .../normal_ordered_contractions.irp.f         |  0
 .../normal_ordered_old.irp.f                  |  0
 .../normal_ordered_v0.irp.f                   |  0
 .../h_biortho.irp.f                           |  0
 .../h_mat_triple.irp.f                        |  0
 .../h_tc_bi_ortho_psi.irp.f                   |  0
 .../h_tc_s2_u0.irp.f                          |  0
 .../slater_tc_3e_slow.irp.f                   |  0
 .../slater_tc_opt.irp.f                       |  0
 .../slater_tc_opt_diag.irp.f                  |  0
 .../slater_tc_opt_double.irp.f                |  0
 .../slater_tc_opt_single.irp.f                |  0
 .../slater_tc_slow.irp.f                      |  0
 .../{tc_bi_ortho => slater_tc}/tc_hmat.irp.f  |  0
 plugins/local/tc_bi_ortho/NEED                |  6 +----
 18 files changed, 24 insertions(+), 6 deletions(-)
 rename plugins/local/{tc_bi_ortho => normal_order_old}/normal_ordered.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => normal_order_old}/normal_ordered_contractions.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => normal_order_old}/normal_ordered_old.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => normal_order_old}/normal_ordered_v0.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/h_biortho.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/h_mat_triple.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/h_tc_bi_ortho_psi.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/h_tc_s2_u0.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/slater_tc_3e_slow.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/slater_tc_opt.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/slater_tc_opt_diag.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/slater_tc_opt_double.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/slater_tc_opt_single.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/slater_tc_slow.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => slater_tc}/tc_hmat.irp.f (100%)

diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index 5e6a24e9..42a7ba62 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -176,6 +176,28 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e, (mo_num, mo_num, mo_num,
 
 END_PROVIDER 
 
+BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_transp, (mo_num, mo_num, mo_num, mo_num)]
+ implicit none
+ BEGIN_DOC
+  !
+  ! mo_bi_ortho_tc_two_e_transp(i,j,k,l) = <k l| V(r_12) |i j> = transpose of mo_bi_ortho_tc_two_e
+  !
+  ! the potential V(r_12) contains ALL TWO-E CONTRIBUTION OF THE TC-HAMILTONIAN
+  !
+ END_DOC
+
+ integer :: i,j,k,l
+ do i = 1, mo_num
+  do j = 1, mo_num
+   do k = 1, mo_num
+    do l = 1, mo_num
+     mo_bi_ortho_tc_two_e_transp(i,j,k,l) = mo_bi_ortho_tc_two_e_transp(k,l,i,j)
+    enddo
+   enddo
+  enddo
+ enddo
+
+END_PROVIDER 
 ! ---
 
  BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj,          (mo_num,mo_num)]
diff --git a/plugins/local/mo_localization/README.md b/plugins/local/mo_localization/README.md
index c28a5ee1..512e36af 100644
--- a/plugins/local/mo_localization/README.md
+++ b/plugins/local/mo_localization/README.md
@@ -3,7 +3,7 @@ To localize the MOs:
 ```
 qp run localization  
 ```
-By default, the different otbital classes are automatically set by splitting  
+By default, the different orbital classes are automatically set by splitting  
 the orbitales in the following classes:  
 - Core -> Core  
 - Active, doubly occupied -> Inactive  
diff --git a/plugins/local/tc_bi_ortho/normal_ordered.irp.f b/plugins/local/normal_order_old/normal_ordered.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/normal_ordered.irp.f
rename to plugins/local/normal_order_old/normal_ordered.irp.f
diff --git a/plugins/local/tc_bi_ortho/normal_ordered_contractions.irp.f b/plugins/local/normal_order_old/normal_ordered_contractions.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/normal_ordered_contractions.irp.f
rename to plugins/local/normal_order_old/normal_ordered_contractions.irp.f
diff --git a/plugins/local/tc_bi_ortho/normal_ordered_old.irp.f b/plugins/local/normal_order_old/normal_ordered_old.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/normal_ordered_old.irp.f
rename to plugins/local/normal_order_old/normal_ordered_old.irp.f
diff --git a/plugins/local/tc_bi_ortho/normal_ordered_v0.irp.f b/plugins/local/normal_order_old/normal_ordered_v0.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/normal_ordered_v0.irp.f
rename to plugins/local/normal_order_old/normal_ordered_v0.irp.f
diff --git a/plugins/local/tc_bi_ortho/h_biortho.irp.f b/plugins/local/slater_tc/h_biortho.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/h_biortho.irp.f
rename to plugins/local/slater_tc/h_biortho.irp.f
diff --git a/plugins/local/tc_bi_ortho/h_mat_triple.irp.f b/plugins/local/slater_tc/h_mat_triple.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/h_mat_triple.irp.f
rename to plugins/local/slater_tc/h_mat_triple.irp.f
diff --git a/plugins/local/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f b/plugins/local/slater_tc/h_tc_bi_ortho_psi.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f
rename to plugins/local/slater_tc/h_tc_bi_ortho_psi.irp.f
diff --git a/plugins/local/tc_bi_ortho/h_tc_s2_u0.irp.f b/plugins/local/slater_tc/h_tc_s2_u0.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/h_tc_s2_u0.irp.f
rename to plugins/local/slater_tc/h_tc_s2_u0.irp.f
diff --git a/plugins/local/tc_bi_ortho/slater_tc_3e_slow.irp.f b/plugins/local/slater_tc/slater_tc_3e_slow.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/slater_tc_3e_slow.irp.f
rename to plugins/local/slater_tc/slater_tc_3e_slow.irp.f
diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/slater_tc_opt.irp.f
rename to plugins/local/slater_tc/slater_tc_opt.irp.f
diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f b/plugins/local/slater_tc/slater_tc_opt_diag.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f
rename to plugins/local/slater_tc/slater_tc_opt_diag.irp.f
diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f b/plugins/local/slater_tc/slater_tc_opt_double.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f
rename to plugins/local/slater_tc/slater_tc_opt_double.irp.f
diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f b/plugins/local/slater_tc/slater_tc_opt_single.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f
rename to plugins/local/slater_tc/slater_tc_opt_single.irp.f
diff --git a/plugins/local/tc_bi_ortho/slater_tc_slow.irp.f b/plugins/local/slater_tc/slater_tc_slow.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/slater_tc_slow.irp.f
rename to plugins/local/slater_tc/slater_tc_slow.irp.f
diff --git a/plugins/local/tc_bi_ortho/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/tc_hmat.irp.f
rename to plugins/local/slater_tc/tc_hmat.irp.f
diff --git a/plugins/local/tc_bi_ortho/NEED b/plugins/local/tc_bi_ortho/NEED
index 9a0c20ef..01841e02 100644
--- a/plugins/local/tc_bi_ortho/NEED
+++ b/plugins/local/tc_bi_ortho/NEED
@@ -1,6 +1,2 @@
-bi_ort_ints
-bi_ortho_mos
-tc_keywords
-non_hermit_dav
-dav_general_mat
 tc_scf
+slater_tc

From 05f35ab601a1f8ee17a3b55136bec92aefc96176 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 25 Apr 2024 20:00:42 +0200
Subject: [PATCH 015/131] Added properly the routines for the test of the
 Spherical Harmonics

---
 plugins/local/spher_harm/routines_test.irp.f | 227 +++++++++++++++++++
 plugins/local/spher_harm/spher_harm.irp.f    | 210 -----------------
 2 files changed, 227 insertions(+), 210 deletions(-)
 create mode 100644 plugins/local/spher_harm/routines_test.irp.f

diff --git a/plugins/local/spher_harm/routines_test.irp.f b/plugins/local/spher_harm/routines_test.irp.f
new file mode 100644
index 00000000..6f7cbc1c
--- /dev/null
+++ b/plugins/local/spher_harm/routines_test.irp.f
@@ -0,0 +1,227 @@
+
+subroutine test_cart
+ implicit none
+ BEGIN_DOC
+ ! test for the cartesian --> spherical change of coordinates 
+ !
+ ! simple test such that the polar angle theta ranges in [0,pi]
+ !
+ ! and the asymuthal angle phi ranges in [0,2pi]
+ END_DOC
+ include 'constants.include.F'
+ double precision :: r(3),theta,phi,r_abs 
+ print*,''
+ r = 0.d0
+ r(1) = 1.d0
+ r(2) = 1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) =-1.d0
+ r(2) = 1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) =-1.d0
+ r(2) =-1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) = 1.d0
+ r(2) =-1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+end
+
+subroutine test_spher_harm
+ implicit none
+ BEGIN_DOC
+ ! routine to test the spherical harmonics integration on a sphere with the grid. 
+ ! 
+ ! We test <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2
+ END_DOC
+  include 'constants.include.F'
+ integer :: l1,m1,i,l2,m2,lmax
+ double precision :: r(3),weight,accu_re, accu_im,accu
+ double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
+ l1 = 0
+ m1 = 0
+ l2 = 0
+ m2 = 0
+ lmax = 5
+ do l1 = 0,lmax
+  do m1 =  -l1 ,l1
+   do l2 = 0,lmax
+    do m2 =  -l2 ,l2
+     accu_re = 0.d0
+     accu_im = 0.d0
+     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
+     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
+     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+     accu = 0.d0
+     do i = 1, n_points_integration_angular 
+      double precision :: theta,phi,r_abs
+      r(1:3) = angular_quadrature_points(i,1:3)
+      weight = weights_angular_points(i)
+      call cartesian_to_spherical(r,theta,phi,r_abs)
+      if(theta.gt.pi.or.theta.lt.0.d0)then
+       print*,'pb with theta',theta
+       print*,r
+      endif
+      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
+       print*,'pb with phi',phi/pi
+       print*,r
+      endif
+      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
+      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
+      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+      accu += weight
+      write(33,'(10(F16.10,X))')phi/pi
+     enddo
+     ! Test for the delta l1,l2 and delta m1,m2
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ double precision :: x,dx,xmax,xmin
+ integer:: nx
+ nx = 10000
+ xmin = -5.d0
+ xmax =  5.d0
+ dx = (xmax - xmin)/dble(nx)
+ x = xmin
+ do i = 1, nx
+  write(34,*)x,datan(x),dacos(x)
+  x += dx
+ enddo
+end
+
+subroutine test_brutal_spheric
+ implicit none
+  include 'constants.include.F'
+ BEGIN_DOC
+ ! test for the <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2 using a two dimentional integration 
+ ! 
+ !   \int_0^2pi d Phi \int_-1^+1 d(cos(Theta)) Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi)
+ !
+ !=  \int_0^2pi d Phi \int_0^pi dTheta sin(Theta)  Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi) 
+ !
+ ! Allows to test for the general functions spher_harm_func_m_pos with spher_harm_func_expl
+ END_DOC
+ integer :: itheta, iphi,ntheta,nphi
+ double precision :: theta_min, theta_max, dtheta,theta
+ double precision :: phi_min, phi_max, dphi,phi
+ double precision :: accu_re, accu_im,weight 
+ double precision :: re_ylm_1, im_ylm_1 ,re_ylm_2, im_ylm_2,accu
+ integer :: l1,m1,i,l2,m2,lmax
+ phi_min = 0.d0
+ phi_max = 2.D0 * pi
+ theta_min = 0.d0
+ theta_max = 1.D0 * pi
+ ntheta = 1000
+ nphi = 1000
+ dphi = (phi_max - phi_min)/dble(nphi)
+ dtheta = (theta_max - theta_min)/dble(ntheta)
+
+ lmax = 3
+ do l1 = 0,lmax
+  do m1 =  0 ,l1
+   do l2 = 0,lmax
+    do m2 =  0 ,l2
+     accu_re = 0.d0
+     accu_im = 0.d0
+     accu  = 0.d0
+     theta = theta_min
+     do itheta = 1, ntheta
+      phi = phi_min
+      do iphi = 1, nphi
+!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+       call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+       call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+       weight = dtheta * dphi * dsin(theta) 
+       accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+       accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+       accu += weight
+       phi += dphi
+      enddo
+      theta += dtheta
+     enddo
+     print*,'l1,m1,l2,m2',l1,m1,l2,m2
+     print*,'accu_re = ',accu_re
+     print*,'accu_im = ',accu_im
+     print*,'accu    = ',accu
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+      endif
+     endif
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ 
+
+end
+
+subroutine test_assoc_leg_pol
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+ integer :: l1,m1,ngrid,i,l2,m2
+ l1 = 0
+ m1 = 0
+ l2 = 2
+ m2 = 0
+ double precision :: x, dx,xmax,accu,xmin
+ double precision :: plgndr,func_1,func_2,ortho_assoc_gaus_pol
+ ngrid = 100000
+ xmax = 1.d0
+ xmin = -1.d0
+ dx = (xmax-xmin)/dble(ngrid)
+ do l2 = 0,10
+  x = xmin
+  accu = 0.d0
+  do i = 1, ngrid
+   func_1 = plgndr(l1,m1,x)
+   func_2 = plgndr(l2,m2,x)
+   write(33,*)x, func_1,func_2
+   accu += func_1 * func_2 * dx
+   x += dx
+  enddo
+  print*,'l2 = ',l2
+  print*,'accu = ',accu
+  print*,ortho_assoc_gaus_pol(l1,m1,l2)
+ enddo
+end
diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f
index 40661db1..e8deafb9 100644
--- a/plugins/local/spher_harm/spher_harm.irp.f
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@@ -5,213 +5,3 @@ program spher_harm
 ! call test_brutal_spheric
 end
 
-subroutine test_cart
- implicit none
- include 'constants.include.F'
- double precision :: r(3),theta,phi,r_abs 
- print*,''
- r = 0.d0
- r(1) = 1.d0
- r(2) = 1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
- print*,''
- r = 0.d0
- r(1) =-1.d0
- r(2) = 1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
- print*,''
- r = 0.d0
- r(1) =-1.d0
- r(2) =-1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
- print*,''
- r = 0.d0
- r(1) = 1.d0
- r(2) =-1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
-end
-
-subroutine test_spher_harm
- implicit none
-  include 'constants.include.F'
- integer :: l1,m1,i,l2,m2,lmax
- double precision :: r(3),weight,accu_re, accu_im,accu
- double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
- l1 = 0
- m1 = 0
- l2 = 0
- m2 = 0
- lmax = 5
- do l1 = 0,lmax
-  do m1 =  -l1 ,l1
-   do l2 = 0,lmax
-    do m2 =  -l2 ,l2
-     accu_re = 0.d0
-     accu_im = 0.d0
-     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
-     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
-     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-     accu = 0.d0
-     do i = 1, n_points_integration_angular 
-      double precision :: theta,phi,r_abs
-      r(1:3) = angular_quadrature_points(i,1:3)
-      weight = weights_angular_points(i)
-      call cartesian_to_spherical(r,theta,phi,r_abs)
-      if(theta.gt.pi.or.theta.lt.0.d0)then
-       print*,'pb with theta',theta
-       print*,r
-      endif
-      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
-       print*,'pb with phi',phi/pi
-       print*,r
-      endif
-      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
-      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
-!      call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
-      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-      accu += weight
-      write(33,'(10(F16.10,X))')phi/pi
-     enddo
-     if(l1.ne.l2.or.m1.ne.m2)then
-      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb OFF DIAG !!!!! '
-       print*,'l1,m1,l2,m2',l1,m1,l2,m2
-       print*,'accu_re = ',accu_re
-       print*,'accu_im = ',accu_im
-      endif
-     endif
-     if(l1==l2.and.m1==m2)then
-      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb DIAG !!!!! '
-       print*,'l1,m1,l2,m2',l1,m1,l2,m2
-       print*,'accu_re = ',accu_re
-       print*,'accu_im = ',accu_im
-      endif
-     endif
-    enddo
-   enddo
-  enddo
- enddo
- double precision :: x,dx,xmax,xmin
- integer:: nx
- nx = 10000
- xmin = -5.d0
- xmax =  5.d0
- dx = (xmax - xmin)/dble(nx)
- x = xmin
- do i = 1, nx
-  write(34,*)x,datan(x),dacos(x)
-  x += dx
- enddo
-end
-
-subroutine test_brutal_spheric
- implicit none
-  include 'constants.include.F'
- integer :: itheta, iphi,ntheta,nphi
- double precision :: theta_min, theta_max, dtheta,theta
- double precision :: phi_min, phi_max, dphi,phi
- double precision :: accu_re, accu_im,weight 
- double precision :: re_ylm_1, im_ylm_1 ,re_ylm_2, im_ylm_2,accu
- integer :: l1,m1,i,l2,m2,lmax
- phi_min = 0.d0
- phi_max = 2.D0 * pi
- theta_min = 0.d0
- theta_max = 1.D0 * pi
- ntheta = 1000
- nphi = 1000
- dphi = (phi_max - phi_min)/dble(nphi)
- dtheta = (theta_max - theta_min)/dble(ntheta)
-
- lmax = 3
- do l1 = 0,lmax
-  do m1 =  0 ,l1
-   do l2 = 0,lmax
-    do m2 =  0 ,l2
-     accu_re = 0.d0
-     accu_im = 0.d0
-     accu  = 0.d0
-     theta = theta_min
-     do itheta = 1, ntheta
-      phi = phi_min
-      do iphi = 1, nphi
-!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-       call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-       call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-       weight = dtheta * dphi * dsin(theta) 
-       accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
-       accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-       accu += weight
-       phi += dphi
-      enddo
-      theta += dtheta
-     enddo
-     print*,'l1,m1,l2,m2',l1,m1,l2,m2
-     print*,'accu_re = ',accu_re
-     print*,'accu_im = ',accu_im
-     print*,'accu    = ',accu
-     if(l1.ne.l2.or.m1.ne.m2)then
-      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb OFF DIAG !!!!! '
-      endif
-     endif
-     if(l1==l2.and.m1==m2)then
-      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb DIAG !!!!! '
-      endif
-     endif
-    enddo
-   enddo
-  enddo
- enddo
- 
-
-end
-
-subroutine test_assoc_leg_pol
-  implicit none
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-  print *, 'Hello world'
- integer :: l1,m1,ngrid,i,l2,m2
- l1 = 0
- m1 = 0
- l2 = 2
- m2 = 0
- double precision :: x, dx,xmax,accu,xmin
- double precision :: plgndr,func_1,func_2,ortho_assoc_gaus_pol
- ngrid = 100000
- xmax = 1.d0
- xmin = -1.d0
- dx = (xmax-xmin)/dble(ngrid)
- do l2 = 0,10
-  x = xmin
-  accu = 0.d0
-  do i = 1, ngrid
-   func_1 = plgndr(l1,m1,x)
-   func_2 = plgndr(l2,m2,x)
-   write(33,*)x, func_1,func_2
-   accu += func_1 * func_2 * dx
-   x += dx
-  enddo
-  print*,'l2 = ',l2
-  print*,'accu = ',accu
-  print*,ortho_assoc_gaus_pol(l1,m1,l2)
- enddo
-end

From c3483df9a16003065a41bfa92d37274a3eb466ee Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 25 Apr 2024 20:00:42 +0200
Subject: [PATCH 016/131] Added properly the routines for the test of the
 Spherical Harmonics

---
 plugins/local/spher_harm/routines_test.irp.f | 227 +++++++++++++++++++
 plugins/local/spher_harm/spher_harm.irp.f    | 210 -----------------
 2 files changed, 227 insertions(+), 210 deletions(-)
 create mode 100644 plugins/local/spher_harm/routines_test.irp.f

diff --git a/plugins/local/spher_harm/routines_test.irp.f b/plugins/local/spher_harm/routines_test.irp.f
new file mode 100644
index 00000000..6f7cbc1c
--- /dev/null
+++ b/plugins/local/spher_harm/routines_test.irp.f
@@ -0,0 +1,227 @@
+
+subroutine test_cart
+ implicit none
+ BEGIN_DOC
+ ! test for the cartesian --> spherical change of coordinates 
+ !
+ ! simple test such that the polar angle theta ranges in [0,pi]
+ !
+ ! and the asymuthal angle phi ranges in [0,2pi]
+ END_DOC
+ include 'constants.include.F'
+ double precision :: r(3),theta,phi,r_abs 
+ print*,''
+ r = 0.d0
+ r(1) = 1.d0
+ r(2) = 1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) =-1.d0
+ r(2) = 1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) =-1.d0
+ r(2) =-1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+ print*,''
+ r = 0.d0
+ r(1) = 1.d0
+ r(2) =-1.d0
+ call cartesian_to_spherical(r,theta,phi,r_abs)
+ print*,r
+ print*,phi/pi
+end
+
+subroutine test_spher_harm
+ implicit none
+ BEGIN_DOC
+ ! routine to test the spherical harmonics integration on a sphere with the grid. 
+ ! 
+ ! We test <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2
+ END_DOC
+  include 'constants.include.F'
+ integer :: l1,m1,i,l2,m2,lmax
+ double precision :: r(3),weight,accu_re, accu_im,accu
+ double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
+ l1 = 0
+ m1 = 0
+ l2 = 0
+ m2 = 0
+ lmax = 5
+ do l1 = 0,lmax
+  do m1 =  -l1 ,l1
+   do l2 = 0,lmax
+    do m2 =  -l2 ,l2
+     accu_re = 0.d0
+     accu_im = 0.d0
+     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
+     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
+     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+     accu = 0.d0
+     do i = 1, n_points_integration_angular 
+      double precision :: theta,phi,r_abs
+      r(1:3) = angular_quadrature_points(i,1:3)
+      weight = weights_angular_points(i)
+      call cartesian_to_spherical(r,theta,phi,r_abs)
+      if(theta.gt.pi.or.theta.lt.0.d0)then
+       print*,'pb with theta',theta
+       print*,r
+      endif
+      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
+       print*,'pb with phi',phi/pi
+       print*,r
+      endif
+      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
+      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
+      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+      accu += weight
+      write(33,'(10(F16.10,X))')phi/pi
+     enddo
+     ! Test for the delta l1,l2 and delta m1,m2
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ double precision :: x,dx,xmax,xmin
+ integer:: nx
+ nx = 10000
+ xmin = -5.d0
+ xmax =  5.d0
+ dx = (xmax - xmin)/dble(nx)
+ x = xmin
+ do i = 1, nx
+  write(34,*)x,datan(x),dacos(x)
+  x += dx
+ enddo
+end
+
+subroutine test_brutal_spheric
+ implicit none
+  include 'constants.include.F'
+ BEGIN_DOC
+ ! test for the <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2 using a two dimentional integration 
+ ! 
+ !   \int_0^2pi d Phi \int_-1^+1 d(cos(Theta)) Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi)
+ !
+ !=  \int_0^2pi d Phi \int_0^pi dTheta sin(Theta)  Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi) 
+ !
+ ! Allows to test for the general functions spher_harm_func_m_pos with spher_harm_func_expl
+ END_DOC
+ integer :: itheta, iphi,ntheta,nphi
+ double precision :: theta_min, theta_max, dtheta,theta
+ double precision :: phi_min, phi_max, dphi,phi
+ double precision :: accu_re, accu_im,weight 
+ double precision :: re_ylm_1, im_ylm_1 ,re_ylm_2, im_ylm_2,accu
+ integer :: l1,m1,i,l2,m2,lmax
+ phi_min = 0.d0
+ phi_max = 2.D0 * pi
+ theta_min = 0.d0
+ theta_max = 1.D0 * pi
+ ntheta = 1000
+ nphi = 1000
+ dphi = (phi_max - phi_min)/dble(nphi)
+ dtheta = (theta_max - theta_min)/dble(ntheta)
+
+ lmax = 3
+ do l1 = 0,lmax
+  do m1 =  0 ,l1
+   do l2 = 0,lmax
+    do m2 =  0 ,l2
+     accu_re = 0.d0
+     accu_im = 0.d0
+     accu  = 0.d0
+     theta = theta_min
+     do itheta = 1, ntheta
+      phi = phi_min
+      do iphi = 1, nphi
+!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+       call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
+       call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
+       weight = dtheta * dphi * dsin(theta) 
+       accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+       accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+       accu += weight
+       phi += dphi
+      enddo
+      theta += dtheta
+     enddo
+     print*,'l1,m1,l2,m2',l1,m1,l2,m2
+     print*,'accu_re = ',accu_re
+     print*,'accu_im = ',accu_im
+     print*,'accu    = ',accu
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+      endif
+     endif
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ 
+
+end
+
+subroutine test_assoc_leg_pol
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+ integer :: l1,m1,ngrid,i,l2,m2
+ l1 = 0
+ m1 = 0
+ l2 = 2
+ m2 = 0
+ double precision :: x, dx,xmax,accu,xmin
+ double precision :: plgndr,func_1,func_2,ortho_assoc_gaus_pol
+ ngrid = 100000
+ xmax = 1.d0
+ xmin = -1.d0
+ dx = (xmax-xmin)/dble(ngrid)
+ do l2 = 0,10
+  x = xmin
+  accu = 0.d0
+  do i = 1, ngrid
+   func_1 = plgndr(l1,m1,x)
+   func_2 = plgndr(l2,m2,x)
+   write(33,*)x, func_1,func_2
+   accu += func_1 * func_2 * dx
+   x += dx
+  enddo
+  print*,'l2 = ',l2
+  print*,'accu = ',accu
+  print*,ortho_assoc_gaus_pol(l1,m1,l2)
+ enddo
+end
diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f
index 40661db1..e8deafb9 100644
--- a/plugins/local/spher_harm/spher_harm.irp.f
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@@ -5,213 +5,3 @@ program spher_harm
 ! call test_brutal_spheric
 end
 
-subroutine test_cart
- implicit none
- include 'constants.include.F'
- double precision :: r(3),theta,phi,r_abs 
- print*,''
- r = 0.d0
- r(1) = 1.d0
- r(2) = 1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
- print*,''
- r = 0.d0
- r(1) =-1.d0
- r(2) = 1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
- print*,''
- r = 0.d0
- r(1) =-1.d0
- r(2) =-1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
- print*,''
- r = 0.d0
- r(1) = 1.d0
- r(2) =-1.d0
- call cartesian_to_spherical(r,theta,phi,r_abs)
- print*,r
- print*,phi/pi
-end
-
-subroutine test_spher_harm
- implicit none
-  include 'constants.include.F'
- integer :: l1,m1,i,l2,m2,lmax
- double precision :: r(3),weight,accu_re, accu_im,accu
- double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
- l1 = 0
- m1 = 0
- l2 = 0
- m2 = 0
- lmax = 5
- do l1 = 0,lmax
-  do m1 =  -l1 ,l1
-   do l2 = 0,lmax
-    do m2 =  -l2 ,l2
-     accu_re = 0.d0
-     accu_im = 0.d0
-     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
-     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
-     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-     accu = 0.d0
-     do i = 1, n_points_integration_angular 
-      double precision :: theta,phi,r_abs
-      r(1:3) = angular_quadrature_points(i,1:3)
-      weight = weights_angular_points(i)
-      call cartesian_to_spherical(r,theta,phi,r_abs)
-      if(theta.gt.pi.or.theta.lt.0.d0)then
-       print*,'pb with theta',theta
-       print*,r
-      endif
-      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
-       print*,'pb with phi',phi/pi
-       print*,r
-      endif
-      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
-      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
-!      call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
-      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-      accu += weight
-      write(33,'(10(F16.10,X))')phi/pi
-     enddo
-     if(l1.ne.l2.or.m1.ne.m2)then
-      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb OFF DIAG !!!!! '
-       print*,'l1,m1,l2,m2',l1,m1,l2,m2
-       print*,'accu_re = ',accu_re
-       print*,'accu_im = ',accu_im
-      endif
-     endif
-     if(l1==l2.and.m1==m2)then
-      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb DIAG !!!!! '
-       print*,'l1,m1,l2,m2',l1,m1,l2,m2
-       print*,'accu_re = ',accu_re
-       print*,'accu_im = ',accu_im
-      endif
-     endif
-    enddo
-   enddo
-  enddo
- enddo
- double precision :: x,dx,xmax,xmin
- integer:: nx
- nx = 10000
- xmin = -5.d0
- xmax =  5.d0
- dx = (xmax - xmin)/dble(nx)
- x = xmin
- do i = 1, nx
-  write(34,*)x,datan(x),dacos(x)
-  x += dx
- enddo
-end
-
-subroutine test_brutal_spheric
- implicit none
-  include 'constants.include.F'
- integer :: itheta, iphi,ntheta,nphi
- double precision :: theta_min, theta_max, dtheta,theta
- double precision :: phi_min, phi_max, dphi,phi
- double precision :: accu_re, accu_im,weight 
- double precision :: re_ylm_1, im_ylm_1 ,re_ylm_2, im_ylm_2,accu
- integer :: l1,m1,i,l2,m2,lmax
- phi_min = 0.d0
- phi_max = 2.D0 * pi
- theta_min = 0.d0
- theta_max = 1.D0 * pi
- ntheta = 1000
- nphi = 1000
- dphi = (phi_max - phi_min)/dble(nphi)
- dtheta = (theta_max - theta_min)/dble(ntheta)
-
- lmax = 3
- do l1 = 0,lmax
-  do m1 =  0 ,l1
-   do l2 = 0,lmax
-    do m2 =  0 ,l2
-     accu_re = 0.d0
-     accu_im = 0.d0
-     accu  = 0.d0
-     theta = theta_min
-     do itheta = 1, ntheta
-      phi = phi_min
-      do iphi = 1, nphi
-!      call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-!      call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-       call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1)
-       call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2)
-       weight = dtheta * dphi * dsin(theta) 
-       accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
-       accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-       accu += weight
-       phi += dphi
-      enddo
-      theta += dtheta
-     enddo
-     print*,'l1,m1,l2,m2',l1,m1,l2,m2
-     print*,'accu_re = ',accu_re
-     print*,'accu_im = ',accu_im
-     print*,'accu    = ',accu
-     if(l1.ne.l2.or.m1.ne.m2)then
-      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb OFF DIAG !!!!! '
-      endif
-     endif
-     if(l1==l2.and.m1==m2)then
-      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb DIAG !!!!! '
-      endif
-     endif
-    enddo
-   enddo
-  enddo
- enddo
- 
-
-end
-
-subroutine test_assoc_leg_pol
-  implicit none
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-  print *, 'Hello world'
- integer :: l1,m1,ngrid,i,l2,m2
- l1 = 0
- m1 = 0
- l2 = 2
- m2 = 0
- double precision :: x, dx,xmax,accu,xmin
- double precision :: plgndr,func_1,func_2,ortho_assoc_gaus_pol
- ngrid = 100000
- xmax = 1.d0
- xmin = -1.d0
- dx = (xmax-xmin)/dble(ngrid)
- do l2 = 0,10
-  x = xmin
-  accu = 0.d0
-  do i = 1, ngrid
-   func_1 = plgndr(l1,m1,x)
-   func_2 = plgndr(l2,m2,x)
-   write(33,*)x, func_1,func_2
-   accu += func_1 * func_2 * dx
-   x += dx
-  enddo
-  print*,'l2 = ',l2
-  print*,'accu = ',accu
-  print*,ortho_assoc_gaus_pol(l1,m1,l2)
- enddo
-end

From 5c69a7c005ecabe8428c386bf17bad3327891578 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 26 Apr 2024 10:57:57 +0200
Subject: [PATCH 017/131] removed stupid stuffs in spher_harm

---
 plugins/local/spher_harm/README.rst          |   3 +
 plugins/local/spher_harm/routines_test.irp.f | 172 ++++++++++---------
 plugins/local/spher_harm/spher_harm.irp.f    |   4 +-
 3 files changed, 93 insertions(+), 86 deletions(-)

diff --git a/plugins/local/spher_harm/README.rst b/plugins/local/spher_harm/README.rst
index bf897f73..9c9b12a6 100644
--- a/plugins/local/spher_harm/README.rst
+++ b/plugins/local/spher_harm/README.rst
@@ -2,3 +2,6 @@
 spher_harm
 ==========
 
+Routines for spherical Harmonics evaluation in real space. 
+The main routine is "spher_harm_func_r3(r,l,m,re_ylm, im_ylm)".  
+The test routine is "test_spher_harm" where everything is explained in details. 
diff --git a/plugins/local/spher_harm/routines_test.irp.f b/plugins/local/spher_harm/routines_test.irp.f
index 6f7cbc1c..fe8fc422 100644
--- a/plugins/local/spher_harm/routines_test.irp.f
+++ b/plugins/local/spher_harm/routines_test.irp.f
@@ -1,10 +1,93 @@
+subroutine test_spher_harm
+ implicit none
+ BEGIN_DOC
+ ! routine to test the generic spherical harmonics routine "spher_harm_func_r3" from R^3 --> C
+ ! 
+ ! We test <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2
+ ! 
+ ! The test is done through the integration on a sphere with the Lebedev grid. 
+ END_DOC
+  include 'constants.include.F'
+ integer :: l1,m1,i,l2,m2,lmax
+ double precision :: r(3),weight,accu_re, accu_im,accu
+ double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
+ double precision :: theta,phi,r_abs
+ lmax = 5 ! Maximum angular momentum until which we are going to test orthogonality conditions
+ do l1 = 0,lmax
+  do m1 =  -l1 ,l1
+   do l2 = 0,lmax
+    do m2 =  -l2 ,l2
+     accu_re = 0.d0 ! accumulator for the REAL part of <Y_l1,m1|Y_l2,m2>
+     accu_im = 0.d0 ! accumulator for the IMAGINARY part of <Y_l1,m1|Y_l2,m2>
+     accu    = 0.d0 ! accumulator for the weights ==> should be \int dOmega == 4 pi
+     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
+     !               \approx \sum_i W_i Y_l1,m1^*(r_i) Y_l2,m2(r_i) WITH r_i being on the spher of radius 1 
+     do i = 1, n_points_integration_angular 
+      r(1:3) = angular_quadrature_points(i,1:3) ! ith Lebedev point (x,y,z) on the sphere of radius 1 
+      weight = weights_angular_points(i)        ! associated Lebdev weight not necessarily positive 
+
+!!!!!!!!!!! Test of the Cartesian --> Spherical coordinates  
+      ! theta MUST belong to [0,pi] and phi to [0,2pi]
+      ! gets the cartesian to spherical change of coordinates 
+      call cartesian_to_spherical(r,theta,phi,r_abs)
+      if(theta.gt.pi.or.theta.lt.0.d0)then
+       print*,'pb with theta, it should be in [0,pi]',theta
+       print*,r
+      endif
+      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
+       print*,'pb with phi, it should be in [0,2 pi]',phi/pi
+       print*,r
+      endif
+      
+!!!!!!!!!!! Routines returning the Spherical harmonics on the grid point 
+      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
+      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
+      
+!!!!!!!!!!! Integration of Y_l1,m1^*(r) Y_l2,m2(r)
+     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
+     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
+      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
+      accu += weight
+     enddo
+     ! Test that the sum of the weights is 4 pi
+     if(dabs(accu - dfour_pi).gt.1.d-6)then
+      print*,'Problem !! The sum of the Lebedev weight is not 4 pi ..'
+      print*,accu
+      stop
+     endif
+     ! Test for the delta l1,l2 and delta m1,m2
+     !
+     ! Test for the off-diagonal part of the Kronecker delta 
+     if(l1.ne.l2.or.m1.ne.m2)then
+      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb OFF DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+     ! Test for the diagonal part of the Kronecker delta 
+     if(l1==l2.and.m1==m2)then
+      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
+       print*,'pb DIAG !!!!! '
+       print*,'l1,m1,l2,m2',l1,m1,l2,m2
+       print*,'accu_re = ',accu_re
+       print*,'accu_im = ',accu_im
+      endif
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+end
 
 subroutine test_cart
  implicit none
  BEGIN_DOC
  ! test for the cartesian --> spherical change of coordinates 
  !
- ! simple test such that the polar angle theta ranges in [0,pi]
+ ! test the routine "cartesian_to_spherical" such that the polar angle theta ranges in [0,pi]
  !
  ! and the asymuthal angle phi ranges in [0,2pi]
  END_DOC
@@ -40,97 +123,18 @@ subroutine test_cart
  print*,phi/pi
 end
 
-subroutine test_spher_harm
- implicit none
- BEGIN_DOC
- ! routine to test the spherical harmonics integration on a sphere with the grid. 
- ! 
- ! We test <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2
- END_DOC
-  include 'constants.include.F'
- integer :: l1,m1,i,l2,m2,lmax
- double precision :: r(3),weight,accu_re, accu_im,accu
- double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2
- l1 = 0
- m1 = 0
- l2 = 0
- m2 = 0
- lmax = 5
- do l1 = 0,lmax
-  do m1 =  -l1 ,l1
-   do l2 = 0,lmax
-    do m2 =  -l2 ,l2
-     accu_re = 0.d0
-     accu_im = 0.d0
-     ! <l1,m1|l2,m2> = \int dOmega Y_l1,m1^* Y_l2,m2 
-     !               = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2)
-     !               = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-     accu = 0.d0
-     do i = 1, n_points_integration_angular 
-      double precision :: theta,phi,r_abs
-      r(1:3) = angular_quadrature_points(i,1:3)
-      weight = weights_angular_points(i)
-      call cartesian_to_spherical(r,theta,phi,r_abs)
-      if(theta.gt.pi.or.theta.lt.0.d0)then
-       print*,'pb with theta',theta
-       print*,r
-      endif
-      if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then
-       print*,'pb with phi',phi/pi
-       print*,r
-      endif
-      call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1)
-      call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2)
-      accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2)
-      accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2)
-      accu += weight
-      write(33,'(10(F16.10,X))')phi/pi
-     enddo
-     ! Test for the delta l1,l2 and delta m1,m2
-     if(l1.ne.l2.or.m1.ne.m2)then
-      if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb OFF DIAG !!!!! '
-       print*,'l1,m1,l2,m2',l1,m1,l2,m2
-       print*,'accu_re = ',accu_re
-       print*,'accu_im = ',accu_im
-      endif
-     endif
-     if(l1==l2.and.m1==m2)then
-      if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then
-       print*,'pb DIAG !!!!! '
-       print*,'l1,m1,l2,m2',l1,m1,l2,m2
-       print*,'accu_re = ',accu_re
-       print*,'accu_im = ',accu_im
-      endif
-     endif
-    enddo
-   enddo
-  enddo
- enddo
- double precision :: x,dx,xmax,xmin
- integer:: nx
- nx = 10000
- xmin = -5.d0
- xmax =  5.d0
- dx = (xmax - xmin)/dble(nx)
- x = xmin
- do i = 1, nx
-  write(34,*)x,datan(x),dacos(x)
-  x += dx
- enddo
-end
 
 subroutine test_brutal_spheric
  implicit none
   include 'constants.include.F'
  BEGIN_DOC
- ! test for the <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2 using a two dimentional integration 
+ ! Test for the <Y_l1,m1|Y_l2,m2> = delta_m1,m2 delta_l1,l2 using the following two dimentional integration 
  ! 
  !   \int_0^2pi d Phi \int_-1^+1 d(cos(Theta)) Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi)
  !
  !=  \int_0^2pi d Phi \int_0^pi dTheta sin(Theta)  Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi) 
  !
- ! Allows to test for the general functions spher_harm_func_m_pos with spher_harm_func_expl
+ ! Allows to test for the general functions "spher_harm_func_m_pos" with "spher_harm_func_expl"
  END_DOC
  integer :: itheta, iphi,ntheta,nphi
  double precision :: theta_min, theta_max, dtheta,theta
@@ -147,7 +151,7 @@ subroutine test_brutal_spheric
  dphi = (phi_max - phi_min)/dble(nphi)
  dtheta = (theta_max - theta_min)/dble(ntheta)
 
- lmax = 3
+ lmax = 2
  do l1 = 0,lmax
   do m1 =  0 ,l1
    do l2 = 0,lmax
@@ -196,7 +200,7 @@ end
 subroutine test_assoc_leg_pol
   implicit none
   BEGIN_DOC
-! TODO : Put the documentation of the program here
+! Test for the associated Legendre Polynoms. The test is done through the orthogonality condition. 
   END_DOC
   print *, 'Hello world'
  integer :: l1,m1,ngrid,i,l2,m2
diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f
index e8deafb9..7a2eea06 100644
--- a/plugins/local/spher_harm/spher_harm.irp.f
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@@ -1,7 +1,7 @@
 program spher_harm
  implicit none
- call test_spher_harm
+! call test_spher_harm
 ! call test_cart
-! call test_brutal_spheric
+ call test_brutal_spheric
 end
 

From 40ea886cf1f6fe18d2501f1964e4f69deb66d947 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 1 May 2024 19:00:02 +0200
Subject: [PATCH 018/131] added NEED in local/slater_tc

---
 external/irpf90                         | 2 +-
 plugins/local/slater_tc/NEED            | 7 +++++++
 plugins/local/slater_tc/slater_tc.irp.f | 7 +++++++
 3 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 plugins/local/slater_tc/NEED
 create mode 100644 plugins/local/slater_tc/slater_tc.irp.f

diff --git a/external/irpf90 b/external/irpf90
index beac6153..4ab1b175 160000
--- a/external/irpf90
+++ b/external/irpf90
@@ -1 +1 @@
-Subproject commit beac615343f421bd6c0571a408ba389a6d5a32ac
+Subproject commit 4ab1b175fc7ed0d96c1912f13dc53579b24157a6
diff --git a/plugins/local/slater_tc/NEED b/plugins/local/slater_tc/NEED
new file mode 100644
index 00000000..ef0aa3f7
--- /dev/null
+++ b/plugins/local/slater_tc/NEED
@@ -0,0 +1,7 @@
+determinants
+normal_order_old
+bi_ort_ints
+bi_ortho_mos
+tc_keywords
+non_hermit_dav
+dav_general_mat
diff --git a/plugins/local/slater_tc/slater_tc.irp.f b/plugins/local/slater_tc/slater_tc.irp.f
new file mode 100644
index 00000000..27ab47c5
--- /dev/null
+++ b/plugins/local/slater_tc/slater_tc.irp.f
@@ -0,0 +1,7 @@
+program slater_tc
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+end

From 0465a0f4397a53daa5a3a1c8374a5e34f5b61c67 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 1 May 2024 19:03:21 +0200
Subject: [PATCH 019/131] added local/normal_order_old/NEED

---
 plugins/local/normal_order_old/NEED       | 1 +
 plugins/local/normal_order_old/README.rst | 4 ++++
 2 files changed, 5 insertions(+)
 create mode 100644 plugins/local/normal_order_old/NEED
 create mode 100644 plugins/local/normal_order_old/README.rst

diff --git a/plugins/local/normal_order_old/NEED b/plugins/local/normal_order_old/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/plugins/local/normal_order_old/NEED
@@ -0,0 +1 @@
+
diff --git a/plugins/local/normal_order_old/README.rst b/plugins/local/normal_order_old/README.rst
new file mode 100644
index 00000000..a284fcfd
--- /dev/null
+++ b/plugins/local/normal_order_old/README.rst
@@ -0,0 +1,4 @@
+================
+normal_order_old
+================
+

From c50018e8bdbd0e11da5af2ddfe4032c7d6e86df2 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 1 May 2024 20:25:01 +0200
Subject: [PATCH 020/131] TC SPRING CLEANING: BEGINNING

---
 .../bi_ort_ints/three_body_ints_bi_ort.irp.f  |    2 +-
 .../local/non_h_ints_mu/jast_1e_utils.irp.f   |   18 +-
 .../local/non_h_ints_mu/numerical_integ.irp.f |    6 +-
 .../local/non_h_ints_mu/tc_integ_num.irp.f    |   20 +-
 .../local/non_h_ints_mu/test_non_h_ints.irp.f |   26 +-
 plugins/local/non_hermit_dav/biorthog.irp.f   | 1069 +----------------
 .../lapack_diag_non_hermit.irp.f              |  118 --
 .../local/non_hermit_dav/new_routines.irp.f   |  670 -----------
 .../mu_j_ints_usual_mos.irp.f                 |    8 -
 plugins/local/tc_bi_ortho/EZFIO.cfg           |   11 +
 .../local/tc_bi_ortho/print_tc_energy.irp.f   |   30 +-
 plugins/local/tc_bi_ortho/print_tc_var.irp.f  |    5 +-
 .../save_bitcpsileft_for_qmcchem.irp.f        |    8 +-
 plugins/local/tc_bi_ortho/tc_utils.irp.f      |   89 +-
 plugins/local/tc_scf/EZFIO.cfg                |    4 +-
 plugins/local/tc_scf/combine_lr_tcscf.irp.f   |   75 --
 plugins/local/tc_scf/diago_vartcfock.irp.f    |   96 --
 plugins/local/tc_scf/diis_tcscf.irp.f         |   75 +-
 .../local/tc_scf/fock_3e_bi_ortho_cs.irp.f    |  299 -----
 .../local/tc_scf/fock_3e_bi_ortho_os.irp.f    |  536 ---------
 .../local/tc_scf/fock_3e_bi_ortho_uhf.irp.f   |   77 --
 .../tc_scf/fock_3e_bi_ortho_uhf_old.irp.f     |  490 --------
 plugins/local/tc_scf/fock_tc.irp.f            | 1000 +++++++++++++--
 plugins/local/tc_scf/fock_tc_mo_tot.irp.f     |   11 +-
 plugins/local/tc_scf/fock_vartc.irp.f         |  287 -----
 plugins/local/tc_scf/rh_tcscf_diis.irp.f      |    4 +-
 plugins/local/tc_scf/rh_tcscf_simple.irp.f    |    2 +-
 plugins/local/tc_scf/rh_vartcscf_simple.irp.f |   89 --
 plugins/local/tc_scf/tc_scf.irp.f             |   58 +-
 plugins/local/tc_scf/tc_scf_energy.irp.f      |   41 +-
 plugins/local/tc_scf/test_int.irp.f           |  970 ---------------
 .../extra_grid_vector.irp.f                   |   11 -
 .../grid_becke_vector.irp.f                   |   11 -
 src/utils/util.irp.f                          |   80 +-
 34 files changed, 1188 insertions(+), 5108 deletions(-)
 delete mode 100644 plugins/local/non_hermit_dav/new_routines.irp.f
 delete mode 100644 plugins/local/tc_scf/combine_lr_tcscf.irp.f
 delete mode 100644 plugins/local/tc_scf/diago_vartcfock.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_vartc.irp.f
 delete mode 100644 plugins/local/tc_scf/rh_vartcscf_simple.irp.f
 delete mode 100644 plugins/local/tc_scf/test_int.irp.f

diff --git a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
index fd4a162f..73e5a611 100644
--- a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
@@ -123,7 +123,7 @@ subroutine give_integrals_3_body_bi_ort_spin( n, sigma_n, l, sigma_l, k, sigma_k
   endif
 
   return
-end subroutine give_integrals_3_body_bi_ort_spin
+end
 
 ! ---
 
diff --git a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
index 9cfabf58..c6b2b0a0 100644
--- a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
@@ -132,6 +132,7 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
   double precision, allocatable :: A(:,:,:,:), b(:), A_tmp(:,:,:,:)
   double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:)
   double precision, allocatable :: u1e_tmp(:), tmp(:,:,:)
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)
   double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:)
 
 
@@ -176,26 +177,27 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
   ! --- --- ---
   ! get A
 
-  allocate(tmp(n_points_final_grid,ao_num,ao_num))
+  allocate(tmp1(n_points_final_grid,ao_num,ao_num), tmp2(n_points_final_grid,ao_num,ao_num))
   allocate(A(ao_num,ao_num,ao_num,ao_num))
 
   !$OMP PARALLEL               &
   !$OMP DEFAULT (NONE)         &
   !$OMP PRIVATE (i, j, ipoint) &
-  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp)
+  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2)
   !$OMP DO COLLAPSE(2)
   do j = 1, ao_num
     do i = 1, ao_num
       do ipoint = 1, n_points_final_grid
-        tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp1(ipoint,i,j) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp2(ipoint,i,j) =                                    aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
       enddo
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-            , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid  &
+  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0  &
+            , tmp1(1,1,1), n_points_final_grid, tmp2(1,1,1), n_points_final_grid &
             , 0.d0, A(1,1,1,1), ao_num*ao_num)
 
   allocate(A_tmp(ao_num,ao_num,ao_num,ao_num))
@@ -207,13 +209,13 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
   allocate(b(ao_num*ao_num))
 
   do ipoint = 1, n_points_final_grid
-    u1e_tmp(ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * u1e_tmp(ipoint)
+    u1e_tmp(ipoint) = u1e_tmp(ipoint)
   enddo
 
-  call dgemv("T", n_points_final_grid, ao_num*ao_num, 1.d0, tmp(1,1,1), n_points_final_grid, u1e_tmp(1), 1, 0.d0, b(1), 1)
+  call dgemv("T", n_points_final_grid, ao_num*ao_num, 1.d0, tmp1(1,1,1), n_points_final_grid, u1e_tmp(1), 1, 0.d0, b(1), 1)
 
   deallocate(u1e_tmp)
-  deallocate(tmp)
+  deallocate(tmp1, tmp2)
 
   ! --- --- ---
   ! solve Ax = b
diff --git a/plugins/local/non_h_ints_mu/numerical_integ.irp.f b/plugins/local/non_h_ints_mu/numerical_integ.irp.f
index 5436b857..2737774a 100644
--- a/plugins/local/non_h_ints_mu/numerical_integ.irp.f
+++ b/plugins/local/non_h_ints_mu/numerical_integ.irp.f
@@ -179,7 +179,7 @@ double precision function num_v_ij_erf_rk_cst_mu_env(i, j, ipoint)
     dx    = r1(1) - r2(1)
     dy    = r1(2) - r2(2)
     dz    = r1(3) - r2(3)
-    r12   = dsqrt( dx * dx + dy * dy + dz * dz )
+    r12   = dsqrt(dx*dx + dy*dy + dz*dz)
     if(r12 .lt. 1d-10) cycle
 
     tmp1  = (derf(mu_erf * r12) - 1.d0) / r12
@@ -228,7 +228,7 @@ subroutine num_x_v_ij_erf_rk_cst_mu_env(i, j, ipoint, integ)
     dx    = r1(1) - r2(1)
     dy    = r1(2) - r2(2)
     dz    = r1(3) - r2(3)
-    r12   = dsqrt( dx * dx + dy * dy + dz * dz )
+    r12   = dsqrt(dx*dx + dy*dy + dz*dz)
     if(r12 .lt. 1d-10) cycle
 
     tmp1  = (derf(mu_erf * r12) - 1.d0) / r12
@@ -530,7 +530,7 @@ subroutine num_int2_u_grad1u_total_env2(i, j, ipoint, integ)
     dx    = r1(1) - r2(1)
     dy    = r1(2) - r2(2)
     dz    = r1(3) - r2(3)
-    r12   = dsqrt( dx * dx + dy * dy + dz * dz )
+    r12   = dsqrt(dx*dx + dy*dy + dz*dz)
     if(r12 .lt. 1d-10) cycle
 
     tmp0 = env_nucl(r2)
diff --git a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
index 6d446037..9d9601c0 100644
--- a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
+++ b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
@@ -63,12 +63,10 @@
   do i_pass = 1, n_pass
     ii = (i_pass-1)*n_blocks + 1
   
-    !$OMP PARALLEL                                         &
-    !$OMP DEFAULT (NONE)                                   &
-    !$OMP PRIVATE (i_blocks, ipoint)                       &
-    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, &
-    !$OMP         final_grid_points, tmp_grad1_u12,        &
-    !$OMP         tmp_grad1_u12_squared)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12, tmp_grad1_u12_squared)
     !$OMP DO 
     do i_blocks = 1, n_blocks
       ipoint = ii - 1 + i_blocks ! r1
@@ -99,12 +97,10 @@
   
     ii = n_pass*n_blocks + 1
 
-    !$OMP PARALLEL                                       &
-    !$OMP DEFAULT (NONE)                                 &
-    !$OMP PRIVATE (i_rest, ipoint)                       &
-    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, &
-    !$OMP         final_grid_points, tmp_grad1_u12,      &
-    !$OMP         tmp_grad1_u12_squared)
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12, tmp_grad1_u12_squared)
     !$OMP DO 
     do i_rest = 1, n_rest
       ipoint = ii - 1 + i_rest ! r1
diff --git a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
index 464a1c1f..4c63dec4 100644
--- a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
+++ b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
@@ -1125,6 +1125,7 @@ subroutine test_fit_coef_A1()
   double precision              :: accu, norm, diff
   double precision, allocatable :: A1(:,:)
   double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:)
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)
 
   ! ---
 
@@ -1165,16 +1166,17 @@ subroutine test_fit_coef_A1()
 
   call wall_time(t1)
 
-  allocate(tmp(ao_num,ao_num,n_points_final_grid))
+  allocate(tmp1(ao_num,ao_num,n_points_final_grid), tmp2(ao_num,ao_num,n_points_final_grid))
   !$OMP PARALLEL               &
   !$OMP DEFAULT (NONE)         &
   !$OMP PRIVATE (i, j, ipoint) &
-  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp)
+  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2)
   !$OMP DO COLLAPSE(2)
   do j = 1, ao_num
     do i = 1, ao_num
       do ipoint = 1, n_points_final_grid
-        tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp1(i,j,ipoint) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp2(i,j,ipoint) =                                    aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
       enddo
     enddo
   enddo
@@ -1184,9 +1186,9 @@ subroutine test_fit_coef_A1()
   allocate(A2(ao_num,ao_num,ao_num,ao_num))
 
   call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-            , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num              &
+            , tmp1(1,1,1), ao_num*ao_num, tmp2(1,1,1), ao_num*ao_num            &
             , 0.d0, A2(1,1,1,1), ao_num*ao_num)
-  deallocate(tmp)
+  deallocate(tmp1, tmp2)
 
   call wall_time(t2)
   print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0
@@ -1238,6 +1240,7 @@ subroutine test_fit_coef_inv()
   double precision, allocatable :: A1(:,:), A1_inv(:,:), A1_tmp(:,:)
   double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:), A2_inv(:,:,:,:)
   double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A2_tmp(:,:,:,:)
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)
 
   cutoff_svd = 5d-8
 
@@ -1286,16 +1289,17 @@ subroutine test_fit_coef_inv()
 
   call wall_time(t1)
 
-  allocate(tmp(n_points_final_grid,ao_num,ao_num))
+  allocate(tmp1(n_points_final_grid,ao_num,ao_num), tmp2(n_points_final_grid,ao_num,ao_num))
   !$OMP PARALLEL               &
   !$OMP DEFAULT (NONE)         &
   !$OMP PRIVATE (i, j, ipoint) &
-  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp)
+  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2)
   !$OMP DO COLLAPSE(2)
   do j = 1, ao_num
     do i = 1, ao_num
       do ipoint = 1, n_points_final_grid
-        tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp1(ipoint,i,j) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp2(ipoint,i,j) =                                    aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
       enddo
     enddo
   enddo
@@ -1304,11 +1308,11 @@ subroutine test_fit_coef_inv()
 
   allocate(A2(ao_num,ao_num,ao_num,ao_num))
 
-  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-            , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid  &
+  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0  &
+            , tmp1(1,1,1), n_points_final_grid, tmp2(1,1,1), n_points_final_grid &
             , 0.d0, A2(1,1,1,1), ao_num*ao_num)
 
-  deallocate(tmp)
+  deallocate(tmp1, tmp2)
 
   call wall_time(t2)
   print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0
diff --git a/plugins/local/non_hermit_dav/biorthog.irp.f b/plugins/local/non_hermit_dav/biorthog.irp.f
index 2229e17d..b36b0130 100644
--- a/plugins/local/non_hermit_dav/biorthog.irp.f
+++ b/plugins/local/non_hermit_dav/biorthog.irp.f
@@ -1,254 +1,3 @@
-subroutine non_hrmt_diag_split_degen(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  !
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  integer                       :: n_good
-  double precision              :: shift,shift_current
-  double precision              :: r,thr
-  integer,          allocatable :: list_good(:), iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-
-
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  shift = 1.d-15
-  shift_current = shift
-  iteration = 1 
-  logical :: good_ortho
-  good_ortho = .False.
-  do while(n_real_eigv.ne.n.or. .not.good_ortho)
-   if(shift.gt.1.d-3)then
-    print*,'shift > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   print*,'***** iteration = ',iteration
-   print*,'shift = ',shift
-   allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-   Aw = A_save
-   do i = 1, n
-    do j = 1, n
-     if(dabs(Aw(j,i)).lt.shift)then
-      Aw(j,i) = 0.d0
-     endif
-    enddo
-   enddo
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   allocate(im_part(n),iorder(n))
-   do i = 1, n
-    im_part(i) = -dabs(WI(i))
-    iorder(i) = i
-   enddo
-   call dsort(im_part, iorder, n)
-
-   shift_current = max(10.d0 * dabs(im_part(1)),shift)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-   Aw = A_save
-   call split_matrix_degen(Aw,n,shift_current)
-   deallocate( im_part, iorder )
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   ! You track the real eigenvalues 
-   n_good = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_good += 1
-     else
-       print*,'Found an imaginary component to eigenvalue'
-       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   allocate( list_good(n_good), iorder(n_good) )
-   n_good = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_good += 1
-       list_good(n_good) = i
-       eigval(n_good) = WR(i)
-     endif
-   enddo
-   deallocate( WR, WI )
- 
-   n_real_eigv = n_good 
-   do i = 1, n_good
-     iorder(i) = i
-   enddo
- 
-   ! You sort the real eigenvalues 
-   call dsort(eigval, iorder, n_good)
- 
-   reigvec(:,:) = 0.d0 
-   leigvec(:,:) = 0.d0 
-   do i = 1, n_real_eigv
-     do j = 1, n
-       reigvec_tmp(j,i) = VR(j,list_good(iorder(i)))
-       leigvec_tmp(j,i) = Vl(j,list_good(iorder(i)))
-     enddo
-   enddo
-
-   if(n_real_eigv == n)then
-    allocate(S(n,n))
-    call check_bi_ortho(reigvec_tmp,leigvec_tmp,n,S,accu_nd)
-    print*,'accu_nd = ',accu_nd
-    double precision :: accu_nd
-    good_ortho = accu_nd .lt. 1.d-10
-    deallocate(S)
-   endif
- 
-   deallocate( list_good, iorder )
-   deallocate( VL, VR, Aw)
-   shift *= 10.d0
-   iteration += 1
-  enddo
-  do i = 1, n
-   do j = 1, n
-    reigvec(iorder_origin(j),i) = reigvec_tmp(j,i)
-    leigvec(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-end
-
-! ---
-
-subroutine non_hrmt_real_diag_new(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  !
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-
-  integer                       :: i, j
-  integer                       :: n_good
-  double precision              :: shift,shift_current
-  double precision              :: r,thr
-  integer,          allocatable :: list_good(:), iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:)
-  double precision, allocatable :: Aw(:,:)
-  double precision, allocatable :: im_part(:)
-
-
-  print*,'Computing the left/right eigenvectors ...'
-
-  ! Eigvalue(n) = WR(n) + i * WI(n)
-  shift = 1.d-10
-  do while(n_real_eigv.ne.n.or.shift.gt.1.d-3)
-   allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-   Aw = A
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   allocate(im_part(n), iorder(n))
-   do i = 1, n
-    im_part(i) = -dabs(WI(i))
-    iorder(i) = i
-   enddo
-   shift_current = max(10.d0 * dabs(im_part(1)),shift)
-   print*,'adding random number of magnitude ',shift_current
-   Aw = A
-   do i = 1, n
-     call RANDOM_NUMBER(r)
-     Aw(i,i) += shift_current * r
-   enddo
-   deallocate( im_part, iorder )
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
- 
-   ! You track the real eigenvalues 
-   thr = 1.d-10
-   n_good = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.thr)then
-       n_good += 1
-     else
-       print*,'Found an imaginary component to eigenvalue'
-       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
- 
-   allocate( list_good(n_good), iorder(n_good) )
-   n_good = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.thr)then
-       n_good += 1
-       list_good(n_good) = i
-       eigval(n_good) = WR(i)
-     endif
-   enddo
- 
-   deallocate( WR, WI )
- 
-   n_real_eigv = n_good 
-   do i = 1, n_good
-     iorder(i) = i
-   enddo
- 
-   ! You sort the real eigenvalues 
-   call dsort(eigval, iorder, n_good)
- 
-   reigvec(:,:) = 0.d0 
-   leigvec(:,:) = 0.d0 
-   do i = 1, n_real_eigv
-     do j = 1, n
-       reigvec(j,i) = VR(j,list_good(iorder(i)))
-       leigvec(j,i) = Vl(j,list_good(iorder(i)))
-     enddo
-   enddo
- 
-   deallocate( list_good, iorder )
-   deallocate( VL, VR, Aw)
-   shift *= 10.d0
-  enddo
-  if(shift.gt.1.d-3)then
-   print*,'shift > 1.d-3 !!'
-   print*,'Your matrix intrinsically contains complex eigenvalues'
-  endif
-
-end
 
 ! ---
 
@@ -282,126 +31,20 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
 
   allocate(phi_1_tilde(n),phi_2_tilde(n),chi_1_tilde(n),chi_2_tilde(n))
 
-
-  ! -------------------------------------------------------------------------------------
-  !
-
-  !print *, ' '
-  !print *, ' Computing the left/right eigenvectors ...'
-  !print *, ' '
-
   allocate(WR(n), WI(n), VL(n,n), VR(n,n)) 
-  
-  !print *, ' fock matrix'
-  !do i = 1, n
-  !  write(*, '(1000(F16.10,X))') A(i,:)
-  !enddo
 
-  !thr_cut = 1.d-15
-  !call cancel_small_elmts(A, n, thr_cut)
-
-  !call lapack_diag_non_sym_right(n, A, WR, WI, VR)
   call lapack_diag_non_sym(n, A, WR, WI, VL, VR)
-  !call lapack_diag_non_sym_new(n, A, WR, WI, VL, VR)
-
-
-
-  !print *, ' '
-  !print *, ' eigenvalues'
-  i = 1
-  do while(i .le. n)
-    !write(*, '(I3,X,1000(F16.10,X))')i, WR(i), WI(i)
-   if(.false.)then
-    if(WI(i).ne.0.d0)then
-     print*,'*****************'
-     print*,'WARNING ! IMAGINARY EIGENVALUES !!!'
-     write(*, '(1000(F16.10,X))') WR(i), WI(i+1)
-     ! phi = VR(:,i), psi = VR(:,i+1), |Phi_i> = phi + j psi , |Phi_i+1> = phi - j psi 
-     ! chi = VL(:,i), xhi = VL(:,i+1), |Chi_i> = chi + j xhi , |Chi_i+1> = chi - j xhi 
-     !                                   
-     accu_chi_phi = 0.d0
-     accu_xhi_psi = 0.d0
-     accu_chi_psi = 0.d0
-     accu_xhi_phi = 0.d0
-     double precision :: accu_chi_phi, accu_xhi_psi, accu_chi_psi, accu_xhi_phi
-     double precision :: mat_ovlp(2,2),eigval_tmp(2),eigvec(2,2),mat_ovlp_orig(2,2)
-     do j = 1, n 
-      accu_chi_phi += VL(j,i)   * VR(j,i)
-      accu_xhi_psi += VL(j,i+1) * VR(j,i+1)
-      accu_chi_psi += VL(j,i)   * VR(j,i+1)
-      accu_xhi_phi += VL(j,i+1) * VR(j,i)
-     enddo
-     mat_ovlp_orig(1,1) = accu_chi_phi
-     mat_ovlp_orig(2,1) = accu_xhi_phi
-     mat_ovlp_orig(1,2) = accu_chi_psi
-     mat_ovlp_orig(2,2) = accu_xhi_psi
-     print*,'old overlap matrix '
-     write(*,'(100(F16.10,X))')mat_ovlp_orig(1:2,1)
-     write(*,'(100(F16.10,X))')mat_ovlp_orig(1:2,2)
-
-
-     mat_ovlp(1,1) = accu_xhi_phi
-     mat_ovlp(2,1) = accu_chi_phi
-     mat_ovlp(1,2) = accu_xhi_psi
-     mat_ovlp(2,2) = accu_chi_psi
-    !print*,'accu_chi_phi = ',accu_chi_phi
-    !print*,'accu_xhi_psi = ',accu_xhi_psi
-    !print*,'accu_chi_psi = ',accu_chi_psi
-    !print*,'accu_xhi_phi = ',accu_xhi_phi
-     print*,'new overlap matrix '
-     write(*,'(100(F16.10,X))')mat_ovlp(1:2,1)
-     write(*,'(100(F16.10,X))')mat_ovlp(1:2,2)
-     call lapack_diag(eigval_tmp,eigvec,mat_ovlp,2,2)
-     print*,'eigval_tmp(1) = ',eigval_tmp(1)
-     print*,'eigvec(1) = ',eigvec(1:2,1)
-     print*,'eigval_tmp(2) = ',eigval_tmp(2)
-     print*,'eigvec(2) = ',eigvec(1:2,2)
-     print*,'*****************'
-     phi_1_tilde = 0.d0
-     phi_2_tilde = 0.d0
-     chi_1_tilde = 0.d0
-     chi_2_tilde = 0.d0
-     do j = 1, n
-      phi_1_tilde(j) += VR(j,i) * eigvec(1,1) + VR(j,i+1) * eigvec(2,1)
-      phi_2_tilde(j) += VR(j,i) * eigvec(1,2) + VR(j,i+1) * eigvec(2,2)
-      chi_1_tilde(j) += VL(j,i+1) * eigvec(1,1) + VL(j,i) * eigvec(2,1)
-      chi_2_tilde(j) += VL(j,i+1) * eigvec(1,2) + VL(j,i) * eigvec(2,2)
-     enddo
-     VR(1:n,i)   = phi_1_tilde(1:n)
-     VR(1:n,i+1) = phi_2_tilde(1:n)
-!     Vl(1:n,i)   = -chi_1_tilde(1:n)
-!     Vl(1:n,i+1) = chi_2_tilde(1:n)
-     i+=1 
-    endif
-   endif
-    i+=1
-  enddo
-  !print *, ' right eigenvect bef' 
-  !do i = 1, n
-  !  write(*, '(1000(F16.10,X))') VR(:,i)
-  !enddo
-  !print *, ' left eigenvect bef'
-  !do i = 1, n
-  !  write(*, '(1000(F16.10,X))') VL(:,i)
-  !enddo
 
   thr_diag = 1d-06
   thr_norm = 1d+10
-  !call check_EIGVEC(n, n, A, WR, VL, VR, thr_diag, thr_norm, .false.)
-
-  !
-  ! -------------------------------------------------------------------------------------
 
   ! ---
 
-  ! -------------------------------------------------------------------------------------
-  !                  track & sort the real eigenvalues 
+  ! track & sort the real eigenvalues 
 
   n_good = 0
-  !thr    = 100d0
   thr    = Im_thresh_tcscf
   do i = 1, n
-    !print*, 'Re(i) + Im(i)', WR(i), WI(i)
     if(dabs(WI(i)) .lt. thr) then
       n_good += 1
     else
@@ -410,11 +53,12 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
     endif
   enddo
 
-  if(n_good.ne.n)then
-   print*,'there are some imaginary eigenvalues '
-   thr_diag = 1d-03
-   n_good = n
+  if(n_good.ne.n) then
+    print*,'there are some imaginary eigenvalues '
+    thr_diag = 1d-03
+    n_good = n
   endif
+
   allocate(list_good(n_good), iorder(n_good))
 
   n_good = 0
@@ -446,26 +90,9 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
 
   ASSERT(n==n_real_eigv)
 
-  !print *, ' eigenvalues'
-  !do i = 1, n
-  !  write(*, '(1000(F16.10,X))') eigval(i)
-  !enddo
-  !print *, ' right eigenvect aft ord' 
-  !do i = 1, n
-  !  write(*, '(1000(F16.10,X))') reigvec(:,i)
-  !enddo
-  !print *, ' left eigenvect aft ord'
-  !do i = 1, n
-  !  write(*, '(1000(F16.10,X))') leigvec(:,i)
-  !enddo
-
-  !
-  ! -------------------------------------------------------------------------------------
-
   ! ---
 
-  ! -------------------------------------------------------------------------------------
-  !                               check bi-orthogonality
+  ! check bi-orthogonality
 
   thr_diag = 10.d0
   thr_norm = 1d+10
@@ -495,8 +122,6 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
 
     print *, ' lapack vectors are not normalized neither bi-orthogonalized'
 
-    ! ---
-
     allocate(deg_num(n))
     call reorder_degen_eigvec(n, deg_num, eigval, leigvec, reigvec)
     call impose_biorthog_degen_eigvec(n, deg_num, eigval, leigvec, reigvec)
@@ -508,700 +133,36 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
     endif
     call check_biorthog(n, n_real_eigv, leigvec, reigvec, accu_d, accu_nd, S, thr_d, thr_nd, .true.)
 
-    !call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, thr_diag, thr_norm, .true.)
-
     deallocate(S)
 
   endif
 
-  !
-  ! -------------------------------------------------------------------------------------
-
   return
 
 end
 
 ! ---
 
-subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigval)
+subroutine check_bi_ortho(reigvec, leigvec, n, S, accu_nd)
 
   BEGIN_DOC
-  ! 
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
+  ! retunrs the overlap matrix S = Leigvec^T Reigvec 
   !
+  ! and the square root of the sum of the squared off-diagonal elements of S
   END_DOC
 
   implicit none
   integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
+  double precision, intent(in)  :: reigvec(n,n), leigvec(n,n)
+  double precision, intent(out) :: S(n,n), accu_nd
 
-  integer                       :: i, j
-  integer                       :: n_good
-  double precision              :: thr
-  double precision              :: accu_nd
+  integer :: i,j
 
-  integer,          allocatable :: list_good(:), iorder(:)
-  double precision, allocatable :: Aw(:,:)
-  double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:)
-  double precision, allocatable :: S(:,:)
-  double precision :: r
-
-
-  ! -------------------------------------------------------------------------------------
-  !
-
-  print *, 'Computing the left/right eigenvectors ...'
-  allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n) )
-
-  Aw(:,:) = A(:,:)
-  call lapack_diag_non_sym_new(n, Aw, WR, WI, VL, VR)
-
-  thr    = 1.d-12
-  double precision, allocatable :: im_part(:)
-  n_good = 0
-  do i = 1, n
-    if( dabs(WI(i)).lt.thr ) then
-      n_good += 1
-    else
-      print*, 'Found an imaginary component to eigenvalue on i = ', i
-      print*, 'Re(i) + Im(i)', WR(i), WI(i)
-    endif
-  enddo
-  print*,'n_good = ',n_good
-  if(n_good .lt. n)then
-   print*,'Removing degeneracies to remove imaginary parts'
-   allocate(im_part(n),iorder(n))
-   r = 0.d0
-   do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-   enddo
-   call dsort(im_part,iorder,n) 
-   thr = 10.d0 * dabs(im_part(1))
-   print*,'adding random numbers on the diagonal of magnitude ',thr
-   Aw(:,:) = A(:,:)
-   do i = 1, n
-     call RANDOM_NUMBER(r)
-     print*,'r = ',r*thr
-     Aw(i,i) += thr * r
-   enddo
-   print*,'Rediagonalizing the matrix with random numbers'
-   call lapack_diag_non_sym_new(n, Aw, WR, WI, VL, VR)
-   deallocate(im_part,iorder)
-  endif
-  deallocate( Aw )
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  ! ---
-
-  ! -------------------------------------------------------------------------------------
-  !                  track & sort the real eigenvalues 
-
-  n_good = 0
-  thr    = 1.d-5
-  do i = 1, n
-    if( dabs(WI(i)).lt.thr ) then
-      n_good += 1
-    else
-      print*, 'Found an imaginary component to eigenvalue on i = ', i
-      print*, 'Re(i) + Im(i)', WR(i), WI(i)
-    endif
-  enddo
-  print*,'n_good = ',n_good
-  allocate( list_good(n_good), iorder(n_good) )
-
-  n_good = 0
-  do i = 1, n
-    if( dabs(WI(i)).lt.thr ) then
-      n_good += 1
-      list_good(n_good) = i
-      eigval(n_good) = WR(i)
-    endif
-  enddo
-
-  deallocate( WR, WI )
-
-  n_real_eigv = n_good 
-  do i = 1, n_good
-    iorder(i) = i
-  enddo
-  call dsort(eigval, iorder, n_good)
-      
-  reigvec(:,:) = 0.d0 
-  leigvec(:,:) = 0.d0 
-  do i = 1, n_real_eigv
-    do j = 1, n
-      reigvec(j,i) = VR(j,list_good(iorder(i)))
-      leigvec(j,i) = VL(j,list_good(iorder(i)))
-    enddo
-  enddo
-
-  deallocate( list_good, iorder )
-  deallocate( VL, VR )
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  ! ---
-
-  ! -------------------------------------------------------------------------------------
-  !                               check bi-orthogonality
-
-  allocate( S(n_real_eigv,n_real_eigv) )
-
-  ! S = VL x VR
-  call dgemm( 'T', 'N', n_real_eigv, n_real_eigv, n, 1.d0          &
-            , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) &
-            , 0.d0, S, size(S, 1) )
-
-  accu_nd = 0.d0
-  do i = 1, n_real_eigv
-    do j = 1, n_real_eigv
-      if(i==j) cycle
-      accu_nd = accu_nd + S(j,i) * S(j,i)
-    enddo
-  enddo
-  accu_nd = dsqrt(accu_nd)
-
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    ! L x R is already bi-orthogonal
-
-    print *, ' L & T bi-orthogonality: ok'
-    deallocate( S )
-    return
-
-  else
-    ! impose bi-orthogonality 
-
-    print *, ' L & T bi-orthogonality: not imposed yet'
-    print *, ' accu_nd = ', accu_nd
-    call impose_biorthog_qr(n, n_real_eigv, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec)
-    deallocate( S )
-  
-  endif
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  return
-
-end
-
-! ---
-
-subroutine non_hrmt_real_im(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  ! 
-  ! routine which returns the EIGENVALUES sorted the REAL part and corresponding LEFT/RIGHT eigenvetors 
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-
-  integer                       :: i, j
-  integer                       :: n_bad
-  double precision              :: thr
-  double precision              :: accu_nd
-
-  integer,          allocatable :: iorder(:)
-  double precision, allocatable :: Aw(:,:)
-  double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:)
-  double precision, allocatable :: S(:,:)
-  double precision :: r
-
-  ! -------------------------------------------------------------------------------------
-  !
-
-  print *, 'Computing the left/right eigenvectors ...'
-  allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n), iorder(n))
-
-  Aw(:,:) = A(:,:)
-   do i = 1, n
-     call RANDOM_NUMBER(r)
-     Aw(i,i) += 10.d-10* r
-   enddo
-  call lapack_diag_non_sym(n, Aw, WR, WI, VL, VR)
-
-  ! -------------------------------------------------------------------------------------
-  !                  track & sort the real eigenvalues 
-
-  i = 1
-  thr    = 1.d-15
-  n_real_eigv = 0
-  do while (i.le.n) 
-!    print*,i,dabs(WI(i))
-    if( dabs(WI(i)).gt.thr ) then
-      print*, 'Found an imaginary component to eigenvalue on i = ', i
-      print*, 'Re(i) , Im(i)  ', WR(i), WI(i)
-      iorder(i) = i
-      eigval(i) = WR(i)
-      i+=1
-      print*, 'Re(i+1),Im(i+1)',WR(i), WI(i)
-      iorder(i) = i
-      eigval(i) = WR(i)
-      i+=1
-    else  
-      n_real_eigv += 1
-      iorder(i) = i
-      eigval(i) = WR(i)
-      i+=1
-    endif
-  enddo
-  call dsort(eigval, iorder, n)
-  reigvec(:,:) = 0.d0 
-  leigvec(:,:) = 0.d0 
-  do i = 1, n
-    do j = 1, n
-      reigvec(j,i) = VR(j,iorder(i))
-      leigvec(j,i) = VL(j,iorder(i))
-    enddo
-  enddo
-
-  deallocate( iorder )
-  deallocate( VL, VR )
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  ! ---
-
-  ! -------------------------------------------------------------------------------------
-  !                               check bi-orthogonality
-
-  allocate( S(n,n) )
-
-  ! S = VL x VR
-  call dgemm( 'T', 'N', n, n, n, 1.d0          &
-            , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) &
-            , 0.d0, S, size(S, 1) )
-
-  accu_nd = 0.d0
-  do i = 1, n
-    do j = 1, n
-      if(i==j) cycle
-      accu_nd = accu_nd + S(j,i) * S(j,i)
-    enddo
-  enddo
-  accu_nd = dsqrt(accu_nd)
-
-  deallocate( S )
-
-end
-
-! ---
-
-subroutine non_hrmt_generalized_real_im(n, A, B, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  ! 
-  ! routine which returns the EIGENVALUES sorted the REAL part and corresponding LEFT/RIGHT eigenvetors 
-  ! for A R = lambda B R and A^\dagger L = lambda B^\dagger L
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n),B(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-
-  integer                       :: i, j
-  integer                       :: n_bad
-  double precision              :: thr
-  double precision              :: accu_nd
-
-  integer,          allocatable :: iorder(:)
-  double precision, allocatable :: Aw(:,:),Bw(:,:)
-  double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:), beta(:)
-  double precision, allocatable :: S(:,:)
-  double precision :: r
-
-  ! -------------------------------------------------------------------------------------
-  !
-
-  print *, 'Computing the left/right eigenvectors ...'
-  allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n), Bw(n,n),iorder(n),beta(n))
-
-  Aw(:,:) = A(:,:)
-  Bw(:,:) = B(:,:)
-  call lapack_diag_general_non_sym(n,Aw,Bw,WR,beta,WI,VL,VR)
-
-  ! -------------------------------------------------------------------------------------
-  !                  track & sort the real eigenvalues 
-
-  i = 1
-  thr    = 1.d-10
-  n_real_eigv = 0
-  do while (i.le.n) 
-    if( dabs(WI(i)).gt.thr ) then
-      print*, 'Found an imaginary component to eigenvalue on i = ', i
-      print*, 'Re(i) , Im(i)  ', WR(i), WI(i)
-      iorder(i) = i
-      eigval(i) = WR(i)/(beta(i) + 1.d-10)
-      i+=1
-      print*, 'Re(i+1),Im(i+1)',WR(i), WI(i)
-      iorder(i) = i
-      eigval(i) = WR(i)/(beta(i) + 1.d-10)
-      i+=1
-    else  
-      n_real_eigv += 1
-      iorder(i) = i
-      eigval(i) = WR(i)/(beta(i) + 1.d-10)
-      i+=1
-    endif
-  enddo
-  call dsort(eigval, iorder, n)
-  reigvec(:,:) = 0.d0 
-  leigvec(:,:) = 0.d0 
-  do i = 1, n
-    do j = 1, n
-      reigvec(j,i) = VR(j,iorder(i))
-      leigvec(j,i) = VL(j,iorder(i))
-    enddo
-  enddo
-
-  deallocate( iorder )
-  deallocate( VL, VR )
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  ! ---
-
-  ! -------------------------------------------------------------------------------------
-  !                               check bi-orthogonality
-
-  allocate( S(n,n) )
-
-  ! S = VL x VR
-  call dgemm( 'T', 'N', n, n, n, 1.d0          &
-            , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) &
-            , 0.d0, S, size(S, 1) )
-
-  accu_nd = 0.d0
-  do i = 1, n
-    do j = 1, n
-      if(i==j) cycle
-      accu_nd = accu_nd + S(j,i) * S(j,i)
-    enddo
-  enddo
-  accu_nd = dsqrt(accu_nd)
-
-  deallocate( S )
-
-end
-
-! ---
-
-subroutine non_hrmt_bieig_fullvect(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  ! 
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-
-  integer                       :: i, j
-  integer                       :: n_good
-  double precision              :: thr
-  double precision              :: accu_nd
-
-  integer,          allocatable :: iorder(:)
-  double precision, allocatable :: Aw(:,:)
-  double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:)
-  double precision, allocatable :: S(:,:)
-  double precision, allocatable :: eigval_sorted(:)
-
-
-  ! -------------------------------------------------------------------------------------
-  !
-
-  print *, 'Computing the left/right eigenvectors ...'
-
-  allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n) )
-  Aw(:,:) = A(:,:)
-
-  call lapack_diag_non_sym_new(n, Aw, WR, WI, VL, VR)
-
-  deallocate( Aw )
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  ! ---
-
-  ! -------------------------------------------------------------------------------------
-  !                  track & sort the real eigenvalues 
-
-  allocate( eigval_sorted(n), iorder(n) )
-
-  n_good = 0
-  thr    = 1.d-10
-
-  do i = 1, n
-
-    iorder(i) = i
-    eigval_sorted(i) = WR(i)
-
-    if(dabs(WI(i)) .gt. thr) then
-      print*, ' Found an imaginary component to eigenvalue on i = ', i
-      print*, ' Re(i) + Im(i)', WR(i), WI(i)
-    else
-      n_good += 1
-    endif
-
-  enddo
-
-  n_real_eigv = n_good 
-
-  call dsort(eigval_sorted, iorder, n)
-      
-  reigvec(:,:) = 0.d0 
-  leigvec(:,:) = 0.d0 
-  do i = 1, n
-    eigval(i) = WR(i)
-    do j = 1, n
-      reigvec(j,i) = VR(j,iorder(i))
-      leigvec(j,i) = VL(j,iorder(i))
-    enddo
-  enddo
-
-  deallocate( eigval_sorted, iorder )
-  deallocate( WR, WI )
-  deallocate( VL, VR )
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  ! ---
-
-  ! -------------------------------------------------------------------------------------
-  !                               check bi-orthogonality
-
-  allocate( S(n,n) )
-
-  ! S = VL x VR
-  call dgemm( 'T', 'N', n, n, n, 1.d0                              &
-            , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) &
-            , 0.d0, S, size(S, 1) )
-
-  accu_nd = 0.d0
-  do i = 1, n
-    do j = 1, n
-      if(i==j) cycle
-      accu_nd = accu_nd + S(j,i) * S(j,i)
-    enddo
-  enddo
-  accu_nd = dsqrt(accu_nd)
-
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    ! L x R is already bi-orthogonal
-
-    !print *, ' L & T bi-orthogonality: ok'
-    deallocate( S )
-    return
-
-  else
-    ! impose bi-orthogonality 
-
-    !print *, ' L & T bi-orthogonality: not imposed yet'
-    !print *, ' accu_nd = ', accu_nd
-    call impose_biorthog_qr(n, n, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec)
-    deallocate( S )
-  
-  endif
-
-  !
-  ! -------------------------------------------------------------------------------------
-
-  return
-
-end
-
-! ---
-
-
-subroutine split_matrix_degen(aw,n,shift)
- implicit none
- BEGIN_DOC
- ! subroutines that splits the degeneracies of a matrix by adding a splitting of magnitude thr * n_degen/2
- !
- ! WARNING !! THE MATRIX IS ASSUMED TO BE PASSED WITH INCREASING DIAGONAL ELEMENTS
- END_DOC
- double precision,intent(inout) :: Aw(n,n)
- double precision,intent(in)    :: shift
- integer, intent(in) :: n
- integer :: i,j,n_degen
- logical :: keep_on
- i=1
- do while(i.lt.n)
-  if(dabs(Aw(i,i)-Aw(i+1,i+1)).lt.shift)then
-   j=1
-   keep_on = .True.
-   do while(keep_on)
-    if(i+j.gt.n)then
-     keep_on = .False.
-     exit
-    endif
-    if(dabs(Aw(i,i)-Aw(i+j,i+j)).lt.shift)then
-     j+=1
-    else
-     keep_on=.False.
-     exit
-    endif
-   enddo
-   n_degen = j
-   j=0
-   keep_on = .True.
-   do while(keep_on)
-    if(i+j+1.gt.n)then
-     keep_on = .False.
-     exit
-    endif
-    if(dabs(Aw(i+j,i+j)-Aw(i+j+1,i+j+1)).lt.shift)then
-     Aw(i+j,i+j) += (j-n_degen/2) * shift
-     j+=1
-    else 
-     keep_on = .False.
-     exit
-    endif
-   enddo
-   Aw(i+n_degen-1,i+n_degen-1) += (n_degen-1-n_degen/2) * shift
-   i+=n_degen
-  else 
-   i+=1
-  endif
- enddo
-
-end
-
-subroutine give_degen(a,n,shift,list_degen,n_degen_list)
- implicit none
- BEGIN_DOC
- ! returns n_degen_list :: the number of degenerated SET of elements (i.e. with |A(i)-A(i+1)| below shift)
- !
- ! for each of these sets, list_degen(1,i) = first degenerate element of the set i, 
- !
- !                         list_degen(2,i) = last degenerate element of the set i.
- END_DOC
- double precision,intent(in) :: A(n)
- double precision,intent(in)    :: shift
- integer, intent(in) :: n
- integer, intent(out) :: list_degen(2,n),n_degen_list
- integer :: i,j,n_degen,k
- logical :: keep_on
- double precision,allocatable :: Aw(:)
- list_degen = -1
- allocate(Aw(n))
- Aw = A
- i=1
- k = 0
- do while(i.lt.n)
-  if(dabs(Aw(i)-Aw(i+1)).lt.shift)then
-   k+=1
-   j=1
-   list_degen(1,k) = i
-   keep_on = .True.
-   do while(keep_on)
-    if(i+j.gt.n)then
-     keep_on = .False.
-     exit
-    endif
-    if(dabs(Aw(i)-Aw(i+j)).lt.shift)then
-     j+=1
-    else
-     keep_on=.False.
-     exit
-    endif
-   enddo
-   n_degen = j
-   list_degen(2,k) = list_degen(1,k)-1 + n_degen
-   j=0
-   keep_on = .True.
-   do while(keep_on)
-    if(i+j+1.gt.n)then
-     keep_on = .False.
-     exit
-    endif
-    if(dabs(Aw(i+j)-Aw(i+j+1)).lt.shift)then
-     Aw(i+j) += (j-n_degen/2) * shift
-     j+=1
-    else 
-     keep_on = .False.
-     exit
-    endif
-   enddo
-   Aw(i+n_degen-1) += (n_degen-1-n_degen/2) * shift
-   i+=n_degen
-  else 
-   i+=1
-  endif
- enddo
- n_degen_list = k
-
-end
-
-subroutine cancel_small_elmts(aw,n,shift)
- implicit none
- BEGIN_DOC
- ! subroutines that splits the degeneracies of a matrix by adding a splitting of magnitude thr * n_degen/2
- !
- ! WARNING !! THE MATRIX IS ASSUMED TO BE PASSED WITH INCREASING DIAGONAL ELEMENTS
- END_DOC
- double precision,intent(inout) :: Aw(n,n)
- double precision,intent(in)    :: shift
- integer, intent(in) :: n
- integer :: i,j
- do i = 1, n
-  do j = 1, n
-   if(dabs(Aw(j,i)).lt.shift)then
-    Aw(j,i) = 0.d0
-   endif
-  enddo
- enddo
-end
-
-subroutine check_bi_ortho(reigvec,leigvec,n,S,accu_nd)
- implicit none
- integer, intent(in) :: n
- double precision,intent(in) :: reigvec(n,n),leigvec(n,n)
- double precision, intent(out) :: S(n,n),accu_nd
- BEGIN_DOC
-! retunrs the overlap matrix S = Leigvec^T Reigvec 
-!
-! and the square root of the sum of the squared off-diagonal elements of S
- END_DOC
- integer :: i,j
   ! S = VL x VR
   call dgemm( 'T', 'N', n, n, n, 1.d0 &
             , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1)  &
             , 0.d0, S, size(S, 1) )
+
   accu_nd = 0.d0
   do i = 1, n
     do j = 1, n
@@ -1213,3 +174,5 @@ subroutine check_bi_ortho(reigvec,leigvec,n,S,accu_nd)
   accu_nd = dsqrt(accu_nd)
 
 end
+
+
diff --git a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
index 4d4bc047..2c053ac8 100644
--- a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
+++ b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
@@ -273,60 +273,6 @@ end
 
 ! ---
 
-subroutine lapack_diag_non_sym_right(n, A, WR, WI, VR)
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  double precision, intent(out) :: WR(n), WI(n), VR(n,n)
-
-  integer                       :: i, lda, ldvl, ldvr, LWORK, INFO
-  double precision, allocatable :: Atmp(:,:), WORK(:), VL(:,:)
-
-  lda  = n
-  ldvl = 1
-  ldvr = n
-
-  allocate( Atmp(n,n), VL(1,1) )
-  Atmp(1:n,1:n) = A(1:n,1:n)
-
-  allocate(WORK(1))
-  LWORK = -1
-  call dgeev('N', 'V', n, Atmp, lda, WR, WI, VL, ldvl, VR, ldvr, WORK, LWORK, INFO)
-  if(INFO.gt.0)then
-    print*,'dgeev failed !!',INFO
-    stop
-  endif
-
-  LWORK = max(int(WORK(1)), 1) ! this is the optimal size of WORK 
-  deallocate(WORK)
-
-  allocate(WORK(LWORK))
-
-  ! Actual diagonalization 
-  call dgeev('N', 'V', n, Atmp, lda, WR, WI, VL, ldvl, VR, ldvr, WORK, LWORK, INFO)
-  if(INFO.ne.0) then
-    print*,'dgeev failed !!', INFO
-    stop
-  endif
-
-  deallocate(Atmp, WORK, VL)
-
-! print *, ' JOBL = F'
-! print *, ' eigenvalues'
-! do i = 1, n
-!   write(*, '(1000(F16.10,X))') WR(i), WI(i)
-! enddo
-! print *, ' right eigenvect' 
-! do i = 1, n
-!   write(*, '(1000(F16.10,X))') VR(:,i)
-! enddo
-
-end
-
-! ---
-
 subroutine non_hrmt_real_diag(n, A, leigvec, reigvec, n_real_eigv, eigval)
 
   BEGIN_DOC
@@ -1780,70 +1726,6 @@ end
 
 ! ---
 
-subroutine check_weighted_biorthog(n, m, W, Vl, Vr, thr_d, thr_nd, accu_d, accu_nd, S, stop_ifnot)
-
-  implicit none
-  
-  integer,          intent(in)  :: n, m
-  double precision, intent(in)  :: Vl(n,m), Vr(n,m), W(n,n)
-  double precision, intent(in)  :: thr_d, thr_nd
-  logical,          intent(in)  :: stop_ifnot
-  double precision, intent(out) :: accu_d, accu_nd, S(m,m)
-
-  integer                       :: i, j
-  double precision, allocatable :: SS(:,:), tmp(:,:)
-
-  print *, ' check weighted bi-orthogonality'
-
-  ! ---
-
-  allocate(tmp(m,n))
-  call dgemm( 'T', 'N', m, n, n, 1.d0        &
-            , Vl, size(Vl, 1), W, size(W, 1) &
-            , 0.d0, tmp, size(tmp, 1) )
-  call dgemm( 'N', 'N', m, m, n, 1.d0            &
-            , tmp, size(tmp, 1), Vr, size(Vr, 1) &
-            , 0.d0, S, size(S, 1) )
-  deallocate(tmp)
-
-  !print *, ' overlap matrix:'
-  !do i = 1, m
-  !  write(*,'(1000(F16.10,X))') S(i,:)
-  !enddo
-
-  accu_d  = 0.d0
-  accu_nd = 0.d0
-  do i = 1, m
-    do j = 1, m
-      if(i==j) then
-        accu_d = accu_d + dabs(S(i,i))
-      else
-        accu_nd = accu_nd + S(j,i) * S(j,i)
-      endif
-    enddo
-  enddo
-  accu_nd = dsqrt(accu_nd)
-
-  print *, ' accu_nd = ', accu_nd
-  print *, ' accu_d  = ', dabs(accu_d-dble(m))/dble(m)
-
-  ! ---
-
-  if( stop_ifnot .and. ((accu_nd .gt. thr_nd) .or. dabs(accu_d-dble(m))/dble(m) .gt. thr_d) ) then
-    print *, ' non bi-orthogonal vectors !'
-    print *, ' accu_nd = ', accu_nd
-    print *, ' accu_d  = ', dabs(accu_d-dble(m))/dble(m)
-    !print *, ' overlap matrix:'
-    !do i = 1, m
-    !  write(*,'(1000(F16.10,X))') S(i,:)
-    !enddo
-    stop
-  endif
-
-end
-
-! ---
-
 subroutine check_biorthog(n, m, Vl, Vr, accu_d, accu_nd, S, thr_d, thr_nd, stop_ifnot)
 
   implicit none
diff --git a/plugins/local/non_hermit_dav/new_routines.irp.f b/plugins/local/non_hermit_dav/new_routines.irp.f
deleted file mode 100644
index 8db044d3..00000000
--- a/plugins/local/non_hermit_dav/new_routines.irp.f
+++ /dev/null
@@ -1,670 +0,0 @@
-subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  !
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  double precision              :: shift_current
-  double precision              :: r,thr,accu_d, accu_nd
-  integer,          allocatable :: iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-  double precision :: accu,thr_cut, thr_norm=1d0
-
-
-  thr_cut = 1.d-15
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
- ! initialization
-  shift_current = 1.d-15
-  iteration = 0 
-  print*,'***** iteration = ',iteration
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-!  print*,'Aw'
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-!   write(*,'(100(F16.10,X))')A(:,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-  allocate(im_part(n),iorder(n))
-  allocate( S(n,n) )
-
-
-  Aw = A_save
-  call cancel_small_elmts(aw,n,thr_cut)
-  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-  do i = 1, n
-   im_part(i) = -dabs(WI(i))
-   iorder(i) = i
-  enddo
-  call dsort(im_part, iorder, n)
-  n_real_eigv = 0
-  do i = 1, n
-    if(dabs(WI(i)).lt.1.d-20)then
-      n_real_eigv += 1
-    else
-!      print*,'Found an imaginary component to eigenvalue'
-!      print*,'Re(i) + Im(i)',WR(i),WI(i)
-    endif
-  enddo
-  if(n_real_eigv.ne.n)then
-   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-  else
-   print*,'All eigenvalues are real !'
-  endif
-
-
-  do while(n_real_eigv.ne.n)
-   iteration += 1
-   print*,'***** iteration = ',iteration
-   if(shift_current.gt.1.d-3)then
-    print*,'shift_current > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   Aw = A_save
-   call cancel_small_elmts(Aw,n,thr_cut)
-   call split_matrix_degen(Aw,n,shift_current)
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   n_real_eigv = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_real_eigv+= 1
-     else
-!       print*,'Found an imaginary component to eigenvalue'
-!       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   if(n_real_eigv.ne.n)then
-    do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-    enddo
-    call dsort(im_part, iorder, n)
-    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-    print*,'Splitting the degeneracies by ',shift_current
-   else
-    print*,'All eigenvalues are real !'
-   endif
-  enddo
-  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
-  do i = 1, n
-   eigval(i) = WR(i)
-   iorder(i) = i
-  enddo
-  call dsort(eigval,iorder,n)
-  do i = 1, n
-!   print*,'eigval(i) = ',eigval(i)
-   reigvec_tmp(:,i) = VR(:,iorder(i))
-   leigvec_tmp(:,i) = Vl(:,iorder(i))
-  enddo
-
-!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
-  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else
-    print *, ' '
-    print *, ' bi-orthogonality: not imposed yet'
-    print *, ' '
-    print *, ' '
-    print *, ' orthog between degen eigenvect' 
-    print *, ' '
-    double precision, allocatable :: S_nh_inv_half(:,:)
-    allocate(S_nh_inv_half(n,n))
-    logical :: complex_root
-    deallocate(S_nh_inv_half)
-    call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp)
-    call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S)
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-     if(accu_nd .lt. thresh_biorthog_nondiag) then
-       print *, ' bi-orthogonality: ok'
-     else 
-      print*,'New vectors not bi-orthonormals at ',accu_nd
-      print*,'Must be a deep problem ...'
-      stop
-     endif
-    endif
-  endif
- 
-  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
-  do i = 1, n
-   do j = 1, n
-    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
-    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-  !! RECOMPUTING THE EIGENVALUES 
-  eigval = 0.d0
-  do i = 1, n
-   iorder(i) = i
-   accu = 0.d0
-   do j = 1, n
-    accu += VL(j,i) * VR(j,i) 
-    do k = 1, n
-     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
-    enddo
-   enddo
-   eigval(i) *= 1.d0/accu
-!   print*,'eigval(i) = ',eigval(i)
-  enddo
-  !! RESORT JUST TO BE SURE
-  call dsort(eigval, iorder, n)
-  do i = 1, n
-   do j = 1, n
-    reigvec(j,i) = VR(j,iorder(i))
-    leigvec(j,i) = VL(j,iorder(i))
-   enddo
-  enddo
-  print*,'Checking for final reigvec/leigvec'
-  shift_current = max(1.d-10,shift_current)
-  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else 
-   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
-   print*,'Eigenvectors are not bi orthonormal ..'
-   print*,'accu_nd = ',accu_nd
-   stop
-  endif
-
-end 
-
-
-
-subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  !
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  double precision              :: shift_current
-  double precision              :: r,thr,accu_d, accu_nd
-  integer,          allocatable :: iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-  double precision :: accu,thr_cut, thr_norm=1.d0
-  double precision, allocatable :: S_nh_inv_half(:,:)
-  logical :: complex_root
-
-
-  thr_cut = 1.d-15
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
- ! initialization
-  shift_current = 1.d-15
-  iteration = 0 
-  print*,'***** iteration = ',iteration
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-!  print*,'Aw'
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-!   write(*,'(100(F16.10,X))')A(:,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-  allocate(im_part(n),iorder(n))
-  allocate( S(n,n) )
-  allocate(S_nh_inv_half(n,n))
-
-
-  Aw = A_save
-  call cancel_small_elmts(aw,n,thr_cut)
-  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-  do i = 1, n
-   im_part(i) = -dabs(WI(i))
-   iorder(i) = i
-  enddo
-  call dsort(im_part, iorder, n)
-  n_real_eigv = 0
-  do i = 1, n
-    if(dabs(WI(i)).lt.1.d-20)then
-      n_real_eigv += 1
-    else
-!      print*,'Found an imaginary component to eigenvalue'
-!      print*,'Re(i) + Im(i)',WR(i),WI(i)
-    endif
-  enddo
-  if(n_real_eigv.ne.n)then
-   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-  else
-   print*,'All eigenvalues are real !'
-  endif
-
-
-  do while(n_real_eigv.ne.n)
-   iteration += 1
-   print*,'***** iteration = ',iteration
-   if(shift_current.gt.1.d-3)then
-    print*,'shift_current > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   Aw = A_save
-!   thr_cut = shift_current
-   call cancel_small_elmts(Aw,n,thr_cut)
-   call split_matrix_degen(Aw,n,shift_current)
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   n_real_eigv = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_real_eigv+= 1
-     else
-!       print*,'Found an imaginary component to eigenvalue'
-!       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   if(n_real_eigv.ne.n)then
-    do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-    enddo
-    call dsort(im_part, iorder, n)
-    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-    print*,'Splitting the degeneracies by ',shift_current
-   else
-    print*,'All eigenvalues are real !'
-   endif
-  enddo
-  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
-  do i = 1, n
-   eigval(i) = WR(i)
-   iorder(i) = i
-  enddo
-  call dsort(eigval,iorder,n)
-  do i = 1, n
-!   print*,'eigval(i) = ',eigval(i)
-   reigvec_tmp(:,i) = VR(:,iorder(i))
-   leigvec_tmp(:,i) = Vl(:,iorder(i))
-  enddo
-
-!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
-  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else
-    print *, ' '
-    print *, ' bi-orthogonality: not imposed yet'
-    if(complex_root) then 
-     print *, ' '
-     print *, ' '
-     print *, ' orthog between degen eigenvect' 
-     print *, ' '
-     ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
-     call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) ! orthogonalization of reigvec
-     call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) ! orthogonalization of leigvec
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S,  thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) 
-
-     if(accu_nd .lt. thresh_biorthog_nondiag) then
-       print *, ' bi-orthogonality: ok'
-     else 
-      print*,'New vectors not bi-orthonormals at ', accu_nd
-      call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half, complex_root)
-      if(complex_root)then 
-       call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR
-      else
-       print*,'S^{-1/2} exists !!'
-       call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
-      endif
-     endif
-    else ! the matrix S^{-1/2} exists
-     print*,'S^{-1/2} exists !!'
-     call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
-    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     print*,'Must be a deep problem ...'
-     stop
-    endif
-  endif
- 
-  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
-  do i = 1, n
-   do j = 1, n
-    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
-    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-  !! RECOMPUTING THE EIGENVALUES 
-  eigval = 0.d0
-  do i = 1, n
-   iorder(i) = i
-   accu = 0.d0
-   do j = 1, n
-    accu += VL(j,i) * VR(j,i) 
-    do k = 1, n
-     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
-    enddo
-   enddo
-   eigval(i) *= 1.d0/accu
-!   print*,'eigval(i) = ',eigval(i)
-  enddo
-  !! RESORT JUST TO BE SURE
-  call dsort(eigval, iorder, n)
-  do i = 1, n
-   do j = 1, n
-    reigvec(j,i) = VR(j,iorder(i))
-    leigvec(j,i) = VL(j,iorder(i))
-   enddo
-  enddo
-  print*,'Checking for final reigvec/leigvec'
-  shift_current = max(1.d-10,shift_current)
-  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else 
-   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
-   print*,'Eigenvectors are not bi orthonormal ..'
-   print*,'accu_nd = ',accu_nd
-   stop
-  endif
-
-end 
-
-
-subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine returning the eigenvalues and left/right eigenvectors of the TC fock matrix 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  double precision              :: shift_current
-  double precision              :: r,thr,accu_d, accu_nd
-  integer,          allocatable :: iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-  double precision :: accu,thr_cut
-  double precision, allocatable :: S_nh_inv_half(:,:)
-  logical :: complex_root
-  double precision :: thr_norm=1d0
-
-
-  thr_cut = 1.d-15
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
- ! initialization
-  shift_current = 1.d-15
-  iteration = 0 
-  print*,'***** iteration = ',iteration
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-!  print*,'Aw'
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-!   write(*,'(100(F16.10,X))')A(:,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-  allocate(im_part(n),iorder(n))
-  allocate( S(n,n) )
-  allocate(S_nh_inv_half(n,n))
-
-
-  Aw = A_save
-  call cancel_small_elmts(aw,n,thr_cut)
-  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-  do i = 1, n
-   im_part(i) = -dabs(WI(i))
-   iorder(i) = i
-  enddo
-  call dsort(im_part, iorder, n)
-  n_real_eigv = 0
-  do i = 1, n
-    if(dabs(WI(i)).lt.1.d-20)then
-      n_real_eigv += 1
-    else
-!      print*,'Found an imaginary component to eigenvalue'
-!      print*,'Re(i) + Im(i)',WR(i),WI(i)
-    endif
-  enddo
-  if(n_real_eigv.ne.n)then
-   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-  else
-   print*,'All eigenvalues are real !'
-  endif
-
-
-  do while(n_real_eigv.ne.n)
-   iteration += 1
-   print*,'***** iteration = ',iteration
-   if(shift_current.gt.1.d-3)then
-    print*,'shift_current > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   Aw = A_save
-!   thr_cut = shift_current
-   call cancel_small_elmts(Aw,n,thr_cut)
-   call split_matrix_degen(Aw,n,shift_current)
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   n_real_eigv = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_real_eigv+= 1
-     else
-!       print*,'Found an imaginary component to eigenvalue'
-!       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   if(n_real_eigv.ne.n)then
-    do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-    enddo
-    call dsort(im_part, iorder, n)
-    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-    print*,'Splitting the degeneracies by ',shift_current
-   else
-    print*,'All eigenvalues are real !'
-   endif
-  enddo
-  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
-  do i = 1, n
-   eigval(i) = WR(i)
-   iorder(i) = i
-  enddo
-  call dsort(eigval,iorder,n)
-  do i = 1, n
-!   print*,'eigval(i) = ',eigval(i)
-   reigvec_tmp(:,i) = VR(:,iorder(i))
-   leigvec_tmp(:,i) = Vl(:,iorder(i))
-  enddo
-
-!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
-  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else
-    print *, ' '
-    print *, ' bi-orthogonality: not imposed yet'
-    print *, ' '
-    print *, ' '
-    print *, ' Using impose_unique_biorthog_degen_eigvec' 
-    print *, ' '
-    ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
-    call impose_unique_biorthog_degen_eigvec(n, eigval, mo_coef, leigvec_tmp, reigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    print*,'accu_nd = ',accu_nd
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half,complex_root)
-     if(complex_root)then 
-      print*,'S^{-1/2} does not exits, using QR bi-orthogonalization'
-      call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR
-     else
-      print*,'S^{-1/2} exists !!'
-      call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
-     endif
-    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     print*,'Must be a deep problem ...'
-     stop
-    endif
-  endif
- 
-  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
-  do i = 1, n
-   do j = 1, n
-    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
-    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-  !! RECOMPUTING THE EIGENVALUES 
-  eigval = 0.d0
-  do i = 1, n
-   iorder(i) = i
-   accu = 0.d0
-   do j = 1, n
-    accu += VL(j,i) * VR(j,i) 
-    do k = 1, n
-     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
-    enddo
-   enddo
-   eigval(i) *= 1.d0/accu
-!   print*,'eigval(i) = ',eigval(i)
-  enddo
-  !! RESORT JUST TO BE SURE
-  call dsort(eigval, iorder, n)
-  do i = 1, n
-   do j = 1, n
-    reigvec(j,i) = VR(j,iorder(i))
-    leigvec(j,i) = VL(j,iorder(i))
-   enddo
-  enddo
-  print*,'Checking for final reigvec/leigvec'
-  shift_current = max(1.d-10,shift_current)
-  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else 
-   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
-   print*,'Eigenvectors are not bi orthonormal ..'
-   print*,'accu_nd = ',accu_nd
-   stop
-  endif
-
-end 
-
-
diff --git a/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f b/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f
index a3f1b6ef..cb7cdb22 100644
--- a/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f
+++ b/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f
@@ -183,11 +183,3 @@ BEGIN_PROVIDER [ double precision, x_W_ij_erf_rk, ( n_points_final_grid,3,mo_num
 
 END_PROVIDER 
 
-BEGIN_PROVIDER [ double precision, sqrt_weight_at_r, (n_points_final_grid)]
- implicit none
- integer :: ipoint
- do ipoint = 1, n_points_final_grid
-  sqrt_weight_at_r(ipoint) = dsqrt(final_weight_at_r_vector(ipoint))
- enddo
-END_PROVIDER 
-
diff --git a/plugins/local/tc_bi_ortho/EZFIO.cfg b/plugins/local/tc_bi_ortho/EZFIO.cfg
index a34d2134..67c780d7 100644
--- a/plugins/local/tc_bi_ortho/EZFIO.cfg
+++ b/plugins/local/tc_bi_ortho/EZFIO.cfg
@@ -9,3 +9,14 @@ interface: ezfio
 doc: Coefficients for the right wave function
 type: double precision
 size: (determinants.n_det,determinants.n_states)
+
+[tc_gs_energy]
+type: Threshold
+doc: TC GS Energy
+interface: ezfio
+
+[tc_gs_var]
+type: Threshold
+doc: TC GS VAR
+interface: ezfio
+
diff --git a/plugins/local/tc_bi_ortho/print_tc_energy.irp.f b/plugins/local/tc_bi_ortho/print_tc_energy.irp.f
index ef38cbcc..1fa0c6d9 100644
--- a/plugins/local/tc_bi_ortho/print_tc_energy.irp.f
+++ b/plugins/local/tc_bi_ortho/print_tc_energy.irp.f
@@ -6,18 +6,9 @@ program print_tc_energy
 
   implicit none
 
-  print *, 'Hello world'
-
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
   read_wf = .True.
   touch read_wf
 
-
   PROVIDE j2e_type
   PROVIDE j1e_type
   PROVIDE env_type
@@ -26,6 +17,27 @@ program print_tc_energy
   print *, ' j1e_type = ', j1e_type
   print *, ' env_type = ', env_type
 
+
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
+  call write_int(6, my_n_pt_a_grid, 'angular external grid over')
+
+  if(tc_integ_type .eq. "numeric") then
+    my_extra_grid_becke  = .True.
+    PROVIDE tc_grid2_a tc_grid2_r
+    my_n_pt_r_extra_grid = tc_grid2_r
+    my_n_pt_a_extra_grid = tc_grid2_a
+    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
+    call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+    call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
+  endif
+
   call write_tc_energy()
 
 end
diff --git a/plugins/local/tc_bi_ortho/print_tc_var.irp.f b/plugins/local/tc_bi_ortho/print_tc_var.irp.f
index bec34f18..6743cd11 100644
--- a/plugins/local/tc_bi_ortho/print_tc_var.irp.f
+++ b/plugins/local/tc_bi_ortho/print_tc_var.irp.f
@@ -6,7 +6,8 @@ program print_tc_var
 
   implicit none
 
-  print *, 'Hello world'
+  print *, ' TC VAR is available only for HF REF WF'
+  print *, ' DO NOT FORGET TO RUN A CISD CALCULATION BEF'
 
   my_grid_becke = .True.
   PROVIDE tc_grid1_a tc_grid1_r
@@ -17,7 +18,7 @@ program print_tc_var
   read_wf = .True.
   touch read_wf
 
-  call write_tc_var()
+  call write_tc_gs_var_HF()
 
 end
 
diff --git a/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f b/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f
index efa4aa2c..ac90f737 100644
--- a/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f
+++ b/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f
@@ -38,9 +38,9 @@ subroutine main()
         call ezfio_has_cisd_energy(exists)
         if(.not.exists) then
 
-          call ezfio_has_tc_scf_bitc_energy(exists)
+          call ezfio_has_tc_scf_tcscf_energy(exists)
           if(exists) then
-            call ezfio_get_tc_scf_bitc_energy(e_ref)
+            call ezfio_get_tc_scf_tcscf_energy(e_ref)
           endif
 
         else
@@ -59,7 +59,7 @@ subroutine main()
 
   close(iunit)
 
-end subroutine main
+end
 
 ! --
 
@@ -89,7 +89,7 @@ subroutine write_lr_spindeterminants()
   call ezfio_set_spindeterminants_psi_left_coef_matrix_values(buffer)
   deallocate(buffer)
 
-end subroutine write_lr_spindeterminants
+end
 
 ! ---
 
diff --git a/plugins/local/tc_bi_ortho/tc_utils.irp.f b/plugins/local/tc_bi_ortho/tc_utils.irp.f
index 53fe5884..43a6865e 100644
--- a/plugins/local/tc_bi_ortho/tc_utils.irp.f
+++ b/plugins/local/tc_bi_ortho/tc_utils.irp.f
@@ -2,12 +2,67 @@
 subroutine write_tc_energy()
 
   implicit none
-  integer          :: i, j, k
-  double precision :: hmono, htwoe, hthree, htot
-  double precision :: E_TC, O_TC
-  double precision :: E_1e, E_2e, E_3e
+  integer                       :: i, j, k
+  double precision              :: hmono, htwoe, hthree, htot
+  double precision              :: E_TC, O_TC
+  double precision              :: E_1e, E_2e, E_3e
+  double precision, allocatable :: E_TC_tmp(:), E_1e_tmp(:), E_2e_tmp(:), E_3e_tmp(:)
 
-  do k = 1, n_states
+  ! GS 
+  ! ---
+
+  allocate(E_TC_tmp(N_det), E_1e_tmp(N_det), E_2e_tmp(N_det), E_3e_tmp(N_det))
+
+  !$OMP PARALLEL                                                                &
+  !$OMP DEFAULT (NONE)                                                          &
+  !$OMP PRIVATE(i, j, hmono, htwoe, hthree, htot)                               &
+  !$OMP SHARED(N_det, psi_det, N_int, psi_l_coef_bi_ortho, psi_r_coef_bi_ortho, &
+  !$OMP        E_TC_tmp, E_1e_tmp, E_2e_tmp, E_3e_tmp)
+  !$OMP DO
+  do i = 1, N_det
+    E_TC_tmp(i) = 0.d0
+    E_1e_tmp(i) = 0.d0
+    E_2e_tmp(i) = 0.d0
+    E_3e_tmp(i) = 0.d0
+    do j = 1, N_det
+      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+      E_TC_tmp(i) = E_TC_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * htot
+      E_1e_tmp(i) = E_1e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * hmono
+      E_2e_tmp(i) = E_2e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * htwoe
+      E_3e_tmp(i) = E_3e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * hthree
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  E_1e = 0.d0
+  E_2e = 0.d0
+  E_3e = 0.d0
+  E_TC = 0.d0
+  O_TC = 0.d0
+  do i = 1, N_det
+    E_1e = E_1e + E_1e_tmp(i)
+    E_2e = E_2e + E_2e_tmp(i)
+    E_3e = E_3e + E_3e_tmp(i)
+    E_TC = E_TC + E_TC_tmp(i)
+    O_TC = O_TC + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(i,1)
+  enddo
+
+  print *, ' state :', 1
+  print *, " E_TC = ", E_TC / O_TC
+  print *, " E_1e = ", E_1e / O_TC
+  print *, " E_2e = ", E_2e / O_TC
+  print *, " E_3e = ", E_3e / O_TC
+  print *, " O_TC = ", O_TC
+
+  call ezfio_set_tc_bi_ortho_tc_gs_energy(E_TC/O_TC)
+
+  ! ---
+
+  ! ES 
+  ! ---
+
+  do k = 2, n_states
 
     E_TC = 0.d0
     E_1e = 0.d0
@@ -37,6 +92,8 @@ subroutine write_tc_energy()
 
   enddo
 
+  deallocate(E_TC_tmp, E_1e_tmp, E_2e_tmp, E_3e_tmp)
+
 end
 
 ! ---
@@ -66,3 +123,25 @@ end
 
 ! ---
 
+subroutine write_tc_gs_var_HF()
+
+  implicit none
+  integer          :: i, j, k
+  double precision :: hmono, htwoe, hthree, htot
+  double precision :: SIGMA_TC
+
+  SIGMA_TC = 0.d0
+  do j = 2, N_det
+    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot)
+    SIGMA_TC = SIGMA_TC + htot * htot
+  enddo
+
+  print *, " SIGMA_TC = ", SIGMA_TC
+
+  call ezfio_set_tc_bi_ortho_tc_gs_var(SIGMA_TC)
+
+end
+
+! ---
+
+
diff --git a/plugins/local/tc_scf/EZFIO.cfg b/plugins/local/tc_scf/EZFIO.cfg
index 3dfa9a71..510c777c 100644
--- a/plugins/local/tc_scf/EZFIO.cfg
+++ b/plugins/local/tc_scf/EZFIO.cfg
@@ -1,6 +1,6 @@
-[bitc_energy]
+[tcscf_energy]
 type: Threshold
-doc: Energy bi-tc HF
+doc: TC-SCF ENERGY 
 interface: ezfio
 
 [converged_tcscf]
diff --git a/plugins/local/tc_scf/combine_lr_tcscf.irp.f b/plugins/local/tc_scf/combine_lr_tcscf.irp.f
deleted file mode 100644
index a22614ba..00000000
--- a/plugins/local/tc_scf/combine_lr_tcscf.irp.f
+++ /dev/null
@@ -1,75 +0,0 @@
-
-! ---
-
-program combine_lr_tcscf
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  bi_ortho = .True.
-  touch bi_ortho
-
-  call comb_orbitals()
-
-end
-
-! ---
-
-subroutine comb_orbitals()
-
-  implicit none
-  integer                       :: i, m, n, nn, mm
-  double precision              :: accu_d, accu_nd
-  double precision, allocatable :: R(:,:), L(:,:), Rnew(:,:), tmp(:,:), S(:,:)
-
-  n  = ao_num
-  m  = mo_num
-  nn = elec_alpha_num
-  mm = m - nn
-
-  allocate(L(n,m), R(n,m), Rnew(n,m), S(m,m))
-  L = mo_l_coef
-  R = mo_r_coef
-
-  call check_weighted_biorthog(n, m, ao_overlap, L, R, accu_d, accu_nd, S, .true.)
-
-  allocate(tmp(n,nn))
-  do i = 1, nn 
-    tmp(1:n,i) = R(1:n,i)
-  enddo
-  call impose_weighted_orthog_svd(n, nn, ao_overlap, tmp)
-  do i = 1, nn
-    Rnew(1:n,i) = tmp(1:n,i)
-  enddo
-  deallocate(tmp)
-
-  allocate(tmp(n,mm))
-  do i = 1, mm
-    tmp(1:n,i) = L(1:n,i+nn)
-  enddo
-  call impose_weighted_orthog_svd(n, mm, ao_overlap, tmp)
-  do i = 1, mm
-    Rnew(1:n,i+nn) = tmp(1:n,i)
-  enddo
-  deallocate(tmp)
-
-  call check_weighted_biorthog(n, m, ao_overlap, Rnew, Rnew, accu_d, accu_nd, S, .true.)
-
-  mo_r_coef = Rnew
-  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-
-  deallocate(L, R, Rnew, S)
-
-end subroutine comb_orbitals
-
-! ---
-
diff --git a/plugins/local/tc_scf/diago_vartcfock.irp.f b/plugins/local/tc_scf/diago_vartcfock.irp.f
deleted file mode 100644
index 0c881dcb..00000000
--- a/plugins/local/tc_scf/diago_vartcfock.irp.f
+++ /dev/null
@@ -1,96 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, fock_vartc_eigvec_mo, (mo_num, mo_num)]
-
-  implicit none
-
-  integer                       :: i, j
-  integer                       :: liwork, lwork, n, info
-  integer,          allocatable :: iwork(:)
-  double precision, allocatable :: work(:), F(:,:), F_save(:,:)
-  double precision, allocatable :: diag(:)
-
-  PROVIDE mo_r_coef
-  PROVIDE Fock_matrix_vartc_mo_tot
-
-  allocate( F(mo_num,mo_num), F_save(mo_num,mo_num)  )
-  allocate (diag(mo_num) )
-
-  do j = 1, mo_num
-    do i = 1, mo_num
-      F(i,j) = Fock_matrix_vartc_mo_tot(i,j)
-    enddo
-  enddo
-
-  ! Insert level shift here
-  do i = elec_beta_num+1, elec_alpha_num
-    F(i,i) += 0.5d0 * level_shift_tcscf
-  enddo
-  do i = elec_alpha_num+1, mo_num
-    F(i,i) += level_shift_tcscf
-  enddo
-
-  n = mo_num
-  lwork = 1+6*n + 2*n*n
-  liwork = 3 + 5*n
-
-  allocate(work(lwork))
-  allocate(iwork(liwork) )
-
-  lwork = -1
-  liwork = -1
-
-  F_save = F
-  call dsyevd('V', 'U', mo_num, F, size(F, 1), diag, work, lwork, iwork, liwork, info)
-
-  if (info /= 0) then
-    print *,  irp_here//' DSYEVD failed : ', info
-    stop 1
-  endif
-  lwork = int(work(1))
-  liwork = iwork(1)
-  deallocate(iwork)
-  deallocate(work)
-
-  allocate(work(lwork))
-  allocate(iwork(liwork) )
-  call dsyevd('V', 'U', mo_num, F, size(F, 1), diag, work, lwork, iwork, liwork, info)
-  deallocate(iwork)
-
-  if (info /= 0) then
-    F = F_save
-    call dsyev('V', 'L', mo_num, F, size(F, 1), diag, work, lwork, info)
-
-    if (info /= 0) then
-      print *,  irp_here//' DSYEV failed : ', info
-      stop 1
-    endif
-  endif
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-      fock_vartc_eigvec_mo(j,i) = F(j,i)
-    enddo
-  enddo
-
-  deallocate(work, F, F_save, diag)
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, fock_vartc_eigvec_ao, (ao_num, mo_num)]
-
-  implicit none
-
-  PROVIDE mo_r_coef
-
-  call dgemm( 'N', 'N', ao_num, mo_num, mo_num, 1.d0                                             &
-            , mo_r_coef, size(mo_r_coef, 1), fock_vartc_eigvec_mo, size(fock_vartc_eigvec_mo, 1) &
-            , 0.d0, fock_vartc_eigvec_ao, size(fock_vartc_eigvec_ao, 1))
-
-END_PROVIDER
-
-! ---
-
diff --git a/plugins/local/tc_scf/diis_tcscf.irp.f b/plugins/local/tc_scf/diis_tcscf.irp.f
index 5d7d6b2e..ccc8eb15 100644
--- a/plugins/local/tc_scf/diis_tcscf.irp.f
+++ b/plugins/local/tc_scf/diis_tcscf.irp.f
@@ -91,28 +91,14 @@ BEGIN_PROVIDER [double precision, FQS_SQF_ao, (ao_num, ao_num)]
   double precision, allocatable :: tmp(:,:)
   double precision, allocatable :: F(:,:)
 
-  !print *, ' Providing FQS_SQF_ao ...'
-  !call wall_time(t0)
+  PROVIDE Fock_matrix_tc_ao_tot
 
   allocate(F(ao_num,ao_num))
-  if(var_tc) then
-
-    do i = 1, ao_num
-      do j = 1, ao_num
-        F(j,i) = Fock_matrix_vartc_ao_tot(j,i)
-      enddo
+  do i = 1, ao_num
+    do j = 1, ao_num
+      F(j,i) = Fock_matrix_tc_ao_tot(j,i)
     enddo
-
-  else
-
-    PROVIDE Fock_matrix_tc_ao_tot
-    do i = 1, ao_num
-      do j = 1, ao_num
-        F(j,i) = Fock_matrix_tc_ao_tot(j,i)
-      enddo
-    enddo
-
-  endif
+  enddo
 
   allocate(tmp(ao_num,ao_num))
 
@@ -140,9 +126,6 @@ BEGIN_PROVIDER [double precision, FQS_SQF_ao, (ao_num, ao_num)]
   deallocate(tmp)
   deallocate(F)
 
-  !call wall_time(t1)
-  !print *, ' Wall time for FQS_SQF_ao =', t1-t0
-
 END_PROVIDER
 
 ! ---
@@ -152,61 +135,13 @@ BEGIN_PROVIDER [double precision, FQS_SQF_mo, (mo_num, mo_num)]
   implicit none
   double precision :: t0, t1
 
-  !print*, ' Providing FQS_SQF_mo ...'
-  !call wall_time(t0)
-
   PROVIDE mo_r_coef mo_l_coef
   PROVIDE FQS_SQF_ao
 
   call ao_to_mo_bi_ortho( FQS_SQF_ao, size(FQS_SQF_ao, 1) &
                         , FQS_SQF_mo, size(FQS_SQF_mo, 1) )
 
-  !call wall_time(t1)
-  !print*, ' Wall time for FQS_SQF_mo =', t1-t0 
-
 END_PROVIDER
 
 ! ---
 
-! BEGIN_PROVIDER [ double precision, eigenval_Fock_tc_ao, (ao_num) ]
-!&BEGIN_PROVIDER [ double precision, eigenvec_Fock_tc_ao, (ao_num,ao_num) ]
-!
-!  BEGIN_DOC
-!  !
-!  ! Eigenvalues and eigenvectors of the Fock matrix over the ao basis
-!  !
-!  ! F' = X.T x F x X   where X = ao_overlap^(-1/2)
-!  ! 
-!  ! F'   x Cr' = Cr' x E ==> F   Cr = Cr x E with Cr = X x Cr'
-!  ! F'.T x Cl' = Cl' x E ==> F.T Cl = Cl x E with Cl = X x Cl'
-!  !
-!  END_DOC
-!
-!  implicit none
-!  double precision, allocatable :: tmp1(:,:), tmp2(:,:)
-!
-!  ! ---
-!  ! Fock matrix in orthogonal basis: F' = X.T x F x X
-!
-!  allocate(tmp1(ao_num,ao_num))
-!  call dgemm( 'N', 'N', ao_num, ao_num, ao_num, 1.d0                                                 &
-!            , Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1), S_half_inv, size(S_half_inv, 1) &
-!            , 0.d0, tmp1, size(tmp1, 1) )
-!
-!  allocate(tmp2(ao_num,ao_num))
-!  call dgemm( 'T', 'N', ao_num, ao_num, ao_num, 1.d0               &
-!            , S_half_inv, size(S_half_inv, 1), tmp1, size(tmp1, 1) &
-!            , 0.d0, tmp2, size(tmp2, 1) )
-!
-!  ! ---
-!
-!  ! Diagonalize F' to obtain eigenvectors in orthogonal basis C' and eigenvalues
-!  ! TODO
-!
-!  ! Back-transform eigenvectors: C =X.C'
-!
-!END_PROVIDER
-
-! ---
-
-~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f
deleted file mode 100644
index 8fd5e5b6..00000000
--- a/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f
+++ /dev/null
@@ -1,299 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)]
-
-  implicit none
-  integer                       :: a, b, i, j, ipoint
-  double precision              :: ti, tf
-  double precision              :: loc_1, loc_2, loc_3
-  double precision, allocatable :: Okappa(:), Jkappa(:,:)
-  double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:)
-  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:), tmp_22(:,:,:)
-  double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' PROVIDING fock_3e_uhf_mo_cs ...'
-  !call wall_time(ti)
-
-  ! ---
-
-  allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid))
-  Jkappa = 0.d0
-  Okappa = 0.d0
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2)               &
-  !$OMP SHARED (n_points_final_grid, elec_beta_num,               &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, Okappa, Jkappa)
-
-  allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid))
-  tmp_omp_d2 = 0.d0
-  tmp_omp_d1 = 0.d0
-
-  !$OMP DO
-  do i = 1, elec_beta_num
-    do ipoint = 1, n_points_final_grid
-      tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
-      tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
-      tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
-      tmp_omp_d1(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do ipoint = 1, n_points_final_grid
-    Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) 
-    Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) 
-    Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) 
-    Okappa(ipoint)   += tmp_omp_d1(ipoint)   
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp_omp_d2, tmp_omp_d1)
-
-  !$OMP END PARALLEL
-
-  ! ---
-
-  allocate(tmp_1(n_points_final_grid,4))
-
-  do ipoint = 1, n_points_final_grid
-    loc_1 = 2.d0 * Okappa(ipoint) 
-    tmp_1(ipoint,1) = loc_1 * Jkappa(ipoint,1)
-    tmp_1(ipoint,2) = loc_1 * Jkappa(ipoint,2)
-    tmp_1(ipoint,3) = loc_1 * Jkappa(ipoint,3)
-    tmp_1(ipoint,4) = Okappa(ipoint)
-  enddo
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, i, j, loc_1, tmp_omp_d2)                 &
-  !$OMP SHARED (n_points_final_grid, elec_beta_num,               &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, tmp_1)
-
-  allocate(tmp_omp_d2(n_points_final_grid,3))
-  tmp_omp_d2 = 0.d0
-
-  !$OMP DO COLLAPSE(2)
-  do i = 1, elec_beta_num
-    do j = 1, elec_beta_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
-
-        tmp_omp_d2(ipoint,1) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) 
-        tmp_omp_d2(ipoint,2) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) 
-        tmp_omp_d2(ipoint,3) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) 
-      enddo
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do ipoint = 1, n_points_final_grid
-    tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) 
-    tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) 
-    tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) 
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp_omp_d2)
-  !$OMP END PARALLEL
-
-  ! ---
-
-  if(tc_save_mem) then
-
-    allocate(tmp_22(n_points_final_grid,4,mo_num))
-    do a = 1, mo_num
-      !$OMP PARALLEL                                                  &
-      !$OMP DEFAULT (NONE)                                            &
-      !$OMP PRIVATE (ipoint, b, i)                                    &
-      !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, a,    &
-      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-      !$OMP         tmp_22)
-      !$OMP DO
-      do b = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-          tmp_22(ipoint,1,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
-          tmp_22(ipoint,2,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
-          tmp_22(ipoint,3,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
-        enddo
-        tmp_22(:,4,b) = 0.d0
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-            tmp_22(ipoint,4,b) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
-                                                                     + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
-                                                                     + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
-          enddo
-        enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
-      call dgemv( 'T', 4*n_points_final_grid, mo_num, -2.d0        &
-                , tmp_22(1,1,1), size(tmp_22, 1) * size(tmp_22, 2) &
-                , tmp_1(1,1), 1                                    &
-                , 0.d0, fock_3e_uhf_mo_cs(1,a), 1)
-    enddo
-    deallocate(tmp_22)
-
-  else
-
-    allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num))
-    !$OMP PARALLEL                                                  &
-    !$OMP DEFAULT (NONE)                                            &
-    !$OMP PRIVATE (ipoint, a, b, i)                                 &
-    !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
-    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-    !$OMP         tmp_2)
-    !$OMP DO COLLAPSE(2)
-    do a = 1, mo_num
-      do b = 1, mo_num
-        do ipoint = 1, n_points_final_grid
-          tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
-          tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
-          tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
-        enddo
-        tmp_2(:,4,b,a) = 0.d0
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-            tmp_2(ipoint,4,b,a) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
-                                                                      + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
-                                                                      + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
-          enddo
-        enddo
-      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-    call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, -2.d0 &
-              , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2)  &
-              , tmp_1(1,1), 1                                    &
-              , 0.d0, fock_3e_uhf_mo_cs(1,1), 1)
-    deallocate(tmp_2)
-
-  endif
-
-  deallocate(tmp_1)
-
-  ! ---
-
-  allocate(tmp_3(n_points_final_grid,5,mo_num), tmp_4(n_points_final_grid,5,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, b, loc_1, loc_2)                         &
-  !$OMP SHARED (n_points_final_grid, mo_num,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector, Jkappa, tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    tmp_3(:,:,b) = 0.d0
-    tmp_4(:,:,b) = 0.d0
-    do ipoint = 1, n_points_final_grid
-      tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
-
-      tmp_4(ipoint,1,b) = -2.d0 * mos_r_in_r_array_transp(ipoint,b) * ( Jkappa(ipoint,1) * Jkappa(ipoint,1) &
-                                                                      + Jkappa(ipoint,2) * Jkappa(ipoint,2) &
-                                                                      + Jkappa(ipoint,3) * Jkappa(ipoint,3) )
-      tmp_4(ipoint,5,b) = mos_r_in_r_array_transp(ipoint,b)
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2)                      &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,  &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         Jkappa, tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    do i = 1, elec_beta_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
-        loc_2 = mos_r_in_r_array_transp(ipoint,i)
-
-        tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
-        tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
-        tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
-        tmp_3(ipoint,5,b) += 2.d0 * loc_1 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
-                                            + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
-                                            + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
-                                                                                                       
-        tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
-        tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
-        tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
-        tmp_4(ipoint,1,b) += 2.d0 * loc_2 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
-                                            + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
-                                            + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3)            &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,  &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    do i = 1, elec_beta_num
-      do j = 1, elec_beta_num
-        do ipoint = 1, n_points_final_grid
-
-          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
-          loc_2 = mos_r_in_r_array_transp(ipoint,b)
-          loc_3 = mos_r_in_r_array_transp(ipoint,i)
-
-          tmp_3(ipoint,5,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
-
-          tmp_4(ipoint,1,b) += ( loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i)   &
-                                         + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i)   &
-                                         + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) &
-                               - loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b)   &
-                                         + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b)   &
-                                         + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) )
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  ! ---
-
-  call dgemm( 'T', 'N', mo_num, mo_num, 5*n_points_final_grid, 1.d0 &
-            , tmp_3(1,1,1), 5*n_points_final_grid                   &
-            , tmp_4(1,1,1), 5*n_points_final_grid                   &
-            , 1.d0, fock_3e_uhf_mo_cs(1,1), mo_num)
-
-  deallocate(tmp_3, tmp_4)
-  deallocate(Jkappa, Okappa)
-
-  ! ---
-
-  !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_mo_cs =', (tf - ti) / 60.d0
-
-END_PROVIDER 
-
-! ---
-
diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f
deleted file mode 100644
index 4bbce720..00000000
--- a/plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f
+++ /dev/null
@@ -1,536 +0,0 @@
-
-! ---
-
- BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a_os, (mo_num, mo_num)]
-&BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_b_os, (mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! Open Shell part of the Fock matrix from three-electron terms 
-  !
-  ! WARNING :: non hermitian if bi-ortho MOS used 
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: a, b, i, j, ipoint
-  double precision              :: loc_1, loc_2, loc_3, loc_4
-  double precision              :: ti, tf
-  double precision, allocatable :: Okappa(:), Jkappa(:,:), Obarkappa(:), Jbarkappa(:,:)
-  double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:)
-  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:)
-  double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' Providing fock_3e_uhf_mo_a_os and fock_3e_uhf_mo_b_os ...'
-  !call wall_time(ti)
-
-  ! ---
-
-  allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid))
-  allocate(Jbarkappa(n_points_final_grid,3), Obarkappa(n_points_final_grid))
-  Jkappa    = 0.d0
-  Okappa    = 0.d0
-  Jbarkappa = 0.d0
-  Obarkappa = 0.d0
-
-  !$OMP PARALLEL                                                    &
-  !$OMP DEFAULT (NONE)                                              &
-  !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2)                 &
-  !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-  !$OMP         int2_grad1_u12_bimo_t, Okappa, Jkappa, Obarkappa, Jbarkappa)
-
-  allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid))
-
-  tmp_omp_d2 = 0.d0
-  tmp_omp_d1 = 0.d0
-  !$OMP DO
-  do i = 1, elec_beta_num
-    do ipoint = 1, n_points_final_grid
-      tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
-      tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
-      tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
-      tmp_omp_d1(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-  !$OMP CRITICAL
-  do ipoint = 1, n_points_final_grid
-    Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) 
-    Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) 
-    Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) 
-    Okappa(ipoint)   += tmp_omp_d1(ipoint)   
-  enddo
-  !$OMP END CRITICAL
-
-  tmp_omp_d2 = 0.d0
-  tmp_omp_d1 = 0.d0
-  !$OMP DO
-  do i = elec_beta_num+1, elec_alpha_num
-    do ipoint = 1, n_points_final_grid
-      tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
-      tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
-      tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
-      tmp_omp_d1(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-  !$OMP CRITICAL
-  do ipoint = 1, n_points_final_grid
-    Jbarkappa(ipoint,1) += tmp_omp_d2(ipoint,1) 
-    Jbarkappa(ipoint,2) += tmp_omp_d2(ipoint,2) 
-    Jbarkappa(ipoint,3) += tmp_omp_d2(ipoint,3) 
-    Obarkappa(ipoint)   += tmp_omp_d1(ipoint)   
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp_omp_d2, tmp_omp_d1)
-  !$OMP END PARALLEL
-
-  ! ---
-
-  allocate(tmp_1(n_points_final_grid,4))
-
-  do ipoint = 1, n_points_final_grid
-
-    loc_1 = -2.d0 * Okappa   (ipoint) 
-    loc_2 = -2.d0 * Obarkappa(ipoint) 
-    loc_3 =         Obarkappa(ipoint) 
-
-    tmp_1(ipoint,1) = (loc_1 - loc_3) * Jbarkappa(ipoint,1) + loc_2 * Jkappa(ipoint,1)
-    tmp_1(ipoint,2) = (loc_1 - loc_3) * Jbarkappa(ipoint,2) + loc_2 * Jkappa(ipoint,2)
-    tmp_1(ipoint,3) = (loc_1 - loc_3) * Jbarkappa(ipoint,3) + loc_2 * Jkappa(ipoint,3)
-
-    tmp_1(ipoint,4) = Obarkappa(ipoint)
-  enddo
-
-
-  !$OMP PARALLEL                                                    &
-  !$OMP DEFAULT (NONE)                                              &
-  !$OMP PRIVATE (ipoint, i, j, loc_1, loc_2, tmp_omp_d2)            &
-  !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-  !$OMP         int2_grad1_u12_bimo_t, tmp_1)
-
-  allocate(tmp_omp_d2(n_points_final_grid,3))
-
-  tmp_omp_d2 = 0.d0
-  !$OMP DO COLLAPSE(2)
-  do i = 1, elec_beta_num
-    do j = elec_beta_num+1, elec_alpha_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
-        loc_2 = mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) 
-
-        tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,1,j,i) 
-        tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,2,j,i) 
-        tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,3,j,i) 
-      enddo
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-  !$OMP CRITICAL
-  do ipoint = 1, n_points_final_grid
-    tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) 
-    tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) 
-    tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) 
-  enddo
-  !$OMP END CRITICAL
-
-  tmp_omp_d2 = 0.d0
-  !$OMP DO COLLAPSE(2)
-  do i = elec_beta_num+1, elec_alpha_num
-    do j = elec_beta_num+1, elec_alpha_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
-
-        tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j)
-        tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j)
-        tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-  !$OMP CRITICAL
-  do ipoint = 1, n_points_final_grid
-    tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) 
-    tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) 
-    tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) 
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp_omp_d2)
-  !$OMP END PARALLEL
-
-  ! ---
-
-  allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, a, b)                                    &
-  !$OMP SHARED (n_points_final_grid, mo_num,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
-  !$OMP         tmp_2)
-  !$OMP DO COLLAPSE(2)
-  do a = 1, mo_num
-    do b = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
-        tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
-        tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                            &
-  !$OMP DEFAULT (NONE)                                                      &
-  !$OMP PRIVATE (ipoint, a, b, i)                                           &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
-  !$OMP         tmp_2)
-  !$OMP DO COLLAPSE(2)
-  do a = 1, mo_num
-    do b = 1, mo_num
-
-      tmp_2(:,4,b,a) = 0.d0
-      do i = 1, elec_beta_num
-        do ipoint = 1, n_points_final_grid
-          tmp_2(ipoint,4,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
-                                                                    + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
-                                                                    + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  ! ---
-
-  call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, 1.d0 &
-            , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) &
-            , tmp_1(1,1), 1                                   &
-            , 0.d0, fock_3e_uhf_mo_b_os(1,1), 1)
-
-  deallocate(tmp_1, tmp_2)
-
-  ! ---
-
-  allocate(tmp_3(n_points_final_grid,2,mo_num), tmp_4(n_points_final_grid,2,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, b, loc_1, loc_2)                         &
-  !$OMP SHARED (n_points_final_grid, mo_num,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    tmp_3(:,:,b) = 0.d0
-    tmp_4(:,:,b) = 0.d0
-    do ipoint = 1, n_points_final_grid
-
-      tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
-
-      loc_1 = -2.0d0 * mos_r_in_r_array_transp(ipoint,b)
-
-      tmp_4(ipoint,1,b) = loc_1 * ( Jbarkappa(ipoint,1) * (Jkappa(ipoint,1) + 0.25d0 * Jbarkappa(ipoint,1)) &
-                                  + Jbarkappa(ipoint,2) * (Jkappa(ipoint,2) + 0.25d0 * Jbarkappa(ipoint,2)) &
-                                  + Jbarkappa(ipoint,3) * (Jkappa(ipoint,3) + 0.25d0 * Jbarkappa(ipoint,3)) )
-
-      tmp_4(ipoint,2,b) = mos_r_in_r_array_transp(ipoint,b)
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                            &
-  !$OMP DEFAULT (NONE)                                                      &
-  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4)                  &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
-  !$OMP         Jkappa, Jbarkappa, tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-
-    do i = 1, elec_beta_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
-        loc_2 = mos_r_in_r_array_transp(ipoint,i)
-
-        tmp_3(ipoint,2,b) += loc_1 * ( Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
-                                     + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
-                                     + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
-                                                                                                       
-        tmp_4(ipoint,1,b) += loc_2 * ( Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
-                                     + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
-                                     + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                            &
-  !$OMP DEFAULT (NONE)                                                      &
-  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3)                      &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
-  !$OMP         tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    do i = 1, elec_beta_num
-      do j = elec_beta_num+1, elec_alpha_num
-        do ipoint = 1, n_points_final_grid
-
-          loc_2 = mos_r_in_r_array_transp(ipoint,b)
-
-          tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
-        enddo
-      enddo
-    enddo
-
-    do i = elec_beta_num+1, elec_alpha_num
-      do j = elec_beta_num+1, elec_alpha_num
-        do ipoint = 1, n_points_final_grid
-
-          loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b)
-
-          tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  ! ---
-
-  call dgemm( 'T', 'N', mo_num, mo_num, 2*n_points_final_grid, 1.d0 &
-            , tmp_3(1,1,1), 2*n_points_final_grid                   &
-            , tmp_4(1,1,1), 2*n_points_final_grid                   &
-            , 1.d0, fock_3e_uhf_mo_b_os(1,1), mo_num)
-
-  deallocate(tmp_3, tmp_4)
-
-
-
-
-  ! ---
-
-  fock_3e_uhf_mo_a_os = fock_3e_uhf_mo_b_os
-
-  allocate(tmp_1(n_points_final_grid,1))
-
-  do ipoint = 1, n_points_final_grid
-    tmp_1(ipoint,1) = Obarkappa(ipoint) + 2.d0 * Okappa(ipoint) 
-  enddo
-
-  allocate(tmp_2(n_points_final_grid,1,mo_num,mo_num))
-
-  !$OMP PARALLEL                                                            &
-  !$OMP DEFAULT (NONE)                                                      &
-  !$OMP PRIVATE (ipoint, a, b, i)                                           &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
-  !$OMP         tmp_2)
-  !$OMP DO COLLAPSE(2)
-  do a = 1, mo_num
-    do b = 1, mo_num
-
-      tmp_2(:,1,b,a) = 0.d0
-      do i = elec_beta_num+1, elec_alpha_num
-        do ipoint = 1, n_points_final_grid
-          tmp_2(ipoint,1,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
-                                                                    + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
-                                                                    + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  call dgemv( 'T', n_points_final_grid, mo_num*mo_num, 1.d0   &
-            , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) &
-            , tmp_1(1,1), 1                                   &
-            , 1.d0, fock_3e_uhf_mo_a_os(1,1), 1)
-
-  deallocate(tmp_1, tmp_2)
-
-  ! ---
-
-  allocate(tmp_3(n_points_final_grid,8,mo_num), tmp_4(n_points_final_grid,8,mo_num))
-
-  !$OMP PARALLEL                                                  &
-  !$OMP DEFAULT (NONE)                                            &
-  !$OMP PRIVATE (ipoint, b)                                       &
-  !$OMP SHARED (n_points_final_grid, mo_num,                      &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-  !$OMP         final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    tmp_3(:,:,b) = 0.d0
-    tmp_4(:,:,b) = 0.d0
-    do ipoint = 1, n_points_final_grid
-
-      tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
-
-      tmp_4(ipoint,8,b) = mos_r_in_r_array_transp(ipoint,b)
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                            &
-  !$OMP DEFAULT (NONE)                                                      &
-  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4)                  &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
-  !$OMP         Jkappa, Jbarkappa, tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-    do i = 1, elec_beta_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
-        loc_2 = mos_r_in_r_array_transp(ipoint,i)
-
-        tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
-        tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
-        tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
-
-        tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
-        tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
-        tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
-      enddo
-    enddo
-
-    do i = elec_beta_num+1, elec_alpha_num
-      do ipoint = 1, n_points_final_grid
-
-        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
-        loc_3 = 2.d0 * loc_1
-        loc_2 = mos_r_in_r_array_transp(ipoint,i)
-        loc_4 = 2.d0 * loc_2
-
-        tmp_3(ipoint,5,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
-        tmp_3(ipoint,6,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
-        tmp_3(ipoint,7,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
-
-        tmp_3(ipoint,8,b) += loc_3 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
-                                     + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
-                                     + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
-                                                                                                       
-        tmp_4(ipoint,1,b) += loc_4 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
-                                     + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
-                                     + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
-
-        tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
-        tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
-        tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
-
-        tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
-        tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
-        tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  !$OMP PARALLEL                                                            &
-  !$OMP DEFAULT (NONE)                                                      &
-  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3)                      &
-  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
-  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
-  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
-  !$OMP         tmp_3, tmp_4)
-  !$OMP DO
-  do b = 1, mo_num
-
-    do i = 1, elec_beta_num
-      do j = elec_beta_num+1, elec_alpha_num
-        do ipoint = 1, n_points_final_grid
-
-          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
-          loc_2 = mos_r_in_r_array_transp(ipoint,b)
-          loc_3 = mos_r_in_r_array_transp(ipoint,i)
-
-          tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
-
-          tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) )
-
-          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
-          loc_3 = mos_r_in_r_array_transp(ipoint,j)
-
-          tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,b,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,b,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
-
-          tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,j,i) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,j,i) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,j,i) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
-        enddo
-      enddo
-    enddo
-
-    do i = elec_beta_num+1, elec_alpha_num
-      do j = elec_beta_num+1, elec_alpha_num
-        do ipoint = 1, n_points_final_grid
-
-          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
-          loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b)
-          loc_3 = mos_r_in_r_array_transp(ipoint,i)
-
-          tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
-
-          tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) &
-                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) &
-                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) )
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  ! ---
-
-  call dgemm( 'T', 'N', mo_num, mo_num, 8*n_points_final_grid, 1.d0 &
-            , tmp_3(1,1,1), 8*n_points_final_grid                   &
-            , tmp_4(1,1,1), 8*n_points_final_grid                   &
-            , 1.d0, fock_3e_uhf_mo_a_os(1,1), mo_num)
-
-  deallocate(tmp_3, tmp_4)
-  deallocate(Jkappa, Okappa)
-
-  !call wall_time(tf)
-  !print *, ' Wall time for fock_3e_uhf_mo_a_os and fock_3e_uhf_mo_b_os =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f
deleted file mode 100644
index 47ee5b48..00000000
--- a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f
+++ /dev/null
@@ -1,77 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a, (mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! Fock matrix alpha from three-electron terms 
-  !
-  ! WARNING :: non hermitian if bi-ortho MOS used 
-  !
-  END_DOC
-
-  implicit none
-  double precision :: ti, tf
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' Providing fock_3e_uhf_mo_a ...'
-  !call wall_time(ti)
-
-  ! CLOSED-SHELL PART
-  PROVIDE fock_3e_uhf_mo_cs
-  fock_3e_uhf_mo_a = fock_3e_uhf_mo_cs
-
-  if(elec_alpha_num .ne. elec_beta_num) then
-
-    ! OPEN-SHELL PART
-    PROVIDE fock_3e_uhf_mo_a_os
-
-    fock_3e_uhf_mo_a += fock_3e_uhf_mo_a_os
-  endif
-
-  !call wall_time(tf)
-  !print *, ' Wall time for fock_3e_uhf_mo_a (min) =', (tf - ti)/60.d0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_b, (mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! Fock matrix beta from three-electron terms 
-  !
-  ! WARNING :: non hermitian if bi-ortho MOS used 
-  !
-  END_DOC
-
-  implicit none
-  double precision :: ti, tf
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' Providing and fock_3e_uhf_mo_b ...'
-  !call wall_time(ti)
-
-  ! CLOSED-SHELL PART
-  PROVIDE fock_3e_uhf_mo_cs
-  fock_3e_uhf_mo_b = fock_3e_uhf_mo_cs
-
-  if(elec_alpha_num .ne. elec_beta_num) then
-
-    ! OPEN-SHELL PART
-    PROVIDE fock_3e_uhf_mo_b_os
-
-    fock_3e_uhf_mo_b += fock_3e_uhf_mo_b_os
-  endif
-
-  !call wall_time(tf)
-  !print *, ' Wall time for fock_3e_uhf_mo_b =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f
deleted file mode 100644
index 3bf6bd85..00000000
--- a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f
+++ /dev/null
@@ -1,490 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs_old, (mo_num, mo_num)]
-
-  implicit none
-  integer                       :: a, b, i, j
-  double precision              :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia
-  double precision              :: ti, tf
-  double precision, allocatable :: tmp(:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-  call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij)
-
-  !print *, ' PROVIDING fock_3e_uhf_mo_cs_old ...'
-  !call wall_time(ti)
-
-  fock_3e_uhf_mo_cs_old = 0.d0
-
-  !$OMP PARALLEL DEFAULT (NONE)                                                                     &
-  !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) &
-  !$OMP SHARED  (mo_num, elec_beta_num, fock_3e_uhf_mo_cs_old)
-
-  allocate(tmp(mo_num,mo_num))
-  tmp = 0.d0
-
-  !$OMP DO
-  do a = 1, mo_num
-    do b = 1, mo_num
-   
-      do j = 1, elec_beta_num
-        do i = 1, elec_beta_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( 4.d0 * I_bij_aij &
-                              +        I_bij_ija &
-                              +        I_bij_jai &
-                              - 2.d0 * I_bij_aji &
-                              - 2.d0 * I_bij_iaj &
-                              - 2.d0 * I_bij_jia )
-
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do a = 1, mo_num
-    do b = 1, mo_num
-      fock_3e_uhf_mo_cs_old(b,a) += tmp(b,a)
-    enddo
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp)
-  !$OMP END PARALLEL
-
-  !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_mo_cs_old =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a_old, (mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! ALPHA part of the Fock matrix from three-electron terms 
-  !
-  ! WARNING :: non hermitian if bi-ortho MOS used 
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: a, b, i, j, o
-  double precision              :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia
-  double precision              :: ti, tf
-  double precision, allocatable :: tmp(:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-  PROVIDE fock_3e_uhf_mo_cs
-
-  !print *, ' Providing fock_3e_uhf_mo_a_old ...'
-  !call wall_time(ti)
-
-  o = elec_beta_num + 1
-  call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij)
-
-  PROVIDE fock_3e_uhf_mo_cs_old
-  fock_3e_uhf_mo_a_old = fock_3e_uhf_mo_cs_old
-
-  !$OMP PARALLEL DEFAULT (NONE)                                                                     &
-  !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) &
-  !$OMP SHARED  (mo_num, o, elec_alpha_num, elec_beta_num, fock_3e_uhf_mo_a_old)
-
-  allocate(tmp(mo_num,mo_num))
-  tmp = 0.d0
-
-  !$OMP DO
-  do a = 1, mo_num
-    do b = 1, mo_num
-
-      ! ---
-
-      do j = o, elec_alpha_num
-        do i = 1, elec_beta_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij &
-                              +        I_bij_ija &
-                              +        I_bij_jai &
-                              -        I_bij_aji &
-                              -        I_bij_iaj &
-                              - 2.d0 * I_bij_jia )
-
-        enddo
-      enddo
-
-      ! ---
-
-      do j = 1, elec_beta_num
-        do i = o, elec_alpha_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij &
-                              +        I_bij_ija &
-                              +        I_bij_jai &
-                              -        I_bij_aji &
-                              - 2.d0 * I_bij_iaj &
-                              -        I_bij_jia )
-
-        enddo
-      enddo
-
-      ! ---
-
-      do j = o, elec_alpha_num
-        do i = o, elec_alpha_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( I_bij_aij &
-                              + I_bij_ija &
-                              + I_bij_jai &
-                              - I_bij_aji &
-                              - I_bij_iaj &
-                              - I_bij_jia )
-
-        enddo
-      enddo
-
-      ! ---
-
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do a = 1, mo_num
-    do b = 1, mo_num
-      fock_3e_uhf_mo_a_old(b,a) += tmp(b,a)
-    enddo
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp)
-  !$OMP END PARALLEL
-
-  !call wall_time(tf)
-  !print *, ' Wall time for fock_3e_uhf_mo_a_old =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_b_old, (mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! BETA part of the Fock matrix from three-electron terms 
-  !
-  ! WARNING :: non hermitian if bi-ortho MOS used 
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: a, b, i, j, o
-  double precision              :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia
-  double precision              :: ti, tf
-  double precision, allocatable :: tmp(:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' PROVIDING fock_3e_uhf_mo_b_old ...'
-  !call wall_time(ti)
-
-  o = elec_beta_num + 1
-  call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij)
-
-  PROVIDE fock_3e_uhf_mo_cs_old
-  fock_3e_uhf_mo_b_old = fock_3e_uhf_mo_cs_old
-
-  !$OMP PARALLEL DEFAULT (NONE)                                                                     &
-  !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) &
-  !$OMP SHARED  (mo_num, o, elec_alpha_num, elec_beta_num, fock_3e_uhf_mo_b_old)
-
-  allocate(tmp(mo_num,mo_num))
-  tmp = 0.d0
-
-  !$OMP DO
-  do a = 1, mo_num
-    do b = 1, mo_num
-
-      ! ---
-
-      do j = o, elec_alpha_num
-        do i = 1, elec_beta_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij &
-                              -        I_bij_aji &
-                              -        I_bij_iaj )
-
-        enddo
-      enddo
-
-      ! ---
-
-      do j = 1, elec_beta_num
-        do i = o, elec_alpha_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij &
-                              -        I_bij_aji &
-                              -        I_bij_jia )
-
-        enddo
-      enddo
-
-      ! ---
-
-      do j = o, elec_alpha_num
-        do i = o, elec_alpha_num
-
-          call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij)
-          call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija)
-          call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai)
-          call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji)
-          call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj)
-          call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia)
-
-          tmp(b,a) -= 0.5d0 * ( I_bij_aij &
-                              - I_bij_aji )
-
-        enddo
-      enddo
-
-      ! ---
-
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do a = 1, mo_num
-    do b = 1, mo_num
-      fock_3e_uhf_mo_b_old(b,a) += tmp(b,a)
-    enddo
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(tmp)
-  !$OMP END PARALLEL
-
-  !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_mo_b_old =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_a, (ao_num, ao_num)]
-
-  BEGIN_DOC
-  !
-  ! Equations (B6) and (B7)
-  !
-  ! g <--> gamma
-  ! d <--> delta
-  ! e <--> eta
-  ! k <--> kappa
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: g, d, e, k, mu, nu
-  double precision              :: dm_ge_a, dm_ge_b, dm_ge
-  double precision              :: dm_dk_a, dm_dk_b, dm_dk
-  double precision              :: i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu
-  double precision              :: ti, tf
-  double precision, allocatable :: f_tmp(:,:)
-
-  !print *, ' PROVIDING fock_3e_uhf_ao_a ...'
-  !call wall_time(ti)
-
-  fock_3e_uhf_ao_a = 0.d0
-
-  !$OMP PARALLEL DEFAULT (NONE)                                                                &
-  !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk, f_tmp,  &
-  !$OMP          i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) &
-  !$OMP SHARED  (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_a)
-
-  allocate(f_tmp(ao_num,ao_num))
-  f_tmp = 0.d0
-
-  !$OMP DO
-  do g = 1, ao_num
-    do e = 1, ao_num
-      dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e)
-      dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e)
-      dm_ge   = dm_ge_a + dm_ge_b
-      do d = 1, ao_num
-        do k = 1, ao_num
-          dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k)
-          dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k)
-          dm_dk   = dm_dk_a + dm_dk_b
-          do mu = 1, ao_num
-            do nu = 1, ao_num
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu)
-              f_tmp(mu,nu) -= 0.5d0 * ( dm_ge   * dm_dk   * i_mugd_nuek &
-                                      + dm_ge_a * dm_dk_a * i_mugd_eknu &
-                                      + dm_ge_a * dm_dk_a * i_mugd_knue &
-                                      - dm_ge_a * dm_dk   * i_mugd_enuk &
-                                      - dm_ge   * dm_dk_a * i_mugd_kenu &
-                                      - dm_ge_a * dm_dk_a * i_mugd_nuke &
-                                      - dm_ge_b * dm_dk_b * i_mugd_nuke )
-            enddo
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do mu = 1, ao_num
-    do nu = 1, ao_num
-      fock_3e_uhf_ao_a(mu,nu) += f_tmp(mu,nu)
-    enddo
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(f_tmp)
-  !$OMP END PARALLEL
-
-  !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_ao_a =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_b, (ao_num, ao_num)]
-
-  BEGIN_DOC
-  !
-  ! Equations (B6) and (B7)
-  !
-  ! g <--> gamma
-  ! d <--> delta
-  ! e <--> eta
-  ! k <--> kappa
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: g, d, e, k, mu, nu
-  double precision              :: dm_ge_a, dm_ge_b, dm_ge
-  double precision              :: dm_dk_a, dm_dk_b, dm_dk
-  double precision              :: i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu
-  double precision              :: ti, tf
-  double precision, allocatable :: f_tmp(:,:)
-
-  !print *, ' PROVIDING fock_3e_uhf_ao_b ...'
-  !call wall_time(ti)
-
-  fock_3e_uhf_ao_b = 0.d0
-
-  !$OMP PARALLEL DEFAULT (NONE)                                                                &
-  !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk, f_tmp,  &
-  !$OMP          i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) &
-  !$OMP SHARED  (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_b)
-
-  allocate(f_tmp(ao_num,ao_num))
-  f_tmp = 0.d0
-
-  !$OMP DO
-  do g = 1, ao_num
-    do e = 1, ao_num
-      dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e)
-      dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e)
-      dm_ge   = dm_ge_a + dm_ge_b
-      do d = 1, ao_num
-        do k = 1, ao_num
-          dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k)
-          dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k)
-          dm_dk   = dm_dk_a + dm_dk_b
-          do mu = 1, ao_num
-            do nu = 1, ao_num
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk)
-              call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu)
-              f_tmp(mu,nu) -= 0.5d0 * ( dm_ge   * dm_dk   * i_mugd_nuek &
-                                      + dm_ge_b * dm_dk_b * i_mugd_eknu &
-                                      + dm_ge_b * dm_dk_b * i_mugd_knue &
-                                      - dm_ge_b * dm_dk   * i_mugd_enuk &
-                                      - dm_ge   * dm_dk_b * i_mugd_kenu &
-                                      - dm_ge_b * dm_dk_b * i_mugd_nuke &
-                                      - dm_ge_a * dm_dk_a * i_mugd_nuke )
-            enddo
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-
-  !$OMP CRITICAL
-  do mu = 1, ao_num
-    do nu = 1, ao_num
-      fock_3e_uhf_ao_b(mu,nu) += f_tmp(mu,nu)
-    enddo
-  enddo
-  !$OMP END CRITICAL
-
-  deallocate(f_tmp)
-  !$OMP END PARALLEL
-
-  !call wall_time(tf)
-  !print *, ' total Wall time for fock_3e_uhf_ao_b =', tf - ti
-
-END_PROVIDER 
-
-! ---
-
diff --git a/plugins/local/tc_scf/fock_tc.irp.f b/plugins/local/tc_scf/fock_tc.irp.f
index d3ddb8ad..508f3cd7 100644
--- a/plugins/local/tc_scf/fock_tc.irp.f
+++ b/plugins/local/tc_scf/fock_tc.irp.f
@@ -1,78 +1,15 @@
 
+
 ! ---
 
- BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_alpha, (ao_num, ao_num)]
-&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_beta , (ao_num, ao_num)]
+ BEGIN_PROVIDER [ double precision, two_e_tc_integral_alpha, (ao_num, ao_num)]
+&BEGIN_PROVIDER [ double precision, two_e_tc_integral_beta , (ao_num, ao_num)]
 
   BEGIN_DOC
   !
-  ! two_e_tc_non_hermit_integral_seq_alpha(k,i) = <k| F^tc_alpha |i> ON THE AO BASIS 
+  ! two_e_tc_integral_alpha(k,i) = <k| F^tc_2e_alpha |i> ON THE AO BASIS 
   !
-  ! where F^tc is the TWO-BODY part of the TC Fock matrix and k,i are AO basis functions
-  !
-  ! works in SEQUENTIAL 
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, l
-  double precision :: density, density_a, density_b
-  double precision :: t0, t1
-
-  PROVIDE ao_two_e_tc_tot
-
-  !print*, ' providing two_e_tc_non_hermit_integral_seq ...'
-  !call wall_time(t0)
-
-  two_e_tc_non_hermit_integral_seq_alpha = 0.d0
-  two_e_tc_non_hermit_integral_seq_beta  = 0.d0
-
-  do i = 1, ao_num
-    do k = 1, ao_num
-      do j = 1, ao_num
-        do l = 1, ao_num
-
-          density_a = TCSCF_density_matrix_ao_alpha(l,j)
-          density_b = TCSCF_density_matrix_ao_beta (l,j)
-          density   = density_a + density_b
-
-          !!                                         rho(l,j)   *      < k l| T | i j>
-          !two_e_tc_non_hermit_integral_seq_alpha(k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
-          !!                                         rho(l,j)   *      < k l| T | i j>
-          !two_e_tc_non_hermit_integral_seq_beta (k,i) += density   * ao_two_e_tc_tot(l,j,k,i)
-          !!                                         rho_a(l,j) *      < l k| T | i j>
-          !two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
-          !!                                         rho_b(l,j) *      < l k| T | i j>
-          !two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
-
-          !                                         rho(l,j)   *      < k l| T | i j>
-          two_e_tc_non_hermit_integral_seq_alpha(k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
-          !                                         rho(l,j)   *      < k l| T | i j>
-          two_e_tc_non_hermit_integral_seq_beta (k,i) += density   * ao_two_e_tc_tot(k,i,l,j)
-          !                                         rho_a(l,j) *      < k l| T | j i>
-          two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i)
-          !                                         rho_b(l,j) *      < k l| T | j i>
-          two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i)
-
-        enddo
-      enddo
-    enddo
-  enddo
-
-  !call wall_time(t1)
-  !print*, ' wall time for two_e_tc_non_hermit_integral_seq after = ', t1 - t0
-
-END_PROVIDER 
-
-! ---
-
- BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_alpha, (ao_num, ao_num)]
-&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_beta , (ao_num, ao_num)]
-
-  BEGIN_DOC
-  !
-  ! two_e_tc_non_hermit_integral_alpha(k,i) = <k| F^tc_alpha |i> ON THE AO BASIS 
-  !
-  ! where F^tc is the TWO-BODY part of the TC Fock matrix and k,i are AO basis functions
+  ! where F^tc_2e is the TWO-BODY part of the TC Fock matrix and k,i are AO basis functions
   !
   END_DOC
 
@@ -86,16 +23,13 @@ END_PROVIDER
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE TCSCF_density_matrix_ao_alpha TCSCF_density_matrix_ao_beta
 
-  !print*, ' Providing two_e_tc_non_hermit_integral ...'
-  !call wall_time(t0)
-
-  two_e_tc_non_hermit_integral_alpha = 0.d0
-  two_e_tc_non_hermit_integral_beta  = 0.d0
+  two_e_tc_integral_alpha = 0.d0
+  two_e_tc_integral_beta  = 0.d0
 
   !$OMP PARALLEL DEFAULT (NONE)                                                                        &
   !$OMP PRIVATE (i, j, k, l, density_a, density_b, density, tmp_a, tmp_b, I_coul, I_kjli)              &
   !$OMP SHARED  (ao_num, TCSCF_density_matrix_ao_alpha, TCSCF_density_matrix_ao_beta, ao_two_e_tc_tot, &
-  !$OMP         two_e_tc_non_hermit_integral_alpha, two_e_tc_non_hermit_integral_beta)
+  !$OMP         two_e_tc_integral_alpha, two_e_tc_integral_beta)
 
   allocate(tmp_a(ao_num,ao_num), tmp_b(ao_num,ao_num))
   tmp_a = 0.d0
@@ -124,8 +58,8 @@ END_PROVIDER
   !$OMP CRITICAL
   do i = 1, ao_num
     do j = 1, ao_num
-      two_e_tc_non_hermit_integral_alpha(j,i) += tmp_a(j,i)
-      two_e_tc_non_hermit_integral_beta (j,i) += tmp_b(j,i)
+      two_e_tc_integral_alpha(j,i) += tmp_a(j,i)
+      two_e_tc_integral_beta (j,i) += tmp_b(j,i)
     enddo
   enddo
   !$OMP END CRITICAL
@@ -133,9 +67,6 @@ END_PROVIDER
   deallocate(tmp_a, tmp_b)
   !$OMP END PARALLEL
 
-  !call wall_time(t1)
-  !print*, ' Wall time for two_e_tc_non_hermit_integral = ', t1 - t0
-
 END_PROVIDER 
 
 ! ---
@@ -149,13 +80,7 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_alpha, (ao_num, ao_num)]
   implicit none
   double precision :: t0, t1
 
-  !print*, ' Providing Fock_matrix_tc_ao_alpha ...'
-  !call wall_time(t0)
-
-  Fock_matrix_tc_ao_alpha = ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_alpha
-
-  !call wall_time(t1)
-  !print*, ' Wall time for Fock_matrix_tc_ao_alpha =', t1-t0
+  Fock_matrix_tc_ao_alpha = ao_one_e_integrals_tc_tot + two_e_tc_integral_alpha
 
 END_PROVIDER 
 
@@ -169,7 +94,7 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_beta, (ao_num, ao_num)]
 
   implicit none
 
-  Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_beta 
+  Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot + two_e_tc_integral_beta 
 
 END_PROVIDER 
 
@@ -185,9 +110,6 @@ BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)]
   double precision              :: t0, t1, tt0, tt1
   double precision, allocatable :: tmp(:,:)
 
-  !print*, ' Providing Fock_matrix_tc_mo_alpha ...'
-  !call wall_time(t0)
-
   if(bi_ortho) then
 
     PROVIDE mo_l_coef mo_r_coef
@@ -196,8 +118,8 @@ BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)]
                           , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
 
     if(three_body_h_tc) then
-      PROVIDE fock_3e_uhf_mo_a
-      Fock_matrix_tc_mo_alpha += fock_3e_uhf_mo_a
+      PROVIDE fock_3e_mo_a
+      Fock_matrix_tc_mo_alpha += fock_3e_mo_a
     endif
 
   else
@@ -207,9 +129,6 @@ BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)]
 
   endif
 
-  !call wall_time(t1)
-  !print*, ' Wall time for Fock_matrix_tc_mo_alpha =', t1-t0
-
 END_PROVIDER
 
 ! ---
@@ -229,8 +148,8 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_beta, (mo_num,mo_num) ]
                           , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
 
     if(three_body_h_tc) then
-      PROVIDE fock_3e_uhf_mo_b
-      Fock_matrix_tc_mo_beta += fock_3e_uhf_mo_b
+      PROVIDE fock_3e_mo_b
+      Fock_matrix_tc_mo_beta += fock_3e_mo_b
     endif
 
   else
@@ -286,20 +205,895 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_tot, (ao_num, ao_num) ]
   implicit none
   double precision :: t0, t1
 
-  !print*, ' Providing Fock_matrix_tc_ao_tot ...'
-  !call wall_time(t0)
-
   PROVIDE mo_l_coef mo_r_coef
   PROVIDE Fock_matrix_tc_mo_tot
 
   call mo_to_ao_bi_ortho( Fock_matrix_tc_mo_tot, size(Fock_matrix_tc_mo_tot, 1) &
                         , Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1) )
 
-  !call wall_time(t1)
-  !print*, ' Wall time for Fock_matrix_tc_ao_tot =', t1-t0
-
 END_PROVIDER
 
 ! ---
 
 
+
+! ---
+
+BEGIN_PROVIDER [double precision, fock_3e_mo_a, (mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! Fock matrix alpha from three-electron terms 
+  !
+  ! WARNING :: non hermitian if bi-ortho MOS used 
+  !
+  END_DOC
+
+  implicit none
+  double precision :: ti, tf
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  ! CLOSED-SHELL PART
+  PROVIDE fock_3e_mo_cs
+  fock_3e_mo_a = fock_3e_mo_cs
+
+  if(elec_alpha_num .ne. elec_beta_num) then
+
+    ! OPEN-SHELL PART
+    PROVIDE fock_3e_mo_a_os
+
+    fock_3e_mo_a += fock_3e_mo_a_os
+  endif
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, fock_3e_mo_b, (mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! Fock matrix beta from three-electron terms 
+  !
+  ! WARNING :: non hermitian if bi-ortho MOS used 
+  !
+  END_DOC
+
+  implicit none
+  double precision :: ti, tf
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  ! CLOSED-SHELL PART
+  PROVIDE fock_3e_mo_cs
+  fock_3e_mo_b = fock_3e_mo_cs
+
+  if(elec_alpha_num .ne. elec_beta_num) then
+
+    ! OPEN-SHELL PART
+    PROVIDE fock_3e_mo_b_os
+
+    fock_3e_mo_b += fock_3e_mo_b_os
+  endif
+
+END_PROVIDER 
+
+! ---
+
+
+! ---
+
+ BEGIN_PROVIDER [double precision, fock_3e_mo_a_os, (mo_num, mo_num)]
+&BEGIN_PROVIDER [double precision, fock_3e_mo_b_os, (mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! Open Shell part of the Fock matrix from three-electron terms 
+  !
+  ! WARNING :: non hermitian if bi-ortho MOS used 
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: a, b, i, j, ipoint
+  double precision              :: loc_1, loc_2, loc_3, loc_4
+  double precision              :: ti, tf
+  double precision, allocatable :: Okappa(:), Jkappa(:,:), Obarkappa(:), Jbarkappa(:,:)
+  double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:)
+  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:)
+  double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:)
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  ! ---
+
+  allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid))
+  allocate(Jbarkappa(n_points_final_grid,3), Obarkappa(n_points_final_grid))
+  Jkappa    = 0.d0
+  Okappa    = 0.d0
+  Jbarkappa = 0.d0
+  Obarkappa = 0.d0
+
+  !$OMP PARALLEL                                                    &
+  !$OMP DEFAULT (NONE)                                              &
+  !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2)                 &
+  !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
+  !$OMP         int2_grad1_u12_bimo_t, Okappa, Jkappa, Obarkappa, Jbarkappa)
+
+  allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid))
+
+  tmp_omp_d2 = 0.d0
+  tmp_omp_d1 = 0.d0
+  !$OMP DO
+  do i = 1, elec_beta_num
+    do ipoint = 1, n_points_final_grid
+      tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
+      tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
+      tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
+      tmp_omp_d1(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) 
+    Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) 
+    Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) 
+    Okappa(ipoint)   += tmp_omp_d1(ipoint)   
+  enddo
+  !$OMP END CRITICAL
+
+  tmp_omp_d2 = 0.d0
+  tmp_omp_d1 = 0.d0
+  !$OMP DO
+  do i = elec_beta_num+1, elec_alpha_num
+    do ipoint = 1, n_points_final_grid
+      tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
+      tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
+      tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
+      tmp_omp_d1(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    Jbarkappa(ipoint,1) += tmp_omp_d2(ipoint,1) 
+    Jbarkappa(ipoint,2) += tmp_omp_d2(ipoint,2) 
+    Jbarkappa(ipoint,3) += tmp_omp_d2(ipoint,3) 
+    Obarkappa(ipoint)   += tmp_omp_d1(ipoint)   
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmp_omp_d2, tmp_omp_d1)
+  !$OMP END PARALLEL
+
+  ! ---
+
+  allocate(tmp_1(n_points_final_grid,4))
+
+  do ipoint = 1, n_points_final_grid
+
+    loc_1 = -2.d0 * Okappa   (ipoint) 
+    loc_2 = -2.d0 * Obarkappa(ipoint) 
+    loc_3 =         Obarkappa(ipoint) 
+
+    tmp_1(ipoint,1) = (loc_1 - loc_3) * Jbarkappa(ipoint,1) + loc_2 * Jkappa(ipoint,1)
+    tmp_1(ipoint,2) = (loc_1 - loc_3) * Jbarkappa(ipoint,2) + loc_2 * Jkappa(ipoint,2)
+    tmp_1(ipoint,3) = (loc_1 - loc_3) * Jbarkappa(ipoint,3) + loc_2 * Jkappa(ipoint,3)
+
+    tmp_1(ipoint,4) = Obarkappa(ipoint)
+  enddo
+
+  !$OMP PARALLEL                                                    &
+  !$OMP DEFAULT (NONE)                                              &
+  !$OMP PRIVATE (ipoint, i, j, loc_1, loc_2, tmp_omp_d2)            &
+  !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
+  !$OMP         int2_grad1_u12_bimo_t, tmp_1)
+
+  allocate(tmp_omp_d2(n_points_final_grid,3))
+
+  tmp_omp_d2 = 0.d0
+  !$OMP DO COLLAPSE(2)
+  do i = 1, elec_beta_num
+    do j = elec_beta_num+1, elec_alpha_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
+        loc_2 = mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) 
+
+        tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,1,j,i) 
+        tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,2,j,i) 
+        tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,3,j,i) 
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) 
+    tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) 
+    tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) 
+  enddo
+  !$OMP END CRITICAL
+
+  tmp_omp_d2 = 0.d0
+  !$OMP DO COLLAPSE(2)
+  do i = elec_beta_num+1, elec_alpha_num
+    do j = elec_beta_num+1, elec_alpha_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
+
+        tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j)
+        tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j)
+        tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) 
+    tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) 
+    tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) 
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmp_omp_d2)
+  !$OMP END PARALLEL
+
+  ! ---
+
+  allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, a, b)                                    &
+  !$OMP SHARED (n_points_final_grid, mo_num,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+  !$OMP         tmp_2)
+  !$OMP DO COLLAPSE(2)
+  do a = 1, mo_num
+    do b = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
+        tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
+        tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                            &
+  !$OMP DEFAULT (NONE)                                                      &
+  !$OMP PRIVATE (ipoint, a, b, i)                                           &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
+  !$OMP         tmp_2)
+  !$OMP DO COLLAPSE(2)
+  do a = 1, mo_num
+    do b = 1, mo_num
+
+      tmp_2(:,4,b,a) = 0.d0
+      do i = 1, elec_beta_num
+        do ipoint = 1, n_points_final_grid
+          tmp_2(ipoint,4,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  ! ---
+
+  call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, 1.d0 &
+            , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) &
+            , tmp_1(1,1), 1                                   &
+            , 0.d0, fock_3e_mo_b_os(1,1), 1)
+
+  deallocate(tmp_1, tmp_2)
+
+  ! ---
+
+  allocate(tmp_3(n_points_final_grid,2,mo_num), tmp_4(n_points_final_grid,2,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, loc_1, loc_2)                         &
+  !$OMP SHARED (n_points_final_grid, mo_num,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    tmp_3(:,:,b) = 0.d0
+    tmp_4(:,:,b) = 0.d0
+    do ipoint = 1, n_points_final_grid
+
+      tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
+
+      loc_1 = -2.0d0 * mos_r_in_r_array_transp(ipoint,b)
+
+      tmp_4(ipoint,1,b) = loc_1 * ( Jbarkappa(ipoint,1) * (Jkappa(ipoint,1) + 0.25d0 * Jbarkappa(ipoint,1)) &
+                                  + Jbarkappa(ipoint,2) * (Jkappa(ipoint,2) + 0.25d0 * Jbarkappa(ipoint,2)) &
+                                  + Jbarkappa(ipoint,3) * (Jkappa(ipoint,3) + 0.25d0 * Jbarkappa(ipoint,3)) )
+
+      tmp_4(ipoint,2,b) = mos_r_in_r_array_transp(ipoint,b)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                            &
+  !$OMP DEFAULT (NONE)                                                      &
+  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4)                  &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
+  !$OMP         Jkappa, Jbarkappa, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+
+    do i = 1, elec_beta_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
+        loc_2 = mos_r_in_r_array_transp(ipoint,i)
+
+        tmp_3(ipoint,2,b) += loc_1 * ( Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
+                                     + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
+                                     + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
+                                                                                                       
+        tmp_4(ipoint,1,b) += loc_2 * ( Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
+                                     + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
+                                     + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                            &
+  !$OMP DEFAULT (NONE)                                                      &
+  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3)                      &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
+  !$OMP         tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    do i = 1, elec_beta_num
+      do j = elec_beta_num+1, elec_alpha_num
+        do ipoint = 1, n_points_final_grid
+
+          loc_2 = mos_r_in_r_array_transp(ipoint,b)
+
+          tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+        enddo
+      enddo
+    enddo
+
+    do i = elec_beta_num+1, elec_alpha_num
+      do j = elec_beta_num+1, elec_alpha_num
+        do ipoint = 1, n_points_final_grid
+
+          loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b)
+
+          tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  ! ---
+
+  call dgemm( 'T', 'N', mo_num, mo_num, 2*n_points_final_grid, 1.d0 &
+            , tmp_3(1,1,1), 2*n_points_final_grid                   &
+            , tmp_4(1,1,1), 2*n_points_final_grid                   &
+            , 1.d0, fock_3e_mo_b_os(1,1), mo_num)
+
+  deallocate(tmp_3, tmp_4)
+
+  ! ---
+
+  fock_3e_mo_a_os = fock_3e_mo_b_os
+
+  allocate(tmp_1(n_points_final_grid,1))
+
+  do ipoint = 1, n_points_final_grid
+    tmp_1(ipoint,1) = Obarkappa(ipoint) + 2.d0 * Okappa(ipoint) 
+  enddo
+
+  allocate(tmp_2(n_points_final_grid,1,mo_num,mo_num))
+
+  !$OMP PARALLEL                                                            &
+  !$OMP DEFAULT (NONE)                                                      &
+  !$OMP PRIVATE (ipoint, a, b, i)                                           &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
+  !$OMP         tmp_2)
+  !$OMP DO COLLAPSE(2)
+  do a = 1, mo_num
+    do b = 1, mo_num
+
+      tmp_2(:,1,b,a) = 0.d0
+      do i = elec_beta_num+1, elec_alpha_num
+        do ipoint = 1, n_points_final_grid
+          tmp_2(ipoint,1,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                    + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call dgemv( 'T', n_points_final_grid, mo_num*mo_num, 1.d0   &
+            , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) &
+            , tmp_1(1,1), 1                                   &
+            , 1.d0, fock_3e_mo_a_os(1,1), 1)
+
+  deallocate(tmp_1, tmp_2)
+
+  ! ---
+
+  allocate(tmp_3(n_points_final_grid,8,mo_num), tmp_4(n_points_final_grid,8,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b)                                       &
+  !$OMP SHARED (n_points_final_grid, mo_num,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    tmp_3(:,:,b) = 0.d0
+    tmp_4(:,:,b) = 0.d0
+    do ipoint = 1, n_points_final_grid
+
+      tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
+
+      tmp_4(ipoint,8,b) = mos_r_in_r_array_transp(ipoint,b)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                            &
+  !$OMP DEFAULT (NONE)                                                      &
+  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4)                  &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
+  !$OMP         Jkappa, Jbarkappa, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    do i = 1, elec_beta_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
+        loc_2 = mos_r_in_r_array_transp(ipoint,i)
+
+        tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
+        tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
+        tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
+
+        tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
+        tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
+        tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
+      enddo
+    enddo
+
+    do i = elec_beta_num+1, elec_alpha_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
+        loc_3 = 2.d0 * loc_1
+        loc_2 = mos_r_in_r_array_transp(ipoint,i)
+        loc_4 = 2.d0 * loc_2
+
+        tmp_3(ipoint,5,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
+        tmp_3(ipoint,6,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
+        tmp_3(ipoint,7,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
+
+        tmp_3(ipoint,8,b) += loc_3 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
+                                     + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
+                                     + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
+                                                                                                       
+        tmp_4(ipoint,1,b) += loc_4 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
+                                     + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
+                                     + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
+
+        tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
+        tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
+        tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
+
+        tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
+        tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
+        tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                            &
+  !$OMP DEFAULT (NONE)                                                      &
+  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3)                      &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,            &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp,           &
+  !$OMP         tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+
+    do i = 1, elec_beta_num
+      do j = elec_beta_num+1, elec_alpha_num
+        do ipoint = 1, n_points_final_grid
+
+          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
+          loc_2 = mos_r_in_r_array_transp(ipoint,b)
+          loc_3 = mos_r_in_r_array_transp(ipoint,i)
+
+          tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
+
+          tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) )
+
+          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
+          loc_3 = mos_r_in_r_array_transp(ipoint,j)
+
+          tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,b,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,b,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) )
+
+          tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,j,i) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,j,i) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,j,i) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
+        enddo
+      enddo
+    enddo
+
+    do i = elec_beta_num+1, elec_alpha_num
+      do j = elec_beta_num+1, elec_alpha_num
+        do ipoint = 1, n_points_final_grid
+
+          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
+          loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b)
+          loc_3 = mos_r_in_r_array_transp(ipoint,i)
+
+          tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
+
+          tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  ! ---
+
+  call dgemm( 'T', 'N', mo_num, mo_num, 8*n_points_final_grid, 1.d0 &
+            , tmp_3(1,1,1), 8*n_points_final_grid                   &
+            , tmp_4(1,1,1), 8*n_points_final_grid                   &
+            , 1.d0, fock_3e_mo_a_os(1,1), mo_num)
+
+  deallocate(tmp_3, tmp_4)
+  deallocate(Jkappa, Okappa)
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, fock_3e_mo_cs, (mo_num, mo_num)]
+
+  implicit none
+  integer                       :: a, b, i, j, ipoint
+  double precision              :: ti, tf
+  double precision              :: loc_1, loc_2, loc_3
+  double precision, allocatable :: Okappa(:), Jkappa(:,:)
+  double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:)
+  double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:), tmp_22(:,:,:)
+  double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:)
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  ! ---
+
+  allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid))
+  Jkappa = 0.d0
+  Okappa = 0.d0
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2)               &
+  !$OMP SHARED (n_points_final_grid, elec_beta_num,               &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, Okappa, Jkappa)
+
+  allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid))
+  tmp_omp_d2 = 0.d0
+  tmp_omp_d1 = 0.d0
+
+  !$OMP DO
+  do i = 1, elec_beta_num
+    do ipoint = 1, n_points_final_grid
+      tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i)
+      tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i)
+      tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i)
+      tmp_omp_d1(ipoint)   += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) 
+    Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) 
+    Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) 
+    Okappa(ipoint)   += tmp_omp_d1(ipoint)   
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmp_omp_d2, tmp_omp_d1)
+
+  !$OMP END PARALLEL
+
+  ! ---
+
+  allocate(tmp_1(n_points_final_grid,4))
+
+  do ipoint = 1, n_points_final_grid
+    loc_1 = 2.d0 * Okappa(ipoint) 
+    tmp_1(ipoint,1) = loc_1 * Jkappa(ipoint,1)
+    tmp_1(ipoint,2) = loc_1 * Jkappa(ipoint,2)
+    tmp_1(ipoint,3) = loc_1 * Jkappa(ipoint,3)
+    tmp_1(ipoint,4) = Okappa(ipoint)
+  enddo
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, i, j, loc_1, tmp_omp_d2)                 &
+  !$OMP SHARED (n_points_final_grid, elec_beta_num,               &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         int2_grad1_u12_bimo_t, tmp_1)
+
+  allocate(tmp_omp_d2(n_points_final_grid,3))
+  tmp_omp_d2 = 0.d0
+
+  !$OMP DO COLLAPSE(2)
+  do i = 1, elec_beta_num
+    do j = 1, elec_beta_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) 
+
+        tmp_omp_d2(ipoint,1) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) 
+        tmp_omp_d2(ipoint,2) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) 
+        tmp_omp_d2(ipoint,3) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) 
+      enddo
+    enddo
+  enddo
+  !$OMP END DO NOWAIT
+
+  !$OMP CRITICAL
+  do ipoint = 1, n_points_final_grid
+    tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) 
+    tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) 
+    tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) 
+  enddo
+  !$OMP END CRITICAL
+
+  deallocate(tmp_omp_d2)
+  !$OMP END PARALLEL
+
+  ! ---
+
+  if(tc_save_mem) then
+
+    allocate(tmp_22(n_points_final_grid,4,mo_num))
+    do a = 1, mo_num
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT (NONE)                                            &
+      !$OMP PRIVATE (ipoint, b, i)                                    &
+      !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, a,    &
+      !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+      !$OMP         tmp_22)
+      !$OMP DO
+      do b = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp_22(ipoint,1,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
+          tmp_22(ipoint,2,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
+          tmp_22(ipoint,3,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
+        enddo
+        tmp_22(:,4,b) = 0.d0
+        do i = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+            tmp_22(ipoint,4,b) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                     + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                     + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      call dgemv( 'T', 4*n_points_final_grid, mo_num, -2.d0        &
+                , tmp_22(1,1,1), size(tmp_22, 1) * size(tmp_22, 2) &
+                , tmp_1(1,1), 1                                    &
+                , 0.d0, fock_3e_mo_cs(1,a), 1)
+    enddo
+    deallocate(tmp_22)
+
+  else
+
+    allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num))
+    !$OMP PARALLEL                                                  &
+    !$OMP DEFAULT (NONE)                                            &
+    !$OMP PRIVATE (ipoint, a, b, i)                                 &
+    !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
+    !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+    !$OMP         int2_grad1_u12_bimo_t, final_weight_at_r_vector,  &
+    !$OMP         tmp_2)
+    !$OMP DO COLLAPSE(2)
+    do a = 1, mo_num
+      do b = 1, mo_num
+        do ipoint = 1, n_points_final_grid
+          tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a)
+          tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a)
+          tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a)
+        enddo
+        tmp_2(:,4,b,a) = 0.d0
+        do i = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+            tmp_2(ipoint,4,b,a) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) &
+                                                                      + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) &
+                                                                      + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) )
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, -2.d0 &
+              , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2)  &
+              , tmp_1(1,1), 1                                    &
+              , 0.d0, fock_3e_mo_cs(1,1), 1)
+    deallocate(tmp_2)
+
+  endif
+
+  deallocate(tmp_1)
+
+  ! ---
+
+  allocate(tmp_3(n_points_final_grid,5,mo_num), tmp_4(n_points_final_grid,5,mo_num))
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, loc_1, loc_2)                         &
+  !$OMP SHARED (n_points_final_grid, mo_num,                      &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         final_weight_at_r_vector, Jkappa, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    tmp_3(:,:,b) = 0.d0
+    tmp_4(:,:,b) = 0.d0
+    do ipoint = 1, n_points_final_grid
+      tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b)
+
+      tmp_4(ipoint,1,b) = -2.d0 * mos_r_in_r_array_transp(ipoint,b) * ( Jkappa(ipoint,1) * Jkappa(ipoint,1) &
+                                                                      + Jkappa(ipoint,2) * Jkappa(ipoint,2) &
+                                                                      + Jkappa(ipoint,3) * Jkappa(ipoint,3) )
+      tmp_4(ipoint,5,b) = mos_r_in_r_array_transp(ipoint,b)
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2)                      &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,  &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         Jkappa, tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    do i = 1, elec_beta_num
+      do ipoint = 1, n_points_final_grid
+
+        loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i)
+        loc_2 = mos_r_in_r_array_transp(ipoint,i)
+
+        tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i)
+        tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i)
+        tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i)
+        tmp_3(ipoint,5,b) += 2.d0 * loc_1 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) &
+                                            + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) &
+                                            + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) )
+                                                                                                       
+        tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b)
+        tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b)
+        tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b)
+        tmp_4(ipoint,1,b) += 2.d0 * loc_2 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) &
+                                            + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) &
+                                            + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) )
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !$OMP PARALLEL                                                  &
+  !$OMP DEFAULT (NONE)                                            &
+  !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3)            &
+  !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num,       &
+  !$OMP         final_weight_at_r_vector, int2_grad1_u12_bimo_t,  &
+  !$OMP         mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+  !$OMP         tmp_3, tmp_4)
+  !$OMP DO
+  do b = 1, mo_num
+    do i = 1, elec_beta_num
+      do j = 1, elec_beta_num
+        do ipoint = 1, n_points_final_grid
+
+          loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j)
+          loc_2 = mos_r_in_r_array_transp(ipoint,b)
+          loc_3 = mos_r_in_r_array_transp(ipoint,i)
+
+          tmp_3(ipoint,5,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) &
+                                       + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) &
+                                       + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) )
+
+          tmp_4(ipoint,1,b) += ( loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i)   &
+                                         + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i)   &
+                                         + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) &
+                               - loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b)   &
+                                         + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b)   &
+                                         + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) )
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  ! ---
+
+  call dgemm( 'T', 'N', mo_num, mo_num, 5*n_points_final_grid, 1.d0 &
+            , tmp_3(1,1,1), 5*n_points_final_grid                   &
+            , tmp_4(1,1,1), 5*n_points_final_grid                   &
+            , 1.d0, fock_3e_mo_cs(1,1), mo_num)
+
+  deallocate(tmp_3, tmp_4)
+  deallocate(Jkappa, Okappa)
+
+  ! ---
+
+END_PROVIDER 
+
+! ---
+
diff --git a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
index eb8973ff..2df2421e 100644
--- a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
+++ b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
@@ -1,4 +1,6 @@
 
+! ---
+
  BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_tot, (mo_num,mo_num) ]
 &BEGIN_PROVIDER [ double precision, Fock_matrix_tc_diag_mo_tot, (mo_num)]
 
@@ -23,9 +25,6 @@
   integer          :: i, j, n
   double precision :: t0, t1
 
-  !print*, ' Providing Fock_matrix_tc_mo_tot ...'
-  !call wall_time(t0)
-
   if(elec_alpha_num == elec_beta_num) then
 
     PROVIDE Fock_matrix_tc_mo_alpha
@@ -158,8 +157,8 @@
     Fock_matrix_tc_mo_tot += fock_3_mat
   endif
 
-  !call wall_time(t1)
-  !print*, ' Wall time for Fock_matrix_tc_mo_tot =', t1-t0
-
 END_PROVIDER
 
+! ---
+
+
diff --git a/plugins/local/tc_scf/fock_vartc.irp.f b/plugins/local/tc_scf/fock_vartc.irp.f
deleted file mode 100644
index 2b4a57e5..00000000
--- a/plugins/local/tc_scf/fock_vartc.irp.f
+++ /dev/null
@@ -1,287 +0,0 @@
-
-! ---
-
- BEGIN_PROVIDER [ double precision, two_e_vartc_integral_alpha, (ao_num, ao_num)]
-&BEGIN_PROVIDER [ double precision, two_e_vartc_integral_beta , (ao_num, ao_num)]
-
-  implicit none
-  integer                       :: i, j, k, l
-  double precision              :: density, density_a, density_b, I_coul, I_kjli
-  double precision              :: t0, t1
-  double precision, allocatable :: tmp_a(:,:), tmp_b(:,:)
-
-  two_e_vartc_integral_alpha = 0.d0
-  two_e_vartc_integral_beta  = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                                                        &
- !$OMP PRIVATE (i, j, k, l, density_a, density_b, density, tmp_a, tmp_b, I_coul, I_kjli)              &
- !$OMP SHARED  (ao_num, TCSCF_density_matrix_ao_alpha, TCSCF_density_matrix_ao_beta, ao_two_e_tc_tot, &
- !$OMP         two_e_vartc_integral_alpha, two_e_vartc_integral_beta)
-
-  allocate(tmp_a(ao_num,ao_num), tmp_b(ao_num,ao_num))
-  tmp_a = 0.d0
-  tmp_b = 0.d0
-
- !$OMP DO
-  do j = 1, ao_num
-    do l = 1, ao_num
-      density_a = TCSCF_density_matrix_ao_alpha(l,j)
-      density_b = TCSCF_density_matrix_ao_beta (l,j)
-      density   = density_a + density_b                      
-      do i = 1, ao_num
-        do k = 1, ao_num
-
-          I_coul = density * ao_two_e_tc_tot(k,i,l,j)
-          I_kjli = ao_two_e_tc_tot(k,j,l,i)
-
-          tmp_a(k,i) += I_coul - density_a * I_kjli
-          tmp_b(k,i) += I_coul - density_b * I_kjli
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO NOWAIT
-
- !$OMP CRITICAL
-  do i = 1, ao_num
-    do j = 1, ao_num
-      two_e_vartc_integral_alpha(j,i) += tmp_a(j,i)
-      two_e_vartc_integral_beta (j,i) += tmp_b(j,i)
-    enddo
-  enddo
- !$OMP END CRITICAL
-
-  deallocate(tmp_a, tmp_b)
- !$OMP END PARALLEL
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_ao_alpha, (ao_num, ao_num)]
-
-  implicit none
-
-  Fock_matrix_vartc_ao_alpha = ao_one_e_integrals_tc_tot + two_e_vartc_integral_alpha 
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_ao_beta, (ao_num, ao_num)]
-
-  implicit none
-
-  Fock_matrix_vartc_ao_beta = ao_one_e_integrals_tc_tot + two_e_vartc_integral_beta 
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_mo_alpha, (mo_num, mo_num) ]
-
-  implicit none
-
-  call ao_to_mo_bi_ortho( Fock_matrix_vartc_ao_alpha, size(Fock_matrix_vartc_ao_alpha, 1) &
-                        , Fock_matrix_vartc_mo_alpha, size(Fock_matrix_vartc_mo_alpha, 1) )
-  if(three_body_h_tc) then
-    Fock_matrix_vartc_mo_alpha += fock_3e_uhf_mo_a
-  endif
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_mo_beta, (mo_num,mo_num) ]
-
-  implicit none
-
-  call ao_to_mo_bi_ortho( Fock_matrix_vartc_ao_beta, size(Fock_matrix_vartc_ao_beta, 1) &
-                        , Fock_matrix_vartc_mo_beta, size(Fock_matrix_vartc_mo_beta, 1) )
-  if(three_body_h_tc) then
-    Fock_matrix_vartc_mo_beta += fock_3e_uhf_mo_b
-  endif
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ double precision, grad_vartc]
-
-  implicit none
-  integer          :: i, k
-  double precision :: grad_left, grad_right
-
-  grad_left  = 0.d0
-  grad_right = 0.d0
-
-  do i = 1, elec_beta_num ! doc --> SOMO
-    do k = elec_beta_num+1, elec_alpha_num
-      grad_left  = max(grad_left , dabs(Fock_matrix_vartc_mo_tot(k,i)))
-      grad_right = max(grad_right, dabs(Fock_matrix_vartc_mo_tot(i,k)))
-    enddo
-  enddo
-
-  do i = 1, elec_beta_num ! doc --> virt 
-    do k = elec_alpha_num+1, mo_num
-      grad_left  = max(grad_left , dabs(Fock_matrix_vartc_mo_tot(k,i)))
-      grad_right = max(grad_right, dabs(Fock_matrix_vartc_mo_tot(i,k)))
-    enddo
-  enddo
-
-  do i = elec_beta_num+1, elec_alpha_num ! SOMO --> virt 
-    do k = elec_alpha_num+1, mo_num
-      grad_left  = max(grad_left , dabs(Fock_matrix_vartc_mo_tot(k,i)))
-      grad_right = max(grad_right, dabs(Fock_matrix_vartc_mo_tot(i,k)))
-    enddo
-  enddo
-
-  grad_vartc = grad_left + grad_right
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_ao_tot, (ao_num, ao_num) ]
-
-  implicit none
-
-  call mo_to_ao_bi_ortho( Fock_matrix_vartc_mo_tot, size(Fock_matrix_vartc_mo_tot, 1) &
-                        , Fock_matrix_vartc_ao_tot, size(Fock_matrix_vartc_ao_tot, 1) )
-
-END_PROVIDER
-
-! ---
-
- BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_mo_tot, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_diag_mo_tot, (mo_num)]
-
-  implicit none
-  integer :: i, j, n
-
-  if(elec_alpha_num == elec_beta_num) then
-    Fock_matrix_vartc_mo_tot = Fock_matrix_vartc_mo_alpha
-  else
-
-    do j = 1, elec_beta_num
-      ! F-K
-      do i = 1, elec_beta_num !CC
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))&
-            - (Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j))
-      enddo
-      ! F+K/2
-      do i = elec_beta_num+1, elec_alpha_num  !CA
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))&
-             + 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j))
-      enddo
-      ! F
-      do i = elec_alpha_num+1, mo_num !CV
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))
-      enddo
-    enddo
-
-    do j = elec_beta_num+1, elec_alpha_num
-      ! F+K/2
-      do i = 1, elec_beta_num !AC
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))&
-             + 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j))
-      enddo
-      ! F
-      do i = elec_beta_num+1, elec_alpha_num !AA
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))
-      enddo
-      ! F-K/2
-      do i = elec_alpha_num+1, mo_num !AV
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))&
-             - 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j))
-      enddo
-    enddo
-    
-    do j = elec_alpha_num+1, mo_num
-      ! F
-      do i = 1, elec_beta_num !VC
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))
-      enddo
-      ! F-K/2
-      do i = elec_beta_num+1, elec_alpha_num !VA
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))&
-             - 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j))
-      enddo
-      ! F+K
-      do i = elec_alpha_num+1, mo_num !VV
-         Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j)) &
-             + (Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j))
-      enddo
-    enddo
-    if(three_body_h_tc)then
-      ! C-O
-      do j = 1, elec_beta_num
-        do i = elec_beta_num+1, elec_alpha_num
-          Fock_matrix_vartc_mo_tot(i,j) += 0.5d0*(fock_a_tot_3e_bi_orth(i,j) + fock_b_tot_3e_bi_orth(i,j))
-          Fock_matrix_vartc_mo_tot(j,i) += 0.5d0*(fock_a_tot_3e_bi_orth(j,i) + fock_b_tot_3e_bi_orth(j,i))
-        enddo
-      enddo
-      ! C-V
-      do j = 1, elec_beta_num
-        do i = elec_alpha_num+1, mo_num
-          Fock_matrix_vartc_mo_tot(i,j) += 0.5d0*(fock_a_tot_3e_bi_orth(i,j) + fock_b_tot_3e_bi_orth(i,j))
-          Fock_matrix_vartc_mo_tot(j,i) += 0.5d0*(fock_a_tot_3e_bi_orth(j,i) + fock_b_tot_3e_bi_orth(j,i))
-        enddo
-      enddo
-      ! O-V
-      do j = elec_beta_num+1, elec_alpha_num
-        do i = elec_alpha_num+1, mo_num
-          Fock_matrix_vartc_mo_tot(i,j) += 0.5d0*(fock_a_tot_3e_bi_orth(i,j) + fock_b_tot_3e_bi_orth(i,j))
-          Fock_matrix_vartc_mo_tot(j,i) += 0.5d0*(fock_a_tot_3e_bi_orth(j,i) + fock_b_tot_3e_bi_orth(j,i))
-        enddo
-      enddo
-    endif
-
-  endif
-
-  do i = 1, mo_num
-    Fock_matrix_vartc_diag_mo_tot(i) = Fock_matrix_vartc_mo_tot(i,i)
-  enddo
-
-  if(frozen_orb_scf)then
-    integer :: iorb, jorb
-    do i = 1, n_core_orb
-     iorb = list_core(i)
-     do j = 1, n_act_orb
-      jorb = list_act(j)
-      Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0
-      Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0
-     enddo
-    enddo
-  endif
-
-  if(no_oa_or_av_opt)then
-    do i = 1, n_act_orb
-      iorb = list_act(i)
-      do j = 1, n_inact_orb
-        jorb = list_inact(j)
-        Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0
-        Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0
-      enddo
-      do j = 1, n_virt_orb
-        jorb = list_virt(j)
-        Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0
-        Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0
-      enddo
-      do j = 1, n_core_orb
-        jorb = list_core(j)
-        Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0
-        Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0                                                                                                                 
-      enddo
-    enddo
-  endif
-
-  !call check_sym(Fock_matrix_vartc_mo_tot, mo_num)
-  !do i = 1, mo_num
-  !  write(*,'(100(F15.8, I4))') Fock_matrix_vartc_mo_tot(i,:)
-  !enddo
-
-END_PROVIDER
-
-! ---
-
diff --git a/plugins/local/tc_scf/rh_tcscf_diis.irp.f b/plugins/local/tc_scf/rh_tcscf_diis.irp.f
index 431b6e08..853c4ab5 100644
--- a/plugins/local/tc_scf/rh_tcscf_diis.irp.f
+++ b/plugins/local/tc_scf/rh_tcscf_diis.irp.f
@@ -234,7 +234,7 @@ subroutine rh_tcscf_diis()
     call unlock_io
 
     if(er_delta .lt. 0.d0) then
-      call ezfio_set_tc_scf_bitc_energy(etc_tot)
+      call ezfio_set_tc_scf_tcscf_energy(etc_tot)
       call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
       call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
       write(json_unit, json_true_fmt) 'saved'
@@ -263,7 +263,7 @@ subroutine rh_tcscf_diis()
 
   deallocate(mo_r_coef_save, mo_l_coef_save, F_DIIS, E_DIIS)
 
-  call ezfio_set_tc_scf_bitc_energy(TC_HF_energy)
+  call ezfio_set_tc_scf_tcscf_energy(TC_HF_energy)
   call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
   call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
 
diff --git a/plugins/local/tc_scf/rh_tcscf_simple.irp.f b/plugins/local/tc_scf/rh_tcscf_simple.irp.f
index 0b79e8ea..2c2cf2c2 100644
--- a/plugins/local/tc_scf/rh_tcscf_simple.irp.f
+++ b/plugins/local/tc_scf/rh_tcscf_simple.irp.f
@@ -91,7 +91,7 @@ subroutine rh_tcscf_simple()
       e_delta = dabs(etc_tot - e_save)
       e_save  = etc_tot
 
-      call ezfio_set_tc_scf_bitc_energy(etc_tot)
+      call ezfio_set_tc_scf_tcscf_energy(etc_tot)
 
       call wall_time(t1)
       write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
diff --git a/plugins/local/tc_scf/rh_vartcscf_simple.irp.f b/plugins/local/tc_scf/rh_vartcscf_simple.irp.f
deleted file mode 100644
index ecb0709e..00000000
--- a/plugins/local/tc_scf/rh_vartcscf_simple.irp.f
+++ /dev/null
@@ -1,89 +0,0 @@
-! ---
-
-subroutine rh_vartcscf_simple()
-
-  implicit none
-  integer          :: i, j, it, dim_DIIS
-  double precision :: t0, t1
-  double precision :: e_save, e_delta, rho_delta
-  double precision :: etc_tot, etc_1e, etc_2e, etc_3e
-  double precision :: er_DIIS
-
-
-  it       = 0
-  e_save   = 0.d0
-  dim_DIIS = 0
-
-  ! ---
-
-  PROVIDE level_shift_tcscf
-  PROVIDE mo_r_coef
-
-  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')                      &
-    '====', '================', '================', '================', '================', '================' &
-          , '================', '================', '====', '========'
-  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')                      &
-    ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
-          , '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
-  write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')                      &
-    '====', '================', '================', '================', '================', '================' &
-          , '================', '================', '====', '========'
-
-
-  ! first iteration (HF orbitals)
-  call wall_time(t0)
-
-  etc_tot = VARTC_HF_energy
-  etc_1e  = VARTC_HF_one_e_energy
-  etc_2e  = VARTC_HF_two_e_energy
-  etc_3e  = 0.d0
-  if(three_body_h_tc) then
-    etc_3e = diag_three_elem_hf
-  endif
-  er_DIIS = maxval(abs(FQS_SQF_mo))
-  e_delta = dabs(etc_tot - e_save)
-  e_save  = etc_tot
-
-  call wall_time(t1)
-  write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
-    it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
-
-  do while(er_DIIS .gt. dsqrt(thresh_tcscf))
-    call wall_time(t0)
-
-    it += 1
-    if(it > n_it_tcscf_max) then
-      print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
-      stop
-    endif
-
-    mo_r_coef = fock_vartc_eigvec_ao
-    mo_l_coef = mo_r_coef
-    call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-    call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-    TOUCH mo_l_coef mo_r_coef
-
-    etc_tot = VARTC_HF_energy
-    etc_1e  = VARTC_HF_one_e_energy
-    etc_2e  = VARTC_HF_two_e_energy
-    etc_3e  = 0.d0
-    if(three_body_h_tc) then
-      etc_3e = diag_three_elem_hf
-    endif
-    er_DIIS = maxval(abs(FQS_SQF_mo))
-    e_delta = dabs(etc_tot - e_save)
-    e_save  = etc_tot
-
-    call ezfio_set_tc_scf_bitc_energy(etc_tot)
-
-    call wall_time(t1)
-    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
-      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
-  enddo
-
-  print *, ' VAR-TCSCF Simple converged !'
-
-end
-
-! ---
-
diff --git a/plugins/local/tc_scf/tc_scf.irp.f b/plugins/local/tc_scf/tc_scf.irp.f
index 768069d6..ee8e8dad 100644
--- a/plugins/local/tc_scf/tc_scf.irp.f
+++ b/plugins/local/tc_scf/tc_scf.irp.f
@@ -13,7 +13,6 @@ program tc_scf
   PROVIDE j1e_type
   PROVIDE j2e_type
   PROVIDE tcscf_algorithm
-  PROVIDE var_tc
 
   print *, ' TC-SCF with:'
   print *, ' j1e_type = ', j1e_type
@@ -45,46 +44,29 @@ program tc_scf
   !call create_guess()
   !call orthonormalize_mos()
 
-
-  if(var_tc) then
-
-    print *, ' VAR-TC'
-
-    if(tcscf_algorithm == 'DIIS') then
-      print*, ' NOT implemented yet'
-    elseif(tcscf_algorithm == 'Simple') then
-      call rh_vartcscf_simple()
-    else
-      print *, ' not implemented yet', tcscf_algorithm
-      stop
-    endif
-
+  if(tcscf_algorithm == 'DIIS') then
+    call rh_tcscf_diis()
+  elseif(tcscf_algorithm == 'Simple') then
+    call rh_tcscf_simple()
   else
-
-    if(tcscf_algorithm == 'DIIS') then
-      call rh_tcscf_diis()
-    elseif(tcscf_algorithm == 'Simple') then
-      call rh_tcscf_simple()
-    else
-      print *, ' not implemented yet', tcscf_algorithm
-      stop
-    endif
-
-    PROVIDE Fock_matrix_tc_diag_mo_tot
-    print*, ' Eigenvalues:' 
-    do i = 1, mo_num
-      print*, i, Fock_matrix_tc_diag_mo_tot(i)
-    enddo
-
-    ! TODO 
-    ! rotate angles in separate code only if necessary
-    if(minimize_lr_angles)then
-     call minimize_tc_orb_angles()
-    endif
-    call print_energy_and_mos(good_angles)
-
+    print *, ' not implemented yet', tcscf_algorithm
+    stop
   endif
 
+  PROVIDE Fock_matrix_tc_diag_mo_tot
+  print*, ' Eigenvalues:' 
+  do i = 1, mo_num
+    print*, i, Fock_matrix_tc_diag_mo_tot(i)
+  enddo
+
+  ! TODO 
+  ! rotate angles in separate code only if necessary
+  if(minimize_lr_angles)then
+   call minimize_tc_orb_angles()
+  endif
+  call print_energy_and_mos(good_angles)
+
+
   write(json_unit,json_array_close_fmtx)
   call json_close
 
diff --git a/plugins/local/tc_scf/tc_scf_energy.irp.f b/plugins/local/tc_scf/tc_scf_energy.irp.f
index 833b48aa..0266c605 100644
--- a/plugins/local/tc_scf/tc_scf_energy.irp.f
+++ b/plugins/local/tc_scf/tc_scf_energy.irp.f
@@ -11,11 +11,8 @@
   integer          :: i, j
   double precision :: t0, t1
 
-  !print*, ' Providing TC energy ...'
-  !call wall_time(t0)
-
   PROVIDE mo_l_coef mo_r_coef
-  PROVIDE two_e_tc_non_hermit_integral_alpha two_e_tc_non_hermit_integral_beta
+  PROVIDE two_e_tc_integral_alpha two_e_tc_integral_beta
 
   TC_HF_energy = nuclear_repulsion
   TC_HF_one_e_energy = 0.d0
@@ -23,8 +20,8 @@
 
   do j = 1, ao_num
     do i = 1, ao_num
-      TC_HF_two_e_energy += 0.5d0 * ( two_e_tc_non_hermit_integral_alpha(i,j) * TCSCF_density_matrix_ao_alpha(i,j) &
-                                    + two_e_tc_non_hermit_integral_beta (i,j) * TCSCF_density_matrix_ao_beta (i,j) )
+      TC_HF_two_e_energy += 0.5d0 * ( two_e_tc_integral_alpha(i,j) * TCSCF_density_matrix_ao_alpha(i,j) &
+                                    + two_e_tc_integral_beta (i,j) * TCSCF_density_matrix_ao_beta (i,j) )
       TC_HF_one_e_energy += ao_one_e_integrals_tc_tot(i,j) &
                           * (TCSCF_density_matrix_ao_alpha(i,j) + TCSCF_density_matrix_ao_beta (i,j) )
     enddo
@@ -33,38 +30,6 @@
   TC_HF_energy += TC_HF_one_e_energy + TC_HF_two_e_energy
   TC_HF_energy += diag_three_elem_hf
 
-  !call wall_time(t1)
-  !print*, ' Wall time for TC energy=', t1-t0
-
-END_PROVIDER
-
-! ---
-
- BEGIN_PROVIDER [ double precision, VARTC_HF_energy]
-&BEGIN_PROVIDER [ double precision, VARTC_HF_one_e_energy]
-&BEGIN_PROVIDER [ double precision, VARTC_HF_two_e_energy]
-
-  implicit none
-  integer :: i, j
-
-  PROVIDE mo_r_coef
-
-  VARTC_HF_energy = nuclear_repulsion
-  VARTC_HF_one_e_energy = 0.d0
-  VARTC_HF_two_e_energy = 0.d0
-
-  do j = 1, ao_num
-    do i = 1, ao_num
-      VARTC_HF_two_e_energy += 0.5d0 * ( two_e_vartc_integral_alpha(i,j) * TCSCF_density_matrix_ao_alpha(i,j) &
-                                       + two_e_vartc_integral_beta (i,j) * TCSCF_density_matrix_ao_beta (i,j) )
-      VARTC_HF_one_e_energy += ao_one_e_integrals_tc_tot(i,j) &
-                             * (TCSCF_density_matrix_ao_alpha(i,j) + TCSCF_density_matrix_ao_beta (i,j) )
-    enddo
-  enddo
-
-  VARTC_HF_energy += VARTC_HF_one_e_energy + VARTC_HF_two_e_energy
-  VARTC_HF_energy += diag_three_elem_hf
-
 END_PROVIDER
 
 ! ---
diff --git a/plugins/local/tc_scf/test_int.irp.f b/plugins/local/tc_scf/test_int.irp.f
deleted file mode 100644
index e135fcd8..00000000
--- a/plugins/local/tc_scf/test_int.irp.f
+++ /dev/null
@@ -1,970 +0,0 @@
-program test_ints
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  print *, ' starting test_ints ...'
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  my_extra_grid_becke = .True.
-  my_n_pt_r_extra_grid = 30
-  my_n_pt_a_extra_grid = 50 ! small extra_grid for quick debug
-  touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
-
-!! OK 
-! call routine_int2_u_grad1u_env2 
-! OK
-! call routine_v_ij_erf_rk_cst_mu_env
-! OK 
-! call routine_x_v_ij_erf_rk_cst_mu_env
-! OK
-! call routine_int2_u2_env2
-! OK
-! call routine_int2_u_grad1u_x_env2
-! OK 
-! call routine_int2_grad1u2_grad2u2_env2
-! call routine_int2_u_grad1u_env2
-! call test_int2_grad1_u12_ao_test
-! call routine_v_ij_u_cst_mu_env_test
-! call test_grid_points_ao
- !call test_int_gauss
-
-  !call test_fock_3e_uhf_ao()
-  !call test_fock_3e_uhf_mo()
-
-  !call test_two_e_tc_non_hermit_integral()
-
-!!PROVIDE TC_HF_energy VARTC_HF_energy
-!!print *, '    TC_HF_energy = ',    TC_HF_energy
-!!print *, ' VARTC_HF_energy = ', VARTC_HF_energy
-
-  call test_fock_3e_uhf_mo_cs()
-  call test_fock_3e_uhf_mo_a()
-  call test_fock_3e_uhf_mo_b()
-
-end
-
-! ---
-
-subroutine routine_test_env
- implicit none
- integer :: i,icount,j
- icount = 0
- do i = 1, List_env1s_square_size
-  if(dabs(List_env1s_square_coef(i)).gt.1.d-10)then
-   print*,''
-   print*,List_env1s_square_expo(i),List_env1s_square_coef(i)
-   print*,List_env1s_square_cent(1:3,i)
-   print*,''
-   icount += 1
-  endif
-  
- enddo
- print*,'List_env1s_square_coef,icount = ',List_env1s_square_size,icount
- do i = 1, ao_num
-  do j = 1, ao_num
-   do icount = 1, List_comb_thr_b3_size(j,i)
-    print*,'',j,i
-    print*,List_comb_thr_b3_expo(icount,j,i),List_comb_thr_b3_coef(icount,j,i)
-    print*,List_comb_thr_b3_cent(1:3,icount,j,i)
-    print*,''
-   enddo
-!   enddo
-  enddo
- enddo
- print*,'max_List_comb_thr_b3_size = ',max_List_comb_thr_b3_size,List_env1s_square_size
-
-end
-
-subroutine routine_int2_u_grad1u_env2
- implicit none
- integer :: i,j,ipoint,k,l
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
-
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      array(j,i,l,k)     += int2_u_grad1u_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += int2_u_grad1u_env2(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'routine_int2_u_grad1u_env2'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-  
-
-end
-
-subroutine routine_v_ij_erf_rk_cst_mu_env
- implicit none
- integer :: i,j,ipoint,k,l
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      array(j,i,l,k)     += v_ij_erf_rk_cst_mu_env_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += v_ij_erf_rk_cst_mu_env(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'routine_v_ij_erf_rk_cst_mu_env'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-  
-
-end
-
-
-subroutine routine_x_v_ij_erf_rk_cst_mu_env
- implicit none
- integer :: i,j,ipoint,k,l,m
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      do m = 1, 3
-       array(j,i,l,k)     += x_v_ij_erf_rk_cst_mu_env_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += x_v_ij_erf_rk_cst_mu_env     (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      enddo
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
-
- print*,'******'
- print*,'******'
- print*,'routine_x_v_ij_erf_rk_cst_mu_env'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-  
-
-end
-
-
-
-subroutine routine_v_ij_u_cst_mu_env_test
- implicit none
- integer :: i,j,ipoint,k,l
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      array(j,i,l,k)     += v_ij_u_cst_mu_env_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += v_ij_u_cst_mu_env_fit (j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'routine_v_ij_u_cst_mu_env_test'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-end
-
-subroutine routine_int2_grad1u2_grad2u2_env2
- implicit none
- integer :: i,j,ipoint,k,l
- integer :: ii , jj
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
- double precision, allocatable :: ints(:,:,:)
- allocate(ints(ao_num, ao_num, n_points_final_grid))
-! do ipoint = 1, n_points_final_grid
-!  do i = 1, ao_num
-!   do j = 1, ao_num
-!    read(33,*)ints(j,i,ipoint)
-!   enddo
-!  enddo
-! enddo
-
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      array(j,i,l,k)     += int2_grad1u2_grad2u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-!     !array(j,i,l,k)     += int2_grad1u2_grad2u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-!      array_ref(j,i,l,k)     += int2_grad1u2_grad2u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-!     !array(j,i,l,k) += ints(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-!       array_ref(j,i,l,k) += int2_grad1u2_grad2u2_env2(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += ints(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-!      if(dabs(int2_grad1u2_grad2u2_env2_test(j,i,ipoint)).gt.1.d-6)then
-!       if(dabs(int2_grad1u2_grad2u2_env2_test(j,i,ipoint) - int2_grad1u2_grad2u2_env2_test(j,i,ipoint)).gt.1.d-6)then
-!        print*,j,i,ipoint
-!        print*,int2_grad1u2_grad2u2_env2_test(j,i,ipoint) , int2_grad1u2_grad2u2_env2_test(j,i,ipoint), dabs(int2_grad1u2_grad2u2_env2_test(j,i,ipoint) - int2_grad1u2_grad2u2_env2_test(j,i,ipoint))
-!        print*,int2_grad1u2_grad2u2_env2_test(i,j,ipoint) , int2_grad1u2_grad2u2_env2_test(i,j,ipoint), dabs(int2_grad1u2_grad2u2_env2_test(i,j,ipoint) - int2_grad1u2_grad2u2_env2_test(i,j,ipoint))
-!        stop
-!       endif
-!      endif
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- double precision :: e_ref, e_new
- accu_relat = 0.d0
- accu_abs   = 0.d0
- e_ref = 0.d0
- e_new = 0.d0
- do ii = 1, elec_alpha_num
-  do jj = ii, elec_alpha_num
-   do k = 1, ao_num
-    do l = 1, ao_num
-     do i = 1, ao_num
-      do j = 1, ao_num
-       e_ref += mo_coef(j,ii) * mo_coef(i,ii) * array_ref(j,i,l,k) * mo_coef(l,jj) * mo_coef(k,jj)
-       e_new += mo_coef(j,ii) * mo_coef(i,ii) * array(j,i,l,k) * mo_coef(l,jj) * mo_coef(k,jj)
-       contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-       accu_abs += contrib
-!       if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-!        accu_relat += contrib/dabs(array_ref(j,i,l,k))
-!       endif
-      enddo
-     enddo
-    enddo
-   enddo
-
-  enddo
- enddo
- print*,'e_ref = ',e_ref
- print*,'e_new = ',e_new
-! print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
-! print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-  
-
-end
-
-subroutine routine_int2_u2_env2
- implicit none
- integer :: i,j,ipoint,k,l
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
-
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      array(j,i,l,k)     += int2_u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += int2_u2_env2(j,i,ipoint)      * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'routine_int2_u2_env2'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-  
-
-end
-
-
-subroutine routine_int2_u_grad1u_x_env2
- implicit none
- integer :: i,j,ipoint,k,l,m
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
-
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      do m = 1, 3
-       array(j,i,l,k)     += int2_u_grad1u_x_env2_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += int2_u_grad1u_x_env2     (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      enddo
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'routine_int2_u_grad1u_x_env2'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-  
-
-end
-
-subroutine routine_v_ij_u_cst_mu_env
- implicit none
- integer :: i,j,ipoint,k,l
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
-
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do ipoint = 1, n_points_final_grid
-  weight = final_weight_at_r_vector(ipoint)
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      array(j,i,l,k)     += v_ij_u_cst_mu_env_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      array_ref(j,i,l,k) += v_ij_u_cst_mu_env_fit (j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'routine_v_ij_u_cst_mu_env'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-
-end
-
-! ---
-
-subroutine test_fock_3e_uhf_ao()
-
-  implicit none
-  integer                       :: i, j
-  double precision              :: diff_tot, diff_ij, thr_ih, norm
-  double precision, allocatable :: fock_3e_uhf_ao_a_mo(:,:), fock_3e_uhf_ao_b_mo(:,:)
-
-  thr_ih = 1d-7
-
-  PROVIDE fock_a_tot_3e_bi_orth fock_b_tot_3e_bi_orth
-  PROVIDE fock_3e_uhf_ao_a fock_3e_uhf_ao_b
-
-  ! ---
-
-  allocate(fock_3e_uhf_ao_a_mo(mo_num,mo_num))
-  call ao_to_mo_bi_ortho( fock_3e_uhf_ao_a   , size(fock_3e_uhf_ao_a   , 1) &
-                        , fock_3e_uhf_ao_a_mo, size(fock_3e_uhf_ao_a_mo, 1) )
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      diff_ij = dabs(fock_3e_uhf_ao_a_mo(j,i) - fock_a_tot_3e_bi_orth(j,i))
-      if(diff_ij .gt. thr_ih) then
-        print *, ' difference on ', j, i
-        print *, ' MANU : ', fock_a_tot_3e_bi_orth(j,i)
-        print *, ' UHF  : ', fock_3e_uhf_ao_a_mo  (j,i)
-        !stop
-      endif
-
-      norm     += dabs(fock_a_tot_3e_bi_orth(j,i))
-      diff_tot += diff_ij
-    enddo
-  enddo
-  print *, ' diff on F_a = ', diff_tot / norm
-  print *, ' '
-
-  deallocate(fock_3e_uhf_ao_a_mo)
-
-  ! ---
-
-  allocate(fock_3e_uhf_ao_b_mo(mo_num,mo_num))
-  call ao_to_mo_bi_ortho( fock_3e_uhf_ao_b   , size(fock_3e_uhf_ao_b   , 1) &
-                        , fock_3e_uhf_ao_b_mo, size(fock_3e_uhf_ao_b_mo, 1) )
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      diff_ij = dabs(fock_3e_uhf_ao_b_mo(j,i) - fock_b_tot_3e_bi_orth(j,i))
-      if(diff_ij .gt. thr_ih) then
-        print *, ' difference on ', j, i
-        print *, ' MANU : ', fock_b_tot_3e_bi_orth(j,i)
-        print *, ' UHF  : ', fock_3e_uhf_ao_b_mo  (j,i)
-        !stop
-      endif
-
-      norm     += dabs(fock_b_tot_3e_bi_orth(j,i))
-      diff_tot += diff_ij
-    enddo
-  enddo
-  print *, ' diff on F_b = ', diff_tot/norm
-  print *, ' '
-
-  deallocate(fock_3e_uhf_ao_b_mo)
-
-  ! ---
-
-end subroutine test_fock_3e_uhf_ao()
-
-! ---
-
-subroutine test_fock_3e_uhf_mo()
-
-  implicit none
-  integer          :: i, j
-  double precision :: diff_tot, diff_ij, thr_ih, norm
-
-  thr_ih = 1d-12
-
-  PROVIDE fock_a_tot_3e_bi_orth fock_b_tot_3e_bi_orth
-  PROVIDE fock_3e_uhf_mo_a fock_3e_uhf_mo_b
-
-  ! ---
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      diff_ij = dabs(fock_3e_uhf_mo_a(j,i) - fock_a_tot_3e_bi_orth(j,i))
-      if(diff_ij .gt. thr_ih) then
-        print *, ' difference on ', j, i
-        print *, ' MANU : ', fock_a_tot_3e_bi_orth(j,i)
-        print *, ' UHF  : ', fock_3e_uhf_mo_a     (j,i)
-        !stop
-      endif
-
-      norm     += dabs(fock_a_tot_3e_bi_orth(j,i))
-      diff_tot += diff_ij
-    enddo
-  enddo
-  print *, ' diff on F_a = ', diff_tot / norm
-  print *, '      norm_a = ', norm
-  print *, ' '
-
-  ! ---
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      diff_ij = dabs(fock_3e_uhf_mo_b(j,i) - fock_b_tot_3e_bi_orth(j,i))
-      if(diff_ij .gt. thr_ih) then
-        print *, ' difference on ', j, i
-        print *, ' MANU : ', fock_b_tot_3e_bi_orth(j,i)
-        print *, ' UHF  : ', fock_3e_uhf_mo_b     (j,i)
-        !stop
-      endif
-
-      norm     += dabs(fock_b_tot_3e_bi_orth(j,i))
-      diff_tot += diff_ij
-    enddo
-  enddo
-  print *, ' diff on F_b = ', diff_tot/norm
-  print *, '      norm_b = ', norm
-  print *, ' '
-
-  ! ---
-
-end
-
-! ---
-
-subroutine test_grid_points_ao
- implicit none
- integer :: i,j,ipoint,icount,icount_good, icount_bad,icount_full
- double precision :: thr
- thr = 1.d-10
-! print*,'max_n_pts_grid_ao_prod = ',max_n_pts_grid_ao_prod
-! print*,'n_pts_grid_ao_prod'
- do i = 1, ao_num
-  do j = i, ao_num
-  icount = 0
-  icount_good = 0
-  icount_bad = 0
-  icount_full = 0
-  do ipoint = 1, n_points_final_grid
-!   if(dabs(int2_u_grad1u_x_env2_test(j,i,ipoint,1)) & 
-!    + dabs(int2_u_grad1u_x_env2_test(j,i,ipoint,2)) &
-!    + dabs(int2_u_grad1u_x_env2_test(j,i,ipoint,3)) )
-!   if(dabs(int2_u2_env2_test(j,i,ipoint)).gt.thr)then
-!    icount += 1
-!   endif
-   if(dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)).gt.thr*0.1d0)then
-    icount_full += 1
-   endif
-   if(dabs(v_ij_u_cst_mu_env_test(j,i,ipoint)).gt.thr)then
-    icount += 1
-    if(dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)).gt.thr*0.1d0)then
-    icount_good += 1
-    else
-    print*,j,i,ipoint
-    print*,dabs(v_ij_u_cst_mu_env_test(j,i,ipoint)), dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)),dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint))/dabs(v_ij_u_cst_mu_env_test(j,i,ipoint))
-    icount_bad  += 1
-    endif
-   endif
-!   if(dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)).gt.thr)then
-!   endif
-  enddo
-   print*,''
-   print*,j,i
-   print*,icount,icount_full, icount_bad!,n_pts_grid_ao_prod(j,i)
-   print*,dble(icount)/dble(n_points_final_grid),dble(icount_full)/dble(n_points_final_grid)
-!          dble(n_pts_grid_ao_prod(j,i))/dble(n_points_final_grid)
-!   if(icount.gt.n_pts_grid_ao_prod(j,i))then
-!    print*,'pb !!'
-!   endif
-  enddo
- enddo
-end
-
-subroutine test_int_gauss
- implicit none
- integer :: i,j
- print*,'center'
- do i = 1, ao_num
-  do j = i, ao_num
-   print*,j,i
-   print*,ao_prod_sigma(j,i),ao_overlap_abs_grid(j,i)
-   print*,ao_prod_center(1:3,j,i)
-  enddo
- enddo
- print*,''
- double precision :: weight, r(3),integral_1,pi,center(3),f_r,alpha,distance,integral_2
- center = 0.d0
- pi = dacos(-1.d0)
- integral_1 = 0.d0
- integral_2 = 0.d0
- alpha = 0.75d0
- do i = 1,  n_points_final_grid
-  ! you get x, y and z of the ith grid point
-  r(1) = final_grid_points(1,i)
-  r(2) = final_grid_points(2,i)
-  r(3) = final_grid_points(3,i)
-  weight = final_weight_at_r_vector(i)
-  distance = dsqrt( (r(1) - center(1))**2 +  (r(2) - center(2))**2 + (r(3) - center(3))**2 )
-  f_r = dexp(-alpha * distance*distance)
-  ! you add the contribution of the grid point to the integral
-  integral_1 += f_r * weight
-  integral_2 += f_r * distance * weight
- enddo
- print*,'integral_1      =',integral_1
- print*,'(pi/alpha)**1.5 =',(pi / alpha)**1.5
- print*,'integral_2      =',integral_2
- print*,'(pi/alpha)**1.5 =',2.d0*pi / (alpha)**2
-
-
-end
-
-! ---
-
-subroutine test_two_e_tc_non_hermit_integral()
-
-  implicit none
-  integer          :: i, j
-  double precision :: diff_tot, diff, thr_ih, norm
-
-  thr_ih = 1d-10
-
-  PROVIDE two_e_tc_non_hermit_integral_beta two_e_tc_non_hermit_integral_alpha
-  PROVIDE two_e_tc_non_hermit_integral_seq_beta two_e_tc_non_hermit_integral_seq_alpha
-
-  ! ---
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, ao_num
-    do j = 1, ao_num
-
-      diff = dabs(two_e_tc_non_hermit_integral_seq_alpha(j,i) - two_e_tc_non_hermit_integral_alpha(j,i))
-      if(diff .gt. thr_ih) then
-        print *, ' difference on ', j, i
-        print *, ' seq         : ', two_e_tc_non_hermit_integral_seq_alpha(j,i)
-        print *, ' //          : ', two_e_tc_non_hermit_integral_alpha    (j,i)
-        !stop
-      endif
-
-      norm     += dabs(two_e_tc_non_hermit_integral_seq_alpha(j,i))
-      diff_tot += diff
-    enddo
-  enddo
-
-  print *, ' diff tot a = ', diff_tot / norm
-  print *, '     norm a = ', norm
-  print *, ' '
-
-  ! ---
-
-  norm     = 0.d0
-  diff_tot = 0.d0
-  do i = 1, ao_num
-    do j = 1, ao_num
-
-      diff = dabs(two_e_tc_non_hermit_integral_seq_beta(j,i) - two_e_tc_non_hermit_integral_beta(j,i))
-      if(diff .gt. thr_ih) then
-        print *, ' difference on ', j, i
-        print *, ' seq         : ', two_e_tc_non_hermit_integral_seq_beta(j,i)
-        print *, ' //          : ', two_e_tc_non_hermit_integral_beta    (j,i)
-        !stop
-      endif
-
-      norm     += dabs(two_e_tc_non_hermit_integral_seq_beta(j,i))
-      diff_tot += diff
-    enddo
-  enddo
-
-  print *, ' diff tot b = ', diff_tot / norm
-  print *, '     norm b = ', norm
-  print *, ' '
-
-  ! ---
-
-  return
-
-end
-
-! ---
-
-subroutine test_int2_grad1_u12_ao_test
- implicit none
- integer :: i,j,ipoint,m,k,l
- double precision :: weight,accu_relat, accu_abs, contrib
- double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:)
- allocate(array(ao_num, ao_num, ao_num, ao_num))
- array = 0.d0
- allocate(array_ref(ao_num, ao_num, ao_num, ao_num))
- array_ref = 0.d0
- do m = 1, 3
-  do ipoint = 1, n_points_final_grid
-   weight = final_weight_at_r_vector(ipoint)
-   do k = 1, ao_num
-    do l = 1, ao_num
-     do i = 1, ao_num
-      do j = 1, ao_num
-       array(j,i,l,k)     += int2_grad1_u12_ao_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-       array_ref(j,i,l,k) += int2_grad1_u12_ao(j,i,ipoint,m)      * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight
-      enddo
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
-
- accu_relat = 0.d0
- accu_abs   = 0.d0
-  do k = 1, ao_num
-   do l = 1, ao_num
-    do i = 1, ao_num
-     do j = 1, ao_num
-      contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k))
-      accu_abs += contrib
-      if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then
-       accu_relat += contrib/dabs(array_ref(j,i,l,k))
-      endif
-     enddo
-    enddo
-   enddo
-  enddo
- print*,'******'
- print*,'******'
- print*,'test_int2_grad1_u12_ao_test'
- print*,'accu_abs   = ',accu_abs/dble(ao_num)**4
- print*,'accu_relat = ',accu_relat/dble(ao_num)**4
-end
-
-! ---
-
-subroutine test_fock_3e_uhf_mo_cs()
-
-  implicit none
-  integer          :: i, j
-  double precision :: I_old, I_new
-  double precision :: diff_tot, diff, thr_ih, norm
-
-!  double precision :: t0, t1
-!  print*, ' Providing fock_a_tot_3e_bi_orth ...'
-!  call wall_time(t0)
-!  PROVIDE fock_a_tot_3e_bi_orth
-!  call wall_time(t1)
-!  print*, ' Wall time for fock_a_tot_3e_bi_orth =', t1 - t0 
-
-  PROVIDE fock_3e_uhf_mo_cs fock_3e_uhf_mo_cs_old
-
-  thr_ih   = 1d-8
-  norm     = 0.d0
-  diff_tot = 0.d0
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      I_old = fock_3e_uhf_mo_cs_old(j,i)
-      I_new = fock_3e_uhf_mo_cs    (j,i)
-
-      diff = dabs(I_old - I_new)
-      if(diff .gt. thr_ih) then
-        print *, ' problem in fock_3e_uhf_mo_cs on ', j, i
-        print *, ' old value = ', I_old
-        print *, ' new value = ', I_new
-        !stop
-      endif
-
-      norm     += dabs(I_old)
-      diff_tot += diff
-    enddo
-  enddo
-
-  print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm
-
-  return
-end
-
-! ---
-
-subroutine test_fock_3e_uhf_mo_a()
-
-  implicit none
-  integer          :: i, j
-  double precision :: I_old, I_new
-  double precision :: diff_tot, diff, thr_ih, norm
-
-  PROVIDE fock_3e_uhf_mo_a fock_3e_uhf_mo_a_old
-
-  thr_ih   = 1d-8
-  norm     = 0.d0
-  diff_tot = 0.d0
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      I_old = fock_3e_uhf_mo_a_old(j,i)
-      I_new = fock_3e_uhf_mo_a    (j,i)
-
-      diff = dabs(I_old - I_new)
-      if(diff .gt. thr_ih) then
-        print *, ' problem in fock_3e_uhf_mo_a on ', j, i
-        print *, ' old value = ', I_old
-        print *, ' new value = ', I_new
-        !stop
-      endif
-
-      norm     += dabs(I_old)
-      diff_tot += diff
-    enddo
-  enddo
-
-  print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm
-
-  return
-end
-
-! ---
-
-subroutine test_fock_3e_uhf_mo_b()
-
-  implicit none
-  integer          :: i, j
-  double precision :: I_old, I_new
-  double precision :: diff_tot, diff, thr_ih, norm
-
-  PROVIDE fock_3e_uhf_mo_b fock_3e_uhf_mo_b_old
-
-  thr_ih   = 1d-8
-  norm     = 0.d0
-  diff_tot = 0.d0
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-
-      I_old = fock_3e_uhf_mo_b_old(j,i)
-      I_new = fock_3e_uhf_mo_b    (j,i)
-
-      diff = dabs(I_old - I_new)
-      if(diff .gt. thr_ih) then
-        print *, ' problem in fock_3e_uhf_mo_b on ', j, i
-        print *, ' old value = ', I_old
-        print *, ' new value = ', I_new
-        !stop
-      endif
-
-      norm     += dabs(I_old)
-      diff_tot += diff
-    enddo
-  enddo
-
-  print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm
-
-  return
-end
-
-! ---
-
diff --git a/src/becke_numerical_grid/extra_grid_vector.irp.f b/src/becke_numerical_grid/extra_grid_vector.irp.f
index 16a52dc6..e054e22c 100644
--- a/src/becke_numerical_grid/extra_grid_vector.irp.f
+++ b/src/becke_numerical_grid/extra_grid_vector.irp.f
@@ -70,17 +70,6 @@ END_PROVIDER
         index_final_points_extra(2,i_count) = i
         index_final_points_extra(3,i_count) = j
         index_final_points_extra_reverse(k,i,j) = i_count
-
-        if(final_weight_at_r_vector_extra(i_count) .lt. 0.d0) then
-          print *, ' !!! WARNING !!!'
-          print *, ' negative weight !!!!'
-          print *, i_count, final_weight_at_r_vector_extra(i_count)
-          if(dabs(final_weight_at_r_vector_extra(i_count)) .lt. 1d-10) then
-            final_weight_at_r_vector_extra(i_count) = 0.d0
-          else
-            stop
-          endif
-        endif
       enddo
     enddo
   enddo
diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index c35918c3..9da8a099 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -67,17 +67,6 @@ END_PROVIDER
         index_final_points(2,i_count) = i
         index_final_points(3,i_count) = j
         index_final_points_reverse(k,i,j) = i_count
-
-        if(final_weight_at_r_vector(i_count) .lt. 0.d0) then
-          print *, ' !!! WARNING !!!'
-          print *, ' negative weight !!!!'
-          print *, i_count, final_weight_at_r_vector(i_count)
-          if(dabs(final_weight_at_r_vector(i_count)) .lt. 1d-10) then
-            final_weight_at_r_vector(i_count) = 0.d0
-          else
-            stop
-          endif
-        endif 
       enddo
     enddo
   enddo
diff --git a/src/utils/util.irp.f b/src/utils/util.irp.f
index 97cbde67..c67bbf03 100644
--- a/src/utils/util.irp.f
+++ b/src/utils/util.irp.f
@@ -576,7 +576,7 @@ logical function is_same_spin(sigma_1, sigma_2)
     is_same_spin = .false.
   endif
 
-end function is_same_spin
+end
 
 ! ---
    
@@ -596,7 +596,7 @@ function Kronecker_delta(i, j) result(delta)
     delta = 0.d0
   endif
 
-end function Kronecker_delta
+end
 
 ! ---
 
@@ -634,7 +634,81 @@ subroutine diagonalize_sym_matrix(N, A, e)
     print*,'Problem in diagonalize_sym_matrix (dsyev)!!'
   endif
 
-end subroutine diagonalize_sym_matrix
+end
+
+! ---
+
+
+subroutine give_degen(A, n, shift, list_degen, n_degen_list)
+
+  BEGIN_DOC
+  ! returns n_degen_list :: the number of degenerated SET of elements (i.e. with |A(i)-A(i+1)| below shift)
+  !
+  ! for each of these sets, list_degen(1,i) = first degenerate element of the set i, 
+  !
+  !                         list_degen(2,i) = last degenerate element of the set i.
+  END_DOC
+
+  implicit none
+
+  double precision, intent(in)  :: A(n)
+  double precision, intent(in)  :: shift
+  integer,          intent(in)  :: n
+  integer,          intent(out) :: list_degen(2,n), n_degen_list
+
+  integer                       :: i, j, n_degen, k
+  logical                       :: keep_on
+  double precision, allocatable :: Aw(:)
+
+  list_degen = -1
+  allocate(Aw(n))
+  Aw = A
+  i=1
+  k = 0
+  do while(i.lt.n)
+   if(dabs(Aw(i)-Aw(i+1)).lt.shift)then
+    k+=1
+    j=1
+    list_degen(1,k) = i
+    keep_on = .True.
+    do while(keep_on)
+     if(i+j.gt.n)then
+      keep_on = .False.
+      exit
+     endif
+     if(dabs(Aw(i)-Aw(i+j)).lt.shift)then
+      j+=1
+     else
+      keep_on=.False.
+      exit
+     endif
+    enddo
+    n_degen = j
+    list_degen(2,k) = list_degen(1,k)-1 + n_degen
+    j=0
+    keep_on = .True.
+    do while(keep_on)
+     if(i+j+1.gt.n)then
+      keep_on = .False.
+      exit
+     endif
+     if(dabs(Aw(i+j)-Aw(i+j+1)).lt.shift)then
+      Aw(i+j) += (j-n_degen/2) * shift
+      j+=1
+     else
+      keep_on = .False.
+      exit
+     endif
+    enddo
+    Aw(i+n_degen-1) += (n_degen-1-n_degen/2) * shift
+    i+=n_degen
+   else
+    i+=1
+   endif
+  enddo
+  n_degen_list = k
+
+end
 
 ! ---
 

From da8eac81e01e9ee558351195aba1f964ed5fbc0b Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 1 May 2024 21:52:00 +0200
Subject: [PATCH 021/131] TC-SCF CLEANED

---
 plugins/local/bi_ort_ints/no_dressing.irp.f   |   7 +-
 plugins/local/tc_scf/EZFIO.cfg                |  36 +
 plugins/local/tc_scf/fock_hermit.irp.f        | 107 ---
 plugins/local/tc_scf/fock_tc.irp.f            |  40 +-
 plugins/local/tc_scf/fock_tc_mo_tot.irp.f     |  19 +-
 plugins/local/tc_scf/fock_three_hermit.irp.f  | 771 ------------------
 .../local/tc_scf/integrals_in_r_stuff.irp.f   | 391 ---------
 plugins/local/tc_scf/jast_schmos_90.irp.f     | 318 --------
 plugins/local/tc_scf/plot_j_schMos.irp.f      |  69 --
 plugins/local/tc_scf/print_fit_param.irp.f    |  59 --
 plugins/local/tc_scf/print_tcscf_energy.irp.f |  55 --
 plugins/local/tc_scf/rh_tcscf_simple.irp.f    | 129 ---
 .../local/tc_scf/rotate_tcscf_orbitals.irp.f  | 369 ---------
 .../local/tc_scf/tc_petermann_factor.irp.f    |  91 ---
 plugins/local/tc_scf/tc_scf.irp.f             |  25 +-
 plugins/local/tc_scf/tc_scf_dm.irp.f          |  24 +-
 plugins/local/tc_scf/tc_scf_energy.irp.f      | 423 ++++++++++
 plugins/local/tc_scf/tcscf_energy_naive.irp.f |  80 --
 .../tc_scf/three_e_energy_bi_ortho.irp.f      | 189 -----
 .../local/tc_scf/write_ao_2e_tc_integ.irp.f   |   6 +-
 20 files changed, 502 insertions(+), 2706 deletions(-)
 delete mode 100644 plugins/local/tc_scf/fock_hermit.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_three_hermit.irp.f
 delete mode 100644 plugins/local/tc_scf/integrals_in_r_stuff.irp.f
 delete mode 100644 plugins/local/tc_scf/jast_schmos_90.irp.f
 delete mode 100644 plugins/local/tc_scf/plot_j_schMos.irp.f
 delete mode 100644 plugins/local/tc_scf/print_fit_param.irp.f
 delete mode 100644 plugins/local/tc_scf/print_tcscf_energy.irp.f
 delete mode 100644 plugins/local/tc_scf/rh_tcscf_simple.irp.f
 delete mode 100644 plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f
 delete mode 100644 plugins/local/tc_scf/tc_petermann_factor.irp.f
 delete mode 100644 plugins/local/tc_scf/tcscf_energy_naive.irp.f
 delete mode 100644 plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f

diff --git a/plugins/local/bi_ort_ints/no_dressing.irp.f b/plugins/local/bi_ort_ints/no_dressing.irp.f
index bd225274..721ac0f8 100644
--- a/plugins/local/bi_ort_ints/no_dressing.irp.f
+++ b/plugins/local/bi_ort_ints/no_dressing.irp.f
@@ -322,6 +322,12 @@ END_PROVIDER
 
 BEGIN_PROVIDER [double precision, noL_0e]
 
+  BEGIN_DOC
+  ! 
+  ! < Phi_left | L | Phi_right >
+  !
+  END_DOC 
+
   implicit none
   integer                       :: i, j, k, ipoint
   double precision              :: t0, t1
@@ -330,7 +336,6 @@ BEGIN_PROVIDER [double precision, noL_0e]
   double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
   double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
 
-
   call wall_time(t0)
   print*, " Providing noL_0e ..."
 
diff --git a/plugins/local/tc_scf/EZFIO.cfg b/plugins/local/tc_scf/EZFIO.cfg
index 510c777c..6820a8b0 100644
--- a/plugins/local/tc_scf/EZFIO.cfg
+++ b/plugins/local/tc_scf/EZFIO.cfg
@@ -9,3 +9,39 @@ doc: If |true|, tc-scf has converged
 interface: ezfio,provider,ocaml
 default: False
 
+[max_dim_diis_tcscf]
+type: integer
+doc: Maximum size of the DIIS extrapolation procedure
+interface: ezfio,provider,ocaml
+default: 15
+
+[level_shift_tcscf]
+type: Positive_float
+doc: Energy shift on the virtual MOs to improve TCSCF convergence
+interface: ezfio,provider,ocaml
+default: 0.
+
+[im_thresh_tcscf]
+type: Threshold
+doc: Thresholds on the Imag part of energy
+interface: ezfio,provider,ocaml
+default: 1.e-7
+
+[thresh_tcscf]
+type: Threshold
+doc: Threshold on the convergence of the Hartree Fock energy.
+interface: ezfio,provider,ocaml
+default: 1.e-8
+
+[n_it_tcscf_max]
+type: Strictly_positive_int
+doc: Maximum number of SCF iterations
+interface: ezfio,provider,ocaml
+default: 50
+
+[tc_Brillouin_Right]
+type: logical
+doc: If |true|, impose only right-Brillouin condition
+interface: ezfio,provider,ocaml
+default: False
+
diff --git a/plugins/local/tc_scf/fock_hermit.irp.f b/plugins/local/tc_scf/fock_hermit.irp.f
deleted file mode 100644
index 5a51b324..00000000
--- a/plugins/local/tc_scf/fock_hermit.irp.f
+++ /dev/null
@@ -1,107 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, good_hermit_tc_fock_mat, (mo_num, mo_num)]
-
-  BEGIN_DOC
-! good_hermit_tc_fock_mat = Hermitian Upper triangular Fock matrix 
-!
-! The converged eigenvectors of such matrix yield to orthonormal vectors satisfying the left Brillouin theorem
-  END_DOC
-  implicit none
-  integer :: i, j
-
-  good_hermit_tc_fock_mat = Fock_matrix_tc_mo_tot
-  do j = 1, mo_num
-    do i = 1, j-1
-      good_hermit_tc_fock_mat(i,j) = Fock_matrix_tc_mo_tot(j,i) 
-    enddo
-  enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, hermit_average_tc_fock_mat, (mo_num, mo_num)]
-
-  BEGIN_DOC
-! hermit_average_tc_fock_mat = (F + F^\dagger)/2
-  END_DOC
-  implicit none
-  integer :: i, j
-
-  hermit_average_tc_fock_mat = Fock_matrix_tc_mo_tot
-  do j = 1, mo_num
-    do i = 1, mo_num
-      hermit_average_tc_fock_mat(i,j) = 0.5d0 * (Fock_matrix_tc_mo_tot(j,i) + Fock_matrix_tc_mo_tot(i,j))
-    enddo
-  enddo
-
-END_PROVIDER 
-
-
-! ---
-BEGIN_PROVIDER [ double precision, grad_hermit]
- implicit none
- BEGIN_DOC
- ! square of gradient of the energy
- END_DOC
- if(symetric_fock_tc)then
-  grad_hermit = grad_hermit_average_tc_fock_mat
- else
-  grad_hermit = grad_good_hermit_tc_fock_mat
- endif
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, grad_good_hermit_tc_fock_mat]
-  implicit none
-  BEGIN_DOC
-  ! grad_good_hermit_tc_fock_mat = norm of gradients of the upper triangular TC fock
-  END_DOC
-  integer :: i, j
-  grad_good_hermit_tc_fock_mat = 0.d0
-  do i = 1, elec_alpha_num
-    do j = elec_alpha_num+1, mo_num
-      grad_good_hermit_tc_fock_mat += dabs(good_hermit_tc_fock_mat(i,j))
-    enddo
-  enddo
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, grad_hermit_average_tc_fock_mat]
-  implicit none
-  BEGIN_DOC
-  ! grad_hermit_average_tc_fock_mat = norm of gradients of the upper triangular TC fock
-  END_DOC
-  integer :: i, j
-  grad_hermit_average_tc_fock_mat = 0.d0
-  do i = 1, elec_alpha_num
-    do j = elec_alpha_num+1, mo_num
-      grad_hermit_average_tc_fock_mat += dabs(hermit_average_tc_fock_mat(i,j))
-    enddo
-  enddo
-END_PROVIDER 
-
-
-! ---
-
-subroutine save_good_hermit_tc_eigvectors()
-
-  implicit none
-  integer        :: sign
-  character*(64) :: label
-  logical        :: output
-
-  sign = 1
-  label = "Canonical"
-  output = .False.
-  
-  if(symetric_fock_tc)then
-   call mo_as_eigvectors_of_mo_matrix(hermit_average_tc_fock_mat, mo_num, mo_num, label, sign, output)
-  else
-   call mo_as_eigvectors_of_mo_matrix(good_hermit_tc_fock_mat, mo_num, mo_num, label, sign, output)
-  endif
-end subroutine save_good_hermit_tc_eigvectors
-
-! ---
-
diff --git a/plugins/local/tc_scf/fock_tc.irp.f b/plugins/local/tc_scf/fock_tc.irp.f
index 508f3cd7..16bb5c87 100644
--- a/plugins/local/tc_scf/fock_tc.irp.f
+++ b/plugins/local/tc_scf/fock_tc.irp.f
@@ -110,23 +110,14 @@ BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)]
   double precision              :: t0, t1, tt0, tt1
   double precision, allocatable :: tmp(:,:)
 
-  if(bi_ortho) then
+  PROVIDE mo_l_coef mo_r_coef
 
-    PROVIDE mo_l_coef mo_r_coef
-
-    call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) &
-                          , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
-
-    if(three_body_h_tc) then
-      PROVIDE fock_3e_mo_a
-      Fock_matrix_tc_mo_alpha += fock_3e_mo_a
-    endif
-
-  else
-
-    call ao_to_mo( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) &
-                 , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
+  call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) &
+                        , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
 
+  if(three_body_h_tc) then
+    PROVIDE fock_3e_mo_a
+    Fock_matrix_tc_mo_alpha += fock_3e_mo_a
   endif
 
 END_PROVIDER
@@ -142,21 +133,12 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_beta, (mo_num,mo_num) ]
   implicit none
   double precision, allocatable :: tmp(:,:)
 
-  if(bi_ortho) then
-
-    call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) &
-                          , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
-
-    if(three_body_h_tc) then
-      PROVIDE fock_3e_mo_b
-      Fock_matrix_tc_mo_beta += fock_3e_mo_b
-    endif
-
-  else
-
-    call ao_to_mo( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) &
-                 , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
+  call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) &
+                        , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
 
+  if(three_body_h_tc) then
+    PROVIDE fock_3e_mo_b
+    Fock_matrix_tc_mo_beta += fock_3e_mo_b
   endif
 
 END_PROVIDER
diff --git a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
index 2df2421e..fd490af6 100644
--- a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
+++ b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
@@ -132,7 +132,7 @@
      enddo
    endif
 
-  if(no_oa_or_av_opt)then
+  if(no_oa_or_av_opt) then
     do i = 1, n_act_orb
       iorb = list_act(i)
       do j = 1, n_inact_orb
@@ -153,8 +153,21 @@
     enddo
   endif
 
-  if(.not.bi_ortho .and. three_body_h_tc)then
-    Fock_matrix_tc_mo_tot += fock_3_mat
+  if(tc_Brillouin_Right) then
+
+    double precision, allocatable :: tmp(:,:)
+    allocate(tmp(mo_num,mo_num))
+
+    tmp = Fock_matrix_tc_mo_tot
+    do j = 1, mo_num
+      do i = 1, j-1
+        tmp(i,j) = Fock_matrix_tc_mo_tot(j,i)
+      enddo
+    enddo
+
+    Fock_matrix_tc_mo_tot = tmp
+    deallocate(tmp)
+
   endif
 
 END_PROVIDER
diff --git a/plugins/local/tc_scf/fock_three_hermit.irp.f b/plugins/local/tc_scf/fock_three_hermit.irp.f
deleted file mode 100644
index 00d47fae..00000000
--- a/plugins/local/tc_scf/fock_three_hermit.irp.f
+++ /dev/null
@@ -1,771 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, fock_3_mat, (mo_num, mo_num)] 
-
-  implicit none
-  integer :: i,j
-  double precision :: contrib
-
-  fock_3_mat = 0.d0
-  if(.not.bi_ortho .and. three_body_h_tc) then
-
-    call give_fock_ia_three_e_total(1, 1, contrib)
-    !!  !$OMP PARALLEL                  &
-    !!  !$OMP DEFAULT (NONE)            &
-    !!  !$OMP PRIVATE (i,j,m,integral) & 
-    !!  !$OMP SHARED (mo_num,three_body_3_index)
-    !!  !$OMP DO SCHEDULE (guided) COLLAPSE(3)
-    do i = 1, mo_num
-      do j = 1, mo_num
-        call give_fock_ia_three_e_total(j,i,contrib)
-        fock_3_mat(j,i) = -contrib
-       enddo
-     enddo
-    !else if(bi_ortho.and.three_body_h_tc) then
-    !!  !$OMP END DO
-    !!  !$OMP END PARALLEL
-    !!  do i = 1, mo_num
-    !!   do j = 1, i-1
-    !!    mat_three(j,i) = mat_three(i,j)
-    !!   enddo
-    !!  enddo
-  endif
-
-END_PROVIDER 
-
-
-subroutine give_fock_ia_three_e_total(i,a,contrib)
- implicit none
- BEGIN_DOC
-! contrib is the TOTAL (same spins / opposite spins) contribution from the three body term to the Fock operator 
-!
- END_DOC
- integer, intent(in) :: i,a
- double precision, intent(out) :: contrib
- double precision :: int_1, int_2, int_3
- double precision :: mos_i, mos_a, w_ia
- double precision :: mos_ia, weight
-
- integer :: mm, ipoint,k,l
-
- int_1 = 0.d0
- int_2 = 0.d0
- int_3 = 0.d0
- do mm = 1, 3
-  do ipoint = 1, n_points_final_grid
-   weight = final_weight_at_r_vector(ipoint)                                                                          
-   mos_i  = mos_in_r_array_transp(ipoint,i) 
-   mos_a  = mos_in_r_array_transp(ipoint,a) 
-   mos_ia = mos_a * mos_i
-   w_ia   = x_W_ij_erf_rk(ipoint,mm,i,a) 
-     
-   int_1  += weight * fock_3_w_kk_sum(ipoint,mm) * (4.d0 * fock_3_rho_beta(ipoint) * w_ia               & 
-                                                  + 2.0d0 * mos_ia * fock_3_w_kk_sum(ipoint,mm)         & 
-                                                  - 2.0d0 * fock_3_w_ki_mos_k(ipoint,mm,i) * mos_a      & 
-                                                  - 2.0d0 * fock_3_w_ki_mos_k(ipoint,mm,a) * mos_i      )
-   int_2  += weight * (-1.d0) * ( 2.0d0 * fock_3_w_kl_mo_k_mo_l(ipoint,mm) * w_ia                     & 
-                                + 2.0d0 * fock_3_rho_beta(ipoint) * fock_3_w_ki_wk_a(ipoint,mm,i,a)   & 
-                                + 1.0d0 * mos_ia * fock_3_trace_w_tilde(ipoint,mm)                    )
-
-   int_3  += weight *   1.d0  * (fock_3_w_kl_wla_phi_k(ipoint,mm,i) * mos_a + fock_3_w_kl_wla_phi_k(ipoint,mm,a) * mos_i & 
-                                +fock_3_w_ki_mos_k(ipoint,mm,i)     * fock_3_w_ki_mos_k(ipoint,mm,a)                     )
-  enddo
- enddo
- contrib = int_1 + int_2 + int_3
-
-end
-
-! ---
-
-BEGIN_PROVIDER [double precision, diag_three_elem_hf]
-
-  implicit none
-  integer                       :: i, j, k, ipoint, mm
-  double precision              :: contrib, weight, four_third, one_third, two_third, exchange_int_231
-  double precision              :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb
-  double precision, allocatable :: tmp(:)
-  double precision, allocatable :: tmp_L(:,:), tmp_R(:,:)
-  double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
-  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' providing diag_three_elem_hf'
-
-  if(.not. three_body_h_tc) then
-
-   if(noL_standard) then
-      PROVIDE noL_0e
-      diag_three_elem_hf = noL_0e
-    else
-      diag_three_elem_hf = 0.d0
-    endif
-
-  else
-
-    if(.not. bi_ortho) then
-
-      ! ---
-
-      one_third  = 1.d0/3.d0
-      two_third  = 2.d0/3.d0
-      four_third = 4.d0/3.d0
-      diag_three_elem_hf = 0.d0
-      do i = 1, elec_beta_num
-        do j = 1, elec_beta_num
-          do k = 1, elec_beta_num
-            call give_integrals_3_body(k, j, i, j, i, k, exchange_int_231)   
-            diag_three_elem_hf += two_third * exchange_int_231
-          enddo
-        enddo
-      enddo
-      do mm = 1, 3
-        do ipoint = 1, n_points_final_grid
-          weight  = final_weight_at_r_vector(ipoint)                                                                          
-          contrib = 3.d0 * fock_3_w_kk_sum(ipoint,mm) * fock_3_rho_beta(ipoint) * fock_3_w_kk_sum(ipoint,mm) & 
-                  - 2.d0 * fock_3_w_kl_mo_k_mo_l(ipoint,mm) * fock_3_w_kk_sum(ipoint,mm)                     & 
-                  - 1.d0 * fock_3_rho_beta(ipoint) * fock_3_w_kl_w_kl(ipoint,mm)
-          contrib *= four_third
-          contrib += -two_third  * fock_3_rho_beta(ipoint)    * fock_3_w_kl_w_kl(ipoint,mm) & 
-                     -four_third * fock_3_w_kk_sum(ipoint,mm) * fock_3_w_kl_mo_k_mo_l(ipoint,mm)
-          diag_three_elem_hf += weight * contrib
-       enddo
-      enddo
-
-      diag_three_elem_hf = - diag_three_elem_hf
-
-      ! ---
-
-    else
-
-      ! ------------
-      ! SLOW VERSION
-      ! ------------
-
-      !call give_aaa_contrib(integral_aaa)
-      !call give_aab_contrib(integral_aab)
-      !call give_abb_contrib(integral_abb)
-      !call give_bbb_contrib(integral_bbb)
-      !diag_three_elem_hf = integral_aaa + integral_aab + integral_abb + integral_bbb
-
-      ! ------------
-      ! ------------
-
-      PROVIDE int2_grad1_u12_bimo_t
-      PROVIDE mos_l_in_r_array_transp
-      PROVIDE mos_r_in_r_array_transp
-
-      if(elec_alpha_num .eq. elec_beta_num) then
-    
-        allocate(tmp(elec_beta_num))
-        allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
-    
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT(NONE)                                            &
-        !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                      &
-        !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-        !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-    
-        !$OMP DO
-        do j = 1, elec_beta_num
-    
-          tmp_L = 0.d0
-          tmp_R = 0.d0
-          do i = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          tmp(j) = 0.d0
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-        enddo ! j
-        !$OMP END DO
-        !$OMP END PARALLEL
-    
-        diag_three_elem_hf = -2.d0 * sum(tmp)
-    
-        deallocate(tmp)
-        deallocate(tmp_L, tmp_R)
-    
-        ! ---
-    
-        allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
-        tmp_O = 0.d0
-        tmp_J = 0.d0
-    
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT(NONE)                                             &
-        !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                &
-        !$OMP SHARED(elec_beta_num, n_points_final_grid,                & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,  &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
-    
-        allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
-        tmp_O_priv = 0.d0
-        tmp_J_priv = 0.d0
-      
-        !$OMP DO 
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-            tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
-            tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
-            tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_O = tmp_O + tmp_O_priv
-        tmp_J = tmp_J + tmp_J_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_O_priv, tmp_J_priv)
-        !$OMP END PARALLEL
-    
-        allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
-        tmp_M = 0.d0
-        tmp_S = 0.d0
-    
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT(NONE)                                            &
-        !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)            &
-        !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
-    
-        allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
-        tmp_M_priv = 0.d0
-        tmp_S_priv = 0.d0
-      
-        !$OMP DO COLLAPSE(2)
-        do i = 1, elec_beta_num
-          do j = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_M = tmp_M + tmp_M_priv
-        tmp_S = tmp_S + tmp_S_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_M_priv, tmp_S_priv)
-        !$OMP END PARALLEL
-    
-        allocate(tmp(n_points_final_grid))
-    
-        do ipoint = 1, n_points_final_grid
-    
-          tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
-    
-          tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                           - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                    + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                    + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
-        enddo
-    
-        diag_three_elem_hf = diag_three_elem_hf -2.d0 * (sum(tmp))
-    
-        deallocate(tmp)
-    
-      else
-    
-        allocate(tmp(elec_alpha_num))
-        allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-    
-        !$OMP DO
-        do j = 1, elec_beta_num
-    
-          tmp_L = 0.d0
-          tmp_R = 0.d0
-          do i = elec_beta_num+1, elec_alpha_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          tmp(j) = 0.d0
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-    
-          do i = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-        enddo ! j
-        !$OMP END DO
-        !$OMP END PARALLEL
-    
-        ! ---
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-    
-        !$OMP DO
-        do j = elec_beta_num+1, elec_alpha_num
-    
-          tmp_L = 0.d0
-          tmp_R = 0.d0
-          do i = 1, elec_alpha_num
-            do ipoint = 1, n_points_final_grid
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          tmp(j) = 0.d0
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + 0.5d0 * final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-        enddo ! j
-        !$OMP END DO
-        !$OMP END PARALLEL
-    
-        diag_three_elem_hf = -2.d0 * sum(tmp)
-    
-        deallocate(tmp)
-        deallocate(tmp_L, tmp_R)
-    
-        ! ---
-    
-        allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
-        tmp_O = 0.d0
-        tmp_J = 0.d0
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                 &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
-    
-        allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
-        tmp_O_priv = 0.d0
-        tmp_J_priv = 0.d0
-      
-        !$OMP DO 
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-            tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
-            tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
-            tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP DO 
-        do i = elec_beta_num+1, elec_alpha_num
-          do ipoint = 1, n_points_final_grid
-            tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + 0.5d0 * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,i)
-            tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,i)
-            tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,i)
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_O = tmp_O + tmp_O_priv
-        tmp_J = tmp_J + tmp_J_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_O_priv, tmp_J_priv)
-        !$OMP END PARALLEL
-    
-        ! ---
-    
-        allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
-        tmp_M = 0.d0
-        tmp_S = 0.d0
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)              &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
-    
-        allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
-        tmp_M_priv = 0.d0
-        tmp_S_priv = 0.d0
-      
-        !$OMP DO COLLAPSE(2)
-        do i = 1, elec_beta_num
-          do j = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP DO COLLAPSE(2)
-        do i = elec_beta_num+1, elec_alpha_num
-          do j = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP DO COLLAPSE(2)
-        do i = elec_beta_num+1, elec_alpha_num
-          do j = elec_beta_num+1, elec_alpha_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_M = tmp_M + tmp_M_priv
-        tmp_S = tmp_S + tmp_S_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_M_priv, tmp_S_priv)
-        !$OMP END PARALLEL
-    
-        allocate(tmp(n_points_final_grid))
-    
-        do ipoint = 1, n_points_final_grid
-    
-          tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
-    
-          tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                           - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                    + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                    + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
-        enddo
-    
-        diag_three_elem_hf = diag_three_elem_hf - 2.d0 * (sum(tmp))
-    
-        deallocate(tmp)
-    
-      endif
-
-
-    endif
-
-  endif
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, fock_3_mat_a_op_sh, (mo_num, mo_num)]
- implicit none 
- integer :: h,p,i,j
- double precision :: direct_int, exch_int, exchange_int_231, exchange_int_312
- double precision :: exchange_int_23, exchange_int_12, exchange_int_13 
-
- fock_3_mat_a_op_sh = 0.d0
- do h = 1, mo_num
-  do p = 1, mo_num
-   !F_a^{ab}(h,p) 
-   do i = 1, elec_beta_num ! beta 
-    do j = elec_beta_num+1, elec_alpha_num ! alpha
-     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)    ! <hji|pji>
-     call  give_integrals_3_body(h,j,i,j,p,i,exch_int)   
-     fock_3_mat_a_op_sh(h,p) -= direct_int - exch_int
-    enddo
-   enddo
-   !F_a^{aa}(h,p)
-   do i = 1, elec_beta_num ! alpha 
-    do j = elec_beta_num+1, elec_alpha_num ! alpha
-       call  give_integrals_3_body(h,j,i,p,j,i,direct_int) 
-       call  give_integrals_3_body(h,j,i,i,p,j,exchange_int_231)
-       call  give_integrals_3_body(h,j,i,j,i,p,exchange_int_312) 
-       call  give_integrals_3_body(h,j,i,p,i,j,exchange_int_23) 
-       call  give_integrals_3_body(h,j,i,i,j,p,exchange_int_12)
-       call  give_integrals_3_body(h,j,i,j,p,i,exchange_int_13)  
-       fock_3_mat_a_op_sh(h,p) -= ( direct_int + exchange_int_231 + exchange_int_312 & 
-              -  exchange_int_23 & ! i <-> j
-              -  exchange_int_12 & ! p <-> j
-              -  exchange_int_13  )! p <-> i
-    enddo 
-   enddo
-  enddo
- enddo
-! symmetrized 
-! do p = 1, elec_beta_num
-!  do h = elec_alpha_num +1, mo_num
-!   fock_3_mat_a_op_sh(h,p) = fock_3_mat_a_op_sh(p,h)
-!  enddo
-! enddo
- 
-! do h = elec_beta_num+1, elec_alpha_num
-!  do p = elec_alpha_num +1, mo_num
-!   !F_a^{bb}(h,p) 
-!   do i = 1, elec_beta_num
-!    do j = i+1, elec_beta_num
-!     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)   
-!     call  give_integrals_3_body(h,j,i,p,i,j,exch_int)   
-!     fock_3_mat_a_op_sh(h,p) -= direct_int - exch_int
-!    enddo
-!   enddo
-!  enddo
-! enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_mat_b_op_sh, (mo_num, mo_num)]
- implicit none 
- integer :: h,p,i,j
- double precision :: direct_int, exch_int
- fock_3_mat_b_op_sh = 0.d0
- do h = 1, elec_beta_num
-  do p = elec_alpha_num +1, mo_num
-   !F_b^{aa}(h,p) 
-   do i = 1, elec_beta_num
-    do j = elec_beta_num+1, elec_alpha_num
-     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)   
-     call  give_integrals_3_body(h,j,i,p,i,j,exch_int)   
-     fock_3_mat_b_op_sh(h,p) += direct_int - exch_int
-    enddo
-   enddo
-
-   !F_b^{ab}(h,p) 
-   do i = elec_beta_num+1, elec_beta_num
-    do j = 1, elec_beta_num
-     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)   
-     call  give_integrals_3_body(h,j,i,j,p,i,exch_int)   
-     fock_3_mat_b_op_sh(h,p) += direct_int - exch_int
-    enddo
-   enddo
- 
-  enddo
- enddo
-
-END_PROVIDER 
-
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kk_sum, (n_points_final_grid,3)]
- implicit none
- integer :: mm, ipoint,k
- double precision :: w_kk
- fock_3_w_kk_sum = 0.d0
- do k = 1, elec_beta_num
-  do mm = 1, 3
-   do ipoint = 1, n_points_final_grid
-    w_kk   = x_W_ij_erf_rk(ipoint,mm,k,k) 
-    fock_3_w_kk_sum(ipoint,mm) += w_kk
-   enddo
-  enddo
- enddo
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_ki_mos_k, (n_points_final_grid,3,mo_num)]
- implicit none
- integer :: mm, ipoint,k,i
- double precision :: w_ki, mo_k
- fock_3_w_ki_mos_k = 0.d0
- do i = 1, mo_num
-  do k = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     w_ki   = x_W_ij_erf_rk(ipoint,mm,k,i) 
-     mo_k = mos_in_r_array(k,ipoint)
-     fock_3_w_ki_mos_k(ipoint,mm,i) += w_ki * mo_k
-    enddo
-   enddo
-  enddo
- enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kl_w_kl, (n_points_final_grid,3)]
- implicit none
- integer :: k,j,ipoint,mm
- double precision :: w_kj
- fock_3_w_kl_w_kl = 0.d0
- do j = 1, elec_beta_num
-  do k = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     w_kj   = x_W_ij_erf_rk(ipoint,mm,k,j) 
-     fock_3_w_kl_w_kl(ipoint,mm) += w_kj * w_kj
-    enddo
-   enddo
-  enddo
- enddo
-
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_rho_beta, (n_points_final_grid)]
- implicit none
- integer :: ipoint,k
- fock_3_rho_beta = 0.d0
- do ipoint = 1, n_points_final_grid
-  do k = 1, elec_beta_num
-   fock_3_rho_beta(ipoint) += mos_in_r_array(k,ipoint) * mos_in_r_array(k,ipoint)
-  enddo
- enddo
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kl_mo_k_mo_l, (n_points_final_grid,3)]
- implicit none
- integer :: ipoint,k,l,mm
- double precision :: mos_k, mos_l, w_kl
- fock_3_w_kl_mo_k_mo_l = 0.d0
- do k = 1, elec_beta_num
-  do l = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     mos_k  = mos_in_r_array_transp(ipoint,k) 
-     mos_l  = mos_in_r_array_transp(ipoint,l) 
-     w_kl   = x_W_ij_erf_rk(ipoint,mm,l,k)
-     fock_3_w_kl_mo_k_mo_l(ipoint,mm) += w_kl * mos_k * mos_l 
-    enddo
-   enddo
-  enddo
- enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_ki_wk_a, (n_points_final_grid,3,mo_num, mo_num)]
- implicit none
- integer :: ipoint,i,a,k,mm
- double precision :: w_ki,w_ka
- fock_3_w_ki_wk_a = 0.d0
- do i = 1, mo_num
-  do a = 1, mo_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     do k = 1, elec_beta_num
-      w_ki   = x_W_ij_erf_rk(ipoint,mm,k,i)
-      w_ka   = x_W_ij_erf_rk(ipoint,mm,k,a)
-      fock_3_w_ki_wk_a(ipoint,mm,a,i) += w_ki * w_ka
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_trace_w_tilde, (n_points_final_grid,3)]
- implicit none
- integer :: ipoint,k,mm
- fock_3_trace_w_tilde = 0.d0
- do k = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     fock_3_trace_w_tilde(ipoint,mm) += fock_3_w_ki_wk_a(ipoint,mm,k,k)
-    enddo
-   enddo
- enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kl_wla_phi_k, (n_points_final_grid,3,mo_num)]
- implicit none
- integer :: ipoint,a,k,mm,l
- double precision :: w_kl,w_la, mo_k
- fock_3_w_kl_wla_phi_k = 0.d0
- do a = 1, mo_num
-  do k = 1, elec_beta_num 
-   do l = 1, elec_beta_num
-    do mm = 1, 3
-     do ipoint = 1, n_points_final_grid
-      w_kl   = x_W_ij_erf_rk(ipoint,mm,l,k)
-      w_la   = x_W_ij_erf_rk(ipoint,mm,l,a)
-      mo_k  = mos_in_r_array_transp(ipoint,k) 
-      fock_3_w_kl_wla_phi_k(ipoint,mm,a) += w_kl * w_la * mo_k
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
-END_PROVIDER 
-
-
-
-
-
diff --git a/plugins/local/tc_scf/integrals_in_r_stuff.irp.f b/plugins/local/tc_scf/integrals_in_r_stuff.irp.f
deleted file mode 100644
index 3ce85a97..00000000
--- a/plugins/local/tc_scf/integrals_in_r_stuff.irp.f
+++ /dev/null
@@ -1,391 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, tc_scf_dm_in_r, (n_points_final_grid) ]
-
-  implicit none
-  integer :: i, j
-
-  tc_scf_dm_in_r = 0.d0
-  do i = 1, n_points_final_grid
-    do j = 1, elec_beta_num
-      tc_scf_dm_in_r(i) += mos_r_in_r_array(j,i) * mos_l_in_r_array(j,i)
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, w_sum_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: ipoint, j, xi
-
-  w_sum_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        !w_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,j)
-        w_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j)
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, ww_sum_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer          :: ipoint, j, xi
-  double precision :: tmp
-
-  ww_sum_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        tmp = x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j)
-        ww_sum_in_r(ipoint,xi) += tmp * tmp
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_r_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_r_in_r = 0.d0
-  do i = 1, mo_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_r_in_r(ipoint,xi,i) += mos_r_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_l_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_l_in_r = 0.d0
-  do i = 1, mo_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_l_in_r(ipoint,xi,i) += mos_l_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        W1_in_r(ipoint,xi) += W1_l_in_r(ipoint,xi,j) * mos_r_in_r_array_transp(ipoint,j)
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_diag_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_diag_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        W1_diag_in_r(ipoint,xi) += mos_r_in_r_array_transp(ipoint,j) * mos_l_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j)
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, v_sum_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-  v_sum_in_r = 0.d0
-  do i = 1, elec_beta_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          v_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_W1_r_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, m, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_W1_r_in_r = 0.d0
-  do i = 1, mo_num
-    do m = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_W1_r_in_r(ipoint,xi,i) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,m,i) * W1_r_in_r(ipoint,xi,m)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_W1_l_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_W1_l_in_r = 0.d0
-  do i = 1, mo_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_W1_l_in_r(ipoint,xi,i) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) * W1_l_in_r(ipoint,xi,j)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-subroutine direct_term_imj_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | i m j > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight, tmp
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-      !integral += ( mos_l_in_r_array(a,ipoint) * mos_r_in_r_array(i,ipoint) * w_sum_in_r(ipoint,xi) * w_sum_in_r(ipoint,xi) & 
-      !            + 2.d0 * tc_scf_dm_in_r(ipoint) * w_sum_in_r(ipoint,xi) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) ) * weight
-
-      tmp = w_sum_in_r(ipoint,xi)
-
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * tmp * tmp & 
-                  + 2.d0 * tc_scf_dm_in_r(ipoint) * tmp * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i)       &
-                  ) * weight
-    enddo
-  enddo
-
-end 
-
-! ---
-
-subroutine exch_term_jmi_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | j m i > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi, j
-  double precision              :: weight, tmp
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      tmp = 0.d0
-      do j = 1, elec_beta_num
-        tmp = tmp + x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i) 
-      enddo
-
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_r_in_r(ipoint,xi,i) * w_sum_in_r(ipoint,xi) & 
-                  + tc_scf_dm_in_r(ipoint) * tmp                                                       &
-                  + mos_r_in_r_array_transp(ipoint,i) * W1_l_in_r(ipoint,xi,a) * w_sum_in_r(ipoint,xi) & 
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
-subroutine exch_term_ijm_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | i j m > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * v_sum_in_r(ipoint,xi) & 
-                  + 2.d0 * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) * W1_in_r(ipoint,xi)                             &
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
-subroutine direct_term_ijj_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j = 1, elec_beta_num) < a j j | i j j > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * ww_sum_in_r(ipoint,xi) & 
-                  + 2.d0 * W1_diag_in_r(ipoint, xi) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i)                        &
-                  ) * weight
-    enddo
-  enddo
-
-end 
-
-! ---
-
-subroutine cyclic_term_jim_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | j i m > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_W1_r_in_r(ipoint,xi,i)  & 
-                  + W1_W1_l_in_r(ipoint,xi,a) * mos_r_in_r_array_transp(ipoint,i)  &
-                  + W1_l_in_r(ipoint,xi,a) * W1_r_in_r(ipoint,xi,i)                &
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
-subroutine cyclic_term_mji_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | m j i > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_W1_r_in_r(ipoint,xi,i)  & 
-                  + W1_l_in_r(ipoint,xi,a) * W1_r_in_r(ipoint,xi,i)                &
-                  + W1_W1_l_in_r(ipoint,xi,a) * mos_r_in_r_array_transp(ipoint,i)  &
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
diff --git a/plugins/local/tc_scf/jast_schmos_90.irp.f b/plugins/local/tc_scf/jast_schmos_90.irp.f
deleted file mode 100644
index 5c5e625f..00000000
--- a/plugins/local/tc_scf/jast_schmos_90.irp.f
+++ /dev/null
@@ -1,318 +0,0 @@
- BEGIN_PROVIDER [integer ,  m_max_sm_7]
-&BEGIN_PROVIDER [integer ,  n_max_sm_7]
-&BEGIN_PROVIDER [integer ,  o_max_sm_7]
- implicit none
- BEGIN_DOC
-! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) 
-! of Schmidt,Moskowitz, JCP, 93, 4172 (1990)  for the SM_7 version of Table IV
- END_DOC
- m_max_sm_7 = 4
- n_max_sm_7 = 0
- o_max_sm_7 = 4
-END_PROVIDER 
-
- BEGIN_PROVIDER [integer ,  m_max_sm_9]
-&BEGIN_PROVIDER [integer ,  n_max_sm_9]
-&BEGIN_PROVIDER [integer ,  o_max_sm_9]
- implicit none
- BEGIN_DOC
-! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) 
-! of Schmidt,Moskowitz, JCP, 93, 4172 (1990)  for the SM_9 version of Table IV
- END_DOC
- m_max_sm_9 = 4
- n_max_sm_9 = 2
- o_max_sm_9 = 4
-END_PROVIDER 
-
-
- BEGIN_PROVIDER [integer ,  m_max_sm_17]
-&BEGIN_PROVIDER [integer ,  n_max_sm_17]
-&BEGIN_PROVIDER [integer ,  o_max_sm_17]
- implicit none
- BEGIN_DOC
-! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) 
-! of Schmidt,Moskowitz, JCP, 93, 4172 (1990)  for the SM_17 version of Table IV
- END_DOC
- m_max_sm_17 = 6
- n_max_sm_17 = 2
- o_max_sm_17 = 6
-END_PROVIDER 
-
-
-BEGIN_PROVIDER [ double precision, c_mn_o_sm_7, (0:m_max_sm_7,0:n_max_sm_7,0:o_max_sm_7,2:10)]
- implicit none
- BEGIN_DOC
- !
- !c_mn_o_7(0:4,0:4,2:10) = coefficient for the SM_7 correlation factor as given is Table IV of 
- !                         Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- !                         the first index (0:4) is the "m"  integer for the 1e part 
- !                         the second index(0:0) is the "n"  integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_7
- !                         the third index (0:4) is the "o" integer for the 2e part
- !                         the fourth index (2:10) is the nuclear charge of the atom 
- END_DOC
- c_mn_o_sm_7 = 0.d0
- integer :: i
- do i = 2, 10 ! loop over nuclear charge
-  c_mn_o_sm_7(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition
- enddo
- ! He atom 
- ! two electron terms 
- c_mn_o_sm_7(0,0,2,2) =  0.50516d0 
- c_mn_o_sm_7(0,0,3,2) = -0.19313d0 
- c_mn_o_sm_7(0,0,4,2) =  0.30276d0 
- ! one-electron terms 
- c_mn_o_sm_7(2,0,0,2) = -0.16995d0
- c_mn_o_sm_7(3,0,0,2) = -0.34505d0
- c_mn_o_sm_7(4,0,0,2) = -0.54777d0
- ! Ne atom 
- ! two electron terms 
- c_mn_o_sm_7(0,0,2,10) = -0.792d0
- c_mn_o_sm_7(0,0,3,10) =  1.05232d0 
- c_mn_o_sm_7(0,0,4,10) = -0.65615d0 
- ! one-electron terms 
- c_mn_o_sm_7(2,0,0,10) = -0.13312d0
- c_mn_o_sm_7(3,0,0,10) = -0.00131d0
- c_mn_o_sm_7(4,0,0,10) =  0.09083d0
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, c_mn_o_sm_9, (0:m_max_sm_9,0:n_max_sm_9,0:o_max_sm_9,2:10)]
- implicit none
- BEGIN_DOC
- !
- !c_mn_o_9(0:4,0:4,2:10) = coefficient for the SM_9 correlation factor as given is Table IV of 
- !                         Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- !                         the first index (0:4) is the "m"  integer for the 1e part 
- !                         the second index(0:0) is the "n"  integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_9
- !                         the third index (0:4) is the "o" integer for the 2e part
- !                         the fourth index (2:10) is the nuclear charge of the atom 
- END_DOC
- c_mn_o_sm_9 = 0.d0
- integer :: i
- do i = 2, 10 ! loop over nuclear charge
-  c_mn_o_sm_9(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition
- enddo
- ! He atom 
- ! two electron terms 
- c_mn_o_sm_9(0,0,2,2) =  0.50516d0 
- c_mn_o_sm_9(0,0,3,2) = -0.19313d0 
- c_mn_o_sm_9(0,0,4,2) =  0.30276d0 
- ! one-electron terms 
- c_mn_o_sm_9(2,0,0,2) = -0.16995d0
- c_mn_o_sm_9(3,0,0,2) = -0.34505d0
- c_mn_o_sm_9(4,0,0,2) = -0.54777d0
- ! Ne atom 
- ! two electron terms 
- c_mn_o_sm_9(0,0,2,10) = -0.792d0
- c_mn_o_sm_9(0,0,3,10) =  1.05232d0 
- c_mn_o_sm_9(0,0,4,10) = -0.65615d0 
- ! one-electron terms 
- c_mn_o_sm_9(2,0,0,10) = -0.13312d0
- c_mn_o_sm_9(3,0,0,10) = -0.00131d0
- c_mn_o_sm_9(4,0,0,10) =  0.09083d0
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, c_mn_o_sm_17, (0:m_max_sm_17,0:n_max_sm_17,0:o_max_sm_17,2:10)]
- implicit none
- BEGIN_DOC
- !
- !c_mn_o_17(0:4,0:4,2:10) = coefficient for the SM_17 correlation factor as given is Table IV of 
- !                         Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- !                         the first index (0:4) is the "m"  integer for the 1e part 
- !                         the second index(0:0) is the "n"  integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_17
- !                         the third index (0:4) is the "o" integer for the 2e part
- !                         the fourth index (2:10) is the nuclear charge of the atom 
- END_DOC
- c_mn_o_sm_17 = 0.d0
- integer :: i
- do i = 2, 10 ! loop over nuclear charge
-  c_mn_o_sm_17(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition
- enddo
- ! He atom 
- ! two electron terms 
- c_mn_o_sm_17(0,0,2,2) =  0.09239d0 
- c_mn_o_sm_17(0,0,3,2) = -0.38664d0 
- c_mn_o_sm_17(0,0,4,2) =  0.95764d0 
- ! one-electron terms 
- c_mn_o_sm_17(2,0,0,2) =  0.23208d0
- c_mn_o_sm_17(3,0,0,2) = -0.45032d0
- c_mn_o_sm_17(4,0,0,2) =  0.82777d0
- c_mn_o_sm_17(2,2,0,2) = -4.15388d0
- ! ee-n terms 
- c_mn_o_sm_17(2,0,2,2) =  0.80622d0
- c_mn_o_sm_17(2,2,2,2) = 10.19704d0
- c_mn_o_sm_17(4,0,2,2) = -4.96259d0
- c_mn_o_sm_17(2,0,4,2) = -1.35647d0
- c_mn_o_sm_17(4,2,2,2) = -5.90907d0
- c_mn_o_sm_17(6,0,2,2) =  0.90343d0
- c_mn_o_sm_17(4,0,4,2) =  5.50739d0
- c_mn_o_sm_17(2,2,4,2) = -0.03154d0
- c_mn_o_sm_17(2,0,6,2) = -1.1051860
-
-
- ! Ne atom 
- ! two electron terms 
- c_mn_o_sm_17(0,0,2,10) = -0.80909d0 
- c_mn_o_sm_17(0,0,3,10) = -0.00219d0 
- c_mn_o_sm_17(0,0,4,10) =  0.59188d0
- ! one-electron terms 
- c_mn_o_sm_17(2,0,0,10) = -0.00567d0
- c_mn_o_sm_17(3,0,0,10) =  0.14011d0
- c_mn_o_sm_17(4,0,0,10) = -0.05671d0
- c_mn_o_sm_17(2,2,0,10) = -3.33767d0
- ! ee-n terms 
- c_mn_o_sm_17(2,0,2,10) =  1.95067d0
- c_mn_o_sm_17(2,2,2,10) =  6.83340d0
- c_mn_o_sm_17(4,0,2,10) = -3.29231d0
- c_mn_o_sm_17(2,0,4,10) = -2.44998d0
- c_mn_o_sm_17(4,2,2,10) = -2.13029d0
- c_mn_o_sm_17(6,0,2,10) =  2.25768d0
- c_mn_o_sm_17(4,0,4,10) =  1.97951d0
- c_mn_o_sm_17(2,2,4,10) = -2.0924160
- c_mn_o_sm_17(2,0,6,10) =  0.35493d0
-
-END_PROVIDER 
-
- BEGIN_PROVIDER [ double precision, b_I_sm_90,(2:10)]
-&BEGIN_PROVIDER [ double precision, d_I_sm_90,(2:10)]
- implicit none
- BEGIN_DOC
-! "b_I" and "d_I" parameters of Eqs. (4) and (5) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- END_DOC
- b_I_sm_90 = 1.d0
- d_I_sm_90 = 1.d0
- 
-END_PROVIDER 
-
-subroutine get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
- implicit none 
- double precision, intent(in) :: r1(3),r2(3),rI(3)
- integer, intent(in)          :: sm_j, i_charge 
- double precision, intent(out):: j_1e,j_2e,j_een,j_tot
- BEGIN_DOC
- ! Jastrow function as in Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- ! the i_charge variable is the integer specifying the charge of the atom for the Jastrow 
- ! the sm_j integer variable represents the "quality" of the jastrow : sm_j = 7, 9, 17
- END_DOC 
- double precision :: r_inucl,r_jnucl,r_ij,b_I, d_I
- b_I = b_I_sm_90(i_charge)
- d_I = d_I_sm_90(i_charge)
- call get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij)
- call jastrow_func_sm_90(r_inucl,r_jnucl,r_ij,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-end
-
-subroutine get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij)
- implicit none
- BEGIN_DOC
- ! rescaled variables of Eq. (5) and (6) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- ! the "b_I" and "d_I" parameters are the same as in Eqs. (5) and (6) 
- END_DOC
- double precision, intent(in) :: r1(3),r2(3),rI(3)
- double precision, intent(in) :: b_I, d_I
- double precision, intent(out):: r_inucl,r_jnucl,r_ij
- double precision :: rin, rjn, rij
- integer :: i
- rin = 0.d0
- rjn = 0.d0
- rij = 0.d0
- do i = 1,3
-  rin += (r1(i) - rI(i)) * (r1(i) - rI(i))
-  rjn += (r2(i) - rI(i)) * (r2(i) - rI(i))
-  rij += (r2(i) - r1(i)) * (r2(i) - r1(i))
- enddo
- rin = dsqrt(rin)
- rjn = dsqrt(rjn)
- rij = dsqrt(rij)
- r_inucl = b_I * rin/(1.d0 + b_I * rin)
- r_jnucl = b_I * rjn/(1.d0 + b_I * rjn)
- r_ij    = d_I * rij/(1.d0 + b_I * rij)
-end
-
-subroutine jastrow_func_sm_90(r_inucl,r_jnucl,r_ij,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
- implicit none
- BEGIN_DOC
- ! Jastrow function as in Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- ! Here the r_inucl, r_jnucl are the rescaled variables as defined in Eq. (5) with "b_I" 
- !          r_ij is the rescaled variable as defined in Eq. (6) with "d_I" 
- ! the i_charge variable is the integer specifying the charge of the atom for the Jastrow 
- ! the sm_j integer variable represents the "quality" of the jastrow : sm_j = 7, 9, 17
- !
- ! it returns the j_1e  : sum of terms with "o" = "n" = 0, "m" /= 0,
- !                j_2e  : sum of terms with "m" = "n" = 0, "o" /= 0,
- !                j_een : sum of terms with "m" /=0, "n" /= 0, "o" /= 0,
- !                j_tot : the total sum 
- END_DOC
- double precision, intent(in) :: r_inucl,r_jnucl,r_ij
- integer, intent(in)          :: sm_j,i_charge
- double precision, intent(out):: j_1e,j_2e,j_een,j_tot
- j_1e  = 0.D0
- j_2e  = 0.D0
- j_een = 0.D0
- double precision :: delta_mn,jastrow_sm_90_atomic
- integer :: m,n,o
-BEGIN_TEMPLATE
- ! pure 2e part 
- n = 0
- m = 0 
- if(sm_j == $X )then
-  do o = 1, o_max_sm_$X
-   if(dabs(c_mn_o_sm_$X(m,n,o,i_charge)).lt.1.d-10)cycle 
-   j_2e += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
-  enddo
-! else 
-!  print*,'sm_j = ',sm_j
-!  print*,'not implemented, stop'
-!  stop
- endif
- ! pure one-e part 
- o = 0 
- if(sm_j == $X)then
-  do n = 2, n_max_sm_$X
-   do m = 2, m_max_sm_$X
-    j_1e += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
-   enddo
-  enddo
-! else 
-!  print*,'sm_j = ',sm_j
-!  print*,'not implemented, stop'
-!  stop
- endif
- ! e-e-n part 
- if(sm_j == $X)then
-  do o = 1, o_max_sm_$X
-   do m = 2, m_max_sm_$X
-    do n = 2, n_max_sm_$X
-     j_een += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
-    enddo
-   enddo
-  enddo
- else 
-!  print*,'sm_j = ',sm_j
-!  print*,'not implemented, stop'
-!  stop
- endif
- j_tot = j_1e + j_2e + j_een
-SUBST [ X]
-  7 ;; 
-  9 ;; 
-  17 ;;
-END_TEMPLATE 
-end
-
-double precision function jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
- implicit none
- BEGIN_DOC
-! contribution to the function of Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
-! for a given m,n,o and atom 
- END_DOC
- double precision, intent(in) :: r_inucl,r_jnucl,r_ij
- integer         , intent(in) :: m,n,o,i_charge
- double precision :: delta_mn 
- if(m==n)then
-  delta_mn = 0.5d0
- else
-  delta_mn = 1.D0
- endif
- jastrow_sm_90_atomic = delta_mn * (r_inucl**m * r_jnucl**n + r_jnucl**m * r_inucl**n)*r_ij**o
-end
diff --git a/plugins/local/tc_scf/plot_j_schMos.irp.f b/plugins/local/tc_scf/plot_j_schMos.irp.f
deleted file mode 100644
index eda0dd25..00000000
--- a/plugins/local/tc_scf/plot_j_schMos.irp.f
+++ /dev/null
@@ -1,69 +0,0 @@
-program plot_j
- implicit none
- double precision :: r1(3),rI(3),r2(3)
- double precision :: r12,dx,xmax, j_1e,j_2e,j_een,j_tot
- double precision :: j_mu_F_x_j
- integer :: i,nx,m,i_charge,sm_j
-
- character*(128) :: output
- integer :: i_unit_output_He_sm_7,i_unit_output_Ne_sm_7
- integer :: i_unit_output_He_sm_17,i_unit_output_Ne_sm_17
- integer :: getUnitAndOpen
- output='J_SM_7_He'
- i_unit_output_He_sm_7 = getUnitAndOpen(output,'w')
- output='J_SM_7_Ne'
- i_unit_output_Ne_sm_7 = getUnitAndOpen(output,'w')
-
- output='J_SM_17_He'
- i_unit_output_He_sm_17 = getUnitAndOpen(output,'w')
- output='J_SM_17_Ne'
- i_unit_output_Ne_sm_17 = getUnitAndOpen(output,'w')
-
- rI = 0.d0
- r1 = 0.d0
- r2 = 0.d0
- r1(1) = 1.5d0
- xmax = 20.d0
- r2(1) = -xmax*0.5d0
- nx = 1000
- dx = xmax/dble(nx)
- do i = 1, nx
-  r12 = 0.d0
-  do m = 1, 3
-   r12 += (r1(m) - r2(m))*(r1(m) - r2(m))
-  enddo
-  r12 = dsqrt(r12)
-  double precision :: jmu,env_nucl,jmu_env,jmu_scaled, jmu_scaled_env
-  double precision :: b_I,d_I,r_inucl,r_jnucl,r_ij
-  b_I = 1.D0
-  d_I = 1.D0
-  call get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij)
-  jmu=j_mu_F_x_j(r12)
-  jmu_scaled=j_mu_F_x_j(r_ij)
-  jmu_env = jmu * env_nucl(r1) * env_nucl(r2)
-!  jmu_scaled_env= jmu_scaled * (1.d0 - env_coef(1) * dexp(-env_expo(1)*r_inucl**2)) * (1.d0 - env_coef(1) * dexp(-env_expo(1)*r_jnucl**2))
-  jmu_scaled_env= jmu_scaled * env_nucl(r1) * env_nucl(r2)
-  ! He 
-  i_charge = 2
-  ! SM 7 Jastrow 
-  sm_j = 7
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_He_sm_7,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  ! SM 17 Jastrow 
-  sm_j = 17
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_He_sm_17,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  ! Ne 
-  i_charge = 10
-  ! SM 7 Jastrow 
-  sm_j = 7
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_Ne_sm_7,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  ! SM 17 Jastrow 
-  sm_j = 17
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_Ne_sm_17,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  r2(1) += dx
- enddo
- 
-end
diff --git a/plugins/local/tc_scf/print_fit_param.irp.f b/plugins/local/tc_scf/print_fit_param.irp.f
deleted file mode 100644
index e62f0dde..00000000
--- a/plugins/local/tc_scf/print_fit_param.irp.f
+++ /dev/null
@@ -1,59 +0,0 @@
-program print_fit_param
-
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  !call create_guess
-  !call orthonormalize_mos
-
-  call main()
-
-end
-
-! ---
-
-subroutine main()
-
-  implicit none
-  integer :: i
-
-  mu_erf = 1.d0
-  touch mu_erf
-
-  print *, ' fit for (1 - erf(x))^2'
-  do i = 1, n_max_fit_slat
-    print*, expo_gauss_1_erf_x_2(i), coef_gauss_1_erf_x_2(i)
-  enddo
-
-  print *, ''
-  print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)]'
-  do i = 1, n_max_fit_slat
-    print *, expo_gauss_j_mu_x(i), 2.d0 * coef_gauss_j_mu_x(i)
-  enddo
-
-  print *, ''
-  print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)]^2'
-  do i = 1, n_max_fit_slat
-    print *, expo_gauss_j_mu_x_2(i), 4.d0 * coef_gauss_j_mu_x_2(i)
-  enddo
-
-  print *, ''
-  print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)] x [1 - erf(mu * r12)]'
-  do i = 1, n_max_fit_slat
-    print *, expo_gauss_j_mu_1_erf(i), 4.d0 * coef_gauss_j_mu_1_erf(i)
-  enddo
-
-  return
-end subroutine main
-
-! ---
-
diff --git a/plugins/local/tc_scf/print_tcscf_energy.irp.f b/plugins/local/tc_scf/print_tcscf_energy.irp.f
deleted file mode 100644
index 6f9afd9a..00000000
--- a/plugins/local/tc_scf/print_tcscf_energy.irp.f
+++ /dev/null
@@ -1,55 +0,0 @@
-program print_tcscf_energy
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  print *, 'Hello world'
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  call main()
-
-end
-
-! ---
-
-subroutine main()
-
-  implicit none
-  double precision :: etc_tot, etc_1e, etc_2e, etc_3e
-
-  PROVIDE j2e_type mu_erf
-  PROVIDE j1e_type j1e_coef j1e_expo
-  PROVIDE env_type env_coef env_expo
-
-  print*, ' j2e_type = ', j2e_type
-  print*, ' j1e_type = ', j1e_type
-  print*, ' env_type = ', env_type
-
-  print*, ' mu_erf   = ', mu_erf
-
-  etc_tot = TC_HF_energy
-  etc_1e  = TC_HF_one_e_energy
-  etc_2e  = TC_HF_two_e_energy
-  etc_3e  = 0.d0
-  if(three_body_h_tc) then
-    !etc_3e = diag_three_elem_hf
-    etc_3e = tcscf_energy_3e_naive
-  endif
-
-  print *, " E_TC = ", etc_tot
-  print *, " E_1e = ", etc_1e
-  print *, " E_2e = ", etc_2e
-  print *, " E_3e = ", etc_3e
-
-  return
-end subroutine main
-
-! ---
-
diff --git a/plugins/local/tc_scf/rh_tcscf_simple.irp.f b/plugins/local/tc_scf/rh_tcscf_simple.irp.f
deleted file mode 100644
index 2c2cf2c2..00000000
--- a/plugins/local/tc_scf/rh_tcscf_simple.irp.f
+++ /dev/null
@@ -1,129 +0,0 @@
-! ---
-
-subroutine rh_tcscf_simple()
-
-  implicit none
-  integer                       :: i, j, it, dim_DIIS
-  double precision              :: t0, t1
-  double precision              :: e_save, e_delta, rho_delta
-  double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, tc_grad
-  double precision              :: er_DIIS
-  double precision, allocatable :: rho_old(:,:), rho_new(:,:)
-
-  allocate(rho_old(ao_num,ao_num), rho_new(ao_num,ao_num))
-
-  it       = 0
-  e_save   = 0.d0
-  dim_DIIS = 0
-
-  ! ---
-
-  if(.not. bi_ortho) then
-   print *, ' grad_hermit = ', grad_hermit
-   call save_good_hermit_tc_eigvectors
-   TOUCH mo_coef 
-   call save_mos
-  endif
-
-  ! ---
-
-  if(bi_ortho) then
-
-    PROVIDE level_shift_tcscf
-    PROVIDE mo_l_coef mo_r_coef
-
-    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
-      '====', '================', '================', '================', '================', '================' &
-            , '================', '================', '================', '====', '========'
-
-    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
-      ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
-            , '    gradient    ', '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
-
-    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
-      '====', '================', '================', '================', '================', '================' &
-            , '================', '================', '================', '====', '========'
-
-
-    ! first iteration (HF orbitals)
-    call wall_time(t0)
-
-    etc_tot = TC_HF_energy
-    etc_1e  = TC_HF_one_e_energy
-    etc_2e  = TC_HF_two_e_energy
-    etc_3e  = 0.d0
-    if(three_body_h_tc) then
-      etc_3e = diag_three_elem_hf
-    endif
-    tc_grad = grad_non_hermit
-    er_DIIS = maxval(abs(FQS_SQF_mo))
-    e_delta = dabs(etc_tot - e_save)
-    e_save  = etc_tot
-
-    call wall_time(t1)
-    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
-      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
-
-    do while(tc_grad .gt. dsqrt(thresh_tcscf))
-      call wall_time(t0)
-
-      it += 1
-      if(it > n_it_tcscf_max) then
-        print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
-        stop
-      endif
-
-      mo_l_coef = fock_tc_leigvec_ao
-      mo_r_coef = fock_tc_reigvec_ao
-      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-      TOUCH mo_l_coef mo_r_coef
-
-      etc_tot = TC_HF_energy
-      etc_1e  = TC_HF_one_e_energy
-      etc_2e  = TC_HF_two_e_energy
-      etc_3e  = 0.d0
-      if(three_body_h_tc) then
-        etc_3e = diag_three_elem_hf
-      endif
-      tc_grad = grad_non_hermit
-      er_DIIS = maxval(abs(FQS_SQF_mo))
-      e_delta = dabs(etc_tot - e_save)
-      e_save  = etc_tot
-
-      call ezfio_set_tc_scf_tcscf_energy(etc_tot)
-
-      call wall_time(t1)
-      write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
-        it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
-    enddo
-
-  else
-
-   do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. (it.lt.n_it_tcscf_max) )
-      print*,'grad_hermit = ',grad_hermit
-      it += 1
-      print *, 'iteration = ', it
-      print *, '***'
-      print *, 'TC HF total energy = ', TC_HF_energy
-      print *, 'TC HF 1 e   energy = ', TC_HF_one_e_energy
-      print *, 'TC HF 2 e   energy = ', TC_HF_two_e_energy
-      print *, 'TC HF 3 body       = ', diag_three_elem_hf
-      print *, '***'
-      print *, ''
-      call save_good_hermit_tc_eigvectors
-      TOUCH mo_coef 
-      call save_mos
-    enddo
-
-  endif
-
-  print *, ' TCSCF Simple converged !'
-  !call print_energy_and_mos(good_angles)
-
-  deallocate(rho_old, rho_new)
-
-end
-
-! ---
-
diff --git a/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f b/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f
deleted file mode 100644
index 0f2663e5..00000000
--- a/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f
+++ /dev/null
@@ -1,369 +0,0 @@
-
-! ---
-
-program rotate_tcscf_orbitals
-
-  BEGIN_DOC
-  ! TODO : Rotate the bi-orthonormal orbitals in order to minimize left-right angles when degenerate
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  bi_ortho = .True.
-  touch bi_ortho
-
-  call minimize_tc_orb_angles()
-  !call maximize_overlap()
-
-end
-
-! ---
-
-subroutine maximize_overlap()
-
-  implicit none
-  integer                       :: i, m, n
-  double precision              :: accu_d, accu_nd
-  double precision, allocatable :: C(:,:), R(:,:), L(:,:), W(:,:), e(:)
-  double precision, allocatable :: S(:,:)
-
-  n = ao_num
-  m = mo_num
-
-  allocate(L(n,m), R(n,m), C(n,m), W(n,n), e(m))
-  L = mo_l_coef
-  R = mo_r_coef
-  C = mo_coef
-  W = ao_overlap
-
-  print*, ' fock matrix diag elements'
-  do i = 1, m
-    e(i) = Fock_matrix_tc_mo_tot(i,i)
-    print*, e(i)
-  enddo
-
-  ! ---
-   
-  print *, ' overlap before :'
-  print *, ' '
-
-  allocate(S(m,m)) 
-
-  call LTxSxR(n, m, L, W, R, S)
-  !print*, " L.T x R"
-  !do i = 1, m
-  !  write(*, '(100(F16.10,X))') S(i,i)
-  !enddo
-  call LTxSxR(n, m, L, W, C, S)
-  print*, " L.T x C"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-  call LTxSxR(n, m, C, W, R, S)
-  print*, " C.T x R"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-
-  deallocate(S)
-
-  ! ---
-
-  call rotate_degen_eigvec_to_maximize_overlap(n, m, e, C, W, L, R)
-
-  ! ---
-   
-  print *, ' overlap after :'
-  print *, ' '
-
-  allocate(S(m,m)) 
-
-  call LTxSxR(n, m, L, W, R, S)
-  !print*, " L.T x R"
-  !do i = 1, m
-  !  write(*, '(100(F16.10,X))') S(i,i)
-  !enddo
-  call LTxSxR(n, m, L, W, C, S)
-  print*, " L.T x C"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-  call LTxSxR(n, m, C, W, R, S)
-  print*, " C.T x R"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-
-  deallocate(S)
-
-  ! ---
-
-  mo_l_coef = L
-  mo_r_coef = R
-  call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-
-  ! ---
-
-  deallocate(L, R, C, W, e)
-
-end subroutine maximize_overlap
-
-! ---
-
-subroutine rotate_degen_eigvec_to_maximize_overlap(n, m, e0, C0, W0, L0, R0)
-
-  implicit none
-
-  integer,          intent(in)    :: n, m
-  double precision, intent(in)    :: e0(m), W0(n,n), C0(n,m)
-  double precision, intent(inout) :: L0(n,m), R0(n,m)
-
-
-  integer                         :: i, j, k, kk, mm, id1, tot_deg
-  double precision                :: ei, ej, de, de_thr
-  integer,          allocatable   :: deg_num(:)
-  double precision, allocatable   :: L(:,:), R(:,:), C(:,:), Lnew(:,:), Rnew(:,:), tmp(:,:)
-  !double precision, allocatable   :: S(:,:), Snew(:,:), T(:,:), Ttmp(:,:), Stmp(:,:)
-  double precision, allocatable   :: S(:,:), Snew(:,:), T(:,:), Ttmp(:,:), Stmp(:,:)
-  !real*8                          :: S(m,m), Snew(m,m), T(m,m)
-
-  id1 = 700
-  allocate(S(id1,id1), Snew(id1,id1), T(id1,id1))
-
-  ! ---
-
-  allocate( deg_num(m) )
-  do i = 1, m
-    deg_num(i) = 1
-  enddo
-
-  de_thr = thr_degen_tc
-
-  do i = 1, m-1
-    ei = e0(i)
-
-    ! already considered in degen vectors
-    if(deg_num(i).eq.0) cycle
-
-    do j = i+1, m
-      ej = e0(j)
-      de = dabs(ei - ej)
-
-      if(de .lt. de_thr) then
-        deg_num(i) = deg_num(i) + 1
-        deg_num(j) = 0
-      endif
-
-    enddo
-  enddo
-
-  tot_deg = 0
-  do i = 1, m
-    if(deg_num(i).gt.1) then
-      print *, ' degen on', i, deg_num(i)
-      tot_deg = tot_deg + 1
-    endif
-  enddo
-
-  if(tot_deg .eq. 0) then
-    print *, ' no degen'
-    return
-  endif
-
-  ! ---
-
-  do i = 1, m
-    mm = deg_num(i)
-
-    if(mm .gt. 1) then
-
-      allocate(L(n,mm), R(n,mm), C(n,mm))
-      do j = 1, mm
-        L(1:n,j) = L0(1:n,i+j-1)
-        R(1:n,j) = R0(1:n,i+j-1)
-        C(1:n,j) = C0(1:n,i+j-1)
-      enddo
-
-      ! ---
-
-      ! C.T x W0 x R
-      allocate(tmp(mm,n), Stmp(mm,mm))
-      call dgemm( 'T', 'N', mm, n, n, 1.d0       &
-                , C, size(C, 1), W0, size(W0, 1) &
-                , 0.d0, tmp, size(tmp, 1) )
-      call dgemm( 'N', 'N', mm, mm, n, 1.d0        &
-                , tmp, size(tmp, 1), R, size(R, 1) &
-                , 0.d0, Stmp, size(Stmp, 1) )
-      deallocate(C, tmp)
-
-      S = 0.d0
-      do k = 1, mm
-        do kk = 1, mm
-          S(kk,k) = Stmp(kk,k)
-        enddo
-      enddo
-      deallocate(Stmp)
-
-      !print*, " overlap bef"
-      !do k = 1, mm
-      !  write(*, '(100(F16.10,X))') (S(k,kk), kk=1, mm)
-      !enddo
-    
-      T    = 0.d0
-      Snew = 0.d0
-      call maxovl(mm, mm, S, T, Snew)
-
-      !print*, " overlap aft"
-      !do k = 1, mm
-      !  write(*, '(100(F16.10,X))') (Snew(k,kk), kk=1, mm)
-      !enddo
-
-      allocate(Ttmp(mm,mm))
-      Ttmp(1:mm,1:mm) = T(1:mm,1:mm)
-
-      allocate(Lnew(n,mm), Rnew(n,mm))
-      call dgemm( 'N', 'N', n, mm, mm, 1.d0               &
-                , R, size(R, 1), Ttmp(1,1), size(Ttmp, 1) &
-                , 0.d0, Rnew, size(Rnew, 1) )
-      call dgemm( 'N', 'N', n, mm, mm, 1.d0               &
-                , L, size(L, 1), Ttmp(1,1), size(Ttmp, 1) &
-                , 0.d0, Lnew, size(Lnew, 1) )
-
-      deallocate(L, R)
-      deallocate(Ttmp)
-
-      ! ---
-
-      do j = 1, mm
-        L0(1:n,i+j-1) = Lnew(1:n,j)
-        R0(1:n,i+j-1) = Rnew(1:n,j)
-      enddo
-      deallocate(Lnew, Rnew)
-
-    endif
-  enddo
-
-  deallocate(S, Snew, T)
-
-end subroutine rotate_degen_eigvec_to_maximize_overlap
-
-! ---
-
-subroutine fix_right_to_one()
-
-  implicit none
-  integer                       :: i, j, m, n, mm, tot_deg
-  double precision              :: accu_d, accu_nd
-  double precision              :: de_thr, ei, ej, de
-  integer,          allocatable :: deg_num(:)
-  double precision, allocatable :: R0(:,:), L0(:,:), W(:,:), e0(:)
-  double precision, allocatable :: R(:,:), L(:,:), S(:,:), Stmp(:,:), tmp(:,:)
-
-  n = ao_num
-  m = mo_num
-
-  allocate(L0(n,m), R0(n,m), W(n,n), e0(m))
-  L0 = mo_l_coef
-  R0 = mo_r_coef
-  W  = ao_overlap
-
-  print*, ' fock matrix diag elements'
-  do i = 1, m
-    e0(i) = Fock_matrix_tc_mo_tot(i,i)
-    print*, e0(i)
-  enddo
-
-  ! ---
-
-  allocate( deg_num(m) )
-  do i = 1, m
-    deg_num(i) = 1
-  enddo
-
-  de_thr = 1d-6
-
-  do i = 1, m-1
-    ei = e0(i)
-
-    ! already considered in degen vectors
-    if(deg_num(i).eq.0) cycle
-
-    do j = i+1, m
-      ej = e0(j)
-      de = dabs(ei - ej)
-
-      if(de .lt. de_thr) then
-        deg_num(i) = deg_num(i) + 1
-        deg_num(j) = 0
-      endif
-
-    enddo
-  enddo
-
-  deallocate(e0)
-
-  tot_deg = 0
-  do i = 1, m
-    if(deg_num(i).gt.1) then
-      print *, ' degen on', i, deg_num(i)
-      tot_deg = tot_deg + 1
-    endif
-  enddo
-
-  if(tot_deg .eq. 0) then
-    print *, ' no degen'
-    return
-  endif
-
-  ! ---
-
-  do i = 1, m
-    mm = deg_num(i)
-
-    if(mm .gt. 1) then
-
-      allocate(L(n,mm), R(n,mm))
-      do j = 1, mm
-        L(1:n,j) = L0(1:n,i+j-1)
-        R(1:n,j) = R0(1:n,i+j-1)
-      enddo
-
-      ! ---
-
-      call impose_weighted_orthog_svd(n, mm, W, R)
-      call impose_weighted_biorthog_qr(n, mm, thresh_biorthog_diag, thresh_biorthog_nondiag, R, W, L)
-
-      ! ---
-
-      do j = 1, mm
-        L0(1:n,i+j-1) = L(1:n,j)
-        R0(1:n,i+j-1) = R(1:n,j)
-      enddo
-      deallocate(L, R)
-
-    endif
-  enddo
-
-  call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thresh_biorthog_diag, thresh_biorthog_nondiag, .true.)
-
-  deallocate(W, deg_num)
-
-  mo_l_coef = L0
-  mo_r_coef = R0
-  deallocate(L0, R0)
-
-  call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-  print *, ' orbitals are rotated '
-
-  return
-end subroutine fix_right_to_one
-
-! ---
diff --git a/plugins/local/tc_scf/tc_petermann_factor.irp.f b/plugins/local/tc_scf/tc_petermann_factor.irp.f
deleted file mode 100644
index 14fff898..00000000
--- a/plugins/local/tc_scf/tc_petermann_factor.irp.f
+++ /dev/null
@@ -1,91 +0,0 @@
-
-! ---
-
-program tc_petermann_factor
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  call main()
-
-end
-
-! ---
-
-subroutine main()
-
-  implicit none
-  integer                       :: i, j
-  double precision              :: Pf_diag_av
-  double precision, allocatable :: Sl(:,:), Sr(:,:), Pf(:,:)
-
-  allocate(Sl(mo_num,mo_num), Sr(mo_num,mo_num), Pf(mo_num,mo_num))
-
-
-  call LTxSxR(ao_num, mo_num, mo_l_coef, ao_overlap, mo_r_coef, Sl)
-  !call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0                       &
-  !          , mo_l_coef, size(mo_l_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
-  !          , 0.d0, Sl, size(Sl, 1) )
-
-  print *, ''
-  print *, ' left-right orthog matrix:'
-  do i = 1, mo_num
-    write(*,'(100(F8.4,X))') Sl(:,i)
-  enddo
-
-  call LTxSxR(ao_num, mo_num, mo_l_coef, ao_overlap, mo_l_coef, Sl)
-  !call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0                       &
-  !          , mo_l_coef, size(mo_l_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
-  !          , 0.d0, Sl, size(Sl, 1) )
-
-  print *, ''
-  print *, ' left-orthog matrix:'
-  do i = 1, mo_num
-    write(*,'(100(F8.4,X))') Sl(:,i)
-  enddo
-
-  call LTxSxR(ao_num, mo_num, mo_r_coef, ao_overlap, mo_r_coef, Sr)
-!  call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0                       &
-!            , mo_r_coef, size(mo_r_coef, 1), mo_r_coef, size(mo_r_coef, 1) &
-!            , 0.d0, Sr, size(Sr, 1) )
-
-  print *, ''
-  print *, ' right-orthog matrix:'
-  do i = 1, mo_num
-    write(*,'(100(F8.4,X))') Sr(:,i)
-  enddo
-
-  print *, ''
-  print *, ' Petermann matrix:'
-  do i = 1, mo_num
-    do j = 1, mo_num
-      Pf(j,i) = Sl(j,i) * Sr(j,i)
-    enddo
-    write(*,'(100(F8.4,X))') Pf(:,i)
-  enddo
-
-  Pf_diag_av = 0.d0
-  do i = 1, mo_num
-    Pf_diag_av = Pf_diag_av + Pf(i,i)
-  enddo
-  Pf_diag_av = Pf_diag_av / dble(mo_num)
-
-  print *, ''
-  print *, ' mean of the diagonal Petermann factor = ', Pf_diag_av
-
-  deallocate(Sl, Sr, Pf)
-
-  return
-end subroutine
-
-! ---
-
diff --git a/plugins/local/tc_scf/tc_scf.irp.f b/plugins/local/tc_scf/tc_scf.irp.f
index ee8e8dad..f099b90e 100644
--- a/plugins/local/tc_scf/tc_scf.irp.f
+++ b/plugins/local/tc_scf/tc_scf.irp.f
@@ -10,13 +10,10 @@ program tc_scf
   integer :: i
   logical :: good_angles
 
-  PROVIDE j1e_type
-  PROVIDE j2e_type
-  PROVIDE tcscf_algorithm
-
   print *, ' TC-SCF with:'
-  print *, ' j1e_type = ', j1e_type
   print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
 
   write(json_unit,json_array_open_fmt) 'tc-scf'
 
@@ -29,7 +26,6 @@ program tc_scf
   call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
   call write_int(6, my_n_pt_a_grid, 'angular external grid over')
 
-
   if(tc_integ_type .eq. "numeric") then
     my_extra_grid_becke  = .True.
     PROVIDE tc_grid2_a tc_grid2_r
@@ -41,17 +37,7 @@ program tc_scf
     call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
   endif
 
-  !call create_guess()
-  !call orthonormalize_mos()
-
-  if(tcscf_algorithm == 'DIIS') then
-    call rh_tcscf_diis()
-  elseif(tcscf_algorithm == 'Simple') then
-    call rh_tcscf_simple()
-  else
-    print *, ' not implemented yet', tcscf_algorithm
-    stop
-  endif
+  call rh_tcscf_diis()
 
   PROVIDE Fock_matrix_tc_diag_mo_tot
   print*, ' Eigenvalues:' 
@@ -59,14 +45,11 @@ program tc_scf
     print*, i, Fock_matrix_tc_diag_mo_tot(i)
   enddo
 
-  ! TODO 
-  ! rotate angles in separate code only if necessary
-  if(minimize_lr_angles)then
+  if(minimize_lr_angles) then
    call minimize_tc_orb_angles()
   endif
   call print_energy_and_mos(good_angles)
 
-
   write(json_unit,json_array_close_fmtx)
   call json_close
 
diff --git a/plugins/local/tc_scf/tc_scf_dm.irp.f b/plugins/local/tc_scf/tc_scf_dm.irp.f
index bf31a4a1..5d25fce2 100644
--- a/plugins/local/tc_scf/tc_scf_dm.irp.f
+++ b/plugins/local/tc_scf/tc_scf_dm.irp.f
@@ -10,16 +10,8 @@ BEGIN_PROVIDER [double precision, TCSCF_density_matrix_ao_beta, (ao_num, ao_num)
 
   implicit none
 
-  if(bi_ortho) then
-
-    PROVIDE mo_l_coef mo_r_coef
-    TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta
-
-  else
-
-    TCSCF_density_matrix_ao_beta = SCF_density_matrix_ao_beta
-
-  endif
+  PROVIDE mo_l_coef mo_r_coef
+  TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta
 
 END_PROVIDER 
 
@@ -35,16 +27,8 @@ BEGIN_PROVIDER [double precision, TCSCF_density_matrix_ao_alpha, (ao_num, ao_num
 
   implicit none
 
-  if(bi_ortho) then
-
-    PROVIDE mo_l_coef mo_r_coef
-    TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha
-
-  else
-
-    TCSCF_density_matrix_ao_alpha = SCF_density_matrix_ao_alpha
-
-  endif
+  PROVIDE mo_l_coef mo_r_coef
+  TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha
 
 END_PROVIDER 
 
diff --git a/plugins/local/tc_scf/tc_scf_energy.irp.f b/plugins/local/tc_scf/tc_scf_energy.irp.f
index 0266c605..c9366195 100644
--- a/plugins/local/tc_scf/tc_scf_energy.irp.f
+++ b/plugins/local/tc_scf/tc_scf_energy.irp.f
@@ -34,3 +34,426 @@ END_PROVIDER
 
 ! ---
 
+BEGIN_PROVIDER [double precision, diag_three_elem_hf]
+
+  BEGIN_DOC
+  ! 
+  ! < Phi_left | L | Phi_right >
+  !
+  !
+  ! if three_body_h_tc == false and noL_standard == true ==> do a normal ordering
+  !
+  ! todo
+  ! this should be equivalent to
+  !     three_body_h_tc == true and noL_standard == false
+  !
+  ! if three_body_h_tc == false and noL_standard == false ==> this is equal to 0
+  !
+  END_DOC
+
+  implicit none
+  integer                       :: i, j, k, ipoint, mm
+  double precision              :: contrib, weight, four_third, one_third, two_third, exchange_int_231
+  double precision              :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb
+  double precision, allocatable :: tmp(:)
+  double precision, allocatable :: tmp_L(:,:), tmp_R(:,:)
+  double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
+  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  if(.not. three_body_h_tc) then
+
+   if(noL_standard) then
+      PROVIDE noL_0e
+      diag_three_elem_hf = noL_0e
+    else
+      diag_three_elem_hf = 0.d0
+    endif
+
+  else
+
+    PROVIDE int2_grad1_u12_bimo_t
+    PROVIDE mos_l_in_r_array_transp
+    PROVIDE mos_r_in_r_array_transp
+
+    if(elec_alpha_num .eq. elec_beta_num) then
+  
+      allocate(tmp(elec_beta_num))
+      allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
+  
+      !$OMP PARALLEL                                                 &
+      !$OMP DEFAULT(NONE)                                            &
+      !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                      &
+      !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
+  
+      !$OMP DO
+      do j = 1, elec_beta_num
+  
+        tmp_L = 0.d0
+        tmp_R = 0.d0
+        do i = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
+  
+            tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+        enddo
+  
+        tmp(j) = 0.d0
+        do ipoint = 1, n_points_final_grid
+          tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        enddo
+      enddo ! j
+      !$OMP END DO
+      !$OMP END PARALLEL
+  
+      diag_three_elem_hf = -2.d0 * sum(tmp)
+  
+      deallocate(tmp)
+      deallocate(tmp_L, tmp_R)
+  
+      ! ---
+  
+      allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
+      tmp_O = 0.d0
+      tmp_J = 0.d0
+  
+      !$OMP PARALLEL                                                  &
+      !$OMP DEFAULT(NONE)                                             &
+      !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                &
+      !$OMP SHARED(elec_beta_num, n_points_final_grid,                & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,  &
+      !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
+  
+      allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
+      tmp_O_priv = 0.d0
+      tmp_J_priv = 0.d0
+    
+      !$OMP DO 
+      do i = 1, elec_beta_num
+        do ipoint = 1, n_points_final_grid
+          tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+          tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
+          tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
+          tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP CRITICAL
+      tmp_O = tmp_O + tmp_O_priv
+      tmp_J = tmp_J + tmp_J_priv
+      !$OMP END CRITICAL
+  
+      deallocate(tmp_O_priv, tmp_J_priv)
+      !$OMP END PARALLEL
+  
+      allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
+      tmp_M = 0.d0
+      tmp_S = 0.d0
+  
+      !$OMP PARALLEL                                                 &
+      !$OMP DEFAULT(NONE)                                            &
+      !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)            &
+      !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
+      !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
+  
+      allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
+      tmp_M_priv = 0.d0
+      tmp_S_priv = 0.d0
+    
+      !$OMP DO COLLAPSE(2)
+      do i = 1, elec_beta_num
+        do j = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+  
+            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                                        + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                                        + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP CRITICAL
+      tmp_M = tmp_M + tmp_M_priv
+      tmp_S = tmp_S + tmp_S_priv
+      !$OMP END CRITICAL
+  
+      deallocate(tmp_M_priv, tmp_S_priv)
+      !$OMP END PARALLEL
+  
+      allocate(tmp(n_points_final_grid))
+  
+      do ipoint = 1, n_points_final_grid
+  
+        tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
+  
+        tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
+                                                         - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
+                                                                  + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
+                                                                  + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
+      enddo
+  
+      diag_three_elem_hf = diag_three_elem_hf -2.d0 * (sum(tmp))
+  
+      deallocate(tmp)
+  
+    else ! elec_alpha_num .neq. elec_beta_num
+  
+      allocate(tmp(elec_alpha_num))
+      allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
+  
+      !$OMP PARALLEL                                                   &
+      !$OMP DEFAULT(NONE)                                              &
+      !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
+      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
+      !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
+  
+      !$OMP DO
+      do j = 1, elec_beta_num
+  
+        tmp_L = 0.d0
+        tmp_R = 0.d0
+        do i = elec_beta_num+1, elec_alpha_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
+  
+            tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+        enddo
+  
+        tmp(j) = 0.d0
+        do ipoint = 1, n_points_final_grid
+          tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        enddo
+  
+        do i = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
+  
+            tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+        enddo
+  
+        do ipoint = 1, n_points_final_grid
+          tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        enddo
+      enddo ! j
+      !$OMP END DO
+      !$OMP END PARALLEL
+  
+      ! ---
+  
+      !$OMP PARALLEL                                                   &
+      !$OMP DEFAULT(NONE)                                              &
+      !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
+      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
+      !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
+  
+      !$OMP DO
+      do j = elec_beta_num+1, elec_alpha_num
+  
+        tmp_L = 0.d0
+        tmp_R = 0.d0
+        do i = 1, elec_alpha_num
+          do ipoint = 1, n_points_final_grid
+            tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
+            tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
+  
+            tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
+          enddo
+        enddo
+  
+        tmp(j) = 0.d0
+        do ipoint = 1, n_points_final_grid
+          tmp(j) = tmp(j) + 0.5d0 * final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
+        enddo
+      enddo ! j
+      !$OMP END DO
+      !$OMP END PARALLEL
+  
+      diag_three_elem_hf = -2.d0 * sum(tmp)
+  
+      deallocate(tmp)
+      deallocate(tmp_L, tmp_R)
+  
+      ! ---
+  
+      allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
+      tmp_O = 0.d0
+      tmp_J = 0.d0
+  
+      !$OMP PARALLEL                                                   &
+      !$OMP DEFAULT(NONE)                                              &
+      !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                 &
+      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
+      !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
+  
+      allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
+      tmp_O_priv = 0.d0
+      tmp_J_priv = 0.d0
+    
+      !$OMP DO 
+      do i = 1, elec_beta_num
+        do ipoint = 1, n_points_final_grid
+          tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+          tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
+          tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
+          tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP DO 
+      do i = elec_beta_num+1, elec_alpha_num
+        do ipoint = 1, n_points_final_grid
+          tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + 0.5d0 * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+          tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,i)
+          tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,i)
+          tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,i)
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP CRITICAL
+      tmp_O = tmp_O + tmp_O_priv
+      tmp_J = tmp_J + tmp_J_priv
+      !$OMP END CRITICAL
+  
+      deallocate(tmp_O_priv, tmp_J_priv)
+      !$OMP END PARALLEL
+  
+      ! ---
+  
+      allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
+      tmp_M = 0.d0
+      tmp_S = 0.d0
+  
+      !$OMP PARALLEL                                                   &
+      !$OMP DEFAULT(NONE)                                              &
+      !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)              &
+      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
+      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
+      !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
+  
+      allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
+      tmp_M_priv = 0.d0
+      tmp_S_priv = 0.d0
+    
+      !$OMP DO COLLAPSE(2)
+      do i = 1, elec_beta_num
+        do j = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+  
+            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                                        + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                                        + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP DO COLLAPSE(2)
+      do i = elec_beta_num+1, elec_alpha_num
+        do j = 1, elec_beta_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+  
+            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
+            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
+  
+            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                                        + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                                        + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP DO COLLAPSE(2)
+      do i = elec_beta_num+1, elec_alpha_num
+        do j = elec_beta_num+1, elec_alpha_num
+          do ipoint = 1, n_points_final_grid
+  
+            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
+  
+            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
+                                                        + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
+                                                        + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO NOWAIT
+  
+      !$OMP CRITICAL
+      tmp_M = tmp_M + tmp_M_priv
+      tmp_S = tmp_S + tmp_S_priv
+      !$OMP END CRITICAL
+  
+      deallocate(tmp_M_priv, tmp_S_priv)
+      !$OMP END PARALLEL
+  
+      allocate(tmp(n_points_final_grid))
+  
+      do ipoint = 1, n_points_final_grid
+  
+        tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
+  
+        tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
+                                                         - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
+                                                                  + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
+                                                                  + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
+      enddo
+  
+      diag_three_elem_hf = diag_three_elem_hf - 2.d0 * (sum(tmp))
+  
+      deallocate(tmp)
+  
+    endif ! alpha/beta condition
+
+  endif ! three_body_h_tc
+
+END_PROVIDER 
+
+! ---
+
diff --git a/plugins/local/tc_scf/tcscf_energy_naive.irp.f b/plugins/local/tc_scf/tcscf_energy_naive.irp.f
deleted file mode 100644
index 82bb8799..00000000
--- a/plugins/local/tc_scf/tcscf_energy_naive.irp.f
+++ /dev/null
@@ -1,80 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [double precision, tcscf_energy_3e_naive]
-
-  implicit none
-  integer          :: i, j, k
-  integer          :: neu, ned, D(elec_num)
-  integer          :: ii, jj, kk
-  integer          :: si, sj, sk
-  double precision :: I_ijk, I_jki, I_kij, I_jik, I_ikj, I_kji
-  double precision :: I_tot
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  neu = elec_alpha_num
-  ned = elec_beta_num
-  if (neu > 0) D(1:neu) = [(2*i-1, i = 1, neu)]
-  if (ned > 0) D(neu+1:neu+ned) = [(2*i, i = 1, ned)]
-
-  !print*, "D = "
-  !do i = 1, elec_num
-  !  ii = (D(i) - 1) / 2 + 1
-  !  si = mod(D(i), 2)
-  !  print*, i, D(i), ii, si
-  !enddo
-
-  tcscf_energy_3e_naive = 0.d0
-
-  do i = 1, elec_num - 2
-    ii = (D(i) - 1) / 2 + 1
-    si = mod(D(i), 2)
-
-    do j = i + 1, elec_num - 1
-      jj = (D(j) - 1) / 2 + 1
-      sj = mod(D(j), 2)
-
-      do k = j + 1, elec_num
-        kk = (D(k) - 1) / 2 + 1
-        sk = mod(D(k), 2)
-
-        call give_integrals_3_body_bi_ort(ii, jj, kk, ii, jj, kk, I_ijk)
-        I_tot = I_ijk
-
-        if(sj==si .and. sk==sj) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, jj, kk, ii, I_jki)
-          I_tot += I_jki
-        endif
-
-        if(sk==si .and. si==sj) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, kk, ii, jj, I_kij)
-          I_tot += I_kij
-        endif
-
-        if(sj==si) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, jj, ii, kk, I_jik)
-          I_tot -= I_jik
-        endif
-
-        if(sk==sj) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, ii, kk, jj, I_ikj)
-          I_tot -= I_ikj
-        endif
-
-        if(sk==si) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, kk, jj, ii, I_kji)
-          I_tot -= I_kji
-        endif
-
-        tcscf_energy_3e_naive += I_tot
-      enddo
-    enddo
-  enddo
-
-  tcscf_energy_3e_naive = -tcscf_energy_3e_naive
-
-END_PROVIDER
-
-! ---
-
diff --git a/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f b/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f
deleted file mode 100644
index 0c9ebbd7..00000000
--- a/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f
+++ /dev/null
@@ -1,189 +0,0 @@
-
-subroutine contrib_3e_diag_sss(i, j, k, integral)
-
-  BEGIN_DOC
-  ! returns the pure same spin contribution to diagonal matrix element of 3e term
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, j, k
-  double precision, intent(out) :: integral
-  double precision              :: direct_int, exch_13_int, exch_23_int, exch_12_int, c_3_int, c_minus_3_int
-
-  call give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int )!!! < i k j | i k j >
-  call give_integrals_3_body_bi_ort(i, k, j, j, i, k, c_3_int)      ! < i k j | j i k >
-  call give_integrals_3_body_bi_ort(i, k, j, k, j, i, c_minus_3_int)! < i k j | k j i >
-  integral = direct_int + c_3_int + c_minus_3_int 
-
-  ! negative terms :: exchange contrib
-  call give_integrals_3_body_bi_ort(i, k, j, j, k, i, exch_13_int)!!! < i k j | j k i > : E_13 
-  call give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)!!! < i k j | i j k > : E_23
-  call give_integrals_3_body_bi_ort(i, k, j, k, i, j, exch_12_int)!!! < i k j | k i j > : E_12
-
-  integral += - exch_13_int - exch_23_int  - exch_12_int 
-  integral = -integral
-
-end
-
-! ---
-
-subroutine contrib_3e_diag_soo(i,j,k,integral)
- implicit none
- integer, intent(in) :: i,j,k
- BEGIN_DOC
- ! returns the pure same spin contribution to diagonal matrix element of 3e term
- END_DOC
- double precision, intent(out) :: integral
- double precision :: direct_int, exch_23_int
- call  give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int) ! < i k j | i k j >
- call  give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)! < i k j | i j k > : E_23
- integral = direct_int - exch_23_int 
- integral = -integral
-end
-
-
-subroutine give_aaa_contrib_bis(integral_aaa)
- implicit none
- double precision, intent(out) :: integral_aaa
- double precision :: integral
- integer :: i,j,k
- integral_aaa = 0.d0
- do i = 1, elec_alpha_num
-  do j = i+1, elec_alpha_num
-   do k = j+1, elec_alpha_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_aaa += integral
-   enddo
-  enddo
- enddo
-
-end
-
-! ---
-
-subroutine give_aaa_contrib(integral_aaa)
-
-  implicit none
-  integer                       :: i, j, k
-  double precision              :: integral
-  double precision, intent(out) :: integral_aaa
-
-  integral_aaa = 0.d0
-  do i = 1, elec_alpha_num
-    do j = 1, elec_alpha_num
-      do k = 1, elec_alpha_num
-        call contrib_3e_diag_sss(i, j, k, integral)
-        integral_aaa += integral
-      enddo
-    enddo
-  enddo
-  integral_aaa *= 1.d0/6.d0 
-
-  return
-end
-
-! ---
-
-subroutine give_aab_contrib(integral_aab)
- implicit none
- double precision, intent(out) :: integral_aab
- double precision :: integral
- integer :: i,j,k
- integral_aab = 0.d0
- do i = 1, elec_beta_num
-  do j = 1, elec_alpha_num
-   do k = 1, elec_alpha_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_aab += integral
-   enddo
-  enddo
- enddo
- integral_aab *= 0.5d0
-end
-
-
-subroutine give_aab_contrib_bis(integral_aab)
- implicit none
- double precision, intent(out) :: integral_aab
- double precision :: integral
- integer :: i,j,k
- integral_aab = 0.d0
- do i = 1, elec_beta_num
-  do j = 1, elec_alpha_num
-   do k = j+1, elec_alpha_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_aab += integral
-   enddo
-  enddo
- enddo
-end
-
-
-subroutine give_abb_contrib(integral_abb)
- implicit none
- double precision, intent(out) :: integral_abb
- double precision :: integral
- integer :: i,j,k
- integral_abb = 0.d0
- do i = 1, elec_alpha_num
-  do j = 1, elec_beta_num
-   do k = 1, elec_beta_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_abb += integral
-   enddo
-  enddo
- enddo
- integral_abb *= 0.5d0
-end
-
-subroutine give_abb_contrib_bis(integral_abb)
- implicit none
- double precision, intent(out) :: integral_abb
- double precision :: integral
- integer :: i,j,k
- integral_abb = 0.d0
- do i = 1, elec_alpha_num
-  do j = 1, elec_beta_num
-   do k = j+1, elec_beta_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_abb += integral
-   enddo
-  enddo
- enddo
-end
-
-subroutine give_bbb_contrib_bis(integral_bbb)
- implicit none
- double precision, intent(out) :: integral_bbb
- double precision :: integral
- integer :: i,j,k
- integral_bbb = 0.d0
- do i = 1, elec_beta_num
-  do j = i+1, elec_beta_num
-   do k = j+1, elec_beta_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_bbb += integral
-   enddo
-  enddo
- enddo
-
-end
-
-subroutine give_bbb_contrib(integral_bbb)
- implicit none
- double precision, intent(out) :: integral_bbb
- double precision :: integral
- integer :: i,j,k
- integral_bbb = 0.d0
- do i = 1, elec_beta_num
-  do j = 1, elec_beta_num
-   do k = 1, elec_beta_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_bbb += integral
-   enddo
-  enddo
- enddo
- integral_bbb *= 1.d0/6.d0 
-end
-
-
diff --git a/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
index 7ce57578..ec5167d1 100644
--- a/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
+++ b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
@@ -4,11 +4,9 @@ program write_ao_2e_tc_integ
 
   implicit none
 
-  PROVIDE j1e_type
-  PROVIDE j2e_type
-
-  print *, ' j1e_type = ', j1e_type
   print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
 
   my_grid_becke  = .True.
   PROVIDE tc_grid1_a tc_grid1_r

From d43d960b1a15e7ddb9dcf1e871bf6e9a4e70983b Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 1 May 2024 21:52:00 +0200
Subject: [PATCH 022/131] TC-SCF CLEANED

---
 plugins/local/bi_ort_ints/no_dressing.irp.f   |   7 +-
 plugins/local/non_hermit_dav/biorthog.irp.f   |   2 +-
 plugins/local/slater_tc/NEED                  |   1 +
 .../symmetrized_3_e_int_prov.irp.f            |   0
 plugins/local/tc_bi_ortho/test_tc_fock.irp.f  |  33 -
 plugins/local/tc_keywords/EZFIO.cfg           |  48 +-
 plugins/local/tc_keywords/tc_keywords.irp.f   |   7 -
 plugins/local/tc_scf/EZFIO.cfg                |  30 +
 plugins/local/tc_scf/fock_hermit.irp.f        | 107 ---
 plugins/local/tc_scf/fock_tc.irp.f            |  40 +-
 plugins/local/tc_scf/fock_tc_mo_tot.irp.f     |  19 +-
 plugins/local/tc_scf/fock_three_hermit.irp.f  | 771 ------------------
 .../local/tc_scf/integrals_in_r_stuff.irp.f   | 391 ---------
 plugins/local/tc_scf/jast_schmos_90.irp.f     | 318 --------
 plugins/local/tc_scf/plot_j_schMos.irp.f      |  69 --
 plugins/local/tc_scf/print_fit_param.irp.f    |  59 --
 plugins/local/tc_scf/print_tcscf_energy.irp.f |  55 --
 plugins/local/tc_scf/rh_tcscf_diis.irp.f      |   4 +-
 plugins/local/tc_scf/rh_tcscf_simple.irp.f    | 129 ---
 .../local/tc_scf/rotate_tcscf_orbitals.irp.f  | 369 ---------
 .../local/tc_scf/tc_petermann_factor.irp.f    |  91 ---
 plugins/local/tc_scf/tc_scf.irp.f             |  25 +-
 plugins/local/tc_scf/tc_scf_dm.irp.f          |  24 +-
 plugins/local/tc_scf/tc_scf_energy.irp.f      |  16 +-
 plugins/local/tc_scf/tcscf_energy_naive.irp.f |  80 --
 .../tc_scf/three_e_energy_bi_ortho.irp.f      | 189 -----
 .../local/tc_scf/write_ao_2e_tc_integ.irp.f   |   6 +-
 27 files changed, 94 insertions(+), 2796 deletions(-)
 rename plugins/local/{tc_bi_ortho => slater_tc}/symmetrized_3_e_int_prov.irp.f (100%)
 delete mode 100644 plugins/local/tc_keywords/tc_keywords.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_hermit.irp.f
 delete mode 100644 plugins/local/tc_scf/fock_three_hermit.irp.f
 delete mode 100644 plugins/local/tc_scf/integrals_in_r_stuff.irp.f
 delete mode 100644 plugins/local/tc_scf/jast_schmos_90.irp.f
 delete mode 100644 plugins/local/tc_scf/plot_j_schMos.irp.f
 delete mode 100644 plugins/local/tc_scf/print_fit_param.irp.f
 delete mode 100644 plugins/local/tc_scf/print_tcscf_energy.irp.f
 delete mode 100644 plugins/local/tc_scf/rh_tcscf_simple.irp.f
 delete mode 100644 plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f
 delete mode 100644 plugins/local/tc_scf/tc_petermann_factor.irp.f
 delete mode 100644 plugins/local/tc_scf/tcscf_energy_naive.irp.f
 delete mode 100644 plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f

diff --git a/plugins/local/bi_ort_ints/no_dressing.irp.f b/plugins/local/bi_ort_ints/no_dressing.irp.f
index bd225274..721ac0f8 100644
--- a/plugins/local/bi_ort_ints/no_dressing.irp.f
+++ b/plugins/local/bi_ort_ints/no_dressing.irp.f
@@ -322,6 +322,12 @@ END_PROVIDER
 
 BEGIN_PROVIDER [double precision, noL_0e]
 
+  BEGIN_DOC
+  ! 
+  ! < Phi_left | L | Phi_right >
+  !
+  END_DOC 
+
   implicit none
   integer                       :: i, j, k, ipoint
   double precision              :: t0, t1
@@ -330,7 +336,6 @@ BEGIN_PROVIDER [double precision, noL_0e]
   double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
   double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
 
-
   call wall_time(t0)
   print*, " Providing noL_0e ..."
 
diff --git a/plugins/local/non_hermit_dav/biorthog.irp.f b/plugins/local/non_hermit_dav/biorthog.irp.f
index b36b0130..4b618228 100644
--- a/plugins/local/non_hermit_dav/biorthog.irp.f
+++ b/plugins/local/non_hermit_dav/biorthog.irp.f
@@ -43,7 +43,7 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei
   ! track & sort the real eigenvalues 
 
   n_good = 0
-  thr    = Im_thresh_tcscf
+  thr    = Im_thresh_tc
   do i = 1, n
     if(dabs(WI(i)) .lt. thr) then
       n_good += 1
diff --git a/plugins/local/slater_tc/NEED b/plugins/local/slater_tc/NEED
index ef0aa3f7..a8669866 100644
--- a/plugins/local/slater_tc/NEED
+++ b/plugins/local/slater_tc/NEED
@@ -5,3 +5,4 @@ bi_ortho_mos
 tc_keywords
 non_hermit_dav
 dav_general_mat
+tc_scf
diff --git a/plugins/local/tc_bi_ortho/symmetrized_3_e_int_prov.irp.f b/plugins/local/slater_tc/symmetrized_3_e_int_prov.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/symmetrized_3_e_int_prov.irp.f
rename to plugins/local/slater_tc/symmetrized_3_e_int_prov.irp.f
diff --git a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f b/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
index f1a7cc0a..85f3ed97 100644
--- a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
+++ b/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
@@ -24,44 +24,12 @@ program test_tc_fock
   !call routine_2
 !  call routine_3()
 
-! call test_3e
   call routine_tot
 
 end
 
 ! ---
 
-subroutine test_3e
- implicit none
- double precision :: integral_aaa,integral_aab,integral_abb,integral_bbb,accu
- double precision ::  hmono, htwoe, hthree, htot
- call htilde_mu_mat_bi_ortho_slow(ref_bitmask, ref_bitmask, N_int, hmono, htwoe, hthree, htot)
- print*,'hmono = ',hmono
- print*,'htwoe = ',htwoe
- print*,'hthree= ',hthree
- print*,'htot  = ',htot
- print*,''
- print*,''
- print*,'TC_one= ',tc_hf_one_e_energy
- print*,'TC_two= ',TC_HF_two_e_energy
- print*,'TC_3e = ',diag_three_elem_hf
- print*,'TC_tot= ',TC_HF_energy
- print*,''
- print*,''
- call give_aaa_contrib(integral_aaa)
- print*,'integral_aaa = ',integral_aaa
- call give_aab_contrib(integral_aab)
- print*,'integral_aab = ',integral_aab
- call give_abb_contrib(integral_abb)
- print*,'integral_abb = ',integral_abb
- call give_bbb_contrib(integral_bbb)
- print*,'integral_bbb = ',integral_bbb
- accu = integral_aaa + integral_aab + integral_abb + integral_bbb
- print*,'accu = ',accu
- print*,'delta = ',hthree - accu
-
-end
-
 subroutine routine_3()
 
   use bitmasks ! you need to include the bitmasks_module.f90 features
@@ -86,7 +54,6 @@ subroutine routine_3()
    do i = 1, elec_num_tab(s1)
      do a = elec_num_tab(s1)+1, mo_num ! virtual 
  
- 
        det_i = ref_bitmask
        call do_single_excitation(det_i, i, a, s1, i_ok)
        if(i_ok == -1) then
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index e4d9701a..33b9db57 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -100,30 +100,12 @@ doc: If |true|, the states are re-ordered to match the input states
 default: False
 interface: ezfio,provider,ocaml
 
-[bi_ortho]
-type: logical
-doc: If |true|, the MO basis is assumed to be bi-orthonormal
-interface: ezfio,provider,ocaml
-default: True
-
 [symetric_fock_tc]
 type: logical
 doc: If |true|, using F+F^t as Fock TC
 interface: ezfio,provider,ocaml
 default: False
 
-[thresh_tcscf]
-type: Threshold
-doc: Threshold on the convergence of the Hartree Fock energy.
-interface: ezfio,provider,ocaml
-default: 1.e-8
-
-[n_it_tcscf_max]
-type: Strictly_positive_int
-doc: Maximum number of SCF iterations
-interface: ezfio,provider,ocaml
-default: 50
-
 [selection_tc]
 type: integer
 doc: if +1: only positive is selected, -1: only negative is selected, :0 both positive and negative
@@ -160,30 +142,6 @@ doc: If |true|, maximize the overlap between orthogonalized left- and right eige
 interface: ezfio,provider,ocaml
 default: False
 
-[max_dim_diis_tcscf]
-type: integer
-doc: Maximum size of the DIIS extrapolation procedure
-interface: ezfio,provider,ocaml
-default: 15
-
-[level_shift_tcscf]
-type: Positive_float
-doc: Energy shift on the virtual MOs to improve TCSCF convergence
-interface: ezfio,provider,ocaml
-default: 0.
-
-[tcscf_algorithm]
-type: character*(32)
-doc: Type of TCSCF algorithm used. Possible choices are [Simple | DIIS]
-interface: ezfio,provider,ocaml
-default: DIIS
-
-[im_thresh_tcscf]
-type: Threshold
-doc: Thresholds on the Imag part of energy
-interface: ezfio,provider,ocaml
-default: 1.e-7
-
 [test_cycle_tc]
 type: logical
 doc: If |true|, the integrals of the three-body jastrow are computed with cycles
@@ -304,3 +262,9 @@ doc: If |true|, more calc but less mem
 interface: ezfio,provider,ocaml
 default: False
 
+[im_thresh_tc]
+type: Threshold
+doc: Thresholds on the Imag part of TC energy
+interface: ezfio,provider,ocaml
+default: 1.e-7
+
diff --git a/plugins/local/tc_keywords/tc_keywords.irp.f b/plugins/local/tc_keywords/tc_keywords.irp.f
deleted file mode 100644
index 3bc68550..00000000
--- a/plugins/local/tc_keywords/tc_keywords.irp.f
+++ /dev/null
@@ -1,7 +0,0 @@
-program tc_keywords
-  implicit none
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-  print *, 'Hello world'
-end
diff --git a/plugins/local/tc_scf/EZFIO.cfg b/plugins/local/tc_scf/EZFIO.cfg
index 510c777c..e3d24338 100644
--- a/plugins/local/tc_scf/EZFIO.cfg
+++ b/plugins/local/tc_scf/EZFIO.cfg
@@ -9,3 +9,33 @@ doc: If |true|, tc-scf has converged
 interface: ezfio,provider,ocaml
 default: False
 
+[max_dim_diis_tcscf]
+type: integer
+doc: Maximum size of the DIIS extrapolation procedure
+interface: ezfio,provider,ocaml
+default: 15
+
+[level_shift_tcscf]
+type: Positive_float
+doc: Energy shift on the virtual MOs to improve TCSCF convergence
+interface: ezfio,provider,ocaml
+default: 0.
+
+[thresh_tcscf]
+type: Threshold
+doc: Threshold on the convergence of the Hartree Fock energy.
+interface: ezfio,provider,ocaml
+default: 1.e-8
+
+[n_it_tcscf_max]
+type: Strictly_positive_int
+doc: Maximum number of SCF iterations
+interface: ezfio,provider,ocaml
+default: 50
+
+[tc_Brillouin_Right]
+type: logical
+doc: If |true|, impose only right-Brillouin condition
+interface: ezfio,provider,ocaml
+default: False
+
diff --git a/plugins/local/tc_scf/fock_hermit.irp.f b/plugins/local/tc_scf/fock_hermit.irp.f
deleted file mode 100644
index 5a51b324..00000000
--- a/plugins/local/tc_scf/fock_hermit.irp.f
+++ /dev/null
@@ -1,107 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, good_hermit_tc_fock_mat, (mo_num, mo_num)]
-
-  BEGIN_DOC
-! good_hermit_tc_fock_mat = Hermitian Upper triangular Fock matrix 
-!
-! The converged eigenvectors of such matrix yield to orthonormal vectors satisfying the left Brillouin theorem
-  END_DOC
-  implicit none
-  integer :: i, j
-
-  good_hermit_tc_fock_mat = Fock_matrix_tc_mo_tot
-  do j = 1, mo_num
-    do i = 1, j-1
-      good_hermit_tc_fock_mat(i,j) = Fock_matrix_tc_mo_tot(j,i) 
-    enddo
-  enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, hermit_average_tc_fock_mat, (mo_num, mo_num)]
-
-  BEGIN_DOC
-! hermit_average_tc_fock_mat = (F + F^\dagger)/2
-  END_DOC
-  implicit none
-  integer :: i, j
-
-  hermit_average_tc_fock_mat = Fock_matrix_tc_mo_tot
-  do j = 1, mo_num
-    do i = 1, mo_num
-      hermit_average_tc_fock_mat(i,j) = 0.5d0 * (Fock_matrix_tc_mo_tot(j,i) + Fock_matrix_tc_mo_tot(i,j))
-    enddo
-  enddo
-
-END_PROVIDER 
-
-
-! ---
-BEGIN_PROVIDER [ double precision, grad_hermit]
- implicit none
- BEGIN_DOC
- ! square of gradient of the energy
- END_DOC
- if(symetric_fock_tc)then
-  grad_hermit = grad_hermit_average_tc_fock_mat
- else
-  grad_hermit = grad_good_hermit_tc_fock_mat
- endif
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, grad_good_hermit_tc_fock_mat]
-  implicit none
-  BEGIN_DOC
-  ! grad_good_hermit_tc_fock_mat = norm of gradients of the upper triangular TC fock
-  END_DOC
-  integer :: i, j
-  grad_good_hermit_tc_fock_mat = 0.d0
-  do i = 1, elec_alpha_num
-    do j = elec_alpha_num+1, mo_num
-      grad_good_hermit_tc_fock_mat += dabs(good_hermit_tc_fock_mat(i,j))
-    enddo
-  enddo
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, grad_hermit_average_tc_fock_mat]
-  implicit none
-  BEGIN_DOC
-  ! grad_hermit_average_tc_fock_mat = norm of gradients of the upper triangular TC fock
-  END_DOC
-  integer :: i, j
-  grad_hermit_average_tc_fock_mat = 0.d0
-  do i = 1, elec_alpha_num
-    do j = elec_alpha_num+1, mo_num
-      grad_hermit_average_tc_fock_mat += dabs(hermit_average_tc_fock_mat(i,j))
-    enddo
-  enddo
-END_PROVIDER 
-
-
-! ---
-
-subroutine save_good_hermit_tc_eigvectors()
-
-  implicit none
-  integer        :: sign
-  character*(64) :: label
-  logical        :: output
-
-  sign = 1
-  label = "Canonical"
-  output = .False.
-  
-  if(symetric_fock_tc)then
-   call mo_as_eigvectors_of_mo_matrix(hermit_average_tc_fock_mat, mo_num, mo_num, label, sign, output)
-  else
-   call mo_as_eigvectors_of_mo_matrix(good_hermit_tc_fock_mat, mo_num, mo_num, label, sign, output)
-  endif
-end subroutine save_good_hermit_tc_eigvectors
-
-! ---
-
diff --git a/plugins/local/tc_scf/fock_tc.irp.f b/plugins/local/tc_scf/fock_tc.irp.f
index 508f3cd7..16bb5c87 100644
--- a/plugins/local/tc_scf/fock_tc.irp.f
+++ b/plugins/local/tc_scf/fock_tc.irp.f
@@ -110,23 +110,14 @@ BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)]
   double precision              :: t0, t1, tt0, tt1
   double precision, allocatable :: tmp(:,:)
 
-  if(bi_ortho) then
+  PROVIDE mo_l_coef mo_r_coef
 
-    PROVIDE mo_l_coef mo_r_coef
-
-    call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) &
-                          , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
-
-    if(three_body_h_tc) then
-      PROVIDE fock_3e_mo_a
-      Fock_matrix_tc_mo_alpha += fock_3e_mo_a
-    endif
-
-  else
-
-    call ao_to_mo( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) &
-                 , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
+  call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) &
+                        , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) )
 
+  if(three_body_h_tc) then
+    PROVIDE fock_3e_mo_a
+    Fock_matrix_tc_mo_alpha += fock_3e_mo_a
   endif
 
 END_PROVIDER
@@ -142,21 +133,12 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_beta, (mo_num,mo_num) ]
   implicit none
   double precision, allocatable :: tmp(:,:)
 
-  if(bi_ortho) then
-
-    call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) &
-                          , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
-
-    if(three_body_h_tc) then
-      PROVIDE fock_3e_mo_b
-      Fock_matrix_tc_mo_beta += fock_3e_mo_b
-    endif
-
-  else
-
-    call ao_to_mo( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) &
-                 , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
+  call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) &
+                        , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) )
 
+  if(three_body_h_tc) then
+    PROVIDE fock_3e_mo_b
+    Fock_matrix_tc_mo_beta += fock_3e_mo_b
   endif
 
 END_PROVIDER
diff --git a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
index 2df2421e..fd490af6 100644
--- a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
+++ b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f
@@ -132,7 +132,7 @@
      enddo
    endif
 
-  if(no_oa_or_av_opt)then
+  if(no_oa_or_av_opt) then
     do i = 1, n_act_orb
       iorb = list_act(i)
       do j = 1, n_inact_orb
@@ -153,8 +153,21 @@
     enddo
   endif
 
-  if(.not.bi_ortho .and. three_body_h_tc)then
-    Fock_matrix_tc_mo_tot += fock_3_mat
+  if(tc_Brillouin_Right) then
+
+    double precision, allocatable :: tmp(:,:)
+    allocate(tmp(mo_num,mo_num))
+
+    tmp = Fock_matrix_tc_mo_tot
+    do j = 1, mo_num
+      do i = 1, j-1
+        tmp(i,j) = Fock_matrix_tc_mo_tot(j,i)
+      enddo
+    enddo
+
+    Fock_matrix_tc_mo_tot = tmp
+    deallocate(tmp)
+
   endif
 
 END_PROVIDER
diff --git a/plugins/local/tc_scf/fock_three_hermit.irp.f b/plugins/local/tc_scf/fock_three_hermit.irp.f
deleted file mode 100644
index 00d47fae..00000000
--- a/plugins/local/tc_scf/fock_three_hermit.irp.f
+++ /dev/null
@@ -1,771 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, fock_3_mat, (mo_num, mo_num)] 
-
-  implicit none
-  integer :: i,j
-  double precision :: contrib
-
-  fock_3_mat = 0.d0
-  if(.not.bi_ortho .and. three_body_h_tc) then
-
-    call give_fock_ia_three_e_total(1, 1, contrib)
-    !!  !$OMP PARALLEL                  &
-    !!  !$OMP DEFAULT (NONE)            &
-    !!  !$OMP PRIVATE (i,j,m,integral) & 
-    !!  !$OMP SHARED (mo_num,three_body_3_index)
-    !!  !$OMP DO SCHEDULE (guided) COLLAPSE(3)
-    do i = 1, mo_num
-      do j = 1, mo_num
-        call give_fock_ia_three_e_total(j,i,contrib)
-        fock_3_mat(j,i) = -contrib
-       enddo
-     enddo
-    !else if(bi_ortho.and.three_body_h_tc) then
-    !!  !$OMP END DO
-    !!  !$OMP END PARALLEL
-    !!  do i = 1, mo_num
-    !!   do j = 1, i-1
-    !!    mat_three(j,i) = mat_three(i,j)
-    !!   enddo
-    !!  enddo
-  endif
-
-END_PROVIDER 
-
-
-subroutine give_fock_ia_three_e_total(i,a,contrib)
- implicit none
- BEGIN_DOC
-! contrib is the TOTAL (same spins / opposite spins) contribution from the three body term to the Fock operator 
-!
- END_DOC
- integer, intent(in) :: i,a
- double precision, intent(out) :: contrib
- double precision :: int_1, int_2, int_3
- double precision :: mos_i, mos_a, w_ia
- double precision :: mos_ia, weight
-
- integer :: mm, ipoint,k,l
-
- int_1 = 0.d0
- int_2 = 0.d0
- int_3 = 0.d0
- do mm = 1, 3
-  do ipoint = 1, n_points_final_grid
-   weight = final_weight_at_r_vector(ipoint)                                                                          
-   mos_i  = mos_in_r_array_transp(ipoint,i) 
-   mos_a  = mos_in_r_array_transp(ipoint,a) 
-   mos_ia = mos_a * mos_i
-   w_ia   = x_W_ij_erf_rk(ipoint,mm,i,a) 
-     
-   int_1  += weight * fock_3_w_kk_sum(ipoint,mm) * (4.d0 * fock_3_rho_beta(ipoint) * w_ia               & 
-                                                  + 2.0d0 * mos_ia * fock_3_w_kk_sum(ipoint,mm)         & 
-                                                  - 2.0d0 * fock_3_w_ki_mos_k(ipoint,mm,i) * mos_a      & 
-                                                  - 2.0d0 * fock_3_w_ki_mos_k(ipoint,mm,a) * mos_i      )
-   int_2  += weight * (-1.d0) * ( 2.0d0 * fock_3_w_kl_mo_k_mo_l(ipoint,mm) * w_ia                     & 
-                                + 2.0d0 * fock_3_rho_beta(ipoint) * fock_3_w_ki_wk_a(ipoint,mm,i,a)   & 
-                                + 1.0d0 * mos_ia * fock_3_trace_w_tilde(ipoint,mm)                    )
-
-   int_3  += weight *   1.d0  * (fock_3_w_kl_wla_phi_k(ipoint,mm,i) * mos_a + fock_3_w_kl_wla_phi_k(ipoint,mm,a) * mos_i & 
-                                +fock_3_w_ki_mos_k(ipoint,mm,i)     * fock_3_w_ki_mos_k(ipoint,mm,a)                     )
-  enddo
- enddo
- contrib = int_1 + int_2 + int_3
-
-end
-
-! ---
-
-BEGIN_PROVIDER [double precision, diag_three_elem_hf]
-
-  implicit none
-  integer                       :: i, j, k, ipoint, mm
-  double precision              :: contrib, weight, four_third, one_third, two_third, exchange_int_231
-  double precision              :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb
-  double precision, allocatable :: tmp(:)
-  double precision, allocatable :: tmp_L(:,:), tmp_R(:,:)
-  double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
-  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  !print *, ' providing diag_three_elem_hf'
-
-  if(.not. three_body_h_tc) then
-
-   if(noL_standard) then
-      PROVIDE noL_0e
-      diag_three_elem_hf = noL_0e
-    else
-      diag_three_elem_hf = 0.d0
-    endif
-
-  else
-
-    if(.not. bi_ortho) then
-
-      ! ---
-
-      one_third  = 1.d0/3.d0
-      two_third  = 2.d0/3.d0
-      four_third = 4.d0/3.d0
-      diag_three_elem_hf = 0.d0
-      do i = 1, elec_beta_num
-        do j = 1, elec_beta_num
-          do k = 1, elec_beta_num
-            call give_integrals_3_body(k, j, i, j, i, k, exchange_int_231)   
-            diag_three_elem_hf += two_third * exchange_int_231
-          enddo
-        enddo
-      enddo
-      do mm = 1, 3
-        do ipoint = 1, n_points_final_grid
-          weight  = final_weight_at_r_vector(ipoint)                                                                          
-          contrib = 3.d0 * fock_3_w_kk_sum(ipoint,mm) * fock_3_rho_beta(ipoint) * fock_3_w_kk_sum(ipoint,mm) & 
-                  - 2.d0 * fock_3_w_kl_mo_k_mo_l(ipoint,mm) * fock_3_w_kk_sum(ipoint,mm)                     & 
-                  - 1.d0 * fock_3_rho_beta(ipoint) * fock_3_w_kl_w_kl(ipoint,mm)
-          contrib *= four_third
-          contrib += -two_third  * fock_3_rho_beta(ipoint)    * fock_3_w_kl_w_kl(ipoint,mm) & 
-                     -four_third * fock_3_w_kk_sum(ipoint,mm) * fock_3_w_kl_mo_k_mo_l(ipoint,mm)
-          diag_three_elem_hf += weight * contrib
-       enddo
-      enddo
-
-      diag_three_elem_hf = - diag_three_elem_hf
-
-      ! ---
-
-    else
-
-      ! ------------
-      ! SLOW VERSION
-      ! ------------
-
-      !call give_aaa_contrib(integral_aaa)
-      !call give_aab_contrib(integral_aab)
-      !call give_abb_contrib(integral_abb)
-      !call give_bbb_contrib(integral_bbb)
-      !diag_three_elem_hf = integral_aaa + integral_aab + integral_abb + integral_bbb
-
-      ! ------------
-      ! ------------
-
-      PROVIDE int2_grad1_u12_bimo_t
-      PROVIDE mos_l_in_r_array_transp
-      PROVIDE mos_r_in_r_array_transp
-
-      if(elec_alpha_num .eq. elec_beta_num) then
-    
-        allocate(tmp(elec_beta_num))
-        allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
-    
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT(NONE)                                            &
-        !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                      &
-        !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-        !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-    
-        !$OMP DO
-        do j = 1, elec_beta_num
-    
-          tmp_L = 0.d0
-          tmp_R = 0.d0
-          do i = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          tmp(j) = 0.d0
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-        enddo ! j
-        !$OMP END DO
-        !$OMP END PARALLEL
-    
-        diag_three_elem_hf = -2.d0 * sum(tmp)
-    
-        deallocate(tmp)
-        deallocate(tmp_L, tmp_R)
-    
-        ! ---
-    
-        allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
-        tmp_O = 0.d0
-        tmp_J = 0.d0
-    
-        !$OMP PARALLEL                                                  &
-        !$OMP DEFAULT(NONE)                                             &
-        !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                &
-        !$OMP SHARED(elec_beta_num, n_points_final_grid,                & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,  &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
-    
-        allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
-        tmp_O_priv = 0.d0
-        tmp_J_priv = 0.d0
-      
-        !$OMP DO 
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-            tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
-            tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
-            tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_O = tmp_O + tmp_O_priv
-        tmp_J = tmp_J + tmp_J_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_O_priv, tmp_J_priv)
-        !$OMP END PARALLEL
-    
-        allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
-        tmp_M = 0.d0
-        tmp_S = 0.d0
-    
-        !$OMP PARALLEL                                                 &
-        !$OMP DEFAULT(NONE)                                            &
-        !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)            &
-        !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
-    
-        allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
-        tmp_M_priv = 0.d0
-        tmp_S_priv = 0.d0
-      
-        !$OMP DO COLLAPSE(2)
-        do i = 1, elec_beta_num
-          do j = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_M = tmp_M + tmp_M_priv
-        tmp_S = tmp_S + tmp_S_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_M_priv, tmp_S_priv)
-        !$OMP END PARALLEL
-    
-        allocate(tmp(n_points_final_grid))
-    
-        do ipoint = 1, n_points_final_grid
-    
-          tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
-    
-          tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                           - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                    + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                    + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
-        enddo
-    
-        diag_three_elem_hf = diag_three_elem_hf -2.d0 * (sum(tmp))
-    
-        deallocate(tmp)
-    
-      else
-    
-        allocate(tmp(elec_alpha_num))
-        allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-    
-        !$OMP DO
-        do j = 1, elec_beta_num
-    
-          tmp_L = 0.d0
-          tmp_R = 0.d0
-          do i = elec_beta_num+1, elec_alpha_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          tmp(j) = 0.d0
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-    
-          do i = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-        enddo ! j
-        !$OMP END DO
-        !$OMP END PARALLEL
-    
-        ! ---
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-    
-        !$OMP DO
-        do j = elec_beta_num+1, elec_alpha_num
-    
-          tmp_L = 0.d0
-          tmp_R = 0.d0
-          do i = 1, elec_alpha_num
-            do ipoint = 1, n_points_final_grid
-              tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-              tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-    
-              tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            enddo
-          enddo
-    
-          tmp(j) = 0.d0
-          do ipoint = 1, n_points_final_grid
-            tmp(j) = tmp(j) + 0.5d0 * final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-          enddo
-        enddo ! j
-        !$OMP END DO
-        !$OMP END PARALLEL
-    
-        diag_three_elem_hf = -2.d0 * sum(tmp)
-    
-        deallocate(tmp)
-        deallocate(tmp_L, tmp_R)
-    
-        ! ---
-    
-        allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
-        tmp_O = 0.d0
-        tmp_J = 0.d0
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                 &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
-    
-        allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
-        tmp_O_priv = 0.d0
-        tmp_J_priv = 0.d0
-      
-        !$OMP DO 
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-            tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
-            tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
-            tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP DO 
-        do i = elec_beta_num+1, elec_alpha_num
-          do ipoint = 1, n_points_final_grid
-            tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + 0.5d0 * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,i)
-            tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,i)
-            tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,i)
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_O = tmp_O + tmp_O_priv
-        tmp_J = tmp_J + tmp_J_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_O_priv, tmp_J_priv)
-        !$OMP END PARALLEL
-    
-        ! ---
-    
-        allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
-        tmp_M = 0.d0
-        tmp_S = 0.d0
-    
-        !$OMP PARALLEL                                                   &
-        !$OMP DEFAULT(NONE)                                              &
-        !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)              &
-        !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-        !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-        !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
-    
-        allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
-        tmp_M_priv = 0.d0
-        tmp_S_priv = 0.d0
-      
-        !$OMP DO COLLAPSE(2)
-        do i = 1, elec_beta_num
-          do j = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP DO COLLAPSE(2)
-        do i = elec_beta_num+1, elec_alpha_num
-          do j = 1, elec_beta_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP DO COLLAPSE(2)
-        do i = elec_beta_num+1, elec_alpha_num
-          do j = elec_beta_num+1, elec_alpha_num
-            do ipoint = 1, n_points_final_grid
-    
-              tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-              tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-    
-              tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                          + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                          + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-            enddo
-          enddo
-        enddo
-        !$OMP END DO NOWAIT
-    
-        !$OMP CRITICAL
-        tmp_M = tmp_M + tmp_M_priv
-        tmp_S = tmp_S + tmp_S_priv
-        !$OMP END CRITICAL
-    
-        deallocate(tmp_M_priv, tmp_S_priv)
-        !$OMP END PARALLEL
-    
-        allocate(tmp(n_points_final_grid))
-    
-        do ipoint = 1, n_points_final_grid
-    
-          tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
-    
-          tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                           - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                    + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                    + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
-        enddo
-    
-        diag_three_elem_hf = diag_three_elem_hf - 2.d0 * (sum(tmp))
-    
-        deallocate(tmp)
-    
-      endif
-
-
-    endif
-
-  endif
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, fock_3_mat_a_op_sh, (mo_num, mo_num)]
- implicit none 
- integer :: h,p,i,j
- double precision :: direct_int, exch_int, exchange_int_231, exchange_int_312
- double precision :: exchange_int_23, exchange_int_12, exchange_int_13 
-
- fock_3_mat_a_op_sh = 0.d0
- do h = 1, mo_num
-  do p = 1, mo_num
-   !F_a^{ab}(h,p) 
-   do i = 1, elec_beta_num ! beta 
-    do j = elec_beta_num+1, elec_alpha_num ! alpha
-     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)    ! <hji|pji>
-     call  give_integrals_3_body(h,j,i,j,p,i,exch_int)   
-     fock_3_mat_a_op_sh(h,p) -= direct_int - exch_int
-    enddo
-   enddo
-   !F_a^{aa}(h,p)
-   do i = 1, elec_beta_num ! alpha 
-    do j = elec_beta_num+1, elec_alpha_num ! alpha
-       call  give_integrals_3_body(h,j,i,p,j,i,direct_int) 
-       call  give_integrals_3_body(h,j,i,i,p,j,exchange_int_231)
-       call  give_integrals_3_body(h,j,i,j,i,p,exchange_int_312) 
-       call  give_integrals_3_body(h,j,i,p,i,j,exchange_int_23) 
-       call  give_integrals_3_body(h,j,i,i,j,p,exchange_int_12)
-       call  give_integrals_3_body(h,j,i,j,p,i,exchange_int_13)  
-       fock_3_mat_a_op_sh(h,p) -= ( direct_int + exchange_int_231 + exchange_int_312 & 
-              -  exchange_int_23 & ! i <-> j
-              -  exchange_int_12 & ! p <-> j
-              -  exchange_int_13  )! p <-> i
-    enddo 
-   enddo
-  enddo
- enddo
-! symmetrized 
-! do p = 1, elec_beta_num
-!  do h = elec_alpha_num +1, mo_num
-!   fock_3_mat_a_op_sh(h,p) = fock_3_mat_a_op_sh(p,h)
-!  enddo
-! enddo
- 
-! do h = elec_beta_num+1, elec_alpha_num
-!  do p = elec_alpha_num +1, mo_num
-!   !F_a^{bb}(h,p) 
-!   do i = 1, elec_beta_num
-!    do j = i+1, elec_beta_num
-!     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)   
-!     call  give_integrals_3_body(h,j,i,p,i,j,exch_int)   
-!     fock_3_mat_a_op_sh(h,p) -= direct_int - exch_int
-!    enddo
-!   enddo
-!  enddo
-! enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_mat_b_op_sh, (mo_num, mo_num)]
- implicit none 
- integer :: h,p,i,j
- double precision :: direct_int, exch_int
- fock_3_mat_b_op_sh = 0.d0
- do h = 1, elec_beta_num
-  do p = elec_alpha_num +1, mo_num
-   !F_b^{aa}(h,p) 
-   do i = 1, elec_beta_num
-    do j = elec_beta_num+1, elec_alpha_num
-     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)   
-     call  give_integrals_3_body(h,j,i,p,i,j,exch_int)   
-     fock_3_mat_b_op_sh(h,p) += direct_int - exch_int
-    enddo
-   enddo
-
-   !F_b^{ab}(h,p) 
-   do i = elec_beta_num+1, elec_beta_num
-    do j = 1, elec_beta_num
-     call  give_integrals_3_body(h,j,i,p,j,i,direct_int)   
-     call  give_integrals_3_body(h,j,i,j,p,i,exch_int)   
-     fock_3_mat_b_op_sh(h,p) += direct_int - exch_int
-    enddo
-   enddo
- 
-  enddo
- enddo
-
-END_PROVIDER 
-
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kk_sum, (n_points_final_grid,3)]
- implicit none
- integer :: mm, ipoint,k
- double precision :: w_kk
- fock_3_w_kk_sum = 0.d0
- do k = 1, elec_beta_num
-  do mm = 1, 3
-   do ipoint = 1, n_points_final_grid
-    w_kk   = x_W_ij_erf_rk(ipoint,mm,k,k) 
-    fock_3_w_kk_sum(ipoint,mm) += w_kk
-   enddo
-  enddo
- enddo
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_ki_mos_k, (n_points_final_grid,3,mo_num)]
- implicit none
- integer :: mm, ipoint,k,i
- double precision :: w_ki, mo_k
- fock_3_w_ki_mos_k = 0.d0
- do i = 1, mo_num
-  do k = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     w_ki   = x_W_ij_erf_rk(ipoint,mm,k,i) 
-     mo_k = mos_in_r_array(k,ipoint)
-     fock_3_w_ki_mos_k(ipoint,mm,i) += w_ki * mo_k
-    enddo
-   enddo
-  enddo
- enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kl_w_kl, (n_points_final_grid,3)]
- implicit none
- integer :: k,j,ipoint,mm
- double precision :: w_kj
- fock_3_w_kl_w_kl = 0.d0
- do j = 1, elec_beta_num
-  do k = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     w_kj   = x_W_ij_erf_rk(ipoint,mm,k,j) 
-     fock_3_w_kl_w_kl(ipoint,mm) += w_kj * w_kj
-    enddo
-   enddo
-  enddo
- enddo
-
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_rho_beta, (n_points_final_grid)]
- implicit none
- integer :: ipoint,k
- fock_3_rho_beta = 0.d0
- do ipoint = 1, n_points_final_grid
-  do k = 1, elec_beta_num
-   fock_3_rho_beta(ipoint) += mos_in_r_array(k,ipoint) * mos_in_r_array(k,ipoint)
-  enddo
- enddo
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kl_mo_k_mo_l, (n_points_final_grid,3)]
- implicit none
- integer :: ipoint,k,l,mm
- double precision :: mos_k, mos_l, w_kl
- fock_3_w_kl_mo_k_mo_l = 0.d0
- do k = 1, elec_beta_num
-  do l = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     mos_k  = mos_in_r_array_transp(ipoint,k) 
-     mos_l  = mos_in_r_array_transp(ipoint,l) 
-     w_kl   = x_W_ij_erf_rk(ipoint,mm,l,k)
-     fock_3_w_kl_mo_k_mo_l(ipoint,mm) += w_kl * mos_k * mos_l 
-    enddo
-   enddo
-  enddo
- enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_ki_wk_a, (n_points_final_grid,3,mo_num, mo_num)]
- implicit none
- integer :: ipoint,i,a,k,mm
- double precision :: w_ki,w_ka
- fock_3_w_ki_wk_a = 0.d0
- do i = 1, mo_num
-  do a = 1, mo_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     do k = 1, elec_beta_num
-      w_ki   = x_W_ij_erf_rk(ipoint,mm,k,i)
-      w_ka   = x_W_ij_erf_rk(ipoint,mm,k,a)
-      fock_3_w_ki_wk_a(ipoint,mm,a,i) += w_ki * w_ka
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_trace_w_tilde, (n_points_final_grid,3)]
- implicit none
- integer :: ipoint,k,mm
- fock_3_trace_w_tilde = 0.d0
- do k = 1, elec_beta_num
-   do mm = 1, 3
-    do ipoint = 1, n_points_final_grid
-     fock_3_trace_w_tilde(ipoint,mm) += fock_3_w_ki_wk_a(ipoint,mm,k,k)
-    enddo
-   enddo
- enddo
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, fock_3_w_kl_wla_phi_k, (n_points_final_grid,3,mo_num)]
- implicit none
- integer :: ipoint,a,k,mm,l
- double precision :: w_kl,w_la, mo_k
- fock_3_w_kl_wla_phi_k = 0.d0
- do a = 1, mo_num
-  do k = 1, elec_beta_num 
-   do l = 1, elec_beta_num
-    do mm = 1, 3
-     do ipoint = 1, n_points_final_grid
-      w_kl   = x_W_ij_erf_rk(ipoint,mm,l,k)
-      w_la   = x_W_ij_erf_rk(ipoint,mm,l,a)
-      mo_k  = mos_in_r_array_transp(ipoint,k) 
-      fock_3_w_kl_wla_phi_k(ipoint,mm,a) += w_kl * w_la * mo_k
-     enddo
-    enddo
-   enddo
-  enddo
- enddo
-END_PROVIDER 
-
-
-
-
-
diff --git a/plugins/local/tc_scf/integrals_in_r_stuff.irp.f b/plugins/local/tc_scf/integrals_in_r_stuff.irp.f
deleted file mode 100644
index 3ce85a97..00000000
--- a/plugins/local/tc_scf/integrals_in_r_stuff.irp.f
+++ /dev/null
@@ -1,391 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, tc_scf_dm_in_r, (n_points_final_grid) ]
-
-  implicit none
-  integer :: i, j
-
-  tc_scf_dm_in_r = 0.d0
-  do i = 1, n_points_final_grid
-    do j = 1, elec_beta_num
-      tc_scf_dm_in_r(i) += mos_r_in_r_array(j,i) * mos_l_in_r_array(j,i)
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, w_sum_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: ipoint, j, xi
-
-  w_sum_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        !w_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,j)
-        w_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j)
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, ww_sum_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer          :: ipoint, j, xi
-  double precision :: tmp
-
-  ww_sum_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        tmp = x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j)
-        ww_sum_in_r(ipoint,xi) += tmp * tmp
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_r_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_r_in_r = 0.d0
-  do i = 1, mo_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_r_in_r(ipoint,xi,i) += mos_r_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_l_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_l_in_r = 0.d0
-  do i = 1, mo_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_l_in_r(ipoint,xi,i) += mos_l_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        W1_in_r(ipoint,xi) += W1_l_in_r(ipoint,xi,j) * mos_r_in_r_array_transp(ipoint,j)
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_diag_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_diag_in_r = 0.d0
-  do j = 1, elec_beta_num
-    do xi = 1, 3
-      do ipoint = 1, n_points_final_grid
-        W1_diag_in_r(ipoint,xi) += mos_r_in_r_array_transp(ipoint,j) * mos_l_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j)
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, v_sum_in_r, (n_points_final_grid, 3)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-  v_sum_in_r = 0.d0
-  do i = 1, elec_beta_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          v_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_W1_r_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, m, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_W1_r_in_r = 0.d0
-  do i = 1, mo_num
-    do m = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_W1_r_in_r(ipoint,xi,i) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,m,i) * W1_r_in_r(ipoint,xi,m)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, W1_W1_l_in_r, (n_points_final_grid, 3, mo_num)]
-
-  implicit none
-  integer :: i, j, xi, ipoint
-
-  ! TODO: call lapack
-
-  W1_W1_l_in_r = 0.d0
-  do i = 1, mo_num
-    do j = 1, elec_beta_num
-      do xi = 1, 3
-        do ipoint = 1, n_points_final_grid
-          W1_W1_l_in_r(ipoint,xi,i) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) * W1_l_in_r(ipoint,xi,j)
-        enddo
-      enddo
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-subroutine direct_term_imj_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | i m j > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight, tmp
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-      !integral += ( mos_l_in_r_array(a,ipoint) * mos_r_in_r_array(i,ipoint) * w_sum_in_r(ipoint,xi) * w_sum_in_r(ipoint,xi) & 
-      !            + 2.d0 * tc_scf_dm_in_r(ipoint) * w_sum_in_r(ipoint,xi) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) ) * weight
-
-      tmp = w_sum_in_r(ipoint,xi)
-
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * tmp * tmp & 
-                  + 2.d0 * tc_scf_dm_in_r(ipoint) * tmp * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i)       &
-                  ) * weight
-    enddo
-  enddo
-
-end 
-
-! ---
-
-subroutine exch_term_jmi_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | j m i > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi, j
-  double precision              :: weight, tmp
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      tmp = 0.d0
-      do j = 1, elec_beta_num
-        tmp = tmp + x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i) 
-      enddo
-
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_r_in_r(ipoint,xi,i) * w_sum_in_r(ipoint,xi) & 
-                  + tc_scf_dm_in_r(ipoint) * tmp                                                       &
-                  + mos_r_in_r_array_transp(ipoint,i) * W1_l_in_r(ipoint,xi,a) * w_sum_in_r(ipoint,xi) & 
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
-subroutine exch_term_ijm_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | i j m > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * v_sum_in_r(ipoint,xi) & 
-                  + 2.d0 * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) * W1_in_r(ipoint,xi)                             &
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
-subroutine direct_term_ijj_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j = 1, elec_beta_num) < a j j | i j j > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * ww_sum_in_r(ipoint,xi) & 
-                  + 2.d0 * W1_diag_in_r(ipoint, xi) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i)                        &
-                  ) * weight
-    enddo
-  enddo
-
-end 
-
-! ---
-
-subroutine cyclic_term_jim_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | j i m > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_W1_r_in_r(ipoint,xi,i)  & 
-                  + W1_W1_l_in_r(ipoint,xi,a) * mos_r_in_r_array_transp(ipoint,i)  &
-                  + W1_l_in_r(ipoint,xi,a) * W1_r_in_r(ipoint,xi,i)                &
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
-subroutine cyclic_term_mji_bi_ortho(a, i, integral)
-
-  BEGIN_DOC
-  ! computes sum_(j,m = 1, elec_beta_num) < a m j | m j i > with bi ortho mos
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, a
-  double precision, intent(out) :: integral
-
-  integer                       :: ipoint, xi
-  double precision              :: weight
-
-  integral = 0.d0
-  do xi = 1, 3
-    do ipoint = 1, n_points_final_grid
-      weight = final_weight_at_r_vector(ipoint)
-        
-      integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_W1_r_in_r(ipoint,xi,i)  & 
-                  + W1_l_in_r(ipoint,xi,a) * W1_r_in_r(ipoint,xi,i)                &
-                  + W1_W1_l_in_r(ipoint,xi,a) * mos_r_in_r_array_transp(ipoint,i)  &
-                  ) * weight
-
-    enddo
-  enddo
-
-end
-
-! ---
-
diff --git a/plugins/local/tc_scf/jast_schmos_90.irp.f b/plugins/local/tc_scf/jast_schmos_90.irp.f
deleted file mode 100644
index 5c5e625f..00000000
--- a/plugins/local/tc_scf/jast_schmos_90.irp.f
+++ /dev/null
@@ -1,318 +0,0 @@
- BEGIN_PROVIDER [integer ,  m_max_sm_7]
-&BEGIN_PROVIDER [integer ,  n_max_sm_7]
-&BEGIN_PROVIDER [integer ,  o_max_sm_7]
- implicit none
- BEGIN_DOC
-! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) 
-! of Schmidt,Moskowitz, JCP, 93, 4172 (1990)  for the SM_7 version of Table IV
- END_DOC
- m_max_sm_7 = 4
- n_max_sm_7 = 0
- o_max_sm_7 = 4
-END_PROVIDER 
-
- BEGIN_PROVIDER [integer ,  m_max_sm_9]
-&BEGIN_PROVIDER [integer ,  n_max_sm_9]
-&BEGIN_PROVIDER [integer ,  o_max_sm_9]
- implicit none
- BEGIN_DOC
-! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) 
-! of Schmidt,Moskowitz, JCP, 93, 4172 (1990)  for the SM_9 version of Table IV
- END_DOC
- m_max_sm_9 = 4
- n_max_sm_9 = 2
- o_max_sm_9 = 4
-END_PROVIDER 
-
-
- BEGIN_PROVIDER [integer ,  m_max_sm_17]
-&BEGIN_PROVIDER [integer ,  n_max_sm_17]
-&BEGIN_PROVIDER [integer ,  o_max_sm_17]
- implicit none
- BEGIN_DOC
-! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) 
-! of Schmidt,Moskowitz, JCP, 93, 4172 (1990)  for the SM_17 version of Table IV
- END_DOC
- m_max_sm_17 = 6
- n_max_sm_17 = 2
- o_max_sm_17 = 6
-END_PROVIDER 
-
-
-BEGIN_PROVIDER [ double precision, c_mn_o_sm_7, (0:m_max_sm_7,0:n_max_sm_7,0:o_max_sm_7,2:10)]
- implicit none
- BEGIN_DOC
- !
- !c_mn_o_7(0:4,0:4,2:10) = coefficient for the SM_7 correlation factor as given is Table IV of 
- !                         Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- !                         the first index (0:4) is the "m"  integer for the 1e part 
- !                         the second index(0:0) is the "n"  integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_7
- !                         the third index (0:4) is the "o" integer for the 2e part
- !                         the fourth index (2:10) is the nuclear charge of the atom 
- END_DOC
- c_mn_o_sm_7 = 0.d0
- integer :: i
- do i = 2, 10 ! loop over nuclear charge
-  c_mn_o_sm_7(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition
- enddo
- ! He atom 
- ! two electron terms 
- c_mn_o_sm_7(0,0,2,2) =  0.50516d0 
- c_mn_o_sm_7(0,0,3,2) = -0.19313d0 
- c_mn_o_sm_7(0,0,4,2) =  0.30276d0 
- ! one-electron terms 
- c_mn_o_sm_7(2,0,0,2) = -0.16995d0
- c_mn_o_sm_7(3,0,0,2) = -0.34505d0
- c_mn_o_sm_7(4,0,0,2) = -0.54777d0
- ! Ne atom 
- ! two electron terms 
- c_mn_o_sm_7(0,0,2,10) = -0.792d0
- c_mn_o_sm_7(0,0,3,10) =  1.05232d0 
- c_mn_o_sm_7(0,0,4,10) = -0.65615d0 
- ! one-electron terms 
- c_mn_o_sm_7(2,0,0,10) = -0.13312d0
- c_mn_o_sm_7(3,0,0,10) = -0.00131d0
- c_mn_o_sm_7(4,0,0,10) =  0.09083d0
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, c_mn_o_sm_9, (0:m_max_sm_9,0:n_max_sm_9,0:o_max_sm_9,2:10)]
- implicit none
- BEGIN_DOC
- !
- !c_mn_o_9(0:4,0:4,2:10) = coefficient for the SM_9 correlation factor as given is Table IV of 
- !                         Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- !                         the first index (0:4) is the "m"  integer for the 1e part 
- !                         the second index(0:0) is the "n"  integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_9
- !                         the third index (0:4) is the "o" integer for the 2e part
- !                         the fourth index (2:10) is the nuclear charge of the atom 
- END_DOC
- c_mn_o_sm_9 = 0.d0
- integer :: i
- do i = 2, 10 ! loop over nuclear charge
-  c_mn_o_sm_9(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition
- enddo
- ! He atom 
- ! two electron terms 
- c_mn_o_sm_9(0,0,2,2) =  0.50516d0 
- c_mn_o_sm_9(0,0,3,2) = -0.19313d0 
- c_mn_o_sm_9(0,0,4,2) =  0.30276d0 
- ! one-electron terms 
- c_mn_o_sm_9(2,0,0,2) = -0.16995d0
- c_mn_o_sm_9(3,0,0,2) = -0.34505d0
- c_mn_o_sm_9(4,0,0,2) = -0.54777d0
- ! Ne atom 
- ! two electron terms 
- c_mn_o_sm_9(0,0,2,10) = -0.792d0
- c_mn_o_sm_9(0,0,3,10) =  1.05232d0 
- c_mn_o_sm_9(0,0,4,10) = -0.65615d0 
- ! one-electron terms 
- c_mn_o_sm_9(2,0,0,10) = -0.13312d0
- c_mn_o_sm_9(3,0,0,10) = -0.00131d0
- c_mn_o_sm_9(4,0,0,10) =  0.09083d0
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, c_mn_o_sm_17, (0:m_max_sm_17,0:n_max_sm_17,0:o_max_sm_17,2:10)]
- implicit none
- BEGIN_DOC
- !
- !c_mn_o_17(0:4,0:4,2:10) = coefficient for the SM_17 correlation factor as given is Table IV of 
- !                         Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- !                         the first index (0:4) is the "m"  integer for the 1e part 
- !                         the second index(0:0) is the "n"  integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_17
- !                         the third index (0:4) is the "o" integer for the 2e part
- !                         the fourth index (2:10) is the nuclear charge of the atom 
- END_DOC
- c_mn_o_sm_17 = 0.d0
- integer :: i
- do i = 2, 10 ! loop over nuclear charge
-  c_mn_o_sm_17(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition
- enddo
- ! He atom 
- ! two electron terms 
- c_mn_o_sm_17(0,0,2,2) =  0.09239d0 
- c_mn_o_sm_17(0,0,3,2) = -0.38664d0 
- c_mn_o_sm_17(0,0,4,2) =  0.95764d0 
- ! one-electron terms 
- c_mn_o_sm_17(2,0,0,2) =  0.23208d0
- c_mn_o_sm_17(3,0,0,2) = -0.45032d0
- c_mn_o_sm_17(4,0,0,2) =  0.82777d0
- c_mn_o_sm_17(2,2,0,2) = -4.15388d0
- ! ee-n terms 
- c_mn_o_sm_17(2,0,2,2) =  0.80622d0
- c_mn_o_sm_17(2,2,2,2) = 10.19704d0
- c_mn_o_sm_17(4,0,2,2) = -4.96259d0
- c_mn_o_sm_17(2,0,4,2) = -1.35647d0
- c_mn_o_sm_17(4,2,2,2) = -5.90907d0
- c_mn_o_sm_17(6,0,2,2) =  0.90343d0
- c_mn_o_sm_17(4,0,4,2) =  5.50739d0
- c_mn_o_sm_17(2,2,4,2) = -0.03154d0
- c_mn_o_sm_17(2,0,6,2) = -1.1051860
-
-
- ! Ne atom 
- ! two electron terms 
- c_mn_o_sm_17(0,0,2,10) = -0.80909d0 
- c_mn_o_sm_17(0,0,3,10) = -0.00219d0 
- c_mn_o_sm_17(0,0,4,10) =  0.59188d0
- ! one-electron terms 
- c_mn_o_sm_17(2,0,0,10) = -0.00567d0
- c_mn_o_sm_17(3,0,0,10) =  0.14011d0
- c_mn_o_sm_17(4,0,0,10) = -0.05671d0
- c_mn_o_sm_17(2,2,0,10) = -3.33767d0
- ! ee-n terms 
- c_mn_o_sm_17(2,0,2,10) =  1.95067d0
- c_mn_o_sm_17(2,2,2,10) =  6.83340d0
- c_mn_o_sm_17(4,0,2,10) = -3.29231d0
- c_mn_o_sm_17(2,0,4,10) = -2.44998d0
- c_mn_o_sm_17(4,2,2,10) = -2.13029d0
- c_mn_o_sm_17(6,0,2,10) =  2.25768d0
- c_mn_o_sm_17(4,0,4,10) =  1.97951d0
- c_mn_o_sm_17(2,2,4,10) = -2.0924160
- c_mn_o_sm_17(2,0,6,10) =  0.35493d0
-
-END_PROVIDER 
-
- BEGIN_PROVIDER [ double precision, b_I_sm_90,(2:10)]
-&BEGIN_PROVIDER [ double precision, d_I_sm_90,(2:10)]
- implicit none
- BEGIN_DOC
-! "b_I" and "d_I" parameters of Eqs. (4) and (5) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- END_DOC
- b_I_sm_90 = 1.d0
- d_I_sm_90 = 1.d0
- 
-END_PROVIDER 
-
-subroutine get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
- implicit none 
- double precision, intent(in) :: r1(3),r2(3),rI(3)
- integer, intent(in)          :: sm_j, i_charge 
- double precision, intent(out):: j_1e,j_2e,j_een,j_tot
- BEGIN_DOC
- ! Jastrow function as in Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- ! the i_charge variable is the integer specifying the charge of the atom for the Jastrow 
- ! the sm_j integer variable represents the "quality" of the jastrow : sm_j = 7, 9, 17
- END_DOC 
- double precision :: r_inucl,r_jnucl,r_ij,b_I, d_I
- b_I = b_I_sm_90(i_charge)
- d_I = d_I_sm_90(i_charge)
- call get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij)
- call jastrow_func_sm_90(r_inucl,r_jnucl,r_ij,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-end
-
-subroutine get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij)
- implicit none
- BEGIN_DOC
- ! rescaled variables of Eq. (5) and (6) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- ! the "b_I" and "d_I" parameters are the same as in Eqs. (5) and (6) 
- END_DOC
- double precision, intent(in) :: r1(3),r2(3),rI(3)
- double precision, intent(in) :: b_I, d_I
- double precision, intent(out):: r_inucl,r_jnucl,r_ij
- double precision :: rin, rjn, rij
- integer :: i
- rin = 0.d0
- rjn = 0.d0
- rij = 0.d0
- do i = 1,3
-  rin += (r1(i) - rI(i)) * (r1(i) - rI(i))
-  rjn += (r2(i) - rI(i)) * (r2(i) - rI(i))
-  rij += (r2(i) - r1(i)) * (r2(i) - r1(i))
- enddo
- rin = dsqrt(rin)
- rjn = dsqrt(rjn)
- rij = dsqrt(rij)
- r_inucl = b_I * rin/(1.d0 + b_I * rin)
- r_jnucl = b_I * rjn/(1.d0 + b_I * rjn)
- r_ij    = d_I * rij/(1.d0 + b_I * rij)
-end
-
-subroutine jastrow_func_sm_90(r_inucl,r_jnucl,r_ij,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
- implicit none
- BEGIN_DOC
- ! Jastrow function as in Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
- ! Here the r_inucl, r_jnucl are the rescaled variables as defined in Eq. (5) with "b_I" 
- !          r_ij is the rescaled variable as defined in Eq. (6) with "d_I" 
- ! the i_charge variable is the integer specifying the charge of the atom for the Jastrow 
- ! the sm_j integer variable represents the "quality" of the jastrow : sm_j = 7, 9, 17
- !
- ! it returns the j_1e  : sum of terms with "o" = "n" = 0, "m" /= 0,
- !                j_2e  : sum of terms with "m" = "n" = 0, "o" /= 0,
- !                j_een : sum of terms with "m" /=0, "n" /= 0, "o" /= 0,
- !                j_tot : the total sum 
- END_DOC
- double precision, intent(in) :: r_inucl,r_jnucl,r_ij
- integer, intent(in)          :: sm_j,i_charge
- double precision, intent(out):: j_1e,j_2e,j_een,j_tot
- j_1e  = 0.D0
- j_2e  = 0.D0
- j_een = 0.D0
- double precision :: delta_mn,jastrow_sm_90_atomic
- integer :: m,n,o
-BEGIN_TEMPLATE
- ! pure 2e part 
- n = 0
- m = 0 
- if(sm_j == $X )then
-  do o = 1, o_max_sm_$X
-   if(dabs(c_mn_o_sm_$X(m,n,o,i_charge)).lt.1.d-10)cycle 
-   j_2e += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
-  enddo
-! else 
-!  print*,'sm_j = ',sm_j
-!  print*,'not implemented, stop'
-!  stop
- endif
- ! pure one-e part 
- o = 0 
- if(sm_j == $X)then
-  do n = 2, n_max_sm_$X
-   do m = 2, m_max_sm_$X
-    j_1e += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
-   enddo
-  enddo
-! else 
-!  print*,'sm_j = ',sm_j
-!  print*,'not implemented, stop'
-!  stop
- endif
- ! e-e-n part 
- if(sm_j == $X)then
-  do o = 1, o_max_sm_$X
-   do m = 2, m_max_sm_$X
-    do n = 2, n_max_sm_$X
-     j_een += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
-    enddo
-   enddo
-  enddo
- else 
-!  print*,'sm_j = ',sm_j
-!  print*,'not implemented, stop'
-!  stop
- endif
- j_tot = j_1e + j_2e + j_een
-SUBST [ X]
-  7 ;; 
-  9 ;; 
-  17 ;;
-END_TEMPLATE 
-end
-
-double precision function jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij)
- implicit none
- BEGIN_DOC
-! contribution to the function of Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990)
-! for a given m,n,o and atom 
- END_DOC
- double precision, intent(in) :: r_inucl,r_jnucl,r_ij
- integer         , intent(in) :: m,n,o,i_charge
- double precision :: delta_mn 
- if(m==n)then
-  delta_mn = 0.5d0
- else
-  delta_mn = 1.D0
- endif
- jastrow_sm_90_atomic = delta_mn * (r_inucl**m * r_jnucl**n + r_jnucl**m * r_inucl**n)*r_ij**o
-end
diff --git a/plugins/local/tc_scf/plot_j_schMos.irp.f b/plugins/local/tc_scf/plot_j_schMos.irp.f
deleted file mode 100644
index eda0dd25..00000000
--- a/plugins/local/tc_scf/plot_j_schMos.irp.f
+++ /dev/null
@@ -1,69 +0,0 @@
-program plot_j
- implicit none
- double precision :: r1(3),rI(3),r2(3)
- double precision :: r12,dx,xmax, j_1e,j_2e,j_een,j_tot
- double precision :: j_mu_F_x_j
- integer :: i,nx,m,i_charge,sm_j
-
- character*(128) :: output
- integer :: i_unit_output_He_sm_7,i_unit_output_Ne_sm_7
- integer :: i_unit_output_He_sm_17,i_unit_output_Ne_sm_17
- integer :: getUnitAndOpen
- output='J_SM_7_He'
- i_unit_output_He_sm_7 = getUnitAndOpen(output,'w')
- output='J_SM_7_Ne'
- i_unit_output_Ne_sm_7 = getUnitAndOpen(output,'w')
-
- output='J_SM_17_He'
- i_unit_output_He_sm_17 = getUnitAndOpen(output,'w')
- output='J_SM_17_Ne'
- i_unit_output_Ne_sm_17 = getUnitAndOpen(output,'w')
-
- rI = 0.d0
- r1 = 0.d0
- r2 = 0.d0
- r1(1) = 1.5d0
- xmax = 20.d0
- r2(1) = -xmax*0.5d0
- nx = 1000
- dx = xmax/dble(nx)
- do i = 1, nx
-  r12 = 0.d0
-  do m = 1, 3
-   r12 += (r1(m) - r2(m))*(r1(m) - r2(m))
-  enddo
-  r12 = dsqrt(r12)
-  double precision :: jmu,env_nucl,jmu_env,jmu_scaled, jmu_scaled_env
-  double precision :: b_I,d_I,r_inucl,r_jnucl,r_ij
-  b_I = 1.D0
-  d_I = 1.D0
-  call get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij)
-  jmu=j_mu_F_x_j(r12)
-  jmu_scaled=j_mu_F_x_j(r_ij)
-  jmu_env = jmu * env_nucl(r1) * env_nucl(r2)
-!  jmu_scaled_env= jmu_scaled * (1.d0 - env_coef(1) * dexp(-env_expo(1)*r_inucl**2)) * (1.d0 - env_coef(1) * dexp(-env_expo(1)*r_jnucl**2))
-  jmu_scaled_env= jmu_scaled * env_nucl(r1) * env_nucl(r2)
-  ! He 
-  i_charge = 2
-  ! SM 7 Jastrow 
-  sm_j = 7
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_He_sm_7,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  ! SM 17 Jastrow 
-  sm_j = 17
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_He_sm_17,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  ! Ne 
-  i_charge = 10
-  ! SM 7 Jastrow 
-  sm_j = 7
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_Ne_sm_7,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  ! SM 17 Jastrow 
-  sm_j = 17
-  call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot)
-  write(i_unit_output_Ne_sm_17,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env
-  r2(1) += dx
- enddo
- 
-end
diff --git a/plugins/local/tc_scf/print_fit_param.irp.f b/plugins/local/tc_scf/print_fit_param.irp.f
deleted file mode 100644
index e62f0dde..00000000
--- a/plugins/local/tc_scf/print_fit_param.irp.f
+++ /dev/null
@@ -1,59 +0,0 @@
-program print_fit_param
-
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  !call create_guess
-  !call orthonormalize_mos
-
-  call main()
-
-end
-
-! ---
-
-subroutine main()
-
-  implicit none
-  integer :: i
-
-  mu_erf = 1.d0
-  touch mu_erf
-
-  print *, ' fit for (1 - erf(x))^2'
-  do i = 1, n_max_fit_slat
-    print*, expo_gauss_1_erf_x_2(i), coef_gauss_1_erf_x_2(i)
-  enddo
-
-  print *, ''
-  print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)]'
-  do i = 1, n_max_fit_slat
-    print *, expo_gauss_j_mu_x(i), 2.d0 * coef_gauss_j_mu_x(i)
-  enddo
-
-  print *, ''
-  print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)]^2'
-  do i = 1, n_max_fit_slat
-    print *, expo_gauss_j_mu_x_2(i), 4.d0 * coef_gauss_j_mu_x_2(i)
-  enddo
-
-  print *, ''
-  print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)] x [1 - erf(mu * r12)]'
-  do i = 1, n_max_fit_slat
-    print *, expo_gauss_j_mu_1_erf(i), 4.d0 * coef_gauss_j_mu_1_erf(i)
-  enddo
-
-  return
-end subroutine main
-
-! ---
-
diff --git a/plugins/local/tc_scf/print_tcscf_energy.irp.f b/plugins/local/tc_scf/print_tcscf_energy.irp.f
deleted file mode 100644
index 6f9afd9a..00000000
--- a/plugins/local/tc_scf/print_tcscf_energy.irp.f
+++ /dev/null
@@ -1,55 +0,0 @@
-program print_tcscf_energy
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  print *, 'Hello world'
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  call main()
-
-end
-
-! ---
-
-subroutine main()
-
-  implicit none
-  double precision :: etc_tot, etc_1e, etc_2e, etc_3e
-
-  PROVIDE j2e_type mu_erf
-  PROVIDE j1e_type j1e_coef j1e_expo
-  PROVIDE env_type env_coef env_expo
-
-  print*, ' j2e_type = ', j2e_type
-  print*, ' j1e_type = ', j1e_type
-  print*, ' env_type = ', env_type
-
-  print*, ' mu_erf   = ', mu_erf
-
-  etc_tot = TC_HF_energy
-  etc_1e  = TC_HF_one_e_energy
-  etc_2e  = TC_HF_two_e_energy
-  etc_3e  = 0.d0
-  if(three_body_h_tc) then
-    !etc_3e = diag_three_elem_hf
-    etc_3e = tcscf_energy_3e_naive
-  endif
-
-  print *, " E_TC = ", etc_tot
-  print *, " E_1e = ", etc_1e
-  print *, " E_2e = ", etc_2e
-  print *, " E_3e = ", etc_3e
-
-  return
-end subroutine main
-
-! ---
-
diff --git a/plugins/local/tc_scf/rh_tcscf_diis.irp.f b/plugins/local/tc_scf/rh_tcscf_diis.irp.f
index 853c4ab5..1cade02a 100644
--- a/plugins/local/tc_scf/rh_tcscf_diis.irp.f
+++ b/plugins/local/tc_scf/rh_tcscf_diis.irp.f
@@ -61,7 +61,7 @@ subroutine rh_tcscf_diis()
   etc_tot = TC_HF_energy
   etc_1e  = TC_HF_one_e_energy
   etc_2e  = TC_HF_two_e_energy
-  etc_3e  = diag_three_elem_hf
+  etc_3e  = TC_HF_three_e_energy
   !tc_grad = grad_non_hermit
   er_DIIS = maxval(abs(FQS_SQF_mo))
   e_delta = dabs(etc_tot - e_save)
@@ -189,7 +189,7 @@ subroutine rh_tcscf_diis()
     etc_tot = TC_HF_energy
     etc_1e  = TC_HF_one_e_energy
     etc_2e  = TC_HF_two_e_energy
-    etc_3e  = diag_three_elem_hf
+    etc_3e  = TC_HF_three_e_energy
     !tc_grad  = grad_non_hermit
     er_DIIS  = maxval(abs(FQS_SQF_mo))
     e_delta  = dabs(etc_tot - e_save)
diff --git a/plugins/local/tc_scf/rh_tcscf_simple.irp.f b/plugins/local/tc_scf/rh_tcscf_simple.irp.f
deleted file mode 100644
index 2c2cf2c2..00000000
--- a/plugins/local/tc_scf/rh_tcscf_simple.irp.f
+++ /dev/null
@@ -1,129 +0,0 @@
-! ---
-
-subroutine rh_tcscf_simple()
-
-  implicit none
-  integer                       :: i, j, it, dim_DIIS
-  double precision              :: t0, t1
-  double precision              :: e_save, e_delta, rho_delta
-  double precision              :: etc_tot, etc_1e, etc_2e, etc_3e, tc_grad
-  double precision              :: er_DIIS
-  double precision, allocatable :: rho_old(:,:), rho_new(:,:)
-
-  allocate(rho_old(ao_num,ao_num), rho_new(ao_num,ao_num))
-
-  it       = 0
-  e_save   = 0.d0
-  dim_DIIS = 0
-
-  ! ---
-
-  if(.not. bi_ortho) then
-   print *, ' grad_hermit = ', grad_hermit
-   call save_good_hermit_tc_eigvectors
-   TOUCH mo_coef 
-   call save_mos
-  endif
-
-  ! ---
-
-  if(bi_ortho) then
-
-    PROVIDE level_shift_tcscf
-    PROVIDE mo_l_coef mo_r_coef
-
-    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
-      '====', '================', '================', '================', '================', '================' &
-            , '================', '================', '================', '====', '========'
-
-    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
-      ' it ', '  SCF TC Energy ', '      E(1e)     ', '      E(2e)     ', '      E(3e)     ', '   energy diff  ' &
-            , '    gradient    ', '    DIIS error  ', '  level shift   ', 'DIIS', '  WT (m)'
-
-    write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)')              &
-      '====', '================', '================', '================', '================', '================' &
-            , '================', '================', '================', '====', '========'
-
-
-    ! first iteration (HF orbitals)
-    call wall_time(t0)
-
-    etc_tot = TC_HF_energy
-    etc_1e  = TC_HF_one_e_energy
-    etc_2e  = TC_HF_two_e_energy
-    etc_3e  = 0.d0
-    if(three_body_h_tc) then
-      etc_3e = diag_three_elem_hf
-    endif
-    tc_grad = grad_non_hermit
-    er_DIIS = maxval(abs(FQS_SQF_mo))
-    e_delta = dabs(etc_tot - e_save)
-    e_save  = etc_tot
-
-    call wall_time(t1)
-    write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
-      it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
-
-    do while(tc_grad .gt. dsqrt(thresh_tcscf))
-      call wall_time(t0)
-
-      it += 1
-      if(it > n_it_tcscf_max) then
-        print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max
-        stop
-      endif
-
-      mo_l_coef = fock_tc_leigvec_ao
-      mo_r_coef = fock_tc_reigvec_ao
-      call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-      call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-      TOUCH mo_l_coef mo_r_coef
-
-      etc_tot = TC_HF_energy
-      etc_1e  = TC_HF_one_e_energy
-      etc_2e  = TC_HF_two_e_energy
-      etc_3e  = 0.d0
-      if(three_body_h_tc) then
-        etc_3e = diag_three_elem_hf
-      endif
-      tc_grad = grad_non_hermit
-      er_DIIS = maxval(abs(FQS_SQF_mo))
-      e_delta = dabs(etc_tot - e_save)
-      e_save  = etc_tot
-
-      call ezfio_set_tc_scf_tcscf_energy(etc_tot)
-
-      call wall_time(t1)
-      write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)')  &
-        it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0
-    enddo
-
-  else
-
-   do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. (it.lt.n_it_tcscf_max) )
-      print*,'grad_hermit = ',grad_hermit
-      it += 1
-      print *, 'iteration = ', it
-      print *, '***'
-      print *, 'TC HF total energy = ', TC_HF_energy
-      print *, 'TC HF 1 e   energy = ', TC_HF_one_e_energy
-      print *, 'TC HF 2 e   energy = ', TC_HF_two_e_energy
-      print *, 'TC HF 3 body       = ', diag_three_elem_hf
-      print *, '***'
-      print *, ''
-      call save_good_hermit_tc_eigvectors
-      TOUCH mo_coef 
-      call save_mos
-    enddo
-
-  endif
-
-  print *, ' TCSCF Simple converged !'
-  !call print_energy_and_mos(good_angles)
-
-  deallocate(rho_old, rho_new)
-
-end
-
-! ---
-
diff --git a/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f b/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f
deleted file mode 100644
index 0f2663e5..00000000
--- a/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f
+++ /dev/null
@@ -1,369 +0,0 @@
-
-! ---
-
-program rotate_tcscf_orbitals
-
-  BEGIN_DOC
-  ! TODO : Rotate the bi-orthonormal orbitals in order to minimize left-right angles when degenerate
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  bi_ortho = .True.
-  touch bi_ortho
-
-  call minimize_tc_orb_angles()
-  !call maximize_overlap()
-
-end
-
-! ---
-
-subroutine maximize_overlap()
-
-  implicit none
-  integer                       :: i, m, n
-  double precision              :: accu_d, accu_nd
-  double precision, allocatable :: C(:,:), R(:,:), L(:,:), W(:,:), e(:)
-  double precision, allocatable :: S(:,:)
-
-  n = ao_num
-  m = mo_num
-
-  allocate(L(n,m), R(n,m), C(n,m), W(n,n), e(m))
-  L = mo_l_coef
-  R = mo_r_coef
-  C = mo_coef
-  W = ao_overlap
-
-  print*, ' fock matrix diag elements'
-  do i = 1, m
-    e(i) = Fock_matrix_tc_mo_tot(i,i)
-    print*, e(i)
-  enddo
-
-  ! ---
-   
-  print *, ' overlap before :'
-  print *, ' '
-
-  allocate(S(m,m)) 
-
-  call LTxSxR(n, m, L, W, R, S)
-  !print*, " L.T x R"
-  !do i = 1, m
-  !  write(*, '(100(F16.10,X))') S(i,i)
-  !enddo
-  call LTxSxR(n, m, L, W, C, S)
-  print*, " L.T x C"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-  call LTxSxR(n, m, C, W, R, S)
-  print*, " C.T x R"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-
-  deallocate(S)
-
-  ! ---
-
-  call rotate_degen_eigvec_to_maximize_overlap(n, m, e, C, W, L, R)
-
-  ! ---
-   
-  print *, ' overlap after :'
-  print *, ' '
-
-  allocate(S(m,m)) 
-
-  call LTxSxR(n, m, L, W, R, S)
-  !print*, " L.T x R"
-  !do i = 1, m
-  !  write(*, '(100(F16.10,X))') S(i,i)
-  !enddo
-  call LTxSxR(n, m, L, W, C, S)
-  print*, " L.T x C"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-  call LTxSxR(n, m, C, W, R, S)
-  print*, " C.T x R"
-  do i = 1, m
-    write(*, '(100(F16.10,X))') S(i,:)
-  enddo
-
-  deallocate(S)
-
-  ! ---
-
-  mo_l_coef = L
-  mo_r_coef = R
-  call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-
-  ! ---
-
-  deallocate(L, R, C, W, e)
-
-end subroutine maximize_overlap
-
-! ---
-
-subroutine rotate_degen_eigvec_to_maximize_overlap(n, m, e0, C0, W0, L0, R0)
-
-  implicit none
-
-  integer,          intent(in)    :: n, m
-  double precision, intent(in)    :: e0(m), W0(n,n), C0(n,m)
-  double precision, intent(inout) :: L0(n,m), R0(n,m)
-
-
-  integer                         :: i, j, k, kk, mm, id1, tot_deg
-  double precision                :: ei, ej, de, de_thr
-  integer,          allocatable   :: deg_num(:)
-  double precision, allocatable   :: L(:,:), R(:,:), C(:,:), Lnew(:,:), Rnew(:,:), tmp(:,:)
-  !double precision, allocatable   :: S(:,:), Snew(:,:), T(:,:), Ttmp(:,:), Stmp(:,:)
-  double precision, allocatable   :: S(:,:), Snew(:,:), T(:,:), Ttmp(:,:), Stmp(:,:)
-  !real*8                          :: S(m,m), Snew(m,m), T(m,m)
-
-  id1 = 700
-  allocate(S(id1,id1), Snew(id1,id1), T(id1,id1))
-
-  ! ---
-
-  allocate( deg_num(m) )
-  do i = 1, m
-    deg_num(i) = 1
-  enddo
-
-  de_thr = thr_degen_tc
-
-  do i = 1, m-1
-    ei = e0(i)
-
-    ! already considered in degen vectors
-    if(deg_num(i).eq.0) cycle
-
-    do j = i+1, m
-      ej = e0(j)
-      de = dabs(ei - ej)
-
-      if(de .lt. de_thr) then
-        deg_num(i) = deg_num(i) + 1
-        deg_num(j) = 0
-      endif
-
-    enddo
-  enddo
-
-  tot_deg = 0
-  do i = 1, m
-    if(deg_num(i).gt.1) then
-      print *, ' degen on', i, deg_num(i)
-      tot_deg = tot_deg + 1
-    endif
-  enddo
-
-  if(tot_deg .eq. 0) then
-    print *, ' no degen'
-    return
-  endif
-
-  ! ---
-
-  do i = 1, m
-    mm = deg_num(i)
-
-    if(mm .gt. 1) then
-
-      allocate(L(n,mm), R(n,mm), C(n,mm))
-      do j = 1, mm
-        L(1:n,j) = L0(1:n,i+j-1)
-        R(1:n,j) = R0(1:n,i+j-1)
-        C(1:n,j) = C0(1:n,i+j-1)
-      enddo
-
-      ! ---
-
-      ! C.T x W0 x R
-      allocate(tmp(mm,n), Stmp(mm,mm))
-      call dgemm( 'T', 'N', mm, n, n, 1.d0       &
-                , C, size(C, 1), W0, size(W0, 1) &
-                , 0.d0, tmp, size(tmp, 1) )
-      call dgemm( 'N', 'N', mm, mm, n, 1.d0        &
-                , tmp, size(tmp, 1), R, size(R, 1) &
-                , 0.d0, Stmp, size(Stmp, 1) )
-      deallocate(C, tmp)
-
-      S = 0.d0
-      do k = 1, mm
-        do kk = 1, mm
-          S(kk,k) = Stmp(kk,k)
-        enddo
-      enddo
-      deallocate(Stmp)
-
-      !print*, " overlap bef"
-      !do k = 1, mm
-      !  write(*, '(100(F16.10,X))') (S(k,kk), kk=1, mm)
-      !enddo
-    
-      T    = 0.d0
-      Snew = 0.d0
-      call maxovl(mm, mm, S, T, Snew)
-
-      !print*, " overlap aft"
-      !do k = 1, mm
-      !  write(*, '(100(F16.10,X))') (Snew(k,kk), kk=1, mm)
-      !enddo
-
-      allocate(Ttmp(mm,mm))
-      Ttmp(1:mm,1:mm) = T(1:mm,1:mm)
-
-      allocate(Lnew(n,mm), Rnew(n,mm))
-      call dgemm( 'N', 'N', n, mm, mm, 1.d0               &
-                , R, size(R, 1), Ttmp(1,1), size(Ttmp, 1) &
-                , 0.d0, Rnew, size(Rnew, 1) )
-      call dgemm( 'N', 'N', n, mm, mm, 1.d0               &
-                , L, size(L, 1), Ttmp(1,1), size(Ttmp, 1) &
-                , 0.d0, Lnew, size(Lnew, 1) )
-
-      deallocate(L, R)
-      deallocate(Ttmp)
-
-      ! ---
-
-      do j = 1, mm
-        L0(1:n,i+j-1) = Lnew(1:n,j)
-        R0(1:n,i+j-1) = Rnew(1:n,j)
-      enddo
-      deallocate(Lnew, Rnew)
-
-    endif
-  enddo
-
-  deallocate(S, Snew, T)
-
-end subroutine rotate_degen_eigvec_to_maximize_overlap
-
-! ---
-
-subroutine fix_right_to_one()
-
-  implicit none
-  integer                       :: i, j, m, n, mm, tot_deg
-  double precision              :: accu_d, accu_nd
-  double precision              :: de_thr, ei, ej, de
-  integer,          allocatable :: deg_num(:)
-  double precision, allocatable :: R0(:,:), L0(:,:), W(:,:), e0(:)
-  double precision, allocatable :: R(:,:), L(:,:), S(:,:), Stmp(:,:), tmp(:,:)
-
-  n = ao_num
-  m = mo_num
-
-  allocate(L0(n,m), R0(n,m), W(n,n), e0(m))
-  L0 = mo_l_coef
-  R0 = mo_r_coef
-  W  = ao_overlap
-
-  print*, ' fock matrix diag elements'
-  do i = 1, m
-    e0(i) = Fock_matrix_tc_mo_tot(i,i)
-    print*, e0(i)
-  enddo
-
-  ! ---
-
-  allocate( deg_num(m) )
-  do i = 1, m
-    deg_num(i) = 1
-  enddo
-
-  de_thr = 1d-6
-
-  do i = 1, m-1
-    ei = e0(i)
-
-    ! already considered in degen vectors
-    if(deg_num(i).eq.0) cycle
-
-    do j = i+1, m
-      ej = e0(j)
-      de = dabs(ei - ej)
-
-      if(de .lt. de_thr) then
-        deg_num(i) = deg_num(i) + 1
-        deg_num(j) = 0
-      endif
-
-    enddo
-  enddo
-
-  deallocate(e0)
-
-  tot_deg = 0
-  do i = 1, m
-    if(deg_num(i).gt.1) then
-      print *, ' degen on', i, deg_num(i)
-      tot_deg = tot_deg + 1
-    endif
-  enddo
-
-  if(tot_deg .eq. 0) then
-    print *, ' no degen'
-    return
-  endif
-
-  ! ---
-
-  do i = 1, m
-    mm = deg_num(i)
-
-    if(mm .gt. 1) then
-
-      allocate(L(n,mm), R(n,mm))
-      do j = 1, mm
-        L(1:n,j) = L0(1:n,i+j-1)
-        R(1:n,j) = R0(1:n,i+j-1)
-      enddo
-
-      ! ---
-
-      call impose_weighted_orthog_svd(n, mm, W, R)
-      call impose_weighted_biorthog_qr(n, mm, thresh_biorthog_diag, thresh_biorthog_nondiag, R, W, L)
-
-      ! ---
-
-      do j = 1, mm
-        L0(1:n,i+j-1) = L(1:n,j)
-        R0(1:n,i+j-1) = R(1:n,j)
-      enddo
-      deallocate(L, R)
-
-    endif
-  enddo
-
-  call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thresh_biorthog_diag, thresh_biorthog_nondiag, .true.)
-
-  deallocate(W, deg_num)
-
-  mo_l_coef = L0
-  mo_r_coef = R0
-  deallocate(L0, R0)
-
-  call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef)
-  call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef)
-  print *, ' orbitals are rotated '
-
-  return
-end subroutine fix_right_to_one
-
-! ---
diff --git a/plugins/local/tc_scf/tc_petermann_factor.irp.f b/plugins/local/tc_scf/tc_petermann_factor.irp.f
deleted file mode 100644
index 14fff898..00000000
--- a/plugins/local/tc_scf/tc_petermann_factor.irp.f
+++ /dev/null
@@ -1,91 +0,0 @@
-
-! ---
-
-program tc_petermann_factor
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  my_grid_becke  = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  call main()
-
-end
-
-! ---
-
-subroutine main()
-
-  implicit none
-  integer                       :: i, j
-  double precision              :: Pf_diag_av
-  double precision, allocatable :: Sl(:,:), Sr(:,:), Pf(:,:)
-
-  allocate(Sl(mo_num,mo_num), Sr(mo_num,mo_num), Pf(mo_num,mo_num))
-
-
-  call LTxSxR(ao_num, mo_num, mo_l_coef, ao_overlap, mo_r_coef, Sl)
-  !call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0                       &
-  !          , mo_l_coef, size(mo_l_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
-  !          , 0.d0, Sl, size(Sl, 1) )
-
-  print *, ''
-  print *, ' left-right orthog matrix:'
-  do i = 1, mo_num
-    write(*,'(100(F8.4,X))') Sl(:,i)
-  enddo
-
-  call LTxSxR(ao_num, mo_num, mo_l_coef, ao_overlap, mo_l_coef, Sl)
-  !call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0                       &
-  !          , mo_l_coef, size(mo_l_coef, 1), mo_l_coef, size(mo_l_coef, 1) &
-  !          , 0.d0, Sl, size(Sl, 1) )
-
-  print *, ''
-  print *, ' left-orthog matrix:'
-  do i = 1, mo_num
-    write(*,'(100(F8.4,X))') Sl(:,i)
-  enddo
-
-  call LTxSxR(ao_num, mo_num, mo_r_coef, ao_overlap, mo_r_coef, Sr)
-!  call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0                       &
-!            , mo_r_coef, size(mo_r_coef, 1), mo_r_coef, size(mo_r_coef, 1) &
-!            , 0.d0, Sr, size(Sr, 1) )
-
-  print *, ''
-  print *, ' right-orthog matrix:'
-  do i = 1, mo_num
-    write(*,'(100(F8.4,X))') Sr(:,i)
-  enddo
-
-  print *, ''
-  print *, ' Petermann matrix:'
-  do i = 1, mo_num
-    do j = 1, mo_num
-      Pf(j,i) = Sl(j,i) * Sr(j,i)
-    enddo
-    write(*,'(100(F8.4,X))') Pf(:,i)
-  enddo
-
-  Pf_diag_av = 0.d0
-  do i = 1, mo_num
-    Pf_diag_av = Pf_diag_av + Pf(i,i)
-  enddo
-  Pf_diag_av = Pf_diag_av / dble(mo_num)
-
-  print *, ''
-  print *, ' mean of the diagonal Petermann factor = ', Pf_diag_av
-
-  deallocate(Sl, Sr, Pf)
-
-  return
-end subroutine
-
-! ---
-
diff --git a/plugins/local/tc_scf/tc_scf.irp.f b/plugins/local/tc_scf/tc_scf.irp.f
index ee8e8dad..f099b90e 100644
--- a/plugins/local/tc_scf/tc_scf.irp.f
+++ b/plugins/local/tc_scf/tc_scf.irp.f
@@ -10,13 +10,10 @@ program tc_scf
   integer :: i
   logical :: good_angles
 
-  PROVIDE j1e_type
-  PROVIDE j2e_type
-  PROVIDE tcscf_algorithm
-
   print *, ' TC-SCF with:'
-  print *, ' j1e_type = ', j1e_type
   print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
 
   write(json_unit,json_array_open_fmt) 'tc-scf'
 
@@ -29,7 +26,6 @@ program tc_scf
   call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
   call write_int(6, my_n_pt_a_grid, 'angular external grid over')
 
-
   if(tc_integ_type .eq. "numeric") then
     my_extra_grid_becke  = .True.
     PROVIDE tc_grid2_a tc_grid2_r
@@ -41,17 +37,7 @@ program tc_scf
     call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
   endif
 
-  !call create_guess()
-  !call orthonormalize_mos()
-
-  if(tcscf_algorithm == 'DIIS') then
-    call rh_tcscf_diis()
-  elseif(tcscf_algorithm == 'Simple') then
-    call rh_tcscf_simple()
-  else
-    print *, ' not implemented yet', tcscf_algorithm
-    stop
-  endif
+  call rh_tcscf_diis()
 
   PROVIDE Fock_matrix_tc_diag_mo_tot
   print*, ' Eigenvalues:' 
@@ -59,14 +45,11 @@ program tc_scf
     print*, i, Fock_matrix_tc_diag_mo_tot(i)
   enddo
 
-  ! TODO 
-  ! rotate angles in separate code only if necessary
-  if(minimize_lr_angles)then
+  if(minimize_lr_angles) then
    call minimize_tc_orb_angles()
   endif
   call print_energy_and_mos(good_angles)
 
-
   write(json_unit,json_array_close_fmtx)
   call json_close
 
diff --git a/plugins/local/tc_scf/tc_scf_dm.irp.f b/plugins/local/tc_scf/tc_scf_dm.irp.f
index bf31a4a1..5d25fce2 100644
--- a/plugins/local/tc_scf/tc_scf_dm.irp.f
+++ b/plugins/local/tc_scf/tc_scf_dm.irp.f
@@ -10,16 +10,8 @@ BEGIN_PROVIDER [double precision, TCSCF_density_matrix_ao_beta, (ao_num, ao_num)
 
   implicit none
 
-  if(bi_ortho) then
-
-    PROVIDE mo_l_coef mo_r_coef
-    TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta
-
-  else
-
-    TCSCF_density_matrix_ao_beta = SCF_density_matrix_ao_beta
-
-  endif
+  PROVIDE mo_l_coef mo_r_coef
+  TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta
 
 END_PROVIDER 
 
@@ -35,16 +27,8 @@ BEGIN_PROVIDER [double precision, TCSCF_density_matrix_ao_alpha, (ao_num, ao_num
 
   implicit none
 
-  if(bi_ortho) then
-
-    PROVIDE mo_l_coef mo_r_coef
-    TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha
-
-  else
-
-    TCSCF_density_matrix_ao_alpha = SCF_density_matrix_ao_alpha
-
-  endif
+  PROVIDE mo_l_coef mo_r_coef
+  TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha
 
 END_PROVIDER 
 
diff --git a/plugins/local/tc_scf/tc_scf_energy.irp.f b/plugins/local/tc_scf/tc_scf_energy.irp.f
index 0266c605..74ab9d05 100644
--- a/plugins/local/tc_scf/tc_scf_energy.irp.f
+++ b/plugins/local/tc_scf/tc_scf_energy.irp.f
@@ -1,7 +1,8 @@
 
- BEGIN_PROVIDER [ double precision, TC_HF_energy      ]
-&BEGIN_PROVIDER [ double precision, TC_HF_one_e_energy]
-&BEGIN_PROVIDER [ double precision, TC_HF_two_e_energy]
+ BEGIN_PROVIDER [double precision, TC_HF_energy        ]
+&BEGIN_PROVIDER [double precision, TC_HF_one_e_energy  ]
+&BEGIN_PROVIDER [double precision, TC_HF_two_e_energy  ]
+&BEGIN_PROVIDER [double precision, TC_HF_three_e_energy]
 
   BEGIN_DOC
   ! TC Hartree-Fock energy containing the nuclear repulsion, and its one- and two-body components.
@@ -27,8 +28,13 @@
     enddo
   enddo
 
-  TC_HF_energy += TC_HF_one_e_energy + TC_HF_two_e_energy
-  TC_HF_energy += diag_three_elem_hf
+  if((three_body_h_tc .eq. .False.) .and. (.not. noL_standard)) then
+    TC_HF_three_e_energy = 0.d0
+  else
+    TC_HF_three_e_energy = noL_0e
+  endif
+
+  TC_HF_energy += TC_HF_one_e_energy + TC_HF_two_e_energy + TC_HF_three_e_energy
 
 END_PROVIDER
 
diff --git a/plugins/local/tc_scf/tcscf_energy_naive.irp.f b/plugins/local/tc_scf/tcscf_energy_naive.irp.f
deleted file mode 100644
index 82bb8799..00000000
--- a/plugins/local/tc_scf/tcscf_energy_naive.irp.f
+++ /dev/null
@@ -1,80 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [double precision, tcscf_energy_3e_naive]
-
-  implicit none
-  integer          :: i, j, k
-  integer          :: neu, ned, D(elec_num)
-  integer          :: ii, jj, kk
-  integer          :: si, sj, sk
-  double precision :: I_ijk, I_jki, I_kij, I_jik, I_ikj, I_kji
-  double precision :: I_tot
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  neu = elec_alpha_num
-  ned = elec_beta_num
-  if (neu > 0) D(1:neu) = [(2*i-1, i = 1, neu)]
-  if (ned > 0) D(neu+1:neu+ned) = [(2*i, i = 1, ned)]
-
-  !print*, "D = "
-  !do i = 1, elec_num
-  !  ii = (D(i) - 1) / 2 + 1
-  !  si = mod(D(i), 2)
-  !  print*, i, D(i), ii, si
-  !enddo
-
-  tcscf_energy_3e_naive = 0.d0
-
-  do i = 1, elec_num - 2
-    ii = (D(i) - 1) / 2 + 1
-    si = mod(D(i), 2)
-
-    do j = i + 1, elec_num - 1
-      jj = (D(j) - 1) / 2 + 1
-      sj = mod(D(j), 2)
-
-      do k = j + 1, elec_num
-        kk = (D(k) - 1) / 2 + 1
-        sk = mod(D(k), 2)
-
-        call give_integrals_3_body_bi_ort(ii, jj, kk, ii, jj, kk, I_ijk)
-        I_tot = I_ijk
-
-        if(sj==si .and. sk==sj) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, jj, kk, ii, I_jki)
-          I_tot += I_jki
-        endif
-
-        if(sk==si .and. si==sj) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, kk, ii, jj, I_kij)
-          I_tot += I_kij
-        endif
-
-        if(sj==si) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, jj, ii, kk, I_jik)
-          I_tot -= I_jik
-        endif
-
-        if(sk==sj) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, ii, kk, jj, I_ikj)
-          I_tot -= I_ikj
-        endif
-
-        if(sk==si) then
-          call give_integrals_3_body_bi_ort(ii, jj, kk, kk, jj, ii, I_kji)
-          I_tot -= I_kji
-        endif
-
-        tcscf_energy_3e_naive += I_tot
-      enddo
-    enddo
-  enddo
-
-  tcscf_energy_3e_naive = -tcscf_energy_3e_naive
-
-END_PROVIDER
-
-! ---
-
diff --git a/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f b/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f
deleted file mode 100644
index 0c9ebbd7..00000000
--- a/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f
+++ /dev/null
@@ -1,189 +0,0 @@
-
-subroutine contrib_3e_diag_sss(i, j, k, integral)
-
-  BEGIN_DOC
-  ! returns the pure same spin contribution to diagonal matrix element of 3e term
-  END_DOC
-
-  implicit none
-  integer,          intent(in)  :: i, j, k
-  double precision, intent(out) :: integral
-  double precision              :: direct_int, exch_13_int, exch_23_int, exch_12_int, c_3_int, c_minus_3_int
-
-  call give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int )!!! < i k j | i k j >
-  call give_integrals_3_body_bi_ort(i, k, j, j, i, k, c_3_int)      ! < i k j | j i k >
-  call give_integrals_3_body_bi_ort(i, k, j, k, j, i, c_minus_3_int)! < i k j | k j i >
-  integral = direct_int + c_3_int + c_minus_3_int 
-
-  ! negative terms :: exchange contrib
-  call give_integrals_3_body_bi_ort(i, k, j, j, k, i, exch_13_int)!!! < i k j | j k i > : E_13 
-  call give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)!!! < i k j | i j k > : E_23
-  call give_integrals_3_body_bi_ort(i, k, j, k, i, j, exch_12_int)!!! < i k j | k i j > : E_12
-
-  integral += - exch_13_int - exch_23_int  - exch_12_int 
-  integral = -integral
-
-end
-
-! ---
-
-subroutine contrib_3e_diag_soo(i,j,k,integral)
- implicit none
- integer, intent(in) :: i,j,k
- BEGIN_DOC
- ! returns the pure same spin contribution to diagonal matrix element of 3e term
- END_DOC
- double precision, intent(out) :: integral
- double precision :: direct_int, exch_23_int
- call  give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int) ! < i k j | i k j >
- call  give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)! < i k j | i j k > : E_23
- integral = direct_int - exch_23_int 
- integral = -integral
-end
-
-
-subroutine give_aaa_contrib_bis(integral_aaa)
- implicit none
- double precision, intent(out) :: integral_aaa
- double precision :: integral
- integer :: i,j,k
- integral_aaa = 0.d0
- do i = 1, elec_alpha_num
-  do j = i+1, elec_alpha_num
-   do k = j+1, elec_alpha_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_aaa += integral
-   enddo
-  enddo
- enddo
-
-end
-
-! ---
-
-subroutine give_aaa_contrib(integral_aaa)
-
-  implicit none
-  integer                       :: i, j, k
-  double precision              :: integral
-  double precision, intent(out) :: integral_aaa
-
-  integral_aaa = 0.d0
-  do i = 1, elec_alpha_num
-    do j = 1, elec_alpha_num
-      do k = 1, elec_alpha_num
-        call contrib_3e_diag_sss(i, j, k, integral)
-        integral_aaa += integral
-      enddo
-    enddo
-  enddo
-  integral_aaa *= 1.d0/6.d0 
-
-  return
-end
-
-! ---
-
-subroutine give_aab_contrib(integral_aab)
- implicit none
- double precision, intent(out) :: integral_aab
- double precision :: integral
- integer :: i,j,k
- integral_aab = 0.d0
- do i = 1, elec_beta_num
-  do j = 1, elec_alpha_num
-   do k = 1, elec_alpha_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_aab += integral
-   enddo
-  enddo
- enddo
- integral_aab *= 0.5d0
-end
-
-
-subroutine give_aab_contrib_bis(integral_aab)
- implicit none
- double precision, intent(out) :: integral_aab
- double precision :: integral
- integer :: i,j,k
- integral_aab = 0.d0
- do i = 1, elec_beta_num
-  do j = 1, elec_alpha_num
-   do k = j+1, elec_alpha_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_aab += integral
-   enddo
-  enddo
- enddo
-end
-
-
-subroutine give_abb_contrib(integral_abb)
- implicit none
- double precision, intent(out) :: integral_abb
- double precision :: integral
- integer :: i,j,k
- integral_abb = 0.d0
- do i = 1, elec_alpha_num
-  do j = 1, elec_beta_num
-   do k = 1, elec_beta_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_abb += integral
-   enddo
-  enddo
- enddo
- integral_abb *= 0.5d0
-end
-
-subroutine give_abb_contrib_bis(integral_abb)
- implicit none
- double precision, intent(out) :: integral_abb
- double precision :: integral
- integer :: i,j,k
- integral_abb = 0.d0
- do i = 1, elec_alpha_num
-  do j = 1, elec_beta_num
-   do k = j+1, elec_beta_num
-    call contrib_3e_diag_soo(i,j,k,integral)
-    integral_abb += integral
-   enddo
-  enddo
- enddo
-end
-
-subroutine give_bbb_contrib_bis(integral_bbb)
- implicit none
- double precision, intent(out) :: integral_bbb
- double precision :: integral
- integer :: i,j,k
- integral_bbb = 0.d0
- do i = 1, elec_beta_num
-  do j = i+1, elec_beta_num
-   do k = j+1, elec_beta_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_bbb += integral
-   enddo
-  enddo
- enddo
-
-end
-
-subroutine give_bbb_contrib(integral_bbb)
- implicit none
- double precision, intent(out) :: integral_bbb
- double precision :: integral
- integer :: i,j,k
- integral_bbb = 0.d0
- do i = 1, elec_beta_num
-  do j = 1, elec_beta_num
-   do k = 1, elec_beta_num
-    call contrib_3e_diag_sss(i,j,k,integral)
-    integral_bbb += integral
-   enddo
-  enddo
- enddo
- integral_bbb *= 1.d0/6.d0 
-end
-
-
diff --git a/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
index 7ce57578..ec5167d1 100644
--- a/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
+++ b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f
@@ -4,11 +4,9 @@ program write_ao_2e_tc_integ
 
   implicit none
 
-  PROVIDE j1e_type
-  PROVIDE j2e_type
-
-  print *, ' j1e_type = ', j1e_type
   print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
 
   my_grid_becke  = .True.
   PROVIDE tc_grid1_a tc_grid1_r

From 23acd603d01118e0f2ce59fb14568a64d9994335 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 1 May 2024 23:17:36 +0200
Subject: [PATCH 023/131] removed diag_three_elem_hf

---
 plugins/local/tc_scf/tc_scf_energy.irp.f | 423 -----------------------
 1 file changed, 423 deletions(-)

diff --git a/plugins/local/tc_scf/tc_scf_energy.irp.f b/plugins/local/tc_scf/tc_scf_energy.irp.f
index 14d618ae..74ab9d05 100644
--- a/plugins/local/tc_scf/tc_scf_energy.irp.f
+++ b/plugins/local/tc_scf/tc_scf_energy.irp.f
@@ -40,426 +40,3 @@ END_PROVIDER
 
 ! ---
 
-BEGIN_PROVIDER [double precision, diag_three_elem_hf]
-
-  BEGIN_DOC
-  ! 
-  ! < Phi_left | L | Phi_right >
-  !
-  !
-  ! if three_body_h_tc == false and noL_standard == true ==> do a normal ordering
-  !
-  ! todo
-  ! this should be equivalent to
-  !     three_body_h_tc == true and noL_standard == false
-  !
-  ! if three_body_h_tc == false and noL_standard == false ==> this is equal to 0
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: i, j, k, ipoint, mm
-  double precision              :: contrib, weight, four_third, one_third, two_third, exchange_int_231
-  double precision              :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb
-  double precision, allocatable :: tmp(:)
-  double precision, allocatable :: tmp_L(:,:), tmp_R(:,:)
-  double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
-  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
-
-  PROVIDE mo_l_coef mo_r_coef
-
-  if(.not. three_body_h_tc) then
-
-   if(noL_standard) then
-      PROVIDE noL_0e
-      diag_three_elem_hf = noL_0e
-    else
-      diag_three_elem_hf = 0.d0
-    endif
-
-  else
-
-    PROVIDE int2_grad1_u12_bimo_t
-    PROVIDE mos_l_in_r_array_transp
-    PROVIDE mos_r_in_r_array_transp
-
-    if(elec_alpha_num .eq. elec_beta_num) then
-  
-      allocate(tmp(elec_beta_num))
-      allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
-  
-      !$OMP PARALLEL                                                 &
-      !$OMP DEFAULT(NONE)                                            &
-      !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                      &
-      !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-  
-      !$OMP DO
-      do j = 1, elec_beta_num
-  
-        tmp_L = 0.d0
-        tmp_R = 0.d0
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-  
-            tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-          enddo
-        enddo
-  
-        tmp(j) = 0.d0
-        do ipoint = 1, n_points_final_grid
-          tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-        enddo
-      enddo ! j
-      !$OMP END DO
-      !$OMP END PARALLEL
-  
-      diag_three_elem_hf = -2.d0 * sum(tmp)
-  
-      deallocate(tmp)
-      deallocate(tmp_L, tmp_R)
-  
-      ! ---
-  
-      allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
-      tmp_O = 0.d0
-      tmp_J = 0.d0
-  
-      !$OMP PARALLEL                                                  &
-      !$OMP DEFAULT(NONE)                                             &
-      !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                &
-      !$OMP SHARED(elec_beta_num, n_points_final_grid,                & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,  &
-      !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
-  
-      allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
-      tmp_O_priv = 0.d0
-      tmp_J_priv = 0.d0
-    
-      !$OMP DO 
-      do i = 1, elec_beta_num
-        do ipoint = 1, n_points_final_grid
-          tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-          tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
-          tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
-          tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP CRITICAL
-      tmp_O = tmp_O + tmp_O_priv
-      tmp_J = tmp_J + tmp_J_priv
-      !$OMP END CRITICAL
-  
-      deallocate(tmp_O_priv, tmp_J_priv)
-      !$OMP END PARALLEL
-  
-      allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
-      tmp_M = 0.d0
-      tmp_S = 0.d0
-  
-      !$OMP PARALLEL                                                 &
-      !$OMP DEFAULT(NONE)                                            &
-      !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)            &
-      !$OMP SHARED(elec_beta_num, n_points_final_grid,               & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp, &
-      !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
-  
-      allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
-      tmp_M_priv = 0.d0
-      tmp_S_priv = 0.d0
-    
-      !$OMP DO COLLAPSE(2)
-      do i = 1, elec_beta_num
-        do j = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-  
-            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                        + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                        + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-          enddo
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP CRITICAL
-      tmp_M = tmp_M + tmp_M_priv
-      tmp_S = tmp_S + tmp_S_priv
-      !$OMP END CRITICAL
-  
-      deallocate(tmp_M_priv, tmp_S_priv)
-      !$OMP END PARALLEL
-  
-      allocate(tmp(n_points_final_grid))
-  
-      do ipoint = 1, n_points_final_grid
-  
-        tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
-  
-        tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                         - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                  + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                  + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
-      enddo
-  
-      diag_three_elem_hf = diag_three_elem_hf -2.d0 * (sum(tmp))
-  
-      deallocate(tmp)
-  
-    else ! elec_alpha_num .neq. elec_beta_num
-  
-      allocate(tmp(elec_alpha_num))
-      allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3))
-  
-      !$OMP PARALLEL                                                   &
-      !$OMP DEFAULT(NONE)                                              &
-      !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
-      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-      !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-  
-      !$OMP DO
-      do j = 1, elec_beta_num
-  
-        tmp_L = 0.d0
-        tmp_R = 0.d0
-        do i = elec_beta_num+1, elec_alpha_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-  
-            tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-          enddo
-        enddo
-  
-        tmp(j) = 0.d0
-        do ipoint = 1, n_points_final_grid
-          tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-        enddo
-  
-        do i = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-  
-            tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-          enddo
-        enddo
-  
-        do ipoint = 1, n_points_final_grid
-          tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-        enddo
-      enddo ! j
-      !$OMP END DO
-      !$OMP END PARALLEL
-  
-      ! ---
-  
-      !$OMP PARALLEL                                                   &
-      !$OMP DEFAULT(NONE)                                              &
-      !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R)                        &
-      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-      !$OMP        int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector)
-  
-      !$OMP DO
-      do j = elec_beta_num+1, elec_alpha_num
-  
-        tmp_L = 0.d0
-        tmp_R = 0.d0
-        do i = 1, elec_alpha_num
-          do ipoint = 1, n_points_final_grid
-            tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i)
-            tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i)
-  
-            tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i)
-          enddo
-        enddo
-  
-        tmp(j) = 0.d0
-        do ipoint = 1, n_points_final_grid
-          tmp(j) = tmp(j) + 0.5d0 * final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3))
-        enddo
-      enddo ! j
-      !$OMP END DO
-      !$OMP END PARALLEL
-  
-      diag_three_elem_hf = -2.d0 * sum(tmp)
-  
-      deallocate(tmp)
-      deallocate(tmp_L, tmp_R)
-  
-      ! ---
-  
-      allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3))
-      tmp_O = 0.d0
-      tmp_J = 0.d0
-  
-      !$OMP PARALLEL                                                   &
-      !$OMP DEFAULT(NONE)                                              &
-      !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv)                 &
-      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-      !$OMP        int2_grad1_u12_bimo_t, tmp_O, tmp_J)
-  
-      allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3))
-      tmp_O_priv = 0.d0
-      tmp_J_priv = 0.d0
-    
-      !$OMP DO 
-      do i = 1, elec_beta_num
-        do ipoint = 1, n_points_final_grid
-          tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-          tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i)
-          tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i)
-          tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i)
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP DO 
-      do i = elec_beta_num+1, elec_alpha_num
-        do ipoint = 1, n_points_final_grid
-          tmp_O_priv(ipoint)   = tmp_O_priv(ipoint)   + 0.5d0 * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
-          tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,i)
-          tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,i)
-          tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,i)
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP CRITICAL
-      tmp_O = tmp_O + tmp_O_priv
-      tmp_J = tmp_J + tmp_J_priv
-      !$OMP END CRITICAL
-  
-      deallocate(tmp_O_priv, tmp_J_priv)
-      !$OMP END PARALLEL
-  
-      ! ---
-  
-      allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid))
-      tmp_M = 0.d0
-      tmp_S = 0.d0
-  
-      !$OMP PARALLEL                                                   &
-      !$OMP DEFAULT(NONE)                                              &
-      !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv)              &
-      !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & 
-      !$OMP        mos_l_in_r_array_transp, mos_r_in_r_array_transp,   &
-      !$OMP        int2_grad1_u12_bimo_t, tmp_M, tmp_S)
-  
-      allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid))
-      tmp_M_priv = 0.d0
-      tmp_S_priv = 0.d0
-    
-      !$OMP DO COLLAPSE(2)
-      do i = 1, elec_beta_num
-        do j = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-  
-            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                        + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                        + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-          enddo
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP DO COLLAPSE(2)
-      do i = elec_beta_num+1, elec_alpha_num
-        do j = 1, elec_beta_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-  
-            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i)
-  
-            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                        + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                        + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-          enddo
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP DO COLLAPSE(2)
-      do i = elec_beta_num+1, elec_alpha_num
-        do j = elec_beta_num+1, elec_alpha_num
-          do ipoint = 1, n_points_final_grid
-  
-            tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-            tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j)
-  
-            tmp_S_priv(ipoint)   = tmp_S_priv(ipoint)   + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) &
-                                                        + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) &
-                                                        + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i)
-          enddo
-        enddo
-      enddo
-      !$OMP END DO NOWAIT
-  
-      !$OMP CRITICAL
-      tmp_M = tmp_M + tmp_M_priv
-      tmp_S = tmp_S + tmp_S_priv
-      !$OMP END CRITICAL
-  
-      deallocate(tmp_M_priv, tmp_S_priv)
-      !$OMP END PARALLEL
-  
-      allocate(tmp(n_points_final_grid))
-  
-      do ipoint = 1, n_points_final_grid
-  
-        tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint)
-  
-        tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint)              &
-                                                         - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) &
-                                                                  + tmp_J(ipoint,2) * tmp_M(ipoint,2) &
-                                                                  + tmp_J(ipoint,3) * tmp_M(ipoint,3)))
-      enddo
-  
-      diag_three_elem_hf = diag_three_elem_hf - 2.d0 * (sum(tmp))
-  
-      deallocate(tmp)
-  
-    endif ! alpha/beta condition
-
-  endif ! three_body_h_tc
-
-END_PROVIDER 
-
-! ---
-

From bd8d45b99b7505e00533bd9e97ad1b43453fb037 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Thu, 2 May 2024 17:18:45 +0200
Subject: [PATCH 024/131] FIXED BUG IN OPTIM J_BH

---
 plugins/local/bi_ort_ints/no_dressing.irp.f   |   8 -
 .../non_h_ints_mu/jast_deriv_utils_vect.irp.f | 183 ++++++++----------
 plugins/local/slater_tc/tc_hmat.irp.f         |   1 +
 .../local/tc_bi_ortho/print_tc_energy.irp.f   |  27 ++-
 plugins/local/tc_scf/tc_scf.irp.f             |  31 ++-
 5 files changed, 117 insertions(+), 133 deletions(-)

diff --git a/plugins/local/bi_ort_ints/no_dressing.irp.f b/plugins/local/bi_ort_ints/no_dressing.irp.f
index 721ac0f8..fd2c6285 100644
--- a/plugins/local/bi_ort_ints/no_dressing.irp.f
+++ b/plugins/local/bi_ort_ints/no_dressing.irp.f
@@ -336,9 +336,6 @@ BEGIN_PROVIDER [double precision, noL_0e]
   double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
   double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)
 
-  call wall_time(t0)
-  print*, " Providing noL_0e ..."
-
   if(elec_alpha_num .eq. elec_beta_num) then
 
     allocate(tmp(elec_beta_num))
@@ -713,11 +710,6 @@ BEGIN_PROVIDER [double precision, noL_0e]
 
   endif
 
-  call wall_time(t1)
-  print*, " Wall time for noL_0e (min) = ", (t1 - t0)/60.d0
-
-  print*, " noL_0e = ", noL_0e
-
 END_PROVIDER
 
 ! ---
diff --git a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
index 33563102..db06e835 100644
--- a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
@@ -4,7 +4,7 @@
 subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res)
 
   BEGIN_DOC
-  !
+  ! 
   ! grad_1 u(r1,r2)
   !
   ! we use grid for r1 and extra_grid for r2
@@ -167,7 +167,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
   integer                       :: jpoint
   integer                       :: i_nucl, p, mpA, npA, opA
   double precision              :: r2(3)
-  double precision              :: dx, dy, dz, r12, tmp, r12_inv
+  double precision              :: dx, dy, dz, r12, tmp
   double precision              :: mu_val, mu_tmp, mu_der(3)
   double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
   double precision              :: tmp1, tmp2
@@ -181,7 +181,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
     !  d/dy1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (y1 - y2)
     !  d/dz1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (z1 - z2)
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
 
       r2(1) = final_grid_points_extra(1,jpoint)
       r2(2) = final_grid_points_extra(2,jpoint)
@@ -191,19 +191,15 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
       dy = r1(2) - r2(2)
       dz = r1(3) - r2(3)
 
-      r12 = dx * dx + dy * dy + dz * dz
-
-      if(r12 .lt. 1d-20) then
-        gradx(jpoint) = 0.d0
-        grady(jpoint) = 0.d0
-        gradz(jpoint) = 0.d0
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+      if(r12 .lt. 1d-10) then
+        gradx(jpoint) = 0.d0 
+        grady(jpoint) = 0.d0 
+        gradz(jpoint) = 0.d0 
         cycle
       endif
 
-      r12_inv = 1.d0/dsqrt(r12)
-      r12 = r12*r12_inv
-
-      tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) * r12_inv
+      tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12
 
       gradx(jpoint) = tmp * dx
       grady(jpoint) = tmp * dy
@@ -212,10 +208,10 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
 
   elseif(j2e_type .eq. "Mur") then
 
-    ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 ) d/dx1 mu(r1,r2)
+    ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 ) d/dx1 mu(r1,r2) 
     !                        + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2)
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
 
       r2(1) = final_grid_points_extra(1,jpoint)
       r2(2) = final_grid_points_extra(2,jpoint)
@@ -224,29 +220,23 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
       dx  = r1(1) - r2(1)
       dy  = r1(2) - r2(2)
       dz  = r1(3) - r2(3)
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
 
-      r12 = dx * dx + dy * dy + dz * dz
+      call mu_r_val_and_grad(r1, r2, mu_val, mu_der)
+      mu_tmp  = mu_val * r12
+      tmp     = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val)
+      gradx(jpoint) = tmp * mu_der(1)
+      grady(jpoint) = tmp * mu_der(2)
+      gradz(jpoint) = tmp * mu_der(3)
 
-      if(r12 .lt. 1d-20) then
+      if(r12 .lt. 1d-10) then
         gradx(jpoint) = 0.d0
         grady(jpoint) = 0.d0
         gradz(jpoint) = 0.d0
         cycle
       endif
 
-      r12_inv = 1.d0/dsqrt(r12)
-      r12 = r12*r12_inv
-
-      call mu_r_val_and_grad(r1, r2, mu_val, mu_der)
-
-      mu_tmp  = mu_val * r12
-      tmp     = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val)
-
-      gradx(jpoint) = tmp * mu_der(1)
-      grady(jpoint) = tmp * mu_der(2)
-      gradz(jpoint) = tmp * mu_der(3)
-
-      tmp = 0.5d0 * (1.d0 - derf(mu_tmp)) * r12_inv
+      tmp = 0.5d0 * (1.d0 - derf(mu_tmp)) / r12
 
       gradx(jpoint) = gradx(jpoint) + tmp * dx
       grady(jpoint) = grady(jpoint) + tmp * dy
@@ -264,7 +254,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
 
     PROVIDE a_boys
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
 
       r2(1) = final_grid_points_extra(1,jpoint)
       r2(2) = final_grid_points_extra(2,jpoint)
@@ -273,17 +263,14 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
       dx  = r1(1) - r2(1)
       dy  = r1(2) - r2(2)
       dz  = r1(3) - r2(3)
-      r12 = dx * dx + dy * dy + dz * dz
-
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
       if(r12 .lt. 1d-10) then
-        gradx(jpoint) = 0.d0
-        grady(jpoint) = 0.d0
-        gradz(jpoint) = 0.d0
+        gradx(jpoint) = 0.d0 
+        grady(jpoint) = 0.d0 
+        gradz(jpoint) = 0.d0 
         cycle
       endif
 
-      r12 = dsqrt(r12)
-
       tmp = 1.d0 + a_boys * r12
       tmp = 0.5d0 / (r12 * tmp * tmp)
 
@@ -294,13 +281,16 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
 
   elseif(j2e_type .eq. "Boys_Handy") then
 
-    integer :: powmax
-    powmax = max(maxval(jBH_m),maxval(jBH_n))
-
+    integer                       :: powmax1, powmax, powmax2
     double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
-    allocate (f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
 
-    do p=0,powmax
+    powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+    powmax2 = maxval(jBH_o)
+    powmax  = max(powmax1, powmax2)
+
+    allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+    do p = 0, powmax
       double_p(p) = dble(p)
     enddo
 
@@ -318,11 +308,10 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
       r2(2) = final_grid_points_extra(2,jpoint)
       r2(3) = final_grid_points_extra(3,jpoint)
 
-      gradx(jpoint) = 0.d0
-      grady(jpoint) = 0.d0
-      gradz(jpoint) = 0.d0
-
-      do i_nucl = 1, nucl_num
+      gradx(jpoint) = 0.d0 
+      grady(jpoint) = 0.d0 
+      gradz(jpoint) = 0.d0 
+      do i_nucl = 1, nucl_num 
 
         rn(1) = nucl_coord(i_nucl,1)
         rn(2) = nucl_coord(i_nucl,2)
@@ -332,23 +321,15 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
         call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
         call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
 
-
         ! Compute powers of f1A and f2A
-
-        do p = 1, maxval(jBH_m(:,i_nucl))
+        do p = 1, powmax1
           f1A_power(p) = f1A_power(p-1) * f1A
-        enddo
-
-        do p = 1, maxval(jBH_n(:,i_nucl))
           f2A_power(p) = f2A_power(p-1) * f2A
         enddo
-
-        do p = 1, maxval(jBH_o(:,i_nucl))
+        do p = 1, powmax2
           g12_power(p) = g12_power(p-1) * g12
         enddo
 
-
-
         do p = 1, jBH_size
           mpA = jBH_m(p,i_nucl)
           npA = jBH_n(p,i_nucl)
@@ -358,27 +339,22 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
             tmp = tmp * 0.5d0
           endif
 
-!TODO : Powers to optimize here
-
-!          tmp1 = 0.d0
-!          if(mpA .gt. 0) then
-!            tmp1 = tmp1 + dble(mpA) * f1A**(mpA-1) * f2A**npA
-!          endif
-!          if(npA .gt. 0) then
-!            tmp1 = tmp1 + dble(npA) * f1A**(npA-1) * f2A**mpA
-!          endif
-!          tmp1 = tmp1 * g12**(opA)
-!
-!          tmp2 = 0.d0
-!          if(opA .gt. 0) then
-!            tmp2 = tmp2 + dble(opA) * g12**(opA-1) * (f1A**(mpA) * f2A**(npA) + f1A**(npA) * f2A**(mpA))
-!          endif
-
           tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
           tmp1 = tmp1 * g12_power(opA)
-
           tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA))
 
+          !tmp1 = 0.d0
+          !if(mpA .gt. 0) then
+          !  tmp1 = tmp1 + dble(mpA) * f1A**dble(mpA-1) * f2A**dble(npA)
+          !endif
+          !if(npA .gt. 0) then
+          !  tmp1 = tmp1 + dble(npA) * f1A**dble(npA-1) * f2A**dble(mpA)
+          !endif
+          !tmp1 = tmp1 * g12**dble(opA)
+          !tmp2 = 0.d0
+          !if(opA .gt. 0) then
+          !  tmp2 = tmp2 + dble(opA) * g12**dble(opA-1) * (f1A**dble(mpA) * f2A**dble(npA) + f1A**dble(npA) * f2A**dble(mpA))
+          !endif
 
           gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1))
           grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2))
@@ -418,10 +394,10 @@ subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz)
 
   integer                       :: jpoint
   double precision              :: r2(3)
-  double precision              :: dx, dy, dz, r12, r12_inv, tmp
+  double precision              :: dx, dy, dz, r12, tmp
 
 
-  do jpoint = 1, n_points_extra_final_grid ! r2
+  do jpoint = 1, n_points_extra_final_grid ! r2 
 
     r2(1) = final_grid_points_extra(1,jpoint)
     r2(2) = final_grid_points_extra(2,jpoint)
@@ -431,19 +407,15 @@ subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz)
     dy = r1(2) - r2(2)
     dz = r1(3) - r2(3)
 
-    r12 = dx * dx + dy * dy + dz * dz
-
-    if(r12 .lt. 1d-20) then
-      gradx(jpoint) = 0.d0
-      grady(jpoint) = 0.d0
-      gradz(jpoint) = 0.d0
+    r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+    if(r12 .lt. 1d-10) then
+      gradx(jpoint) = 0.d0 
+      grady(jpoint) = 0.d0 
+      gradz(jpoint) = 0.d0 
       cycle
     endif
 
-    r12_inv = 1.d0 / dsqrt(r12)
-    r12 = r12 * r12_inv
-
-    tmp = 0.5d0 * (1.d0 - derf(mu * r12)) * r12_inv
+    tmp = 0.5d0 * (1.d0 - derf(mu * r12)) / r12
 
     gradx(jpoint) = tmp * dx
     grady(jpoint) = tmp * dy
@@ -467,7 +439,7 @@ subroutine j12_r1_seq(r1, n_grid2, res)
   integer                       :: jpoint
   double precision              :: r2(3)
   double precision              :: dx, dy, dz
-  double precision              :: mu_tmp, r12, mu_erf_inv
+  double precision              :: mu_tmp, r12
 
   PROVIDE final_grid_points_extra
 
@@ -475,21 +447,20 @@ subroutine j12_r1_seq(r1, n_grid2, res)
 
     PROVIDE mu_erf
 
-    mu_erf_inv = 1.d0 / mu_erf
-    do jpoint = 1, n_points_extra_final_grid ! r2
-
+    do jpoint = 1, n_points_extra_final_grid ! r2 
+  
       r2(1) = final_grid_points_extra(1,jpoint)
       r2(2) = final_grid_points_extra(2,jpoint)
       r2(3) = final_grid_points_extra(3,jpoint)
-
+  
       dx  = r1(1) - r2(1)
       dy  = r1(2) - r2(2)
       dz  = r1(3) - r2(3)
       r12 = dsqrt(dx * dx + dy * dy + dz * dz)
 
       mu_tmp = mu_erf * r12
-
-      res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) * mu_erf_inv
+  
+      res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_erf
     enddo
 
   elseif(j2e_type .eq. "Boys") then
@@ -498,7 +469,7 @@ subroutine j12_r1_seq(r1, n_grid2, res)
 
     PROVIDE a_boys
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
 
       r2(1) = final_grid_points_extra(1,jpoint)
       r2(2) = final_grid_points_extra(2,jpoint)
@@ -540,19 +511,19 @@ subroutine jmu_r1_seq(mu, r1, n_grid2, res)
 
   tmp1 = inv_sq_pi_2 / mu
 
-  do jpoint = 1, n_points_extra_final_grid ! r2
-
+  do jpoint = 1, n_points_extra_final_grid ! r2 
+  
     r2(1) = final_grid_points_extra(1,jpoint)
     r2(2) = final_grid_points_extra(2,jpoint)
     r2(3) = final_grid_points_extra(3,jpoint)
-
+  
     dx  = r1(1) - r2(1)
     dy  = r1(2) - r2(2)
     dz  = r1(3) - r2(3)
     r12 = dsqrt(dx * dx + dy * dy + dz * dz)
 
     tmp2 = mu * r12
-
+  
     res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(tmp2)) - tmp1 * dexp(-tmp2*tmp2)
   enddo
 
@@ -579,7 +550,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)
 
     res = 1.d0
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
       r(1) = final_grid_points_extra(1,jpoint)
       r(2) = final_grid_points_extra(2,jpoint)
       r(3) = final_grid_points_extra(3,jpoint)
@@ -598,7 +569,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)
 
     res = 1.d0
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
       r(1) = final_grid_points_extra(1,jpoint)
       r(2) = final_grid_points_extra(2,jpoint)
       r(3) = final_grid_points_extra(3,jpoint)
@@ -618,7 +589,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)
 
     res = 1.d0
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
       r(1) = final_grid_points_extra(1,jpoint)
       r(2) = final_grid_points_extra(2,jpoint)
       r(3) = final_grid_points_extra(3,jpoint)
@@ -636,7 +607,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)
 
     res = 1.d0
 
-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
       r(1) = final_grid_points_extra(1,jpoint)
       r(2) = final_grid_points_extra(2,jpoint)
       r(3) = final_grid_points_extra(3,jpoint)
@@ -666,7 +637,7 @@ end
 subroutine get_grad1_u12_2e_r1_seq(ipoint, n_grid2, resx, resy, resz)
 
   BEGIN_DOC
-  !
+  ! 
   ! grad_1 u_2e(r1,r2)
   !
   ! we use grid for r1 and extra_grid for r2
@@ -786,7 +757,7 @@ end
 subroutine get_u12_2e_r1_seq(ipoint, n_grid2, res)
 
   BEGIN_DOC
-  !
+  ! 
   ! u_2e(r1,r2)
   !
   ! we use grid for r1 and extra_grid for r2
@@ -909,7 +880,7 @@ subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
   endif
 
   return
-end
+end 
 
 ! ---
 
diff --git a/plugins/local/slater_tc/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f
index abec410d..cc780364 100644
--- a/plugins/local/slater_tc/tc_hmat.irp.f
+++ b/plugins/local/slater_tc/tc_hmat.irp.f
@@ -22,6 +22,7 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]
 
   if(noL_standard) then
     PROVIDE noL_0e
+    print*, "noL_0e =", noL_0e
     PROVIDE noL_1e
     PROVIDE noL_2e
   endif
diff --git a/plugins/local/tc_bi_ortho/print_tc_energy.irp.f b/plugins/local/tc_bi_ortho/print_tc_energy.irp.f
index 1fa0c6d9..979d792b 100644
--- a/plugins/local/tc_bi_ortho/print_tc_energy.irp.f
+++ b/plugins/local/tc_bi_ortho/print_tc_energy.irp.f
@@ -9,15 +9,6 @@ program print_tc_energy
   read_wf = .True.
   touch read_wf
 
-  PROVIDE j2e_type
-  PROVIDE j1e_type
-  PROVIDE env_type
-
-  print *, ' j2e_type = ', j2e_type
-  print *, ' j1e_type = ', j1e_type
-  print *, ' env_type = ', env_type
-
-
   my_grid_becke  = .True.
   PROVIDE tc_grid1_a tc_grid1_r
   my_n_pt_r_grid = tc_grid1_r
@@ -38,6 +29,24 @@ program print_tc_energy
     call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
   endif
 
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+
+  PROVIDE j2e_type
+  PROVIDE j1e_type
+  PROVIDE env_type
+
+  print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
+
   call write_tc_energy()
 
 end
diff --git a/plugins/local/tc_scf/tc_scf.irp.f b/plugins/local/tc_scf/tc_scf.irp.f
index f099b90e..83da03ec 100644
--- a/plugins/local/tc_scf/tc_scf.irp.f
+++ b/plugins/local/tc_scf/tc_scf.irp.f
@@ -7,15 +7,6 @@ program tc_scf
   END_DOC
 
   implicit none
-  integer :: i
-  logical :: good_angles
-
-  print *, ' TC-SCF with:'
-  print *, ' j2e_type = ', j2e_type
-  print *, ' j1e_type = ', j1e_type
-  print *, ' env_type = ', env_type
-
-  write(json_unit,json_array_open_fmt) 'tc-scf'
 
   my_grid_becke  = .True.
   PROVIDE tc_grid1_a tc_grid1_r
@@ -37,6 +28,26 @@ program tc_scf
     call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
   endif
 
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+
+  integer :: i
+  logical :: good_angles
+
+  print *, ' TC-SCF with:'
+  print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
+
+  write(json_unit,json_array_open_fmt) 'tc-scf'
+
   call rh_tcscf_diis()
 
   PROVIDE Fock_matrix_tc_diag_mo_tot
@@ -84,7 +95,7 @@ subroutine create_guess()
     SOFT_TOUCH mo_label
   endif
 
-end subroutine create_guess
+end
 
 ! ---
 

From 13785b267c36319925ffa72ebe42399fa932ffae Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 3 May 2024 11:34:30 +0200
Subject: [PATCH 025/131] fixed a bug in src/scf_utils/roothaan_hall_scf.irp.f

---
 .../extra_grid_vector.irp.f                   | 20 +++++++++----------
 .../grid_becke_vector.irp.f                   | 20 +++++++++----------
 src/scf_utils/roothaan_hall_scf.irp.f         |  2 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/becke_numerical_grid/extra_grid_vector.irp.f b/src/becke_numerical_grid/extra_grid_vector.irp.f
index 16a52dc6..44fc4435 100644
--- a/src/becke_numerical_grid/extra_grid_vector.irp.f
+++ b/src/becke_numerical_grid/extra_grid_vector.irp.f
@@ -71,16 +71,16 @@ END_PROVIDER
         index_final_points_extra(3,i_count) = j
         index_final_points_extra_reverse(k,i,j) = i_count
 
-        if(final_weight_at_r_vector_extra(i_count) .lt. 0.d0) then
-          print *, ' !!! WARNING !!!'
-          print *, ' negative weight !!!!'
-          print *, i_count, final_weight_at_r_vector_extra(i_count)
-          if(dabs(final_weight_at_r_vector_extra(i_count)) .lt. 1d-10) then
-            final_weight_at_r_vector_extra(i_count) = 0.d0
-          else
-            stop
-          endif
-        endif
+!        if(final_weight_at_r_vector_extra(i_count) .lt. 0.d0) then
+!          print *, ' !!! WARNING !!!'
+!          print *, ' negative weight !!!!'
+!          print *, i_count, final_weight_at_r_vector_extra(i_count)
+!          if(dabs(final_weight_at_r_vector_extra(i_count)) .lt. 1d-10) then
+!            final_weight_at_r_vector_extra(i_count) = 0.d0
+!          else
+!            stop
+!          endif
+!        endif
       enddo
     enddo
   enddo
diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f
index c35918c3..7097dbb3 100644
--- a/src/becke_numerical_grid/grid_becke_vector.irp.f
+++ b/src/becke_numerical_grid/grid_becke_vector.irp.f
@@ -68,16 +68,16 @@ END_PROVIDER
         index_final_points(3,i_count) = j
         index_final_points_reverse(k,i,j) = i_count
 
-        if(final_weight_at_r_vector(i_count) .lt. 0.d0) then
-          print *, ' !!! WARNING !!!'
-          print *, ' negative weight !!!!'
-          print *, i_count, final_weight_at_r_vector(i_count)
-          if(dabs(final_weight_at_r_vector(i_count)) .lt. 1d-10) then
-            final_weight_at_r_vector(i_count) = 0.d0
-          else
-            stop
-          endif
-        endif 
+!        if(final_weight_at_r_vector(i_count) .lt. 0.d0) then
+!          print *, ' !!! WARNING !!!'
+!          print *, ' negative weight !!!!'
+!          print *, i_count, final_weight_at_r_vector(i_count)
+!          if(dabs(final_weight_at_r_vector(i_count)) .lt. 1d-10) then
+!            final_weight_at_r_vector(i_count) = 0.d0
+!          else
+!            stop
+!          endif
+!        endif 
       enddo
     enddo
   enddo
diff --git a/src/scf_utils/roothaan_hall_scf.irp.f b/src/scf_utils/roothaan_hall_scf.irp.f
index 3f5c8549..e0fe5319 100644
--- a/src/scf_utils/roothaan_hall_scf.irp.f
+++ b/src/scf_utils/roothaan_hall_scf.irp.f
@@ -217,7 +217,7 @@ END_DOC
   do while (i<mo_num)
     j=i
     m=1
-    do while ( (j<mo_num).and.(fock_matrix_diag_mo(j+1)-fock_matrix_diag_mo(i) < 1.d-8) )
+    do while ( (j+1<mo_num).and.(fock_matrix_diag_mo(j+1)-fock_matrix_diag_mo(i) < 1.d-8) )
       j += 1
       m += 1
     enddo

From 7335e29784ec4dd76e1e1b738dc18a2627ceb958 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 6 May 2024 17:47:48 +0200
Subject: [PATCH 026/131] updated get_fci_tc_conv.sh

---
 scripts/get_fci_tc_conv.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/get_fci_tc_conv.sh b/scripts/get_fci_tc_conv.sh
index 643f3ac0..f0c99baf 100755
--- a/scripts/get_fci_tc_conv.sh
+++ b/scripts/get_fci_tc_conv.sh
@@ -1,2 +1,2 @@
 file=$1
-grep "Ndet,E,E+PT2,E+RPT2,|PT2|=" $file | cut -d "=" -f 2 > ${file}.conv_fci_tc
+grep "Ndet,E,E+PT2,pt2_minus,pt2_plus,pt2_abs=" $file | cut -d "=" -f 2 > ${file}.conv_fci_tc

From b14325fef482bdf6cb471b40edf8fa46f2aeac65 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 6 May 2024 18:21:58 +0200
Subject: [PATCH 027/131] Introducing qmckl

---
 plugins/local/non_h_ints_mu/NEED          |   1 +
 plugins/local/non_h_ints_mu/deb_aos.irp.f |  49 ++++++++--
 plugins/local/non_h_ints_mu/qmckl.irp.f   | 104 ++++++++++++++++++++++
 3 files changed, 148 insertions(+), 6 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/NEED b/plugins/local/non_h_ints_mu/NEED
index 48c1c24b..5ca1d543 100644
--- a/plugins/local/non_h_ints_mu/NEED
+++ b/plugins/local/non_h_ints_mu/NEED
@@ -3,3 +3,4 @@ hamiltonian
 jastrow
 ao_tc_eff_map
 bi_ortho_mos
+trexio
diff --git a/plugins/local/non_h_ints_mu/deb_aos.irp.f b/plugins/local/non_h_ints_mu/deb_aos.irp.f
index c9bc9c9a..86d011fb 100644
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@@ -34,21 +34,58 @@ subroutine print_aos()
 
   PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
 
-  do ipoint = 1, n_points_final_grid
-    r(:) = final_grid_points(:,ipoint)
-    print*, r
-  enddo
+!  do ipoint = 1, n_points_final_grid
+!    r(:) = final_grid_points(:,ipoint)
+!    print*, r
+!  enddo
+double precision :: accu_vgl(5)
+double precision :: accu_vgl_nrm(5)
 
   do ipoint = 1, n_points_final_grid
-    r(:) = final_grid_points(:,ipoint)
     do i = 1, ao_num
       ao_val    = aos_in_r_array     (i,ipoint)
       ao_der(:) = aos_grad_in_r_array(i,ipoint,:)
       ao_lap    = aos_lapl_in_r_array(1,i,ipoint) + aos_lapl_in_r_array(2,i,ipoint) + aos_lapl_in_r_array(3,i,ipoint)
-      write(*, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
+      write(111, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
     enddo
   enddo
  
+  do ipoint = 1, n_points_final_grid
+    do i = 1, ao_num
+      ao_val    = aos_in_r_array_qmckl     (i,ipoint)
+      ao_der(:) = aos_grad_in_r_array_qmckl(i,ipoint,:)
+      ao_lap    = aos_lapl_in_r_array_qmckl(i,ipoint)
+      write(222, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
+    enddo
+  enddo
+ 
+  accu_vgl = 0.d0
+  accu_vgl_nrm = 0.d0
+  do ipoint = 1, n_points_final_grid
+    do i = 1, ao_num
+      ao_val    = aos_in_r_array     (i,ipoint)
+      ao_der(:) = aos_grad_in_r_array(i,ipoint,:)
+      ao_lap    = aos_lapl_in_r_array(1,i,ipoint) + aos_lapl_in_r_array(2,i,ipoint) + aos_lapl_in_r_array(3,i,ipoint)
+      accu_vgl_nrm(1) += dabs(ao_val)
+      accu_vgl_nrm(2) += dabs(ao_der(1))
+      accu_vgl_nrm(3) += dabs(ao_der(2))
+      accu_vgl_nrm(4) += dabs(ao_der(3))
+      accu_vgl_nrm(5) += dabs(ao_lap)
+
+      ao_val    -= aos_in_r_array_qmckl     (i,ipoint)
+      ao_der(:) -= aos_grad_in_r_array_qmckl(i,ipoint,:)
+      ao_lap    -= aos_lapl_in_r_array_qmckl(i,ipoint)
+      accu_vgl(1) += dabs(ao_val)
+      accu_vgl(2) += dabs(ao_der(1))
+      accu_vgl(3) += dabs(ao_der(2))
+      accu_vgl(4) += dabs(ao_der(3))
+      accu_vgl(5) += dabs(ao_lap)
+    enddo
+
+  enddo
+  accu_vgl(:) *= 1.d0 / accu_vgl_nrm(:)
+  print *, accu_vgl
+
   return
 end
 
diff --git a/plugins/local/non_h_ints_mu/qmckl.irp.f b/plugins/local/non_h_ints_mu/qmckl.irp.f
index 1df80457..4d419e24 100644
--- a/plugins/local/non_h_ints_mu/qmckl.irp.f
+++ b/plugins/local/non_h_ints_mu/qmckl.irp.f
@@ -75,3 +75,107 @@ BEGIN_PROVIDER [ integer*8, qmckl_ctx_jastrow ]
   endif
 
 END_PROVIDER
+
+
+ BEGIN_PROVIDER [ double precision, aos_in_r_array_qmckl, (ao_num,n_points_final_grid)]
+&BEGIN_PROVIDER [ double precision, aos_grad_in_r_array_qmckl, (ao_num,n_points_final_grid,3)]
+&BEGIN_PROVIDER [ double precision, aos_lapl_in_r_array_qmckl, (ao_num, n_points_final_grid)]
+ implicit none
+ BEGIN_DOC
+ ! AOS computed with qmckl
+ END_DOC
+ use qmckl
+
+ integer*8 :: qmckl_ctx
+ integer(qmckl_exit_code) :: rc
+
+ qmckl_ctx = qmckl_context_create()
+
+ rc = qmckl_trexio_read(qmckl_ctx, trexio_file, 1_8*len(trim(trexio_filename)))
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in read_trexio'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ rc = qmckl_set_point(qmckl_ctx, 'N', n_points_final_grid*1_8, final_grid_points, n_points_final_grid*3_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in set_electron_point'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ double precision, allocatable :: vgl(:,:,:)
+ allocate( vgl(ao_num,5,n_points_final_grid))
+ rc = qmckl_get_ao_basis_ao_vgl_inplace(qmckl_ctx, vgl, n_points_final_grid*ao_num*5_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in get_ao_vgl'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ integer :: i,k
+ do k=1,n_points_final_grid
+   do i=1,ao_num
+     aos_in_r_array_qmckl(i,k) = vgl(i,1,k)
+     aos_grad_in_r_array_qmckl(i,k,1) = vgl(i,2,k)
+     aos_grad_in_r_array_qmckl(i,k,2) = vgl(i,3,k)
+     aos_grad_in_r_array_qmckl(i,k,3) = vgl(i,4,k)
+     aos_lapl_in_r_array_qmckl(i,k) = vgl(i,5,k)
+   enddo
+ enddo
+
+END_PROVIDER
+
+
+ BEGIN_PROVIDER [ double precision, mos_in_r_array_qmckl, (mo_num,n_points_final_grid)]
+&BEGIN_PROVIDER [ double precision, mos_grad_in_r_array_qmckl, (mo_num,n_points_final_grid,3)]
+&BEGIN_PROVIDER [ double precision, mos_lapl_in_r_array_qmckl, (mo_num, n_points_final_grid)]
+ implicit none
+ BEGIN_DOC
+ ! moS computed with qmckl
+ END_DOC
+ use qmckl
+
+ integer*8 :: qmckl_ctx
+ integer(qmckl_exit_code) :: rc
+
+ qmckl_ctx = qmckl_context_create()
+
+ rc = qmckl_trexio_read(qmckl_ctx, trexio_file, 1_8*len(trim(trexio_filename)))
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in read_trexio'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ rc = qmckl_set_point(qmckl_ctx, 'N', n_points_final_grid*1_8, final_grid_points, n_points_final_grid*3_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in set_electron_point'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ double precision, allocatable :: vgl(:,:,:)
+ allocate( vgl(mo_num,5,n_points_final_grid))
+ rc = qmckl_get_mo_basis_mo_vgl_inplace(qmckl_ctx, vgl, n_points_final_grid*mo_num*5_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in get_mo_vgl'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ integer :: i,k
+ do k=1,n_points_final_grid
+   do i=1,mo_num
+     mos_in_r_array_qmckl(i,k) = vgl(i,1,k)
+     mos_grad_in_r_array_qmckl(i,k,1) = vgl(i,2,k)
+     mos_grad_in_r_array_qmckl(i,k,2) = vgl(i,3,k)
+     mos_grad_in_r_array_qmckl(i,k,3) = vgl(i,4,k)
+     mos_lapl_in_r_array_qmckl(i,k) = vgl(i,5,k)
+   enddo
+ enddo
+
+END_PROVIDER
+
+

From 109a956f0d947665af7fbd3ed02d3569c49e592e Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 6 May 2024 18:30:05 +0200
Subject: [PATCH 028/131] does not compile but working on it

---
 plugins/local/slater_tc/h_mat_triple.irp.f    | 391 ------------------
 .../local/slater_tc/slater_tc_opt_diag.irp.f  | 311 +++++++++++++-
 .../slater_tc/symmetrized_3_e_int_prov.irp.f  | 140 -------
 plugins/local/slater_tc_no_opt/.gitignore     |  59 +++
 plugins/local/slater_tc_no_opt/NEED           |   8 +
 plugins/local/slater_tc_no_opt/README.rst     |   4 +
 .../h_biortho.irp.f                           |   0
 .../local/slater_tc_no_opt/h_mat_triple.irp.f | 193 +++++++++
 .../h_tc_bi_ortho_psi.irp.f                   |   0
 .../slater_tc_3e_slow.irp.f                   |   2 +-
 .../slater_tc_no_opt.irp.f}                   |   2 +-
 .../slater_tc_slow.irp.f                      |  73 +---
 src/determinants/slater_rules_general.irp.f   | 192 +++++++++
 13 files changed, 769 insertions(+), 606 deletions(-)
 delete mode 100644 plugins/local/slater_tc/h_mat_triple.irp.f
 delete mode 100644 plugins/local/slater_tc/symmetrized_3_e_int_prov.irp.f
 create mode 100644 plugins/local/slater_tc_no_opt/.gitignore
 create mode 100644 plugins/local/slater_tc_no_opt/NEED
 create mode 100644 plugins/local/slater_tc_no_opt/README.rst
 rename plugins/local/{slater_tc => slater_tc_no_opt}/h_biortho.irp.f (100%)
 create mode 100644 plugins/local/slater_tc_no_opt/h_mat_triple.irp.f
 rename plugins/local/{slater_tc => slater_tc_no_opt}/h_tc_bi_ortho_psi.irp.f (100%)
 rename plugins/local/{slater_tc => slater_tc_no_opt}/slater_tc_3e_slow.irp.f (99%)
 rename plugins/local/{slater_tc/slater_tc.irp.f => slater_tc_no_opt/slater_tc_no_opt.irp.f} (82%)
 rename plugins/local/{slater_tc => slater_tc_no_opt}/slater_tc_slow.irp.f (80%)
 create mode 100644 src/determinants/slater_rules_general.irp.f

diff --git a/plugins/local/slater_tc/h_mat_triple.irp.f b/plugins/local/slater_tc/h_mat_triple.irp.f
deleted file mode 100644
index 6f5697a2..00000000
--- a/plugins/local/slater_tc/h_mat_triple.irp.f
+++ /dev/null
@@ -1,391 +0,0 @@
-subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase)
- use bitmasks
- BEGIN_DOC
-! returns the array, for each spin, of holes/particles between key_i and key_j 
-!
-! with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j>
- END_DOC
-  include 'utils/constants.include.F'
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: holes_array(100,2),particles_array(100,2),degree_array(2)
- double precision, intent(out)  :: phase
- integer :: ispin,k,i,pos
- integer(bit_kind) :: key_hole, key_particle
- integer(bit_kind) :: xorvec(N_int_max,2)
- holes_array = -1
- particles_array = -1
- degree_array = 0
-  do i = 1, N_int
-   xorvec(i,1) = xor( key_i(i,1), key_j(i,1))
-   xorvec(i,2) = xor( key_i(i,2), key_j(i,2))
-   degree_array(1) += popcnt(xorvec(i,1))
-   degree_array(2) += popcnt(xorvec(i,2))
-  enddo
-  degree_array(1) = shiftr(degree_array(1),1)
-  degree_array(2) = shiftr(degree_array(2),1)
-  
- do ispin = 1, 2
-  k = 1
-  !!! GETTING THE HOLES 
-  do i = 1, N_int
-   key_hole = iand(xorvec(i,ispin),key_i(i,ispin))
-   do while(key_hole .ne.0_bit_kind)
-    pos = trailz(key_hole)
-    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_hole = ibclr(key_hole,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_excitation_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
- do ispin = 1, 2
-  k = 1
-  !!! GETTING THE PARTICLES
-  do i = 1, N_int
-   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
-   do while(key_particle .ne.0_bit_kind)
-    pos = trailz(key_particle)
-    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_particle = ibclr(key_particle,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_excitation_general '
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo
-  enddo 
- enddo
- integer :: h,p, i_ok
- integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
- integer                        :: exc(0:2,2,2)
- double precision :: phase_tmp
- allocate(det_i(Nint,2),det_ip(N_int,2))
- det_i = key_i
- phase = 1.d0
- do ispin = 1, 2
-  do i = 1, degree_array(ispin)
-   h = holes_array(i,ispin)
-   p = particles_array(i,ispin)
-   det_ip = det_i
-   call do_single_excitation(det_ip,h,p,ispin,i_ok)
-   if(i_ok == -1)then
-     print*,'excitation was not possible '
-     stop
-   endif
-   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
-   phase *= phase_tmp
-   det_i = det_ip
-  enddo
- enddo
-
-end
-
-subroutine get_holes_general(key_i, key_j,Nint, holes_array)
- use bitmasks
- BEGIN_DOC
-! returns the array, per spin, of holes between key_i and key_j 
-!
-! with the following convention: a_{hole}|key_i> --> |key_j>
- END_DOC
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: holes_array(100,2)
- integer(bit_kind) :: key_hole
- integer :: ispin,k,i,pos
- holes_array = -1
- do ispin = 1, 2
-  k = 1
-  do i = 1, N_int
-   key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin))
-   do while(key_hole .ne.0_bit_kind)
-    pos = trailz(key_hole)
-    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_hole = ibclr(key_hole,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_holes_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
-end
-
-subroutine get_particles_general(key_i, key_j,Nint,particles_array)
- use bitmasks
- BEGIN_DOC
-! returns the array, per spin, of particles between key_i and key_j 
-!
-! with the following convention: a^dagger_{particle}|key_i> --> |key_j>
- END_DOC
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: particles_array(100,2)
- integer(bit_kind) :: key_particle
- integer :: ispin,k,i,pos
- particles_array = -1
- do ispin = 1, 2
-  k = 1
-  do i = 1, N_int
-   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
-   do while(key_particle .ne.0_bit_kind)
-    pos = trailz(key_particle)
-    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_particle = ibclr(key_particle,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_holes_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'Those are the two determinants'
-     call debug_det(key_i, N_int)
-     call debug_det(key_j, N_int)
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
-end
-
-subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase)
- implicit none
- integer, intent(in)            :: degree(2), Nint
- integer(bit_kind), intent(in)  :: key_i(Nint,2)
- integer, intent(in)            :: holes_array(100,2),particles_array(100,2)
- double precision, intent(out)  :: phase
- integer :: i,ispin,h,p, i_ok
- integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
- integer                        :: exc(0:2,2,2)
- double precision :: phase_tmp
- allocate(det_i(Nint,2),det_ip(N_int,2))
- det_i = key_i
- phase = 1.d0
- do ispin = 1, 2
-  do i = 1, degree(ispin)
-   h = holes_array(i,ispin)
-   p = particles_array(i,ispin)
-   det_ip = det_i
-   call do_single_excitation(det_ip,h,p,ispin,i_ok)
-   if(i_ok == -1)then
-     print*,'excitation was not possible '
-     stop
-   endif
-   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
-   phase *= phase_tmp
-   det_i = det_ip
-  enddo
- enddo
-
-end
-
-subroutine H_tc_s2_u_0_with_pure_three(v_0, s_0, u_0, N_st, sze)
-  BEGIN_DOC
-  ! Computes $v_0 = H^TC | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
-  !
-  ! Assumes that the determinants are in psi_det
-  !
-  ! istart, iend, ishift, istep are used in ZMQ parallelization.
-  END_DOC
-
-  use bitmasks
-  implicit none
-
-  integer,          intent(in)  :: N_st,sze
-  double precision, intent(in)  :: u_0(sze,N_st)
-  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
-  call H_tc_s2_u_0_opt(v_0, s_0, u_0, N_st, sze)
-  integer :: i,j,degree,ist
-  double precision :: hmono, htwoe, hthree, htot
-  do i = 1, N_det
-   do j = 1, N_det
-    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
-    if(degree .ne. 3)cycle
-    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,i), psi_det(1,1,j), hmono, htwoe, hthree, htot)
-    do ist = 1, N_st
-     v_0(i,ist) += htot * u_0(j,ist)
-    enddo
-   enddo
-  enddo
-end
-
-subroutine H_tc_s2_u_0_with_pure_three_omp(v_0, s_0, u_0, N_st, sze)
-  BEGIN_DOC
-  ! Computes $v_0 = H^TC | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
-  !
-  ! Assumes that the determinants are in psi_det
-  !
-  ! istart, iend, ishift, istep are used in ZMQ parallelization.
-  END_DOC
-
-  use bitmasks
-  implicit none
-
-  integer,          intent(in)  :: N_st,sze
-  double precision, intent(in)  :: u_0(sze,N_st)
-  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
-  call H_tc_s2_u_0_opt(v_0, s_0, u_0, N_st, sze)
-  integer :: i,j,degree,ist
-  double precision :: hmono, htwoe, hthree, htot
-  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) &
-  !$OMP SHARED(N_st, N_det, N_int, psi_det, u_0, v_0)       &
-  !$OMP PRIVATE(ist, i, j, degree, hmono, htwoe, hthree,htot)
-  do i = 1, N_det
-   do j = 1, N_det
-    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
-    if(degree .ne. 3)cycle
-    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,i), psi_det(1,1,j), hmono, htwoe, hthree, htot)
-    do ist = 1, N_st
-     v_0(i,ist) += htot * u_0(j,ist)
-    enddo
-   enddo
-  enddo
- !$OMP END PARALLEL DO
-end
-
-! ---
-
-subroutine H_tc_s2_dagger_u_0_with_pure_three(v_0, s_0, u_0, N_st, sze)
-  BEGIN_DOC
-  ! Computes $v_0 = (H^TC)^dagger | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
-  !
-  ! Assumes that the determinants are in psi_det
-  !
-  ! istart, iend, ishift, istep are used in ZMQ parallelization.
-  END_DOC
-
-  use bitmasks
-  implicit none
-
-  integer,          intent(in)  :: N_st,sze
-  double precision, intent(in)  :: u_0(sze,N_st)
-  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
-  call H_tc_s2_dagger_u_0_opt(v_0, s_0, u_0, N_st, sze)
-  integer :: i,j,degree,ist
-  double precision :: hmono, htwoe, hthree, htot
-  do i = 1, N_det
-   do j = 1, N_det
-    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
-    if(degree .ne. 3)cycle
-    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,j), psi_det(1,1,i), hmono, htwoe, hthree, htot)
-    do ist = 1, N_st
-     v_0(i,ist) += htot * u_0(j,ist)
-    enddo
-   enddo
-  enddo
-end
-
-subroutine H_tc_s2_dagger_u_0_with_pure_three_omp(v_0, s_0, u_0, N_st, sze)
-  BEGIN_DOC
-  ! Computes $v_0 = (H^TC)^dagger | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
-  !
-  ! Assumes that the determinants are in psi_det
-  !
-  ! istart, iend, ishift, istep are used in ZMQ parallelization.
-  END_DOC
-
-  use bitmasks
-  implicit none
-
-  integer,          intent(in)  :: N_st,sze
-  double precision, intent(in)  :: u_0(sze,N_st)
-  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
-  call H_tc_s2_dagger_u_0_opt(v_0, s_0, u_0, N_st, sze)
-  integer :: i,j,degree,ist
-  double precision :: hmono, htwoe, hthree, htot
-  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) &
-  !$OMP SHARED(N_st, N_det, N_int, psi_det, u_0, v_0)       &
-  !$OMP PRIVATE(ist, i, j, degree, hmono, htwoe, hthree,htot)
-  do i = 1, N_det
-   do j = 1, N_det
-    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
-    if(degree .ne. 3)cycle
-    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,j), psi_det(1,1,i), hmono, htwoe, hthree, htot)
-    do ist = 1, N_st
-     v_0(i,ist) += htot * u_0(j,ist)
-    enddo
-   enddo
-  enddo
- !$OMP END PARALLEL DO
-end
-
-! ---
-subroutine triple_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe, hthree, htot)
-  use bitmasks
-  BEGIN_DOC
-! <key_j | H_tilde | key_i> for triple excitation  
-!!
-!! WARNING !!
-! 
-! Genuine triple excitations of the same spin are not yet implemented
-  END_DOC
-  implicit none
-  integer(bit_kind), intent(in)  :: key_j(N_int,2),key_i(N_int,2)
-  integer, intent(in)            :: Nint
-  double precision, intent(out)  :: hmono, htwoe, hthree, htot
-  integer                        :: degree
-  integer                        :: h1, p1, h2, p2, s1, s2, h3, p3, s3
-  integer                        :: holes_array(100,2),particles_array(100,2),degree_array(2)
-  double precision               :: phase,sym_3_e_int_from_6_idx_tensor
-
-  hmono  = 0.d0
-  htwoe  = 0.d0
-  hthree = 0.d0
-  htot   = 0.d0
-  call get_excitation_general(key_j, key_i, Nint,degree_array,holes_array, particles_array,phase)
-  degree = degree_array(1) + degree_array(2)
-  if(degree .ne. 3)return
-  if(degree_array(1)==3.or.degree_array(2)==3)then
-   if(degree_array(1) == 3)then
-    h1 = holes_array(1,1) 
-    h2 = holes_array(2,1) 
-    h3 = holes_array(3,1) 
-    p1 = particles_array(1,1) 
-    p2 = particles_array(2,1) 
-    p3 = particles_array(3,1) 
-   else
-    h1 = holes_array(1,2) 
-    h2 = holes_array(2,2) 
-    h3 = holes_array(3,2) 
-    p1 = particles_array(1,2) 
-    p2 = particles_array(2,2) 
-    p3 = particles_array(3,2) 
-   endif
-   hthree = sym_3_e_int_from_6_idx_tensor(p3, p2, p1, h3, h2, h1)
-  else 
-   if(degree_array(1) == 2.and.degree_array(2) == 1)then ! double alpha + single beta
-    h1 = holes_array(1,1) 
-    h2 = holes_array(2,1) 
-    h3 = holes_array(1,2) 
-    p1 = particles_array(1,1) 
-    p2 = particles_array(2,1) 
-    p3 = particles_array(1,2) 
-   else if(degree_array(2) == 2 .and. degree_array(1) == 1)then ! double beta + single alpha 
-    h1 = holes_array(1,2) 
-    h2 = holes_array(2,2) 
-    h3 = holes_array(1,1) 
-    p1 = particles_array(1,2) 
-    p2 = particles_array(2,2) 
-    p3 = particles_array(1,1) 
-   else 
-    print*,'PB !!'
-    stop
-   endif
-   hthree = three_body_ints_bi_ort(p3,p2,p1,h3,h2,h1) - three_body_ints_bi_ort(p3,p2,p1,h3,h1,h2)
-  endif
-  hthree  *= phase
-  htot = hthree
- end
-
diff --git a/plugins/local/slater_tc/slater_tc_opt_diag.irp.f b/plugins/local/slater_tc/slater_tc_opt_diag.irp.f
index 78f9dc66..3c5a5d12 100644
--- a/plugins/local/slater_tc/slater_tc_opt_diag.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt_diag.irp.f
@@ -19,13 +19,13 @@
   PROVIDE HF_bitmask
   PROVIDE mo_l_coef mo_r_coef
 
-  call diag_htilde_mu_mat_bi_ortho_slow(N_int, HF_bitmask, hmono, htwoe, htot)
+  call diag_htc_bi_orth_2e_brute(N_int, HF_bitmask, hmono, htwoe, htot)
 
   ref_tc_energy_1e = hmono
   ref_tc_energy_2e = htwoe 
 
   if(three_body_h_tc) then
-    call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree)
+    call diag_htc_bi_orth_3e_brute(N_int, HF_bitmask, hthree)
     ref_tc_energy_3e = hthree
   else
     ref_tc_energy_3e = 0.d0
@@ -524,3 +524,310 @@ end
 
 ! ---
 
+subroutine diag_htc_bi_orth_2e_brute(Nint, key_i, hmono, htwoe, htot)
+
+  BEGIN_DOC
+  !
+  ! diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS 
+  !
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in)  :: Nint
+  integer(bit_kind), intent(in)  :: key_i(Nint,2)
+  double precision, intent(out)  :: hmono,htwoe,htot
+  integer                        :: occ(Nint*bit_kind_size,2)
+  integer                        :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
+  double precision               :: get_mo_two_e_integral_tc_int
+  integer(bit_kind)              :: key_i_core(Nint,2)
+
+  PROVIDE mo_bi_ortho_tc_two_e
+
+  hmono = 0.d0
+  htwoe = 0.d0
+  htot  = 0.d0
+
+  call bitstring_to_list_ab(key_i, occ, Ne, Nint)
+
+  do ispin = 1, 2
+    do i = 1, Ne(ispin)
+      ii = occ(i,ispin)
+      hmono += mo_bi_ortho_tc_one_e(ii,ii)
+    enddo
+  enddo
+
+  ! alpha/beta two-body
+  ispin = 1
+  jspin = 2
+  do i = 1, Ne(ispin) ! electron 1 (so it can be associated to mu(r1))
+    ii = occ(i,ispin) 
+    do j = 1, Ne(jspin) ! electron 2 
+      jj = occ(j,jspin) 
+      htwoe += mo_bi_ortho_tc_two_e(jj,ii,jj,ii) 
+    enddo
+  enddo
+ 
+  ! alpha/alpha two-body
+  do i = 1, Ne(ispin)
+    ii = occ(i,ispin) 
+    do j = i+1, Ne(ispin)
+      jj = occ(j,ispin) 
+      htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii)
+    enddo
+  enddo
+ 
+  ! beta/beta two-body
+  do i = 1, Ne(jspin)
+    ii = occ(i,jspin) 
+    do j = i+1, Ne(jspin)
+      jj = occ(j,jspin) 
+      htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii)
+    enddo
+  enddo
+
+  htot = hmono + htwoe 
+
+end
+
+! ---                                                                                           
+
+subroutine diag_htc_bi_orth_3e_brute(Nint, key_i, hthree)
+
+  BEGIN_DOC
+  !  diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_i(Nint,2)
+  double precision, intent(out) :: hthree
+  integer                       :: occ(Nint*bit_kind_size,2)
+  integer                       :: Ne(2),i,j,ii,jj,ispin,jspin,m,mm
+  integer(bit_kind)             :: key_i_core(Nint,2)
+  double precision              :: direct_int, exchange_int, ref
+  double precision, external    :: sym_3_e_int_from_6_idx_tensor
+  double precision, external    :: three_e_diag_parrallel_spin
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  if(core_tc_op) then
+    do i = 1, Nint
+      key_i_core(i,1) = xor(key_i(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(key_i(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, Nint)
+  else
+    call bitstring_to_list_ab(key_i, occ, Ne, Nint)
+  endif
+
+  hthree = 0.d0
+
+  if((Ne(1)+Ne(2)) .ge. 3) then
+
+    ! alpha/alpha/beta three-body
+    do i = 1, Ne(1)
+      ii = occ(i,1)
+      do j = i+1, Ne(1)
+        jj = occ(j,1)
+        do m = 1, Ne(2)
+          mm = occ(m,2)
+          !direct_int   = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor
+          !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor
+          direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,ii)      !uses 3-idx tensor
+          exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)      !uses 3-idx tensor
+          hthree      += direct_int - exchange_int
+        enddo
+      enddo
+    enddo
+
+    ! beta/beta/alpha three-body
+    do i = 1, Ne(2)
+      ii = occ(i,2)
+      do j = i+1, Ne(2)
+        jj = occ(j,2)
+        do m = 1, Ne(1)
+          mm = occ(m,1)
+          !direct_int   = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor
+          !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor
+          direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,ii)
+          exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)
+          hthree      += direct_int - exchange_int
+        enddo
+      enddo
+    enddo
+
+    ! alpha/alpha/alpha three-body
+    do i = 1, Ne(1)
+      ii = occ(i,1) ! 1
+      do j = i+1, Ne(1)
+        jj = occ(j,1) ! 2
+        do m = j+1, Ne(1)
+          mm = occ(m,1) ! 3
+          !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor
+          hthree += three_e_diag_parrallel_spin(mm,jj,ii)             !uses only 3-idx tensors
+        enddo
+      enddo
+    enddo
+
+    ! beta/beta/beta three-body
+    do i = 1, Ne(2)
+      ii = occ(i,2) ! 1
+      do j = i+1, Ne(2)
+        jj = occ(j,2) ! 2
+        do m = j+1, Ne(2)
+          mm = occ(m,2) ! 3
+          !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor
+          hthree += three_e_diag_parrallel_spin(mm,jj,ii)             !uses only 3-idx tensors
+        enddo
+      enddo
+    enddo
+
+  endif
+
+end
+
+
+
+BEGIN_PROVIDER [ double precision, three_e_diag_parrallel_spin_prov, (mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator ON A BI ORTHONORMAL BASIS 
+  !
+  ! three_e_diag_parrallel_spin_prov(m,j,i) = All combinations of the form <mji|-L|mji> for same spin matrix elements  
+  ! 
+  ! notice the -1 sign: in this way three_e_diag_parrallel_spin_prov can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, m
+  double precision :: integral, wall1, wall0, three_e_diag_parrallel_spin
+
+  three_e_diag_parrallel_spin_prov = 0.d0
+  print *, ' Providing the three_e_diag_parrallel_spin_prov ...'
+
+ integral = three_e_diag_parrallel_spin(1,1,1) ! to provide all stuffs
+  call wall_time(wall0)
+ !$OMP PARALLEL                 &
+ !$OMP DEFAULT (NONE)           &
+ !$OMP PRIVATE (i,j,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_diag_parrallel_spin_prov)
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do m = j, mo_num
+        three_e_diag_parrallel_spin_prov(m,j,i) =  three_e_diag_parrallel_spin(m,j,i)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do m = 1, j
+        three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin_prov(j,m,i)
+      enddo
+    enddo
+  enddo
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_diag_parrallel_spin_prov', wall1 - wall0
+
+END_PROVIDER 
+
+BEGIN_PROVIDER [ double precision, three_e_single_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_single_parrallel_spin_prov(m,j,k,i) = All combination of <mjk|-L|mji> for same spin matrix elements 
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+ implicit none
+ integer          :: i, j, k, m
+ double precision :: integral, wall1, wall0, three_e_single_parrallel_spin
+
+  three_e_single_parrallel_spin_prov = 0.d0
+  print *, ' Providing the three_e_single_parrallel_spin_prov ...'
+
+  integral = three_e_single_parrallel_spin(1,1,1,1)
+  call wall_time(wall0)
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_single_parrallel_spin_prov)
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_single_parrallel_spin_prov(m,j,k,i) = three_e_single_parrallel_spin(m,j,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_single_parrallel_spin_prov', wall1 - wall0
+
+END_PROVIDER 
+
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_double_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_double_parrallel_spin_prov(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0, three_e_double_parrallel_spin
+
+  three_e_double_parrallel_spin_prov = 0.d0
+  print *, ' Providing the three_e_double_parrallel_spin_prov ...'
+  call wall_time(wall0)
+
+ integral = three_e_double_parrallel_spin(1,1,1,1,1)
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP SHARED (mo_num,three_e_double_parrallel_spin_prov)
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            three_e_double_parrallel_spin_prov(m,l,j,k,i) = three_e_double_parrallel_spin(m,l,j,k,i)
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_double_parrallel_spin_prov', wall1 - wall0
+
+END_PROVIDER 
+
diff --git a/plugins/local/slater_tc/symmetrized_3_e_int_prov.irp.f b/plugins/local/slater_tc/symmetrized_3_e_int_prov.irp.f
deleted file mode 100644
index e8277a74..00000000
--- a/plugins/local/slater_tc/symmetrized_3_e_int_prov.irp.f
+++ /dev/null
@@ -1,140 +0,0 @@
-
-BEGIN_PROVIDER [ double precision, three_e_diag_parrallel_spin_prov, (mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator ON A BI ORTHONORMAL BASIS 
-  !
-  ! three_e_diag_parrallel_spin_prov(m,j,i) = All combinations of the form <mji|-L|mji> for same spin matrix elements  
-  ! 
-  ! notice the -1 sign: in this way three_e_diag_parrallel_spin_prov can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, m
-  double precision :: integral, wall1, wall0, three_e_diag_parrallel_spin
-
-  three_e_diag_parrallel_spin_prov = 0.d0
-  print *, ' Providing the three_e_diag_parrallel_spin_prov ...'
-
- integral = three_e_diag_parrallel_spin(1,1,1) ! to provide all stuffs
-  call wall_time(wall0)
- !$OMP PARALLEL                 &
- !$OMP DEFAULT (NONE)           &
- !$OMP PRIVATE (i,j,m,integral) & 
- !$OMP SHARED (mo_num,three_e_diag_parrallel_spin_prov)
- !$OMP DO SCHEDULE (dynamic)
-  do i = 1, mo_num
-    do j = 1, mo_num
-      do m = j, mo_num
-        three_e_diag_parrallel_spin_prov(m,j,i) =  three_e_diag_parrallel_spin(m,j,i)
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-      do m = 1, j
-        three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin_prov(j,m,i)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_diag_parrallel_spin_prov', wall1 - wall0
-
-END_PROVIDER 
-
-BEGIN_PROVIDER [ double precision, three_e_single_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_single_parrallel_spin_prov(m,j,k,i) = All combination of <mjk|-L|mji> for same spin matrix elements 
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  !
-  END_DOC
-
- implicit none
- integer          :: i, j, k, m
- double precision :: integral, wall1, wall0, three_e_single_parrallel_spin
-
-  three_e_single_parrallel_spin_prov = 0.d0
-  print *, ' Providing the three_e_single_parrallel_spin_prov ...'
-
-  integral = three_e_single_parrallel_spin(1,1,1,1)
-  call wall_time(wall0)
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (i,j,k,m,integral) & 
- !$OMP SHARED (mo_num,three_e_single_parrallel_spin_prov)
- !$OMP DO SCHEDULE (dynamic)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do m = 1, mo_num
-          three_e_single_parrallel_spin_prov(m,j,k,i) = three_e_single_parrallel_spin(m,j,k,i)
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_single_parrallel_spin_prov', wall1 - wall0
-
-END_PROVIDER 
-
-
-! ---
-
-BEGIN_PROVIDER [ double precision, three_e_double_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
-  !
-  ! three_e_double_parrallel_spin_prov(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
-  !
-  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
-  END_DOC
-
-  implicit none
-  integer          :: i, j, k, m, l
-  double precision :: integral, wall1, wall0, three_e_double_parrallel_spin
-
-  three_e_double_parrallel_spin_prov = 0.d0
-  print *, ' Providing the three_e_double_parrallel_spin_prov ...'
-  call wall_time(wall0)
-
- integral = three_e_double_parrallel_spin(1,1,1,1,1)
- !$OMP PARALLEL                     &
- !$OMP DEFAULT (NONE)               &
- !$OMP PRIVATE (i,j,k,m,l,integral) & 
- !$OMP SHARED (mo_num,three_e_double_parrallel_spin_prov)
- !$OMP DO SCHEDULE (dynamic)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do j = 1, mo_num
-        do l = 1, mo_num
-          do m = 1, mo_num
-            three_e_double_parrallel_spin_prov(m,l,j,k,i) = three_e_double_parrallel_spin(m,l,j,k,i)
-          enddo
-        enddo
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print *, ' wall time for three_e_double_parrallel_spin_prov', wall1 - wall0
-
-END_PROVIDER 
-
diff --git a/plugins/local/slater_tc_no_opt/.gitignore b/plugins/local/slater_tc_no_opt/.gitignore
new file mode 100644
index 00000000..1561915b
--- /dev/null
+++ b/plugins/local/slater_tc_no_opt/.gitignore
@@ -0,0 +1,59 @@
+IRPF90_temp/
+IRPF90_man/
+build.ninja
+irpf90.make
+ezfio_interface.irp.f
+irpf90_entities
+tags
+Makefile
+ao_basis
+ao_one_e_ints
+ao_two_e_erf_ints
+ao_two_e_ints
+aux_quantities
+becke_numerical_grid
+bitmask
+cis
+cisd
+cipsi
+davidson
+davidson_dressed
+davidson_undressed
+density_for_dft
+determinants
+dft_keywords
+dft_utils_in_r
+dft_utils_one_e
+dft_utils_two_body
+dressing
+dummy
+electrons
+ezfio_files
+fci
+generators_cas
+generators_full
+hartree_fock
+iterations
+kohn_sham
+kohn_sham_rs
+mo_basis
+mo_guess
+mo_one_e_ints
+mo_two_e_erf_ints
+mo_two_e_ints
+mpi
+mrpt_utils
+nuclei
+perturbation
+pseudo
+psiref_cas
+psiref_utils
+scf_utils
+selectors_cassd
+selectors_full
+selectors_utils
+single_ref_method
+slave
+tools
+utils
+zmq
diff --git a/plugins/local/slater_tc_no_opt/NEED b/plugins/local/slater_tc_no_opt/NEED
new file mode 100644
index 00000000..a8669866
--- /dev/null
+++ b/plugins/local/slater_tc_no_opt/NEED
@@ -0,0 +1,8 @@
+determinants
+normal_order_old
+bi_ort_ints
+bi_ortho_mos
+tc_keywords
+non_hermit_dav
+dav_general_mat
+tc_scf
diff --git a/plugins/local/slater_tc_no_opt/README.rst b/plugins/local/slater_tc_no_opt/README.rst
new file mode 100644
index 00000000..90679e4c
--- /dev/null
+++ b/plugins/local/slater_tc_no_opt/README.rst
@@ -0,0 +1,4 @@
+================
+slater_tc_no_opt
+================
+
diff --git a/plugins/local/slater_tc/h_biortho.irp.f b/plugins/local/slater_tc_no_opt/h_biortho.irp.f
similarity index 100%
rename from plugins/local/slater_tc/h_biortho.irp.f
rename to plugins/local/slater_tc_no_opt/h_biortho.irp.f
diff --git a/plugins/local/slater_tc_no_opt/h_mat_triple.irp.f b/plugins/local/slater_tc_no_opt/h_mat_triple.irp.f
new file mode 100644
index 00000000..e2c8f982
--- /dev/null
+++ b/plugins/local/slater_tc_no_opt/h_mat_triple.irp.f
@@ -0,0 +1,193 @@
+subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase)
+ use bitmasks
+ BEGIN_DOC
+! returns the array, for each spin, of holes/particles between key_i and key_j 
+!
+! with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j>
+ END_DOC
+  include 'utils/constants.include.F'
+ implicit none
+ integer, intent(in)            :: Nint
+ integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
+ integer, intent(out)           :: holes_array(100,2),particles_array(100,2),degree_array(2)
+ double precision, intent(out)  :: phase
+ integer :: ispin,k,i,pos
+ integer(bit_kind) :: key_hole, key_particle
+ integer(bit_kind) :: xorvec(N_int_max,2)
+ holes_array = -1
+ particles_array = -1
+ degree_array = 0
+  do i = 1, N_int
+   xorvec(i,1) = xor( key_i(i,1), key_j(i,1))
+   xorvec(i,2) = xor( key_i(i,2), key_j(i,2))
+   degree_array(1) += popcnt(xorvec(i,1))
+   degree_array(2) += popcnt(xorvec(i,2))
+  enddo
+  degree_array(1) = shiftr(degree_array(1),1)
+  degree_array(2) = shiftr(degree_array(2),1)
+  
+ do ispin = 1, 2
+  k = 1
+  !!! GETTING THE HOLES 
+  do i = 1, N_int
+   key_hole = iand(xorvec(i,ispin),key_i(i,ispin))
+   do while(key_hole .ne.0_bit_kind)
+    pos = trailz(key_hole)
+    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_hole = ibclr(key_hole,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_excitation_general'
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'stoping ...'
+     stop
+    endif
+   enddo 
+  enddo
+ enddo
+ do ispin = 1, 2
+  k = 1
+  !!! GETTING THE PARTICLES
+  do i = 1, N_int
+   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
+   do while(key_particle .ne.0_bit_kind)
+    pos = trailz(key_particle)
+    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_particle = ibclr(key_particle,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_excitation_general '
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'stoping ...'
+     stop
+    endif
+   enddo
+  enddo 
+ enddo
+ integer :: h,p, i_ok
+ integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
+ integer                        :: exc(0:2,2,2)
+ double precision :: phase_tmp
+ allocate(det_i(Nint,2),det_ip(N_int,2))
+ det_i = key_i
+ phase = 1.d0
+ do ispin = 1, 2
+  do i = 1, degree_array(ispin)
+   h = holes_array(i,ispin)
+   p = particles_array(i,ispin)
+   det_ip = det_i
+   call do_single_excitation(det_ip,h,p,ispin,i_ok)
+   if(i_ok == -1)then
+     print*,'excitation was not possible '
+     stop
+   endif
+   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
+   phase *= phase_tmp
+   det_i = det_ip
+  enddo
+ enddo
+
+end
+
+subroutine get_holes_general(key_i, key_j,Nint, holes_array)
+ use bitmasks
+ BEGIN_DOC
+! returns the array, per spin, of holes between key_i and key_j 
+!
+! with the following convention: a_{hole}|key_i> --> |key_j>
+ END_DOC
+ implicit none
+ integer, intent(in)            :: Nint
+ integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
+ integer, intent(out)           :: holes_array(100,2)
+ integer(bit_kind) :: key_hole
+ integer :: ispin,k,i,pos
+ holes_array = -1
+ do ispin = 1, 2
+  k = 1
+  do i = 1, N_int
+   key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin))
+   do while(key_hole .ne.0_bit_kind)
+    pos = trailz(key_hole)
+    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_hole = ibclr(key_hole,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_holes_general'
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'stoping ...'
+     stop
+    endif
+   enddo 
+  enddo
+ enddo
+end
+
+subroutine get_particles_general(key_i, key_j,Nint,particles_array)
+ use bitmasks
+ BEGIN_DOC
+! returns the array, per spin, of particles between key_i and key_j 
+!
+! with the following convention: a^dagger_{particle}|key_i> --> |key_j>
+ END_DOC
+ implicit none
+ integer, intent(in)            :: Nint
+ integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
+ integer, intent(out)           :: particles_array(100,2)
+ integer(bit_kind) :: key_particle
+ integer :: ispin,k,i,pos
+ particles_array = -1
+ do ispin = 1, 2
+  k = 1
+  do i = 1, N_int
+   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
+   do while(key_particle .ne.0_bit_kind)
+    pos = trailz(key_particle)
+    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_particle = ibclr(key_particle,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_holes_general'
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'Those are the two determinants'
+     call debug_det(key_i, N_int)
+     call debug_det(key_j, N_int)
+     print*,'stoping ...'
+     stop
+    endif
+   enddo 
+  enddo
+ enddo
+end
+
+subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase)
+ implicit none
+ integer, intent(in)            :: degree(2), Nint
+ integer(bit_kind), intent(in)  :: key_i(Nint,2)
+ integer, intent(in)            :: holes_array(100,2),particles_array(100,2)
+ double precision, intent(out)  :: phase
+ integer :: i,ispin,h,p, i_ok
+ integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
+ integer                        :: exc(0:2,2,2)
+ double precision :: phase_tmp
+ allocate(det_i(Nint,2),det_ip(N_int,2))
+ det_i = key_i
+ phase = 1.d0
+ do ispin = 1, 2
+  do i = 1, degree(ispin)
+   h = holes_array(i,ispin)
+   p = particles_array(i,ispin)
+   det_ip = det_i
+   call do_single_excitation(det_ip,h,p,ispin,i_ok)
+   if(i_ok == -1)then
+     print*,'excitation was not possible '
+     stop
+   endif
+   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
+   phase *= phase_tmp
+   det_i = det_ip
+  enddo
+ enddo
+
+end
+
diff --git a/plugins/local/slater_tc/h_tc_bi_ortho_psi.irp.f b/plugins/local/slater_tc_no_opt/h_tc_bi_ortho_psi.irp.f
similarity index 100%
rename from plugins/local/slater_tc/h_tc_bi_ortho_psi.irp.f
rename to plugins/local/slater_tc_no_opt/h_tc_bi_ortho_psi.irp.f
diff --git a/plugins/local/slater_tc/slater_tc_3e_slow.irp.f b/plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f
similarity index 99%
rename from plugins/local/slater_tc/slater_tc_3e_slow.irp.f
rename to plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f
index cb33d343..f7919653 100644
--- a/plugins/local/slater_tc/slater_tc_3e_slow.irp.f
+++ b/plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f
@@ -1,7 +1,7 @@
 
 ! ---
 
-subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
+subroutine diag_htc_bi_orth_3e_brute(Nint, key_i, hthree)
 
   BEGIN_DOC
   !  diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
diff --git a/plugins/local/slater_tc/slater_tc.irp.f b/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f
similarity index 82%
rename from plugins/local/slater_tc/slater_tc.irp.f
rename to plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f
index 27ab47c5..0fcc587f 100644
--- a/plugins/local/slater_tc/slater_tc.irp.f
+++ b/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f
@@ -1,4 +1,4 @@
-program slater_tc
+program slater_tc_no_opt
   implicit none
   BEGIN_DOC
 ! TODO : Put the documentation of the program here
diff --git a/plugins/local/slater_tc/slater_tc_slow.irp.f b/plugins/local/slater_tc_no_opt/slater_tc_slow.irp.f
similarity index 80%
rename from plugins/local/slater_tc/slater_tc_slow.irp.f
rename to plugins/local/slater_tc_no_opt/slater_tc_slow.irp.f
index caf7d665..b06fd12f 100644
--- a/plugins/local/slater_tc/slater_tc_slow.irp.f
+++ b/plugins/local/slater_tc_no_opt/slater_tc_slow.irp.f
@@ -61,7 +61,7 @@ subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree,
   if(degree.gt.2) return
 
   if(degree == 0) then
-    call diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
+    call diag_htc_bi_orth_2e_brute(Nint, key_i, hmono, htwoe, htot)
   else if (degree == 1) then
     call single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
   else if(degree == 2) then
@@ -76,7 +76,7 @@ subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree,
     else if((degree == 1) .and. (elec_num .gt. 2) .and. three_e_4_idx_term) then
       call single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree)
     else if((degree == 0) .and. (elec_num .gt. 2) .and. three_e_3_idx_term) then
-      call diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
+      call diag_htc_bi_orth_3e_brute(Nint, key_i, hthree)
     endif
   endif
 
@@ -95,75 +95,6 @@ end
 
 ! ---
 
-subroutine diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot)
-
-  BEGIN_DOC
-  !
-  ! diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS 
-  !
-  END_DOC
-
-  use bitmasks
-
-  implicit none
-  integer,           intent(in)  :: Nint
-  integer(bit_kind), intent(in)  :: key_i(Nint,2)
-  double precision, intent(out)  :: hmono,htwoe,htot
-  integer                        :: occ(Nint*bit_kind_size,2)
-  integer                        :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
-  double precision               :: get_mo_two_e_integral_tc_int
-  integer(bit_kind)              :: key_i_core(Nint,2)
-
-  PROVIDE mo_bi_ortho_tc_two_e
-
-  hmono = 0.d0
-  htwoe = 0.d0
-  htot  = 0.d0
-
-  call bitstring_to_list_ab(key_i, occ, Ne, Nint)
-
-  do ispin = 1, 2 
-    do i = 1, Ne(ispin)
-      ii = occ(i,ispin) 
-      hmono += mo_bi_ortho_tc_one_e(ii,ii)
-    enddo
-  enddo
-
-  ! alpha/beta two-body
-  ispin = 1
-  jspin = 2 
-  do i = 1, Ne(ispin) ! electron 1 (so it can be associated to mu(r1))
-    ii = occ(i,ispin) 
-    do j = 1, Ne(jspin) ! electron 2 
-      jj = occ(j,jspin) 
-      htwoe += mo_bi_ortho_tc_two_e(jj,ii,jj,ii) 
-    enddo
-  enddo
- 
-  ! alpha/alpha two-body
-  do i = 1, Ne(ispin)
-    ii = occ(i,ispin) 
-    do j = i+1, Ne(ispin)
-      jj = occ(j,ispin) 
-      htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii)
-    enddo
-  enddo
- 
-  ! beta/beta two-body
-  do i = 1, Ne(jspin)
-    ii = occ(i,jspin) 
-    do j = i+1, Ne(jspin)
-      jj = occ(j,jspin) 
-      htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii)
-    enddo
-  enddo
-
-  htot = hmono + htwoe 
-
-end
-
-! ---
-
 subroutine double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot)
 
   BEGIN_DOC
diff --git a/src/determinants/slater_rules_general.irp.f b/src/determinants/slater_rules_general.irp.f
new file mode 100644
index 00000000..e987c846
--- /dev/null
+++ b/src/determinants/slater_rules_general.irp.f
@@ -0,0 +1,192 @@
+subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase)
+ use bitmasks
+ BEGIN_DOC
+! returns the array, for each spin, of holes/particles between key_i and key_j 
+!
+! with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j>
+ END_DOC
+  include 'utils/constants.include.F'
+ implicit none
+ integer, intent(in)            :: Nint
+ integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
+ integer, intent(out)           :: holes_array(100,2),particles_array(100,2),degree_array(2)
+ double precision, intent(out)  :: phase
+ integer :: ispin,k,i,pos
+ integer(bit_kind) :: key_hole, key_particle
+ integer(bit_kind) :: xorvec(N_int_max,2)
+ holes_array = -1
+ particles_array = -1
+ degree_array = 0
+  do i = 1, N_int
+   xorvec(i,1) = xor( key_i(i,1), key_j(i,1))
+   xorvec(i,2) = xor( key_i(i,2), key_j(i,2))
+   degree_array(1) += popcnt(xorvec(i,1))
+   degree_array(2) += popcnt(xorvec(i,2))
+  enddo
+  degree_array(1) = shiftr(degree_array(1),1)
+  degree_array(2) = shiftr(degree_array(2),1)
+  
+ do ispin = 1, 2
+  k = 1
+  !!! GETTING THE HOLES 
+  do i = 1, N_int
+   key_hole = iand(xorvec(i,ispin),key_i(i,ispin))
+   do while(key_hole .ne.0_bit_kind)
+    pos = trailz(key_hole)
+    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_hole = ibclr(key_hole,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_excitation_general'
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'stoping ...'
+     stop
+    endif
+   enddo 
+  enddo
+ enddo
+ do ispin = 1, 2
+  k = 1
+  !!! GETTING THE PARTICLES
+  do i = 1, N_int
+   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
+   do while(key_particle .ne.0_bit_kind)
+    pos = trailz(key_particle)
+    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_particle = ibclr(key_particle,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_excitation_general '
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'stoping ...'
+     stop
+    endif
+   enddo
+  enddo 
+ enddo
+ integer :: h,p, i_ok
+ integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
+ integer                        :: exc(0:2,2,2)
+ double precision :: phase_tmp
+ allocate(det_i(Nint,2),det_ip(N_int,2))
+ det_i = key_i
+ phase = 1.d0
+ do ispin = 1, 2
+  do i = 1, degree_array(ispin)
+   h = holes_array(i,ispin)
+   p = particles_array(i,ispin)
+   det_ip = det_i
+   call do_single_excitation(det_ip,h,p,ispin,i_ok)
+   if(i_ok == -1)then
+     print*,'excitation was not possible '
+     stop
+   endif
+   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
+   phase *= phase_tmp
+   det_i = det_ip
+  enddo
+ enddo
+
+end
+
+subroutine get_holes_general(key_i, key_j,Nint, holes_array)
+ use bitmasks
+ BEGIN_DOC
+! returns the array, per spin, of holes between key_i and key_j 
+!
+! with the following convention: a_{hole}|key_i> --> |key_j>
+ END_DOC
+ implicit none
+ integer, intent(in)            :: Nint
+ integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
+ integer, intent(out)           :: holes_array(100,2)
+ integer(bit_kind) :: key_hole
+ integer :: ispin,k,i,pos
+ holes_array = -1
+ do ispin = 1, 2
+  k = 1
+  do i = 1, N_int
+   key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin))
+   do while(key_hole .ne.0_bit_kind)
+    pos = trailz(key_hole)
+    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_hole = ibclr(key_hole,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_holes_general'
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'stoping ...'
+     stop
+    endif
+   enddo 
+  enddo
+ enddo
+end
+
+subroutine get_particles_general(key_i, key_j,Nint,particles_array)
+ use bitmasks
+ BEGIN_DOC
+! returns the array, per spin, of particles between key_i and key_j 
+!
+! with the following convention: a^dagger_{particle}|key_i> --> |key_j>
+ END_DOC
+ implicit none
+ integer, intent(in)            :: Nint
+ integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
+ integer, intent(out)           :: particles_array(100,2)
+ integer(bit_kind) :: key_particle
+ integer :: ispin,k,i,pos
+ particles_array = -1
+ do ispin = 1, 2
+  k = 1
+  do i = 1, N_int
+   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
+   do while(key_particle .ne.0_bit_kind)
+    pos = trailz(key_particle)
+    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
+    key_particle = ibclr(key_particle,pos)
+    k += 1
+    if(k .gt.100)then
+     print*,'WARNING in get_holes_general'
+     print*,'More than a 100-th excitation for spin ',ispin
+     print*,'Those are the two determinants'
+     call debug_det(key_i, N_int)
+     call debug_det(key_j, N_int)
+     print*,'stoping ...'
+     stop
+    endif
+   enddo 
+  enddo
+ enddo
+end
+
+subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase)
+ implicit none
+ integer, intent(in)            :: degree(2), Nint
+ integer(bit_kind), intent(in)  :: key_i(Nint,2)
+ integer, intent(in)            :: holes_array(100,2),particles_array(100,2)
+ double precision, intent(out)  :: phase
+ integer :: i,ispin,h,p, i_ok
+ integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
+ integer                        :: exc(0:2,2,2)
+ double precision :: phase_tmp
+ allocate(det_i(Nint,2),det_ip(N_int,2))
+ det_i = key_i
+ phase = 1.d0
+ do ispin = 1, 2
+  do i = 1, degree(ispin)
+   h = holes_array(i,ispin)
+   p = particles_array(i,ispin)
+   det_ip = det_i
+   call do_single_excitation(det_ip,h,p,ispin,i_ok)
+   if(i_ok == -1)then
+     print*,'excitation was not possible '
+     stop
+   endif
+   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
+   phase *= phase_tmp
+   det_i = det_ip
+  enddo
+ enddo
+
+end

From b749796d931401f2c7e966e2c7eeedfff2f4477c Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 6 May 2024 18:33:29 +0200
Subject: [PATCH 029/131] still not compiling

---
 .../tc_bi_ortho/dressing_vectors_lr.irp.f      |  8 ++++----
 .../local/tc_bi_ortho/e_corr_bi_ortho.irp.f    | 18 +++++++++---------
 plugins/local/tc_bi_ortho/print_tc_wf.irp.f    |  6 +++---
 .../local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f  |  2 +-
 plugins/local/tc_bi_ortho/tc_som.irp.f         |  4 ++--
 plugins/local/tc_bi_ortho/tc_utils.irp.f       | 10 +++++-----
 .../local/tc_bi_ortho/test_normal_order.irp.f  |  8 ++++----
 .../local/tc_bi_ortho/test_tc_bi_ortho.irp.f   | 10 +++++-----
 plugins/local/tc_bi_ortho/test_tc_fock.irp.f   |  4 ++--
 9 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f b/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f
index 0aff9980..135f9d17 100644
--- a/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f
+++ b/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f
@@ -27,7 +27,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
   call hmat_bi_ortho         (psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)
 
   delta = 0.d0
@@ -39,7 +39,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
     do j = 1, ndet
 
       ! < I |Htilde | J >
-      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
       ! < I |H | J >
       call hmat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)
 
@@ -78,7 +78,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
 
   i = 1
   j = 1
-  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
 
   delta = 0.d0
  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8)   &
@@ -88,7 +88,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
     do j = 1, ndet
 
       ! < I |Htilde | J >
-      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
 
       delta(i) = delta(i) + psicoef(j) * htc_tot
     enddo
diff --git a/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f b/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f
index 6d5c3b21..4abdc25b 100644
--- a/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f
+++ b/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f
@@ -2,7 +2,7 @@
  BEGIN_PROVIDER [ double precision, e_tilde_00]
  implicit none
  double precision :: hmono,htwoe,hthree,htot
- call htilde_mu_mat_bi_ortho_slow(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,htot)
+ call htilde_mu_mat_opt_bi_ortho(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,htot)
  e_tilde_00 = htot
  END_PROVIDER 
 
@@ -18,11 +18,11 @@
  do i = 1, N_det
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
   if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
    delta_e = e_tilde_00 - e_i0
    coef_pt1 = htilde_ij / delta_e
-   call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_opt_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
    e_pt2_tc_bi_orth += coef_pt1 * htilde_ij
    if(degree == 1)then
     e_pt2_tc_bi_orth_single += coef_pt1 * htilde_ij
@@ -37,7 +37,7 @@
  BEGIN_PROVIDER [ double precision, e_tilde_bi_orth_00]
  implicit none
  double precision :: hmono,htwoe,hthree,htilde_ij
- call htilde_mu_mat_bi_ortho_slow(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,e_tilde_bi_orth_00)
+ call htilde_mu_mat_opt_bi_ortho(HF_bitmask,HF_bitmask,N_int,hmono,htwoe,hthree,e_tilde_bi_orth_00)
  e_tilde_bi_orth_00 += nuclear_repulsion
  END_PROVIDER 
 
@@ -57,7 +57,7 @@
  e_corr_double_bi_orth = 0.d0
  do i = 1, N_det
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
-  call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+  call htilde_mu_mat_opt_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
   if(degree == 1)then
    e_corr_single_bi_orth += reigvec_tc_bi_orth(i,1) * htilde_ij/reigvec_tc_bi_orth(1,1)
    e_corr_single_bi_orth_abs += dabs(reigvec_tc_bi_orth(i,1) * htilde_ij/reigvec_tc_bi_orth(1,1))
@@ -80,7 +80,7 @@
  do i = 1, N_det
   accu += reigvec_tc_bi_orth(i,1) * leigvec_tc_bi_orth(i,1)
   do j = 1, N_det
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j),psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j),psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
    e_tc_left_right += htilde_ij * reigvec_tc_bi_orth(i,1) * leigvec_tc_bi_orth(j,1)
   enddo
  enddo
@@ -99,8 +99,8 @@ BEGIN_PROVIDER [ double precision, coef_pt1_bi_ortho, (N_det)]
   if(degree==0)then
    coef_pt1_bi_ortho(i) = 1.d0
   else
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
    delta_e = e_tilde_00 - e_i0
    coef_pt1 = htilde_ij / delta_e
    coef_pt1_bi_ortho(i)= coef_pt1
diff --git a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f b/plugins/local/tc_bi_ortho/print_tc_wf.irp.f
index 2b88bc5b..ab5ce371 100644
--- a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f
+++ b/plugins/local/tc_bi_ortho/print_tc_wf.irp.f
@@ -61,12 +61,12 @@ subroutine routine
  do i = 1, N_det
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
    if(degree == 1 .or. degree == 2)then
-    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
+    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0)
     delta_e = e_tilde_00 - e_i0
     coef_pt1 = htilde_ij / delta_e
  
-    call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_opt_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij)
     contrib_pt = coef_pt1 * htilde_ij
     e_pt2 += contrib_pt
  
diff --git a/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f b/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
index 4c3c0788..5cbf26d2 100644
--- a/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
+++ b/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
@@ -14,7 +14,7 @@
   call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
   call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
   if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
+   call htilde_mu_mat_opt_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
   endif
  enddo
  reigvec_tc_bi_orth_tmp = 0.d0
diff --git a/plugins/local/tc_bi_ortho/tc_som.irp.f b/plugins/local/tc_bi_ortho/tc_som.irp.f
index 1d11c81b..6bdcc1f0 100644
--- a/plugins/local/tc_bi_ortho/tc_som.irp.f
+++ b/plugins/local/tc_bi_ortho/tc_som.irp.f
@@ -49,8 +49,8 @@ subroutine main()
   U_SOM = 0.d0 
   do i = 1, N_det
     if(i == i_HF) cycle
-    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
-    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
+    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1)
+    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2)
     U_SOM += htot_1 * htot_2
   enddo
   U_SOM = 0.5d0 * U_SOM
diff --git a/plugins/local/tc_bi_ortho/tc_utils.irp.f b/plugins/local/tc_bi_ortho/tc_utils.irp.f
index 43a6865e..2aa148a3 100644
--- a/plugins/local/tc_bi_ortho/tc_utils.irp.f
+++ b/plugins/local/tc_bi_ortho/tc_utils.irp.f
@@ -25,7 +25,7 @@ subroutine write_tc_energy()
     E_2e_tmp(i) = 0.d0
     E_3e_tmp(i) = 0.d0
     do j = 1, N_det
-      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+      call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
       E_TC_tmp(i) = E_TC_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * htot
       E_1e_tmp(i) = E_1e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * hmono
       E_2e_tmp(i) = E_2e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * htwoe
@@ -70,7 +70,7 @@ subroutine write_tc_energy()
     E_3e = 0.d0
     do i = 1, N_det
       do j = 1, N_det
-        call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
+        call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot)
         E_TC = E_TC + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htot
         E_1e = E_1e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * hmono
         E_2e = E_2e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htwoe
@@ -109,8 +109,8 @@ subroutine write_tc_var()
 
     SIGMA_TC = 0.d0
     do j = 2, N_det
-      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
-      call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
+      call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j)
+      call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1)
       SIGMA_TC = SIGMA_TC + htot_1j * htot_j1
     enddo
 
@@ -132,7 +132,7 @@ subroutine write_tc_gs_var_HF()
 
   SIGMA_TC = 0.d0
   do j = 2, N_det
-    call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot)
+    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot)
     SIGMA_TC = SIGMA_TC + htot * htot
   enddo
 
diff --git a/plugins/local/tc_bi_ortho/test_normal_order.irp.f b/plugins/local/tc_bi_ortho/test_normal_order.irp.f
index 0cf27396..7b4c558f 100644
--- a/plugins/local/tc_bi_ortho/test_normal_order.irp.f
+++ b/plugins/local/tc_bi_ortho/test_normal_order.irp.f
@@ -54,7 +54,7 @@ subroutine test
     if(i_ok.ne.1)cycle
      call do_single_excitation(det_i,h2,p2,s2,i_ok)
     if(i_ok.ne.1)cycle
-     call htilde_mu_mat_bi_ortho_slow(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij)
+     call htilde_mu_mat_opt_bi_ortho(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij)
      call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
      call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
      hthree_tmp *= phase
@@ -66,7 +66,7 @@ subroutine test
     if(i_ok.ne.1)cycle
      call do_single_excitation(det_i,h2,p2,s2,i_ok)
     if(i_ok.ne.1)cycle
-     call htilde_mu_mat_bi_ortho_slow(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij)
+     call htilde_mu_mat_opt_bi_ortho(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij)
      call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
      call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
      hthree_tmp *= phase
@@ -109,7 +109,7 @@ do h1 = 1, elec_alpha_num
     if(i_ok.ne.1)cycle
     call do_single_excitation(det_i,h2,p2,s2,i_ok)
     if(i_ok.ne.1)cycle
-    call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_opt_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
     integer :: hh1, pp1, hh2, pp2, ss1, ss2
@@ -145,7 +145,7 @@ do h1 = 1, elec_beta_num
     if(i_ok.ne.1)cycle
     call do_single_excitation(det_i,h2,p2,s2,i_ok)
     if(i_ok.ne.1)cycle
-    call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
+    call htilde_mu_mat_opt_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
     call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2)
diff --git a/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f b/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f
index 369efd15..559c0200 100644
--- a/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f
+++ b/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f
@@ -88,7 +88,7 @@ subroutine test_slater_tc_opt
  i_count = 0.d0
  do i = 1, N_det
   do j = 1,N_det
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
    call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hnewmono, hnewtwoe, hnewthree, hnewtot)
    if(dabs(htot).gt.1.d-15)then
      i_count += 1.D0
@@ -124,7 +124,7 @@ subroutine timing_tot
   do j = 1, N_det
 !   call get_excitation_degree(psi_det(1,1,j), psi_det(1,1,i),degree,N_int)
    i_count += 1.d0
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   enddo
  enddo
  call wall_time(wall1)
@@ -171,7 +171,7 @@ subroutine timing_diag
  do i = 1, N_det
   do j = i,i 
    i_count += 1.d0
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
   enddo
  enddo
  call wall_time(wall1)
@@ -208,7 +208,7 @@ subroutine timing_single
    if(degree.ne.1)cycle
    i_count += 1.d0
    call wall_time(wall0)
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
    call wall_time(wall1)
    accu += wall1 - wall0
   enddo
@@ -250,7 +250,7 @@ subroutine timing_double
    if(degree.ne.2)cycle
    i_count += 1.d0
    call wall_time(wall0)
-   call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
+   call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot)
    call wall_time(wall1)
    accu += wall1 - wall0
   enddo
diff --git a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f b/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
index 85f3ed97..b33b2e93 100644
--- a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
+++ b/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
@@ -64,7 +64,7 @@ subroutine routine_3()
        print*, ' excited det'
        call debug_det(det_i, N_int)
  
-       call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
+       call htilde_mu_mat_opt_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
        if(dabs(hthree).lt.1.d-10)cycle
        ref = hthree 
        if(s1 == 1)then
@@ -130,7 +130,7 @@ subroutine routine_tot()
         stop
        endif
  
-       call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
+       call htilde_mu_mat_opt_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
        print*,htilde_ij
 !       if(dabs(htilde_ij).lt.1.d-10)cycle
        print*, ' excited det'

From 366afb2933baba919db1ad85b7eee965ea56d0c6 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 6 May 2024 18:53:20 +0200
Subject: [PATCH 030/131] compiling after some cleaning

---
 plugins/local/old_delta_tc_qmc/NEED           |   1 +
 plugins/local/old_delta_tc_qmc/README.rst     |   4 +
 .../compute_deltamu_right.irp.f               |   0
 .../dressing_vectors_lr.irp.f                 |   0
 .../old_delta_tc_qmc/old_delta_tc_qmc.irp.f   |   7 +
 plugins/local/slater_tc/h_mat_triple.irp.f    | 198 ++++++++++++++++++
 .../local/slater_tc_no_opt/h_mat_triple.irp.f | 193 -----------------
 .../test_tc_bi_ortho.irp.f                    |   0
 plugins/local/tc_bi_ortho/pt2_tc_cisd.irp.f   | 129 ------------
 plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f   |  36 ----
 .../local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f | 145 -------------
 plugins/local/tc_bi_ortho/test_s2_tc.irp.f    | 170 ---------------
 12 files changed, 210 insertions(+), 673 deletions(-)
 create mode 100644 plugins/local/old_delta_tc_qmc/NEED
 create mode 100644 plugins/local/old_delta_tc_qmc/README.rst
 rename plugins/local/{tc_bi_ortho => old_delta_tc_qmc}/compute_deltamu_right.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => old_delta_tc_qmc}/dressing_vectors_lr.irp.f (100%)
 create mode 100644 plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f
 create mode 100644 plugins/local/slater_tc/h_mat_triple.irp.f
 delete mode 100644 plugins/local/slater_tc_no_opt/h_mat_triple.irp.f
 rename plugins/local/{tc_bi_ortho => slater_tc_no_opt}/test_tc_bi_ortho.irp.f (100%)
 delete mode 100644 plugins/local/tc_bi_ortho/pt2_tc_cisd.irp.f
 delete mode 100644 plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f
 delete mode 100644 plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
 delete mode 100644 plugins/local/tc_bi_ortho/test_s2_tc.irp.f

diff --git a/plugins/local/old_delta_tc_qmc/NEED b/plugins/local/old_delta_tc_qmc/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/plugins/local/old_delta_tc_qmc/NEED
@@ -0,0 +1 @@
+
diff --git a/plugins/local/old_delta_tc_qmc/README.rst b/plugins/local/old_delta_tc_qmc/README.rst
new file mode 100644
index 00000000..1d56f96c
--- /dev/null
+++ b/plugins/local/old_delta_tc_qmc/README.rst
@@ -0,0 +1,4 @@
+================
+old_delta_tc_qmc
+================
+
diff --git a/plugins/local/tc_bi_ortho/compute_deltamu_right.irp.f b/plugins/local/old_delta_tc_qmc/compute_deltamu_right.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/compute_deltamu_right.irp.f
rename to plugins/local/old_delta_tc_qmc/compute_deltamu_right.irp.f
diff --git a/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f b/plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f
rename to plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f
diff --git a/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f b/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f
new file mode 100644
index 00000000..5ff08bd6
--- /dev/null
+++ b/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f
@@ -0,0 +1,7 @@
+program old_delta_tc_qmc
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+end
diff --git a/plugins/local/slater_tc/h_mat_triple.irp.f b/plugins/local/slater_tc/h_mat_triple.irp.f
new file mode 100644
index 00000000..9cb4b60a
--- /dev/null
+++ b/plugins/local/slater_tc/h_mat_triple.irp.f
@@ -0,0 +1,198 @@
+subroutine H_tc_s2_u_0_with_pure_three(v_0, s_0, u_0, N_st, sze)
+  BEGIN_DOC
+  ! Computes $v_0 = H^TC | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
+  !
+  ! Assumes that the determinants are in psi_det
+  !
+  ! istart, iend, ishift, istep are used in ZMQ parallelization.
+  END_DOC
+
+  use bitmasks
+  implicit none
+
+  integer,          intent(in)  :: N_st,sze
+  double precision, intent(in)  :: u_0(sze,N_st)
+  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
+  call H_tc_s2_u_0_opt(v_0, s_0, u_0, N_st, sze)
+  integer :: i,j,degree,ist
+  double precision :: hmono, htwoe, hthree, htot
+  do i = 1, N_det
+   do j = 1, N_det
+    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
+    if(degree .ne. 3)cycle
+    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,i), psi_det(1,1,j), hmono, htwoe, hthree, htot)
+    do ist = 1, N_st
+     v_0(i,ist) += htot * u_0(j,ist)
+    enddo
+   enddo
+  enddo
+end
+
+subroutine H_tc_s2_u_0_with_pure_three_omp(v_0, s_0, u_0, N_st, sze)
+  BEGIN_DOC
+  ! Computes $v_0 = H^TC | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
+  !
+  ! Assumes that the determinants are in psi_det
+  !
+  ! istart, iend, ishift, istep are used in ZMQ parallelization.
+  END_DOC
+
+  use bitmasks
+  implicit none
+
+  integer,          intent(in)  :: N_st,sze
+  double precision, intent(in)  :: u_0(sze,N_st)
+  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
+  call H_tc_s2_u_0_opt(v_0, s_0, u_0, N_st, sze)
+  integer :: i,j,degree,ist
+  double precision :: hmono, htwoe, hthree, htot
+  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) &
+  !$OMP SHARED(N_st, N_det, N_int, psi_det, u_0, v_0)       &
+  !$OMP PRIVATE(ist, i, j, degree, hmono, htwoe, hthree,htot)
+  do i = 1, N_det
+   do j = 1, N_det
+    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
+    if(degree .ne. 3)cycle
+    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,i), psi_det(1,1,j), hmono, htwoe, hthree, htot)
+    do ist = 1, N_st
+     v_0(i,ist) += htot * u_0(j,ist)
+    enddo
+   enddo
+  enddo
+ !$OMP END PARALLEL DO
+end
+
+! ---
+
+subroutine H_tc_s2_dagger_u_0_with_pure_three(v_0, s_0, u_0, N_st, sze)
+  BEGIN_DOC
+  ! Computes $v_0 = (H^TC)^dagger | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
+  !
+  ! Assumes that the determinants are in psi_det
+  !
+  ! istart, iend, ishift, istep are used in ZMQ parallelization.
+  END_DOC
+
+  use bitmasks
+  implicit none
+
+  integer,          intent(in)  :: N_st,sze
+  double precision, intent(in)  :: u_0(sze,N_st)
+  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
+  call H_tc_s2_dagger_u_0_opt(v_0, s_0, u_0, N_st, sze)
+  integer :: i,j,degree,ist
+  double precision :: hmono, htwoe, hthree, htot
+  do i = 1, N_det
+   do j = 1, N_det
+    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
+    if(degree .ne. 3)cycle
+    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,j), psi_det(1,1,i), hmono, htwoe, hthree, htot)
+    do ist = 1, N_st
+     v_0(i,ist) += htot * u_0(j,ist)
+    enddo
+   enddo
+  enddo
+end
+
+subroutine H_tc_s2_dagger_u_0_with_pure_three_omp(v_0, s_0, u_0, N_st, sze)
+  BEGIN_DOC
+  ! Computes $v_0 = (H^TC)^dagger | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
+  !
+  ! Assumes that the determinants are in psi_det
+  !
+  ! istart, iend, ishift, istep are used in ZMQ parallelization.
+  END_DOC
+
+  use bitmasks
+  implicit none
+
+  integer,          intent(in)  :: N_st,sze
+  double precision, intent(in)  :: u_0(sze,N_st)
+  double precision, intent(out) :: v_0(sze,N_st), s_0(sze,N_st)
+  call H_tc_s2_dagger_u_0_opt(v_0, s_0, u_0, N_st, sze)
+  integer :: i,j,degree,ist
+  double precision :: hmono, htwoe, hthree, htot
+  !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) &
+  !$OMP SHARED(N_st, N_det, N_int, psi_det, u_0, v_0)       &
+  !$OMP PRIVATE(ist, i, j, degree, hmono, htwoe, hthree,htot)
+  do i = 1, N_det
+   do j = 1, N_det
+    call get_excitation_degree(psi_det(1,1,i),psi_det(1,1,j),degree,N_int)
+    if(degree .ne. 3)cycle
+    call triple_htilde_mu_mat_fock_bi_ortho(N_int, psi_det(1,1,j), psi_det(1,1,i), hmono, htwoe, hthree, htot)
+    do ist = 1, N_st
+     v_0(i,ist) += htot * u_0(j,ist)
+    enddo
+   enddo
+  enddo
+ !$OMP END PARALLEL DO
+end
+
+! ---
+subroutine triple_htilde_mu_mat_fock_bi_ortho(Nint, key_j, key_i, hmono, htwoe, hthree, htot)
+  use bitmasks
+  BEGIN_DOC
+! <key_j | H_tilde | key_i> for triple excitation  
+!!
+!! WARNING !!
+! 
+! Genuine triple excitations of the same spin are not yet implemented
+  END_DOC
+  implicit none
+  integer(bit_kind), intent(in)  :: key_j(N_int,2),key_i(N_int,2)
+  integer, intent(in)            :: Nint
+  double precision, intent(out)  :: hmono, htwoe, hthree, htot
+  integer                        :: degree
+  integer                        :: h1, p1, h2, p2, s1, s2, h3, p3, s3
+  integer                        :: holes_array(100,2),particles_array(100,2),degree_array(2)
+  double precision               :: phase,sym_3_e_int_from_6_idx_tensor
+
+  hmono  = 0.d0
+  htwoe  = 0.d0
+  hthree = 0.d0
+  htot   = 0.d0
+  call get_excitation_general(key_j, key_i, Nint,degree_array,holes_array, particles_array,phase)
+  degree = degree_array(1) + degree_array(2)
+  if(degree .ne. 3)return
+  if(degree_array(1)==3.or.degree_array(2)==3)then
+   if(degree_array(1) == 3)then
+    h1 = holes_array(1,1) 
+    h2 = holes_array(2,1) 
+    h3 = holes_array(3,1) 
+    p1 = particles_array(1,1) 
+    p2 = particles_array(2,1) 
+    p3 = particles_array(3,1) 
+   else
+    h1 = holes_array(1,2) 
+    h2 = holes_array(2,2) 
+    h3 = holes_array(3,2) 
+    p1 = particles_array(1,2) 
+    p2 = particles_array(2,2) 
+    p3 = particles_array(3,2) 
+   endif
+   hthree = sym_3_e_int_from_6_idx_tensor(p3, p2, p1, h3, h2, h1)
+  else 
+   if(degree_array(1) == 2.and.degree_array(2) == 1)then ! double alpha + single beta
+    h1 = holes_array(1,1) 
+    h2 = holes_array(2,1) 
+    h3 = holes_array(1,2) 
+    p1 = particles_array(1,1) 
+    p2 = particles_array(2,1) 
+    p3 = particles_array(1,2) 
+   else if(degree_array(2) == 2 .and. degree_array(1) == 1)then ! double beta + single alpha 
+    h1 = holes_array(1,2) 
+    h2 = holes_array(2,2) 
+    h3 = holes_array(1,1) 
+    p1 = particles_array(1,2) 
+    p2 = particles_array(2,2) 
+    p3 = particles_array(1,1) 
+   else 
+    print*,'PB !!'
+    stop
+   endif
+   hthree = three_body_ints_bi_ort(p3,p2,p1,h3,h2,h1) - three_body_ints_bi_ort(p3,p2,p1,h3,h1,h2)
+  endif
+  hthree  *= phase
+  htot = hthree
+ end
+
diff --git a/plugins/local/slater_tc_no_opt/h_mat_triple.irp.f b/plugins/local/slater_tc_no_opt/h_mat_triple.irp.f
deleted file mode 100644
index e2c8f982..00000000
--- a/plugins/local/slater_tc_no_opt/h_mat_triple.irp.f
+++ /dev/null
@@ -1,193 +0,0 @@
-subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase)
- use bitmasks
- BEGIN_DOC
-! returns the array, for each spin, of holes/particles between key_i and key_j 
-!
-! with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j>
- END_DOC
-  include 'utils/constants.include.F'
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: holes_array(100,2),particles_array(100,2),degree_array(2)
- double precision, intent(out)  :: phase
- integer :: ispin,k,i,pos
- integer(bit_kind) :: key_hole, key_particle
- integer(bit_kind) :: xorvec(N_int_max,2)
- holes_array = -1
- particles_array = -1
- degree_array = 0
-  do i = 1, N_int
-   xorvec(i,1) = xor( key_i(i,1), key_j(i,1))
-   xorvec(i,2) = xor( key_i(i,2), key_j(i,2))
-   degree_array(1) += popcnt(xorvec(i,1))
-   degree_array(2) += popcnt(xorvec(i,2))
-  enddo
-  degree_array(1) = shiftr(degree_array(1),1)
-  degree_array(2) = shiftr(degree_array(2),1)
-  
- do ispin = 1, 2
-  k = 1
-  !!! GETTING THE HOLES 
-  do i = 1, N_int
-   key_hole = iand(xorvec(i,ispin),key_i(i,ispin))
-   do while(key_hole .ne.0_bit_kind)
-    pos = trailz(key_hole)
-    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_hole = ibclr(key_hole,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_excitation_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
- do ispin = 1, 2
-  k = 1
-  !!! GETTING THE PARTICLES
-  do i = 1, N_int
-   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
-   do while(key_particle .ne.0_bit_kind)
-    pos = trailz(key_particle)
-    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_particle = ibclr(key_particle,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_excitation_general '
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo
-  enddo 
- enddo
- integer :: h,p, i_ok
- integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
- integer                        :: exc(0:2,2,2)
- double precision :: phase_tmp
- allocate(det_i(Nint,2),det_ip(N_int,2))
- det_i = key_i
- phase = 1.d0
- do ispin = 1, 2
-  do i = 1, degree_array(ispin)
-   h = holes_array(i,ispin)
-   p = particles_array(i,ispin)
-   det_ip = det_i
-   call do_single_excitation(det_ip,h,p,ispin,i_ok)
-   if(i_ok == -1)then
-     print*,'excitation was not possible '
-     stop
-   endif
-   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
-   phase *= phase_tmp
-   det_i = det_ip
-  enddo
- enddo
-
-end
-
-subroutine get_holes_general(key_i, key_j,Nint, holes_array)
- use bitmasks
- BEGIN_DOC
-! returns the array, per spin, of holes between key_i and key_j 
-!
-! with the following convention: a_{hole}|key_i> --> |key_j>
- END_DOC
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: holes_array(100,2)
- integer(bit_kind) :: key_hole
- integer :: ispin,k,i,pos
- holes_array = -1
- do ispin = 1, 2
-  k = 1
-  do i = 1, N_int
-   key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin))
-   do while(key_hole .ne.0_bit_kind)
-    pos = trailz(key_hole)
-    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_hole = ibclr(key_hole,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_holes_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
-end
-
-subroutine get_particles_general(key_i, key_j,Nint,particles_array)
- use bitmasks
- BEGIN_DOC
-! returns the array, per spin, of particles between key_i and key_j 
-!
-! with the following convention: a^dagger_{particle}|key_i> --> |key_j>
- END_DOC
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: particles_array(100,2)
- integer(bit_kind) :: key_particle
- integer :: ispin,k,i,pos
- particles_array = -1
- do ispin = 1, 2
-  k = 1
-  do i = 1, N_int
-   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
-   do while(key_particle .ne.0_bit_kind)
-    pos = trailz(key_particle)
-    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_particle = ibclr(key_particle,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_holes_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'Those are the two determinants'
-     call debug_det(key_i, N_int)
-     call debug_det(key_j, N_int)
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
-end
-
-subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase)
- implicit none
- integer, intent(in)            :: degree(2), Nint
- integer(bit_kind), intent(in)  :: key_i(Nint,2)
- integer, intent(in)            :: holes_array(100,2),particles_array(100,2)
- double precision, intent(out)  :: phase
- integer :: i,ispin,h,p, i_ok
- integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
- integer                        :: exc(0:2,2,2)
- double precision :: phase_tmp
- allocate(det_i(Nint,2),det_ip(N_int,2))
- det_i = key_i
- phase = 1.d0
- do ispin = 1, 2
-  do i = 1, degree(ispin)
-   h = holes_array(i,ispin)
-   p = particles_array(i,ispin)
-   det_ip = det_i
-   call do_single_excitation(det_ip,h,p,ispin,i_ok)
-   if(i_ok == -1)then
-     print*,'excitation was not possible '
-     stop
-   endif
-   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
-   phase *= phase_tmp
-   det_i = det_ip
-  enddo
- enddo
-
-end
-
diff --git a/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f b/plugins/local/slater_tc_no_opt/test_tc_bi_ortho.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f
rename to plugins/local/slater_tc_no_opt/test_tc_bi_ortho.irp.f
diff --git a/plugins/local/tc_bi_ortho/pt2_tc_cisd.irp.f b/plugins/local/tc_bi_ortho/pt2_tc_cisd.irp.f
deleted file mode 100644
index 8940a4f6..00000000
--- a/plugins/local/tc_bi_ortho/pt2_tc_cisd.irp.f
+++ /dev/null
@@ -1,129 +0,0 @@
-program pt2_tc_cisd
-
-  BEGIN_DOC
-  !
-  ! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together 
-  !        with the energy. Saves the left-right wave functions at the end. 
-  !
-  END_DOC
-
-  implicit none
-
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  read_wf = .True.
-  touch read_wf
-
-  print*, ' nb of states = ', N_states
-  print*, ' nb of det    = ', N_det
-  call routine_diag()
-
-  call routine
-end
-
-subroutine routine
- implicit none
- integer :: i,h1,p1,h2,p2,s1,s2,degree
- double precision :: h0i,hi0,e00,ei,delta_e
- double precision :: norm,e_corr,coef,e_corr_pos,e_corr_neg,e_corr_abs
-
- integer                        :: exc(0:2,2,2)
- double precision               :: phase
- double precision :: eh1,ep1,eh2,ep2
-
- norm = 0.d0 
- e_corr = 0.d0
- e_corr_abs = 0.d0
- e_corr_pos = 0.d0
- e_corr_neg = 0.d0
- call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,1), psi_det(1,1,1), N_int, e00) 
- do i = 2, N_det
-  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,1), N_int, hi0) 
-  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,1), psi_det(1,1,i), N_int, h0i) 
-  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, ei) 
-  call get_excitation_degree(psi_det(1,1,1), psi_det(1,1,i),degree,N_int)
-  call get_excitation(psi_det(1,1,1), psi_det(1,1,i),exc,degree,phase,N_int)
-  call decode_exc(exc,degree,h1,p1,h2,p2,s1,s2)
-  eh1 = Fock_matrix_tc_diag_mo_tot(h1)
-  ep1 = Fock_matrix_tc_diag_mo_tot(p1)
-  delta_e = eh1 - ep1
-  if (degree==2)then
-   eh2 = Fock_matrix_tc_diag_mo_tot(h2)
-   ep2 = Fock_matrix_tc_diag_mo_tot(p2)
-   delta_e +=  eh2 - ep2
-  endif
-!  delta_e = e00 - ei
-  coef = hi0/delta_e
-  norm += coef*coef
-  e_corr = coef* h0i
-  if(e_corr.lt.0.d0)then
-   e_corr_neg += e_corr
-  elseif(e_corr.gt.0.d0)then
-   e_corr_pos += e_corr
-  endif
-  e_corr_abs += dabs(e_corr)
- enddo
- print*,'e_corr_abs = ',e_corr_abs
- print*,'e_corr_pos = ',e_corr_pos
- print*,'e_corr_neg = ',e_corr_neg
- print*,'norm       = ',dsqrt(norm)
-
-end
-
-subroutine routine_diag()
-
-  implicit none
-  integer          :: i, j, k
-  double precision :: dE
-
-  ! provide eigval_right_tc_bi_orth
-  ! provide overlap_bi_ortho
-  ! provide htilde_matrix_elmt_bi_ortho
-
-  if(N_states .eq. 1) then
-
-    print*,'eigval_right_tc_bi_orth   = ',eigval_right_tc_bi_orth(1)
-    print*,'e_tc_left_right           = ',e_tc_left_right
-    print*,'e_tilde_bi_orth_00        = ',e_tilde_bi_orth_00
-    print*,'e_pt2_tc_bi_orth          = ',e_pt2_tc_bi_orth
-    print*,'e_pt2_tc_bi_orth_single   = ',e_pt2_tc_bi_orth_single
-    print*,'e_pt2_tc_bi_orth_double   = ',e_pt2_tc_bi_orth_double
-    print*,'***'                      
-    print*,'e_corr_bi_orth            = ',e_corr_bi_orth
-    print*,'e_corr_bi_orth_proj       = ',e_corr_bi_orth_proj
-    print*,'e_corr_bi_orth_proj_abs   = ',e_corr_bi_orth_proj_abs
-    print*,'e_corr_single_bi_orth     = ',e_corr_single_bi_orth
-    print*,'e_corr_double_bi_orth     = ',e_corr_double_bi_orth
-    print*,'e_corr_single_bi_orth_abs = ',e_corr_single_bi_orth_abs
-    print*,'e_corr_double_bi_orth_abs = ',e_corr_double_bi_orth_abs
-    print*,'Left/right eigenvectors'
-    do i = 1,N_det
-      write(*,'(I5,X,(100(F12.7,X)))')i,leigvec_tc_bi_orth(i,1),reigvec_tc_bi_orth(i,1),leigvec_tc_bi_orth(i,1)*reigvec_tc_bi_orth(i,1)
-    enddo
-
-  else
-
-    print*,'eigval_right_tc_bi_orth : '
-    do i = 1, N_states
-      print*, i, eigval_right_tc_bi_orth(i)
-    enddo
-
-    print*,''
-    print*,'******************************************************'
-    print*,'TC Excitation energies (au)                     (eV)'
-    do i = 2, N_states
-      dE = eigval_right_tc_bi_orth(i) - eigval_right_tc_bi_orth(1)
-      print*, i, dE, dE/0.0367502d0
-    enddo
-    print*,''
-
-  endif
-
-end
-
-
-
diff --git a/plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f b/plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f
deleted file mode 100644
index d4c8c55d..00000000
--- a/plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f
+++ /dev/null
@@ -1,36 +0,0 @@
-
-! ---
-
-program tc_cisd_sc2
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  print *, 'Hello world'
-
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  read_wf = .True.
-  touch read_wf
-
-  call test
-
-end
-
-! ---
-
-subroutine test()
- implicit none
-! double precision, allocatable :: dressing_dets(:),e_corr_dets(:)
-! allocate(dressing_dets(N_det),e_corr_dets(N_det))
-! e_corr_dets = 0.d0
-! call get_cisd_sc2_dressing(psi_det,e_corr_dets,N_det,dressing_dets)
-  provide eigval_tc_cisd_sc2_bi_ortho
-end
diff --git a/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f b/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
deleted file mode 100644
index 5cbf26d2..00000000
--- a/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f
+++ /dev/null
@@ -1,145 +0,0 @@
- BEGIN_PROVIDER [ double precision, reigvec_tc_cisd_sc2_bi_ortho, (N_det,N_states)]
-&BEGIN_PROVIDER [ double precision, leigvec_tc_cisd_sc2_bi_ortho, (N_det,N_states)]
-&BEGIN_PROVIDER [ double precision, eigval_tc_cisd_sc2_bi_ortho, (N_states)]
- implicit none
- integer :: it,n_real,degree,i,istate
- double precision :: e_before, e_current,thr, hmono,htwoe,hthree,accu
- double precision, allocatable :: e_corr_dets(:),h0j(:), h_sc2(:,:), dressing_dets(:)
- double precision, allocatable :: leigvec_tc_bi_orth_tmp(:,:),reigvec_tc_bi_orth_tmp(:,:),eigval_right_tmp(:)
- allocate(leigvec_tc_bi_orth_tmp(N_det,N_det),reigvec_tc_bi_orth_tmp(N_det,N_det),eigval_right_tmp(N_det))
- allocate(e_corr_dets(N_det),h0j(N_det),h_sc2(N_det,N_det),dressing_dets(N_det))
- allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),eigval_tmp(N_states))
- dressing_dets = 0.d0
- do i = 1, N_det
-  call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
-  call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
-  if(degree == 1 .or. degree == 2)then
-   call htilde_mu_mat_opt_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i))
-  endif
- enddo
- reigvec_tc_bi_orth_tmp = 0.d0
- do i = 1, N_det 
-  reigvec_tc_bi_orth_tmp(i,1) = psi_r_coef_bi_ortho(i,1) 
- enddo
- vec_tmp = 0.d0
- do istate = 1, N_states
-  vec_tmp(:,istate) = reigvec_tc_bi_orth_tmp(:,istate)
- enddo
- do istate = N_states+1, n_states_diag
-  vec_tmp(istate,istate) = 1.d0
- enddo
- print*,'Diagonalizing the TC CISD '
- call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow)
- do i = 1, N_det 
-  e_corr_dets(i) = reigvec_tc_bi_orth_tmp(i,1) * h0j(i)/reigvec_tc_bi_orth_tmp(1,1)
- enddo
- E_before = eigval_tmp(1)
- print*,'Starting from ',E_before
-
- e_current = 10.d0
- thr = 1.d-5
- it = 0
- dressing_dets = 0.d0
-  double precision, allocatable :: H_jj(:),vec_tmp(:,:),eigval_tmp(:)
-  external                         htc_bi_ortho_calc_tdav_slow
-  external                         htcdag_bi_ortho_calc_tdav_slow
-  logical                       :: converged
- do while (dabs(E_before-E_current).gt.thr)
-  it += 1
-  E_before = E_current
-!  h_sc2 = htilde_matrix_elmt_bi_ortho
-  call get_cisd_sc2_dressing(psi_det,e_corr_dets,N_det,dressing_dets)
-  do i = 1, N_det
-!   print*,'dressing_dets(i) = ',dressing_dets(i)
-   h_sc2(i,i) += dressing_dets(i)
-  enddo
-  print*,'********************'
-  print*,'iteration       ',it
-!  call non_hrmt_real_diag(N_det,h_sc2,& 
-!       leigvec_tc_bi_orth_tmp,reigvec_tc_bi_orth_tmp,& 
-!       n_real,eigval_right_tmp)
-!  print*,'eigval_right_tmp(1)',eigval_right_tmp(1)
-  vec_tmp = 0.d0
-  do istate = 1, N_states
-   vec_tmp(:,istate) = reigvec_tc_bi_orth_tmp(:,istate)
-  enddo
-  do istate = N_states+1, n_states_diag
-   vec_tmp(istate,istate) = 1.d0
-  enddo
-  call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow)
-  print*,'outside Davidson'
-  print*,'eigval_tmp(1) = ',eigval_tmp(1)
-  do i = 1, N_det 
-   reigvec_tc_bi_orth_tmp(i,1) = vec_tmp(i,1)
-   e_corr_dets(i) = reigvec_tc_bi_orth_tmp(i,1) * h0j(i)/reigvec_tc_bi_orth_tmp(1,1)
-  enddo
-!  E_current = eigval_right_tmp(1)
-  E_current = eigval_tmp(1)
-  print*,'it, E(SC)^2 = ',it,E_current
- enddo
- eigval_tc_cisd_sc2_bi_ortho(1:N_states) = eigval_right_tmp(1:N_states)
- reigvec_tc_cisd_sc2_bi_ortho(1:N_det,1:N_states) = reigvec_tc_bi_orth_tmp(1:N_det,1:N_states)
- leigvec_tc_cisd_sc2_bi_ortho(1:N_det,1:N_states) = leigvec_tc_bi_orth_tmp(1:N_det,1:N_states)
- 
-END_PROVIDER 
-
-subroutine get_cisd_sc2_dressing(dets,e_corr_dets,ndet,dressing_dets)
- implicit none
-  use bitmasks
- integer, intent(in) :: ndet
- integer(bit_kind), intent(in)  :: dets(N_int,2,ndet)
- double precision, intent(in)   :: e_corr_dets(ndet)
- double precision, intent(out) :: dressing_dets(ndet)
- integer, allocatable  :: degree(:),hole(:,:),part(:,:),spin(:,:)
- integer(bit_kind), allocatable :: hole_part(:,:,:)
- integer :: i,j,k, exc(0:2,2,2),h1,p1,h2,p2,s1,s2
- integer(bit_kind) :: xorvec(2,N_int)
-
- double precision :: phase
- dressing_dets = 0.d0
- allocate(degree(ndet),hole(2,ndet),part(2,ndet), spin(2,ndet),hole_part(N_int,2,ndet))
- do i = 2, ndet
-  call get_excitation_degree(HF_bitmask,dets(1,1,i),degree(i),N_int)
-  do j = 1, N_int
-   hole_part(j,1,i) = xor( HF_bitmask(j,1), dets(j,1,i))
-   hole_part(j,2,i) = xor( HF_bitmask(j,2), dets(j,2,i))
-  enddo
-  if(degree(i) == 1)then
-   call get_single_excitation(HF_bitmask,psi_det(1,1,i),exc,phase,N_int)
-  else if(degree(i) == 2)then
-   call get_double_excitation(HF_bitmask,psi_det(1,1,i),exc,phase,N_int)
-  endif
-  call decode_exc(exc,degree,h1,p1,h2,p2,s1,s2)
-  hole(1,i) = h1
-  hole(2,i) = h2
-  part(1,i) = p1
-  part(2,i) = p2
-  spin(1,i) = s1
-  spin(2,i) = s2
- enddo
- 
- integer :: same
- if(elec_alpha_num+elec_beta_num<3)return
- do i = 2, ndet
-  do j = i+1, ndet
-   same = 0
-   if(degree(i) == degree(j) .and. degree(i)==1)cycle
-   do k = 1, N_int
-    xorvec(k,1) = iand(hole_part(k,1,i),hole_part(k,1,j))
-    xorvec(k,2) = iand(hole_part(k,2,i),hole_part(k,2,j))
-    same += popcnt(xorvec(k,1)) + popcnt(xorvec(k,2)) 
-   enddo
-!   print*,'i,j',i,j
-!   call debug_det(dets(1,1,i),N_int) 
-!   call debug_det(hole_part(1,1,i),N_int) 
-!   call debug_det(dets(1,1,j),N_int) 
-!   call debug_det(hole_part(1,1,j),N_int) 
-!   print*,'same = ',same
-   if(same.eq.0)then
-    dressing_dets(i) += e_corr_dets(j)  
-    dressing_dets(j) += e_corr_dets(i)  
-   endif
-  enddo
- enddo
- 
-end
diff --git a/plugins/local/tc_bi_ortho/test_s2_tc.irp.f b/plugins/local/tc_bi_ortho/test_s2_tc.irp.f
deleted file mode 100644
index 7c70b119..00000000
--- a/plugins/local/tc_bi_ortho/test_s2_tc.irp.f
+++ /dev/null
@@ -1,170 +0,0 @@
-
-! ---
-
-program test_tc 
-
-  implicit none
- 
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  read_wf = .True.
-  touch read_wf
-
-  call provide_all_three_ints_bi_ortho()
-  call routine_h_triple_left
-  call routine_h_triple_right
-!  call routine_test_s2_davidson
-
-end
-
-subroutine routine_h_triple_right
- implicit none
- logical           :: do_right 
- integer :: sze ,i, N_st, j
- double precision :: sij, accu_e, accu_s, accu_e_0, accu_s_0
- double precision, allocatable :: v_0_ref(:,:),u_0(:,:),s_0_ref(:,:)
- double precision, allocatable :: v_0_new(:,:),s_0_new(:,:)
- sze = N_det
- N_st = 1
- allocate(v_0_ref(N_det,1),u_0(N_det,1),s_0_ref(N_det,1),s_0_new(N_det,1),v_0_new(N_det,1))
- print*,'Checking first the Right '
- do i = 1, sze
-  u_0(i,1) = psi_r_coef_bi_ortho(i,1)
- enddo
- double precision :: wall0,wall1
- call wall_time(wall0)
- call H_tc_s2_u_0_with_pure_three_omp(v_0_ref,s_0_ref, u_0,N_st,sze)
- call wall_time(wall1)
- print*,'time for omp',wall1 - wall0
- call wall_time(wall0)
- call H_tc_s2_u_0_with_pure_three(v_0_new, s_0_new, u_0, N_st, sze)
- call wall_time(wall1)
- print*,'time serial ',wall1 - wall0
- accu_e = 0.d0
- accu_s = 0.d0
- do i = 1, sze
-  accu_e += dabs(v_0_ref(i,1) - v_0_new(i,1))
-  accu_s += dabs(s_0_ref(i,1) - s_0_new(i,1))
- enddo
- print*,'accu_e   = ',accu_e
- print*,'accu_s   = ',accu_s
-
-end
-
-subroutine routine_h_triple_left
- implicit none
- logical           :: do_right 
- integer :: sze ,i, N_st, j
- double precision :: sij, accu_e, accu_s, accu_e_0, accu_s_0
- double precision, allocatable :: v_0_ref(:,:),u_0(:,:),s_0_ref(:,:)
- double precision, allocatable :: v_0_new(:,:),s_0_new(:,:)
- sze = N_det
- N_st = 1
- allocate(v_0_ref(N_det,1),u_0(N_det,1),s_0_ref(N_det,1),s_0_new(N_det,1),v_0_new(N_det,1))
- print*,'Checking the Left '
- do i = 1, sze
-  u_0(i,1) = psi_l_coef_bi_ortho(i,1)
- enddo
- double precision :: wall0,wall1
- call wall_time(wall0)
- call H_tc_s2_dagger_u_0_with_pure_three_omp(v_0_ref,s_0_ref, u_0,N_st,sze)
- call wall_time(wall1)
- print*,'time for omp',wall1 - wall0
- call wall_time(wall0)
- call H_tc_s2_dagger_u_0_with_pure_three(v_0_new, s_0_new, u_0, N_st, sze)
- call wall_time(wall1)
- print*,'time serial ',wall1 - wall0
- accu_e = 0.d0
- accu_s = 0.d0
- do i = 1, sze
-  accu_e += dabs(v_0_ref(i,1) - v_0_new(i,1))
-  accu_s += dabs(s_0_ref(i,1) - s_0_new(i,1))
- enddo
- print*,'accu_e   = ',accu_e
- print*,'accu_s   = ',accu_s
-
-end
-
-
-subroutine routine_test_s2_davidson
- implicit none
- double precision, allocatable :: H_jj(:),vec_tmp(:,:), energies(:) , s2(:)
- integer :: i,istate
- logical :: converged 
- external H_tc_s2_dagger_u_0_opt
- external H_tc_s2_u_0_opt
- allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),energies(n_states_diag), s2(n_states_diag))
- do i = 1, N_det
-   call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i))
- enddo
- ! Preparing the left-eigenvector
- print*,'Computing the left-eigenvector '
- vec_tmp = 0.d0
- do istate = 1, N_states
-  vec_tmp(1:N_det,istate) = psi_l_coef_bi_ortho(1:N_det,istate)
- enddo
- do istate = N_states+1, n_states_diag
-  vec_tmp(istate,istate) = 1.d0
- enddo
- do istate = 1, N_states
-  leigvec_tc_bi_orth(1:N_det,istate) = vec_tmp(1:N_det,istate)
- enddo
- integer :: n_it_max
- n_it_max = 1
- call davidson_hs2_nonsym_b1space(vec_tmp, H_jj, s2, energies, N_det, n_states, n_states_diag, n_it_max, converged, H_tc_s2_dagger_u_0_opt)
- double precision, allocatable :: v_0_new(:,:),s_0_new(:,:)
- integer :: sze,N_st
- logical           :: do_right 
- sze = N_det
- N_st = 1
- do_right = .False.
- allocate(s_0_new(N_det,1),v_0_new(N_det,1))
- call H_tc_s2_u_0_nstates_openmp(v_0_new,s_0_new,vec_tmp,N_st,sze, do_right)
- double precision :: accu_e_0, accu_s_0
- accu_e_0 = 0.d0
- accu_s_0 = 0.d0
- do i = 1, sze
-  accu_e_0 += v_0_new(i,1) * vec_tmp(i,1)
-  accu_s_0 += s_0_new(i,1) * vec_tmp(i,1)
- enddo
- print*,'energies = ',energies
- print*,'s2       = ',s2
- print*,'accu_e_0',accu_e_0
- print*,'accu_s_0',accu_s_0
-
- ! Preparing the right-eigenvector
- print*,'Computing the right-eigenvector '
- vec_tmp = 0.d0
- do istate = 1, N_states
-  vec_tmp(1:N_det,istate) = psi_r_coef_bi_ortho(1:N_det,istate)
- enddo
- do istate = N_states+1, n_states_diag
-  vec_tmp(istate,istate) = 1.d0
- enddo
- do istate = 1, N_states
-  leigvec_tc_bi_orth(1:N_det,istate) = vec_tmp(1:N_det,istate)
- enddo
- n_it_max = 1
- call davidson_hs2_nonsym_b1space(vec_tmp, H_jj, s2, energies, N_det, n_states, n_states_diag, n_it_max, converged, H_tc_s2_u_0_opt)
- sze = N_det
- N_st = 1
- do_right = .True.
- v_0_new = 0.d0
- s_0_new = 0.d0
- call H_tc_s2_u_0_nstates_openmp(v_0_new,s_0_new,vec_tmp,N_st,sze, do_right)
- accu_e_0 = 0.d0
- accu_s_0 = 0.d0
- do i = 1, sze
-  accu_e_0 += v_0_new(i,1) * vec_tmp(i,1)
-  accu_s_0 += s_0_new(i,1) * vec_tmp(i,1)
- enddo
- print*,'energies = ',energies
- print*,'s2       = ',s2
- print*,'accu_e_0',accu_e_0
- print*,'accu_s_0',accu_s_0
-
-end

From 2a8b9e544b8c9f47ce55dd8f0c4e7df5b0a67ea1 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Tue, 7 May 2024 01:56:14 +0200
Subject: [PATCH 031/131] working on aos debug

---
 plugins/local/non_h_ints_mu/deb_aos.irp.f |  16 +-
 src/ao_basis/aos_in_r.irp.f               | 508 ++++++++++++----------
 2 files changed, 285 insertions(+), 239 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/deb_aos.irp.f b/plugins/local/non_h_ints_mu/deb_aos.irp.f
index c9bc9c9a..a84e1b91 100644
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@@ -31,12 +31,14 @@ subroutine print_aos()
   integer          :: i, ipoint
   double precision :: r(3)
   double precision :: ao_val, ao_der(3), ao_lap
+  double precision :: mo_val, mo_der(3), mo_lap
 
   PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
 
+  write(1000, *) n_points_final_grid
   do ipoint = 1, n_points_final_grid
     r(:) = final_grid_points(:,ipoint)
-    print*, r
+    write(1000, '(3(f15.7, 3X))') r
   enddo
 
   do ipoint = 1, n_points_final_grid
@@ -45,7 +47,17 @@ subroutine print_aos()
       ao_val    = aos_in_r_array     (i,ipoint)
       ao_der(:) = aos_grad_in_r_array(i,ipoint,:)
       ao_lap    = aos_lapl_in_r_array(1,i,ipoint) + aos_lapl_in_r_array(2,i,ipoint) + aos_lapl_in_r_array(3,i,ipoint)
-      write(*, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
+      write(1010, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
+    enddo
+  enddo
+
+  do ipoint = 1, n_points_final_grid
+    r(:) = final_grid_points(:,ipoint)
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array(i,ipoint,1) + mos_lapl_in_r_array(i,ipoint,2) + mos_lapl_in_r_array(i,ipoint,3)
+      write(2010, '(5(f15.7, 3X))') mo_val, mo_der, mo_lap
     enddo
   enddo
  
diff --git a/src/ao_basis/aos_in_r.irp.f b/src/ao_basis/aos_in_r.irp.f
index 1b1595a3..053c86a2 100644
--- a/src/ao_basis/aos_in_r.irp.f
+++ b/src/ao_basis/aos_in_r.irp.f
@@ -1,67 +1,76 @@
-double precision function ao_value(i,r)
- implicit none
- BEGIN_DOC
-! Returns the value of the i-th ao at point $\textbf{r}$
- END_DOC
- double precision, intent(in) :: r(3)
- integer, intent(in) :: i
 
- integer :: m,num_ao
- double precision :: center_ao(3)
- double precision :: beta
- integer :: power_ao(3)
- double precision :: accu,dx,dy,dz,r2
- num_ao = ao_nucl(i)
- power_ao(1:3)= ao_power(i,1:3)
- center_ao(1:3) = nucl_coord(num_ao,1:3)
- dx = (r(1) - center_ao(1))
- dy = (r(2) - center_ao(2))
- dz = (r(3) - center_ao(3))
- r2 = dx*dx + dy*dy + dz*dz
- dx = dx**power_ao(1)
- dy = dy**power_ao(2)
- dz = dz**power_ao(3)
+! ---
 
- accu = 0.d0
- do m=1,ao_prim_num(i)
-   beta = ao_expo_ordered_transp(m,i)
-   accu += ao_coef_normalized_ordered_transp(m,i) * dexp(-beta*r2)
- enddo
- ao_value = accu * dx * dy * dz
+double precision function ao_value(i, r)
+
+  BEGIN_DOC
+  ! Returns the value of the i-th ao at point $\textbf{r}$
+  END_DOC
+
+  implicit none
+  integer,          intent(in) :: i
+  double precision, intent(in) :: r(3)
+
+  integer          :: m, num_ao
+  integer          :: power_ao(3)
+  double precision :: center_ao(3)
+  double precision :: beta
+  double precision :: accu, dx, dy, dz, r2
+
+  num_ao = ao_nucl(i)
+  power_ao(1:3) = ao_power(i,1:3)
+  center_ao(1:3) = nucl_coord(num_ao,1:3)
+  dx = r(1) - center_ao(1)
+  dy = r(2) - center_ao(2)
+  dz = r(3) - center_ao(3)
+  r2 = dx*dx + dy*dy + dz*dz
+  dx = dx**power_ao(1)
+  dy = dy**power_ao(2)
+  dz = dz**power_ao(3)
+ 
+  accu = 0.d0
+  do m = 1, ao_prim_num(i)
+    beta = ao_expo_ordered_transp(m,i)
+    accu += ao_coef_normalized_ordered_transp(m,i) * dexp(-beta*r2)
+  enddo
+  ao_value = accu * dx * dy * dz
 
 end
 
 
-double precision function primitive_value(i,j,r)
- implicit none
- BEGIN_DOC
-! Returns the value of the j-th primitive of the i-th |AO| at point $\textbf{r}
-! **without the coefficient**
- END_DOC
- double precision, intent(in) :: r(3)
- integer, intent(in) :: i,j
+double precision function primitive_value(i, j, r)
 
- integer :: m,num_ao
- double precision :: center_ao(3)
- double precision :: beta
- integer :: power_ao(3)
- double precision :: accu,dx,dy,dz,r2
- num_ao = ao_nucl(i)
- power_ao(1:3)= ao_power(i,1:3)
- center_ao(1:3) = nucl_coord(num_ao,1:3)
- dx = (r(1) - center_ao(1))
- dy = (r(2) - center_ao(2))
- dz = (r(3) - center_ao(3))
- r2 = dx*dx + dy*dy + dz*dz
- dx = dx**power_ao(1)
- dy = dy**power_ao(2)
- dz = dz**power_ao(3)
+  BEGIN_DOC
+  ! Returns the value of the j-th primitive of the i-th |AO| at point $\textbf{r}
+  ! **without the coefficient**
+  END_DOC
 
- accu = 0.d0
- m=j
- beta = ao_expo_ordered_transp(m,i)
- accu += dexp(-beta*r2)
- primitive_value = accu * dx * dy * dz
+  implicit none
+  integer,          intent(in) :: i, j
+  double precision, intent(in) :: r(3)
+ 
+  integer          :: m, num_ao
+  integer          :: power_ao(3)
+  double precision :: center_ao(3)
+  double precision :: beta
+  double precision :: accu, dx, dy, dz, r2
+
+  num_ao = ao_nucl(i)
+  power_ao(1:3)= ao_power(i,1:3)
+  center_ao(1:3) = nucl_coord(num_ao,1:3)
+  dx = r(1) - center_ao(1)
+  dy = r(2) - center_ao(2)
+  dz = r(3) - center_ao(3)
+  r2 = dx*dx + dy*dy + dz*dz
+  dx = dx**power_ao(1)
+  dy = dy**power_ao(2)
+  dz = dz**power_ao(3)
+ 
+  accu = 0.d0
+  m = j
+  beta = ao_expo_ordered_transp(m,i)
+  accu += dexp(-beta*r2)
+  primitive_value = accu * dx * dy * dz
 
 end
 
@@ -104,9 +113,9 @@ subroutine give_all_aos_at_r(r, tmp_array)
       dz2 = dz**p_ao(3)
 
       tmp_array(k) = 0.d0
-      do l = 1,ao_prim_num(k)
+      do l = 1, ao_prim_num(k)
         beta = ao_expo_ordered_transp_per_nucl(l,j,i)
-        if(dabs(beta*r2).gt.40.d0) cycle
+        if(beta*r2.gt.50.d0) cycle
 
         tmp_array(k) += ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
       enddo
@@ -120,207 +129,232 @@ end
 
 ! ---
 
-subroutine give_all_aos_and_grad_at_r(r,aos_array,aos_grad_array)
- implicit none
- BEGIN_DOC
-! input : r(1) ==> r(1) = x, r(2) = y, r(3) = z
-!
-! output : 
-!
-! * aos_array(i) = ao(i) evaluated at ro
-! * aos_grad_array(1,i) = gradient X of the ao(i) evaluated at $\textbf{r}$
-!
- END_DOC
- double precision, intent(in) :: r(3)
- double precision, intent(out) :: aos_array(ao_num)
- double precision, intent(out) :: aos_grad_array(3,ao_num)
+subroutine give_all_aos_and_grad_at_r(r, aos_array, aos_grad_array)
 
- integer :: power_ao(3)
- integer :: i,j,k,l,m
- double precision :: dx,dy,dz,r2
- double precision ::      dx2,dy2,dz2
- double precision ::      dx1,dy1,dz1
- double precision :: center_ao(3)
- double precision :: beta,accu_1,accu_2,contrib
- do i = 1, nucl_num
-  center_ao(1:3) = nucl_coord(i,1:3)
-  dx = (r(1) - center_ao(1))
-  dy = (r(2) - center_ao(2))
-  dz = (r(3) - center_ao(3))
-  r2 = dx*dx + dy*dy + dz*dz
-  do j = 1,Nucl_N_Aos(i)
-   k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format
-   aos_array(k) = 0.d0
-   aos_grad_array(1,k) = 0.d0
-   aos_grad_array(2,k) = 0.d0
-   aos_grad_array(3,k) = 0.d0
-   power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i)
-   dx2 = dx**power_ao(1)
-   dy2 = dy**power_ao(2)
-   dz2 = dz**power_ao(3)
-   if(power_ao(1) .ne. 0)then
-    dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1)
-   else
-    dx1 = 0.d0
-   endif
-   if(power_ao(2) .ne. 0)then
-    dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1)
-   else
-    dy1 = 0.d0
-   endif
-   if(power_ao(3) .ne. 0)then
-    dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1)
-   else
-    dz1 = 0.d0
-   endif
-   accu_1 = 0.d0
-   accu_2 = 0.d0
-   do l = 1,ao_prim_num(k)
-    beta = ao_expo_ordered_transp_per_nucl(l,j,i)
-    contrib = 0.d0
-    if(beta*r2.gt.50.d0)cycle
-    contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
-    accu_1 += contrib
-    accu_2 += contrib * beta
-   enddo
-   aos_array(k) = accu_1 * dx2 * dy2 * dz2
-   aos_grad_array(1,k) = accu_1 * dx1  * dy2 * dz2- 2.d0 * dx2 * dx  * dy2 * dz2 * accu_2
-   aos_grad_array(2,k) = accu_1 * dx2  * dy1 * dz2- 2.d0 * dx2 * dy2 * dy  * dz2 * accu_2
-   aos_grad_array(3,k) = accu_1 * dx2  * dy2 * dz1- 2.d0 * dx2 * dy2 * dz2 * dz  * accu_2
+  BEGIN_DOC
+  !
+  ! input : r(1) ==> r(1) = x, r(2) = y, r(3) = z
+  !
+  ! output : 
+  !
+  ! * aos_array(i) = ao(i) evaluated at ro
+  ! * aos_grad_array(1,i) = gradient X of the ao(i) evaluated at $\textbf{r}$
+  !
+  END_DOC
+
+  implicit none
+  double precision, intent(in)  :: r(3)
+  double precision, intent(out) :: aos_array(ao_num)
+  double precision, intent(out) :: aos_grad_array(3,ao_num)
+
+  integer                       :: power_ao(3)
+  integer                       :: i, j, k, l, m
+  double precision              :: dx, dy, dz, r2
+  double precision              :: dx1, dy1, dz1
+  double precision              :: dx2, dy2, dz2
+  double precision              :: center_ao(3)
+  double precision              :: beta, accu_1, accu_2, contrib
+
+  do i = 1, nucl_num
+
+    center_ao(1:3) = nucl_coord(i,1:3)
+
+    dx = r(1) - center_ao(1)
+    dy = r(2) - center_ao(2)
+    dz = r(3) - center_ao(3)
+    r2 = dx*dx + dy*dy + dz*dz
+
+    do j = 1, Nucl_N_Aos(i)
+
+      k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format
+
+      aos_array(k) = 0.d0
+      aos_grad_array(1,k) = 0.d0
+      aos_grad_array(2,k) = 0.d0
+      aos_grad_array(3,k) = 0.d0
+
+      power_ao(1:3) = ao_power_ordered_transp_per_nucl(1:3,j,i)
+      dx2 = dx**power_ao(1)
+      dy2 = dy**power_ao(2)
+      dz2 = dz**power_ao(3)
+
+      dx1 = 0.d0
+      if(power_ao(1) .ne. 0) then
+        dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1)
+      endif
+
+      dy1 = 0.d0
+      if(power_ao(2) .ne. 0) then
+        dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1)
+      endif
+
+      dz1 = 0.d0
+      if(power_ao(3) .ne. 0) then
+        dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1)
+      endif
+
+      accu_1 = 0.d0
+      accu_2 = 0.d0
+      do l = 1, ao_prim_num(k)
+        beta = ao_expo_ordered_transp_per_nucl(l,j,i)
+        if(beta*r2.gt.50.d0) cycle
+        contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
+        accu_1 += contrib
+        accu_2 += contrib * beta
+      enddo
+
+      aos_array(k) = accu_1 * dx2 * dy2 * dz2
+      aos_grad_array(1,k) = accu_1 * dx1 * dy2 * dz2 - 2.d0 * dx2 * dx  * dy2 * dz2 * accu_2
+      aos_grad_array(2,k) = accu_1 * dx2 * dy1 * dz2 - 2.d0 * dx2 * dy2 * dy  * dz2 * accu_2
+      aos_grad_array(3,k) = accu_1 * dx2 * dy2 * dz1 - 2.d0 * dx2 * dy2 * dz2 * dz  * accu_2
+    enddo
   enddo
- enddo
+
 end
 
+! ---
 
-subroutine give_all_aos_and_grad_and_lapl_at_r(r,aos_array,aos_grad_array,aos_lapl_array)
- implicit none
- BEGIN_DOC
-! input  : r(1) ==> r(1) = x, r(2) = y, r(3) = z
-!
-! output :
-!
-! * aos_array(i) = ao(i) evaluated at $\textbf{r}$
-! * aos_grad_array(1,i) = $\nabla_x$ of the ao(i) evaluated at $\textbf{r}$
- END_DOC
- double precision, intent(in) :: r(3)
- double precision, intent(out) :: aos_array(ao_num)
- double precision, intent(out) :: aos_grad_array(3,ao_num)
- double precision, intent(out) :: aos_lapl_array(3,ao_num)
+subroutine give_all_aos_and_grad_and_lapl_at_r(r, aos_array, aos_grad_array, aos_lapl_array)
 
- integer :: power_ao(3)
- integer :: i,j,k,l,m
- double precision :: dx,dy,dz,r2
- double precision ::      dx2,dy2,dz2
- double precision ::      dx1,dy1,dz1
- double precision ::      dx3,dy3,dz3
- double precision ::      dx4,dy4,dz4
- double precision ::      dx5,dy5,dz5
- double precision :: center_ao(3)
- double precision :: beta,accu_1,accu_2,accu_3,contrib
- do i = 1, nucl_num
-  center_ao(1:3) = nucl_coord(i,1:3)
-  dx = (r(1) - center_ao(1))
-  dy = (r(2) - center_ao(2))
-  dz = (r(3) - center_ao(3))
-  r2 = dx*dx + dy*dy + dz*dz
-  do j = 1,Nucl_N_Aos(i)
-   k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format
-   aos_array(k) = 0.d0
-   aos_grad_array(1,k) = 0.d0
-   aos_grad_array(2,k) = 0.d0
-   aos_grad_array(3,k) = 0.d0
+  BEGIN_DOC
+  !
+  ! input  : r(1) ==> r(1) = x, r(2) = y, r(3) = z
+  !
+  ! output :
+  !
+  ! * aos_array(i) = ao(i) evaluated at $\textbf{r}$
+  ! * aos_grad_array(1,i) = $\nabla_x$ of the ao(i) evaluated at $\textbf{r}$
+  !
+  END_DOC
 
-   aos_lapl_array(1,k) = 0.d0
-   aos_lapl_array(2,k) = 0.d0
-   aos_lapl_array(3,k) = 0.d0
+  implicit none
+  double precision, intent(in)  :: r(3)
+  double precision, intent(out) :: aos_array(ao_num)
+  double precision, intent(out) :: aos_grad_array(3,ao_num)
+  double precision, intent(out) :: aos_lapl_array(3,ao_num)
 
-   power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i)
-   dx2 = dx**power_ao(1)
-   dy2 = dy**power_ao(2)
-   dz2 = dz**power_ao(3)
-   if(power_ao(1) .ne. 0)then
-    dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1)
-   else
-    dx1 = 0.d0
-   endif
-   ! For the Laplacian
-   if(power_ao(1) .ge. 2)then
-    dx3 = dble(power_ao(1)) * dble((power_ao(1)-1))  * dx**(power_ao(1)-2)
-   else
-    dx3 = 0.d0
-   endif
-   if(power_ao(1) .ge. 1)then
-    dx4 = dble((2 * power_ao(1) + 1))  * dx**(power_ao(1)) 
-   else
-    dx4 = dble((power_ao(1) + 1))  * dx**(power_ao(1)) 
-   endif
+  integer                       :: power_ao(3)
+  integer                       :: i, j, k, l, m
+  double precision              :: dx, dy, dz, r2
+  double precision              :: dx1, dy1, dz1
+  double precision              :: dx2, dy2, dz2
+  double precision              :: dx3, dy3, dz3
+  double precision              :: dx4, dy4, dz4
+  double precision              :: dx5, dy5, dz5
+  double precision              :: center_ao(3)
+  double precision              :: beta, accu_1, accu_2, accu_3, contrib
 
-   dx5 = dx**(power_ao(1)+2)
+  do i = 1, nucl_num
 
-   if(power_ao(2) .ne. 0)then
-    dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1)
-   else
-    dy1 = 0.d0
-   endif
-   ! For the Laplacian
-   if(power_ao(2) .ge. 2)then
-    dy3 = dble(power_ao(2)) * dble((power_ao(2)-1))  * dy**(power_ao(2)-2)
-   else
-    dy3 = 0.d0
-   endif
+    center_ao(1:3) = nucl_coord(i,1:3)
 
-   if(power_ao(2) .ge. 1)then
-    dy4 = dble((2 * power_ao(2) + 1))  * dy**(power_ao(2)) 
-   else
-    dy4 = dble((power_ao(2) + 1))  * dy**(power_ao(2)) 
-   endif
+    dx = r(1) - center_ao(1)
+    dy = r(2) - center_ao(2)
+    dz = r(3) - center_ao(3)
+    r2 = dx*dx + dy*dy + dz*dz
+    
+    do j = 1, Nucl_N_Aos(i)
 
-   dy5 = dy**(power_ao(2)+2)
+      k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format
 
+      aos_array(k) = 0.d0
+      aos_grad_array(1,k) = 0.d0
+      aos_grad_array(2,k) = 0.d0
+      aos_grad_array(3,k) = 0.d0      
+      aos_lapl_array(1,k) = 0.d0
+      aos_lapl_array(2,k) = 0.d0
+      aos_lapl_array(3,k) = 0.d0
+      
+      power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i)
+      dx2 = dx**power_ao(1)
+      dy2 = dy**power_ao(2)
+      dz2 = dz**power_ao(3)
 
-   if(power_ao(3) .ne. 0)then
-    dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1)
-   else
-    dz1 = 0.d0
-   endif
-   ! For the Laplacian
-   if(power_ao(3) .ge. 2)then
-    dz3 = dble(power_ao(3)) * dble((power_ao(3)-1))  * dz**(power_ao(3)-2)
-   else
-    dz3 = 0.d0
-   endif
+      ! ---
 
-   if(power_ao(3) .ge. 1)then
-    dz4 = dble((2 * power_ao(3) + 1))  * dz**(power_ao(3)) 
-   else
-    dz4 = dble((power_ao(3) + 1))  * dz**(power_ao(3)) 
-   endif
+      dx1 = 0.d0
+      if(power_ao(1) .ne. 0) then
+        dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1)
+      endif
 
-   dz5 = dz**(power_ao(3)+2)
+      dx3 = 0.d0
+      if(power_ao(1) .ge. 2) then
+        dx3 = dble(power_ao(1)) * dble((power_ao(1)-1)) * dx**(power_ao(1)-2)
+      endif
 
+      if(power_ao(1) .ge. 1) then
+        dx4 = dble((2 * power_ao(1) + 1)) * dx**(power_ao(1)) 
+      else
+        dx4 = dble((power_ao(1) + 1)) * dx**(power_ao(1)) 
+      endif
+      
+      dx5 = dx**(power_ao(1)+2)
+  
+      ! ---
+      
+      dy1 = 0.d0
+      if(power_ao(2) .ne. 0) then
+        dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1)
+      endif
 
-   accu_1 = 0.d0
-   accu_2 = 0.d0
-   accu_3 = 0.d0
-   do l = 1,ao_prim_num(k)
-    beta = ao_expo_ordered_transp_per_nucl(l,j,i)
-    contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
-    accu_1 += contrib
-    accu_2 += contrib * beta
-    accu_3 += contrib * beta**2
-   enddo
-   aos_array(k) = accu_1 * dx2 * dy2 * dz2
+      dy3 = 0.d0
+      if(power_ao(2) .ge. 2) then
+        dy3 = dble(power_ao(2)) * dble((power_ao(2)-1))  * dy**(power_ao(2)-2)
+      endif
+      
+      if(power_ao(2) .ge. 1) then
+        dy4 = dble((2 * power_ao(2) + 1)) * dy**(power_ao(2)) 
+      else
+        dy4 = dble((power_ao(2) + 1)) * dy**(power_ao(2)) 
+      endif
+      
+      dy5 = dy**(power_ao(2)+2)
 
-   aos_grad_array(1,k) = accu_1 * dx1  * dy2 * dz2- 2.d0 * dx2 * dx  * dy2 * dz2 * accu_2
-   aos_grad_array(2,k) = accu_1 * dx2  * dy1 * dz2- 2.d0 * dx2 * dy2 * dy  * dz2 * accu_2
-   aos_grad_array(3,k) = accu_1 * dx2  * dy2 * dz1- 2.d0 * dx2 * dy2 * dz2 * dz  * accu_2
+      ! ---
+      
+      dz1 = 0.d0
+      if(power_ao(3) .ne. 0) then
+        dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1)
+      endif
 
-   aos_lapl_array(1,k) = accu_1 * dx3  * dy2 * dz2- 2.d0 * dx4 * dy2 * dz2* accu_2 +4.d0 * dx5 *dy2 * dz2* accu_3
-   aos_lapl_array(2,k) = accu_1 * dx2  * dy3 * dz2- 2.d0 * dx2 * dy4 * dz2* accu_2 +4.d0 * dx2 *dy5 * dz2* accu_3
-   aos_lapl_array(3,k) = accu_1 * dx2  * dy2 * dz3- 2.d0 * dx2 * dy2 * dz4* accu_2 +4.d0 * dx2 *dy2 * dz5* accu_3
+      dz3 = 0.d0
+      if(power_ao(3) .ge. 2) then
+        dz3 = dble(power_ao(3)) * dble((power_ao(3)-1)) * dz**(power_ao(3)-2)
+      endif
+      
+      if(power_ao(3) .ge. 1) then
+        dz4 = dble((2 * power_ao(3) + 1)) * dz**(power_ao(3)) 
+      else
+        dz4 = dble((power_ao(3) + 1)) * dz**(power_ao(3)) 
+      endif
+      
+      dz5 = dz**(power_ao(3)+2)
+      
+      ! ---
+      
+      accu_1 = 0.d0
+      accu_2 = 0.d0
+      accu_3 = 0.d0
+      do l = 1,ao_prim_num(k)
+        beta = ao_expo_ordered_transp_per_nucl(l,j,i)
+        if(beta*r2.gt.50.d0) cycle
+        contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2)
+        accu_1 += contrib
+        accu_2 += contrib * beta
+        accu_3 += contrib * beta**2
+      enddo
 
+      aos_array(k) = accu_1 * dx2 * dy2 * dz2
+      aos_grad_array(1,k) = accu_1 * dx1 * dy2 * dz2 - 2.d0 * dx2 * dx  * dy2 * dz2 * accu_2
+      aos_grad_array(2,k) = accu_1 * dx2 * dy1 * dz2 - 2.d0 * dx2 * dy2 * dy  * dz2 * accu_2
+      aos_grad_array(3,k) = accu_1 * dx2 * dy2 * dz1 - 2.d0 * dx2 * dy2 * dz2 * dz  * accu_2
+      aos_lapl_array(1,k) = accu_1 * dx3 * dy2 * dz2 - 2.d0 * dx4 * dy2 * dz2 * accu_2 + 4.d0 * dx5 * dy2 * dz2 * accu_3
+      aos_lapl_array(2,k) = accu_1 * dx2 * dy3 * dz2 - 2.d0 * dx2 * dy4 * dz2 * accu_2 + 4.d0 * dx2 * dy5 * dz2 * accu_3
+      aos_lapl_array(3,k) = accu_1 * dx2 * dy2 * dz3 - 2.d0 * dx2 * dy2 * dz4 * accu_2 + 4.d0 * dx2 * dy2 * dz5 * accu_3
+    enddo
   enddo
- enddo
+
 end
 
+! ---
 

From 17ae4d8fe2f103bac46205380ae0e6a33736de71 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 7 May 2024 18:27:09 +0200
Subject: [PATCH 032/131] added tc_progs

---
 .../local/cipsi_tc_bi_ortho/selection.irp.f   |   4 +-
 ..._bi_ortho.irp.f => diagonalize_tc_h.irp.f} |   0
 plugins/local/tc_bi_ortho/test_natorb.irp.f   |  64 -------
 .../local/tc_bi_ortho/test_normal_order.irp.f | 173 ------------------
 plugins/local/tc_bi_ortho/test_tc_fock.irp.f  | 171 -----------------
 plugins/local/tc_progs/NEED                   |   1 +
 .../print_he_tc_energy.irp.f                  |   0
 .../print_tc_dump.irp.f                       |   0
 .../print_tc_energy.irp.f                     |   0
 .../print_tc_spin_dens.irp.f                  |   0
 .../print_tc_var.irp.f                        |   0
 .../print_tc_wf.irp.f                         |   0
 .../save_bitcpsileft_for_qmcchem.irp.f        |   0
 .../save_tc_bi_ortho_nat.irp.f                |   0
 .../select_dets_bi_ortho.irp.f                |   0
 .../tc_bi_ortho_prop.irp.f                    |   0
 .../{tc_bi_ortho => tc_progs}/tc_som.irp.f    |   0
 .../test_tc_two_rdm.irp.f                     |   0
 18 files changed, 3 insertions(+), 410 deletions(-)
 rename plugins/local/tc_bi_ortho/{tc_bi_ortho.irp.f => diagonalize_tc_h.irp.f} (100%)
 delete mode 100644 plugins/local/tc_bi_ortho/test_natorb.irp.f
 delete mode 100644 plugins/local/tc_bi_ortho/test_normal_order.irp.f
 delete mode 100644 plugins/local/tc_bi_ortho/test_tc_fock.irp.f
 create mode 100644 plugins/local/tc_progs/NEED
 rename plugins/local/{tc_bi_ortho => tc_progs}/print_he_tc_energy.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/print_tc_dump.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/print_tc_energy.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/print_tc_spin_dens.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/print_tc_var.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/print_tc_wf.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/save_bitcpsileft_for_qmcchem.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/save_tc_bi_ortho_nat.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/select_dets_bi_ortho.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/tc_bi_ortho_prop.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/tc_som.irp.f (100%)
 rename plugins/local/{tc_bi_ortho => tc_progs}/test_tc_two_rdm.irp.f (100%)

diff --git a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
index 12163e06..0b4345d5 100644
--- a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
@@ -892,8 +892,8 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
             psi_h_alpha = 0.d0
             alpha_h_psi = 0.d0
             do iii = 1, N_det_selectors
-              call htilde_mu_mat_bi_ortho_tot_slow(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_bi_ortho_tot_slow(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
+              call htilde_mu_mat_opt_bi_ortho_tot(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
+              call htilde_mu_mat_opt_bi_ortho_tot(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
               call get_excitation_degree(psi_selectors(1,1,iii), det,degree,N_int)
               if(degree == 0)then
                print*,'problem !!!'
diff --git a/plugins/local/tc_bi_ortho/tc_bi_ortho.irp.f b/plugins/local/tc_bi_ortho/diagonalize_tc_h.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/tc_bi_ortho.irp.f
rename to plugins/local/tc_bi_ortho/diagonalize_tc_h.irp.f
diff --git a/plugins/local/tc_bi_ortho/test_natorb.irp.f b/plugins/local/tc_bi_ortho/test_natorb.irp.f
deleted file mode 100644
index 5b8801f7..00000000
--- a/plugins/local/tc_bi_ortho/test_natorb.irp.f
+++ /dev/null
@@ -1,64 +0,0 @@
-
-! ---
-
-program test_natorb
-
-  BEGIN_DOC
-  ! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together with the energy. Saves the left-right wave functions at the end. 
-  END_DOC
-
-  implicit none
-
-  print *, 'Hello world'
-
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  read_wf = .True.
-  touch read_wf
-
-  call routine()
-  ! call test()
-
-end
-
-! ---
-
-subroutine routine()
-
- implicit none
- double precision, allocatable :: fock_diag(:),eigval(:),leigvec(:,:),reigvec(:,:),mat_ref(:,:)
- allocate(eigval(mo_num),leigvec(mo_num,mo_num),reigvec(mo_num,mo_num),fock_diag(mo_num),mat_ref(mo_num, mo_num))
- double precision, allocatable :: eigval_ref(:),leigvec_ref(:,:),reigvec_ref(:,:)
- allocate(eigval_ref(mo_num),leigvec_ref(mo_num,mo_num),reigvec_ref(mo_num,mo_num))
- 
- double precision :: thr_deg
- integer :: i,n_real,j
- print*,'fock_matrix'
- do i = 1, mo_num
-  fock_diag(i) = Fock_matrix_mo(i,i)
-  print*,i,fock_diag(i)
- enddo
- thr_deg = 1.d-6
- mat_ref = -one_e_dm_mo
- print*,'diagonalization by block'
- call diag_mat_per_fock_degen(fock_diag,mat_ref,mo_num,thr_deg,leigvec,reigvec,eigval)
- call non_hrmt_bieig( mo_num, mat_ref&
-                     , leigvec_ref, reigvec_ref& 
-                     , n_real, eigval_ref)
- print*,'TEST ***********************************'
- double precision :: accu_l, accu_r
- do i = 1, mo_num
-  accu_l = 0.d0
-  accu_r = 0.d0
-  do j = 1, mo_num
-   accu_r += reigvec_ref(j,i) * reigvec(j,i)
-   accu_l += leigvec_ref(j,i) * leigvec(j,i)
-  enddo
-  print*,i
-  write(*,'(I3,X,100(F16.10,X))')i,eigval(i),eigval_ref(i),accu_l,accu_r
- enddo
-end
diff --git a/plugins/local/tc_bi_ortho/test_normal_order.irp.f b/plugins/local/tc_bi_ortho/test_normal_order.irp.f
deleted file mode 100644
index 7b4c558f..00000000
--- a/plugins/local/tc_bi_ortho/test_normal_order.irp.f
+++ /dev/null
@@ -1,173 +0,0 @@
-
-! ---
-
-program test_normal_order
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  print *, 'Hello world'
-
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  read_wf = .True.
-  touch read_wf
-
-  call provide_all_three_ints_bi_ortho()
-  call test()
-
-end
-
-! ---
-
-subroutine test
- implicit none
-  use bitmasks ! you need to include the bitmasks_module.f90 features
- integer :: h1,h2,p1,p2,s1,s2,i_ok,degree,Ne(2)
- integer                        :: exc(0:2,2,2)
- integer(bit_kind), allocatable :: det_i(:,:)
- double precision :: hmono,htwoe,hthree,htilde_ij,accu,phase,normal,hthree_tmp
- integer,           allocatable :: occ(:,:)
- allocate( occ(N_int*bit_kind_size,2) )
- call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int)
- allocate(det_i(N_int,2))
- s1 = 1 
- s2 = 2
- accu = 0.d0
- do h1 = 1, elec_beta_num
-  do p1 = elec_alpha_num+1, mo_num
-   do h2 = 1, elec_beta_num
-    do p2 = elec_beta_num+1, mo_num
-     hthree = 0.d0
-
-     det_i = ref_bitmask
- s1 = 1 
- s2 = 2
-     call do_single_excitation(det_i,h1,p1,s1,i_ok)
-    if(i_ok.ne.1)cycle
-     call do_single_excitation(det_i,h2,p2,s2,i_ok)
-    if(i_ok.ne.1)cycle
-     call htilde_mu_mat_opt_bi_ortho(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij)
-     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
-     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
-     hthree_tmp *= phase
-     hthree += 0.5d0 * hthree_tmp
-     det_i = ref_bitmask
- s1 = 2 
- s2 = 1
-     call do_single_excitation(det_i,h1,p1,s1,i_ok)
-    if(i_ok.ne.1)cycle
-     call do_single_excitation(det_i,h2,p2,s2,i_ok)
-    if(i_ok.ne.1)cycle
-     call htilde_mu_mat_opt_bi_ortho(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij)
-     call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
-     call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
-     hthree_tmp *= phase
-     hthree += 0.5d0 * hthree_tmp
-
-
-!     normal = normal_two_body_bi_orth_ab(p2,h2,p1,h1)
-     call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, normal)
-     if(dabs(hthree).lt.1.d-10)cycle
-     if(dabs(hthree-normal).gt.1.d-10)then
-!      print*,pp2,pp1,hh2,hh1
-      print*,p2,p1,h2,h1
-      print*,hthree,normal,dabs(hthree-normal)
-      stop
-     endif
-!     call three_comp_two_e_elem(det_i,h1,h2,p1,p2,s1,s2,normal)
-!     normal = eff_2_e_from_3_e_ab(p2,p1,h2,h1)
-     accu += dabs(hthree-normal)
-    enddo
-   enddo
-  enddo
- enddo
-print*,'accu opposite spin = ',accu
-stop
-
-!    p2=6
-!    p1=5
-!    h2=2
-!    h1=1
-
-s1 = 1 
-s2 = 1
-accu = 0.d0
-do h1 = 1, elec_alpha_num
- do p1 = elec_alpha_num+1, mo_num
-  do p2 = p1+1, mo_num
-   do h2 = h1+1, elec_alpha_num
-    det_i = ref_bitmask
-    call do_single_excitation(det_i,h1,p1,s1,i_ok)
-    if(i_ok.ne.1)cycle
-    call do_single_excitation(det_i,h2,p2,s2,i_ok)
-    if(i_ok.ne.1)cycle
-    call htilde_mu_mat_opt_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-    call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
-    call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
-    integer :: hh1, pp1, hh2, pp2, ss1, ss2
-    call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2)
-    hthree *= phase
-    normal = normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1)
-!     normal = eff_2_e_from_3_e_aa(p2,p1,h2,h1)
-    if(dabs(hthree).lt.1.d-10)cycle
-    if(dabs(hthree-normal).gt.1.d-10)then
-     print*,pp2,pp1,hh2,hh1
-     print*,p2,p1,h2,h1
-     print*,hthree,normal,dabs(hthree-normal)
-     stop
-    endif
-!     print*,hthree,normal,dabs(hthree-normal)
-    accu += dabs(hthree-normal)
-   enddo
-  enddo
- enddo
-enddo
-print*,'accu same spin alpha = ',accu
-
-
-s1 = 2 
-s2 = 2
-accu = 0.d0
-do h1 = 1, elec_beta_num
- do p1 = elec_beta_num+1, mo_num
-  do p2 = p1+1, mo_num
-   do h2 = h1+1, elec_beta_num
-    det_i = ref_bitmask
-    call do_single_excitation(det_i,h1,p1,s1,i_ok)
-    if(i_ok.ne.1)cycle
-    call do_single_excitation(det_i,h2,p2,s2,i_ok)
-    if(i_ok.ne.1)cycle
-    call htilde_mu_mat_opt_bi_ortho(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij)
-    call get_excitation_degree(ref_bitmask,det_i,degree,N_int)
-    call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int)
-    call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2)
-    hthree *= phase
-!    normal = normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1)
-     normal = eff_2_e_from_3_e_bb(p2,p1,h2,h1)
-    if(dabs(hthree).lt.1.d-10)cycle
-    if(dabs(hthree-normal).gt.1.d-10)then
-     print*,pp2,pp1,hh2,hh1
-     print*,p2,p1,h2,h1
-     print*,hthree,normal,dabs(hthree-normal)
-     stop
-    endif
-!     print*,hthree,normal,dabs(hthree-normal)
-    accu += dabs(hthree-normal)
-   enddo
-  enddo
- enddo
-enddo
-print*,'accu same spin beta  = ',accu
-
-
-end
-
-
diff --git a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f b/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
deleted file mode 100644
index b33b2e93..00000000
--- a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f
+++ /dev/null
@@ -1,171 +0,0 @@
-
-! ---
-
-program test_tc_fock
-
-  BEGIN_DOC
-  ! TODO : Put the documentation of the program here
-  END_DOC
-
-  implicit none
-
-  print *, 'Hello world'
-
-  my_grid_becke = .True.
-  PROVIDE tc_grid1_a tc_grid1_r
-  my_n_pt_r_grid = tc_grid1_r
-  my_n_pt_a_grid = tc_grid1_a
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  read_wf = .True.
-  touch read_wf
-
-  !call routine_1
-  !call routine_2
-!  call routine_3()
-
-  call routine_tot
-
-end
-
-! ---
-
-subroutine routine_3()
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i, a, i_ok, s1
-  double precision               :: hmono, htwoe, hthree, htilde_ij
-  double precision               :: err_ai, err_tot, ref, new
-  integer(bit_kind), allocatable :: det_i(:,:)
-
-  allocate(det_i(N_int,2))
-
-  err_tot = 0.d0
- 
-  do s1 = 1, 2
-
-   det_i = ref_bitmask
-   call debug_det(det_i, N_int)
-   print*, ' HF det'
-   call debug_det(det_i, N_int)
- 
-   do i = 1, elec_num_tab(s1)
-     do a = elec_num_tab(s1)+1, mo_num ! virtual 
- 
-       det_i = ref_bitmask
-       call do_single_excitation(det_i, i, a, s1, i_ok)
-       if(i_ok == -1) then
-        print*, 'PB !!'
-        print*, i, a
-        stop
-       endif
-       print*, ' excited det'
-       call debug_det(det_i, N_int)
- 
-       call htilde_mu_mat_opt_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
-       if(dabs(hthree).lt.1.d-10)cycle
-       ref = hthree 
-       if(s1 == 1)then
-        new = fock_a_tot_3e_bi_orth(a,i)
-       else if(s1 == 2)then
-        new = fock_b_tot_3e_bi_orth(a,i)
-       endif
-       err_ai = dabs(dabs(ref) - dabs(new))
-       if(err_ai .gt. 1d-7) then
-         print*,'s1 = ',s1
-         print*, ' warning on', i, a
-         print*, ref,new,err_ai
-       endif
-       print*, ref,new,err_ai
-       err_tot += err_ai
- 
-       write(22, *) htilde_ij
-     enddo
-   enddo
-  enddo
-
-  print *, ' err_tot = ', err_tot
-
-  deallocate(det_i)
-
-end subroutine routine_3
-
-! ---
-subroutine routine_tot()
-
-  use bitmasks ! you need to include the bitmasks_module.f90 features
-
-  implicit none
-  integer                        :: i, a, i_ok, s1,other_spin(2)
-  double precision               :: hmono, htwoe, hthree, htilde_ij
-  double precision               :: err_ai, err_tot, ref, new
-  integer(bit_kind), allocatable :: det_i(:,:)
-
-  allocate(det_i(N_int,2))
-  other_spin(1) = 2
-  other_spin(2) = 1
-
-  err_tot = 0.d0
- 
-!  do s1 = 1, 2
-   s1 = 2
-   det_i = ref_bitmask
-   call debug_det(det_i, N_int)
-   print*, ' HF det'
-   call debug_det(det_i, N_int)
- 
-!   do i = 1, elec_num_tab(s1)
-!     do a = elec_num_tab(s1)+1, mo_num ! virtual 
-   do i = 1, elec_beta_num
-     do a = elec_beta_num+1, mo_num! virtual 
-       print*,i,a 
- 
-       det_i = ref_bitmask
-       call do_single_excitation(det_i, i, a, s1, i_ok)
-       if(i_ok == -1) then
-        print*, 'PB !!'
-        print*, i, a
-        stop
-       endif
- 
-       call htilde_mu_mat_opt_bi_ortho(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij)
-       print*,htilde_ij
-!       if(dabs(htilde_ij).lt.1.d-10)cycle
-       print*, ' excited det'
-       call debug_det(det_i, N_int)
-
-       if(s1 == 1)then
-        new = Fock_matrix_tc_mo_alpha(a,i)
-       else
-        new = Fock_matrix_tc_mo_beta(a,i)
-       endif
-       ref = htilde_ij
-!       if(s1 == 1)then
-!        new = fock_a_tot_3e_bi_orth(a,i)
-!       else if(s1 == 2)then
-!        new = fock_b_tot_3e_bi_orth(a,i)
-!       endif
-       err_ai = dabs(dabs(ref) - dabs(new))
-       if(err_ai .gt. 1d-7) then
-         print*,'---------'
-         print*,'s1 = ',s1
-         print*, ' warning on', i, a
-         print*, ref,new,err_ai
-         print*,hmono, htwoe, hthree
-         print*,'---------'
-       endif
-       print*, ref,new,err_ai
-       err_tot += err_ai
- 
-       write(22, *) htilde_ij
-     enddo
-   enddo
-!  enddo
-
-  print *, ' err_tot = ', err_tot
-
-  deallocate(det_i)
-
-end subroutine routine_3
diff --git a/plugins/local/tc_progs/NEED b/plugins/local/tc_progs/NEED
new file mode 100644
index 00000000..9deb3db4
--- /dev/null
+++ b/plugins/local/tc_progs/NEED
@@ -0,0 +1 @@
+tc_bi_ortho
diff --git a/plugins/local/tc_bi_ortho/print_he_tc_energy.irp.f b/plugins/local/tc_progs/print_he_tc_energy.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/print_he_tc_energy.irp.f
rename to plugins/local/tc_progs/print_he_tc_energy.irp.f
diff --git a/plugins/local/tc_bi_ortho/print_tc_dump.irp.f b/plugins/local/tc_progs/print_tc_dump.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/print_tc_dump.irp.f
rename to plugins/local/tc_progs/print_tc_dump.irp.f
diff --git a/plugins/local/tc_bi_ortho/print_tc_energy.irp.f b/plugins/local/tc_progs/print_tc_energy.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/print_tc_energy.irp.f
rename to plugins/local/tc_progs/print_tc_energy.irp.f
diff --git a/plugins/local/tc_bi_ortho/print_tc_spin_dens.irp.f b/plugins/local/tc_progs/print_tc_spin_dens.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/print_tc_spin_dens.irp.f
rename to plugins/local/tc_progs/print_tc_spin_dens.irp.f
diff --git a/plugins/local/tc_bi_ortho/print_tc_var.irp.f b/plugins/local/tc_progs/print_tc_var.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/print_tc_var.irp.f
rename to plugins/local/tc_progs/print_tc_var.irp.f
diff --git a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f b/plugins/local/tc_progs/print_tc_wf.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/print_tc_wf.irp.f
rename to plugins/local/tc_progs/print_tc_wf.irp.f
diff --git a/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f b/plugins/local/tc_progs/save_bitcpsileft_for_qmcchem.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f
rename to plugins/local/tc_progs/save_bitcpsileft_for_qmcchem.irp.f
diff --git a/plugins/local/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f b/plugins/local/tc_progs/save_tc_bi_ortho_nat.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f
rename to plugins/local/tc_progs/save_tc_bi_ortho_nat.irp.f
diff --git a/plugins/local/tc_bi_ortho/select_dets_bi_ortho.irp.f b/plugins/local/tc_progs/select_dets_bi_ortho.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/select_dets_bi_ortho.irp.f
rename to plugins/local/tc_progs/select_dets_bi_ortho.irp.f
diff --git a/plugins/local/tc_bi_ortho/tc_bi_ortho_prop.irp.f b/plugins/local/tc_progs/tc_bi_ortho_prop.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/tc_bi_ortho_prop.irp.f
rename to plugins/local/tc_progs/tc_bi_ortho_prop.irp.f
diff --git a/plugins/local/tc_bi_ortho/tc_som.irp.f b/plugins/local/tc_progs/tc_som.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/tc_som.irp.f
rename to plugins/local/tc_progs/tc_som.irp.f
diff --git a/plugins/local/tc_bi_ortho/test_tc_two_rdm.irp.f b/plugins/local/tc_progs/test_tc_two_rdm.irp.f
similarity index 100%
rename from plugins/local/tc_bi_ortho/test_tc_two_rdm.irp.f
rename to plugins/local/tc_progs/test_tc_two_rdm.irp.f

From b7787f5e6dce198bee06eb92f69b9904a7448bea Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 7 May 2024 19:43:05 +0200
Subject: [PATCH 033/131] trying to speed up the PT2 in TC by transposing the
 array of tc integrals

---
 .../local/bi_ort_ints/total_twoe_pot.irp.f    |   8 +-
 .../cipsi_tc_bi_ortho/get_d0_transp.irp.f     | 140 +++++++++++
 .../local/cipsi_tc_bi_ortho/get_d2_good.irp.f |   3 -
 .../cipsi_tc_bi_ortho/get_d2_transp.irp.f     | 235 ++++++++++++++++++
 plugins/local/cipsi_tc_bi_ortho/pt2.irp.f     |   1 +
 .../local/cipsi_tc_bi_ortho/selection.irp.f   |  94 +------
 .../cipsi_tc_bi_ortho/stochastic_cipsi.irp.f  |   3 +
 plugins/local/fci_tc_bi/pt2_tc.irp.f          |   2 +
 .../local/tc_bi_ortho/e_corr_bi_ortho.irp.f   |   1 -
 plugins/local/tc_keywords/EZFIO.cfg           |  11 +-
 10 files changed, 404 insertions(+), 94 deletions(-)
 create mode 100644 plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
 create mode 100644 plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f

diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index 1e127fac..71269fdc 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -259,15 +259,21 @@ BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_transp, (mo_num, mo_num,
  END_DOC
 
  integer :: i,j,k,l
+ print*,'Providing mo_bi_ortho_tc_two_e_transp'
+ double precision :: t0,t1
+ call wall_time(t0)
  do i = 1, mo_num
   do j = 1, mo_num
    do k = 1, mo_num
     do l = 1, mo_num
-     mo_bi_ortho_tc_two_e_transp(i,j,k,l) = mo_bi_ortho_tc_two_e_transp(k,l,i,j)
+     mo_bi_ortho_tc_two_e_transp(i,j,k,l) = mo_bi_ortho_tc_two_e(k,l,i,j)
     enddo
    enddo
   enddo
  enddo
+ call wall_time(t1)
+ 
+ print *, ' WALL TIME for PROVIDING mo_bi_ortho_tc_two_e_transp (min)', (t1-t0)/60.d0
 
 END_PROVIDER 
 ! ---
diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
new file mode 100644
index 00000000..56238e13
--- /dev/null
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
@@ -0,0 +1,140 @@
+subroutine get_d0_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices/conjg should be okay for complex
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind) :: det(N_int, 2)
+  double precision, intent(in) :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+
+  integer :: i, j, k, s, h1, h2, p1, p2, puti, putj, mm
+  double precision :: phase
+  double precision :: hij,hji
+  double precision, external :: get_phase_bi
+  logical :: ok
+
+  integer, parameter :: bant=1
+  double precision, allocatable :: hij_cache1(:), hij_cache2(:)
+  allocate (hij_cache1(mo_num),hij_cache2(mo_num))
+  double precision, allocatable :: hji_cache1(:), hji_cache2(:)
+  allocate (hji_cache1(mo_num),hji_cache2(mo_num))
+!  print*,'in get_d0_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  if(sp == 3) then ! AB
+    h1 = p(1,1)
+    h2 = p(1,2)
+    do p1=1, mo_num
+      if(bannedOrb(p1, 1)) cycle
+!      call get_mo_two_e_integrals_complex(p1,h2,h1,mo_num,hij_cache1,mo_integrals_map)
+      do mm = 1, mo_num
+       hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,p1,h2,h1)
+       hji_cache1(mm) = mo_bi_ortho_tc_two_e_transp(mm,p1,h2,h1)
+      enddo
+      !!!!!!!!!! <alpha|H|psi>
+      do p2=1, mo_num
+        if(bannedOrb(p2,2)) cycle
+        if(banned(p1, p2, bant)) cycle ! rentable?
+        if(p1 == h1 .or. p2 == h2) then
+          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
+          ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(det,gen,N_int, hij)
+        else
+          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+          hij = hij_cache1(p2) * phase
+        end if
+        if (hij == (0.d0,0.d0)) cycle
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij  ! HOTSPOT
+        enddo
+      end do
+      !!!!!!!!!! <phi|H|alpha>
+      do p2=1, mo_num
+        if(bannedOrb(p2,2)) cycle
+        if(banned(p1, p2, bant)) cycle ! rentable?
+        if(p1 == h1 .or. p2 == h2) then
+          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
+          ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(gen,det,N_int, hji)
+        else
+          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+          hji = hji_cache1(p2) * phase
+        end if
+        if (hji == (0.d0,0.d0)) cycle
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji  ! HOTSPOT
+        enddo
+      end do
+    end do
+
+  else ! AA BB
+    p1 = p(1,sp)
+    p2 = p(2,sp)
+    do puti=1, mo_num
+      if(bannedOrb(puti, sp)) cycle
+!      call get_mo_two_e_integrals_complex(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,puti,p2,p1)
+       hij_cache2(mm) = mo_bi_ortho_tc_two_e(mm,puti,p1,p2)
+       hji_cache1(mm) = mo_bi_ortho_tc_two_e_transp(mm,puti,p2,p1)
+       hji_cache2(mm) = mo_bi_ortho_tc_two_e_transp(mm,puti,p1,p2)
+      enddo
+      !!!!!!!!!! <alpha|H|psi>
+      do putj=puti+1, mo_num
+        if(bannedOrb(putj, sp)) cycle
+        if(banned(puti, putj, bant)) cycle ! rentable?
+        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
+          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
+          !call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(det,gen,N_int, hij)
+          if (hij == 0.d0) cycle
+        else
+!          hij = (mo_two_e_integral_complex(p1, p2, puti, putj) -  mo_two_e_integral_complex(p2, p1, puti, putj))
+!          hij = (mo_bi_ortho_tc_two_e(p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e(p2, p1, puti, putj))
+          hij = (mo_bi_ortho_tc_two_e(puti, putj, p1, p2) -  mo_bi_ortho_tc_two_e(puti, putj, p2, p1))
+          if (hij == 0.d0) cycle
+          hij = (hij) * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+        end if
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+        enddo
+      end do
+
+      !!!!!!!!!! <phi|H|alpha>
+      do putj=puti+1, mo_num
+        if(bannedOrb(putj, sp)) cycle
+        if(banned(puti, putj, bant)) cycle ! rentable?
+        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
+          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(gen,det,N_int, hji)
+          if (hji == 0.d0) cycle
+        else
+!          hji = (mo_bi_ortho_tc_two_e( p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e( p2, p1, puti, putj))
+          hji = (mo_bi_ortho_tc_two_e_transp(puti, putj, p1, p2 ) -  mo_bi_ortho_tc_two_e_transp( puti, putj, p2, p1))
+          if (hji == 0.d0) cycle
+          hji = (hji) * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+        end if
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+        enddo
+      end do
+    end do
+  end if
+
+  deallocate(hij_cache1,hij_cache2)
+end
+
diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f
index d01ed433..86922ae9 100644
--- a/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f
@@ -25,9 +25,6 @@ subroutine get_d2_new(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h,
 
   integer :: bant
   bant = 1
-!  print*, 'in get_d2_new'
-!  call debug_det(gen,N_int)
-!  print*,'coefs',coefs(1,:)
 
   tip = p(0,1) * p(0,2) ! number of alpha particles times number of beta particles
 
diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f
new file mode 100644
index 00000000..b2a7ea31
--- /dev/null
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f
@@ -0,0 +1,235 @@
+
+subroutine get_d2_new_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices/conjg should be correct for complex
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  double precision, intent(in) :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+
+  double precision, external :: get_phase_bi
+
+  integer :: i, j, k, tip, ma, mi, puti, putj
+  integer :: h1, h2, p1, p2, i1, i2
+  double precision :: phase
+  double precision :: hij,hji
+
+  integer, parameter:: turn2d(2,3,4) = reshape((/0,0, 0,0, 0,0,  3,4, 0,0, 0,0,  2,4, 1,4, 0,0,  2,3, 1,3, 1,2 /), (/2,3,4/))
+  integer, parameter :: turn2(2) = (/2, 1/)
+  integer, parameter :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer :: bant
+  bant = 1
+
+  tip = p(0,1) * p(0,2) ! number of alpha particles times number of beta particles
+
+  ma = sp !1:(alpha,alpha); 2:(b,b); 3:(a,b)
+  if(p(0,1) > p(0,2)) ma = 1 ! more alpha particles than beta particles
+  if(p(0,1) < p(0,2)) ma = 2 ! fewer alpha particles than beta particles
+  mi = mod(ma, 2) + 1
+
+  if(sp == 3) then ! if one alpha and one beta xhole 
+    !(where xholes refer to the ionizations from the generator, not the holes occupied in the ionized generator)
+    if(ma == 2) bant = 2 ! if more beta particles than alpha particles
+
+    if(tip == 3) then ! if 3 of one particle spin and 1 of the other particle spin
+      puti = p(1, mi)
+      if(bannedOrb(puti, mi)) return
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+
+      !! <alpha|H|psi>
+      do i = 1, 3    ! loop over all 3 combinations of 2 particles with spin ma
+        putj = p(i, ma)
+        if(banned(putj,puti,bant)) cycle
+        i1 = turn3(1,i)
+        i2 = turn3(2,i)
+        p1 = p(i1, ma)
+        p2 = p(i2, ma)
+        
+     ! |G> = |psi_{gen,i}>
+     ! |G'> = a_{x1} a_{x2} |G>
+     ! |alpha> = a_{puti}^{\dagger} a_{putj}^{\dagger} |G'>
+     ! |alpha> = t_{x1,x2}^{puti,putj} |G>
+     ! hij = <psi_{selectors,i}|H|alpha>
+     ! |alpha> = t_{p1,p2}^{h1,h2}|psi_{selectors,i}>
+        !todo: <i|H|j>  =  (<h1,h2|p1,p2> - <h1,h2|p2,p1>) * phase
+        !    <psi|H|j> +=  dconjg(c_i) * <i|H|j>
+        !      <j|H|i>  =  (<p1,p2|h1,h2> - <p2,p1|h1,h2>) * phase
+        !    <j|H|psi> +=  <j|H|i> * c_i
+
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+
+!        hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e( h1, h2, p2, p1)
+!        hji = mo_bi_ortho_tc_two_e_transp(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e_transp( h1, h2, p2, p1)
+        hij = mo_bi_ortho_tc_two_e_transp(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e_transp( p1, p2, h2, h1)
+        hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)       - mo_bi_ortho_tc_two_e( p1, p2, h2, h1)
+        if (hij == 0.d0.or.hji==0.d0) cycle
+
+        ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!        hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        phase = get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        hij = hij * phase
+        hji = hji * phase
+
+        if(ma == 1) then ! if particle spins are (alpha,alpha,alpha,beta), then puti is beta and putj is alpha
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,2) * hij
+            mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,1) * hji
+          enddo
+        else            ! if particle spins are (beta,beta,beta,alpha), then puti is alpha and putj is beta
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+          enddo
+        end if
+      end do
+    else ! if 2 alpha and 2 beta particles
+      h1 = h(1,1)
+      h2 = h(1,2)
+      !! <alpha|H|psi>
+      do j = 1,2 ! loop over all 4 combinations of one alpha and one beta particle
+        putj = p(j, 2)
+        if(bannedOrb(putj, 2)) cycle
+        p2 = p(turn2(j), 2)
+        do i = 1,2
+          puti = p(i, 1)
+          if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle
+          p1 = p(turn2(i), 1)
+    ! hij = <psi_{selectors,i}|H|alpha> 
+!          hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+!          hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2 )
+!          hji = mo_bi_ortho_tc_two_e_transp(h1, h2, p1, p2 )
+          hij = mo_bi_ortho_tc_two_e_transp(p1, p2 ,h1, h2 )
+          hji = mo_bi_ortho_tc_two_e( p1, p2, h1, h2)
+          if (hij /= 0.d0.or.hji==0.d0) then
+            ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!            hij = dconjg(hij) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+              mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+            enddo
+          endif
+        end do
+      end do
+    end if
+
+  else ! if holes are (a,a) or (b,b)
+    if(tip == 0) then ! if particles are (a,a,a,a) or (b,b,b,b)
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+      !! <alpha|H|psi>
+      do i=1,3
+        puti = p(i, ma)
+        if(bannedOrb(puti,ma)) cycle
+        do j=i+1,4
+          putj = p(j, ma)
+          if(bannedOrb(putj,ma)) cycle
+          if(banned(puti,putj,1)) cycle
+
+          i1 = turn2d(1, i, j)
+          i2 = turn2d(2, i, j)
+          p1 = p(i1, ma)
+          p2 = p(i2, ma)
+!          hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+          hij = mo_bi_ortho_tc_two_e_transp(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e_transp(p1, p2, h2,h1 )
+          hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p1, p2, h2,h1 )
+          if (hij == 0.d0.or.hji == 0.d0) cycle
+
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          phase = get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) +coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) +coefs(k,1) * hji
+          enddo
+        end do
+      end do
+    else if(tip == 3) then ! if particles are (a,a,a,b) (ma=1,mi=2) or (a,b,b,b) (ma=2,mi=1)
+      h1 = h(1, mi)
+      h2 = h(1, ma)
+      p1 = p(1, mi)
+      !! <alpha|H|psi>
+      do i=1,3
+        puti = p(turn3(1,i), ma)
+        if(bannedOrb(puti,ma)) cycle
+        putj = p(turn3(2,i), ma)
+        if(bannedOrb(putj,ma)) cycle
+        if(banned(puti,putj,1)) cycle
+        p2 = p(i, ma)
+
+!        hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = mo_bi_ortho_tc_two_e_transp(p1, p2 ,h1, h2)
+        hji = mo_bi_ortho_tc_two_e(p1, p2,h1, h2 )
+        if (hij == 0.d0) cycle
+
+        ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!        hij = dconjg(hij) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        phase = get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        hij = hij * phase
+        hji = hji * phase
+        if (puti < putj) then
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+          enddo
+        else
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,2) * hij
+            mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,1) * hji
+          enddo
+        endif
+      end do
+    else ! tip == 4  (a,a,b,b)
+      puti = p(1, sp)
+      putj = p(2, sp)
+      if(.not. banned(puti,putj,1)) then
+        p1 = p(1, mi)
+        p2 = p(2, mi)
+        h1 = h(1, mi)
+        h2 = h(2, mi)
+      !! <alpha|H|psi>
+!        hij = (mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2))
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = (mo_bi_ortho_tc_two_e_transp(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e_transp(p2,p1,h1, h2))
+        hji = (mo_bi_ortho_tc_two_e(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e(p2,p1,h1, h2))
+        if (hij /= 0.d0.or.hji==0.d0) then
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          hij = dconjg(hij) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          phase = get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          hij = hij * phase
+          hji = hji* phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+          enddo
+        end if
+      end if
+    end if
+  end if
+end
diff --git a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
index 833cc0ea..ada19c6b 100644
--- a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
@@ -67,6 +67,7 @@ subroutine tc_pt2
   call pt2_alloc(pt2_data_err, N_states)
   call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
   call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+  call print_summary_tc(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2)
 
 end
 
diff --git a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
index 0b4345d5..0f785ba2 100644
--- a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
@@ -636,10 +636,7 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
     negMask(i,2) = not(mask(i,2))
   end do
 
-!  print*,'in selection '
   do i = 1, N_sel
-!    call debug_det(det(1,1,i),N_int)
-!    print*,i,dabs(psi_selectors_coef_transp_tc(1,2,i) * psi_selectors_coef_transp_tc(1,1,i))
     if(interesting(i) < 0) then
       stop 'prefetch interesting(i) and det(i)'
     endif
@@ -691,11 +688,19 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
 
         call get_mask_phase(psi_det_sorted_tc(1,1,interesting(i)), phasemask,N_int)
         if(nt == 4) then
-          call get_d2_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          if(transpose_two_e_int)then
+           call get_d2_new_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          else
+           call get_d2_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          endif
         elseif(nt == 3) then
           call get_d1_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
         else
+         if(transpose_two_e_int)then
+          call get_d0_transp (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+         else
           call get_d0_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+         endif
         endif
     elseif(nt == 4) then
         call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int)
@@ -887,79 +892,11 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
         call diag_htilde_mu_mat_fock_bi_ortho(N_int, det, hmono, htwoe, hthree, hii)
         do istate = 1,N_states
           delta_E = E0(istate) - Hii + E_shift
-          double precision               :: alpha_h_psi_tmp, psi_h_alpha_tmp, error
-          if(debug_tc_pt2 == 1)then !! Using the old version
-            psi_h_alpha = 0.d0
-            alpha_h_psi = 0.d0
-            do iii = 1, N_det_selectors
-              call htilde_mu_mat_opt_bi_ortho_tot(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_opt_bi_ortho_tot(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-              call get_excitation_degree(psi_selectors(1,1,iii), det,degree,N_int)
-              if(degree == 0)then
-               print*,'problem !!!'
-               print*,'a determinant is already in the wave function !!'
-               print*,'it corresponds to the selector number ',iii
-               call debug_det(det,N_int)
-               stop
-              endif
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-              psi_h_alpha += i_h_alpha * psi_selectors_coef_tc(iii,2,1) ! left function
-              alpha_h_psi += alpha_h_i * psi_selectors_coef_tc(iii,1,1) ! right function
-            enddo
-          else if(debug_tc_pt2 == 2)then !! debugging the new version
-!            psi_h_alpha_tmp = 0.d0
-!            alpha_h_psi_tmp = 0.d0
-!            do iii = 1, N_det_selectors ! old version
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-!              psi_h_alpha_tmp += i_h_alpha * psi_selectors_coef_tc(iii,1,1) ! left function
-!              alpha_h_psi_tmp += alpha_h_i * psi_selectors_coef_tc(iii,2,1) ! right function
-!            enddo
-            psi_h_alpha_tmp = mat_l(istate, p1, p2) ! new version
-            alpha_h_psi_tmp = mat_r(istate, p1, p2) ! new version
-            psi_h_alpha = 0.d0
-            alpha_h_psi = 0.d0
-            do iii = 1, N_det ! old version
-              call htilde_mu_mat_opt_bi_ortho_no_3e(psi_det(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_det(1,1,iii), N_int, alpha_h_i)
-              psi_h_alpha += i_h_alpha * psi_l_coef_bi_ortho(iii,1) ! left function
-              alpha_h_psi += alpha_h_i * psi_r_coef_bi_ortho(iii,1) ! right function
-            enddo
-            if(dabs(psi_h_alpha*alpha_h_psi/delta_E).gt.1.d-10)then
-              error = dabs(psi_h_alpha * alpha_h_psi - psi_h_alpha_tmp * alpha_h_psi_tmp)/dabs(psi_h_alpha * alpha_h_psi)
-              if(error.gt.1.d-2)then
-                call debug_det(det, N_int)
-                print*,'error =',error,psi_h_alpha * alpha_h_psi/delta_E,psi_h_alpha_tmp * alpha_h_psi_tmp/delta_E
-                print*,psi_h_alpha , alpha_h_psi
-                print*,psi_h_alpha_tmp , alpha_h_psi_tmp
-                print*,'selectors '
-                do iii = 1, N_det_selectors ! old version
-                 print*,'iii',iii,psi_selectors_coef_tc(iii,1,1),psi_selectors_coef_tc(iii,2,1)
-                 call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-                 call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-                 print*,i_h_alpha,alpha_h_i
-                 call debug_det(psi_selectors(1,1,iii),N_int)
-                enddo
-!                print*,'psi_det '
-!                do iii = 1, N_det! old version
-!                 print*,'iii',iii,psi_l_coef_bi_ortho(iii,1),psi_r_coef_bi_ortho(iii,1)
-!                 call debug_det(psi_det(1,1,iii),N_int)
-!                enddo
-                stop
-              endif
-            endif
-          else
-           psi_h_alpha = mat_l(istate, p1, p2)
-           alpha_h_psi = mat_r(istate, p1, p2)
-          endif
+          psi_h_alpha = mat_l(istate, p1, p2)
+          alpha_h_psi = mat_r(istate, p1, p2)
           val = 4.d0 * psi_h_alpha * alpha_h_psi
           tmp = dsqrt(delta_E * delta_E + val)
-!          if (delta_E < 0.d0) then
-!              tmp = -tmp
-!          endif
           e_pert(istate) = 0.25 * val / delta_E
-!          e_pert(istate) = 0.5d0 * (tmp - delta_E)
           if(dsqrt(tmp).gt.1.d-4.and.dabs(psi_h_alpha).gt.1.d-4)then
            coef(istate)   = e_pert(istate) / psi_h_alpha
           else
@@ -976,15 +913,6 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
            if(e_pert(istate).gt.0.d0)e_pert(istate)=0.d0
           endif
 
-!         if(selection_tc     ==  1 )then
-!          if(e_pert(istate).lt.0.d0)then
-!           e_pert(istate) = 0.d0
-!          endif
-!         else if(selection_tc == -1)then
-!          if(e_pert(istate).gt.0.d0)then
-!           e_pert(istate) = 0.d0
-!          endif
-!         endif
         enddo
 
 
diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
index 99a8de7e..bb5a89a1 100644
--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@@ -88,6 +88,9 @@ subroutine run_stochastic_cipsi
     call pt2_dealloc(pt2_data_err)
     call pt2_alloc(pt2_data, N_states)
     call pt2_alloc(pt2_data_err, N_states)
+    if(transpose_two_e_int)then
+     provide mo_bi_ortho_tc_two_e_transp
+    endif
     call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
 !    stop
 
diff --git a/plugins/local/fci_tc_bi/pt2_tc.irp.f b/plugins/local/fci_tc_bi/pt2_tc.irp.f
index 390042bf..3c07e367 100644
--- a/plugins/local/fci_tc_bi/pt2_tc.irp.f
+++ b/plugins/local/fci_tc_bi/pt2_tc.irp.f
@@ -13,6 +13,8 @@ program tc_pt2_prog
 
   pruning = -1.d0
   touch pruning
+  read_wf = .True.
+  touch read_wf 
 
 !  pt2_relative_error = 0.01d0
 !  touch pt2_relative_error
diff --git a/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f b/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f
index 4abdc25b..5a3971c5 100644
--- a/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f
+++ b/plugins/local/tc_bi_ortho/e_corr_bi_ortho.irp.f
@@ -27,7 +27,6 @@
    if(degree == 1)then
     e_pt2_tc_bi_orth_single += coef_pt1 * htilde_ij
    else 
-!    print*,'coef_pt1, e_pt2',coef_pt1,coef_pt1 * htilde_ij
     e_pt2_tc_bi_orth_double += coef_pt1 * htilde_ij
    endif
   endif
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index 1e89eaa4..39968ec8 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -184,12 +184,6 @@ doc: Read/Write normal_two_body_bi_orth from/to disk [ Write | Read | None ]
 interface: ezfio,provider,ocaml
 default: None
 
-[debug_tc_pt2]
-type: integer
-doc: If :: 1 then you compute the TC-PT2 the old way, :: 2 then you check with the new version but without three-body
-interface: ezfio,provider,ocaml
-default: -1
-
 [only_spin_tc_right]
 type: logical
 doc: If |true|, only the right part of WF is used to compute spin dens
@@ -268,3 +262,8 @@ doc: Thresholds on the Imag part of TC energy
 interface: ezfio,provider,ocaml
 default: 1.e-7
 
+[transpose_two_e_int]
+type: logical
+doc: If |true|, you duplicate the two-electron TC integrals with the transpose matrix. Acceleates the PT2.  
+interface: ezfio,provider,ocaml
+default: False

From 18fd70f1b88ee4a412a351a92a98f4b1ef1ee3d0 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 7 May 2024 20:18:24 +0200
Subject: [PATCH 034/131] added get_d1_transp.irp.f

---
 .../cipsi_tc_bi_ortho/get_d1_transp.irp.f     | 350 ++++++++++++++++++
 .../local/cipsi_tc_bi_ortho/selection.irp.f   |   6 +-
 2 files changed, 355 insertions(+), 1 deletion(-)
 create mode 100644 plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f

diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
new file mode 100644
index 00000000..3c6cbf60
--- /dev/null
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
@@ -0,0 +1,350 @@
+subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices should be okay for complex?
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in)  :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in)  :: phasemask(N_int,2)
+  logical, intent(in)            :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind)              :: det(N_int, 2)
+  double precision, intent(in)   :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
+  double precision, external     :: get_phase_bi
+  double precision, external     :: mo_two_e_integral_complex
+  logical                        :: ok
+
+  logical, allocatable           :: lbanned(:,:)
+  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j
+  integer                        :: hfix, pfix, h1, h2, p1, p2, ib, k, l, mm
+
+  integer, parameter             :: turn2(2) = (/2,1/)
+  integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer                        :: bant
+  double precision, allocatable :: hij_cache(:,:)
+  double precision               :: hij, tmp_rowij(N_states, mo_num), tmp_rowij2(N_states, mo_num),phase
+  double precision, allocatable :: hji_cache(:,:)
+  double precision               :: hji, tmp_rowji(N_states, mo_num), tmp_rowji2(N_states, mo_num)
+!  PROVIDE mo_integrals_map N_int
+!  print*,'in get_d1_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  allocate (lbanned(mo_num, 2))
+  allocate (hij_cache(mo_num,2))
+  allocate (hji_cache(mo_num,2))
+  lbanned = bannedOrb
+
+  do i=1, p(0,1)
+    lbanned(p(i,1), 1) = .true.
+  end do
+  do i=1, p(0,2)
+    lbanned(p(i,2), 2) = .true.
+  end do
+
+  ma = 1
+  if(p(0,2) >= 2) ma = 2
+  mi = turn2(ma)
+
+  bant = 1
+
+  if(sp == 3) then
+    !move MA
+    if(ma == 2) bant = 2
+    puti = p(1,mi)
+    hfix = h(1,ma)
+    p1 = p(1,ma)
+    p2 = p(2,ma)
+    if(.not. bannedOrb(puti, mi)) then
+!      call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2)
+       hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
+       hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2)
+       hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1)
+      enddo
+      !! <alpha|H|psi>
+      tmp_rowij = 0.d0
+      tmp_rowji = 0.d0
+      do putj=1, hfix-1
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hij = hij_cache(putj,1) - hij_cache(putj,2)
+        hji = hji_cache(putj,1) - hji_cache(putj,2)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,2)
+            tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,1)
+          enddo
+        endif
+      end do
+      do putj=hfix+1, mo_num
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hij = hij_cache(putj,2) - hij_cache(putj,1)
+        hji = hji_cache(putj,2) - hji_cache(putj,1)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,2)
+            tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,1)
+          enddo
+        endif
+      end do
+
+      if(ma == 1) then
+        mat_r(1:N_states,1:mo_num,puti) = mat_r(1:N_states,1:mo_num,puti) + tmp_rowij(1:N_states,1:mo_num)
+        mat_l(1:N_states,1:mo_num,puti) = mat_l(1:N_states,1:mo_num,puti) + tmp_rowji(1:N_states,1:mo_num)
+      else
+        do l=1,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k,puti,l) = mat_r(k,puti,l) + tmp_rowij(k,l)
+            mat_l(k,puti,l) = mat_l(k,puti,l) + tmp_rowji(k,l)
+          enddo
+        enddo
+      end if
+
+    end if
+
+    !MOVE MI
+    pfix = p(1,mi)
+    tmp_rowij  = 0.d0
+    tmp_rowij2 = 0.d0
+    tmp_rowji  = 0.d0
+    tmp_rowji2 = 0.d0
+!    call get_mo_two_e_integrals_complex(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!    call get_mo_two_e_integrals_complex(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+    do mm = 1, mo_num
+     hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p1)
+     hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p2)
+     hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p1)
+     hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p2)
+    enddo
+    putj = p1
+    !! <alpha|H|psi>
+    do puti=1,mo_num !HOT
+      if(lbanned(puti,mi)) cycle
+      !p1 fixed
+      putj = p1
+      if(.not. banned(putj,puti,bant)) then
+        hij = hij_cache(puti,2)
+        hji = hji_cache(puti,2)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,2)
+            tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,1)
+          enddo
+        endif
+      end if
+!      
+      putj = p2
+      if(.not. banned(putj,puti,bant)) then
+        hij = hij_cache(puti,1)
+        hji = hji_cache(puti,1)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          do k=1,N_states
+            tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,2)
+            tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,1)
+          enddo
+        endif
+      end if
+    end do
+
+    if(mi == 1) then
+      mat_r(:,:,p1) = mat_r(:,:,p1) + tmp_rowij(:,:)
+      mat_r(:,:,p2) = mat_r(:,:,p2) + tmp_rowij2(:,:)
+      mat_l(:,:,p1) = mat_l(:,:,p1) + tmp_rowji(:,:)
+      mat_l(:,:,p2) = mat_l(:,:,p2) + tmp_rowji2(:,:)
+    else
+      do l=1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij(k,l)
+          mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij2(k,l)
+          mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji(k,l)
+          mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji2(k,l)
+        enddo
+      enddo
+    end if
+
+  else  ! sp /= 3
+
+    if(p(0,ma) == 3) then
+      do i=1,3
+        hfix = h(1,ma)
+        puti = p(i, ma)
+        p1 = p(turn3(1,i), ma)
+        p2 = p(turn3(2,i), ma)
+!        call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!        call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+        do mm = 1, mo_num
+         hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2)
+         hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
+         hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2)
+         hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1)
+        enddo
+    !! <alpha|H|psi>
+        tmp_rowij = 0.d0
+        tmp_rowji = 0.d0
+        do putj=1,hfix-1
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hij = hij_cache(putj,1) - hij_cache(putj,2)
+          hji = hji_cache(putj,1) - hji_cache(putj,2)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,2)
+            tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,1)
+          endif
+        end do
+        do putj=hfix+1,mo_num
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hij = hij_cache(putj,2) - hij_cache(putj,1)
+          hji = hji_cache(putj,2) - hji_cache(putj,1)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,2)
+            tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,1)
+          endif
+        end do
+
+        mat_r(:, :puti-1, puti) = mat_r(:, :puti-1, puti) + tmp_rowij(:,:puti-1)
+        mat_l(:, :puti-1, puti) = mat_l(:, :puti-1, puti) + tmp_rowji(:,:puti-1)
+        do l=puti,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, l) = mat_r(k, puti,l) + tmp_rowij(k,l)
+            mat_l(k, puti, l) = mat_l(k, puti,l) + tmp_rowji(k,l)
+          enddo
+        enddo
+      end do
+    else
+      hfix = h(1,mi)
+      pfix = p(1,mi)
+      p1 = p(1,ma)
+      p2 = p(2,ma)
+      tmp_rowij =  0.d0
+      tmp_rowij2 = 0.d0
+      tmp_rowji =  0.d0
+      tmp_rowji2 = 0.d0
+!      call get_mo_two_e_integrals_complex(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,pfix)
+       hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,pfix)
+       hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,pfix)
+       hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,pfix)
+      enddo
+      putj = p2
+    !! <alpha|H|psi>
+      do puti=1,mo_num
+        if(lbanned(puti,ma)) cycle
+        putj = p2
+        if(.not. banned(puti,putj,1)) then
+          hij = hij_cache(puti,1)
+          hji = hji_cache(puti,1)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,2)
+              tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,1)
+            enddo
+          endif
+        end if
+
+        putj = p1
+        if(.not. banned(puti,putj,1)) then
+          hij = hij_cache(puti,2)
+          hji = hji_cache(puti,2)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            do k=1,N_states
+              tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,2)
+              tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,1)
+            enddo
+          endif
+        end if
+      end do
+      mat_r(:,:p2-1,p2) = mat_r(:,:p2-1,p2) + tmp_rowij(:,:p2-1)
+      mat_l(:,:p2-1,p2) = mat_l(:,:p2-1,p2) + tmp_rowji(:,:p2-1)
+      do l=p2,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij(k,l)
+          mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji(k,l)
+        enddo
+      enddo
+      mat_r(:,:p1-1,p1) = mat_r(:,:p1-1,p1) + tmp_rowij2(:,:p1-1)
+      mat_l(:,:p1-1,p1) = mat_l(:,:p1-1,p1) + tmp_rowji2(:,:p1-1)
+      do l=p1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij2(k,l)
+          mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji2(k,l)
+        enddo
+      enddo
+    end if
+  end if
+  deallocate(lbanned,hij_cache, hji_cache)
+
+ !! MONO
+    if(sp == 3) then
+      s1 = 1
+      s2 = 2
+    else
+      s1 = sp
+      s2 = sp
+    end if
+
+    do i1=1,p(0,s1)
+      ib = 1
+      if(s1 == s2) ib = i1+1
+      do i2=ib,p(0,s2)
+        p1 = p(i1,s1)
+        p2 = p(i2,s2)
+        if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle
+        call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
+        ! gen is a selector; mask is ionized generator; det is alpha
+        ! hij is contribution to <psi|H|alpha>
+!        call i_h_j_complex(gen, det, N_int, hij)
+        call htilde_mu_mat_opt_bi_ortho_no_3e(det, gen, N_int, hij)
+        call htilde_mu_mat_opt_bi_ortho_no_3e(gen, det, N_int, hji)
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,1) * dconjg(hij)
+          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij
+          mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji
+        enddo
+      end do
+    end do
+end
+
diff --git a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
index 0f785ba2..17d34f43 100644
--- a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
@@ -694,7 +694,11 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
            call get_d2_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
           endif
         elseif(nt == 3) then
-          call get_d1_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          if(transpose_two_e_int)then
+           call get_d1_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          else
+           call get_d1_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          endif
         else
          if(transpose_two_e_int)then
           call get_d0_transp (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))

From 687259c25feb8ec568b31b89b760d2e08d07ad3a Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 7 May 2024 20:32:48 +0200
Subject: [PATCH 035/131] working on the matrix elements both

---
 .../local/bi_ort_ints/total_twoe_pot.irp.f    |  20 +++
 plugins/local/slater_tc/slater_tc_opt.irp.f   |  42 ++++++
 .../slater_tc/slater_tc_opt_double.irp.f      |  60 ++++++++
 .../slater_tc/slater_tc_opt_single.irp.f      | 142 ++++++++++++++++++
 4 files changed, 264 insertions(+)

diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
index 71269fdc..e27fdb7f 100644
--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@@ -332,3 +332,23 @@ END_PROVIDER
 
 ! ---
 
+ BEGIN_PROVIDER [double precision, tc_2e_3idx_coulomb_integrals_transp , (mo_num,mo_num,mo_num)]
+&BEGIN_PROVIDER [double precision, tc_2e_3idx_exchange_integrals_transp, (mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  ! tc_2e_3idx_coulomb_integrals_transp (j,k,i) = <jk|ji> 
+  ! tc_2e_3idx_exchange_integrals_transp(j,k,i) = <kj|ji> 
+  END_DOC
+ implicit none
+ integer :: i, j, k
+
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        tc_2e_3idx_coulomb_integrals_transp(j, k,i) = mo_bi_ortho_tc_two_e_transp(j ,k ,j ,i ) 
+        tc_2e_3idx_exchange_integrals_transp(j,k,i) = mo_bi_ortho_tc_two_e_transp(k ,j ,j ,i ) 
+      enddo
+    enddo
+  enddo
+
+END_PROVIDER 
diff --git a/plugins/local/slater_tc/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f
index 59efc943..9ed2b389 100644
--- a/plugins/local/slater_tc/slater_tc_opt.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt.irp.f
@@ -181,3 +181,45 @@ end
 
 ! ---
 
+subroutine htilde_mu_mat_opt_bi_ortho_no_3e_both(key_j, key_i, Nint, htot)
+
+  BEGIN_DOC
+  !
+  ! <key_j |H_tilde | key_i> where |key_j> is developed on the LEFT basis and |key_i> is developed on the RIGHT basis
+  !!
+  ! Returns the detail of the matrix element WITHOUT ANY CONTRIBUTION FROM THE THREE ELECTRON TERMS
+  !! WARNING !!
+  !
+  ! Non hermitian !!
+  !
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
+  double precision, intent(out) :: htot
+  integer                       :: degree
+
+  htot = 0.d0
+
+  call get_excitation_degree(key_i, key_j, degree, Nint)
+  if(degree.gt.2) return
+
+  if(degree == 0) then
+    call diag_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_i,htot)
+  else if (degree == 1) then
+    call single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint,key_j, key_i , htot)
+  else if(degree == 2) then
+    call double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, htot)
+  endif
+
+  if(degree==0) then
+    htot += nuclear_repulsion
+  endif
+
+end
+
+! ---
+
diff --git a/plugins/local/slater_tc/slater_tc_opt_double.irp.f b/plugins/local/slater_tc/slater_tc_opt_double.irp.f
index 4067473c..181ae11d 100644
--- a/plugins/local/slater_tc/slater_tc_opt_double.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt_double.irp.f
@@ -505,3 +505,63 @@ subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_j, key_i, htot)
 
 end
 
+subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij)
+
+  BEGIN_DOC
+  ! <key_j |H_tilde | key_i> and <key_i |H_tilde | key_j> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS
+  !!
+  !! WARNING !!
+  !
+  ! Non hermitian !!
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2)
+  double precision, intent(out) :: hji,hij
+  double precision :: hmono, htwoe_ji, htwoe_ij
+  integer                       :: occ(Nint*bit_kind_size,2)
+  integer                       :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
+  integer                       :: degree,exc(0:2,2,2)
+  integer                       :: h1, p1, h2, p2, s1, s2
+  double precision              :: get_mo_two_e_integral_tc_int,phase
+
+
+  call get_excitation_degree(key_i, key_j, degree, Nint)
+
+  hmono  = 0.d0
+  htwoe_ji  = 0.d0
+  htwoe_ij  = 0.d0
+  hji   = 0.d0
+  hij   = 0.d0
+
+  if(degree.ne.2)then
+   return
+  endif
+  integer :: degree_i,degree_j
+  call get_excitation_degree(ref_bitmask,key_i,degree_i,N_int)
+  call get_excitation_degree(ref_bitmask,key_j,degree_j,N_int)
+  call get_double_excitation(key_i, key_j, exc, phase, Nint)
+  call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
+
+  if(s1.ne.s2)then
+   ! opposite spin two-body
+    htwoe_ji  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
+    htwoe_ij  = mo_bi_ortho_tc_two_e_transp(p2,p1,h2,h1)
+  else
+   ! same spin two-body
+   ! direct terms
+   htwoe_ji  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
+   htwoe_ij  = mo_bi_ortho_tc_two_e_transp(p2,p1,h2,h1)
+   ! exchange terms
+   htwoe_ji -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1)
+   htwoe_ij -= mo_bi_ortho_tc_two_e_transp(p1,p2,h2,h1)
+  endif
+  htwoe_ji  *= phase
+  hji    =  htwoe_ji
+  htwoe_ij  *= phase
+  hij    =  htwoe_ij
+
+end
diff --git a/plugins/local/slater_tc/slater_tc_opt_single.irp.f b/plugins/local/slater_tc/slater_tc_opt_single.irp.f
index e57cb05c..3f4e17e2 100644
--- a/plugins/local/slater_tc/slater_tc_opt_single.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt_single.irp.f
@@ -618,3 +618,145 @@ subroutine get_single_excitation_from_fock_tc_no_3e(Nint, key_i, key_j, h, p, sp
 
 end
 
+
+subroutine  single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij)
+
+  BEGIN_DOC
+  ! <key_j |H_tilde | key_i> and <key_i |H_tilde | key_j> for single excitation ONLY FOR ONE- AND TWO-BODY TERMS 
+  !!
+  !! WARNING !!
+  ! 
+  ! Non hermitian !!
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2)
+  double precision, intent(out) :: hji,hij
+
+  double precision              :: hmono, htwoe
+  integer                       :: occ(Nint*bit_kind_size,2)
+  integer                       :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
+  integer                       :: degree,exc(0:2,2,2)
+  integer                       :: h1, p1, h2, p2, s1, s2
+  double precision              :: get_mo_two_e_integral_tc_int, phase
+  double precision              :: direct_int, exchange_int_12, exchange_int_23, exchange_int_13
+  integer                       :: other_spin(2)
+  integer(bit_kind)             :: key_j_core(Nint,2), key_i_core(Nint,2)
+
+  other_spin(1) = 2
+  other_spin(2) = 1
+
+  hmono  = 0.d0
+  htwoe  = 0.d0
+  hji   = 0.d0
+  hji   = 0.d0
+  call get_excitation_degree(key_i, key_j, degree, Nint)
+  if(degree.ne.1)then
+   return
+  endif
+  call bitstring_to_list_ab(key_i, occ, Ne, Nint)
+
+  call get_single_excitation(key_i, key_j, exc, phase, Nint)
+  call decode_exc(exc,1,h1,p1,h2,p2,s1,s2)
+  call get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h1, p1, s1, phase, hmono, htwoe, hji,hij)
+
+end
+
+! ---
+
+subroutine get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h, p, spin, phase, hji,hij)
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer,           intent(in) :: h, p, spin
+  double precision,  intent(in) :: phase
+  integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
+  double precision, intent(out) :: hji,hij
+  double precision :: hmono_ji,htwoe_ji
+  double precision :: hmono_ij,htwoe_ij
+
+  integer(bit_kind)             :: differences(Nint,2)
+  integer(bit_kind)             :: hole(Nint,2)
+  integer(bit_kind)             :: partcl(Nint,2)
+  integer                       :: occ_hole(Nint*bit_kind_size,2)
+  integer                       :: occ_partcl(Nint*bit_kind_size,2)
+  integer                       :: n_occ_ab_hole(2),n_occ_ab_partcl(2)
+  integer                       :: i0,i
+  double precision              :: buffer_c_ji(mo_num), buffer_x_ji(mo_num)
+  double precision              :: buffer_c_ij(mo_num), buffer_x_ij(mo_num)
+
+  do i = 1, mo_num
+    buffer_c_ji(i) = tc_2e_3idx_coulomb_integrals(i,p,h)
+    buffer_x_ji(i) = tc_2e_3idx_exchange_integrals(i,p,h)
+    buffer_c_ij(i) = tc_2e_3idx_coulomb_integrals_transp(i,p,h)
+    buffer_x_ij(i) = tc_2e_3idx_exchange_integrals_transp(i,p,h)
+  enddo
+
+  do i = 1, Nint
+   differences(i,1) = xor(key_i(i,1),ref_closed_shell_bitmask(i,1))
+   differences(i,2) = xor(key_i(i,2),ref_closed_shell_bitmask(i,2))
+   hole(i,1) = iand(differences(i,1),ref_closed_shell_bitmask(i,1))
+   hole(i,2) = iand(differences(i,2),ref_closed_shell_bitmask(i,2))
+   partcl(i,1) = iand(differences(i,1),key_i(i,1))
+   partcl(i,2) = iand(differences(i,2),key_i(i,2))
+  enddo
+
+  call bitstring_to_list_ab(hole, occ_hole, n_occ_ab_hole, Nint)
+  call bitstring_to_list_ab(partcl, occ_partcl, n_occ_ab_partcl, Nint)
+  hmono_ji = mo_bi_ortho_tc_one_e(p,h)
+  htwoe_ji = fock_op_2_e_tc_closed_shell(p,h)
+  hmono_ij = mo_bi_ortho_tc_one_e(h,p)
+  htwoe_ij = fock_op_2_e_tc_closed_shell(h,p)
+
+  ! holes :: direct terms
+  do i0 = 1, n_occ_ab_hole(1)
+    i = occ_hole(i0,1)
+    htwoe_ji -= buffer_c_ji(i)
+    htwoe_ij -= buffer_c_ij(i)
+  enddo
+  do i0 = 1, n_occ_ab_hole(2)
+    i = occ_hole(i0,2)
+    htwoe_ji -= buffer_c_ji(i)
+    htwoe_ij -= buffer_c_ij(i)
+  enddo
+ 
+  ! holes :: exchange terms
+  do i0 = 1, n_occ_ab_hole(spin)
+    i = occ_hole(i0,spin)
+    htwoe_ji += buffer_x_ji(i)
+    htwoe_ij += buffer_x_ij(i)
+  enddo
+ 
+  ! particles :: direct terms
+  do i0 = 1, n_occ_ab_partcl(1)
+    i = occ_partcl(i0,1)
+    htwoe_ji += buffer_c_ji(i)
+    htwoe_ij += buffer_c_ij(i)
+  enddo
+  do i0 = 1, n_occ_ab_partcl(2)
+    i = occ_partcl(i0,2)
+    htwoe_ji += buffer_c_ji(i)
+    htwoe_ij += buffer_c_ij(i)
+  enddo
+ 
+  ! particles :: exchange terms
+  do i0 = 1, n_occ_ab_partcl(spin)
+    i = occ_partcl(i0,spin)
+    htwoe_ji -= buffer_x_ji(i)
+    htwoe_ij -= buffer_x_ij(i)
+  enddo
+  htwoe_ji = htwoe_ji * phase
+  hmono_ji = hmono_ji * phase
+  hji  = htwoe_ji + hmono_ji 
+
+  htwoe_ij = htwoe_ij * phase
+  hmono_ij = hmono_ij * phase
+  hij  = htwoe_ij + hmono_ij 
+
+end
+

From 42fdb3c4350c0452a7169614ff9dba4e0e381f62 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 7 May 2024 20:52:10 +0200
Subject: [PATCH 036/131] it works with new routines for pt2 tc

---
 .../cipsi_tc_bi_ortho/get_d0_transp.irp.f     | 54 ++++---------------
 .../cipsi_tc_bi_ortho/get_d1_transp.irp.f     |  4 +-
 plugins/local/slater_tc/slater_tc_opt.irp.f   | 17 +++---
 .../slater_tc/slater_tc_opt_single.irp.f      |  4 +-
 4 files changed, 25 insertions(+), 54 deletions(-)

diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
index 56238e13..f149e7c6 100644
--- a/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
@@ -45,33 +45,16 @@ subroutine get_d0_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
           call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
           ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
 !          call i_h_j_complex(det, gen, N_int, hij)
-          call htilde_mu_mat_opt_bi_ortho_no_3e(det,gen,N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e_both(det,gen,N_int, hij,hji)
         else
           phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
           hij = hij_cache1(p2) * phase
+          hji = hji_cache1(p2) * phase
         end if
-        if (hij == (0.d0,0.d0)) cycle
+        if (hij == 0.d0.or.hji == 0.d0) cycle
         !DIR$ LOOP COUNT AVG(4)
         do k=1,N_states
           mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij  ! HOTSPOT
-        enddo
-      end do
-      !!!!!!!!!! <phi|H|alpha>
-      do p2=1, mo_num
-        if(bannedOrb(p2,2)) cycle
-        if(banned(p1, p2, bant)) cycle ! rentable?
-        if(p1 == h1 .or. p2 == h2) then
-          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
-          ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
-!          call i_h_j_complex(det, gen, N_int, hij)
-          call htilde_mu_mat_opt_bi_ortho_no_3e(gen,det,N_int, hji)
-        else
-          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
-          hji = hji_cache1(p2) * phase
-        end if
-        if (hji == (0.d0,0.d0)) cycle
-        !DIR$ LOOP COUNT AVG(4)
-        do k=1,N_states
           mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji  ! HOTSPOT
         enddo
       end do
@@ -98,40 +81,25 @@ subroutine get_d0_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
           call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
           !call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
 !          call i_h_j_complex(det, gen, N_int, hij)
-          call htilde_mu_mat_opt_bi_ortho_no_3e(det,gen,N_int, hij)
-          if (hij == 0.d0) cycle
+          call htilde_mu_mat_opt_bi_ortho_no_3e_both(det,gen,N_int, hij,hji)
+         if (hij == 0.d0.or.hji == 0.d0) cycle
         else
 !          hij = (mo_two_e_integral_complex(p1, p2, puti, putj) -  mo_two_e_integral_complex(p2, p1, puti, putj))
 !          hij = (mo_bi_ortho_tc_two_e(p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e(p2, p1, puti, putj))
           hij = (mo_bi_ortho_tc_two_e(puti, putj, p1, p2) -  mo_bi_ortho_tc_two_e(puti, putj, p2, p1))
-          if (hij == 0.d0) cycle
-          hij = (hij) * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+          hji = (mo_bi_ortho_tc_two_e_transp(puti, putj, p1, p2) -  mo_bi_ortho_tc_two_e_transp(puti, putj, p2, p1))
+          if (hij == 0.d0.or.hji == 0.d0) cycle
+          phase = get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+          hij = (hij) * phase
+          hji = (hji) * phase
         end if
         !DIR$ LOOP COUNT AVG(4)
         do k=1,N_states
           mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
-        enddo
-      end do
-
-      !!!!!!!!!! <phi|H|alpha>
-      do putj=puti+1, mo_num
-        if(bannedOrb(putj, sp)) cycle
-        if(banned(puti, putj, bant)) cycle ! rentable?
-        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
-          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
-          call htilde_mu_mat_opt_bi_ortho_no_3e(gen,det,N_int, hji)
-          if (hji == 0.d0) cycle
-        else
-!          hji = (mo_bi_ortho_tc_two_e( p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e( p2, p1, puti, putj))
-          hji = (mo_bi_ortho_tc_two_e_transp(puti, putj, p1, p2 ) -  mo_bi_ortho_tc_two_e_transp( puti, putj, p2, p1))
-          if (hji == 0.d0) cycle
-          hji = (hji) * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
-        end if
-        !DIR$ LOOP COUNT AVG(4)
-        do k=1,N_states
           mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
         enddo
       end do
+
     end do
   end if
 
diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
index 3c6cbf60..84a1ce24 100644
--- a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
@@ -335,8 +335,8 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
         ! gen is a selector; mask is ionized generator; det is alpha
         ! hij is contribution to <psi|H|alpha>
 !        call i_h_j_complex(gen, det, N_int, hij)
-        call htilde_mu_mat_opt_bi_ortho_no_3e(det, gen, N_int, hij)
-        call htilde_mu_mat_opt_bi_ortho_no_3e(gen, det, N_int, hji)
+        call htilde_mu_mat_opt_bi_ortho_no_3e_both(det, gen, N_int, hij,hji)
+!        call htilde_mu_mat_opt_bi_ortho_no_3e(gen, det, N_int, hji)
         !DIR$ LOOP COUNT AVG(4)
         do k=1,N_states
           ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
diff --git a/plugins/local/slater_tc/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f
index 9ed2b389..5651a299 100644
--- a/plugins/local/slater_tc/slater_tc_opt.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt.irp.f
@@ -181,7 +181,7 @@ end
 
 ! ---
 
-subroutine htilde_mu_mat_opt_bi_ortho_no_3e_both(key_j, key_i, Nint, htot)
+subroutine htilde_mu_mat_opt_bi_ortho_no_3e_both(key_j, key_i, Nint, hji,hij)
 
   BEGIN_DOC
   !
@@ -199,24 +199,27 @@ subroutine htilde_mu_mat_opt_bi_ortho_no_3e_both(key_j, key_i, Nint, htot)
   implicit none
   integer,           intent(in) :: Nint
   integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
-  double precision, intent(out) :: htot
+  double precision, intent(out) :: hji,hij
   integer                       :: degree
 
-  htot = 0.d0
+  hji = 0.d0
+  hij = 0.d0
 
   call get_excitation_degree(key_i, key_j, degree, Nint)
   if(degree.gt.2) return
 
   if(degree == 0) then
-    call diag_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_i,htot)
+    call diag_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_i,hji)
+    hij = hji
   else if (degree == 1) then
-    call single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint,key_j, key_i , htot)
+    call single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint,key_j, key_i , hji,hij)
   else if(degree == 2) then
-    call double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, htot)
+    call double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij)
   endif
 
   if(degree==0) then
-    htot += nuclear_repulsion
+    hji += nuclear_repulsion
+    hij += nuclear_repulsion
   endif
 
 end
diff --git a/plugins/local/slater_tc/slater_tc_opt_single.irp.f b/plugins/local/slater_tc/slater_tc_opt_single.irp.f
index 3f4e17e2..47bcbe34 100644
--- a/plugins/local/slater_tc/slater_tc_opt_single.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt_single.irp.f
@@ -652,7 +652,7 @@ subroutine  single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hj
   hmono  = 0.d0
   htwoe  = 0.d0
   hji   = 0.d0
-  hji   = 0.d0
+  hij   = 0.d0
   call get_excitation_degree(key_i, key_j, degree, Nint)
   if(degree.ne.1)then
    return
@@ -661,7 +661,7 @@ subroutine  single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hj
 
   call get_single_excitation(key_i, key_j, exc, phase, Nint)
   call decode_exc(exc,1,h1,p1,h2,p2,s1,s2)
-  call get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h1, p1, s1, phase, hmono, htwoe, hji,hij)
+  call get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h1, p1, s1, phase, hji,hij)
 
 end
 

From a38bf00975365cc755fc7c8c24e9e74c02cd2a00 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 8 May 2024 17:26:48 +0200
Subject: [PATCH 037/131] updated default keywords in tc_keywords and
 ao_twoe_e_ints

---
 .../cipsi_tc_bi_ortho/get_d1_transp.irp.f     | 34 ++++++++++++-------
 plugins/local/cipsi_tc_bi_ortho/pt2.irp.f     |  3 ++
 .../local/cipsi_tc_bi_ortho/selection.irp.f   |  6 ++--
 .../cipsi_tc_bi_ortho/stochastic_cipsi.irp.f  |  2 +-
 .../local/tc_bi_ortho/diagonalize_tc_h.irp.f  | 34 +++++++++----------
 plugins/local/tc_keywords/EZFIO.cfg           |  4 +--
 src/ao_two_e_ints/EZFIO.cfg                   |  6 ++--
 7 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
index 84a1ce24..a3d7b076 100644
--- a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
@@ -16,7 +16,7 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
   logical                        :: ok
 
   logical, allocatable           :: lbanned(:,:)
-  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j
+  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j, istate
   integer                        :: hfix, pfix, h1, h2, p1, p2, ib, k, l, mm
 
   integer, parameter             :: turn2(2) = (/2,1/)
@@ -65,10 +65,12 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
        hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
        hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2)
        hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1)
+       do istate = 1,N_states
+        tmp_rowij(istate,mm)  = 0.d0
+        tmp_rowji(istate,mm)  = 0.d0
+       enddo
       enddo
       !! <alpha|H|psi>
-      tmp_rowij = 0.d0
-      tmp_rowji = 0.d0
       do putj=1, hfix-1
         if(lbanned(putj, ma)) cycle
         if(banned(putj, puti,bant)) cycle
@@ -119,13 +121,15 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
 
     !MOVE MI
     pfix = p(1,mi)
-    tmp_rowij  = 0.d0
-    tmp_rowij2 = 0.d0
-    tmp_rowji  = 0.d0
-    tmp_rowji2 = 0.d0
 !    call get_mo_two_e_integrals_complex(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
 !    call get_mo_two_e_integrals_complex(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
     do mm = 1, mo_num
+     do istate = 1,N_states
+      tmp_rowij(istate,mm)  = 0.d0
+      tmp_rowij2(istate,mm) = 0.d0
+      tmp_rowji(istate,mm)  = 0.d0
+      tmp_rowji2(istate,mm) = 0.d0
+     enddo
      hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p1)
      hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p2)
      hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p1)
@@ -200,10 +204,12 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
          hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
          hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2)
          hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1)
+         do istate = 1, N_states
+          tmp_rowij(istate,mm) = 0.d0
+          tmp_rowji(istate,mm) = 0.d0
+         enddo
         enddo
     !! <alpha|H|psi>
-        tmp_rowij = 0.d0
-        tmp_rowji = 0.d0
         do putj=1,hfix-1
           if(banned(putj,puti,1)) cycle
           if(lbanned(putj,ma)) cycle
@@ -246,10 +252,6 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
       pfix = p(1,mi)
       p1 = p(1,ma)
       p2 = p(2,ma)
-      tmp_rowij =  0.d0
-      tmp_rowij2 = 0.d0
-      tmp_rowji =  0.d0
-      tmp_rowji2 = 0.d0
 !      call get_mo_two_e_integrals_complex(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
 !      call get_mo_two_e_integrals_complex(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
       do mm = 1, mo_num
@@ -257,6 +259,12 @@ subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask,
        hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,pfix)
        hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,pfix)
        hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,pfix)
+       do istate = 1,N_states
+        tmp_rowij (istate,mm) =  0.d0
+        tmp_rowij2(istate,mm) = 0.d0
+        tmp_rowji (istate,mm) =  0.d0
+        tmp_rowji2(istate,mm) = 0.d0
+       enddo
       enddo
       putj = p2
     !! <alpha|H|psi>
diff --git a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
index ada19c6b..22381991 100644
--- a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
@@ -65,6 +65,9 @@ subroutine tc_pt2
   call pt2_dealloc(pt2_data_err)
   call pt2_alloc(pt2_data, N_states)
   call pt2_alloc(pt2_data_err, N_states)
+  if(transpose_two_e_int)then
+   provide mo_bi_ortho_tc_two_e_transp tc_2e_3idx_coulomb_integrals_transp
+  endif
   call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
   call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
   call print_summary_tc(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2)
diff --git a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
index 17d34f43..72ccf9c4 100644
--- a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
@@ -691,19 +691,19 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
           if(transpose_two_e_int)then
            call get_d2_new_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
           else
-           call get_d2_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+           call get_d2_new       (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
           endif
         elseif(nt == 3) then
           if(transpose_two_e_int)then
            call get_d1_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
           else
-           call get_d1_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+           call get_d1_new   (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
           endif
         else
          if(transpose_two_e_int)then
           call get_d0_transp (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
          else
-          call get_d0_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          call get_d0_new    (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
          endif
         endif
     elseif(nt == 4) then
diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
index bb5a89a1..e363830d 100644
--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@@ -89,7 +89,7 @@ subroutine run_stochastic_cipsi
     call pt2_alloc(pt2_data, N_states)
     call pt2_alloc(pt2_data_err, N_states)
     if(transpose_two_e_int)then
-     provide mo_bi_ortho_tc_two_e_transp
+     provide mo_bi_ortho_tc_two_e_transp tc_2e_3idx_coulomb_integrals_transp
     endif
     call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
 !    stop
diff --git a/plugins/local/tc_bi_ortho/diagonalize_tc_h.irp.f b/plugins/local/tc_bi_ortho/diagonalize_tc_h.irp.f
index 398e96db..03388898 100644
--- a/plugins/local/tc_bi_ortho/diagonalize_tc_h.irp.f
+++ b/plugins/local/tc_bi_ortho/diagonalize_tc_h.irp.f
@@ -35,8 +35,8 @@ program tc_bi_ortho
   print*, ' nb of det    = ', N_det
 
   call routine_diag()
-  call write_tc_energy()
-  call save_tc_bi_ortho_wavefunction()
+!  call write_tc_energy()
+!  call save_tc_bi_ortho_wavefunction()
 
 end
 
@@ -76,28 +76,26 @@ subroutine routine_diag()
     PROVIDE noL_2e
   endif
 
-  PROVIDE htilde_matrix_elmt_bi_ortho
-  return
 
   if(N_states .eq. 1) then
 
     print*,'eigval_right_tc_bi_orth   = ',eigval_right_tc_bi_orth(1)
-    print*,'e_tc_left_right           = ',e_tc_left_right
-    print*,'e_tilde_bi_orth_00        = ',e_tilde_bi_orth_00
-    print*,'e_pt2_tc_bi_orth          = ',e_pt2_tc_bi_orth
-    print*,'e_pt2_tc_bi_orth_single   = ',e_pt2_tc_bi_orth_single
-    print*,'e_pt2_tc_bi_orth_double   = ',e_pt2_tc_bi_orth_double
-    print*,'***'                      
-    print*,'e_corr_bi_orth            = ',e_corr_bi_orth
-    print*,'e_corr_bi_orth_proj       = ',e_corr_bi_orth_proj
-    print*,'e_corr_bi_orth_proj_abs   = ',e_corr_bi_orth_proj_abs
-    print*,'e_corr_single_bi_orth     = ',e_corr_single_bi_orth
-    print*,'e_corr_double_bi_orth     = ',e_corr_double_bi_orth
-    print*,'e_corr_single_bi_orth_abs = ',e_corr_single_bi_orth_abs
-    print*,'e_corr_double_bi_orth_abs = ',e_corr_double_bi_orth_abs
+!    print*,'e_tc_left_right           = ',e_tc_left_right
+!    print*,'e_tilde_bi_orth_00        = ',e_tilde_bi_orth_00
+!    print*,'e_pt2_tc_bi_orth          = ',e_pt2_tc_bi_orth
+!    print*,'e_pt2_tc_bi_orth_single   = ',e_pt2_tc_bi_orth_single
+!    print*,'e_pt2_tc_bi_orth_double   = ',e_pt2_tc_bi_orth_double
+!    print*,'***'                      
+!    print*,'e_corr_bi_orth            = ',e_corr_bi_orth
+!    print*,'e_corr_bi_orth_proj       = ',e_corr_bi_orth_proj
+!    print*,'e_corr_bi_orth_proj_abs   = ',e_corr_bi_orth_proj_abs
+!    print*,'e_corr_single_bi_orth     = ',e_corr_single_bi_orth
+!    print*,'e_corr_double_bi_orth     = ',e_corr_double_bi_orth
+!    print*,'e_corr_single_bi_orth_abs = ',e_corr_single_bi_orth_abs
+!    print*,'e_corr_double_bi_orth_abs = ',e_corr_double_bi_orth_abs
     print*,'Left/right eigenvectors'
     do i = 1,N_det
-      write(*,'(I5,X,(100(F12.7,X)))')i,leigvec_tc_bi_orth(i,1),reigvec_tc_bi_orth(i,1),leigvec_tc_bi_orth(i,1)*reigvec_tc_bi_orth(i,1)
+      write(*,'(I6,X,(100(F12.7,X)))')i,leigvec_tc_bi_orth(i,1),reigvec_tc_bi_orth(i,1),leigvec_tc_bi_orth(i,1)*reigvec_tc_bi_orth(i,1)
     enddo
 
   else
diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index 39968ec8..d764224a 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -14,7 +14,7 @@ default: False
 type: logical
 doc: If |true|, three-body terms are included
 interface: ezfio,provider,ocaml
-default: True
+default: False
 
 [three_e_3_idx_term]
 type: logical
@@ -50,7 +50,7 @@ default: False
 type: logical
 doc: If |true|, standard normal-ordering for L (to be used with three_body_h_tc |false|)
 interface: ezfio,provider,ocaml
-default: False
+default: True
 
 [core_tc_op]
 type: logical
diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg
index ff932b0c..c2e083a3 100644
--- a/src/ao_two_e_ints/EZFIO.cfg
+++ b/src/ao_two_e_ints/EZFIO.cfg
@@ -25,16 +25,16 @@ default: 1.e-12
 
 [do_direct_integrals]
 type: logical
-doc: Compute integrals on the fly (very slow, only for debugging)
+doc: Compute integrals on the fly (Useful only for Cholesky decomposition)
 interface: ezfio,provider,ocaml
-default: False
+default: True
 ezfio_name: direct
 
 [do_ao_cholesky]
 type: logical
 doc: Perform Cholesky decomposition of AO integrals
 interface: ezfio,provider,ocaml
-default: False
+default: True
 
 [io_ao_two_e_integrals_erf]
 type: Disk_access

From 2af293fd291481896cd5114df6f653ca0f04f797 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 8 May 2024 17:38:54 +0200
Subject: [PATCH 038/131] minor modifs in BH jastrows

---
 .../local/non_h_ints_mu/jast_deriv_utils_vect.irp.f | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
index db06e835..09bb6528 100644
--- a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
@@ -340,8 +340,8 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
           endif
 
           tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
-          tmp1 = tmp1 * g12_power(opA)
-          tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA))
+          tmp1 = tmp1 * g12_power(opA) * tmp
+          tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
 
           !tmp1 = 0.d0
           !if(mpA .gt. 0) then
@@ -356,9 +356,12 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
           !  tmp2 = tmp2 + dble(opA) * g12**dble(opA-1) * (f1A**dble(mpA) * f2A**dble(npA) + f1A**dble(npA) * f2A**dble(mpA))
           !endif
 
-          gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1))
-          grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2))
-          gradz(jpoint) = gradz(jpoint) + tmp * (tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3))
+!          gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1))
+!          grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2))
+!          gradz(jpoint) = gradz(jpoint) + tmp * (tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3))
+          gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+          grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+          gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
         enddo ! p
       enddo ! i_nucl
     enddo ! jpoint

From 812e75982b96959485b8f7ccd333ef3a1b1b570b Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 10 May 2024 17:23:51 +0200
Subject: [PATCH 039/131] minor modifs in
 plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f

---
 .../non_h_ints_mu/jast_deriv_utils_vect.irp.f   | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
index 09bb6528..2c41b535 100644
--- a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
@@ -867,19 +867,20 @@ subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
               + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
               + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
 
-  tmp1 = 1.d0 / (1.d0 + alpha * dist)
 
-  fct = alpha * dist * tmp1
-
-  if(dist .lt. 1d-10) then
-    grad1_fct(1) = 0.d0
-    grad1_fct(2) = 0.d0
-    grad1_fct(3) = 0.d0
-  else
+  if(dist .ge. 1d-10) then
+    tmp1 = 1.d0 / (1.d0 + alpha * dist)
+    
+    fct = alpha * dist * tmp1
     tmp2 = alpha * tmp1 * tmp1 / dist
     grad1_fct(1) = tmp2 * (r1(1) - r2(1))
     grad1_fct(2) = tmp2 * (r1(2) - r2(2))
     grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+  else
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
   endif
 
   return

From 6e2f28b97fbf3c961c9461689a8db9adf81bd6f8 Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Sat, 11 May 2024 10:27:03 +0200
Subject: [PATCH 040/131] COLLAPSE(4) -> COLLAPSE(3)

---
 plugins/local/non_h_ints_mu/deb_aos.irp.f      | 6 +++---
 plugins/local/non_h_ints_mu/total_tc_int.irp.f | 9 ++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/plugins/local/non_h_ints_mu/deb_aos.irp.f b/plugins/local/non_h_ints_mu/deb_aos.irp.f
index 4012f47c..70604f54 100644
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@@ -31,6 +31,9 @@ subroutine print_aos()
   integer          :: i, ipoint
   double precision :: r(3)
   double precision :: ao_val, ao_der(3), ao_lap
+  double precision :: accu_vgl(5)
+  double precision :: accu_vgl_nrm(5)
+
   double precision :: mo_val, mo_der(3), mo_lap
 
   PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array
@@ -40,9 +43,6 @@ subroutine print_aos()
     write(1000, '(3(f15.7, 3X))') r
   enddo
   
-double precision :: accu_vgl(5)
-double precision :: accu_vgl_nrm(5)
-
   do ipoint = 1, n_points_final_grid
     do i = 1, ao_num
       ao_val    = aos_in_r_array     (i,ipoint)
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index a1bbd6e0..656f5f16 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -78,7 +78,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
       !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
       !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
       do i = 1, ao_num
         do k = 1, ao_num
           do l = 1, ao_num
@@ -188,7 +188,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
         !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot,    &
         !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
         !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
-        !$OMP DO COLLAPSE(4)
+        !$OMP DO COLLAPSE(3)
         do i = 1, ao_num
           do k = 1, ao_num
             do l = 1, ao_num
@@ -270,7 +270,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP PARALLEL DEFAULT(NONE)                     &
       !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
       !$OMP SHARED(ao_num, ao_two_e_tc_tot)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
       do j = 1, ao_num
         do l = 1, ao_num
           do i = 1, ao_num
@@ -293,7 +293,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP PARALLEL DEFAULT(NONE)                            &
       !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
       !$OMP PRIVATE(i, j, k, l)
-      !$OMP DO COLLAPSE(4)
+      !$OMP DO COLLAPSE(3)
       do j = 1, ao_num
         do l = 1, ao_num
           do i = 1, ao_num
@@ -306,7 +306,6 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       enddo
       !$OMP END DO
       !$OMP END PARALLEL
-      !call clear_ao_map()
       FREE ao_integrals_map
     endif
 

From 8eea5d7f7f142103998d8bfa1b3bcc630935f69b Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 15 May 2024 15:41:35 +0200
Subject: [PATCH 041/131] fixed a bug in cholesk_ao_transp

---
 .../tuto_plugins/tuto_I/test_cholesky.irp.f   | 53 +++++++++++++++++++
 src/ao_two_e_ints/cholesky.irp.f              |  2 +-
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f

diff --git a/plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f b/plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f
new file mode 100644
index 00000000..d09d100a
--- /dev/null
+++ b/plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f
@@ -0,0 +1,53 @@
+program my_program_to_print_stuffs
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+ integer :: i,j,k,l,m
+ double precision :: integral, accu, accu_tot, integral_cholesky
+ double precision :: get_ao_two_e_integral, get_two_e_integral ! declaration of the functions 
+ print*,'AO integrals, physicist notations : <i j|k l>'
+ accu_tot = 0.D0
+ do i = 1, ao_num
+  do j = 1, ao_num
+   do k = 1, ao_num
+    do l = 1, ao_num
+     integral = get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+     integral_cholesky = 0.D0
+     do m = 1, cholesky_ao_num
+      integral_cholesky += cholesky_ao_transp(m,i,k) * cholesky_ao_transp(m,j,l)
+     enddo
+     accu = dabs(integral_cholesky-integral)
+     accu_tot += accu
+     if(accu.gt.1.d-10)then
+      print*,i,j,k,l
+      print*,accu, integral, integral_cholesky
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu_tot',accu_tot
+
+ print*,'MO integrals, physicist notations : <i j|k l>'
+ do i = 1, mo_num
+  do j = 1, mo_num
+   do k = 1, mo_num
+    do l = 1, mo_num
+     integral = get_two_e_integral(i, j, k, l, mo_integrals_map)
+     accu = 0.D0
+     integral_cholesky = 0.D0
+     do m = 1, cholesky_mo_num
+      integral_cholesky += cholesky_mo_transp(m,i,k) * cholesky_mo_transp(m,j,l)
+     enddo
+     accu = dabs(integral_cholesky-integral)
+     accu_tot += accu
+     if(accu.gt.1.d-10)then
+      print*,i,j,k,l
+      print*,accu, integral, integral_cholesky
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+end
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 33304026..5fbd166c 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -6,7 +6,7 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num,
  integer :: i,j,k
  do j=1,ao_num
   do i=1,ao_num
-   do k=1,ao_num
+   do k=1,cholesky_ao_num
     cholesky_ao_transp(k,i,j) = cholesky_ao(i,j,k)
    enddo
   enddo

From c6a61639445229eca3ecb2e32556ddef646064d6 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 16 May 2024 17:57:00 +0200
Subject: [PATCH 042/131] added f_hf with cholesky by default

---
 src/dft_utils_in_r/mo_in_r.irp.f     |   2 +-
 src/mu_of_r/f_cholesky.irp.f         | 221 +++++++++++++++++++++++++++
 src/mu_of_r/mu_of_r_conditions.irp.f |  46 +++++-
 3 files changed, 264 insertions(+), 5 deletions(-)
 create mode 100644 src/mu_of_r/f_cholesky.irp.f

diff --git a/src/dft_utils_in_r/mo_in_r.irp.f b/src/dft_utils_in_r/mo_in_r.irp.f
index 192cb25a..ad931402 100644
--- a/src/dft_utils_in_r/mo_in_r.irp.f
+++ b/src/dft_utils_in_r/mo_in_r.irp.f
@@ -48,7 +48,7 @@
  integer :: i,j
  do i = 1, n_points_final_grid
   do j = 1, mo_num
-   mos_in_r_array_transp(i,j) = mos_in_r_array(j,i) 
+   mos_in_r_array_transp(i,j) = mos_in_r_array_omp(j,i) 
   enddo
  enddo
  END_PROVIDER
diff --git a/src/mu_of_r/f_cholesky.irp.f b/src/mu_of_r/f_cholesky.irp.f
new file mode 100644
index 00000000..1ad4ce36
--- /dev/null
+++ b/src/mu_of_r/f_cholesky.irp.f
@@ -0,0 +1,221 @@
+BEGIN_PROVIDER [integer, list_couple_orb_r1, (2,n_couple_orb_r1)]
+ implicit none
+ integer :: ii,i,mm,m,itmp
+ itmp = 0
+  do ii = 1, n_occ_val_orb_for_hf(1)
+   i = list_valence_orb_for_hf(ii,1)
+   do mm = 1, n_basis_orb ! electron 1 
+    m = list_basis(mm)
+    itmp += 1
+    list_couple_orb_r1(1,itmp) = i
+    list_couple_orb_r1(2,itmp) = m
+   enddo
+  enddo
+END_PROVIDER 
+
+
+BEGIN_PROVIDER [integer, list_couple_orb_r2, (2,n_couple_orb_r2)]
+ implicit none
+ integer :: ii,i,mm,m,itmp
+ itmp = 0
+  do ii = 1, n_occ_val_orb_for_hf(2)
+   i = list_valence_orb_for_hf(ii,2)
+   do mm = 1, n_basis_orb ! electron 1 
+    m = list_basis(mm)
+    itmp += 1
+    list_couple_orb_r2(1,itmp) = i
+    list_couple_orb_r2(2,itmp) = m
+   enddo
+  enddo
+END_PROVIDER 
+
+
+BEGIN_PROVIDER [integer, n_couple_orb_r1] 
+ implicit none
+ BEGIN_DOC
+ ! number of couples of alpha occupied times any basis orbital
+ END_DOC
+ n_couple_orb_r1 = n_occ_val_orb_for_hf(1) * n_basis_orb
+END_PROVIDER 
+
+BEGIN_PROVIDER [integer, n_couple_orb_r2] 
+ implicit none
+ BEGIN_DOC
+ ! number of couples of beta occupied times any basis orbital
+ END_DOC
+ n_couple_orb_r2 = n_occ_val_orb_for_hf(2) * n_basis_orb
+END_PROVIDER 
+
+BEGIN_PROVIDER [ double precision, mos_times_cholesky_r1, (cholesky_mo_num,n_points_final_grid)]
+ implicit none
+ BEGIN_DOC
+ ! V1_AR = \sum_{I}V_AI Phi_IR where "R" specifies the index of the grid point and A the number of cholesky point
+ ! 
+ ! here Phi_IR is phi_i(R)xphi_b(R) for r1 and V_AI = (ib|A) chollesky vector
+ END_DOC
+ double precision, allocatable :: mos_ib_r1(:,:),mo_chol_r1(:,:)
+ double precision, allocatable :: test(:,:)
+ double precision :: mo_i_r1,mo_b_r1
+ integer :: ii,i,mm,m,itmp,ipoint,ll
+ allocate(mos_ib_r1(n_couple_orb_r1,n_points_final_grid))
+ allocate(mo_chol_r1(cholesky_mo_num,n_couple_orb_r1))
+
+ do ipoint = 1, n_points_final_grid
+  itmp = 0
+  do ii = 1, n_occ_val_orb_for_hf(1)
+   i = list_valence_orb_for_hf(ii,1)
+   mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+   do mm = 1, n_basis_orb ! electron 1 
+    m = list_basis(mm)
+    mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+    itmp += 1
+    mos_ib_r1(itmp,ipoint) = mo_i_r1 * mo_b_r1
+   enddo
+  enddo
+ enddo
+
+ itmp = 0
+ do ii = 1, n_occ_val_orb_for_hf(1)
+  i = list_valence_orb_for_hf(ii,1)
+  do mm = 1, n_basis_orb ! electron 1 
+   m = list_basis(mm)
+   itmp += 1
+   do ll = 1, cholesky_mo_num
+    mo_chol_r1(ll,itmp) = cholesky_mo_transp(ll,m,i)
+   enddo
+   enddo
+  enddo
+
+ call get_AB_prod(mo_chol_r1,cholesky_mo_num,n_couple_orb_r1,mos_ib_r1,n_points_final_grid,mos_times_cholesky_r1)
+ allocate(test(cholesky_mo_num,n_points_final_grid))
+ test = 0.d0
+ do ipoint = 1, n_points_final_grid
+  do itmp = 1, n_couple_orb_r1
+    i = list_couple_orb_r1(1,itmp)
+    m = list_couple_orb_r1(2,itmp) 
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+    do mm = 1, cholesky_mo_num
+     test(mm,ipoint) += mo_i_r1 * mo_b_r1 * mo_chol_r1(mm,itmp)
+    enddo
+   enddo
+  enddo
+ double precision :: accu
+ accu = 0.d0
+ do ipoint = 1, n_points_final_grid
+  do mm = 1, cholesky_mo_num
+   accu += dabs(mos_times_cholesky_r1(mm,ipoint) - test(mm,ipoint) )
+   if(dabs(mos_times_cholesky_r1(mm,ipoint) - test(mm,ipoint)).gt.1.d-10)then
+    print*,'problem ! ',dabs(mos_times_cholesky_r1(mm,ipoint) - test(mm,ipoint)) & 
+                       ,     mos_times_cholesky_r1(mm,ipoint) , test(mm,ipoint) 
+   endif
+  enddo
+ enddo
+ print*,'accu = ',accu
+   
+
+END_PROVIDER 
+
+BEGIN_PROVIDER [ double precision, mos_times_cholesky_r2, (cholesky_mo_num,n_points_final_grid)]
+ implicit none
+ BEGIN_DOC
+ ! V1_AR = \sum_{I}V_AI Phi_IR where "R" specifies the index of the grid point and A the number of cholesky point
+ ! 
+ ! here Phi_IR is phi_i(R)xphi_b(R) for r2 and V_AI = (ib|A) chollesky vector
+ END_DOC
+ double precision, allocatable :: mos_ib_r2(:,:),mo_chol_r2(:,:)
+ double precision, allocatable :: test(:,:)
+ double precision :: mo_i_r2,mo_b_r2
+ integer :: ii,i,mm,m,itmp,ipoint,ll
+ allocate(mos_ib_r2(n_couple_orb_r2,n_points_final_grid))
+ allocate(mo_chol_r2(cholesky_mo_num,n_couple_orb_r2))
+
+ do ipoint = 1, n_points_final_grid
+  itmp = 0
+  do ii = 1, n_occ_val_orb_for_hf(2)
+   i = list_valence_orb_for_hf(ii,2)
+   mo_i_r2 = mos_in_r_array_omp(i,ipoint)
+   do mm = 1, n_basis_orb ! electron 1 
+    m = list_basis(mm)
+    mo_b_r2 = mos_in_r_array_omp(m,ipoint)
+    itmp += 1
+    mos_ib_r2(itmp,ipoint) = mo_i_r2 * mo_b_r2
+   enddo
+  enddo
+ enddo
+
+ itmp = 0
+ do ii = 1, n_occ_val_orb_for_hf(2)
+  i = list_valence_orb_for_hf(ii,2)
+  do mm = 1, n_basis_orb ! electron 1 
+   m = list_basis(mm)
+   itmp += 1
+   do ll = 1, cholesky_mo_num
+    mo_chol_r2(ll,itmp) = cholesky_mo_transp(ll,m,i)
+   enddo
+   enddo
+  enddo
+
+ call get_AB_prod(mo_chol_r2,cholesky_mo_num,n_couple_orb_r2,mos_ib_r2,n_points_final_grid,mos_times_cholesky_r2)
+ allocate(test(cholesky_mo_num,n_points_final_grid))
+ test = 0.d0
+ do ipoint = 1, n_points_final_grid
+  do itmp = 1, n_couple_orb_r2
+    i = list_couple_orb_r2(1,itmp)
+    m = list_couple_orb_r2(2,itmp) 
+    mo_i_r2 = mos_in_r_array_omp(i,ipoint)
+    mo_b_r2 = mos_in_r_array_omp(m,ipoint)
+    do mm = 1, cholesky_mo_num
+     test(mm,ipoint) += mo_i_r2 * mo_b_r2 * mo_chol_r2(mm,itmp)
+    enddo
+   enddo
+  enddo
+ double precision :: accu
+ accu = 0.d0
+ do ipoint = 1, n_points_final_grid
+  do mm = 1, cholesky_mo_num
+   accu += dabs(mos_times_cholesky_r2(mm,ipoint) - test(mm,ipoint) )
+   if(dabs(mos_times_cholesky_r2(mm,ipoint) - test(mm,ipoint)).gt.1.d-10)then
+    print*,'problem ! ',dabs(mos_times_cholesky_r2(mm,ipoint) - test(mm,ipoint)) & 
+                       ,     mos_times_cholesky_r2(mm,ipoint) , test(mm,ipoint) 
+   endif
+  enddo
+ enddo
+ print*,'accu = ',accu
+
+END_PROVIDER 
+
+
+BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint
+ !!f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
+ !!     =  \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
+ !!     =  \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
+ !!     =  \sum_A V_AR G_AR 
+ !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
+ double precision :: u_dot_v
+ do ipoint = 1, n_points_final_grid
+  f_hf_cholesky(ipoint) = 2.D0 * u_dot_v(mos_times_cholesky_r2(1,ipoint),mos_times_cholesky_r1(1,ipoint),cholesky_mo_num)
+ enddo
+END_PROVIDER 
+
+BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint,i,ii
+ double precision :: dm_a, dm_b
+ do ipoint = 1, n_points_final_grid
+  dm_a = 0.d0
+  do ii = 1, n_occ_val_orb_for_hf(1)
+   i = list_valence_orb_for_hf(ii,1)
+   dm_a += mos_in_r_array_omp(i,ipoint)*mos_in_r_array_omp(i,ipoint)
+  enddo
+  dm_b = 0.d0
+  do ii = 1, n_occ_val_orb_for_hf(2)
+   i = list_valence_orb_for_hf(ii,2)
+   dm_b += mos_in_r_array_omp(i,ipoint)*mos_in_r_array_omp(i,ipoint)
+  enddo
+   on_top_hf_grid(ipoint) = 2.D0 * dm_a*dm_b
+ enddo
+END_PROVIDER 
+
diff --git a/src/mu_of_r/mu_of_r_conditions.irp.f b/src/mu_of_r/mu_of_r_conditions.irp.f
index 6b49b9df..5b4d4b83 100644
--- a/src/mu_of_r/mu_of_r_conditions.irp.f
+++ b/src/mu_of_r/mu_of_r_conditions.irp.f
@@ -61,7 +61,7 @@
  END_DOC
  integer :: ipoint
  double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi
- PROVIDE mo_two_e_integrals_in_map mo_integrals_map big_array_exchange_integrals 
+ PROVIDE f_hf_cholesky on_top_hf_grid
  print*,'providing mu_of_r_hf ...'
  call wall_time(wall0)
  sqpi = dsqrt(dacos(-1.d0))
@@ -69,10 +69,10 @@
  !$OMP PARALLEL DO &
  !$OMP DEFAULT (NONE)  &
  !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & 
- !$OMP ShARED (n_points_final_grid,mu_of_r_hf,f_psi_hf_ab,on_top_hf_mu_r,sqpi) 
+ !$OMP ShARED (n_points_final_grid,mu_of_r_hf,f_hf_cholesky,on_top_hf_grid,sqpi) 
  do ipoint = 1, n_points_final_grid
-  f_hf   = f_psi_hf_ab(ipoint)
-  on_top = on_top_hf_mu_r(ipoint)
+  f_hf   = f_hf_cholesky(ipoint)
+  on_top = on_top_hf_grid(ipoint)
   if(on_top.le.1.d-12.or.f_hf.le.0.d0.or.f_hf * on_top.lt.0.d0)then
     w_hf   = 1.d+10
   else 
@@ -85,6 +85,44 @@
  print*,'Time to provide mu_of_r_hf = ',wall1-wall0
  END_PROVIDER 
 
+ BEGIN_PROVIDER [double precision, mu_of_r_hf_old, (n_points_final_grid) ]
+ implicit none 
+ BEGIN_DOC
+ ! mu(r) computed with a HF wave function (assumes that HF MOs are stored in the EZFIO)
+ !
+ ! corresponds to Eq. (37) of J. Chem. Phys. 149, 194301 (2018) but for \Psi^B = HF^B
+ !
+ ! !!!!!! WARNING !!!!!! if no_core_density == .True. then all contributions from the core orbitals 
+ !
+ ! in the two-body density matrix are excluded
+ END_DOC
+ integer :: ipoint
+ double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi
+ PROVIDE mo_two_e_integrals_in_map mo_integrals_map big_array_exchange_integrals 
+ print*,'providing mu_of_r_hf_old ...'
+ call wall_time(wall0)
+ sqpi = dsqrt(dacos(-1.d0))
+ provide f_psi_hf_ab 
+ !$OMP PARALLEL DO &
+ !$OMP DEFAULT (NONE)  &
+ !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & 
+ !$OMP ShARED (n_points_final_grid,mu_of_r_hf_old,f_psi_hf_ab,on_top_hf_mu_r,sqpi) 
+ do ipoint = 1, n_points_final_grid
+  f_hf   = f_psi_hf_ab(ipoint)
+  on_top = on_top_hf_mu_r(ipoint)
+  if(on_top.le.1.d-12.or.f_hf.le.0.d0.or.f_hf * on_top.lt.0.d0)then
+    w_hf   = 1.d+10
+  else 
+    w_hf  = f_hf /  on_top
+  endif
+  mu_of_r_hf_old(ipoint) =  w_hf * sqpi * 0.5d0
+ enddo
+ !$OMP END PARALLEL DO
+ call wall_time(wall1)
+ print*,'Time to provide mu_of_r_hf_old = ',wall1-wall0
+ END_PROVIDER 
+
+
  BEGIN_PROVIDER [double precision, mu_of_r_psi_cas, (n_points_final_grid,N_states) ]
  implicit none 
  BEGIN_DOC

From ce042fbd787a21a600830596fa3caa5f7aa2cdb1 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 21 May 2024 12:01:28 +0200
Subject: [PATCH 043/131] basis set correction with cholesky works for hf

---
 .../local/basis_correction/51.basis_c.bats    |   8 --
 .../{01.convert.bats => convert_bats_old}     |   0
 src/hartree_fock/10.hf.bats                   |  13 --
 src/mu_of_r/basis_def.irp.f                   |  45 +++++++
 .../{f_cholesky.irp.f => f_hf_cholesky.irp.f} | 121 +++++++++---------
 5 files changed, 104 insertions(+), 83 deletions(-)
 rename src/ezfio_files/{01.convert.bats => convert_bats_old} (100%)
 rename src/mu_of_r/{f_cholesky.irp.f => f_hf_cholesky.irp.f} (67%)

diff --git a/plugins/local/basis_correction/51.basis_c.bats b/plugins/local/basis_correction/51.basis_c.bats
index 914b482b..1e20bae3 100644
--- a/plugins/local/basis_correction/51.basis_c.bats
+++ b/plugins/local/basis_correction/51.basis_c.bats
@@ -37,14 +37,6 @@ function run_sd() {
   eq $energy1 $1 $thresh
 }
 
-@test "O2 CAS" {  
-  qp set_file o2_cas.gms.ezfio
-  qp set_mo_class -c "[1-2]" -a "[3-10]" -d "[11-46]"
-  run -149.72435425 3.e-4 10000
-  qp set_mo_class -c "[1-2]" -a "[3-10]" -v "[11-46]"
-  run_md -0.1160222327 1.e-6 
-}
-
 
 @test "LiF RHF" {  
   qp set_file lif.ezfio 
diff --git a/src/ezfio_files/01.convert.bats b/src/ezfio_files/convert_bats_old
similarity index 100%
rename from src/ezfio_files/01.convert.bats
rename to src/ezfio_files/convert_bats_old
diff --git a/src/hartree_fock/10.hf.bats b/src/hartree_fock/10.hf.bats
index b496a089..214dfa86 100644
--- a/src/hartree_fock/10.hf.bats
+++ b/src/hartree_fock/10.hf.bats
@@ -115,9 +115,6 @@ rm -rf $EZFIO
   run hco.ezfio -113.1841002944744
 }
 
-@test "HBO" { # 0.805600 1.4543s
-  run  hbo.ezfio  -100.018582259096
-}
 
 @test "H2S" { # 1.655600 4.21402s
   run h2s.ezfio -398.6944130421982
@@ -127,9 +124,6 @@ rm -rf $EZFIO
   run h3coh.ezfio  -114.9865030596373
 }
 
-@test "H2O" { # 1.811100 1.84387s
-  run  h2o.ezfio  -0.760270218692179E+02
-}
 
 @test "H2O2" { # 2.217000 8.50267s
   run h2o2.ezfio -150.7806608469964
@@ -187,13 +181,6 @@ rm -rf $EZFIO
   run oh.ezfio -75.42025413469165
 }
 
-@test "[Cu(NH3)4]2+" { # 59.610100 4.18766m
-  [[ -n $TRAVIS ]] && skip
-  qp set_file cu_nh3_4_2plus.ezfio
-  qp set scf_utils thresh_scf 1.e-10
-  run  cu_nh3_4_2plus.ezfio -1862.97590358903
-}
-
 @test "SO2" { # 71.894900  3.22567m
   [[ -n $TRAVIS ]] && skip
   run so2.ezfio -41.55800401346361
diff --git a/src/mu_of_r/basis_def.irp.f b/src/mu_of_r/basis_def.irp.f
index fff9f581..e433f4d8 100644
--- a/src/mu_of_r/basis_def.irp.f
+++ b/src/mu_of_r/basis_def.irp.f
@@ -114,3 +114,48 @@ BEGIN_PROVIDER [double precision, basis_mos_in_r_array, (n_basis_orb,n_points_fi
   enddo
  enddo
 END_PROVIDER 
+
+! BEGIN_PROVIDER [integer, n_docc_val_orb_for_cas]
+!&BEGIN_PROVIDER [integer, n_max_docc_val_orb_for_cas]
+! implicit none
+! BEGIN_DOC
+! ! Number of DOUBLY OCCUPIED VALENCE ORBITALS for the CAS wave function
+! !
+! ! This determines the size of the space \mathcal{A} of Eqs. (15-16) of Phys.Chem.Lett.2019, 10, 2931   2937
+! END_DOC
+! integer :: i
+! n_docc_val_orb_for_cas = 0
+! ! You browse the BETA  ELECTRONS and check if its not a CORE ORBITAL 
+! do i = 1, elec_beta_num
+!  if(  trim(mo_class(i))=="Inactive" & 
+!  .or. trim(mo_class(i))=="Active"   &
+!  .or. trim(mo_class(i))=="Virtual" )then
+!   n_docc_val_orb_for_cas +=1 
+!  endif
+! enddo
+! n_max_docc_val_orb_for_cas = maxval(n_docc_val_orb_for_cas)
+!
+!END_PROVIDER 
+!
+!BEGIN_PROVIDER [integer, list_doc_valence_orb_for_cas, (n_max_docc_val_orb_for_cas)]
+! implicit none
+! BEGIN_DOC
+! ! List of OCCUPIED valence orbitals for each spin to build the f_{HF}(r_1,r_2) function 
+! !
+! ! This corresponds to ALL OCCUPIED orbitals in the HF wave function, except those defined as "core" 
+! !
+! ! This determines the space \mathcal{A} of Eqs. (15-16) of Phys.Chem.Lett.2019, 10, 2931   2937
+! END_DOC
+! j = 0
+! ! You browse the BETA  ELECTRONS and check if its not a CORE ORBITAL 
+! do i = 1, elec_beta_num
+!  if(  trim(mo_class(i))=="Inactive" & 
+!  .or. trim(mo_class(i))=="Active"   &
+!  .or. trim(mo_class(i))=="Virtual" )then
+!   j +=1 
+!   list_doc_valence_orb_for_cas(j) = i
+!  endif
+! enddo
+!
+!END_PROVIDER 
+
diff --git a/src/mu_of_r/f_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
similarity index 67%
rename from src/mu_of_r/f_cholesky.irp.f
rename to src/mu_of_r/f_hf_cholesky.irp.f
index 1ad4ce36..84097f09 100644
--- a/src/mu_of_r/f_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -1,4 +1,4 @@
-BEGIN_PROVIDER [integer, list_couple_orb_r1, (2,n_couple_orb_r1)]
+BEGIN_PROVIDER [integer, list_couple_hf_orb_r1, (2,n_couple_orb_r1)]
  implicit none
  integer :: ii,i,mm,m,itmp
  itmp = 0
@@ -7,14 +7,14 @@ BEGIN_PROVIDER [integer, list_couple_orb_r1, (2,n_couple_orb_r1)]
    do mm = 1, n_basis_orb ! electron 1 
     m = list_basis(mm)
     itmp += 1
-    list_couple_orb_r1(1,itmp) = i
-    list_couple_orb_r1(2,itmp) = m
+    list_couple_hf_orb_r1(1,itmp) = i
+    list_couple_hf_orb_r1(2,itmp) = m
    enddo
   enddo
 END_PROVIDER 
 
 
-BEGIN_PROVIDER [integer, list_couple_orb_r2, (2,n_couple_orb_r2)]
+BEGIN_PROVIDER [integer, list_couple_hf_orb_r2, (2,n_couple_orb_r2)]
  implicit none
  integer :: ii,i,mm,m,itmp
  itmp = 0
@@ -23,8 +23,8 @@ BEGIN_PROVIDER [integer, list_couple_orb_r2, (2,n_couple_orb_r2)]
    do mm = 1, n_basis_orb ! electron 1 
     m = list_basis(mm)
     itmp += 1
-    list_couple_orb_r2(1,itmp) = i
-    list_couple_orb_r2(2,itmp) = m
+    list_couple_hf_orb_r2(1,itmp) = i
+    list_couple_hf_orb_r2(2,itmp) = m
    enddo
   enddo
 END_PROVIDER 
@@ -87,31 +87,6 @@ BEGIN_PROVIDER [ double precision, mos_times_cholesky_r1, (cholesky_mo_num,n_poi
   enddo
 
  call get_AB_prod(mo_chol_r1,cholesky_mo_num,n_couple_orb_r1,mos_ib_r1,n_points_final_grid,mos_times_cholesky_r1)
- allocate(test(cholesky_mo_num,n_points_final_grid))
- test = 0.d0
- do ipoint = 1, n_points_final_grid
-  do itmp = 1, n_couple_orb_r1
-    i = list_couple_orb_r1(1,itmp)
-    m = list_couple_orb_r1(2,itmp) 
-    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
-    mo_b_r1 = mos_in_r_array_omp(m,ipoint)
-    do mm = 1, cholesky_mo_num
-     test(mm,ipoint) += mo_i_r1 * mo_b_r1 * mo_chol_r1(mm,itmp)
-    enddo
-   enddo
-  enddo
- double precision :: accu
- accu = 0.d0
- do ipoint = 1, n_points_final_grid
-  do mm = 1, cholesky_mo_num
-   accu += dabs(mos_times_cholesky_r1(mm,ipoint) - test(mm,ipoint) )
-   if(dabs(mos_times_cholesky_r1(mm,ipoint) - test(mm,ipoint)).gt.1.d-10)then
-    print*,'problem ! ',dabs(mos_times_cholesky_r1(mm,ipoint) - test(mm,ipoint)) & 
-                       ,     mos_times_cholesky_r1(mm,ipoint) , test(mm,ipoint) 
-   endif
-  enddo
- enddo
- print*,'accu = ',accu
    
 
 END_PROVIDER 
@@ -157,53 +132,72 @@ BEGIN_PROVIDER [ double precision, mos_times_cholesky_r2, (cholesky_mo_num,n_poi
   enddo
 
  call get_AB_prod(mo_chol_r2,cholesky_mo_num,n_couple_orb_r2,mos_ib_r2,n_points_final_grid,mos_times_cholesky_r2)
- allocate(test(cholesky_mo_num,n_points_final_grid))
- test = 0.d0
- do ipoint = 1, n_points_final_grid
-  do itmp = 1, n_couple_orb_r2
-    i = list_couple_orb_r2(1,itmp)
-    m = list_couple_orb_r2(2,itmp) 
-    mo_i_r2 = mos_in_r_array_omp(i,ipoint)
-    mo_b_r2 = mos_in_r_array_omp(m,ipoint)
-    do mm = 1, cholesky_mo_num
-     test(mm,ipoint) += mo_i_r2 * mo_b_r2 * mo_chol_r2(mm,itmp)
-    enddo
-   enddo
-  enddo
- double precision :: accu
- accu = 0.d0
- do ipoint = 1, n_points_final_grid
-  do mm = 1, cholesky_mo_num
-   accu += dabs(mos_times_cholesky_r2(mm,ipoint) - test(mm,ipoint) )
-   if(dabs(mos_times_cholesky_r2(mm,ipoint) - test(mm,ipoint)).gt.1.d-10)then
-    print*,'problem ! ',dabs(mos_times_cholesky_r2(mm,ipoint) - test(mm,ipoint)) & 
-                       ,     mos_times_cholesky_r2(mm,ipoint) , test(mm,ipoint) 
-   endif
-  enddo
- enddo
- print*,'accu = ',accu
 
 END_PROVIDER 
 
 
 BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)]
  implicit none
- integer :: ipoint
+ integer :: ipoint,m,k
  !!f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
  !!     =  \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
  !!     =  \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
  !!     =  \sum_A V_AR G_AR 
  !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
- double precision :: u_dot_v
- do ipoint = 1, n_points_final_grid
-  f_hf_cholesky(ipoint) = 2.D0 * u_dot_v(mos_times_cholesky_r2(1,ipoint),mos_times_cholesky_r1(1,ipoint),cholesky_mo_num)
- enddo
+ double precision :: u_dot_v,wall0,wall1
+ if(elec_alpha_num == elec_beta_num)then
+  provide mos_times_cholesky_r1
+  print*,'providing f_hf_cholesky ...'
+  call wall_time(wall0)
+  !$OMP PARALLEL DO &
+  !$OMP DEFAULT (NONE)  &
+  !$OMP PRIVATE (ipoint,m) & 
+  !$OMP ShARED (mos_times_cholesky_r1,cholesky_mo_num,f_hf_cholesky,n_points_final_grid) 
+   do ipoint = 1, n_points_final_grid
+    f_hf_cholesky(ipoint) = 0.d0
+    do m = 1, cholesky_mo_num
+     f_hf_cholesky(ipoint) =  f_hf_cholesky(ipoint) + &
+       mos_times_cholesky_r1(m,ipoint) * mos_times_cholesky_r1(m,ipoint)
+    enddo
+    f_hf_cholesky(ipoint) *= 2.D0
+   enddo
+  !$OMP END PARALLEL DO
+  
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky = ',wall1-wall0
+  free mos_times_cholesky_r1
+ else
+  provide mos_times_cholesky_r2 mos_times_cholesky_r1
+  !$OMP PARALLEL DO &
+  !$OMP DEFAULT (NONE)  &
+  !$OMP PRIVATE (ipoint,m) & 
+  !$OMP ShARED (mos_times_cholesky_r2,mos_times_cholesky_r1,cholesky_mo_num,f_hf_cholesky,n_points_final_grid) 
+  do ipoint = 1, n_points_final_grid
+   f_hf_cholesky(ipoint) = 0.D0
+    do m = 1, cholesky_mo_num
+     f_hf_cholesky(ipoint) =  f_hf_cholesky(ipoint) + &
+            mos_times_cholesky_r2(m,ipoint)*mos_times_cholesky_r1(m,ipoint)
+    enddo
+    f_hf_cholesky(ipoint) *= 2.D0
+  enddo
+  !$OMP END PARALLEL DO
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky = ',wall1-wall0
+  free mos_times_cholesky_r2 mos_times_cholesky_r1
+ endif
 END_PROVIDER 
 
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
  implicit none
  integer :: ipoint,i,ii
- double precision :: dm_a, dm_b
+ double precision :: dm_a, dm_b,wall0,wall1
+ print*,'providing on_top_hf_grid ...'
+ provide mos_in_r_array_omp
+ call wall_time(wall0)
+ !$OMP PARALLEL DO &
+ !$OMP DEFAULT (NONE)  &
+ !$OMP PRIVATE (ipoint,dm_a,dm_b,ii,i) & 
+ !$OMP ShARED (n_points_final_grid,n_occ_val_orb_for_hf,mos_in_r_array_omp,list_valence_orb_for_hf,on_top_hf_grid) 
  do ipoint = 1, n_points_final_grid
   dm_a = 0.d0
   do ii = 1, n_occ_val_orb_for_hf(1)
@@ -217,5 +211,8 @@ BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
   enddo
    on_top_hf_grid(ipoint) = 2.D0 * dm_a*dm_b
  enddo
+ !$OMP END PARALLEL DO
+ call wall_time(wall1)
+ print*,'Time to provide on_top_hf_grid = ',wall1-wall0
 END_PROVIDER 
 

From 112f113ccb3f363262930b53e21aed010a29f746 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 21 May 2024 12:26:30 +0200
Subject: [PATCH 044/131] fixed forgotten stuffs in normal_order_old/NEED

---
 plugins/local/normal_order_old/NEED | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/local/normal_order_old/NEED b/plugins/local/normal_order_old/NEED
index 8b137891..e8c8c478 100644
--- a/plugins/local/normal_order_old/NEED
+++ b/plugins/local/normal_order_old/NEED
@@ -1 +1 @@
-
+tc_scf

From 6fb0f2a58e803ea02a03fe01b4ce9daa6b2fba91 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 21 May 2024 12:53:55 +0200
Subject: [PATCH 045/131] modified scripts/get_fci_tc_conv.sh according to new
 printing

---
 plugins/local/tc_bi_ortho/print_tc_wf.irp.f | 2 +-
 scripts/get_fci_tc_conv.sh                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f b/plugins/local/tc_bi_ortho/print_tc_wf.irp.f
index 2b88bc5b..4d9f7c48 100644
--- a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f
+++ b/plugins/local/tc_bi_ortho/print_tc_wf.irp.f
@@ -37,7 +37,7 @@ subroutine write_l_r_wf
  integer :: i
  print*,'Writing the left-right wf'
  do i = 1, N_det
-  write(i_unit_output,*)i, psi_coef_sorted_tc(i,1)/psi_coef_sorted_tc(i,1)                       &
+  write(i_unit_output,'(I8,X,10(F16.10,X))')i, psi_coef_sorted_tc(i,1),psi_coef_sorted_tc(i,1)/psi_coef_sorted_tc(1,1)&
                          , psi_l_coef_sorted_bi_ortho_left(i)/psi_l_coef_sorted_bi_ortho_left(1) &
                          , psi_r_coef_sorted_bi_ortho_right(i)/psi_r_coef_sorted_bi_ortho_right(1)
  enddo
diff --git a/scripts/get_fci_tc_conv.sh b/scripts/get_fci_tc_conv.sh
index 643f3ac0..f0c99baf 100755
--- a/scripts/get_fci_tc_conv.sh
+++ b/scripts/get_fci_tc_conv.sh
@@ -1,2 +1,2 @@
 file=$1
-grep "Ndet,E,E+PT2,E+RPT2,|PT2|=" $file | cut -d "=" -f 2 > ${file}.conv_fci_tc
+grep "Ndet,E,E+PT2,pt2_minus,pt2_plus,pt2_abs=" $file | cut -d "=" -f 2 > ${file}.conv_fci_tc

From 3600c3c5ca92c6c62a0bbbb6cc1d01ec595e148c Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 22 May 2024 17:02:26 +0200
Subject: [PATCH 046/131] removed stupid dead link for PYSCF_EOMCC.py

---
 scripts/PYSCF_EOMCC.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 scripts/PYSCF_EOMCC.py

diff --git a/scripts/PYSCF_EOMCC.py b/scripts/PYSCF_EOMCC.py
deleted file mode 120000
index 8ad341da..00000000
--- a/scripts/PYSCF_EOMCC.py
+++ /dev/null
@@ -1 +0,0 @@
-/home_lct/eginer/qp2/plugins/qp_plugins_lct/dev/fcidump_for_vbarb/PYSCF_EOMCC.py
\ No newline at end of file

From 29da3b6542cdfeb52f9d1b7f8c23f3967018bf0f Mon Sep 17 00:00:00 2001
From: AbdAmmar <abd.ammar.phys@gmail.com>
Date: Thu, 23 May 2024 00:45:56 +0200
Subject: [PATCH 047/131] bypass one_e_tr_dm_mo for large mo_num & n_states

---
 .../multi_s_dipole_moment.irp.f               | 154 ++++++++++++++++--
 1 file changed, 140 insertions(+), 14 deletions(-)

diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f
index c7216a61..8aae3bf4 100644
--- a/src/mol_properties/multi_s_dipole_moment.irp.f
+++ b/src/mol_properties/multi_s_dipole_moment.irp.f
@@ -18,7 +18,7 @@
 
 
 
-BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
+ BEGIN_PROVIDER [double precision, multi_s_dipole_moment  , (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)]
 &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)]
@@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)]
   ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m}
   END_DOC
 
-  integer          :: istate,jstate ! States
-  integer          :: i,j           ! general spatial MOs
+  integer          :: istate, jstate ! States
+  integer          :: i, j           ! general spatial MOs
   double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z
  
   multi_s_x_dipole_moment = 0.d0
   multi_s_y_dipole_moment = 0.d0
   multi_s_z_dipole_moment = 0.d0
+
+  if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then
  
-  do jstate = 1, N_states
-    do istate = 1, N_states
- 
-      do i = 1, mo_num  
-        do j = 1, mo_num  
-          multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
-          multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) 
-          multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
-        enddo
+    do jstate = 1, N_states
+      do istate = 1, N_states
+        do i = 1, mo_num  
+          do j = 1, mo_num  
+            multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i)  
+            multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) 
+            multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) 
+          enddo
+        enddo 
       enddo
- 
     enddo
-  enddo
+
+  else
+
+    ! no enouph memory
+    ! on the fly scheme
+
+    PROVIDE psi_det_alpha_unique psi_det_beta_unique
+
+    integer           :: l, k_a, k_b
+    integer           :: occ(N_int*bit_kind_size,2)
+    integer           :: h1, h2, p1, p2, degree
+    integer           :: exc(0:2,2), n_occ(2)
+    integer           :: krow, kcol, lrow, lcol
+    integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int)
+    double precision  :: ck, ckl, phase
+
+    !$OMP PARALLEL DEFAULT(NONE)                                                      &
+    !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc,  & 
+    !$OMP         phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2)    &
+    !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det,               &
+    !$OMP        psi_bilinear_matrix_rows, psi_bilinear_matrix_columns,               &
+    !$OMP        psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, &
+    !$OMP        psi_det_alpha_unique, psi_det_beta_unique,                           &
+    !$OMP        psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values,       &
+    !$OMP        mo_dipole_x, mo_dipole_y, mo_dipole_z,                               &
+    !$OMP        multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment)
+    !$OMP DO COLLAPSE(2)
+    do istate = 1, N_states
+      do jstate = 1, N_states
+
+        do k_a = 1, N_det
+          krow = psi_bilinear_matrix_rows   (k_a)
+          kcol = psi_bilinear_matrix_columns(k_a)
+  
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+  
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate)
+          do l = 1, elec_alpha_num
+            j = occ(l,1)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
+          enddo
+  
+          if (k_a == N_det) cycle
+          l = k_a + 1
+          lrow = psi_bilinear_matrix_rows   (l)
+          lcol = psi_bilinear_matrix_columns(l)
+          ! Fix beta determinant, loop over alphas
+          do while (lcol == kcol)
+            tmp_det2(:) = psi_det_alpha_unique(:,lrow)
+            call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
+              ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_rows   (l)
+            lcol = psi_bilinear_matrix_columns(l)
+          enddo
+        enddo ! k_a
+  
+        do k_b = 1, N_det
+          krow = psi_bilinear_matrix_transp_rows   (k_b)
+          kcol = psi_bilinear_matrix_transp_columns(k_b)
+      
+          tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow)
+          tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol)
+      
+          ! Diagonal part
+          call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int)
+          ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate)
+          do l = 1, elec_beta_num
+            j = occ(l,2)
+            multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) 
+            multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) 
+            multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) 
+          enddo
+      
+          if (k_b == N_det) cycle
+          l = k_b+1
+          lrow = psi_bilinear_matrix_transp_rows   (l)
+          lcol = psi_bilinear_matrix_transp_columns(l)
+          ! Fix beta determinant, loop over alphas
+          do while (lrow == krow)
+            tmp_det2(:) = psi_det_beta_unique(:,lcol)
+            call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int)
+            if (degree == 1) then
+              exc = 0
+              call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int)
+              call decode_exc_spin(exc, h1, p1, h2, p2)
+              ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) 
+              ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase
+              multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) 
+              multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) 
+              multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) 
+            endif
+            l = l+1
+            if (l > N_det) exit
+            lrow = psi_bilinear_matrix_transp_rows   (l)
+            lcol = psi_bilinear_matrix_transp_columns(l)
+          enddo
+        enddo ! k_b
+
+      enddo ! istate
+    enddo ! jstate
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif ! memory condition
  
   ! Nuclei part
   nuclei_part_x = 0.d0

From 70f8019960140b965476e1efa88ee8e03850b0d9 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 23 May 2024 10:02:46 +0200
Subject: [PATCH 048/131] removed stupid print in tc_keywords/EZFIO.cfg

---
 plugins/local/tc_keywords/EZFIO.cfg |  1 -
 src/mu_of_r/f_hf_cholesky.irp.f     | 75 +++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg
index b7ce0b19..f3bd75c8 100644
--- a/plugins/local/tc_keywords/EZFIO.cfg
+++ b/plugins/local/tc_keywords/EZFIO.cfg
@@ -273,4 +273,3 @@ type: logical
 doc: If |true|, you duplicate the two-electron TC integrals with the transpose matrix. Acceleates the PT2.  
 interface: ezfio,provider,ocaml
 default: False
->>>>>>> 8c4183cf6e38711b097df202d1f430b76823aeff
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 84097f09..101f9cc9 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -187,6 +187,81 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)]
  endif
 END_PROVIDER 
 
+BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint,m,mm,i,ii,p
+ !!f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
+ !!     =  \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
+ !!     =  \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
+ !!     =  \sum_A V_AR G_AR 
+ !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
+ double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
+ if(elec_alpha_num == elec_beta_num)then
+  call wall_time(wall0)
+!  !$OMP PARALLEL DO &
+!  !$OMP DEFAULT (NONE)  &
+!  !$OMP PRIVATE (ipoint,m) & 
+!  !$OMP ShARED (mos_times_cholesky_r1,cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid) 
+   do ipoint = 1, n_points_final_grid
+    f_hf_sparse_cholesky(ipoint) = 0.d0
+    do p = 1, cholesky_mo_num
+     accu_1 = 0.d0
+     do ii = 1, n_occ_val_orb_for_hf(1)
+      i = list_valence_orb_for_hf(ii,1)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       accu_1 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
+      enddo
+     enddo
+     f_hf_sparse_cholesky(ipoint) += accu_1 * accu_1
+    enddo
+    f_hf_sparse_cholesky(ipoint) *= 2.D0
+   enddo
+!  !$OMP END PARALLEL DO
+  
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_sparse_cholesky = ',wall1-wall0
+  else
+  call wall_time(wall0)
+!  !$OMP PARALLEL DO &
+!  !$OMP DEFAULT (NONE)  &
+!  !$OMP PRIVATE (ipoint,m) & 
+!  !$OMP ShARED (mos_times_cholesky_r1,cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid) 
+   do ipoint = 1, n_points_final_grid
+    f_hf_sparse_cholesky(ipoint) = 0.d0
+    do p = 1, cholesky_mo_num
+     accu_2 = 0.d0
+     do ii = 1, n_occ_val_orb_for_hf(2)
+      i = list_valence_orb_for_hf(ii,2)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       accu_2 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
+      enddo
+     enddo
+     accu_1 = accu_2
+     do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
+      i = list_valence_orb_for_hf(ii,1)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       accu_1 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
+      enddo
+     enddo
+     f_hf_sparse_cholesky(ipoint) += accu_1 * accu_2
+    enddo
+    f_hf_sparse_cholesky(ipoint) *= 2.D0
+   enddo
+!  !$OMP END PARALLEL DO
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_sparse_cholesky = ',wall1-wall0
+ endif
+END_PROVIDER 
+
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
  implicit none
  integer :: ipoint,i,ii

From 49a96d4400c640928ba213f33e0493eadb4457ad Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 23 May 2024 10:17:31 +0200
Subject: [PATCH 049/131] added f_hf_cholesky without big storage in memory

---
 .../basis_correction/test_chol_bas.irp.f      | 18 ++++++++++++
 src/mu_of_r/f_hf_cholesky.irp.f               | 28 +++++++++++--------
 2 files changed, 34 insertions(+), 12 deletions(-)
 create mode 100644 plugins/local/basis_correction/test_chol_bas.irp.f

diff --git a/plugins/local/basis_correction/test_chol_bas.irp.f b/plugins/local/basis_correction/test_chol_bas.irp.f
new file mode 100644
index 00000000..ae47ec09
--- /dev/null
+++ b/plugins/local/basis_correction/test_chol_bas.irp.f
@@ -0,0 +1,18 @@
+program pouet
+ implicit none
+ call test
+end
+subroutine test
+ implicit none
+! provide mos_times_cholesky_r1
+! provide mos_times_cholesky_r2
+ integer :: ipoint
+ double precision :: accu,weight
+ accu = 0.d0
+ do ipoint = 1, n_points_final_grid
+  weight = final_weight_at_r_vector(ipoint)
+!  accu += dabs(mu_of_r_hf(ipoint) - mu_of_r_hf_old(ipoint)) * weight
+  accu += dabs(f_hf_sparse_cholesky(ipoint) - f_hf_cholesky(ipoint)) * weight
+ enddo
+ print*,'accu = ',accu
+end
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 101f9cc9..b937addf 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -146,9 +146,9 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)]
  !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
  double precision :: u_dot_v,wall0,wall1
  if(elec_alpha_num == elec_beta_num)then
-  provide mos_times_cholesky_r1
   print*,'providing f_hf_cholesky ...'
   call wall_time(wall0)
+  provide mos_times_cholesky_r1
   !$OMP PARALLEL DO &
   !$OMP DEFAULT (NONE)  &
   !$OMP PRIVATE (ipoint,m) & 
@@ -167,6 +167,8 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)]
   print*,'Time to provide f_hf_cholesky = ',wall1-wall0
   free mos_times_cholesky_r1
  else
+  print*,'providing f_hf_cholesky ...'
+  call wall_time(wall0)
   provide mos_times_cholesky_r2 mos_times_cholesky_r1
   !$OMP PARALLEL DO &
   !$OMP DEFAULT (NONE)  &
@@ -198,10 +200,11 @@ BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
  double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
  if(elec_alpha_num == elec_beta_num)then
   call wall_time(wall0)
-!  !$OMP PARALLEL DO &
-!  !$OMP DEFAULT (NONE)  &
-!  !$OMP PRIVATE (ipoint,m) & 
-!  !$OMP ShARED (mos_times_cholesky_r1,cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid) 
+  !$OMP PARALLEL DO &
+  !$OMP DEFAULT (NONE)  &
+  !$OMP PRIVATE (accu_1,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid,cholesky_mo,n_basis_orb) 
    do ipoint = 1, n_points_final_grid
     f_hf_sparse_cholesky(ipoint) = 0.d0
     do p = 1, cholesky_mo_num
@@ -219,16 +222,17 @@ BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
     enddo
     f_hf_sparse_cholesky(ipoint) *= 2.D0
    enddo
-!  !$OMP END PARALLEL DO
+  !$OMP END PARALLEL DO
   
   call wall_time(wall1)
   print*,'Time to provide f_hf_sparse_cholesky = ',wall1-wall0
-  else
+ else
   call wall_time(wall0)
-!  !$OMP PARALLEL DO &
-!  !$OMP DEFAULT (NONE)  &
-!  !$OMP PRIVATE (ipoint,m) & 
-!  !$OMP ShARED (mos_times_cholesky_r1,cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid) 
+  !$OMP PARALLEL DO &
+  !$OMP DEFAULT (NONE)  &
+  !$OMP PRIVATE (accu_2,accu_1,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid,cholesky_mo,n_basis_orb) 
    do ipoint = 1, n_points_final_grid
     f_hf_sparse_cholesky(ipoint) = 0.d0
     do p = 1, cholesky_mo_num
@@ -256,7 +260,7 @@ BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
     enddo
     f_hf_sparse_cholesky(ipoint) *= 2.D0
    enddo
-!  !$OMP END PARALLEL DO
+  !$OMP END PARALLEL DO
   call wall_time(wall1)
   print*,'Time to provide f_hf_sparse_cholesky = ',wall1-wall0
  endif

From 70745cbeaaf59900c3ce4e1df042ef88ff1ecb11 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 23 May 2024 14:45:33 +0200
Subject: [PATCH 050/131] added sparse cholesky mu_of_r

---
 external/irpf90                               |  2 +-
 .../basis_correction/basis_correction.irp.f   |  4 --
 .../basis_correction/print_routine.irp.f      |  2 +-
 .../basis_correction/test_chol_bas.irp.f      |  2 +-
 src/mu_of_r/f_hf_cholesky.irp.f               | 52 ++++++++++++-------
 src/mu_of_r/mu_of_r_conditions.irp.f          | 44 ++++++++++++++--
 6 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/external/irpf90 b/external/irpf90
index 0007f72f..4ab1b175 160000
--- a/external/irpf90
+++ b/external/irpf90
@@ -1 +1 @@
-Subproject commit 0007f72f677fe7d61c5e1ed461882cb239517102
+Subproject commit 4ab1b175fc7ed0d96c1912f13dc53579b24157a6
diff --git a/plugins/local/basis_correction/basis_correction.irp.f b/plugins/local/basis_correction/basis_correction.irp.f
index a7ea7244..f17b5d5b 100644
--- a/plugins/local/basis_correction/basis_correction.irp.f
+++ b/plugins/local/basis_correction/basis_correction.irp.f
@@ -7,10 +7,6 @@ program basis_correction
   touch read_wf
   no_core_density = .True.
   touch no_core_density
-  if(io_mo_two_e_integrals .ne. "Read")then
-   provide ao_two_e_integrals_in_map
-  endif
-  provide mo_two_e_integrals_in_map
   call print_basis_correction
 end
 
diff --git a/plugins/local/basis_correction/print_routine.irp.f b/plugins/local/basis_correction/print_routine.irp.f
index 96faba30..b3b38673 100644
--- a/plugins/local/basis_correction/print_routine.irp.f
+++ b/plugins/local/basis_correction/print_routine.irp.f
@@ -22,7 +22,7 @@ subroutine print_basis_correction
  print*, '****************************************'
  print*, '****************************************'
  print*, 'mu_of_r_potential = ',mu_of_r_potential
- if(mu_of_r_potential.EQ."hf")then
+ if(mu_of_r_potential.EQ."hf".or.mu_of_r_potential.EQ."hf_old".or.mu_of_r_potential.EQ."hf_sparse")then
    print*, ''
    print*,'Using a HF-like two-body density to define mu(r)'
    print*,'This assumes that HF is a qualitative representation of the wave function '
diff --git a/plugins/local/basis_correction/test_chol_bas.irp.f b/plugins/local/basis_correction/test_chol_bas.irp.f
index ae47ec09..076d888c 100644
--- a/plugins/local/basis_correction/test_chol_bas.irp.f
+++ b/plugins/local/basis_correction/test_chol_bas.irp.f
@@ -12,7 +12,7 @@ subroutine test
  do ipoint = 1, n_points_final_grid
   weight = final_weight_at_r_vector(ipoint)
 !  accu += dabs(mu_of_r_hf(ipoint) - mu_of_r_hf_old(ipoint)) * weight
-  accu += dabs(f_hf_sparse_cholesky(ipoint) - f_hf_cholesky(ipoint)) * weight
+  accu += dabs(f_hf_cholesky_sparse(ipoint) - f_hf_cholesky(ipoint)) * weight
  enddo
  print*,'accu = ',accu
 end
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index b937addf..17f0229a 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -189,7 +189,7 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)]
  endif
 END_PROVIDER 
 
-BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
+BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
  implicit none
  integer :: ipoint,m,mm,i,ii,p
  !!f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
@@ -198,43 +198,55 @@ BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
  !!     =  \sum_A V_AR G_AR 
  !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
  double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
+ double precision :: thresh_1,thresh_2
+ double precision, allocatable :: accu_vec(:)
+ thresh_2 = ao_cholesky_threshold * 100.d0
+ thresh_1 = dsqrt(thresh_2)
+ provide cholesky_mo_transp
  if(elec_alpha_num == elec_beta_num)then
   call wall_time(wall0)
-  !$OMP PARALLEL DO &
-  !$OMP DEFAULT (NONE)  &
-  !$OMP PRIVATE (accu_1,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
-  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp) & 
-  !$OMP ShARED (cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid,cholesky_mo,n_basis_orb) 
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse,n_points_final_grid,cholesky_mo_transp,n_basis_orb) 
+  allocate(accu_vec(cholesky_mo_num))
+  !$OMP DO 
    do ipoint = 1, n_points_final_grid
-    f_hf_sparse_cholesky(ipoint) = 0.d0
-    do p = 1, cholesky_mo_num
-     accu_1 = 0.d0
+    f_hf_cholesky_sparse(ipoint) = 0.d0
+     accu_vec = 0.d0
      do ii = 1, n_occ_val_orb_for_hf(1)
       i = list_valence_orb_for_hf(ii,1)
       mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
       do mm = 1, n_basis_orb ! electron 1 
        m = list_basis(mm)
        mo_b_r1 = mos_in_r_array_omp(m,ipoint)
-       accu_1 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        accu_vec(p) += mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
       enddo
      enddo
-     f_hf_sparse_cholesky(ipoint) += accu_1 * accu_1
-    enddo
-    f_hf_sparse_cholesky(ipoint) *= 2.D0
+     do p = 1, cholesky_mo_num
+      f_hf_cholesky_sparse(ipoint) += accu_vec(p) * accu_vec(p)
+     enddo
+    f_hf_cholesky_sparse(ipoint) *= 2.D0
    enddo
-  !$OMP END PARALLEL DO
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
   
   call wall_time(wall1)
-  print*,'Time to provide f_hf_sparse_cholesky = ',wall1-wall0
+  print*,'Time to provide f_hf_cholesky_sparse = ',wall1-wall0
  else
   call wall_time(wall0)
   !$OMP PARALLEL DO &
   !$OMP DEFAULT (NONE)  &
   !$OMP PRIVATE (accu_2,accu_1,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
   !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp) & 
-  !$OMP ShARED (cholesky_mo_num,f_hf_sparse_cholesky,n_points_final_grid,cholesky_mo,n_basis_orb) 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse,n_points_final_grid,cholesky_mo,n_basis_orb) 
    do ipoint = 1, n_points_final_grid
-    f_hf_sparse_cholesky(ipoint) = 0.d0
+    f_hf_cholesky_sparse(ipoint) = 0.d0
     do p = 1, cholesky_mo_num
      accu_2 = 0.d0
      do ii = 1, n_occ_val_orb_for_hf(2)
@@ -256,13 +268,13 @@ BEGIN_PROVIDER [ double precision, f_hf_sparse_cholesky, (n_points_final_grid)]
        accu_1 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
       enddo
      enddo
-     f_hf_sparse_cholesky(ipoint) += accu_1 * accu_2
+     f_hf_cholesky_sparse(ipoint) += accu_1 * accu_2
     enddo
-    f_hf_sparse_cholesky(ipoint) *= 2.D0
+    f_hf_cholesky_sparse(ipoint) *= 2.D0
    enddo
   !$OMP END PARALLEL DO
   call wall_time(wall1)
-  print*,'Time to provide f_hf_sparse_cholesky = ',wall1-wall0
+  print*,'Time to provide f_hf_cholesky_sparse = ',wall1-wall0
  endif
 END_PROVIDER 
 
diff --git a/src/mu_of_r/mu_of_r_conditions.irp.f b/src/mu_of_r/mu_of_r_conditions.irp.f
index 5b4d4b83..f2bb7145 100644
--- a/src/mu_of_r/mu_of_r_conditions.irp.f
+++ b/src/mu_of_r/mu_of_r_conditions.irp.f
@@ -13,7 +13,6 @@
  integer :: ipoint,istate
  double precision :: wall0,wall1
  print*,'providing mu_of_r ...'
-! PROVIDE mo_two_e_integrals_in_map mo_integrals_map big_array_exchange_integrals 
  call wall_time(wall0)
 
  if (read_mu_of_r) then
@@ -26,6 +25,10 @@
   do ipoint = 1, n_points_final_grid
    if(mu_of_r_potential.EQ."hf")then
     mu_of_r_prov(ipoint,istate) =  mu_of_r_hf(ipoint)
+   else if(mu_of_r_potential.EQ."hf_old")then
+    mu_of_r_prov(ipoint,istate) =  mu_of_r_hf_old(ipoint)
+   else if(mu_of_r_potential.EQ."hf_sparse")then
+    mu_of_r_prov(ipoint,istate) =  mu_of_r_hf_sparse(ipoint)
    else if(mu_of_r_potential.EQ."cas_full".or.mu_of_r_potential.EQ."cas_truncated".or.mu_of_r_potential.EQ."pure_act")then
     mu_of_r_prov(ipoint,istate) =  mu_of_r_psi_cas(ipoint,istate)
    else 
@@ -61,11 +64,10 @@
  END_DOC
  integer :: ipoint
  double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi
- PROVIDE f_hf_cholesky on_top_hf_grid
  print*,'providing mu_of_r_hf ...'
  call wall_time(wall0)
+ PROVIDE f_hf_cholesky on_top_hf_grid
  sqpi = dsqrt(dacos(-1.d0))
- provide f_psi_hf_ab 
  !$OMP PARALLEL DO &
  !$OMP DEFAULT (NONE)  &
  !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & 
@@ -85,6 +87,42 @@
  print*,'Time to provide mu_of_r_hf = ',wall1-wall0
  END_PROVIDER 
 
+ BEGIN_PROVIDER [double precision, mu_of_r_hf_sparse, (n_points_final_grid) ]
+ implicit none 
+ BEGIN_DOC
+ ! mu(r) computed with a HF wave function (assumes that HF MOs are stored in the EZFIO)
+ !
+ ! corresponds to Eq. (37) of J. Chem. Phys. 149, 194301 (2018) but for \Psi^B = HF^B
+ !
+ ! !!!!!! WARNING !!!!!! if no_core_density == .True. then all contributions from the core orbitals 
+ !
+ ! in the two-body density matrix are excluded
+ END_DOC
+ integer :: ipoint
+ double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi
+ print*,'providing mu_of_r_hf_sparse ...'
+ call wall_time(wall0)
+ sqpi = dsqrt(dacos(-1.d0))
+ PROVIDE f_hf_cholesky_sparse on_top_hf_grid
+ !$OMP PARALLEL DO &
+ !$OMP DEFAULT (NONE)  &
+ !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & 
+ !$OMP ShARED (n_points_final_grid,mu_of_r_hf_sparse,f_hf_cholesky_sparse,on_top_hf_grid,sqpi) 
+ do ipoint = 1, n_points_final_grid
+  f_hf   = f_hf_cholesky_sparse(ipoint)
+  on_top = on_top_hf_grid(ipoint)
+  if(on_top.le.1.d-12.or.f_hf.le.0.d0.or.f_hf * on_top.lt.0.d0)then
+    w_hf   = 1.d+10
+  else 
+    w_hf  = f_hf /  on_top
+  endif
+  mu_of_r_hf_sparse(ipoint) =  w_hf * sqpi * 0.5d0
+ enddo
+ !$OMP END PARALLEL DO
+ call wall_time(wall1)
+ print*,'Time to provide mu_of_r_hf_sparse = ',wall1-wall0
+ END_PROVIDER 
+
  BEGIN_PROVIDER [double precision, mu_of_r_hf_old, (n_points_final_grid) ]
  implicit none 
  BEGIN_DOC

From 1e886ac128187624af9efac5dd0cbe29e594ff5b Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 27 May 2024 10:21:29 +0200
Subject: [PATCH 051/131] implemented the f_hf_sparse for open systems in basis
 set correction

---
 src/dft_utils_func/on_top_from_ueg.irp.f |  1 -
 src/mo_two_e_ints/cholesky.irp.f         |  4 +++
 src/mu_of_r/f_hf_cholesky.irp.f          | 45 +++++++++++++++---------
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/dft_utils_func/on_top_from_ueg.irp.f b/src/dft_utils_func/on_top_from_ueg.irp.f
index 4e28ad89..711ffc39 100644
--- a/src/dft_utils_func/on_top_from_ueg.irp.f
+++ b/src/dft_utils_func/on_top_from_ueg.irp.f
@@ -32,7 +32,6 @@ double precision function g0_UEG_mu_inf(rho_a,rho_b)
  C = 0.08193d0
  D = -0.01277d0
  E = 0.001859d0                     
- x = -d2*rs
  if (dabs(rho) > 1.d-20) then
   rs = (3d0 / (4d0*pi*rho))**(1d0/3d0) ! JT: serious bug fixed 20/03/19
   x = -d2*rs
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 349f13b9..0d0989d7 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -34,8 +34,10 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
  END_DOC
 
  double precision, allocatable :: X(:,:,:)
+ double precision :: wall0, wall1
  integer :: ierr
  print *, 'AO->MO Transformation of Cholesky vectors'
+  call wall_time(wall0)
 
  allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
  if (ierr /= 0) then
@@ -46,6 +48,8 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
  call dgemm('T','N', cholesky_mo_num*mo_num, mo_num, ao_num, 1.d0, &
      X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_mo_num*mo_num)
  deallocate(X)
+  call wall_time(wall1)
+ print*,'Time for AO->MO Cholesky vectors = ',wall1-wall0
 
 END_PROVIDER
 
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 17f0229a..472abb1c 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -199,7 +199,7 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
  !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
  double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
  double precision :: thresh_1,thresh_2
- double precision, allocatable :: accu_vec(:)
+ double precision, allocatable :: accu_vec(:),delta_vec(:)
  thresh_2 = ao_cholesky_threshold * 100.d0
  thresh_1 = dsqrt(thresh_2)
  provide cholesky_mo_transp
@@ -223,12 +223,12 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
        mo_b_r1 = mos_in_r_array_omp(m,ipoint)
        if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
        do p = 1, cholesky_mo_num
-        accu_vec(p) += mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+        accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
        enddo
       enddo
      enddo
      do p = 1, cholesky_mo_num
-      f_hf_cholesky_sparse(ipoint) += accu_vec(p) * accu_vec(p)
+      f_hf_cholesky_sparse(ipoint) = f_hf_cholesky_sparse(ipoint) + accu_vec(p) * accu_vec(p)
      enddo
     f_hf_cholesky_sparse(ipoint) *= 2.D0
    enddo
@@ -240,39 +240,50 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
   print*,'Time to provide f_hf_cholesky_sparse = ',wall1-wall0
  else
   call wall_time(wall0)
-  !$OMP PARALLEL DO &
-  !$OMP DEFAULT (NONE)  &
-  !$OMP PRIVATE (accu_2,accu_1,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
-  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp) & 
-  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse,n_points_final_grid,cholesky_mo,n_basis_orb) 
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse,n_points_final_grid,cholesky_mo_transp,n_basis_orb) 
+  allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
+  !$OMP DO 
    do ipoint = 1, n_points_final_grid
     f_hf_cholesky_sparse(ipoint) = 0.d0
-    do p = 1, cholesky_mo_num
-     accu_2 = 0.d0
+     accu_vec = 0.d0
      do ii = 1, n_occ_val_orb_for_hf(2)
       i = list_valence_orb_for_hf(ii,2)
       mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
       do mm = 1, n_basis_orb ! electron 1 
        m = list_basis(mm)
        mo_b_r1 = mos_in_r_array_omp(m,ipoint)
-       accu_2 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
       enddo
      enddo
-     accu_1 = accu_2
-     do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
+     delta_vec = 0.d0
+     do ii =  n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
       i = list_valence_orb_for_hf(ii,1)
       mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
       do mm = 1, n_basis_orb ! electron 1 
        m = list_basis(mm)
        mo_b_r1 = mos_in_r_array_omp(m,ipoint)
-       accu_1 += mo_i_r1 * mo_b_r1 * cholesky_mo(m,i,p)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
       enddo
      enddo
-     f_hf_cholesky_sparse(ipoint) += accu_1 * accu_2
-    enddo
+     do p = 1, cholesky_mo_num
+      f_hf_cholesky_sparse(ipoint) = f_hf_cholesky_sparse(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
+     enddo
     f_hf_cholesky_sparse(ipoint) *= 2.D0
    enddo
-  !$OMP END PARALLEL DO
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
   call wall_time(wall1)
   print*,'Time to provide f_hf_cholesky_sparse = ',wall1-wall0
  endif

From 4d18a0124e9a3cd088ce19e0c56801d7e4fb478d Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Mon, 27 May 2024 10:23:47 +0200
Subject: [PATCH 052/131] changed the default in mu_of_r_potential

---
 src/mu_of_r/EZFIO.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mu_of_r/EZFIO.cfg b/src/mu_of_r/EZFIO.cfg
index a66b00ef..7a39b012 100644
--- a/src/mu_of_r/EZFIO.cfg
+++ b/src/mu_of_r/EZFIO.cfg
@@ -6,9 +6,9 @@ size: (becke_numerical_grid.n_points_final_grid,determinants.n_states)
 
 [mu_of_r_potential]
 type: character*(32)
-doc: type of potential for the mu(r) interaction: can be [ hf| cas_full | cas_truncated | pure_act]
+doc: type of potential for the mu(r) interaction: can be [ hf| hf_sparse | cas_full | cas_truncated | pure_act]
 interface: ezfio, provider, ocaml
-default: hf
+default: hf_sparse
 
 [io_mu_of_r]                                                                                                                      
 type: Disk_access

From 09d9a814d2646cc599bdbdcb0886461fd8ed6688 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 27 May 2024 11:38:57 +0200
Subject: [PATCH 053/131] Added stop in (T)

---
 plugins/local/non_h_ints_mu/deb_mos.irp.f | 101 ++++++++++++++++++++++
 plugins/local/non_h_ints_mu/qmckl.irp.f   |   2 +-
 src/ccsd/ccsd_t_space_orb_stoch.irp.f     |  17 +++-
 src/utils_cc/EZFIO.cfg                    |   6 ++
 4 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 plugins/local/non_h_ints_mu/deb_mos.irp.f

diff --git a/plugins/local/non_h_ints_mu/deb_mos.irp.f b/plugins/local/non_h_ints_mu/deb_mos.irp.f
new file mode 100644
index 00000000..26344786
--- /dev/null
+++ b/plugins/local/non_h_ints_mu/deb_mos.irp.f
@@ -0,0 +1,101 @@
+
+! ---
+
+program deb_mos
+
+  implicit none
+
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  if(tc_integ_type .eq. "numeric") then
+    my_extra_grid_becke  = .True.
+    PROVIDE tc_grid2_a tc_grid2_r
+    my_n_pt_r_extra_grid = tc_grid2_r
+    my_n_pt_a_extra_grid = tc_grid2_a
+    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+  endif
+
+  call print_mos()
+  
+end
+
+! ---
+
+subroutine print_mos()
+
+  implicit none
+  integer          :: i, ipoint
+  double precision :: r(3)
+  double precision :: mo_val, mo_der(3), mo_lap
+
+  PROVIDE final_grid_points mos_in_r_array mos_grad_in_r_array mos_lapl_in_r_array
+
+!  do ipoint = 1, n_points_final_grid
+!    r(:) = final_grid_points(:,ipoint)
+!    print*, r
+!  enddo
+double precision :: accu_vgl(5)
+double precision :: accu_vgl_nrm(5)
+
+  do ipoint = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    write(1111, '(5(f15.7, 3X))') r
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array(i,ipoint,1) + mos_lapl_in_r_array(i,ipoint,2) + mos_lapl_in_r_array(i,ipoint,3)
+      write(1111, '(5(f15.7, 3X))') mo_val, mo_der, mo_lap
+    enddo
+  enddo
+ 
+  do ipoint = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    write(2222, '(5(f15.7, 3X))') r
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array_qmckl     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array_qmckl(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array_qmckl(i,ipoint)
+      write(2222, '(5(f15.7, 3X))') mo_val, mo_der, mo_lap
+    enddo
+  enddo
+ 
+  accu_vgl = 0.d0
+  accu_vgl_nrm = 0.d0
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array(i,ipoint,1) + mos_lapl_in_r_array(i,ipoint,2) + mos_lapl_in_r_array(i,ipoint,3)
+      accu_vgl_nrm(1) += dabs(mo_val)
+      accu_vgl_nrm(2) += dabs(mo_der(1))
+      accu_vgl_nrm(3) += dabs(mo_der(2))
+      accu_vgl_nrm(4) += dabs(mo_der(3))
+      accu_vgl_nrm(5) += dabs(mo_lap)
+
+      mo_val    -= mos_in_r_array_qmckl     (i,ipoint)
+      mo_der(:) -= mos_grad_in_r_array_qmckl(i,ipoint,:)
+      mo_lap    -= mos_lapl_in_r_array_qmckl(i,ipoint)
+      accu_vgl(1) += dabs(mo_val)
+      accu_vgl(2) += dabs(mo_der(1))
+      accu_vgl(3) += dabs(mo_der(2))
+      accu_vgl(4) += dabs(mo_der(3))
+      accu_vgl(5) += dabs(mo_lap)
+    enddo
+
+  enddo
+  accu_vgl(:) *= 1.d0 / accu_vgl_nrm(:)
+  print *, accu_vgl
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/non_h_ints_mu/qmckl.irp.f b/plugins/local/non_h_ints_mu/qmckl.irp.f
index 4d419e24..de440f14 100644
--- a/plugins/local/non_h_ints_mu/qmckl.irp.f
+++ b/plugins/local/non_h_ints_mu/qmckl.irp.f
@@ -158,7 +158,7 @@ END_PROVIDER
 
  double precision, allocatable :: vgl(:,:,:)
  allocate( vgl(mo_num,5,n_points_final_grid))
- rc = qmckl_get_mo_basis_mo_vgl_inplace(qmckl_ctx, vgl, n_points_final_grid*mo_num*5_8)
+ rc = qmckl_get_mo_basis_mo_vgl(qmckl_ctx, vgl, n_points_final_grid*mo_num*5_8)
  if (rc /= QMCKL_SUCCESS) then
    print *, irp_here, 'qmckl error in get_mo_vgl'
    rc = qmckl_check(qmckl_ctx, rc)
diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index 1093c59d..485382e2 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -217,11 +217,14 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   print '(A)', ' ======================= ============== =========='
 
 
+  double precision :: t_error
   call set_multiple_levels_omp(.False.)
   call wall_time(t00)
   imin = 1_8
-  !$OMP PARALLEL                                                     &
-      !$OMP PRIVATE(ieta,eta,a,b,c,kiter,isample)                    &
+  t_error = huge(1.d0)
+
+  !$OMP PARALLEL                                      &
+      !$OMP PRIVATE(ieta,eta,a,b,c,kiter,isample)     &
       !$OMP DEFAULT(SHARED) NUM_THREADS(nthreads_pt2)
 
   do kiter=1,Nabc
@@ -328,15 +331,23 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
       if (norm > 0.d0) then
         energy_stoch = ET / norm
         variance = ET2 / norm - energy_stoch*energy_stoch
+        if (norm > 1.d0) then
+          t_error = dsqrt(variance/(norm-1.d0))
+        else
+          t_error = dsqrt(variance)
+        endif
       endif
 
       energy = energy_det + energy_stoch
 
-      print '(''   '',F20.8, ''   '', ES12.4,''   '', F8.2,''  '')', eccsd+energy, dsqrt(variance/(norm-1.d0)), 100.*real(Ncomputed)/real(Nabc)
+      print '(''   '',F20.8, ''   '', ES12.4,''   '', F8.2,''  '')', eccsd+energy, t_error, 100.*real(Ncomputed)/real(Nabc)
+
     endif
     !$OMP END MASTER
+    if (t_error < cc_par_t_stop) exit
     if (imin > Nabc) exit
   enddo
+  !$OMP TASKWAIT
 
   !$OMP END PARALLEL
   print '(A)', ' ======================= ============== ========== '
diff --git a/src/utils_cc/EZFIO.cfg b/src/utils_cc/EZFIO.cfg
index fb6d9034..7d50d66a 100644
--- a/src/utils_cc/EZFIO.cfg
+++ b/src/utils_cc/EZFIO.cfg
@@ -58,6 +58,12 @@ doc: If true, the CCSD(T) will be computed.
 interface: ezfio,ocaml,provider
 default: False
 
+[cc_par_t_stop]
+type: double precision
+doc: Stops the calculation when the statistical error bar is below the given value.
+interface: ezfio,ocaml,provider
+default: 1.e-5
+
 [cc_dev]
 type: logical
 doc: Only for dev purposes.

From ed8cfdc599bdbeb6b66fd444cef6dc0ac2756d55 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 27 May 2024 12:58:07 +0200
Subject: [PATCH 054/131] Added print of the error bar in (T)

---
 src/ccsd/ccsd_space_orb_sub.irp.f     | 14 ++++++++------
 src/ccsd/ccsd_t_space_orb_stoch.irp.f |  5 ++---
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index b48ca7da..b8cfab2a 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -192,7 +192,7 @@ subroutine run_ccsd_space_orb
   deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
 
   ! CCSD(T)
-  double precision :: e_t
+  double precision :: e_t, e_t_err
   e_t = 0.d0
 
   if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
@@ -210,22 +210,24 @@ subroutine run_ccsd_space_orb
     !print*,''
 
     ! New
+    e_t = uncorr_energy + energy ! For print in (T) call
+    e_t_err = 0.d0
+
     print*,'Computing (T) correction...'
     call wall_time(ta)
 !    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
 !         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
 
-    e_t = uncorr_energy + energy ! For print in next call
     call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
-         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
+         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
 
     call wall_time(tb)
     print*,'Time: ',tb-ta, ' s'
 
     print*,''
-    write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
-    write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
-    write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
+    write(*,'(A15,F18.12,A7,F18.12)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha +/- ', e_t_err
+    write(*,'(A15,F18.12,A7,F18.12)') ' E(T)        = ', e_t, ' Ha +/- ', e_t_err
+    write(*,'(A15,F18.12,A7,F18.12)') ' Correlation = ', energy + e_t, ' Ha +/- ', e_t_err
     print*,''
   endif
 
diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
index 485382e2..851b6a9f 100644
--- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f
+++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f
@@ -1,5 +1,5 @@
 ! Main
-subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy)
+subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy,t_error)
 
   implicit none
 
@@ -7,7 +7,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   double precision, intent(in)  :: t1(nO,nV), f_o(nO), f_v(nV)
   double precision, intent(in)  :: t2(nO,nO,nV,nV)
   double precision, intent(in)  :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO)
-  double precision, intent(inout) :: energy
+  double precision, intent(inout) :: energy, t_error
 
   double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:)
   double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:)
@@ -217,7 +217,6 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ
   print '(A)', ' ======================= ============== =========='
 
 
-  double precision :: t_error
   call set_multiple_levels_omp(.False.)
   call wall_time(t00)
   imin = 1_8

From b773a361b204a1f8424ae26bc2ac1a1a9d424a9f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 31 May 2024 20:07:29 +0200
Subject: [PATCH 055/131] fixed uninitialized variable in cholesky'

---
 src/ao_two_e_ints/cholesky.irp.f | 38 ++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 5fbd166c..09d86679 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -22,6 +22,9 @@ END_PROVIDER
    ! <ij|kl> = (ik|jl) = sum_a (ik|a).(a|jl)
    !
    ! Last dimension of cholesky_ao is cholesky_ao_num
+   !
+   ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
+   ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
    END_DOC
 
    integer                        :: rank, ndim
@@ -86,20 +89,25 @@ END_PROVIDER
      call print_memory_usage()
 
      allocate(L(ndim,1))
+!print *, 'allocate : (L(ndim,1))', memory_of_double(ndim)
 
      print *,  ''
      print *,  'Cholesky decomposition of AO integrals'
      print *,  '======================================'
      print *,  ''
-     print *,  '============ ============='
-     print *,  '    Rank      Threshold'
-     print *,  '============ ============='
+     print *,  '============ ============ ============='
+     print *,  '    Rank      Block size    Threshold'
+     print *,  '============ ============ ============='
 
 
      rank = 0
 
      allocate( D(ndim), Lset(ndim), Dset(ndim) )
      allocate( addr(3,ndim) )
+!print *, 'allocate : (D(ndim))', memory_of_int(ndim)
+!print *, 'allocate : (Lset(ndim))', memory_of_int(ndim)
+!print *, 'allocate : (Dset(ndim))', memory_of_int(ndim)
+!print *, 'allocate : (3,addr(ndim))', memory_of_int(3*ndim)
 
      ! 1.
      k=0
@@ -151,9 +159,10 @@ END_PROVIDER
        ! a.
        i = i+1
 
-       s = 0.01d0
 
        ! Inrease s until the arrays fit in memory
+       s = 0.01d0
+       block_size = max(N,24)
        do while (.True.)
 
          ! b.
@@ -168,6 +177,7 @@ END_PROVIDER
            endif
          enddo
 
+
          call total_memory(mem)
          mem = mem                                                   &
              + np*memory_of_double(nq)                               &! Delta(np,nq)
@@ -176,23 +186,28 @@ END_PROVIDER
 
          if (mem > qp_max_mem) then
            s = s*2.d0
+           block_size = block_size / 2
          else
            exit
          endif
 
          if ((s > 1.d0).or.(nq == 0)) then
            call print_memory_usage()
-           print *,  'Not enough memory. Reduce cholesky threshold'
+           print *, 'Required peak memory: ', mem, 'Gb'
+           call total_memory(mem)
+           print *, 'Already used  memory: ', mem, 'Gb'
+           print *, 'Not enough memory. Reduce cholesky threshold'
            stop -1
          endif
 
        enddo
 
        ! d., e.
-       block_size = max(N,24)
 
        L_old => L
        allocate(L(ndim,rank+nq), stat=ierr)
+!print *, 'allocate : L(ndim,rank+nq)', memory_of_double(ndim*(rank+nq))
+
        if (ierr /= 0) then
          call print_memory_usage()
          print *,  irp_here, ': allocation failed : (L(ndim,rank+nq))'
@@ -210,6 +225,8 @@ END_PROVIDER
        deallocate(L_old)
 
        allocate(Delta(np,nq), stat=ierr)
+!print *, 'allocate : Delta(np,nq)', memory_of_double(np*nq)
+
        if (ierr /= 0) then
          call print_memory_usage()
          print *,  irp_here, ': allocation failed : (Delta(np,nq))'
@@ -217,6 +234,8 @@ END_PROVIDER
        endif
 
        allocate(Ltmp_p(np,block_size), stat=ierr)
+!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double(np*block_size)
+
        if (ierr /= 0) then
          call print_memory_usage()
          print *,  irp_here, ': allocation failed : (Ltmp_p(np,block_size))'
@@ -224,6 +243,8 @@ END_PROVIDER
        endif
 
        allocate(Ltmp_q(nq,block_size), stat=ierr)
+!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double(nq*block_size)
+
        if (ierr /= 0) then
          call print_memory_usage()
          print *,  irp_here, ': allocation failed : (Ltmp_q(nq,block_size))'
@@ -232,6 +253,7 @@ END_PROVIDER
 
 
        allocate(computed(nq))
+!print *, 'allocate : computed(nq)', memory_of_int(nq)
 
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j)
 
@@ -353,7 +375,7 @@ END_PROVIDER
 
        enddo
 
-       print '(I10, 4X, ES12.3)', rank, Qmax
+       print '(I10, 4X, I10, 4X, ES12.3)', rank, block_size, Qmax
 
        deallocate(computed)
        deallocate(Delta)
@@ -380,6 +402,8 @@ END_PROVIDER
      enddo
 
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
+!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', ao_num*ao_num*(rank*1_8) * 8_8 / 1024_8**3
+
      if (ierr /= 0) then
        call print_memory_usage()
        print *,  irp_here, ': Allocation failed'

From 0dca6cfde4ba658518637c2f42ecb2e45c04de6a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 31 May 2024 20:20:04 +0200
Subject: [PATCH 056/131] block size in cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 09d86679..5a44571c 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -186,7 +186,7 @@ END_PROVIDER
 
          if (mem > qp_max_mem) then
            s = s*2.d0
-           block_size = block_size / 2
+           block_size = max(block_size / 2, 1)
          else
            exit
          endif

From 0a3d462510a6146da21770ff6fff2f7a1794a0fb Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 31 May 2024 20:30:48 +0200
Subject: [PATCH 057/131] Clean up openmp in cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 5a44571c..cfd57050 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -255,7 +255,7 @@ END_PROVIDER
        allocate(computed(nq))
 !print *, 'allocate : computed(nq)', memory_of_int(nq)
 
-       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j)
+       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
 
        !$OMP DO
        do q=1,nq
@@ -351,7 +351,7 @@ END_PROVIDER
          ! iii.
          f = 1.d0/dsqrt(Qmax)
 
-         !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared)
+         !$OMP PARALLEL PRIVATE(p,q) DEFAULT(shared)
          !$OMP DO
          do p=1,np
            Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f

From b743201efe692294db887f175dceb02a81f73422 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 31 May 2024 20:50:30 +0200
Subject: [PATCH 058/131] Use integer*8 in cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 152 ++++++++++++++++++-------------
 src/utils/memory.irp.f           |  20 ++++
 2 files changed, 109 insertions(+), 63 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index cfd57050..daa29079 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -27,19 +27,22 @@ END_PROVIDER
    ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
    END_DOC
 
-   integer                        :: rank, ndim
+   integer*8                      :: ndim8
+   integer                        :: rank
    double precision               :: tau
    double precision, pointer      :: L(:,:), L_old(:,:)
 
 
    double precision               :: s
-   double precision, parameter    :: dscale = 1.d0
+   double precision               :: dscale
 
    double precision, allocatable  :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:)
-   integer, allocatable           :: Lset(:), Dset(:), addr(:,:)
+   integer, allocatable           :: addr1(:,:), addr2(:,:)
+   integer**, allocatable         :: Lset(:), Dset(:), addr3(:,:)
    logical, allocatable           :: computed(:)
 
-   integer                        :: i,j,k,m,p,q, qj, dj, p2, q2
+   integer                        :: i,j,k,m,p,q, dj, p2, q2
+   integer*8                      :: i8, j8, p8, qj8
    integer                        :: N, np, nq
 
    double precision               :: Dmax, Dmin, Qmax, f
@@ -47,15 +50,15 @@ END_PROVIDER
    logical, external              :: ao_two_e_integral_zero
 
    double precision, external     :: ao_two_e_integral
-   integer                        :: block_size, iblock, ierr
+   integer                        :: block_size, iblock
 
    double precision               :: mem
-   double precision, external     :: memory_of_double, memory_of_int
+   double precision, external     :: memory_of_double8, memory_of_int8
 
    integer, external              :: getUnitAndOpen
-   integer                        :: iunit
+   integer                        :: iunit, ierr
 
-   ndim = ao_num*ao_num
+   ndim8 = ao_num*ao_num*1_8
    deallocate(cholesky_ao)
 
    if (read_ao_cholesky) then
@@ -83,13 +86,13 @@ END_PROVIDER
 
      tau = ao_cholesky_threshold
 
-     mem = 6.d0 * memory_of_double(ndim) + 6.d0 * memory_of_int(ndim)
+     mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8)
      call check_mem(mem, irp_here)
 
      call print_memory_usage()
 
-     allocate(L(ndim,1))
-!print *, 'allocate : (L(ndim,1))', memory_of_double(ndim)
+     allocate(L(ndim8,1))
+print *, 'allocate : (L(ndim8,1))', memory_of_double8(ndim8)
 
      print *,  ''
      print *,  'Cholesky decomposition of AO integrals'
@@ -102,36 +105,36 @@ END_PROVIDER
 
      rank = 0
 
-     allocate( D(ndim), Lset(ndim), Dset(ndim) )
-     allocate( addr(3,ndim) )
-!print *, 'allocate : (D(ndim))', memory_of_int(ndim)
-!print *, 'allocate : (Lset(ndim))', memory_of_int(ndim)
-!print *, 'allocate : (Dset(ndim))', memory_of_int(ndim)
-!print *, 'allocate : (3,addr(ndim))', memory_of_int(3*ndim)
+     allocate( D(ndim8), Lset(ndim8), Dset(ndim8) )
+     allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8), )
+print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8)
+print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8)
+print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8)
+print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
 
      ! 1.
      k=0
      do j=1,ao_num
        do i=1,ao_num
          k = k+1
-         addr(1,k) = i
-         addr(2,k) = j
-         addr(3,k) = (i-1)*ao_num + j
+         addr1(k) = i
+         addr2(k) = j
+         addr3(k) = (i-1)*ao_num + j
        enddo
      enddo
 
      if (do_direct_integrals) then
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided)
-       do i=1,ndim
-         D(i) = ao_two_e_integral(addr(1,i), addr(2,i),              &
-             addr(1,i), addr(2,i))
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(guided)
+       do i8=1,ndim8
+         D(i8) = ao_two_e_integral(addr1(i8), addr2(i8),              &
+             addr1(i8), addr2(i8))
        enddo
        !$OMP END PARALLEL DO
      else
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided)
-       do i=1,ndim
-         D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i),          &
-             addr(2,i), addr(2,i),                                   &
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(guided)
+       do i8=1,ndim8
+         D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8),          &
+             addr2(i8), addr2(i8),                                   &
              ao_integrals_map)
        enddo
        !$OMP END PARALLEL DO
@@ -140,12 +143,21 @@ END_PROVIDER
      Dmax = maxval(D)
 
      ! 2.
-     np=0
-     do p=1,ndim
-       if ( dscale*dscale*Dmax*D(p) > tau*tau ) then
-         np = np+1
-         Lset(np) = p
-       endif
+     np = huge(1_4)
+     dscale = 1.d0
+     do while (np == huge(1_4))
+       np=0
+       do p8=1,ndim8
+         if ( dscale*dscale*Dmax*D(p8) > tau*tau ) then
+           np = np+1
+           Lset(np) = p8
+           if (np == huge(1_4)) then
+             ! Overflow detected
+             dscale = dscale*0.5d0
+             exit
+           endif
+         endif
+       enddo
      enddo
 
      ! 3.
@@ -155,7 +167,7 @@ END_PROVIDER
      i = 0
 
      ! 5.
-     do while ( (Dmax > tau).and.(rank < ndim) )
+     do while ( (Dmax > tau).and.(rank < min(ndim8,huge(1_4)) )
        ! a.
        i = i+1
 
@@ -181,7 +193,8 @@ END_PROVIDER
          call total_memory(mem)
          mem = mem                                                   &
              + np*memory_of_double(nq)                               &! Delta(np,nq)
-             + (rank+nq)* memory_of_double(ndim)                     &! L(ndim,rank+nq)
+             + (rank+nq)* memory_of_double8(ndim8)
+&! L(ndim8,rank+nq)
              + (np+nq)*memory_of_double(block_size)    ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
          if (mem > qp_max_mem) then
@@ -205,19 +218,19 @@ END_PROVIDER
        ! d., e.
 
        L_old => L
-       allocate(L(ndim,rank+nq), stat=ierr)
-!print *, 'allocate : L(ndim,rank+nq)', memory_of_double(ndim*(rank+nq))
+       allocate(L(ndim8,rank+nq), stat=ierr)
+print *, 'allocate : L(ndim8,rank+nq)', memory_of_double8(ndim8*(rank+nq))
 
        if (ierr /= 0) then
          call print_memory_usage()
-         print *,  irp_here, ': allocation failed : (L(ndim,rank+nq))'
+         print *,  irp_here, ': allocation failed : (L(ndim8,rank+nq))'
          stop -1
        endif
 
        !$OMP PARALLEL DO PRIVATE(k,j)
        do k=1,rank
-         do j=1,ndim
-           L(j,k) = L_old(j,k)
+         do j8=1,ndim8
+           L(j8,k) = L_old(j8,k)
          enddo
        enddo
        !$OMP END PARALLEL DO
@@ -225,7 +238,7 @@ END_PROVIDER
        deallocate(L_old)
 
        allocate(Delta(np,nq), stat=ierr)
-!print *, 'allocate : Delta(np,nq)', memory_of_double(np*nq)
+print *, 'allocate : Delta(np,nq)', memory_of_double8(np*nq*1_8)
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -234,7 +247,7 @@ END_PROVIDER
        endif
 
        allocate(Ltmp_p(np,block_size), stat=ierr)
-!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double(np*block_size)
+print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8)
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -243,7 +256,7 @@ END_PROVIDER
        endif
 
        allocate(Ltmp_q(nq,block_size), stat=ierr)
-!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double(nq*block_size)
+print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8) 
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -253,8 +266,9 @@ END_PROVIDER
 
 
        allocate(computed(nq))
-!print *, 'allocate : computed(nq)', memory_of_int(nq)
+print *, 'allocate : computed(nq)', memory_of_int(nq)
 
+print *, 'p1'
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
 
        !$OMP DO
@@ -296,7 +310,7 @@ END_PROVIDER
        iblock = 0
        do j=1,nq
 
-         if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit
+         if ( (Qmax <= Dmin).or.(N+j*1_8 > ndim8) ) exit
          ! i.
          rank = N+j
 
@@ -308,28 +322,28 @@ END_PROVIDER
 
          ! ii.
          do dj=1,nq
-           qj = Dset(dj)
-           if (D(qj) == Qmax) then
+           qj8 = Dset(dj)
+           if (D(qj8) == Qmax) then
              exit
            endif
          enddo
 
-         L(1:ndim, rank) = 0.d0
+         L(1:ndim8, rank) = 0.d0
 
          if (.not.computed(dj)) then
            m = dj
            !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(guided)
            do k=np,1,-1
-             if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)),&
-                   addr(2,Lset(k)), addr(2,Dset(m)) ) ) then
+             if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
+                   addr2(Lset(k)), addr2(Dset(m)) ) ) then
                if (do_direct_integrals) then
                  Delta(k,m) = Delta(k,m) + &
-                     ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)),&
-                     addr(1,Dset(m)), addr(2,Dset(m)))
+                     ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
+                     addr1(Dset(m)), addr2(Dset(m)))
                else
                  Delta(k,m) = Delta(k,m) + &
-                     get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)),&
-                     addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map)
+                     get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
+                     addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
                endif
              endif
            enddo
@@ -391,18 +405,28 @@ END_PROVIDER
          Dmax = max(Dmax, D(Lset(p)))
        enddo
 
-       np=0
-       do p=1,ndim
-         if ( dscale*dscale*Dmax*D(p) > tau*tau ) then
-           np = np+1
-           Lset(np) = p
-         endif
+       np = huge(1_4)
+       dscale = 1.d0
+       do while (np == huge(1_4))
+         np=0
+         do p8=1,ndim8
+           if ( dscale*dscale*Dmax*D(p8) > tau*tau ) then
+             np = np+1
+             Lset(np) = p8
+             if (np == huge(1_4)) then
+               ! Overflow detected
+               dscale = dscale*0.5d0
+               exit
+             endif
+           endif
+         enddo
        enddo
 
+
      enddo
 
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
-!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', ao_num*ao_num*(rank*1_8) * 8_8 / 1024_8**3
+print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8)
 
      if (ierr /= 0) then
        call print_memory_usage()
@@ -411,7 +435,9 @@ END_PROVIDER
      endif
      !$OMP PARALLEL DO PRIVATE(k)
      do k=1,rank
-       call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1)
+       do j=1,ao_num
+         call dcopy(ao_num, L((j-1)*ao_num+1,k), 1, cholesky_ao(1,j,k), 1)
+       enddo
      enddo
      !$OMP END PARALLEL DO
      deallocate(L)
diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f
index e69bf71e..043562db 100644
--- a/src/utils/memory.irp.f
+++ b/src/utils/memory.irp.f
@@ -79,6 +79,26 @@ IRP_ENDIF
   call unlock_io()
 end function
 
+double precision function memory_of_double8(n)
+  implicit none
+  BEGIN_DOC
+! Computes the memory required for n double precision elements in gigabytes.
+  END_DOC
+  integer*8, intent(in) :: n
+  double precision, parameter :: f = 8.d0 / (1024.d0*1024.d0*1024.d0)
+  memory_of_double8 = dble(n) * f
+end function
+
+double precision function memory_of_int8(n)
+  implicit none
+  BEGIN_DOC
+! Computes the memory required for n double precision elements in gigabytes.
+  END_DOC
+  integer*8, intent(in) :: n
+  double precision, parameter :: f = 4.d0 / (1024.d0*1024.d0*1024.d0)
+  memory_of_int8 = dble(n) * f
+end function
+
 double precision function memory_of_double(n)
   implicit none
   BEGIN_DOC

From 38d386d36c78ff87a6c1062d065ab2b12fa4dcc9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sun, 2 Jun 2024 19:03:05 +0200
Subject: [PATCH 059/131] Debug cholesky

---
 config/gfortran_debug_mkl.cfg    |  63 ++++++++++++
 src/ao_two_e_ints/cholesky.irp.f | 169 +++++++++++++++++++------------
 2 files changed, 166 insertions(+), 66 deletions(-)
 create mode 100644 config/gfortran_debug_mkl.cfg

diff --git a/config/gfortran_debug_mkl.cfg b/config/gfortran_debug_mkl.cfg
new file mode 100644
index 00000000..1dc3f2b2
--- /dev/null
+++ b/config/gfortran_debug_mkl.cfg
@@ -0,0 +1,63 @@
+# Common flags
+##############
+#
+# -ffree-line-length-none : Needed for IRPF90 which produces long lines
+# -lblas -llapack         : Link with libblas and liblapack libraries provided by the system
+# -I .                    : Include the curent directory (Mandatory)
+#
+# --ninja                 : Allow the utilisation of ninja. (Mandatory)
+# --align=32              : Align all provided arrays on a 32-byte boundary
+#
+#
+[COMMON]
+FC           : gfortran -g -ffree-line-length-none -I . -fPIC -std=legacy
+LAPACK_LIB   : -I${MKLROOT}/include -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_gf_lp64 -lmkl_core -lpthread -lm -ldl -lmkl_gnu_thread -lgomp -fopenmp
+IRPF90       : irpf90
+IRPF90_FLAGS : --ninja --align=32 --assert -DSET_NESTED
+
+# Global options
+################
+#
+# 1 : Activate
+# 0 : Deactivate
+#
+[OPTION]
+MODE    : DEBUG      ; [ OPT | PROFILE | DEBUG ] : Chooses the section below
+CACHE   : 0          ; Enable cache_compile.py
+OPENMP  : 1          ; Append OpenMP flags
+
+# Optimization flags
+####################
+#
+# -Ofast                  : Disregard strict standards compliance. Enables all -O3 optimizations.
+#                           It also enables optimizations that are not valid
+#                           for all standard-compliant programs.  It turns on
+#                           -ffast-math and the Fortran-specific
+#                           -fno-protect-parens and -fstack-arrays.
+[OPT]
+FCFLAGS : -Ofast
+
+# Profiling flags
+#################
+#
+[PROFILE]
+FC      : -p -g
+FCFLAGS : -Ofast
+
+# Debugging flags
+#################
+#
+# -fcheck=all  : Checks uninitialized variables,  array subscripts, etc...
+# -g           : Extra debugging information
+#
+[DEBUG]
+#FCFLAGS : -g -msse4.2  -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant -Wuninitialized  -fbacktrace -ffpe-trap=zero,overflow,underflow -finit-real=nan
+FCFLAGS : -g -mavx  -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant -Wuninitialized  -fbacktrace -ffpe-trap=zero,overflow -finit-real=nan
+
+# OpenMP flags
+#################
+#
+[OPENMP]
+FC           : -fopenmp
+IRPF90_FLAGS : --openmp
+
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index daa29079..3d0baa48 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -29,21 +29,20 @@ END_PROVIDER
 
    integer*8                      :: ndim8
    integer                        :: rank
-   double precision               :: tau
+   double precision               :: tau, tau2
    double precision, pointer      :: L(:,:), L_old(:,:)
 
-
    double precision               :: s
-   double precision               :: dscale
+   double precision               :: dscale, dscale_tmp
 
    double precision, allocatable  :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:)
-   integer, allocatable           :: addr1(:,:), addr2(:,:)
-   integer**, allocatable         :: Lset(:), Dset(:), addr3(:,:)
+   integer, allocatable           :: addr1(:), addr2(:)
+   integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
    logical, allocatable           :: computed(:)
 
    integer                        :: i,j,k,m,p,q, dj, p2, q2
    integer*8                      :: i8, j8, p8, qj8
-   integer                        :: N, np, nq
+   integer                        :: N, np, nq, npmax
 
    double precision               :: Dmax, Dmin, Qmax, f
    double precision, external     :: get_ao_two_e_integral
@@ -53,6 +52,7 @@ END_PROVIDER
    integer                        :: block_size, iblock
 
    double precision               :: mem
+   double precision, external     :: memory_of_double, memory_of_int
    double precision, external     :: memory_of_double8, memory_of_int8
 
    integer, external              :: getUnitAndOpen
@@ -61,6 +61,9 @@ END_PROVIDER
    ndim8 = ao_num*ao_num*1_8
    deallocate(cholesky_ao)
 
+
+! TODO : Save L() to disk
+
    if (read_ao_cholesky) then
      print *,  'Reading Cholesky vectors from disk...'
      iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'R')
@@ -85,6 +88,7 @@ END_PROVIDER
      endif
 
      tau = ao_cholesky_threshold
+     tau2 = tau*tau
 
      mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8)
      call check_mem(mem, irp_here)
@@ -92,25 +96,25 @@ END_PROVIDER
      call print_memory_usage()
 
      allocate(L(ndim8,1))
-print *, 'allocate : (L(ndim8,1))', memory_of_double8(ndim8)
+!print *, 'allocate : (L(ndim8,1))', memory_of_double8(ndim8)
 
      print *,  ''
      print *,  'Cholesky decomposition of AO integrals'
      print *,  '======================================'
      print *,  ''
-     print *,  '============ ============ ============='
-     print *,  '    Rank      Block size    Threshold'
-     print *,  '============ ============ ============='
+     print *,  '============ ============='
+     print *,  '    Rank       Threshold'
+     print *,  '============ ============='
 
 
      rank = 0
 
      allocate( D(ndim8), Lset(ndim8), Dset(ndim8) )
-     allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8), )
-print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8)
-print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8)
-print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8)
-print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
+     allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) )
+!print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8)
+!print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8)
+!print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8)
+!print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
 
      ! 1.
      k=0
@@ -124,14 +128,14 @@ print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
      enddo
 
      if (do_direct_integrals) then
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(guided)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
        do i8=1,ndim8
          D(i8) = ao_two_e_integral(addr1(i8), addr2(i8),              &
              addr1(i8), addr2(i8))
        enddo
        !$OMP END PARALLEL DO
      else
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(guided)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
        do i8=1,ndim8
          D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8),          &
              addr2(i8), addr2(i8),                                   &
@@ -143,17 +147,22 @@ print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
      Dmax = maxval(D)
 
      ! 2.
-     np = huge(1_4)
+     npmax = huge(1_4)*1_8
+     np = npmax
      dscale = 1.d0
-     do while (np == huge(1_4))
+     dscale_tmp = Dmax
+     do while (np == npmax)
        np=0
        do p8=1,ndim8
-         if ( dscale*dscale*Dmax*D(p8) > tau*tau ) then
+         if ( dscale_tmp*D(p8) > tau2 ) then
            np = np+1
            Lset(np) = p8
-           if (np == huge(1_4)) then
+           if (np == npmax) then
              ! Overflow detected
-             dscale = dscale*0.5d0
+             dscale = dscale*0.1d0
+             dscale_tmp = dscale*dscale*Dmax
+!print *, 'Overflow detected '
+!print *, 'dscale = ', dscale
              exit
            endif
          endif
@@ -167,7 +176,7 @@ print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
      i = 0
 
      ! 5.
-     do while ( (Dmax > tau).and.(rank < min(ndim8,huge(1_4)) )
+     do while ( (Dmax > tau).and.(rank < min(ndim8,huge(1_4))) )
        ! a.
        i = i+1
 
@@ -191,15 +200,13 @@ print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
 
 
          call total_memory(mem)
-         mem = mem                                                   &
-             + np*memory_of_double(nq)                               &! Delta(np,nq)
-             + (rank+nq)* memory_of_double8(ndim8)
-&! L(ndim8,rank+nq)
-             + (np+nq)*memory_of_double(block_size)    ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
+         mem = mem                                 &
+             + np*memory_of_double(nq)             &! Delta(np,nq)
+             + (rank+nq)* memory_of_double8(ndim8) &! L(ndim8,rank+nq)
+             + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
          if (mem > qp_max_mem) then
            s = s*2.d0
-           block_size = max(block_size / 2, 1)
          else
            exit
          endif
@@ -219,7 +226,7 @@ print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
 
        L_old => L
        allocate(L(ndim8,rank+nq), stat=ierr)
-print *, 'allocate : L(ndim8,rank+nq)', memory_of_double8(ndim8*(rank+nq))
+!print *, 'allocate : L(ndim8,rank+nq)', memory_of_double8(ndim8*(rank+nq))
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -227,7 +234,7 @@ print *, 'allocate : L(ndim8,rank+nq)', memory_of_double8(ndim8*(rank+nq))
          stop -1
        endif
 
-       !$OMP PARALLEL DO PRIVATE(k,j)
+       !$OMP PARALLEL DO PRIVATE(k,j8)
        do k=1,rank
          do j8=1,ndim8
            L(j8,k) = L_old(j8,k)
@@ -238,7 +245,7 @@ print *, 'allocate : L(ndim8,rank+nq)', memory_of_double8(ndim8*(rank+nq))
        deallocate(L_old)
 
        allocate(Delta(np,nq), stat=ierr)
-print *, 'allocate : Delta(np,nq)', memory_of_double8(np*nq*1_8)
+!print *, 'allocate : Delta(np,nq)', memory_of_double8(np*nq*1_8)
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -247,7 +254,7 @@ print *, 'allocate : Delta(np,nq)', memory_of_double8(np*nq*1_8)
        endif
 
        allocate(Ltmp_p(np,block_size), stat=ierr)
-print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8)
+!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -256,7 +263,7 @@ print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8
        endif
 
        allocate(Ltmp_q(nq,block_size), stat=ierr)
-print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8) 
+!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8), nq, block_size
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -266,34 +273,47 @@ print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8
 
 
        allocate(computed(nq))
-print *, 'allocate : computed(nq)', memory_of_int(nq)
+!print *, 'allocate : computed(nq)', memory_of_int(nq)
 
-print *, 'p1'
+!print *, 'N, rank, block_size', N, rank, block_size
+!print *, 'p1'
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
 
+!print *, 'computed'
+       !$OMP DO
+       do q=1,nq
+         computed(q) = .False.
+       enddo
+       !$OMP ENDDO NOWAIT
+
+!print *, 'Delta'
        !$OMP DO
        do q=1,nq
          do j=1,np
            Delta(j,q) = 0.d0
          enddo
-         computed(q) = .False.
        enddo
        !$OMP ENDDO NOWAIT
 
-       !$OMP DO
+!print *, 'Ltmp_p'
        do k=1,N
+         !$OMP DO
          do p=1,np
            Ltmp_p(p,k) = L(Lset(p),k)
          enddo
+         !$OMP END DO NOWAIT
+
+         !$OMP DO
          do q=1,nq
            Ltmp_q(q,k) = L(Dset(q),k)
          enddo
+         !$OMP END DO NOWAIT
        enddo
-       !$OMP END DO NOWAIT
 
        !$OMP BARRIER
        !$OMP END PARALLEL
 
+!print *, 'p2', np, nq, N
        if (N>0) then
          call dgemm('N','T', np, nq, N, -1.d0,                       &
              Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
@@ -315,8 +335,10 @@ print *, 'p1'
          rank = N+j
 
          if (iblock == block_size) then
+!print *, 'dgemm'
            call dgemm('N','T',np,nq,block_size,-1.d0,                &
                Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
+
            iblock = 0
          endif
 
@@ -328,36 +350,47 @@ print *, 'p1'
            endif
          enddo
 
-         L(1:ndim8, rank) = 0.d0
+         do i8=1,ndim8
+           L(i8, rank) = 0.d0
+         enddo
 
          if (.not.computed(dj)) then
            m = dj
-           !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(guided)
-           do k=np,1,-1
-             if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
-                   addr2(Lset(k)), addr2(Dset(m)) ) ) then
-               if (do_direct_integrals) then
-                 Delta(k,m) = Delta(k,m) + &
-                     ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
-                     addr1(Dset(m)), addr2(Dset(m)))
-               else
-                 Delta(k,m) = Delta(k,m) + &
-                     get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
-                     addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
-               endif
-             endif
-           enddo
-           !$OMP END PARALLEL DO
+           if (do_direct_integrals) then
+               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,16)
+               do k=np,1,-1
+                 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
+                       addr2(Lset(k)), addr2(Dset(m)) ) ) then
+                     Delta(k,m) = Delta(k,m) + &
+                         ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
+                         addr1(Dset(m)), addr2(Dset(m)))
+                 endif
+               enddo
+               !$OMP END PARALLEL DO
+           else 
+               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,16)
+               do k=np,1,-1
+                 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
+                       addr2(Lset(k)), addr2(Dset(m)) ) ) then
+                     Delta(k,m) = Delta(k,m) + &
+                         get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
+                         addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
+                 endif
+               enddo
+               !$OMP END PARALLEL DO
+           endif
            computed(dj) = .True.
          endif
 
          iblock = iblock+1
+!print *, iblock
          do p=1,np
            Ltmp_p(p,iblock) = Delta(p,dj)
          enddo
 
          ! iv.
          if (iblock > 1) then
+!print *, 'dgemv', iblock
            call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0,&
                Ltmp_p(1,iblock), 1)
          endif
@@ -365,6 +398,7 @@ print *, 'p1'
          ! iii.
          f = 1.d0/dsqrt(Qmax)
 
+!print *, 'p4'
          !$OMP PARALLEL PRIVATE(p,q) DEFAULT(shared)
          !$OMP DO
          do p=1,np
@@ -379,7 +413,6 @@ print *, 'p1'
            Ltmp_q(q,iblock) = L(Dset(q), rank)
          enddo
          !$OMP END DO
-
          !$OMP END PARALLEL
 
          Qmax = D(Dset(1))
@@ -389,12 +422,12 @@ print *, 'p1'
 
        enddo
 
-       print '(I10, 4X, I10, 4X, ES12.3)', rank, block_size, Qmax
+       print '(I10, 4X, ES12.3)', rank, Qmax
 
-       deallocate(computed)
-       deallocate(Delta)
        deallocate(Ltmp_p)
        deallocate(Ltmp_q)
+       deallocate(computed)
+       deallocate(Delta)
 
        ! i.
        N = rank
@@ -405,17 +438,21 @@ print *, 'p1'
          Dmax = max(Dmax, D(Lset(p)))
        enddo
 
-       np = huge(1_4)
+       np = npmax
        dscale = 1.d0
-       do while (np == huge(1_4))
+       dscale_tmp = Dmax
+       do while (np == npmax)
          np=0
          do p8=1,ndim8
-           if ( dscale*dscale*Dmax*D(p8) > tau*tau ) then
+           if ( dscale_tmp*D(p8) > tau2 ) then
              np = np+1
              Lset(np) = p8
-             if (np == huge(1_4)) then
+             if (np == npmax) then
                ! Overflow detected
                dscale = dscale*0.5d0
+               dscale_tmp = dscale*dscale*Dmax
+!print *, 'Overflow detected '
+!print *, 'dscale = ', dscale
                exit
              endif
            endif
@@ -426,7 +463,7 @@ print *, 'p1'
      enddo
 
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
-print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8)
+!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8)
 
      if (ierr /= 0) then
        call print_memory_usage()
@@ -436,7 +473,7 @@ print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*
      !$OMP PARALLEL DO PRIVATE(k)
      do k=1,rank
        do j=1,ao_num
-         call dcopy(ao_num, L((j-1)*ao_num+1,k), 1, cholesky_ao(1,j,k), 1)
+           cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k)
        enddo
      enddo
      !$OMP END PARALLEL DO

From ff59e9efcc2ab1e4ffaa352e86bbb14491ce1531 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Sun, 2 Jun 2024 19:16:56 +0200
Subject: [PATCH 060/131] added print in src/ao_two_e_ints/cholesky.irp.f

---
 src/ao_two_e_ints/cholesky.irp.f | 4 ++++
 src/mu_of_r/f_hf_cholesky.irp.f  | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 5fbd166c..1d8b8948 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -51,7 +51,9 @@ END_PROVIDER
 
    integer, external              :: getUnitAndOpen
    integer                        :: iunit
+   double precision :: wall0,wall1
 
+   call wall_time(wall0)
    ndim = ao_num*ao_num
    deallocate(cholesky_ao)
 
@@ -409,6 +411,8 @@ END_PROVIDER
 
    print *, 'Rank  : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)'
    print *,  ''
+   call wall_time(wall1)
+   print*,'Time to provide AO cholesky vectors = ',wall1-wall0
 
 END_PROVIDER
 
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 472abb1c..5dd69eb6 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -220,10 +220,10 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
       if(dabs(mo_i_r1).lt.thresh_1)cycle
       do mm = 1, n_basis_orb ! electron 1 
        m = list_basis(mm)
-       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
-       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)*mo_i_r1
+       if(dabs(mo_b_r1).lt.thresh_2)cycle
        do p = 1, cholesky_mo_num
-        accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+        accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_mo_transp(p,m,i)
        enddo
       enddo
      enddo

From c1ca673a6fd39ef574bb5a41b420a18a45f85d58 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 4 Jun 2024 11:24:25 +0200
Subject: [PATCH 061/131] Added omp_lib.F file

---
 etc/paths.rc              | 9 +++++++++
 src/ezfio_files/omp_lib.F | 4 ++++
 2 files changed, 13 insertions(+)
 create mode 100644 src/ezfio_files/omp_lib.F

diff --git a/etc/paths.rc b/etc/paths.rc
index dc1741e8..843ec383 100644
--- a/etc/paths.rc
+++ b/etc/paths.rc
@@ -28,6 +28,15 @@ function qp_prepend_export () {
   fi
 }
 
+function qp_append_export () {
+  eval "value_1="\${$1}""
+  if [[ -z $value_1 ]] ; then
+      echo "${2}:"
+  else
+      echo "${value_1}:${2}"
+  fi
+}
+
 export PYTHONPATH=$(qp_prepend_export "PYTHONPATH" "${QP_EZFIO}/Python":"${QP_PYTHON}")
 
 export PATH=$(qp_prepend_export "PATH" "${QP_PYTHON}":"${QP_ROOT}"/bin:"${QP_ROOT}"/ocaml)
diff --git a/src/ezfio_files/omp_lib.F b/src/ezfio_files/omp_lib.F
new file mode 100644
index 00000000..b3df8e0a
--- /dev/null
+++ b/src/ezfio_files/omp_lib.F
@@ -0,0 +1,4 @@
+       module omp_lib
+#include <omp_lib.h>
+       end module
+

From 2a9b8c56a121fbe9880b9d6bebe344e78ded6355 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 4 Jun 2024 11:32:56 +0200
Subject: [PATCH 062/131] omp_lib was a bad idea...

---
 src/ezfio_files/omp_lib.F | 4 ----
 1 file changed, 4 deletions(-)
 delete mode 100644 src/ezfio_files/omp_lib.F

diff --git a/src/ezfio_files/omp_lib.F b/src/ezfio_files/omp_lib.F
deleted file mode 100644
index b3df8e0a..00000000
--- a/src/ezfio_files/omp_lib.F
+++ /dev/null
@@ -1,4 +0,0 @@
-       module omp_lib
-#include <omp_lib.h>
-       end module
-

From c95a0b2d87f34e07e52f535e1548081c97eba0eb Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 4 Jun 2024 16:15:09 +0200
Subject: [PATCH 063/131] Disk-based cholesky

---
 src/ao_two_e_ints/cholesky.irp.f        | 180 ++++++++++++------------
 src/ao_two_e_ints/two_e_integrals.irp.f |   4 +-
 src/ezfio_files/get_unit_and_open.irp.f |   8 +-
 3 files changed, 96 insertions(+), 96 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index b98dfd5b..f689a65e 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -16,6 +16,7 @@ END_PROVIDER
 
  BEGIN_PROVIDER [ integer, cholesky_ao_num ]
 &BEGIN_PROVIDER [ double precision, cholesky_ao, (ao_num, ao_num, 1) ]
+   use mmap_module
    implicit none
    BEGIN_DOC
    ! Cholesky vectors in AO basis: (ik|a):
@@ -30,19 +31,19 @@ END_PROVIDER
    integer*8                      :: ndim8
    integer                        :: rank
    double precision               :: tau, tau2
-   double precision, pointer      :: L(:,:), L_old(:,:)
+   double precision, pointer      :: L(:,:)
 
    double precision               :: s
    double precision               :: dscale, dscale_tmp
 
-   double precision, allocatable  :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:)
+   double precision, allocatable  :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:)
    integer, allocatable           :: addr1(:), addr2(:)
    integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
    logical, allocatable           :: computed(:)
 
    integer                        :: i,j,k,m,p,q, dj, p2, q2
-   integer*8                      :: i8, j8, p8, qj8
-   integer                        :: N, np, nq, npmax
+   integer*8                      :: i8, j8, p8, qj8, rank_max, np8
+   integer                        :: N, np, nq
 
    double precision               :: Dmax, Dmin, Qmax, f
    double precision, external     :: get_ao_two_e_integral
@@ -61,12 +62,12 @@ END_PROVIDER
    ndim8 = ao_num*ao_num*1_8
    double precision :: wall0,wall1
 
+   type(c_ptr)                    :: c_pointer(2)
+   integer                        :: fd(2)
+
    call wall_time(wall0)
    deallocate(cholesky_ao)
 
-
-! TODO : Save L() to disk
-
    if (read_ao_cholesky) then
      print *,  'Reading Cholesky vectors from disk...'
      iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'R')
@@ -81,6 +82,16 @@ END_PROVIDER
      PROVIDE nucl_coord ao_two_e_integral_schwartz
      call set_multiple_levels_omp(.False.)
 
+     rank_max = min(ndim8,274877906944_8/1_8/ndim8)
+     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., c_pointer(1))
+     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
+!print *, 'rank_max/ndim8', dble(rank_max) / dble(ndim8)
+
+     ! Deleting the file while it is open makes the file invisible on the filesystem,
+     ! and automatically deleted, even if the program crashes
+     iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R')
+     close(iunit,status='delete')
+
      if (do_direct_integrals) then
        if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then
          ! Trigger providers inside ao_two_e_integral
@@ -98,9 +109,6 @@ END_PROVIDER
 
      call print_memory_usage()
 
-     allocate(L(ndim8,1))
-!print *, 'allocate : (L(ndim8,1))', memory_of_double8(ndim8)
-
      print *,  ''
      print *,  'Cholesky decomposition of AO integrals'
      print *,  '======================================'
@@ -112,7 +120,7 @@ END_PROVIDER
 
      rank = 0
 
-     allocate( D(ndim8), Lset(ndim8), Dset(ndim8) )
+     allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8))
      allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) )
 !print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8)
 !print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8)
@@ -132,44 +140,52 @@ END_PROVIDER
 
      if (do_direct_integrals) then
        !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
-       do i8=1,ndim8
+       do i8=ndim8,1,-1
          D(i8) = ao_two_e_integral(addr1(i8), addr2(i8),              &
              addr1(i8), addr2(i8))
        enddo
        !$OMP END PARALLEL DO
      else
        !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
-       do i8=1,ndim8
+       do i8=ndim8,1,-1
          D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8),          &
              addr2(i8), addr2(i8),                                   &
              ao_integrals_map)
        enddo
        !$OMP END PARALLEL DO
      endif
+     D_sorted(:) = -D(:)
+     call dsort_noidx_big(D_sorted,ndim8)
+     D_sorted(:) = dabs(D_sorted(:))
 
-     Dmax = maxval(D)
+     Dmax = D_sorted(1)
 
      ! 2.
-     npmax = huge(1_4)*1_8
-     np = npmax
-     dscale = 1.d0
-     dscale_tmp = Dmax
-     do while (np == npmax)
-       np=0
+     dscale = tau2/Dmax
+     do i8=1,ndim8
+       if (D_sorted(i8) <= dscale) exit
+     enddo
+
+
+     mem = qp_max_mem+1
+     do while ( (mem > qp_max_mem).and.(i8>1_8) )
+       dscale = min(1.d0,dsqrt(tau2/(D_sorted(i8)*Dmax)))
+       dscale_tmp = dscale*dscale*Dmax
+! print *, 'dscale = ', dscale, dble(i8)/dble(ndim8)
+       np8=0_8
        do p8=1,ndim8
          if ( dscale_tmp*D(p8) > tau2 ) then
-           np = np+1
-           Lset(np) = p8
-           if (np == npmax) then
-             ! Overflow detected
-             dscale = dscale*0.1d0
-             dscale_tmp = dscale*dscale*Dmax
-!print *, 'Overflow detected '
-!print *, 'dscale = ', dscale
-             exit
-           endif
+           np8 = np8+1_8
+           Lset(np8) = p8
          endif
        enddo
+       i8 = i8*3_8/4_8
+       if (np8 > huge(1_4)/64_8) cycle
+       np = np8
+! print *, 'np = ', np
+       call resident_memory(mem)
+       mem = mem                                 &
+             + 0.1d0*np*memory_of_double(np)      ! Delta(np,nq)
      enddo
 
      ! 3.
@@ -179,7 +195,7 @@ END_PROVIDER
      i = 0
 
      ! 5.
-     do while ( (Dmax > tau).and.(rank < min(ndim8,huge(1_4))) )
+     do while ( (Dmax > tau).and.(rank*1_8 < min(ndim8,rank_max)) )
        ! a.
        i = i+1
 
@@ -202,12 +218,12 @@ END_PROVIDER
          enddo
 
 
-         call total_memory(mem)
+         call resident_memory(mem)
          mem = mem                                 &
              + np*memory_of_double(nq)             &! Delta(np,nq)
-             + (rank+nq)* memory_of_double8(ndim8) &! L(ndim8,rank+nq)
              + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
+!print *, 'mem = ', mem
          if (mem > qp_max_mem) then
            s = s*2.d0
          else
@@ -217,7 +233,7 @@ END_PROVIDER
          if ((s > 1.d0).or.(nq == 0)) then
            call print_memory_usage()
            print *, 'Required peak memory: ', mem, 'Gb'
-           call total_memory(mem)
+           call resident_memory(mem)
            print *, 'Already used  memory: ', mem, 'Gb'
            print *, 'Not enough memory. Reduce cholesky threshold'
            stop -1
@@ -227,26 +243,6 @@ END_PROVIDER
 
        ! d., e.
 
-       L_old => L
-       allocate(L(ndim8,rank+nq), stat=ierr)
-!print *, 'allocate : L(ndim8,rank+nq)', memory_of_double8(ndim8*(rank+nq))
-
-       if (ierr /= 0) then
-         call print_memory_usage()
-         print *,  irp_here, ': allocation failed : (L(ndim8,rank+nq))'
-         stop -1
-       endif
-
-       !$OMP PARALLEL DO PRIVATE(k,j8)
-       do k=1,rank
-         do j8=1,ndim8
-           L(j8,k) = L_old(j8,k)
-         enddo
-       enddo
-       !$OMP END PARALLEL DO
-
-       deallocate(L_old)
-
        allocate(Delta(np,nq), stat=ierr)
 !print *, 'allocate : Delta(np,nq)', memory_of_double8(np*nq*1_8)
 
@@ -280,39 +276,29 @@ END_PROVIDER
 
 !print *, 'N, rank, block_size', N, rank, block_size
 !print *, 'p1'
-       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
-
-!print *, 'computed'
-       !$OMP DO
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(p,q,j)
        do q=1,nq
          computed(q) = .False.
-       enddo
-       !$OMP ENDDO NOWAIT
-
-!print *, 'Delta'
-       !$OMP DO
-       do q=1,nq
          do j=1,np
            Delta(j,q) = 0.d0
          enddo
        enddo
-       !$OMP ENDDO NOWAIT
+       !$OMP END PARALLEL DO
 
-!print *, 'Ltmp_p'
+       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
        do k=1,N
          !$OMP DO
          do p=1,np
-           Ltmp_p(p,k) = L(Lset(p),k)
+              Ltmp_p(p,k) = L(Lset(p),k)
          enddo
          !$OMP END DO NOWAIT
 
          !$OMP DO
          do q=1,nq
-           Ltmp_q(q,k) = L(Dset(q),k)
+              Ltmp_q(q,k) = L(Dset(q),k)
          enddo
          !$OMP END DO NOWAIT
        enddo
-
        !$OMP BARRIER
        !$OMP END PARALLEL
 
@@ -338,7 +324,7 @@ END_PROVIDER
          rank = N+j
 
          if (iblock == block_size) then
-!print *, 'dgemm'
+!print *, 'dgemm', np, nq
            call dgemm('N','T',np,nq,block_size,-1.d0,                &
                Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
 
@@ -401,7 +387,6 @@ END_PROVIDER
          ! iii.
          f = 1.d0/dsqrt(Qmax)
 
-!print *, 'p4'
          !$OMP PARALLEL PRIVATE(p,q) DEFAULT(shared)
          !$OMP DO
          do p=1,np
@@ -441,30 +426,42 @@ END_PROVIDER
          Dmax = max(Dmax, D(Lset(p)))
        enddo
 
-       np = npmax
-       dscale = 1.d0
-       dscale_tmp = Dmax
-       do while (np == npmax)
-         np=0
+       mem = qp_max_mem+1
+       do while ( (mem > qp_max_mem).and.(i8>1_8) )
+         dscale = min(1.d0,dsqrt(tau2/(D_sorted(i8)*Dmax)))
+         dscale_tmp = dscale*dscale*Dmax
+!print *, 'dscale = ', dscale, dble(i8)/dble(ndim8)
+         np8=0_8
          do p8=1,ndim8
            if ( dscale_tmp*D(p8) > tau2 ) then
-             np = np+1
-             Lset(np) = p8
-             if (np == npmax) then
-               ! Overflow detected
-               dscale = dscale*0.5d0
-               dscale_tmp = dscale*dscale*Dmax
-!print *, 'Overflow detected '
-!print *, 'dscale = ', dscale
-               exit
-             endif
+             np8 = np8+1_8
+             Lset(np8) = p8
            endif
          enddo
+         i8 = i8*3_8/4_8
+         if (np8 > huge(1_4)/64_8) cycle
+         np = np8
+!print *, 'np = ', np
+         call resident_memory(mem)
+         mem = mem                                 &
+               + 0.1d0*np*memory_of_double(np)      ! Delta(np,nq)
        enddo
 
+       if (np == 0) then
+           call print_memory_usage()
+           print *, 'Required peak memory: ', mem, 'Gb'
+           call resident_memory(mem)
+           print *, 'Already used  memory: ', mem, 'Gb'
+           print *, 'Not enough memory. Reduce cholesky threshold'
+           stop -1
+       endif
 
      enddo
 
+
+     print *,  '============ ============='
+     print *,  ''
+
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
 !print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8)
 
@@ -473,18 +470,19 @@ END_PROVIDER
        print *,  irp_here, ': Allocation failed'
        stop -1
      endif
-     !$OMP PARALLEL DO PRIVATE(k)
+
+
+     !$OMP PARALLEL DO PRIVATE(k,j)
      do k=1,rank
        do j=1,ao_num
            cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k)
        enddo
      enddo
      !$OMP END PARALLEL DO
-     deallocate(L)
-     cholesky_ao_num = rank
 
-     print *,  '============ ============='
-     print *,  ''
+     call munmap( (/ ndim8, ndim8 /), 8, fd(1), c_pointer(1) )
+
+     cholesky_ao_num = rank
 
      if (write_ao_cholesky) then
        print *,  'Writing Cholesky vectors to disk...'
diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f
index b55b5f0d..d12f3d45 100644
--- a/src/ao_two_e_ints/two_e_integrals.irp.f
+++ b/src/ao_two_e_ints/two_e_integrals.irp.f
@@ -460,8 +460,8 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_schwartz, (ao_num, ao_num)
   !$OMP PARALLEL DO PRIVATE(i,k)                                     &
       !$OMP DEFAULT(NONE)                                            &
       !$OMP SHARED (ao_num,ao_two_e_integral_schwartz)              &
-      !$OMP SCHEDULE(guided)
-  do i=1,ao_num
+      !$OMP SCHEDULE(dynamic)
+  do i=ao_num,1,-1
     do k=1,i
       ao_two_e_integral_schwartz(i,k) = dsqrt(ao_two_e_integral(i,i,k,k))
       ao_two_e_integral_schwartz(k,i) = ao_two_e_integral_schwartz(i,k)
diff --git a/src/ezfio_files/get_unit_and_open.irp.f b/src/ezfio_files/get_unit_and_open.irp.f
index 6440579f..d6a7efac 100644
--- a/src/ezfio_files/get_unit_and_open.irp.f
+++ b/src/ezfio_files/get_unit_and_open.irp.f
@@ -47,11 +47,13 @@ integer function getUnitAndOpen(f,mode)
     endif
     open(unit=getUnitAndOpen,file=f,status='OLD',action='READ',form='UNFORMATTED')
   else if (mode.eq.'W') then
-    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='WRITE',form='UNFORMATTED')
+    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',form='UNFORMATTED')
+  else if (mode.eq.'A') then
+    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',position='APPEND',form='UNFORMATTED')
   else if (mode.eq.'w') then
-    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='WRITE',form='FORMATTED')
+    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',form='FORMATTED')
   else if (mode.eq.'a') then
-    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='WRITE',position='APPEND',form='FORMATTED')
+    open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',position='APPEND',form='FORMATTED')
   else if (mode.eq.'x') then
     open(unit=getUnitAndOpen,file=new_f,form='FORMATTED')
   endif

From b9f041e5e587e0ce96e845fd437f3fc5abeb3272 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 4 Jun 2024 19:31:39 +0200
Subject: [PATCH 064/131] More I/O in Cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 198 +++++++++++++++----------------
 1 file changed, 94 insertions(+), 104 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index f689a65e..34b91f0f 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -31,12 +31,12 @@ END_PROVIDER
    integer*8                      :: ndim8
    integer                        :: rank
    double precision               :: tau, tau2
-   double precision, pointer      :: L(:,:)
+   double precision, pointer      :: L(:,:), Delta(:,:)
 
    double precision               :: s
    double precision               :: dscale, dscale_tmp
 
-   double precision, allocatable  :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:)
+   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:)
    integer, allocatable           :: addr1(:), addr2(:)
    integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
    logical, allocatable           :: computed(:)
@@ -52,7 +52,7 @@ END_PROVIDER
    double precision, external     :: ao_two_e_integral
    integer                        :: block_size, iblock
 
-   double precision               :: mem
+   double precision               :: mem, mem0
    double precision, external     :: memory_of_double, memory_of_int
    double precision, external     :: memory_of_double8, memory_of_int8
 
@@ -64,8 +64,11 @@ END_PROVIDER
 
    type(c_ptr)                    :: c_pointer(2)
    integer                        :: fd(2)
+   logical                        :: delta_on_disk
 
    call wall_time(wall0)
+
+   ! Will be reallocated at the end
    deallocate(cholesky_ao)
 
    if (read_ao_cholesky) then
@@ -82,11 +85,11 @@ END_PROVIDER
      PROVIDE nucl_coord ao_two_e_integral_schwartz
      call set_multiple_levels_omp(.False.)
 
+     call resident_memory(mem0)
+
      rank_max = min(ndim8,274877906944_8/1_8/ndim8)
      call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., c_pointer(1))
      call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
-!print *, 'rank_max/ndim8', dble(rank_max) / dble(ndim8)
-
      ! Deleting the file while it is open makes the file invisible on the filesystem,
      ! and automatically deleted, even if the program crashes
      iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R')
@@ -161,32 +164,16 @@ END_PROVIDER
      Dmax = D_sorted(1)
 
      ! 2.
-     dscale = tau2/Dmax
-     do i8=1,ndim8
-       if (D_sorted(i8) <= dscale) exit
-     enddo
-
-
-     mem = qp_max_mem+1
-     do while ( (mem > qp_max_mem).and.(i8>1_8) )
-       dscale = min(1.d0,dsqrt(tau2/(D_sorted(i8)*Dmax)))
-       dscale_tmp = dscale*dscale*Dmax
-! print *, 'dscale = ', dscale, dble(i8)/dble(ndim8)
-       np8=0_8
-       do p8=1,ndim8
-         if ( dscale_tmp*D(p8) > tau2 ) then
-           np8 = np8+1_8
-           Lset(np8) = p8
-         endif
-       enddo
-       i8 = i8*3_8/4_8
-       if (np8 > huge(1_4)/64_8) cycle
-       np = np8
-! print *, 'np = ', np
-       call resident_memory(mem)
-       mem = mem                                 &
-             + 0.1d0*np*memory_of_double(np)      ! Delta(np,nq)
+     dscale = 1.d0
+     dscale_tmp = dscale*dscale*Dmax
+     np8=0_8
+     do p8=1,ndim8
+       if ( dscale_tmp*D(p8) > tau2 ) then
+         np8 = np8+1_8
+         Lset(np8) = p8
+       endif
      enddo
+     np = np8
 
      ! 3.
      N = 0
@@ -218,13 +205,11 @@ END_PROVIDER
          enddo
 
 
-         call resident_memory(mem)
-         mem = mem                                 &
-             + np*memory_of_double(nq)             &! Delta(np,nq)
-             + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
+         mem = mem0                                &
+             + np*memory_of_double(nq)
 
 !print *, 'mem = ', mem
-         if (mem > qp_max_mem) then
+         if (mem > 300.d0) then ! 300GB max for Delta
            s = s*2.d0
          else
            exit
@@ -239,18 +224,33 @@ END_PROVIDER
            stop -1
          endif
 
+         if (s > 0.1d0) then
+           exit
+         endif
+
        enddo
 
        ! d., e.
+       mem = mem0                                &
+             + memory_of_int(nq)             &! computed(nq)
+             + np*memory_of_int(nq)          &! computed(nq)
+             + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
-       allocate(Delta(np,nq), stat=ierr)
-!print *, 'allocate : Delta(np,nq)', memory_of_double8(np*nq*1_8)
-
-       if (ierr /= 0) then
-         call print_memory_usage()
-         print *,  irp_here, ': allocation failed : (Delta(np,nq))'
-         stop -1
+       if (mem > qp_max_mem) then
+         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., c_pointer(2))
+         call c_f_pointer(c_pointer(2), Delta, (/ np, nq /))
+         ! Deleting the file while it is open makes the file invisible on the filesystem,
+         ! and automatically deleted, even if the program crashes
+         iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R')
+         close(iunit,status='delete')
+         delta_on_disk = .True.
+       else
+         allocate(Delta(np,nq))
+         delta_on_disk = .False.
        endif
+       print *, delta_on_disk
+
+       allocate(Delta_col(np))
 
        allocate(Ltmp_p(np,block_size), stat=ierr)
 !print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size
@@ -272,40 +272,38 @@ END_PROVIDER
 
 
        allocate(computed(nq))
-!print *, 'allocate : computed(nq)', memory_of_int(nq)
 
 !print *, 'N, rank, block_size', N, rank, block_size
-!print *, 'p1'
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(p,q,j)
-       do q=1,nq
-         computed(q) = .False.
-         do j=1,np
-           Delta(j,q) = 0.d0
-         enddo
-       enddo
-       !$OMP END PARALLEL DO
 
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
        do k=1,N
          !$OMP DO
          do p=1,np
-              Ltmp_p(p,k) = L(Lset(p),k)
+            Ltmp_p(p,k) = L(Lset(p),k)
          enddo
          !$OMP END DO NOWAIT
 
          !$OMP DO
          do q=1,nq
-              Ltmp_q(q,k) = L(Dset(q),k)
+           computed(q) = .False.
+           Ltmp_q(q,k) = L(Dset(q),k)
          enddo
          !$OMP END DO NOWAIT
        enddo
        !$OMP BARRIER
        !$OMP END PARALLEL
 
-!print *, 'p2', np, nq, N
        if (N>0) then
-         call dgemm('N','T', np, nq, N, -1.d0,                       &
-             Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
+           call dgemm('N','T', np, nq, N, -1.d0,                       &
+               Ltmp_p, np, Ltmp_q, nq, 0.d0, Delta, np)
+       else
+         !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) SCHEDULE(static,1)
+         do q=1,nq
+           do j=1,np
+             Delta(j,q) = 0.d0
+           enddo
+         enddo
+         !$OMP END PARALLEL DO
        endif
 
        ! f.
@@ -324,10 +322,8 @@ END_PROVIDER
          rank = N+j
 
          if (iblock == block_size) then
-!print *, 'dgemm', np, nq
            call dgemm('N','T',np,nq,block_size,-1.d0,                &
-               Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
-
+                 Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
            iblock = 0
          endif
 
@@ -343,43 +339,51 @@ END_PROVIDER
            L(i8, rank) = 0.d0
          enddo
 
+         iblock = iblock+1
+         !$OMP PARALLEL DO PRIVATE(p)
+         do p=1,np
+           Ltmp_p(p,iblock) = Delta(p,dj)
+         enddo
+         !$OMP END PARALLEL DO
+
          if (.not.computed(dj)) then
            m = dj
            if (do_direct_integrals) then
-               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,16)
-               do k=np,1,-1
+               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21)
+               do k=1,np
                  if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
                        addr2(Lset(k)), addr2(Dset(m)) ) ) then
-                     Delta(k,m) = Delta(k,m) + &
+                     Delta_col(k) = &
                          ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
                          addr1(Dset(m)), addr2(Dset(m)))
                  endif
                enddo
                !$OMP END PARALLEL DO
-           else 
-               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,16)
-               do k=np,1,-1
+           else
+               !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21)
+               do k=1,np
                  if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
                        addr2(Lset(k)), addr2(Dset(m)) ) ) then
-                     Delta(k,m) = Delta(k,m) + &
+                     Delta_col(k) = &
                          get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
                          addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
                  endif
                enddo
                !$OMP END PARALLEL DO
            endif
+
+           !$OMP PARALLEL DO PRIVATE(p)
+           do p=1,np
+             Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
+             Delta(p,dj) =  Ltmp_p(p,iblock)
+           enddo
+           !$OMP END PARALLEL DO
+
            computed(dj) = .True.
          endif
 
-         iblock = iblock+1
-!print *, iblock
-         do p=1,np
-           Ltmp_p(p,iblock) = Delta(p,dj)
-         enddo
-
          ! iv.
          if (iblock > 1) then
-!print *, 'dgemv', iblock
            call dgemv('N', np, iblock-1, -1.d0, Ltmp_p, np, Ltmp_q(dj,1), nq, 1.d0,&
                Ltmp_p(1,iblock), 1)
          endif
@@ -412,10 +416,15 @@ END_PROVIDER
 
        print '(I10, 4X, ES12.3)', rank, Qmax
 
+       deallocate(Delta_col)
        deallocate(Ltmp_p)
        deallocate(Ltmp_q)
        deallocate(computed)
-       deallocate(Delta)
+       if (delta_on_disk) then
+         call munmap( (/ np*1_8, nq*1_8 /), 8, fd(2), c_pointer(2) )
+       else
+         deallocate(Delta)
+       endif
 
        ! i.
        N = rank
@@ -426,35 +435,16 @@ END_PROVIDER
          Dmax = max(Dmax, D(Lset(p)))
        enddo
 
-       mem = qp_max_mem+1
-       do while ( (mem > qp_max_mem).and.(i8>1_8) )
-         dscale = min(1.d0,dsqrt(tau2/(D_sorted(i8)*Dmax)))
-         dscale_tmp = dscale*dscale*Dmax
-!print *, 'dscale = ', dscale, dble(i8)/dble(ndim8)
-         np8=0_8
-         do p8=1,ndim8
-           if ( dscale_tmp*D(p8) > tau2 ) then
-             np8 = np8+1_8
-             Lset(np8) = p8
-           endif
-         enddo
-         i8 = i8*3_8/4_8
-         if (np8 > huge(1_4)/64_8) cycle
-         np = np8
-!print *, 'np = ', np
-         call resident_memory(mem)
-         mem = mem                                 &
-               + 0.1d0*np*memory_of_double(np)      ! Delta(np,nq)
+       dscale = 1.d0
+       dscale_tmp = dscale*dscale*Dmax
+       np8=0_8
+       do p8=1,ndim8
+         if ( dscale_tmp*D(p8) > tau2 ) then
+           np8 = np8+1_8
+           Lset(np8) = p8
+         endif
        enddo
-
-       if (np == 0) then
-           call print_memory_usage()
-           print *, 'Required peak memory: ', mem, 'Gb'
-           call resident_memory(mem)
-           print *, 'Already used  memory: ', mem, 'Gb'
-           print *, 'Not enough memory. Reduce cholesky threshold'
-           stop -1
-       endif
+       np = np8
 
      enddo
 
@@ -480,7 +470,7 @@ END_PROVIDER
      enddo
      !$OMP END PARALLEL DO
 
-     call munmap( (/ ndim8, ndim8 /), 8, fd(1), c_pointer(1) )
+     call munmap( (/ ndim8, rank_max /), 8, fd(1), c_pointer(1) )
 
      cholesky_ao_num = rank
 

From 19286bede43e5b01b5997ccf5e709a2ade9a4456 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 5 Jun 2024 02:51:12 +0200
Subject: [PATCH 065/131] Initialization

---
 src/ao_two_e_ints/cholesky.irp.f | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 34b91f0f..a7b2389f 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -248,7 +248,7 @@ END_PROVIDER
          allocate(Delta(np,nq))
          delta_on_disk = .False.
        endif
-       print *, delta_on_disk
+!print *, delta_on_disk
 
        allocate(Delta_col(np))
 
@@ -275,7 +275,7 @@ END_PROVIDER
 
 !print *, 'N, rank, block_size', N, rank, block_size
 
-       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q,j)
+       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q)
        do k=1,N
          !$OMP DO
          do p=1,np
@@ -356,6 +356,8 @@ END_PROVIDER
                      Delta_col(k) = &
                          ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
                          addr1(Dset(m)), addr2(Dset(m)))
+                 else
+                     Delta_col(k) = 0.d0
                  endif
                enddo
                !$OMP END PARALLEL DO
@@ -367,6 +369,8 @@ END_PROVIDER
                      Delta_col(k) = &
                          get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
                          addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
+                 else
+                     Delta_col(k) = 0.d0
                  endif
                enddo
                !$OMP END PARALLEL DO

From 36a2f0b46ff535607764b662e8013cba49347ff8 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 5 Jun 2024 03:16:55 +0200
Subject: [PATCH 066/131] Fixed cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index a7b2389f..09131b5d 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -272,9 +272,10 @@ END_PROVIDER
 
 
        allocate(computed(nq))
+       computed(:) = .False.
+
 
 !print *, 'N, rank, block_size', N, rank, block_size
-
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q)
        do k=1,N
          !$OMP DO
@@ -285,7 +286,6 @@ END_PROVIDER
 
          !$OMP DO
          do q=1,nq
-           computed(q) = .False.
            Ltmp_q(q,k) = L(Dset(q),k)
          enddo
          !$OMP END DO NOWAIT

From 2241096a6485a71406c12e5c2ae3165a4a838aeb Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 6 Jun 2024 13:53:30 +0200
Subject: [PATCH 067/131] Trying to improve mmap performance

---
 src/ao_two_e_ints/cholesky.irp.f              |  4 ++--
 .../dav_diag_dressed_ext_rout.irp.f           |  1 -
 ...diag_dressed_ext_rout_nonsym_B1space.irp.f |  2 --
 .../dav_double_dress_ext_rout.irp.f           |  1 -
 .../dav_dressed_ext_rout.irp.f                |  1 -
 src/dav_general_mat/dav_ext_rout.irp.f        |  1 -
 .../dav_ext_rout_nonsym_B1space.irp.f         |  2 --
 src/dav_general_mat/dav_general.irp.f         |  6 +++---
 src/davidson/diagonalization_h_dressed.irp.f  |  2 +-
 .../diagonalization_hcsf_dressed.irp.f        |  2 +-
 .../diagonalization_hs2_dressed.irp.f         |  4 ++--
 .../diagonalization_nonsym_h_dressed.irp.f    |  2 +-
 src/utils/fortran_mmap.c                      | 13 +++++++++----
 src/utils/map_functions.irp.f                 | 12 ++++++------
 src/utils/memory.irp.f                        |  2 +-
 src/utils/mmap.f90                            | 19 +++++++++++--------
 16 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 09131b5d..3cd400f8 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -88,7 +88,7 @@ END_PROVIDER
      call resident_memory(mem0)
 
      rank_max = min(ndim8,274877906944_8/1_8/ndim8)
-     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., c_pointer(1))
+     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
      call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
      ! Deleting the file while it is open makes the file invisible on the filesystem,
      ! and automatically deleted, even if the program crashes
@@ -237,7 +237,7 @@ END_PROVIDER
              + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
        if (mem > qp_max_mem) then
-         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., c_pointer(2))
+         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2))
          call c_f_pointer(c_pointer(2), Delta, (/ np, nq /))
          ! Deleting the file while it is open makes the file invisible on the filesystem,
          ! and automatically deleted, even if the program crashes
diff --git a/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f b/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f
index 0dc939cb..f57b7f92 100644
--- a/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f
@@ -1,6 +1,5 @@
 
 subroutine davidson_general_ext_rout_diag_dressed(u_in,H_jj,Dress_jj,energies,sze,N_st,N_st_diag_in,converged,hcalc)
-  use mmap_module
   implicit none
   BEGIN_DOC
   ! Generic Davidson diagonalization with ONE DIAGONAL DRESSING OPERATOR 
diff --git a/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f b/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f
index 1a8269f4..c8848998 100644
--- a/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f
+++ b/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f
@@ -3,8 +3,6 @@
 
 subroutine davidson_general_diag_dressed_ext_rout_nonsym_b1space(u_in, H_jj, Dress_jj,energies, sze, N_st, N_st_diag_in, converged, hcalc)
 
-  use mmap_module
-
   BEGIN_DOC
   ! Generic modified-Davidson diagonalization 
   !
diff --git a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
index 24f4fa10..1ff6632c 100644
--- a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f
@@ -1,5 +1,4 @@
 subroutine dav_double_dressed(u_in,H_jj,Dress_jj,Dressing_vec,idx_dress,energies,sze,N_st,N_st_diag,converged,hcalc)
-  use mmap_module
   BEGIN_DOC
   ! Generic Davidson diagonalization with TWO DRESSING VECTORS 
   !
diff --git a/src/dav_general_mat/dav_dressed_ext_rout.irp.f b/src/dav_general_mat/dav_dressed_ext_rout.irp.f
index cedaaf0a..ca59a688 100644
--- a/src/dav_general_mat/dav_dressed_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_dressed_ext_rout.irp.f
@@ -1,5 +1,4 @@
 subroutine davidson_general_ext_rout_dressed(u_in,H_jj,energies,sze,N_st,N_st_diag,dressing_state,dressing_vec,idress,converged,hcalc)
-  use mmap_module
   implicit none
   BEGIN_DOC
   ! Davidson diagonalization.
diff --git a/src/dav_general_mat/dav_ext_rout.irp.f b/src/dav_general_mat/dav_ext_rout.irp.f
index deb7e3a9..ad60b2a8 100644
--- a/src/dav_general_mat/dav_ext_rout.irp.f
+++ b/src/dav_general_mat/dav_ext_rout.irp.f
@@ -1,6 +1,5 @@
 
 subroutine davidson_general_ext_rout(u_in,H_jj,energies,sze,N_st,N_st_diag_in,converged,hcalc)
-  use mmap_module
   implicit none
   BEGIN_DOC
   ! Generic Davidson diagonalization 
diff --git a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
index d89aaadb..ca0a835e 100644
--- a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
+++ b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
@@ -3,8 +3,6 @@
 
 subroutine davidson_general_ext_rout_nonsym_b1space(u_in, H_jj, energies, sze, N_st, N_st_diag_in, converged, hcalc)
 
-  use mmap_module
-
   BEGIN_DOC
   ! Generic modified-Davidson diagonalization 
   !
diff --git a/src/dav_general_mat/dav_general.irp.f b/src/dav_general_mat/dav_general.irp.f
index 9940bf1e..a277d9ef 100644
--- a/src/dav_general_mat/dav_general.irp.f
+++ b/src/dav_general_mat/dav_general.irp.f
@@ -1,6 +1,6 @@
 
 subroutine davidson_general(u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag_in,converged,h_mat)
-  use mmap_module
+!  use mmap_module
   implicit none
   BEGIN_DOC
   ! Davidson diagonalization with specific diagonal elements of the H matrix
@@ -160,9 +160,9 @@ subroutine davidson_general(u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag_in,conv
 !    type(c_ptr) :: ptr_w, ptr_s
 !    integer :: fd_s, fd_w
 !    call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-!        8, fd_w, .False., ptr_w)
+!        8, fd_w, .False., .True., ptr_w)
 !    call mmap(trim(ezfio_work_dir)//'davidson_s', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-!        4, fd_s, .False., ptr_s)
+!        4, fd_s, .False., .True., ptr_s)
 !    call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/))
 !    call c_f_pointer(ptr_s, s, (/sze,N_st_diag*itermax/))
 !  else
diff --git a/src/davidson/diagonalization_h_dressed.irp.f b/src/davidson/diagonalization_h_dressed.irp.f
index b7179c18..15bf256d 100644
--- a/src/davidson/diagonalization_h_dressed.irp.f
+++ b/src/davidson/diagonalization_h_dressed.irp.f
@@ -228,7 +228,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_dia
     type(c_ptr) :: ptr_w, ptr_s
     integer :: fd_s, fd_w
     call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-        8, fd_w, .False., ptr_w)
+        8, fd_w, .False., .True., ptr_w)
     call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/))
   else
     allocate(W(sze,N_st_diag*itermax))
diff --git a/src/davidson/diagonalization_hcsf_dressed.irp.f b/src/davidson/diagonalization_hcsf_dressed.irp.f
index fa8aff80..656dd1d9 100644
--- a/src/davidson/diagonalization_hcsf_dressed.irp.f
+++ b/src/davidson/diagonalization_hcsf_dressed.irp.f
@@ -229,7 +229,7 @@ subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N
     type(c_ptr) :: ptr_w, ptr_s
     integer :: fd_s, fd_w
     call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-        8, fd_w, .False., ptr_w)
+        8, fd_w, .False., .True., ptr_w)
     call c_f_pointer(ptr_w, W_csf, (/sze_csf,N_st_diag*itermax/))
   else
     allocate(W(sze,N_st_diag),W_csf(sze_csf,N_st_diag*itermax))
diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f
index fd967ecc..fb04b29b 100644
--- a/src/davidson/diagonalization_hs2_dressed.irp.f
+++ b/src/davidson/diagonalization_hs2_dressed.irp.f
@@ -270,9 +270,9 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
     type(c_ptr) :: ptr_w, ptr_s
     integer :: fd_s, fd_w
     call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-        8, fd_w, .False., ptr_w)
+        8, fd_w, .False., .True., ptr_w)
     call mmap(trim(ezfio_work_dir)//'davidson_s', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-        4, fd_s, .False., ptr_s)
+        4, fd_s, .False., .True., ptr_s)
     call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/))
     call c_f_pointer(ptr_s, s, (/sze,N_st_diag*itermax/))
   else
diff --git a/src/davidson/diagonalization_nonsym_h_dressed.irp.f b/src/davidson/diagonalization_nonsym_h_dressed.irp.f
index 96ca84ab..86df3a19 100644
--- a/src/davidson/diagonalization_nonsym_h_dressed.irp.f
+++ b/src/davidson/diagonalization_nonsym_h_dressed.irp.f
@@ -251,7 +251,7 @@ subroutine davidson_diag_nonsym_hjj(dets_in, u_in, H_jj, energies, dim_in, sze,
     type(c_ptr) :: ptr_w, ptr_s
     integer :: fd_s, fd_w
     call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
-        8, fd_w, .False., ptr_w)
+        8, fd_w, .False., .True., ptr_w)
     call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/))
   else
     allocate(W(sze,N_st_diag*itermax))
diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c
index e8d85a2f..fdf7fb6f 100644
--- a/src/utils/fortran_mmap.c
+++ b/src/utils/fortran_mmap.c
@@ -7,7 +7,7 @@
 #include <sys/mman.h>
 
 
-void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only)
+void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only, int single_node)
 {
     int fd;
     int result;
@@ -21,7 +21,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only)
             perror("Error opening mmap file for reading");
             exit(EXIT_FAILURE);
         }
-        map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0);
+        map = mmap(NULL, bytes, PROT_READ, MAP_PRIVATE, fd, 0);
     }
     else
     {
@@ -39,7 +39,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only)
             perror("Error calling lseek() to stretch the file");
             exit(EXIT_FAILURE);
         }
-        
+
         result = write(fd, "", 1);
         if (result != 1) {
             close(fd);
@@ -48,7 +48,12 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only)
             exit(EXIT_FAILURE);
         }
 
-        map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+        if (single_node == 1) {
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, 
+             MAP_PRIVATE | MAP_HUGETLB , fd, 0);
+        } else {
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_HUGETLB, fd, 0);
+        }
     }
 
     if (map == MAP_FAILED) {
diff --git a/src/utils/map_functions.irp.f b/src/utils/map_functions.irp.f
index 97d0e8bf..e3a62b07 100644
--- a/src/utils/map_functions.irp.f
+++ b/src/utils/map_functions.irp.f
@@ -21,13 +21,13 @@ subroutine map_save_to_disk(filename,map)
     stop 'map already consolidated'
   endif
 
-  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .False., c_pointer(1))
+  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .False., .False., c_pointer(1))
   call c_f_pointer(c_pointer(1),map % consolidated_idx, (/ map % map_size +2_8/))
 
-  call mmap(trim(filename)//'_consolidated_key', (/ n_elements /), cache_key_kind, fd(2), .False., c_pointer(2))
+  call mmap(trim(filename)//'_consolidated_key', (/ n_elements /), cache_key_kind, fd(2), .False., .False., c_pointer(2))
   call c_f_pointer(c_pointer(2),map % consolidated_key, (/ n_elements /))
 
-  call mmap(trim(filename)//'_consolidated_value', (/ n_elements /), integral_kind, fd(3), .False., c_pointer(3))
+  call mmap(trim(filename)//'_consolidated_value', (/ n_elements /), integral_kind, fd(3), .False., .False., c_pointer(3))
   call c_f_pointer(c_pointer(3),map % consolidated_value, (/ n_elements /))
 
   if (.not.associated(map%consolidated_key)) then
@@ -85,15 +85,15 @@ subroutine map_load_from_disk(filename,map)
     stop 'map already consolidated'
   endif
 
-  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .True., c_pointer(1))
+  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .True., .False., c_pointer(1))
   call c_f_pointer(c_pointer(1),map % consolidated_idx, (/ map % map_size + 2_8/))
 
   map% n_elements = map % consolidated_idx (map % map_size+2_8)-1_8
 
-  call mmap(trim(filename)//'_consolidated_key', (/ map % n_elements /), cache_key_kind, fd(2), .True., c_pointer(2))
+  call mmap(trim(filename)//'_consolidated_key', (/ map % n_elements /), cache_key_kind, fd(2), .True., .False., c_pointer(2))
   call c_f_pointer(c_pointer(2),map % consolidated_key, (/ map % n_elements /))
 
-  call mmap(trim(filename)//'_consolidated_value', (/ map % n_elements /), integral_kind, fd(3), .True., c_pointer(3))
+  call mmap(trim(filename)//'_consolidated_value', (/ map % n_elements /), integral_kind, fd(3), .True., .False., c_pointer(3))
   call c_f_pointer(c_pointer(3),map % consolidated_value, (/ map % n_elements /))
 
   l = 0_8
diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f
index 043562db..e2e8dd76 100644
--- a/src/utils/memory.irp.f
+++ b/src/utils/memory.irp.f
@@ -6,7 +6,7 @@ BEGIN_PROVIDER [ integer, qp_max_mem ]
  character*(128) :: env
  integer, external :: get_total_available_memory
 
- qp_max_mem = get_total_available_memory()
+ qp_max_mem = max(get_total_available_memory() - 1,3)
  call write_int(6,qp_max_mem,'Total available memory (GB)')
  call getenv('QP_MAXMEM',env)
  if (trim(env) /= '') then
diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90
index 41e60224..723cb771 100644
--- a/src/utils/mmap.f90
+++ b/src/utils/mmap.f90
@@ -7,12 +7,13 @@ module mmap_module
     ! File descriptors
     ! ----------------
 
-    type(c_ptr) function c_mmap_fortran(filename, length, fd, read_only) bind(c,name='mmap_fortran')
+    type(c_ptr) function c_mmap_fortran(filename, length, fd, read_only, single_node) bind(c,name='mmap_fortran')
       use iso_c_binding
       character(c_char), intent(in)  :: filename(*)
       integer(c_size_t), intent(in), value :: length
       integer(c_int), intent(out)    :: fd
       integer(c_int), intent(in), value    :: read_only
+      integer(c_int), intent(in), value    :: single_node
     end function
 
     subroutine c_munmap_fortran(length, fd, map) bind(c,name='munmap_fortran')
@@ -33,31 +34,33 @@ module mmap_module
 
   contains
 
-  subroutine mmap(filename, shape, bytes, fd, read_only, map)
+  subroutine mmap(filename, shape, bytes, fd, read_only, single_node, map)
       use iso_c_binding
       implicit none
       character*(*), intent(in)      :: filename   ! Name of the mapped file
       integer*8, intent(in)          :: shape(:)   ! Shape of the array to map
       integer, intent(in)            :: bytes      ! Number of bytes per element
       logical, intent(in)            :: read_only  ! If true, mmap is read-only
+      logical, intent(in)            :: single_node! If true, mmap is on a single node
       integer, intent(out)           :: fd         ! File descriptor
       type(c_ptr), intent(out)       :: map        ! C Pointer
 
       integer(c_size_t)              :: length
       integer(c_int)                 :: fd_
 
-      integer :: i
+      integer :: i, read_only_, single_node_
+
+      read_only_ = 0
+      single_node_ = 0
+      if (read_only_) read_only_ = 1
+      if (single_node_) single_node_ = 1
 
       length = int(bytes,8)
       do i=1,size(shape)
         length = length * shape(i)
       enddo
 
-      if (read_only) then
-          map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 1)
-      else
-          map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 0)
-      endif
+      map = c_mmap_fortran( trim(filename)//char(0), length, fd_, read_only, single_node)
       fd = fd_
   end subroutine
 

From 6ae162b6c93d0bb31749086736dea13af9edfb56 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 14:33:13 +0200
Subject: [PATCH 068/131] Disk-based cholesky OK

---
 src/ao_two_e_ints/cholesky.irp.f | 13 ++++++++++---
 src/utils/fortran_mmap.c         |  5 ++---
 src/utils/mmap.f90               |  6 +++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 3cd400f8..6778d5c7 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -66,6 +66,10 @@ END_PROVIDER
    integer                        :: fd(2)
    logical                        :: delta_on_disk
 
+   PROVIDE nproc
+   PROVIDE nucl_coord ao_two_e_integral_schwartz
+   call set_multiple_levels_omp(.False.)
+
    call wall_time(wall0)
 
    ! Will be reallocated at the end
@@ -87,7 +91,7 @@ END_PROVIDER
 
      call resident_memory(mem0)
 
-     rank_max = min(ndim8,274877906944_8/1_8/ndim8)
+     rank_max = min(ndim8,(qp_max_mem*1024_8*1024_8*1024_8/8_8)/ndim8)
      call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
      call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
      ! Deleting the file while it is open makes the file invisible on the filesystem,
@@ -209,7 +213,7 @@ END_PROVIDER
              + np*memory_of_double(nq)
 
 !print *, 'mem = ', mem
-         if (mem > 300.d0) then ! 300GB max for Delta
+         if (mem > qp_max_mem/2) then
            s = s*2.d0
          else
            exit
@@ -231,9 +235,12 @@ END_PROVIDER
        enddo
 
        ! d., e.
-       mem = mem0                                &
+       mem = mem0                            &
              + memory_of_int(nq)             &! computed(nq)
              + np*memory_of_int(nq)          &! computed(nq)
+             + memory_of_double(np)          &! Delta_col(np)
+             + 7*memory_of_double(ndim8)     &! D, Lset, Dset, D_sorted, addr[1-3]
+             + np*memory_of_double(nq)       &! Delta(np,nq)
              + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
        if (mem > qp_max_mem) then
diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c
index fdf7fb6f..711a9c34 100644
--- a/src/utils/fortran_mmap.c
+++ b/src/utils/fortran_mmap.c
@@ -49,10 +49,9 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
         }
 
         if (single_node == 1) {
-          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, 
-             MAP_PRIVATE | MAP_HUGETLB , fd, 0);
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK, fd, 0);
         } else {
-          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_HUGETLB, fd, 0);
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
         }
     }
 
diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90
index 723cb771..af3fe6ed 100644
--- a/src/utils/mmap.f90
+++ b/src/utils/mmap.f90
@@ -46,9 +46,9 @@ module mmap_module
       type(c_ptr), intent(out)       :: map        ! C Pointer
 
       integer(c_size_t)              :: length
-      integer(c_int)                 :: fd_
+      integer(c_int)                 :: fd_, read_only_, single_node_
 
-      integer :: i, read_only_, single_node_
+      integer :: i
 
       read_only_ = 0
       single_node_ = 0
@@ -60,7 +60,7 @@ module mmap_module
         length = length * shape(i)
       enddo
 
-      map = c_mmap_fortran( trim(filename)//char(0), length, fd_, read_only, single_node)
+      map = c_mmap_fortran( trim(filename)//char(0), length, fd_, read_only_, single_node_)
       fd = fd_
   end subroutine
 

From af8973770e9265f6b8f997edde1873b97c1a48da Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 14:39:34 +0200
Subject: [PATCH 069/131] Typo in mmap

---
 src/utils/mmap.f90 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90
index af3fe6ed..e342b422 100644
--- a/src/utils/mmap.f90
+++ b/src/utils/mmap.f90
@@ -52,8 +52,8 @@ module mmap_module
 
       read_only_ = 0
       single_node_ = 0
-      if (read_only_) read_only_ = 1
-      if (single_node_) single_node_ = 1
+      if (read_only) read_only_ = 1
+      if (single_node) single_node_ = 1
 
       length = int(bytes,8)
       do i=1,size(shape)

From f58df5e81669226e728f744308593db5a5e4cad0 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 16:09:53 +0200
Subject: [PATCH 070/131] Added do_mo_cholesky

---
 src/ao_two_e_ints/EZFIO.cfg               |   2 +-
 src/ao_two_e_ints/cholesky.irp.f          |   6 +-
 src/mo_two_e_ints/EZFIO.cfg               |   6 ++
 src/mo_two_e_ints/cholesky.irp.f          | 117 +++++++++++++++-------
 src/mo_two_e_ints/four_idx_novvvv.irp.f   |   9 --
 src/mo_two_e_ints/integrals_3_index.irp.f |   2 +-
 src/mo_two_e_ints/mo_bi_integrals.irp.f   |   2 +-
 src/trexio/import_trexio_integrals.irp.f  |   5 +
 8 files changed, 97 insertions(+), 52 deletions(-)

diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg
index c2e083a3..a985149e 100644
--- a/src/ao_two_e_ints/EZFIO.cfg
+++ b/src/ao_two_e_ints/EZFIO.cfg
@@ -6,7 +6,7 @@ default: None
 
 [io_ao_cholesky]
 type: Disk_access
-doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ] 
+doc: Read/Write |AO| Cholesky integrals from/to disk [ Write | Read | None ] 
 interface: ezfio,provider,ocaml
 default: None
 
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 6778d5c7..a1cd8e5b 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -76,7 +76,7 @@ END_PROVIDER
    deallocate(cholesky_ao)
 
    if (read_ao_cholesky) then
-     print *,  'Reading Cholesky vectors from disk...'
+     print *,  'Reading Cholesky AO vectors from disk...'
      iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'R')
      read(iunit) rank
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
@@ -486,7 +486,7 @@ END_PROVIDER
      cholesky_ao_num = rank
 
      if (write_ao_cholesky) then
-       print *,  'Writing Cholesky vectors to disk...'
+       print *,  'Writing Cholesky AO vectors to disk...'
        iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W')
        write(iunit) rank
        write(iunit) cholesky_ao
@@ -499,7 +499,7 @@ END_PROVIDER
    print *, 'Rank  : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)'
    print *,  ''
    call wall_time(wall1)
-   print*,'Time to provide AO cholesky vectors = ',wall1-wall0
+   print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min'
 
 END_PROVIDER
 
diff --git a/src/mo_two_e_ints/EZFIO.cfg b/src/mo_two_e_ints/EZFIO.cfg
index 088a2416..49a2952c 100644
--- a/src/mo_two_e_ints/EZFIO.cfg
+++ b/src/mo_two_e_ints/EZFIO.cfg
@@ -1,3 +1,9 @@
+[io_mo_cholesky]
+type: Disk_access
+doc: Read/Write |MO| Cholesky integrals from/to disk [ Write | Read | None ]
+interface: ezfio,provider,ocaml
+default: None
+
 [io_mo_two_e_integrals]
 type: Disk_access
 doc: Read/Write |MO| integrals from/to disk [ Write | Read | None ] 
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 0d0989d7..d3affd68 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -1,55 +1,98 @@
+BEGIN_PROVIDER [ logical, do_mo_cholesky ]
+ implicit none
+ BEGIN_DOC
+ ! If True, use Cholesky vectors for MO integrals
+ END_DOC
+ do_mo_cholesky = do_ao_cholesky
+END_PROVIDER
+
 BEGIN_PROVIDER [ integer, cholesky_mo_num ]
  implicit none
  BEGIN_DOC
  ! Number of Cholesky vectors in MO basis
  END_DOC
- cholesky_mo_num = cholesky_ao_num
-END_PROVIDER
-
-BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num) ]
- implicit none
- BEGIN_DOC
- ! Cholesky vectors in MO basis
- END_DOC
-
- integer :: k, i, j
-
- call set_multiple_levels_omp(.False.)
- !$OMP PARALLEL DO PRIVATE(k)
- do k=1,cholesky_mo_num
-  do j=1,mo_num
-    do i=1,mo_num
-      cholesky_mo(i,j,k) = cholesky_mo_transp(k,i,j)
-    enddo
-  enddo
- enddo
- !$OMP END PARALLEL DO
-
+ integer, external              :: getUnitAndOpen
+ integer                        :: iunit
+ if (read_mo_cholesky) then
+   iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'R')
+   read(iunit) cholesky_mo_num
+   close(iunit)
+ else
+   cholesky_mo_num = cholesky_ao_num
+ endif
 END_PROVIDER
 
+!BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num) ]
+! implicit none
+! BEGIN_DOC
+! ! Cholesky vectors in MO basis
+! END_DOC
+!
+! integer :: k, i, j
+!
+! call set_multiple_levels_omp(.False.)
+! !$OMP PARALLEL DO PRIVATE(k)
+! do k=1,cholesky_mo_num
+!  do j=1,mo_num
+!    do i=1,mo_num
+!      cholesky_mo(i,j,k) = cholesky_mo_transp(k,i,j)
+!    enddo
+!  enddo
+! enddo
+! !$OMP END PARALLEL DO
+!
+!END_PROVIDER
+!
 BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num, mo_num) ]
  implicit none
  BEGIN_DOC
- ! Cholesky vectors in MO basis
+ ! Cholesky vectors in MO basis. Warning: it is transposed wrt cholesky_ao:
+ !
+ ! -  cholesky_ao        is (ao_num^2 x cholesky_ao_num)
+ !
+ ! - cholesky_mo_transp is (cholesky_mo_num x mo_num^2)
  END_DOC
 
  double precision, allocatable :: X(:,:,:)
  double precision :: wall0, wall1
- integer :: ierr
- print *, 'AO->MO Transformation of Cholesky vectors'
-  call wall_time(wall0)
+ integer, external              :: getUnitAndOpen
+ integer                        :: iunit, ierr, rank
 
- allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
- if (ierr /= 0) then
-   print *, irp_here, ': Allocation failed'
- endif
- call dgemm('T','N', ao_num*cholesky_mo_num, mo_num, ao_num, 1.d0, &
-     cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_mo_num)
- call dgemm('T','N', cholesky_mo_num*mo_num, mo_num, ao_num, 1.d0, &
-     X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_mo_num*mo_num)
- deallocate(X)
-  call wall_time(wall1)
- print*,'Time for AO->MO Cholesky vectors = ',wall1-wall0
+  if (read_mo_cholesky) then
+      print *,  'Reading Cholesky MO vectors from disk...'
+     iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'R')
+     read(iunit) rank
+     if (cholesky_mo_num /= rank) then
+        stop 'inconsistent rank'
+     endif
+     read(iunit) cholesky_mo_transp
+     close(iunit)
+  else
+     print *, 'AO->MO Transformation of Cholesky vectors'
+     call wall_time(wall0)
+
+     allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
+     if (ierr /= 0) then
+       print *, irp_here, ': Allocation failed'
+     endif
+     call dgemm('T','N', ao_num*cholesky_mo_num, mo_num, ao_num, 1.d0, &
+         cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_mo_num)
+     call dgemm('T','N', cholesky_mo_num*mo_num, mo_num, ao_num, 1.d0, &
+         X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_mo_num*mo_num)
+     deallocate(X)
+     call wall_time(wall1)
+     print*,'Time to provide MO cholesky vectors = ',(wall1-wall0)/60.d0, ' min'
+
+
+     if (write_mo_cholesky) then
+       print *,  'Writing Cholesky MO vectors to disk...'
+       iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W')
+       write(iunit) rank
+       write(iunit) cholesky_mo_transp
+       close(iunit)
+       call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read')
+     endif
+  endif
 
 END_PROVIDER
 
diff --git a/src/mo_two_e_ints/four_idx_novvvv.irp.f b/src/mo_two_e_ints/four_idx_novvvv.irp.f
index 2be09689..80af35dc 100644
--- a/src/mo_two_e_ints/four_idx_novvvv.irp.f
+++ b/src/mo_two_e_ints/four_idx_novvvv.irp.f
@@ -1,12 +1,3 @@
-!BEGIN_PROVIDER [ logical, no_vvvv_integrals  ]
-!  implicit none
-!  BEGIN_DOC
-! If `True`, computes all integrals except for the integrals having 3 or 4 virtual indices
-!  END_DOC
-!
-!  no_vvvv_integrals = .False.
-!END_PROVIDER
-
 BEGIN_PROVIDER [ double precision, mo_coef_novirt, (ao_num,n_core_inact_act_orb) ]
  implicit none
  BEGIN_DOC
diff --git a/src/mo_two_e_ints/integrals_3_index.irp.f b/src/mo_two_e_ints/integrals_3_index.irp.f
index eb05da84..c0dab506 100644
--- a/src/mo_two_e_ints/integrals_3_index.irp.f
+++ b/src/mo_two_e_ints/integrals_3_index.irp.f
@@ -10,7 +10,7 @@
  double precision :: get_two_e_integral
  double precision :: integral
 
- if (do_ao_cholesky) then
+ if (do_mo_cholesky) then
 
     double precision, allocatable :: buffer_jj(:,:), buffer(:,:,:)
     allocate(buffer_jj(cholesky_mo_num,mo_num), buffer(mo_num,mo_num,mo_num))
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index 0e77b6a2..cb3f4bc6 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -1362,7 +1362,7 @@ END_PROVIDER
   double precision               :: get_two_e_integral
 
 
-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
     double precision, allocatable :: buffer(:,:)
     allocate (buffer(cholesky_mo_num,mo_num))
     do k=1,cholesky_mo_num
diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f
index 8c6b79d7..5a6b3c03 100644
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@@ -41,6 +41,11 @@ subroutine run(f)
   integer         , allocatable :: Vi(:,:)
   double precision              :: s
 
+! TODO:
+! - If Cholesky AO in trexio file, read cholesky ao vectors
+! - If Cholesky MO in trexio file, read cholesky mo vectors
+! - If Cholesky MO not in trexio file, force do_cholesky_mo to False
+
   if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then
     rc = trexio_read_nucleus_repulsion(f, s)
     if (rc /= TREXIO_SUCCESS) then

From e55390c70c98d9c475bde599ab3621246cac88af Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 16:11:10 +0200
Subject: [PATCH 071/131] Type error in cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 6778d5c7..ffb37565 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -239,7 +239,7 @@ END_PROVIDER
              + memory_of_int(nq)             &! computed(nq)
              + np*memory_of_int(nq)          &! computed(nq)
              + memory_of_double(np)          &! Delta_col(np)
-             + 7*memory_of_double(ndim8)     &! D, Lset, Dset, D_sorted, addr[1-3]
+             + 7*memory_of_double8(ndim8)     &! D, Lset, Dset, D_sorted, addr[1-3]
              + np*memory_of_double(nq)       &! Delta(np,nq)
              + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 

From b080a7a5e9adecf0544cc34b65dbf967872a77e3 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 16:34:14 +0200
Subject: [PATCH 072/131] Clean Cholesky MO

---
 src/mo_two_e_ints/EZFIO.cfg             |  6 --
 src/mo_two_e_ints/cholesky.irp.f        |  6 +-
 src/mo_two_e_ints/mo_bi_integrals.irp.f | 39 +++++------
 src/mo_two_e_ints/no_vvvv.irp.f         | 88 -------------------------
 4 files changed, 25 insertions(+), 114 deletions(-)
 delete mode 100644 src/mo_two_e_ints/no_vvvv.irp.f

diff --git a/src/mo_two_e_ints/EZFIO.cfg b/src/mo_two_e_ints/EZFIO.cfg
index 49a2952c..c967969f 100644
--- a/src/mo_two_e_ints/EZFIO.cfg
+++ b/src/mo_two_e_ints/EZFIO.cfg
@@ -17,12 +17,6 @@ interface: ezfio,provider,ocaml
 default: 1.e-15
 ezfio_name: threshold_mo
 
-[no_vvvv_integrals]
-type: logical
-doc: If `True`, computes all integrals except for the integrals having 3 or 4 virtual indices
-interface: ezfio,provider,ocaml
-default: false
-
 [io_mo_two_e_integrals_erf]
 type: Disk_access
 doc: Read/Write MO integrals with the long range interaction from/to disk [    Write | Read | None ]
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index d3affd68..5d34fb33 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -68,7 +68,11 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
      read(iunit) cholesky_mo_transp
      close(iunit)
   else
+     print *, ''
      print *, 'AO->MO Transformation of Cholesky vectors'
+     print *, '-----------------------------------------'
+     print *, ''
+
      call wall_time(wall0)
 
      allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
@@ -87,7 +91,7 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
      if (write_mo_cholesky) then
        print *,  'Writing Cholesky MO vectors to disk...'
        iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W')
-       write(iunit) rank
+       write(iunit) cholesky_mo_num
        write(iunit) cholesky_mo_transp
        close(iunit)
        call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read')
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index cb3f4bc6..4b9bf97f 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -39,29 +39,16 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ]
     return
   endif
 
-  if (.not. do_direct_integrals) then
-    PROVIDE ao_two_e_integrals_in_map
-  endif
-
-  print *,  ''
-  print *,  'AO -> MO integrals transformation'
-  print *,  '---------------------------------'
-  print *,  ''
-
   call wall_time(wall_1)
   call cpu_time(cpu_1)
 
-  if(no_vvvv_integrals)then
-    call four_idx_novvvv_old
+  if (do_mo_cholesky) then
+    call add_integrals_to_map_cholesky
   else
-    if (do_ao_cholesky) then
-      call add_integrals_to_map_cholesky
+    if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then
+      call four_idx_dgemm
     else
-      if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then
-        call four_idx_dgemm
-      else
-        call add_integrals_to_map(full_ijkl_bitmask_4)
-      endif
+      call add_integrals_to_map(full_ijkl_bitmask_4)
     endif
   endif
 
@@ -92,8 +79,15 @@ subroutine four_idx_dgemm
   double precision, allocatable :: a1(:,:,:,:)
   double precision, allocatable :: a2(:,:,:,:)
 
+  PROVIDE ao_two_e_integrals_in_map mo_coef
+
+  print *, ''
+  print *, 'DGEMM-based AO->MO Transformation'
+  print *, '---------------------------------'
+  print *, ''
+
   if (ao_num > 1289) then
-    print *,  irp_here, ': Integer overflow in ao_num**3'
+    print *,  irp_here, ': Integer overflow in ao_num**3. Set do_ao_cholesky=.True.'
   endif
 
   allocate (a1(ao_num,ao_num,ao_num,ao_num))
@@ -213,6 +207,12 @@ subroutine add_integrals_to_map(mask_ijkl)
 
   PROVIDE ao_two_e_integrals_in_map  mo_coef
 
+
+  print *, ''
+  print *, 'Sparse AO->MO Transformation'
+  print *, '----------------------------'
+  print *, ''
+
   !Get list of MOs for i,j,k and l
   !-------------------------------
 
@@ -469,6 +469,7 @@ subroutine add_integrals_to_map_cholesky
   integer(key_kind)  , allocatable :: buffer_i(:)
   real(integral_kind), allocatable :: buffer_value(:)
 
+  PROVIDE cholesky_mo_transp
   call set_multiple_levels_omp(.False.)
 
   !$OMP PARALLEL DEFAULT(SHARED) &
diff --git a/src/mo_two_e_ints/no_vvvv.irp.f b/src/mo_two_e_ints/no_vvvv.irp.f
deleted file mode 100644
index 48a7f5e2..00000000
--- a/src/mo_two_e_ints/no_vvvv.irp.f
+++ /dev/null
@@ -1,88 +0,0 @@
-
-subroutine four_idx_novvvv_old
-  use map_module
-  use bitmasks
-  implicit none
-  BEGIN_DOC
-  ! Retransform MO integrals for next CAS-SCF step
-  END_DOC
-  integer(bit_kind)              :: mask_ijkl(N_int,4)
-  integer(bit_kind)              :: mask_ijk(N_int,3)
-
-  print*,'Using partial transformation'
-  print*,'It will not transform all integrals with at least 3 indices within the virtuals'
-    integer                        :: i,j,k,l
-    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  I I I I !!!!!!!!!!!!!!!!!!!!
-    ! (core+inact+act) ^ 4
-    ! <ii|ii>
-    print*, ''
-    print*, '<ii|ii>'
-    do i = 1,N_int
-      mask_ijkl(i,1) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,2) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,3) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,4) =  core_inact_act_bitmask_4(i,1)
-    enddo
-    call add_integrals_to_map(mask_ijkl)
-    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  I I V V !!!!!!!!!!!!!!!!!!!!
-    ! (core+inact+act) ^ 2  (virt) ^2
-    ! <iv|iv>  = J_iv
-    print*, ''
-    print*, '<iv|iv>'
-    do i = 1,N_int
-      mask_ijkl(i,1) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,2) =  virt_bitmask(i,1)
-      mask_ijkl(i,3) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,4) =  virt_bitmask(i,1)
-    enddo
-    call add_integrals_to_map(mask_ijkl)
-
-    ! (core+inact+act) ^ 2  (virt) ^2
-    ! <ii|vv> = (iv|iv)
-    print*, ''
-    print*, '<ii|vv>'
-    do i = 1,N_int
-      mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,2) = core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,3) = virt_bitmask(i,1)
-      mask_ijkl(i,4) = virt_bitmask(i,1)
-    enddo
-    call add_integrals_to_map(mask_ijkl)
-    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! V V V !!!!!!!!!!!!!!!!!!!!!!!
-!    if(.not.no_vvv_integrals)then
-      print*, ''
-      print*, '<rv|sv> and <rv|vs>'
-      do i = 1,N_int
-        mask_ijk(i,1) =  virt_bitmask(i,1)
-        mask_ijk(i,2) =  virt_bitmask(i,1)
-        mask_ijk(i,3) =  virt_bitmask(i,1)
-      enddo
-      call add_integrals_to_map_three_indices(mask_ijk)
-!    endif
-
-    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  I I I V !!!!!!!!!!!!!!!!!!!!
-    ! (core+inact+act) ^ 3  (virt) ^1
-    ! <iv|ii>
-    print*, ''
-    print*, '<iv|ii>'
-    do i = 1,N_int
-      mask_ijkl(i,1) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,2) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,3) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,4) =  virt_bitmask(i,1)
-    enddo
-    call add_integrals_to_map(mask_ijkl)
-    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  I V V V !!!!!!!!!!!!!!!!!!!!
-    ! (core+inact+act) ^ 1  (virt) ^3
-    ! <iv|vv>
-!    if(.not.no_ivvv_integrals)then
-      print*, ''
-      print*, '<iv|vv>'
-      do i = 1,N_int
-        mask_ijkl(i,1) =  core_inact_act_bitmask_4(i,1)
-        mask_ijkl(i,2) =  virt_bitmask(i,1)
-        mask_ijkl(i,3) =  virt_bitmask(i,1)
-        mask_ijkl(i,4) =  virt_bitmask(i,1)
-      enddo
-      call add_integrals_to_map_no_exit_34(mask_ijkl)
-end

From ca98a6b529b3aa47d125e27d4b8031ac88a55c41 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 16:46:25 +0200
Subject: [PATCH 073/131] Fixed previous commit

---
 src/mo_two_e_ints/cholesky.irp.f | 42 ++++++++++++++++----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 5d34fb33..971ab38d 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -22,27 +22,27 @@ BEGIN_PROVIDER [ integer, cholesky_mo_num ]
  endif
 END_PROVIDER
 
-!BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num) ]
-! implicit none
-! BEGIN_DOC
-! ! Cholesky vectors in MO basis
-! END_DOC
-!
-! integer :: k, i, j
-!
-! call set_multiple_levels_omp(.False.)
-! !$OMP PARALLEL DO PRIVATE(k)
-! do k=1,cholesky_mo_num
-!  do j=1,mo_num
-!    do i=1,mo_num
-!      cholesky_mo(i,j,k) = cholesky_mo_transp(k,i,j)
-!    enddo
-!  enddo
-! enddo
-! !$OMP END PARALLEL DO
-!
-!END_PROVIDER
-!
+BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num) ]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors in MO basis
+ END_DOC
+
+ integer :: k, i, j
+
+ call set_multiple_levels_omp(.False.)
+ !$OMP PARALLEL DO PRIVATE(k)
+ do k=1,cholesky_mo_num
+  do j=1,mo_num
+    do i=1,mo_num
+      cholesky_mo(i,j,k) = cholesky_mo_transp(k,i,j)
+    enddo
+  enddo
+ enddo
+ !$OMP END PARALLEL DO
+
+END_PROVIDER
+
 BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num, mo_num) ]
  implicit none
  BEGIN_DOC

From 7e1ed69eef611e6ff0336814c8ffded0eb9cf323 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 7 Jun 2024 18:03:51 +0200
Subject: [PATCH 074/131] Starting Cholesky transition

---
 src/mo_two_e_ints/map_integrals.irp.f   |  38 +-
 src/mo_two_e_ints/mo_bi_integrals.irp.f | 856 +-----------------------
 2 files changed, 57 insertions(+), 837 deletions(-)

diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index ada256a2..290fdeab 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -98,7 +98,10 @@ double precision function get_two_e_integral(i,j,k,l,map)
   integer*8                      :: ii_8
   type(map_type), intent(inout)  :: map
   real(integral_kind)            :: tmp
-  PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
+  integer                        :: kk
+
+  PROVIDE mo_two_e_integrals_in_map mo_integrals_cache do_mo_cholesky 
+
   if (use_banned_excitation) then
     if (banned_excitation(i,k)) then
       get_two_e_integral = 0.d0
@@ -109,22 +112,43 @@ double precision function get_two_e_integral(i,j,k,l,map)
       return
     endif
   endif
+
+
   ii = l-mo_integrals_cache_min
   ii = ior(ii, k-mo_integrals_cache_min)
   ii = ior(ii, j-mo_integrals_cache_min)
   ii = ior(ii, i-mo_integrals_cache_min)
-  if (iand(ii, -128) /= 0) then
-    !DIR$ FORCEINLINE
-    call two_e_integrals_index(i,j,k,l,idx)
-    !DIR$ FORCEINLINE
-    call map_get(map,idx,tmp)
-    get_two_e_integral = dble(tmp)
+
+!  if (iand(ii, -128) /= 0) then
+  if (.True.) then
+    ! Integral is not in the cache
+
+    if  (do_mo_cholesky) then
+
+      get_two_e_integral = 0.d0
+      do kk=1,cholesky_mo_num
+        get_two_e_integral = get_two_e_integral + cholesky_mo_transp(kk,i,k) * cholesky_mo_transp(kk,j,l)
+      enddo
+
+    else
+      ! Integrals is in the map
+
+      !DIR$ FORCEINLINE
+      call two_e_integrals_index(i,j,k,l,idx)
+      !DIR$ FORCEINLINE
+      call map_get(map,idx,tmp)
+      get_two_e_integral = dble(tmp)
+    endif
+
   else
+    ! Integrals is in the cache
+
     ii_8 = int(l,8)-mo_integrals_cache_min_8
     ii_8 = ior( shiftl(ii_8,7), int(k,8)-mo_integrals_cache_min_8)
     ii_8 = ior( shiftl(ii_8,7), int(j,8)-mo_integrals_cache_min_8)
     ii_8 = ior( shiftl(ii_8,7), int(i,8)-mo_integrals_cache_min_8)
     get_two_e_integral = mo_integrals_cache(ii_8)
+
   endif
 end
 
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index 4b9bf97f..d44bb38a 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -1,3 +1,26 @@
+! 1,2-index integrals are always taken from:
+! - mo_two_e_integrals_jj_exchange
+! - mo_two_e_integrals_jj_anti
+! - mo_two_e_integrals_jj
+! 
+! 3-index integrals are always taken from:
+! - big_array_exchange_integrals
+! - big_array_coulomb_integrals
+!
+! If (do_mo_cholesky):
+! - Integrals with four 4 active orbitals are stored in the cache map,
+!   all other integrals are used from cholesky vectors
+! - 1,2,3-index arrays are built from cholesky vectors
+! Else:
+! - All integrals are stored in the map or cache map
+! - 1,2,3-index arrays are built from the map
+!
+! TODO:
+! - build cache map from cholesky vectors
+! - get_mo_integrals using cholesky
+! - get_mo_integralss using cholesky
+! - get_mo_integralss in PT2
+
 subroutine mo_two_e_integrals_index(i,j,k,l,i1)
   use map_module
   implicit none
@@ -453,6 +476,9 @@ subroutine add_integrals_to_map(mask_ijkl)
 
 end
 
+
+
+
 subroutine add_integrals_to_map_cholesky
   use bitmasks
   implicit none
@@ -516,837 +542,7 @@ subroutine add_integrals_to_map_cholesky
 
 end
 
-subroutine add_integrals_to_map_three_indices(mask_ijk)
-  use bitmasks
-  implicit none
 
-  BEGIN_DOC
-  ! Adds integrals to the MO map according to some bitmask
-  END_DOC
-
-  integer(bit_kind), intent(in)  :: mask_ijk(N_int,3)
-
-  integer                        :: i,j,k,l
-  integer                        :: i0,j0,k0,l0
-  double precision               :: c, cpu_1, cpu_2, wall_1, wall_2, wall_0
-
-  integer, allocatable           :: list_ijkl(:,:)
-  integer                        :: n_i, n_j, n_k
-  integer                        :: m
-  integer, allocatable           :: two_e_tmp_0_idx(:)
-  real(integral_kind), allocatable :: two_e_tmp_0(:,:)
-  double precision, allocatable  :: two_e_tmp_1(:)
-  double precision, allocatable  :: two_e_tmp_2(:,:)
-  double precision, allocatable  :: two_e_tmp_3(:,:,:)
-  !DIR$ ATTRIBUTES ALIGN : 64    :: two_e_tmp_1, two_e_tmp_2, two_e_tmp_3
-
-  integer                        :: n_integrals
-  integer                        :: size_buffer
-  integer(key_kind),allocatable  :: buffer_i(:)
-  real(integral_kind),allocatable :: buffer_value(:)
-  double precision               :: map_mb
-
-  integer                        :: i1,j1,k1,l1, ii1, kmax, thread_num
-  integer                        :: i2,i3,i4
-  double precision,parameter     :: thr_coef = 1.d-10
-
-  PROVIDE ao_two_e_integrals_in_map  mo_coef
-
-  !Get list of MOs for i,j,k and l
-  !-------------------------------
-
-  allocate(list_ijkl(mo_num,4))
-  call bitstring_to_list( mask_ijk(1,1), list_ijkl(1,1), n_i, N_int )
-  call bitstring_to_list( mask_ijk(1,2), list_ijkl(1,2), n_j, N_int )
-  call bitstring_to_list( mask_ijk(1,3), list_ijkl(1,3), n_k, N_int )
-  j = 0
-  do i = 1, N_int
-    j += popcnt(mask_ijk(i,1))
-  enddo
-  if(j==0)then
-    return
-  endif
-
-  j = 0
-  do i = 1, N_int
-    j += popcnt(mask_ijk(i,2))
-  enddo
-  if(j==0)then
-    return
-  endif
-
-  j = 0
-  do i = 1, N_int
-    j += popcnt(mask_ijk(i,3))
-  enddo
-  if(j==0)then
-    return
-  endif
-
-  if (ao_num > 1289) then
-    print *,  irp_here, ': Integer overflow in ao_num**3'
-  endif
-  size_buffer = min(ao_num*ao_num*ao_num,16000000)
-  print*, 'Providing the molecular integrals '
-  print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+&
-      ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core'
-
-  call wall_time(wall_1)
-  call cpu_time(cpu_1)
-  !$OMP PARALLEL PRIVATE(m,l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, &
-      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
-      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num)                             &
-      !$OMP  DEFAULT(NONE)                                           &
-      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,       &
-      !$OMP  mo_coef_transp,                                         &
-      !$OMP  mo_coef_transp_is_built, list_ijkl,                     &
-      !$OMP  mo_coef_is_built, wall_1,                               &
-      !$OMP  mo_coef,mo_integrals_threshold,mo_integrals_map)
-  n_integrals = 0
-  wall_0 = wall_1
-  allocate(two_e_tmp_3(mo_num, n_j, n_k),                 &
-      two_e_tmp_1(mo_num),                                &
-      two_e_tmp_0(ao_num,ao_num),                             &
-      two_e_tmp_0_idx(ao_num),                                &
-      two_e_tmp_2(mo_num, n_j),                           &
-      buffer_i(size_buffer),                                   &
-      buffer_value(size_buffer) )
-
-  thread_num = 0
-  !$  thread_num = omp_get_thread_num()
-  !$OMP DO SCHEDULE(guided)
-  do l1 = 1,ao_num
-    two_e_tmp_3 = 0.d0
-    do k1 = 1,ao_num
-      two_e_tmp_2 = 0.d0
-      do j1 = 1,ao_num
-        call get_ao_two_e_integrals(j1,k1,l1,ao_num,two_e_tmp_0(1,j1))
-      enddo
-      do j1 = 1,ao_num
-        kmax = 0
-        do i1 = 1,ao_num
-          c = two_e_tmp_0(i1,j1)
-          if (c == 0.d0) then
-            cycle
-          endif
-          kmax += 1
-          two_e_tmp_0(kmax,j1) = c
-          two_e_tmp_0_idx(kmax) = i1
-        enddo
-
-        if (kmax==0) then
-          cycle
-        endif
-
-        two_e_tmp_1 = 0.d0
-        ii1=1
-        do ii1 = 1,kmax-4,4
-          i1 = two_e_tmp_0_idx(ii1)
-          i2 = two_e_tmp_0_idx(ii1+1)
-          i3 = two_e_tmp_0_idx(ii1+2)
-          i4 = two_e_tmp_0_idx(ii1+3)
-          do i = list_ijkl(1,1), list_ijkl(n_i,1)
-            two_e_tmp_1(i)  =  two_e_tmp_1(i) +                    &
-                mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1) +        &
-                mo_coef_transp(i,i2) * two_e_tmp_0(ii1+1,j1) +      &
-                mo_coef_transp(i,i3) * two_e_tmp_0(ii1+2,j1) +      &
-                mo_coef_transp(i,i4) * two_e_tmp_0(ii1+3,j1)
-          enddo ! i
-        enddo  ! ii1
-
-        i2 = ii1
-        do ii1 = i2,kmax
-          i1 = two_e_tmp_0_idx(ii1)
-          do i = list_ijkl(1,1), list_ijkl(n_i,1)
-            two_e_tmp_1(i) = two_e_tmp_1(i) + mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1)
-          enddo ! i
-        enddo  ! ii1
-        c = 0.d0
-
-        do i = list_ijkl(1,1), list_ijkl(n_i,1)
-          c = max(c,abs(two_e_tmp_1(i)))
-          if (c>mo_integrals_threshold) exit
-        enddo
-        if ( c < mo_integrals_threshold ) then
-          cycle
-        endif
-
-        do j0 = 1, n_j
-          j = list_ijkl(j0,2)
-          c = mo_coef_transp(j,j1)
-          if (abs(c) < thr_coef) then
-            cycle
-          endif
-          do i = list_ijkl(1,1), list_ijkl(n_i,1)
-            two_e_tmp_2(i,j0)  = two_e_tmp_2(i,j0) + c * two_e_tmp_1(i)
-          enddo ! i
-        enddo  ! j
-      enddo !j1
-      if ( maxval(abs(two_e_tmp_2)) < mo_integrals_threshold ) then
-        cycle
-      endif
-
-
-      do k0 = 1, n_k
-        k = list_ijkl(k0,3)
-        c = mo_coef_transp(k,k1)
-        if (abs(c) < thr_coef) then
-          cycle
-        endif
-
-        do j0 = 1, n_j
-          j = list_ijkl(j0,2)
-          do i = list_ijkl(1,1), k
-            two_e_tmp_3(i,j0,k0) = two_e_tmp_3(i,j0,k0) + c* two_e_tmp_2(i,j0)
-          enddo!i
-        enddo !j
-
-      enddo  !k
-    enddo   !k1
-
-
-
-    do l0 = 1,n_j
-      l = list_ijkl(l0,2)
-      c = mo_coef_transp(l,l1)
-      if (abs(c) < thr_coef) then
-        cycle
-      endif
-      do k0 = 1, n_k
-        k = list_ijkl(k0,3)
-        i1 = shiftr((k*k-k),1)
-        two_e_tmp_1 = 0.d0
-        j0 = l0
-        j = list_ijkl(j0,2)
-        do i0 = 1, n_i
-          i = list_ijkl(i0,1)
-          if (i>k) then
-            exit
-          endif
-          two_e_tmp_1(i) = c*two_e_tmp_3(i,j0,k0)
-        enddo
-
-        do i0 = 1, n_i
-          i = list_ijkl(i0,1)
-          if (i>k) then !min(k,j1-i1)
-            exit
-          endif
-          if (abs(two_e_tmp_1(i)) < mo_integrals_threshold) then
-            cycle
-          endif
-          n_integrals += 1
-          buffer_value(n_integrals) = two_e_tmp_1(i)
-          if(i==k .and. j==l .and. i.ne.j)then
-            buffer_value(n_integrals) = buffer_value(n_integrals) *0.5d0
-          endif
-          !DIR$ FORCEINLINE
-          call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals))
-          if (n_integrals == size_buffer) then
-            call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
-                real(mo_integrals_threshold,integral_kind))
-            n_integrals = 0
-          endif
-        enddo
-      enddo
-    enddo
-
-    do l0 = 1,n_j
-      l = list_ijkl(l0,2)
-      c = mo_coef_transp(l,l1)
-      if (abs(c) < thr_coef) then
-        cycle
-      endif
-      do k0 = 1, n_k
-        k = list_ijkl(k0,3)
-        i1 = shiftr((k*k-k),1)
-        two_e_tmp_1 = 0.d0
-        j0 = k0
-        j = list_ijkl(k0,2)
-        i0 = l0
-        i = list_ijkl(i0,2)
-        if (k==l) then
-          cycle
-        endif
-        two_e_tmp_1(i) = c*two_e_tmp_3(i,j0,k0)
-
-        n_integrals += 1
-        buffer_value(n_integrals) = two_e_tmp_1(i)
-        !DIR$ FORCEINLINE
-        call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals))
-        if (n_integrals == size_buffer) then
-          call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
-              real(mo_integrals_threshold,integral_kind))
-          n_integrals = 0
-        endif
-      enddo
-    enddo
-
-    call wall_time(wall_2)
-    if (thread_num == 0) then
-      if (wall_2 - wall_0 > 1.d0) then
-        wall_0 = wall_2
-        print*, 100.*float(l1)/float(ao_num), '% in ',               &
-            wall_2-wall_1, 's', map_mb(mo_integrals_map) ,'MB'
-      endif
-    endif
-  enddo
-  !$OMP END DO NOWAIT
-  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)
-
-  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
-      real(mo_integrals_threshold,integral_kind))
-  deallocate(buffer_i, buffer_value)
-  !$OMP END PARALLEL
-  call map_merge(mo_integrals_map)
-
-  call wall_time(wall_2)
-  call cpu_time(cpu_2)
-  integer*8                      :: get_mo_map_size, mo_map_size
-  mo_map_size = get_mo_map_size()
-
-  deallocate(list_ijkl)
-
-
-  print*,'Molecular integrals provided:'
-  print*,' Size of MO map           ', map_mb(mo_integrals_map) ,'MB'
-  print*,' Number of MO integrals: ',  mo_map_size
-  print*,' cpu  time :',cpu_2 - cpu_1, 's'
-  print*,' wall time :',wall_2 - wall_1, 's  ( x ', (cpu_2-cpu_1)/(wall_2-wall_1), ')'
-
-end
-
-
-subroutine add_integrals_to_map_no_exit_34(mask_ijkl)
-  use bitmasks
-  implicit none
-
-  BEGIN_DOC
-  ! Adds integrals to tha MO map according to some bitmask
-  END_DOC
-
-  integer(bit_kind), intent(in)  :: mask_ijkl(N_int,4)
-
-  integer                        :: i,j,k,l
-  integer                        :: i0,j0,k0,l0
-  double precision               :: c, cpu_1, cpu_2, wall_1, wall_2, wall_0
-
-  integer, allocatable           :: list_ijkl(:,:)
-  integer                        :: n_i, n_j, n_k, n_l
-  integer, allocatable           :: two_e_tmp_0_idx(:)
-  real(integral_kind), allocatable :: two_e_tmp_0(:,:)
-  double precision, allocatable  :: two_e_tmp_1(:)
-  double precision, allocatable  :: two_e_tmp_2(:,:)
-  double precision, allocatable  :: two_e_tmp_3(:,:,:)
-  !DIR$ ATTRIBUTES ALIGN : 64    :: two_e_tmp_1, two_e_tmp_2, two_e_tmp_3
-
-  integer                        :: n_integrals
-  integer                        :: size_buffer
-  integer(key_kind),allocatable  :: buffer_i(:)
-  real(integral_kind),allocatable :: buffer_value(:)
-  double precision               :: map_mb
-
-  integer                        :: i1,j1,k1,l1, ii1, kmax, thread_num
-  integer                        :: i2,i3,i4
-  double precision,parameter     :: thr_coef = 1.d-10
-
-  PROVIDE ao_two_e_integrals_in_map  mo_coef
-
-  !Get list of MOs for i,j,k and l
-  !-------------------------------
-
-  allocate(list_ijkl(mo_num,4))
-  call bitstring_to_list( mask_ijkl(1,1), list_ijkl(1,1), n_i, N_int )
-  call bitstring_to_list( mask_ijkl(1,2), list_ijkl(1,2), n_j, N_int )
-  call bitstring_to_list( mask_ijkl(1,3), list_ijkl(1,3), n_k, N_int )
-  call bitstring_to_list( mask_ijkl(1,4), list_ijkl(1,4), n_l, N_int )
-
-  if (ao_num > 1289) then
-    print *,  irp_here, ': Integer overflow in ao_num**3'
-  endif
-  size_buffer = min(ao_num*ao_num*ao_num,16000000)
-  print*, 'Providing the molecular integrals '
-  print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+&
-      ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core'
-
-  call wall_time(wall_1)
-  call cpu_time(cpu_1)
-
-  !$OMP PARALLEL PRIVATE(l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax,   &
-      !$OMP  two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,&
-      !$OMP  buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0,   &
-      !$OMP  wall_0,thread_num)                                      &
-      !$OMP  DEFAULT(NONE)                                           &
-      !$OMP  SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,n_l,   &
-      !$OMP  mo_coef_transp,                                         &
-      !$OMP  mo_coef_transp_is_built, list_ijkl,                     &
-      !$OMP  mo_coef_is_built, wall_1,                               &
-      !$OMP  mo_coef,mo_integrals_threshold,mo_integrals_map)
-  n_integrals = 0
-  wall_0 = wall_1
-  allocate(two_e_tmp_3(mo_num, n_j, n_k),                 &
-      two_e_tmp_1(mo_num),                                &
-      two_e_tmp_0(ao_num,ao_num),                                   &
-      two_e_tmp_0_idx(ao_num),                                      &
-      two_e_tmp_2(mo_num, n_j),                           &
-      buffer_i(size_buffer),                                         &
-      buffer_value(size_buffer) )
-
-  thread_num = 0
-  !$  thread_num = omp_get_thread_num()
-  !$OMP DO SCHEDULE(guided)
-  do l1 = 1,ao_num
-    !IRP_IF COARRAY
-    !    if (mod(l1-this_image(),num_images()) /= 0 ) then
-    !      cycle
-    !    endif
-    !IRP_ENDIF
-    two_e_tmp_3 = 0.d0
-    do k1 = 1,ao_num
-      two_e_tmp_2 = 0.d0
-      do j1 = 1,ao_num
-        call get_ao_two_e_integrals(j1,k1,l1,ao_num,two_e_tmp_0(1,j1))
-      enddo
-      do j1 = 1,ao_num
-        kmax = 0
-        do i1 = 1,ao_num
-          c = two_e_tmp_0(i1,j1)
-          if (c == 0.d0) then
-            cycle
-          endif
-          kmax += 1
-          two_e_tmp_0(kmax,j1) = c
-          two_e_tmp_0_idx(kmax) = i1
-        enddo
-
-        if (kmax==0) then
-          cycle
-        endif
-
-        two_e_tmp_1 = 0.d0
-        ii1=1
-        do ii1 = 1,kmax-4,4
-          i1 = two_e_tmp_0_idx(ii1)
-          i2 = two_e_tmp_0_idx(ii1+1)
-          i3 = two_e_tmp_0_idx(ii1+2)
-          i4 = two_e_tmp_0_idx(ii1+3)
-          do i = list_ijkl(1,1), list_ijkl(n_i,1)
-            two_e_tmp_1(i)  =  two_e_tmp_1(i) +                    &
-                mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1) +        &
-                mo_coef_transp(i,i2) * two_e_tmp_0(ii1+1,j1) +      &
-                mo_coef_transp(i,i3) * two_e_tmp_0(ii1+2,j1) +      &
-                mo_coef_transp(i,i4) * two_e_tmp_0(ii1+3,j1)
-          enddo ! i
-        enddo  ! ii1
-
-        i2 = ii1
-        do ii1 = i2,kmax
-          i1 = two_e_tmp_0_idx(ii1)
-          do i = list_ijkl(1,1), list_ijkl(n_i,1)
-            two_e_tmp_1(i) = two_e_tmp_1(i) + mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1)
-          enddo ! i
-        enddo  ! ii1
-        c = 0.d0
-
-        do i = list_ijkl(1,1), list_ijkl(n_i,1)
-          c = max(c,abs(two_e_tmp_1(i)))
-          if (c>mo_integrals_threshold) exit
-        enddo
-        if ( c < mo_integrals_threshold ) then
-          cycle
-        endif
-
-        do j0 = 1, n_j
-          j = list_ijkl(j0,2)
-          c = mo_coef_transp(j,j1)
-          if (abs(c) < thr_coef) then
-            cycle
-          endif
-          do i = list_ijkl(1,1), list_ijkl(n_i,1)
-            two_e_tmp_2(i,j0)  = two_e_tmp_2(i,j0) + c * two_e_tmp_1(i)
-          enddo ! i
-        enddo  ! j
-      enddo !j1
-      if ( maxval(abs(two_e_tmp_2)) < mo_integrals_threshold ) then
-        cycle
-      endif
-
-
-      do k0 = 1, n_k
-        k = list_ijkl(k0,3)
-        c = mo_coef_transp(k,k1)
-        if (abs(c) < thr_coef) then
-          cycle
-        endif
-
-        do j0 = 1, n_j
-          j = list_ijkl(j0,2)
-          do i = list_ijkl(1,1), k
-            two_e_tmp_3(i,j0,k0) = two_e_tmp_3(i,j0,k0) + c* two_e_tmp_2(i,j0)
-          enddo!i
-        enddo !j
-
-      enddo  !k
-    enddo   !k1
-
-
-
-    do l0 = 1,n_l
-      l = list_ijkl(l0,4)
-      c = mo_coef_transp(l,l1)
-      if (abs(c) < thr_coef) then
-        cycle
-      endif
-      j1 = shiftr((l*l-l),1)
-      do j0 = 1, n_j
-        j = list_ijkl(j0,2)
-        if (j > l)  then
-          exit
-        endif
-        j1 += 1
-        do k0 = 1, n_k
-          k = list_ijkl(k0,3)
-          i1 = shiftr((k*k-k),1)
-          two_e_tmp_1 = 0.d0
-          do i0 = 1, n_i
-            i = list_ijkl(i0,1)
-            if (i>k) then
-              exit
-            endif
-            two_e_tmp_1(i) = c*two_e_tmp_3(i,j0,k0)
-          enddo
-
-          do i0 = 1, n_i
-            i = list_ijkl(i0,1)
-            if(i> k)then
-              exit
-            endif
-
-            if (abs(two_e_tmp_1(i)) < mo_integrals_threshold) then
-              cycle
-            endif
-            n_integrals += 1
-            buffer_value(n_integrals) = two_e_tmp_1(i)
-            !DIR$ FORCEINLINE
-            call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals))
-            if (n_integrals == size_buffer) then
-              call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
-                  real(mo_integrals_threshold,integral_kind))
-              n_integrals = 0
-            endif
-          enddo
-        enddo
-      enddo
-    enddo
-
-    call wall_time(wall_2)
-    if (thread_num == 0) then
-      if (wall_2 - wall_0 > 1.d0) then
-        wall_0 = wall_2
-        print*, 100.*float(l1)/float(ao_num), '% in ',               &
-            wall_2-wall_1, 's', map_mb(mo_integrals_map) ,'MB'
-      endif
-    endif
-  enddo
-  !$OMP END DO NOWAIT
-  deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3)
-
-  call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,&
-      real(mo_integrals_threshold,integral_kind))
-  deallocate(buffer_i, buffer_value)
-  !$OMP END PARALLEL
-  !IRP_IF COARRAY
-  !  print*, 'Communicating the map'
-  !  call communicate_mo_integrals()
-  !IRP_ENDIF
-  call map_merge(mo_integrals_map)
-
-  call wall_time(wall_2)
-  call cpu_time(cpu_2)
-  integer*8                      :: get_mo_map_size, mo_map_size
-  mo_map_size = get_mo_map_size()
-
-  deallocate(list_ijkl)
-
-
-  print*,'Molecular integrals provided:'
-  print*,' Size of MO map           ', map_mb(mo_integrals_map) ,'MB'
-  print*,' Number of MO integrals: ',  mo_map_size
-  print*,' cpu  time :',cpu_2 - cpu_1, 's'
-  print*,' wall time :',wall_2 - wall_1, 's  ( x ', (cpu_2-cpu_1)/(wall_2-wall_1), ')'
-
-
-end
-
-
-
- BEGIN_PROVIDER [ double precision, mo_two_e_integral_jj_from_ao, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_jj_exchange_from_ao, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_jj_anti_from_ao, (mo_num,mo_num) ]
-  implicit none
-  BEGIN_DOC
-  ! mo_two_e_integral_jj_from_ao(i,j) = J_ij
-  ! mo_two_e_integrals_jj_exchange_from_ao(i,j) = J_ij
-  ! mo_two_e_integrals_jj_anti_from_ao(i,j) = J_ij - K_ij
-  END_DOC
-
-  integer                        :: i,j,p,q,r,s
-  double precision               :: c
-  real(integral_kind)            :: integral
-  integer                        :: n, pp
-  real(integral_kind), allocatable :: int_value(:)
-  integer, allocatable           :: int_idx(:)
-
-  double precision, allocatable  :: iqrs(:,:), iqsr(:,:), iqis(:), iqri(:)
-
-  if (.not.do_direct_integrals) then
-    PROVIDE ao_two_e_integrals_in_map mo_coef
-  endif
-
-  mo_two_e_integral_jj_from_ao = 0.d0
-  mo_two_e_integrals_jj_exchange_from_ao = 0.d0
-
-  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: iqrs, iqsr
-
-
-  !$OMP PARALLEL DEFAULT(NONE)                                       &
-      !$OMP PRIVATE (i,j,p,q,r,s,integral,c,n,pp,int_value,int_idx,  &
-      !$OMP  iqrs, iqsr,iqri,iqis)                                   &
-      !$OMP SHARED(mo_num,mo_coef_transp,ao_num,                 &
-      !$OMP  ao_integrals_threshold,do_direct_integrals)             &
-      !$OMP REDUCTION(+:mo_two_e_integral_jj_from_ao,mo_two_e_integrals_jj_exchange_from_ao)
-
-  allocate( int_value(ao_num), int_idx(ao_num),                      &
-      iqrs(mo_num,ao_num), iqis(mo_num), iqri(mo_num),   &
-      iqsr(mo_num,ao_num) )
-
-  !$OMP DO SCHEDULE (guided)
-  do s=1,ao_num
-    do q=1,ao_num
-
-      do j=1,ao_num
-        do i=1,mo_num
-          iqrs(i,j) = 0.d0
-          iqsr(i,j) = 0.d0
-        enddo
-      enddo
-
-      if (do_direct_integrals) then
-        double precision               :: ao_two_e_integral
-        do r=1,ao_num
-          call compute_ao_two_e_integrals(q,r,s,ao_num,int_value)
-          do p=1,ao_num
-            integral = int_value(p)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i=1,mo_num
-                iqrs(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-          call compute_ao_two_e_integrals(q,s,r,ao_num,int_value)
-          do p=1,ao_num
-            integral = int_value(p)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i=1,mo_num
-                iqsr(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-        enddo
-
-      else
-
-        do r=1,ao_num
-          call get_ao_two_e_integrals_non_zero(q,r,s,ao_num,int_value,int_idx,n)
-          do pp=1,n
-            p = int_idx(pp)
-            integral = int_value(pp)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i=1,mo_num
-                iqrs(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-          call get_ao_two_e_integrals_non_zero(q,s,r,ao_num,int_value,int_idx,n)
-          do pp=1,n
-            p = int_idx(pp)
-            integral = int_value(pp)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i=1,mo_num
-                iqsr(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-        enddo
-      endif
-      iqis = 0.d0
-      iqri = 0.d0
-      do r=1,ao_num
-        do i=1,mo_num
-          iqis(i) += mo_coef_transp(i,r) * iqrs(i,r)
-          iqri(i) += mo_coef_transp(i,r) * iqsr(i,r)
-        enddo
-      enddo
-      do i=1,mo_num
-        do j=1,mo_num
-          c = mo_coef_transp(j,q)*mo_coef_transp(j,s)
-          mo_two_e_integral_jj_from_ao(j,i) += c * iqis(i)
-          mo_two_e_integrals_jj_exchange_from_ao(j,i) += c * iqri(i)
-        enddo
-      enddo
-
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-  deallocate(iqrs,iqsr,int_value,int_idx)
-  !$OMP END PARALLEL
-
-  mo_two_e_integrals_jj_anti_from_ao = mo_two_e_integral_jj_from_ao - mo_two_e_integrals_jj_exchange_from_ao
-
-
-END_PROVIDER
-
- BEGIN_PROVIDER [ double precision, mo_two_e_integrals_vv_from_ao, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_vv_exchange_from_ao, (mo_num,mo_num) ]
-&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_vv_anti_from_ao, (mo_num,mo_num) ]
-  implicit none
-  BEGIN_DOC
-  ! mo_two_e_integrals_vv_from_ao(i,j) = J_ij
-  ! mo_two_e_integrals_vv_exchange_from_ao(i,j) = J_ij
-  ! mo_two_e_integrals_vv_anti_from_ao(i,j) = J_ij - K_ij
-  ! but only for the virtual orbitals
-  END_DOC
-
-  integer                        :: i,j,p,q,r,s
-  integer                        :: i0,j0
-  double precision               :: c
-  real(integral_kind)            :: integral
-  integer                        :: n, pp
-  real(integral_kind), allocatable :: int_value(:)
-  integer, allocatable           :: int_idx(:)
-
-  double precision, allocatable  :: iqrs(:,:), iqsr(:,:), iqis(:), iqri(:)
-
-  if (.not.do_direct_integrals) then
-    PROVIDE ao_two_e_integrals_in_map mo_coef
-  endif
-
-  mo_two_e_integrals_vv_from_ao = 0.d0
-  mo_two_e_integrals_vv_exchange_from_ao = 0.d0
-
-  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: iqrs, iqsr
-
-
-  !$OMP PARALLEL DEFAULT(NONE)                                            &
-      !$OMP PRIVATE (i0,j0,i,j,p,q,r,s,integral,c,n,pp,int_value,int_idx, &
-      !$OMP  iqrs, iqsr,iqri,iqis)                                        &
-      !$OMP SHARED(n_virt_orb,mo_num,list_virt,mo_coef_transp,ao_num, &
-      !$OMP  ao_integrals_threshold,do_direct_integrals)                  &
-      !$OMP REDUCTION(+:mo_two_e_integrals_vv_from_ao,mo_two_e_integrals_vv_exchange_from_ao)
-
-  allocate( int_value(ao_num), int_idx(ao_num),                      &
-      iqrs(mo_num,ao_num), iqis(mo_num), iqri(mo_num),&
-      iqsr(mo_num,ao_num) )
-
-  !$OMP DO SCHEDULE (guided)
-  do s=1,ao_num
-    do q=1,ao_num
-
-      do j=1,ao_num
-        do i0=1,n_virt_orb
-          i = list_virt(i0)
-          iqrs(i,j) = 0.d0
-          iqsr(i,j) = 0.d0
-        enddo
-      enddo
-
-      if (do_direct_integrals) then
-        double precision               :: ao_two_e_integral
-        do r=1,ao_num
-          call compute_ao_two_e_integrals(q,r,s,ao_num,int_value)
-          do p=1,ao_num
-            integral = int_value(p)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i0=1,n_virt_orb
-                i = list_virt(i0)
-                iqrs(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-          call compute_ao_two_e_integrals(q,s,r,ao_num,int_value)
-          do p=1,ao_num
-            integral = int_value(p)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i0=1,n_virt_orb
-                i =list_virt(i0)
-                iqsr(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-        enddo
-
-      else
-
-        do r=1,ao_num
-          call get_ao_two_e_integrals_non_zero(q,r,s,ao_num,int_value,int_idx,n)
-          do pp=1,n
-            p = int_idx(pp)
-            integral = int_value(pp)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i0=1,n_virt_orb
-                i =list_virt(i0)
-                iqrs(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-          call get_ao_two_e_integrals_non_zero(q,s,r,ao_num,int_value,int_idx,n)
-          do pp=1,n
-            p = int_idx(pp)
-            integral = int_value(pp)
-            if (abs(integral) > ao_integrals_threshold) then
-              do i0=1,n_virt_orb
-                i = list_virt(i0)
-                iqsr(i,r) += mo_coef_transp(i,p) * integral
-              enddo
-            endif
-          enddo
-        enddo
-      endif
-      iqis = 0.d0
-      iqri = 0.d0
-      do r=1,ao_num
-        do i0=1,n_virt_orb
-          i = list_virt(i0)
-          iqis(i) += mo_coef_transp(i,r) * iqrs(i,r)
-          iqri(i) += mo_coef_transp(i,r) * iqsr(i,r)
-        enddo
-      enddo
-      do i0=1,n_virt_orb
-        i= list_virt(i0)
-        do j0=1,n_virt_orb
-          j = list_virt(j0)
-          c = mo_coef_transp(j,q)*mo_coef_transp(j,s)
-          mo_two_e_integrals_vv_from_ao(j,i) += c * iqis(i)
-          mo_two_e_integrals_vv_exchange_from_ao(j,i) += c * iqri(i)
-        enddo
-      enddo
-
-    enddo
-  enddo
-  !$OMP END DO NOWAIT
-  deallocate(iqrs,iqsr,int_value,int_idx)
-  !$OMP END PARALLEL
-
-  mo_two_e_integrals_vv_anti_from_ao = mo_two_e_integrals_vv_from_ao - mo_two_e_integrals_vv_exchange_from_ao
-  ! print*, '**********'
-  ! do i0 =1, n_virt_orb
-  !  i = list_virt(i0)
-  !  print*, mo_two_e_integrals_vv_from_ao(i,i)
-  ! enddo
-  ! print*, '**********'
-
-
-END_PROVIDER
 
 
  BEGIN_PROVIDER [ double precision, mo_two_e_integrals_jj, (mo_num,mo_num) ]

From bd534589e123c939de8ddd3147837b97837581dc Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 10 Jun 2024 17:36:14 +0200
Subject: [PATCH 075/131] Building mo cache from cholesky

---
 src/mo_two_e_ints/map_integrals.irp.f   | 68 ++++++++++++++++---------
 src/mo_two_e_ints/mo_bi_integrals.irp.f |  1 -
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 290fdeab..e99e89fb 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -59,29 +59,50 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:128_8*128_8*128_8*12
  integer(key_kind)              :: idx
  real(integral_kind)            :: integral
  FREE ao_integrals_cache
- !$OMP PARALLEL DO PRIVATE (i,j,k,l,i4,j4,k4,l4,idx,ii,integral)
- do l=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-   l4 = int(l,4)
-   do k=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-     k4 = int(k,4)
-     do j=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-       j4 = int(j,4)
-       do i=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-         i4 = int(i,4)
-         !DIR$ FORCEINLINE
-         call two_e_integrals_index(i4,j4,k4,l4,idx)
-         !DIR$ FORCEINLINE
-         call map_get(mo_integrals_map,idx,integral)
+ if (do_mo_cholesky) then
+
+   call set_multiple_levels_omp(.False.)
+   !$OMP PARALLEL DO PRIVATE (k,l,ii)
+   do l=mo_integrals_cache_min_8,mo_integrals_cache_max_8
+     do k=mo_integrals_cache_min_8,mo_integrals_cache_max_8
          ii = l-mo_integrals_cache_min_8
          ii = ior( shiftl(ii,7), k-mo_integrals_cache_min_8)
-         ii = ior( shiftl(ii,7), j-mo_integrals_cache_min_8)
-         ii = ior( shiftl(ii,7), i-mo_integrals_cache_min_8)
-         mo_integrals_cache(ii) = integral
+         ii = shiftl(ii,14)
+         call dgemm('T','N', mo_integrals_cache_max-mo_integrals_cache_min+1, &
+                             mo_integrals_cache_max-mo_integrals_cache_min+1, &
+           cholesky_mo_num, 1.d0, &
+           cholesky_mo_transp(1,mo_integrals_cache_min,k), cholesky_mo_num, &
+           cholesky_mo_transp(1,mo_integrals_cache_min,l), cholesky_mo_num, 0.d0, &
+           mo_integrals_cache(ii), 128)
+     enddo
+   enddo
+   !$OMP END PARALLEL DO
+
+ else
+   !$OMP PARALLEL DO PRIVATE (i,j,k,l,i4,j4,k4,l4,idx,ii,integral)
+   do l=mo_integrals_cache_min_8,mo_integrals_cache_max_8
+     l4 = int(l,4)
+     do k=mo_integrals_cache_min_8,mo_integrals_cache_max_8
+       k4 = int(k,4)
+       do j=mo_integrals_cache_min_8,mo_integrals_cache_max_8
+         j4 = int(j,4)
+         do i=mo_integrals_cache_min_8,mo_integrals_cache_max_8
+           i4 = int(i,4)
+           !DIR$ FORCEINLINE
+           call two_e_integrals_index(i4,j4,k4,l4,idx)
+           !DIR$ FORCEINLINE
+           call map_get(mo_integrals_map,idx,integral)
+           ii = l-mo_integrals_cache_min_8
+           ii = ior( shiftl(ii,7), k-mo_integrals_cache_min_8)
+           ii = ior( shiftl(ii,7), j-mo_integrals_cache_min_8)
+           ii = ior( shiftl(ii,7), i-mo_integrals_cache_min_8)
+           mo_integrals_cache(ii) = integral
+         enddo
        enddo
      enddo
    enddo
- enddo
- !$OMP END PARALLEL DO
+   !$OMP END PARALLEL DO
+ endif
 
 END_PROVIDER
 
@@ -100,7 +121,7 @@ double precision function get_two_e_integral(i,j,k,l,map)
   real(integral_kind)            :: tmp
   integer                        :: kk
 
-  PROVIDE mo_two_e_integrals_in_map mo_integrals_cache do_mo_cholesky 
+  PROVIDE mo_two_e_integrals_in_map mo_integrals_cache do_mo_cholesky
 
   if (use_banned_excitation) then
     if (banned_excitation(i,k)) then
@@ -119,16 +140,13 @@ double precision function get_two_e_integral(i,j,k,l,map)
   ii = ior(ii, j-mo_integrals_cache_min)
   ii = ior(ii, i-mo_integrals_cache_min)
 
-!  if (iand(ii, -128) /= 0) then
-  if (.True.) then
+  if (iand(ii, -128) /= 0) then
     ! Integral is not in the cache
 
     if  (do_mo_cholesky) then
 
-      get_two_e_integral = 0.d0
-      do kk=1,cholesky_mo_num
-        get_two_e_integral = get_two_e_integral + cholesky_mo_transp(kk,i,k) * cholesky_mo_transp(kk,j,l)
-      enddo
+      double precision, external :: ddot
+      get_two_e_integral = ddot(cholesky_mo_num, cholesky_mo_transp(1,i,k), 1, cholesky_mo_transp(1,j,l), 1)
 
     else
       ! Integrals is in the map
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index d44bb38a..6079c9f7 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -16,7 +16,6 @@
 ! - 1,2,3-index arrays are built from the map
 !
 ! TODO:
-! - build cache map from cholesky vectors
 ! - get_mo_integrals using cholesky
 ! - get_mo_integralss using cholesky
 ! - get_mo_integralss in PT2

From 10fb3a0636d74ef5a3b78dc69bb9cc4d6be63455 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 10 Jun 2024 18:23:45 +0200
Subject: [PATCH 076/131] Introducing dgemm and dgemv to get integrals

---
 src/mo_two_e_ints/map_integrals.irp.f   | 35 ++++++++++++++++++++-----
 src/mo_two_e_ints/mo_bi_integrals.irp.f |  6 +++--
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index e99e89fb..c9fa81c4 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -262,9 +262,25 @@ subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map)
   integer                        :: j
   real(integral_kind), allocatable :: tmp_val(:)
 
-  do j=1,sze
-    call get_mo_two_e_integrals(j,k,l,sze,out_array(1,j),map)
-  enddo
+  if (do_mo_cholesky) then
+    call dgemm('T', 'N', mo_num, mo_num, cholesky_mo_num, 1.d0, &
+       cholesky_mo_transp(1,1,k), cholesky_mo_num, &
+       cholesky_mo_transp(1,1,l), cholesky_mo_num, 0.d0, &
+       out_array, sze)
+! integer :: i
+! do j=1,mo_num
+!  do i=1,mo_num
+!   double precision, external :: get_two_e_integral
+!   print *, i, j, real(out_array(i,j)), real(get_two_e_integral(i,j,k,l,map))
+!  enddo
+! enddo
+! print *, irp_here
+! pause
+  else
+    do j=1,sze
+      call get_mo_two_e_integrals(j,k,l,sze,out_array(1,j),map)
+    enddo
+  endif
 end
 
 subroutine get_mo_two_e_integrals_i1j1(k,l,sze,out_array,map)
@@ -303,9 +319,16 @@ subroutine get_mo_two_e_integrals_coulomb_ii(k,l,sze,out_val,map)
   double precision, external     :: get_two_e_integral
   PROVIDE mo_two_e_integrals_in_map
 
-  do i=1,sze
-    out_val(i) = get_two_e_integral(k,i,l,i,map)
-  enddo
+  if (do_mo_cholesky) then
+    call dgemv('T', cholesky_mo_num, mo_num, 1.d0, &
+       cholesky_mo_transp(1,1,1), cholesky_mo_num*(mo_num+1), &
+       cholesky_mo_transp(1,k,l), 1, 0.d0, &
+       out_val, 1)
+  else
+    do i=1,sze
+      out_val(i) = get_two_e_integral(k,i,l,i,map)
+    enddo
+  endif
 
 end
 
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index 6079c9f7..04e6c3e6 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -65,9 +65,11 @@ BEGIN_PROVIDER [ logical, mo_two_e_integrals_in_map ]
   call cpu_time(cpu_1)
 
   if (do_mo_cholesky) then
-    call add_integrals_to_map_cholesky
+     PROVIDE cholesky_mo_transp
   else
-    if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then
+    if (do_ao_cholesky) then
+      call add_integrals_to_map_cholesky
+    else if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then
       call four_idx_dgemm
     else
       call add_integrals_to_map(full_ijkl_bitmask_4)

From 47b80703397ec92dfec008ef2a641efba9c22f44 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 11 Jun 2024 11:53:11 +0200
Subject: [PATCH 077/131] Cache map in integer*4

---
 src/mo_two_e_ints/map_integrals.irp.f | 207 ++++++++++++++++++--------
 1 file changed, 145 insertions(+), 62 deletions(-)

diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index c9fa81c4..9155cd8f 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -34,28 +34,28 @@ end
 
  BEGIN_PROVIDER [ integer*4, mo_integrals_cache_min ]
 &BEGIN_PROVIDER [ integer*4, mo_integrals_cache_max ]
-&BEGIN_PROVIDER [ integer*8, mo_integrals_cache_min_8 ]
-&BEGIN_PROVIDER [ integer*8, mo_integrals_cache_max_8 ]
+&BEGIN_PROVIDER [ integer*4, mo_integrals_cache_shift]
+&BEGIN_PROVIDER [ integer*4, mo_integrals_cache_size ]
  implicit none
  BEGIN_DOC
  ! Min and max values of the MOs for which the integrals are in the cache
  END_DOC
- mo_integrals_cache_min_8 = max(1_8,elec_alpha_num - 63_8)
- mo_integrals_cache_max_8 = min(int(mo_num,8),mo_integrals_cache_min_8+127_8)
- mo_integrals_cache_min   = max(1,elec_alpha_num - 63)
- mo_integrals_cache_max   = min(mo_num,mo_integrals_cache_min+127)
+ mo_integrals_cache_shift = 7  ! 7 = log(128). Max 7
+ mo_integrals_cache_size  = 2**mo_integrals_cache_shift
+
+ mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
+ mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
 
 END_PROVIDER
 
-BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:128_8*128_8*128_8*128_8) ]
+BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:mo_integrals_cache_size**4) ]
  implicit none
  BEGIN_DOC
  ! Cache of MO integrals for fast access
  END_DOC
  PROVIDE mo_two_e_integrals_in_map
- integer*8                      :: i,j,k,l
- integer*4                      :: i4,j4,k4,l4
- integer*8                      :: ii
+ integer                        :: i,j,k,l
+ integer                        :: ii
  integer(key_kind)              :: idx
  real(integral_kind)            :: integral
  FREE ao_integrals_cache
@@ -63,39 +63,36 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:128_8*128_8*128_8*12
 
    call set_multiple_levels_omp(.False.)
    !$OMP PARALLEL DO PRIVATE (k,l,ii)
-   do l=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-     do k=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-         ii = l-mo_integrals_cache_min_8
-         ii = ior( shiftl(ii,7), k-mo_integrals_cache_min_8)
-         ii = shiftl(ii,14)
+   do l=mo_integrals_cache_min,mo_integrals_cache_max
+     do k=mo_integrals_cache_min,mo_integrals_cache_max
+         ii = l-mo_integrals_cache_min
+         ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+         ii = shiftl(ii,mo_integrals_cache_shift)
+         ii = shiftl(ii,mo_integrals_cache_shift)
          call dgemm('T','N', mo_integrals_cache_max-mo_integrals_cache_min+1, &
                              mo_integrals_cache_max-mo_integrals_cache_min+1, &
            cholesky_mo_num, 1.d0, &
            cholesky_mo_transp(1,mo_integrals_cache_min,k), cholesky_mo_num, &
            cholesky_mo_transp(1,mo_integrals_cache_min,l), cholesky_mo_num, 0.d0, &
-           mo_integrals_cache(ii), 128)
+           mo_integrals_cache(ii), mo_integrals_cache_size)
      enddo
    enddo
    !$OMP END PARALLEL DO
 
  else
-   !$OMP PARALLEL DO PRIVATE (i,j,k,l,i4,j4,k4,l4,idx,ii,integral)
-   do l=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-     l4 = int(l,4)
-     do k=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-       k4 = int(k,4)
-       do j=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-         j4 = int(j,4)
-         do i=mo_integrals_cache_min_8,mo_integrals_cache_max_8
-           i4 = int(i,4)
+   !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx,ii,integral)
+   do l=mo_integrals_cache_min,mo_integrals_cache_max
+     do k=mo_integrals_cache_min,mo_integrals_cache_max
+       do j=mo_integrals_cache_min,mo_integrals_cache_max
+         do i=mo_integrals_cache_min,mo_integrals_cache_max
            !DIR$ FORCEINLINE
-           call two_e_integrals_index(i4,j4,k4,l4,idx)
+           call two_e_integrals_index(i,j,k,l,idx)
            !DIR$ FORCEINLINE
            call map_get(mo_integrals_map,idx,integral)
-           ii = l-mo_integrals_cache_min_8
-           ii = ior( shiftl(ii,7), k-mo_integrals_cache_min_8)
-           ii = ior( shiftl(ii,7), j-mo_integrals_cache_min_8)
-           ii = ior( shiftl(ii,7), i-mo_integrals_cache_min_8)
+           ii = l-mo_integrals_cache_min
+           ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+           ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
+           ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
            mo_integrals_cache(ii) = integral
          enddo
        enddo
@@ -116,7 +113,6 @@ double precision function get_two_e_integral(i,j,k,l,map)
   integer, intent(in)            :: i,j,k,l
   integer(key_kind)              :: idx
   integer                        :: ii
-  integer*8                      :: ii_8
   type(map_type), intent(inout)  :: map
   real(integral_kind)            :: tmp
   integer                        :: kk
@@ -140,7 +136,7 @@ double precision function get_two_e_integral(i,j,k,l,map)
   ii = ior(ii, j-mo_integrals_cache_min)
   ii = ior(ii, i-mo_integrals_cache_min)
 
-  if (iand(ii, -128) /= 0) then
+  if (iand(ii, -mo_integrals_cache_size) /= 0) then
     ! Integral is not in the cache
 
     if  (do_mo_cholesky) then
@@ -161,11 +157,11 @@ double precision function get_two_e_integral(i,j,k,l,map)
   else
     ! Integrals is in the cache
 
-    ii_8 = int(l,8)-mo_integrals_cache_min_8
-    ii_8 = ior( shiftl(ii_8,7), int(k,8)-mo_integrals_cache_min_8)
-    ii_8 = ior( shiftl(ii_8,7), int(j,8)-mo_integrals_cache_min_8)
-    ii_8 = ior( shiftl(ii_8,7), int(i,8)-mo_integrals_cache_min_8)
-    get_two_e_integral = mo_integrals_cache(ii_8)
+    ii = l-mo_integrals_cache_min
+    ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+    ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
+    ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
+    get_two_e_integral = mo_integrals_cache(ii)
 
   endif
 end
@@ -197,19 +193,12 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   integer                        :: i
   double precision, external :: get_two_e_integral
 
-  integer                        :: ii, ii0
-  integer*8                      :: ii_8, ii0_8
+  integer                        :: ii
   real(integral_kind)            :: tmp
   integer(key_kind)              :: i1, idx
   integer(key_kind)              :: p,q,r,s,i2
   PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
 
-!DEBUG
-!  do i=1,sze
-!    out_val(i) = get_two_e_integral(i,j,k,l,map)
-!  enddo
-!  return
-!DEBUG
 
   out_val(1:sze) = 0.d0
   if (banned_excitation(j,l)) then
@@ -220,9 +209,10 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   ii0 = ior(ii0, k-mo_integrals_cache_min)
   ii0 = ior(ii0, j-mo_integrals_cache_min)
 
-  ii0_8 = int(l,8)-mo_integrals_cache_min_8
-  ii0_8 = ior( shiftl(ii0_8,7), int(k,8)-mo_integrals_cache_min_8)
-  ii0_8 = ior( shiftl(ii0_8,7), int(j,8)-mo_integrals_cache_min_8)
+  integer :: ii0, ii0_8, ii_8
+  ii0_8 = l-mo_integrals_cache_min
+  ii0_8 = ior( shiftl(ii0_8,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+  ii0_8 = ior( shiftl(ii0_8,mo_integrals_cache_shift), j-mo_integrals_cache_min)
 
   q = min(j,l)
   s = max(j,l)
@@ -231,8 +221,8 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   do i=1,sze
     if (banned_excitation(i,k)) cycle
     ii = ior(ii0, i-mo_integrals_cache_min)
-    if (iand(ii, -128) == 0) then
-      ii_8 = ior( shiftl(ii0_8,7), int(i,8)-mo_integrals_cache_min_8)
+    if (iand(ii, -mo_integrals_cache_size) == 0) then
+      ii_8 = ior( shiftl(ii0_8,mo_integrals_cache_shift), i-mo_integrals_cache_min)
       out_val(i) = mo_integrals_cache(ii_8)
     else
       p = min(i,k)
@@ -246,6 +236,93 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
       out_val(i) = dble(tmp)
     endif
   enddo
+
+!  if (banned_excitation(j,l)) then
+!      out_val(1:sze) = 0.d0
+!      return
+!  endif
+!
+!  if (mo_integrals_cache_min > 1) then
+!
+!    if (do_mo_cholesky) then
+!
+!      call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, &
+!         cholesky_mo_transp(1,1,k), cholesky_mo_num, &
+!         cholesky_mo_transp(1,j,l), 1, 0.d0, &
+!         out_val, 1)
+!
+!    else
+!
+!      q = min(j,l)
+!      s = max(j,l)
+!      q = q+shiftr(s*s-s,1)
+!
+!      do i=1,mo_integrals_cache_min-1
+!        if (banned_excitation(i,k)) then
+!          out_val(i) = 0.d0
+!          cycle
+!        endif
+!        p = min(i,k)
+!        r = max(i,k)
+!        p = p+shiftr(r*r-r,1)
+!        i1 = min(p,q)
+!        i2 = max(p,q)
+!        idx = i1+shiftr(i2*i2-i2,1)
+!        !DIR$ FORCEINLINE
+!        call map_get(map,idx,tmp)
+!        out_val(i) = dble(tmp)
+!      enddo
+!
+!    endif
+!
+!  endif
+!
+!
+!  ii = l-mo_integrals_cache_min
+!  ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
+!  ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
+!  ii = shiftl(ii, mo_integrals_cache_shift)
+!  do i=mo_integrals_cache_min, mo_integrals_cache_max
+!    ii = ii+1
+!    out_val(i) = mo_integrals_cache(ii)
+!  enddo
+!
+!
+!  if (mo_integrals_cache_max < mo_num) then
+!
+!    if (do_mo_cholesky) then
+!
+!      call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, &
+!         cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, &
+!         cholesky_mo_transp(1,j,l), 1, 0.d0, &
+!         out_val(mo_integrals_cache_max+1), 1)
+!
+!    else
+!
+!      q = min(j,l)
+!      s = max(j,l)
+!      q = q+shiftr(s*s-s,1)
+!
+!      do i=mo_integrals_cache_max+1,mo_num
+!        if (banned_excitation(i,k)) then
+!          out_val(i) = 0.d0
+!          cycle
+!        endif
+!        p = min(i,k)
+!        r = max(i,k)
+!        p = p+shiftr(r*r-r,1)
+!        i1 = min(p,q)
+!        i2 = max(p,q)
+!        idx = i1+shiftr(i2*i2-i2,1)
+!        !DIR$ FORCEINLINE
+!        call map_get(map,idx,tmp)
+!        out_val(i) = dble(tmp)
+!      enddo
+!
+!    endif
+!
+!  endif
+
 end
 
 subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map)
@@ -267,15 +344,6 @@ subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map)
        cholesky_mo_transp(1,1,k), cholesky_mo_num, &
        cholesky_mo_transp(1,1,l), cholesky_mo_num, 0.d0, &
        out_array, sze)
-! integer :: i
-! do j=1,mo_num
-!  do i=1,mo_num
-!   double precision, external :: get_two_e_integral
-!   print *, i, j, real(out_array(i,j)), real(get_two_e_integral(i,j,k,l,map))
-!  enddo
-! enddo
-! print *, irp_here
-! pause
   else
     do j=1,sze
       call get_mo_two_e_integrals(j,k,l,sze,out_array(1,j),map)
@@ -297,9 +365,20 @@ subroutine get_mo_two_e_integrals_i1j1(k,l,sze,out_array,map)
   integer                        :: j
   PROVIDE mo_two_e_integrals_in_map
 
-  do j=1,sze
-    call get_mo_two_e_integrals(k,j,l,sze,out_array(1,j),map)
-  enddo
+  if (do_mo_cholesky) then
+
+    call dgemv('T', cholesky_mo_num, mo_num*mo_num, 1.d0, &
+       cholesky_mo_transp(1,1,1), cholesky_mo_num, &
+       cholesky_mo_transp(1,k,l), 1, 0.d0, &
+       out_array, 1)
+
+  else
+
+    do j=1,sze
+      call get_mo_two_e_integrals(k,j,l,sze,out_array(1,j),map)
+    enddo
+
+  endif
 
 end
 
@@ -320,14 +399,18 @@ subroutine get_mo_two_e_integrals_coulomb_ii(k,l,sze,out_val,map)
   PROVIDE mo_two_e_integrals_in_map
 
   if (do_mo_cholesky) then
+
     call dgemv('T', cholesky_mo_num, mo_num, 1.d0, &
        cholesky_mo_transp(1,1,1), cholesky_mo_num*(mo_num+1), &
        cholesky_mo_transp(1,k,l), 1, 0.d0, &
        out_val, 1)
+
   else
+
     do i=1,sze
       out_val(i) = get_two_e_integral(k,i,l,i,map)
     enddo
+
   endif
 
 end

From a4516fb8f96acabf86e5effa497fd7b0035cc0cb Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 11 Jun 2024 12:12:14 +0200
Subject: [PATCH 078/131] Accelerated cache-map access

---
 src/mo_two_e_ints/map_integrals.irp.f | 188 ++++++++++----------------
 1 file changed, 74 insertions(+), 114 deletions(-)

diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 9155cd8f..fb155073 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -200,128 +200,88 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
 
 
-  out_val(1:sze) = 0.d0
   if (banned_excitation(j,l)) then
-    return
+      out_val(1:sze) = 0.d0
+      return
   endif
 
-  ii0 = l-mo_integrals_cache_min
-  ii0 = ior(ii0, k-mo_integrals_cache_min)
-  ii0 = ior(ii0, j-mo_integrals_cache_min)
+  if (mo_integrals_cache_min > 1) then
 
-  integer :: ii0, ii0_8, ii_8
-  ii0_8 = l-mo_integrals_cache_min
-  ii0_8 = ior( shiftl(ii0_8,mo_integrals_cache_shift), k-mo_integrals_cache_min)
-  ii0_8 = ior( shiftl(ii0_8,mo_integrals_cache_shift), j-mo_integrals_cache_min)
+    if (do_mo_cholesky) then
 
-  q = min(j,l)
-  s = max(j,l)
-  q = q+shiftr(s*s-s,1)
+      call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, &
+         cholesky_mo_transp(1,1,k), cholesky_mo_num, &
+         cholesky_mo_transp(1,j,l), 1, 0.d0, &
+         out_val, 1)
 
-  do i=1,sze
-    if (banned_excitation(i,k)) cycle
-    ii = ior(ii0, i-mo_integrals_cache_min)
-    if (iand(ii, -mo_integrals_cache_size) == 0) then
-      ii_8 = ior( shiftl(ii0_8,mo_integrals_cache_shift), i-mo_integrals_cache_min)
-      out_val(i) = mo_integrals_cache(ii_8)
     else
-      p = min(i,k)
-      r = max(i,k)
-      p = p+shiftr(r*r-r,1)
-      i1 = min(p,q)
-      i2 = max(p,q)
-      idx = i1+shiftr(i2*i2-i2,1)
-      !DIR$ FORCEINLINE
-      call map_get(map,idx,tmp)
-      out_val(i) = dble(tmp)
-    endif
-  enddo
 
-!  if (banned_excitation(j,l)) then
-!      out_val(1:sze) = 0.d0
-!      return
-!  endif
-!
-!  if (mo_integrals_cache_min > 1) then
-!
-!    if (do_mo_cholesky) then
-!
-!      call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, &
-!         cholesky_mo_transp(1,1,k), cholesky_mo_num, &
-!         cholesky_mo_transp(1,j,l), 1, 0.d0, &
-!         out_val, 1)
-!
-!    else
-!
-!      q = min(j,l)
-!      s = max(j,l)
-!      q = q+shiftr(s*s-s,1)
-!
-!      do i=1,mo_integrals_cache_min-1
-!        if (banned_excitation(i,k)) then
-!          out_val(i) = 0.d0
-!          cycle
-!        endif
-!        p = min(i,k)
-!        r = max(i,k)
-!        p = p+shiftr(r*r-r,1)
-!        i1 = min(p,q)
-!        i2 = max(p,q)
-!        idx = i1+shiftr(i2*i2-i2,1)
-!        !DIR$ FORCEINLINE
-!        call map_get(map,idx,tmp)
-!        out_val(i) = dble(tmp)
-!      enddo
-!
-!    endif
-!
-!  endif
-!
-!
-!  ii = l-mo_integrals_cache_min
-!  ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
-!  ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
-!  ii = shiftl(ii, mo_integrals_cache_shift)
-!  do i=mo_integrals_cache_min, mo_integrals_cache_max
-!    ii = ii+1
-!    out_val(i) = mo_integrals_cache(ii)
-!  enddo
-!
-!
-!  if (mo_integrals_cache_max < mo_num) then
-!
-!    if (do_mo_cholesky) then
-!
-!      call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, &
-!         cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, &
-!         cholesky_mo_transp(1,j,l), 1, 0.d0, &
-!         out_val(mo_integrals_cache_max+1), 1)
-!
-!    else
-!
-!      q = min(j,l)
-!      s = max(j,l)
-!      q = q+shiftr(s*s-s,1)
-!
-!      do i=mo_integrals_cache_max+1,mo_num
-!        if (banned_excitation(i,k)) then
-!          out_val(i) = 0.d0
-!          cycle
-!        endif
-!        p = min(i,k)
-!        r = max(i,k)
-!        p = p+shiftr(r*r-r,1)
-!        i1 = min(p,q)
-!        i2 = max(p,q)
-!        idx = i1+shiftr(i2*i2-i2,1)
-!        !DIR$ FORCEINLINE
-!        call map_get(map,idx,tmp)
-!        out_val(i) = dble(tmp)
-!      enddo
-!
-!    endif
-!
-!  endif
+      q = min(j,l)
+      s = max(j,l)
+      q = q+shiftr(s*s-s,1)
+
+      do i=1,mo_integrals_cache_min-1
+        if (banned_excitation(i,k)) then
+          out_val(i) = 0.d0
+          cycle
+        endif
+        p = min(i,k)
+        r = max(i,k)
+        p = p+shiftr(r*r-r,1)
+        i1 = min(p,q)
+        i2 = max(p,q)
+        idx = i1+shiftr(i2*i2-i2,1)
+        !DIR$ FORCEINLINE
+        call map_get(map,idx,tmp)
+        out_val(i) = dble(tmp)
+      enddo
+
+    endif
+
+  endif
+
+
+  ii = l-mo_integrals_cache_min
+  ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
+  ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
+  ii = shiftl(ii, mo_integrals_cache_shift)
+  out_val(mo_integrals_cache_min:mo_integrals_cache_max) = &
+     mo_integrals_cache(ii:ii+mo_integrals_cache_max-mo_integrals_cache_min)
+
+  if (mo_integrals_cache_max < mo_num) then
+
+    if (do_mo_cholesky) then
+
+      call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, &
+         cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, &
+         cholesky_mo_transp(1,j,l), 1, 0.d0, &
+         out_val(mo_integrals_cache_max+1), 1)
+
+    else
+
+      q = min(j,l)
+      s = max(j,l)
+      q = q+shiftr(s*s-s,1)
+
+      do i=mo_integrals_cache_max+1,mo_num
+        if (banned_excitation(i,k)) then
+          out_val(i) = 0.d0
+          cycle
+        endif
+        p = min(i,k)
+        r = max(i,k)
+        p = p+shiftr(r*r-r,1)
+        i1 = min(p,q)
+        i2 = max(p,q)
+        idx = i1+shiftr(i2*i2-i2,1)
+        !DIR$ FORCEINLINE
+        call map_get(map,idx,tmp)
+        out_val(i) = dble(tmp)
+      enddo
+
+    endif
+
+  endif
 
 end
 

From 82654efdf9c9a19ac593b7bec54f16372fb03460 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 11 Jun 2024 13:06:32 +0200
Subject: [PATCH 079/131] Optimized get_integrals

---
 src/mo_two_e_ints/cholesky.irp.f      |   1 +
 src/mo_two_e_ints/map_integrals.irp.f | 164 +++++++++++++++++---------
 2 files changed, 107 insertions(+), 58 deletions(-)

diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 971ab38d..773d240a 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -4,6 +4,7 @@ BEGIN_PROVIDER [ logical, do_mo_cholesky ]
  ! If True, use Cholesky vectors for MO integrals
  END_DOC
  do_mo_cholesky = do_ao_cholesky
+ do_mo_cholesky = .False.
 END_PROVIDER
 
 BEGIN_PROVIDER [ integer, cholesky_mo_num ]
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index fb155073..571de573 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -32,19 +32,27 @@ subroutine insert_into_mo_integrals_map(n_integrals,                 &
   call map_update(mo_integrals_map, buffer_i, buffer_values, n_integrals, thr)
 end
 
- BEGIN_PROVIDER [ integer*4, mo_integrals_cache_min ]
-&BEGIN_PROVIDER [ integer*4, mo_integrals_cache_max ]
-&BEGIN_PROVIDER [ integer*4, mo_integrals_cache_shift]
-&BEGIN_PROVIDER [ integer*4, mo_integrals_cache_size ]
+ BEGIN_PROVIDER [ integer, mo_integrals_cache_min ]
+&BEGIN_PROVIDER [ integer, mo_integrals_cache_max ]
+&BEGIN_PROVIDER [ integer, mo_integrals_cache_shift]
+&BEGIN_PROVIDER [ integer, mo_integrals_cache_size ]
  implicit none
  BEGIN_DOC
  ! Min and max values of the MOs for which the integrals are in the cache
  END_DOC
- mo_integrals_cache_shift = 7  ! 7 = log(128). Max 7
+ if (qp_max_mem < 1) then
+   mo_integrals_cache_shift = 5  ! 5 = log(32).
+ else if (qp_max_mem < 2) then
+   mo_integrals_cache_shift = 6  ! 6 = log(64).
+ else
+   mo_integrals_cache_shift = 7  ! 7 = log(128). Max 7
+ endif
+
  mo_integrals_cache_size  = 2**mo_integrals_cache_shift
 
  mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
  mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
+print *, 'mo_integrals_cache: (', mo_integrals_cache_min, ', ', mo_integrals_cache_max, ')'
 
 END_PROVIDER
 
@@ -136,7 +144,17 @@ double precision function get_two_e_integral(i,j,k,l,map)
   ii = ior(ii, j-mo_integrals_cache_min)
   ii = ior(ii, i-mo_integrals_cache_min)
 
-  if (iand(ii, -mo_integrals_cache_size) /= 0) then
+  if (iand(ii, -mo_integrals_cache_size) == 0) then
+    ! Integrals is in the cache
+
+    ii = l-mo_integrals_cache_min
+    ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+    ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
+    ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
+    get_two_e_integral = mo_integrals_cache(ii)
+
+  else
+
     ! Integral is not in the cache
 
     if  (do_mo_cholesky) then
@@ -145,7 +163,6 @@ double precision function get_two_e_integral(i,j,k,l,map)
       get_two_e_integral = ddot(cholesky_mo_num, cholesky_mo_transp(1,i,k), 1, cholesky_mo_transp(1,j,l), 1)
 
     else
-      ! Integrals is in the map
 
       !DIR$ FORCEINLINE
       call two_e_integrals_index(i,j,k,l,idx)
@@ -154,15 +171,6 @@ double precision function get_two_e_integral(i,j,k,l,map)
       get_two_e_integral = dble(tmp)
     endif
 
-  else
-    ! Integrals is in the cache
-
-    ii = l-mo_integrals_cache_min
-    ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
-    ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
-    ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
-    get_two_e_integral = mo_integrals_cache(ii)
-
   endif
 end
 
@@ -200,62 +208,105 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
 
 
+
   if (banned_excitation(j,l)) then
       out_val(1:sze) = 0.d0
       return
   endif
 
-  if (mo_integrals_cache_min > 1) then
+  ii = l-mo_integrals_cache_min
+  ii = ior(ii, k-mo_integrals_cache_min)
+  ii = ior(ii, j-mo_integrals_cache_min)
 
-    if (do_mo_cholesky) then
+  if (iand(ii, -mo_integrals_cache_size) == 0) then
+    ! Some integrals are in the cache
 
-      call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, &
-         cholesky_mo_transp(1,1,k), cholesky_mo_num, &
-         cholesky_mo_transp(1,j,l), 1, 0.d0, &
-         out_val, 1)
+    if (mo_integrals_cache_min > 1) then
 
-    else
+      if (do_mo_cholesky) then
 
-      q = min(j,l)
-      s = max(j,l)
-      q = q+shiftr(s*s-s,1)
+        call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, &
+           cholesky_mo_transp(1,1,k), cholesky_mo_num, &
+           cholesky_mo_transp(1,j,l), 1, 0.d0, &
+           out_val, 1)
 
-      do i=1,mo_integrals_cache_min-1
-        if (banned_excitation(i,k)) then
-          out_val(i) = 0.d0
-          cycle
-        endif
-        p = min(i,k)
-        r = max(i,k)
-        p = p+shiftr(r*r-r,1)
-        i1 = min(p,q)
-        i2 = max(p,q)
-        idx = i1+shiftr(i2*i2-i2,1)
-        !DIR$ FORCEINLINE
-        call map_get(map,idx,tmp)
-        out_val(i) = dble(tmp)
-      enddo
+      else
+
+        q = min(j,l)
+        s = max(j,l)
+        q = q+shiftr(s*s-s,1)
+
+        do i=1,mo_integrals_cache_min-1
+          if (banned_excitation(i,k)) then
+            out_val(i) = 0.d0
+            cycle
+          endif
+          p = min(i,k)
+          r = max(i,k)
+          p = p+shiftr(r*r-r,1)
+          i1 = min(p,q)
+          i2 = max(p,q)
+          idx = i1+shiftr(i2*i2-i2,1)
+          !DIR$ FORCEINLINE
+          call map_get(map,idx,tmp)
+          out_val(i) = dble(tmp)
+        enddo
+
+      endif
 
     endif
 
-  endif
 
+    ii = l-mo_integrals_cache_min
+    ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
+    ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
+    ii = shiftl(ii, mo_integrals_cache_shift)
+    out_val(mo_integrals_cache_min:mo_integrals_cache_max) = &
+       mo_integrals_cache(ii:ii+mo_integrals_cache_max-mo_integrals_cache_min)
 
-  ii = l-mo_integrals_cache_min
-  ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
-  ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
-  ii = shiftl(ii, mo_integrals_cache_shift)
-  out_val(mo_integrals_cache_min:mo_integrals_cache_max) = &
-     mo_integrals_cache(ii:ii+mo_integrals_cache_max-mo_integrals_cache_min)
+    if (mo_integrals_cache_max < mo_num) then
 
-  if (mo_integrals_cache_max < mo_num) then
+      if (do_mo_cholesky) then
+
+        call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, &
+           cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, &
+           cholesky_mo_transp(1,j,l), 1, 0.d0, &
+           out_val(mo_integrals_cache_max+1), 1)
+
+      else
+
+        q = min(j,l)
+        s = max(j,l)
+        q = q+shiftr(s*s-s,1)
+
+        do i=mo_integrals_cache_max+1,mo_num
+          if (banned_excitation(i,k)) then
+            out_val(i) = 0.d0
+            cycle
+          endif
+          p = min(i,k)
+          r = max(i,k)
+          p = p+shiftr(r*r-r,1)
+          i1 = min(p,q)
+          i2 = max(p,q)
+          idx = i1+shiftr(i2*i2-i2,1)
+          !DIR$ FORCEINLINE
+          call map_get(map,idx,tmp)
+          out_val(i) = dble(tmp)
+        enddo
+
+      endif
+
+    endif
+
+  else
 
     if (do_mo_cholesky) then
 
-      call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, &
-         cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, &
-         cholesky_mo_transp(1,j,l), 1, 0.d0, &
-         out_val(mo_integrals_cache_max+1), 1)
+      call dgemv('T', cholesky_mo_num, mo_num, 1.d0, &
+           cholesky_mo_transp(1,1,k), cholesky_mo_num, &
+           cholesky_mo_transp(1,j,l), 1, 0.d0, &
+           out_val, 1)
 
     else
 
@@ -263,11 +314,8 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
       s = max(j,l)
       q = q+shiftr(s*s-s,1)
 
-      do i=mo_integrals_cache_max+1,mo_num
-        if (banned_excitation(i,k)) then
-          out_val(i) = 0.d0
-          cycle
-        endif
+      do i=1,sze
+        if (banned_excitation(i,k)) cycle
         p = min(i,k)
         r = max(i,k)
         p = p+shiftr(r*r-r,1)

From 90c3db31036b3aeffa3b94445960f4d092c6f929 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 11 Jun 2024 14:38:50 +0200
Subject: [PATCH 080/131] Accelerated cache

---
 src/mo_two_e_ints/cholesky.irp.f        |   2 +-
 src/mo_two_e_ints/four_idx_novvvv.irp.f | 180 ------------------------
 src/mo_two_e_ints/map_integrals.irp.f   | 163 ++++++++++++++++-----
 src/mo_two_e_ints/mo_bi_integrals.irp.f |   4 -
 4 files changed, 128 insertions(+), 221 deletions(-)
 delete mode 100644 src/mo_two_e_ints/four_idx_novvvv.irp.f

diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 773d240a..7e2c8b37 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -4,7 +4,7 @@ BEGIN_PROVIDER [ logical, do_mo_cholesky ]
  ! If True, use Cholesky vectors for MO integrals
  END_DOC
  do_mo_cholesky = do_ao_cholesky
- do_mo_cholesky = .False.
+! do_mo_cholesky = .False.
 END_PROVIDER
 
 BEGIN_PROVIDER [ integer, cholesky_mo_num ]
diff --git a/src/mo_two_e_ints/four_idx_novvvv.irp.f b/src/mo_two_e_ints/four_idx_novvvv.irp.f
deleted file mode 100644
index 80af35dc..00000000
--- a/src/mo_two_e_ints/four_idx_novvvv.irp.f
+++ /dev/null
@@ -1,180 +0,0 @@
-BEGIN_PROVIDER [ double precision, mo_coef_novirt, (ao_num,n_core_inact_act_orb) ]
- implicit none
- BEGIN_DOC
- ! MO coefficients without virtual MOs
- END_DOC
- integer :: j,jj
-
- do j=1,n_core_inact_act_orb
-   jj = list_core_inact_act(j)
-   mo_coef_novirt(:,j) = mo_coef(:,jj)
- enddo
-
-END_PROVIDER
-
-subroutine ao_to_mo_novirt(A_ao,LDA_ao,A_mo,LDA_mo)
-  implicit none
-  BEGIN_DOC
-  ! Transform A from the |AO| basis to the |MO| basis excluding virtuals
-  !
-  ! $C^\dagger.A_{ao}.C$
-  END_DOC
-  integer, intent(in)            :: LDA_ao,LDA_mo
-  double precision, intent(in)   :: A_ao(LDA_ao,ao_num)
-  double precision, intent(out)  :: A_mo(LDA_mo,n_core_inact_act_orb)
-  double precision, allocatable  :: T(:,:)
-
-  allocate ( T(ao_num,n_core_inact_act_orb) )
-  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: T
-
-  call dgemm('N','N', ao_num, n_core_inact_act_orb, ao_num,          &
-      1.d0, A_ao,LDA_ao,                                             &
-      mo_coef_novirt, size(mo_coef_novirt,1),                        &
-      0.d0, T, size(T,1))
-  
-  call dgemm('T','N', n_core_inact_act_orb, n_core_inact_act_orb, ao_num,&
-      1.d0, mo_coef_novirt,size(mo_coef_novirt,1),                   &
-      T, ao_num,                                                     &
-      0.d0, A_mo, size(A_mo,1))
-  
-  deallocate(T)
-end
-
-
-subroutine four_idx_novvvv
-  print*,'********'
-  print*,'********'
-  print*,'********'
-  print*,'WARNING :: Using four_idx_novvvv, and we are not sure that this routine is not bugged ...'
-  print*,'********'
-  print*,'********'
-  print*,'********'
-  use map_module
-  implicit none
-  BEGIN_DOC
-  ! Retransform MO integrals for next CAS-SCF step
-  END_DOC
-  print*,'Using partial transformation'
-  print*,'It will not transform all integrals with at least 3 indices within the virtuals'
-  integer                        :: i,j,k,l,n_integrals
-  double precision, allocatable  :: f(:,:,:), f2(:,:,:), d(:,:), T(:,:,:,:), T2(:,:,:,:)
-  double precision, external     :: get_ao_two_e_integral
-  integer(key_kind), allocatable :: idx(:)
-  real(integral_kind), allocatable :: values(:)
-
-  integer                        :: p,q,r,s
-  double precision               :: c
-  allocate( T(n_core_inact_act_orb,n_core_inact_act_orb,ao_num,ao_num) , &
-            T2(n_core_inact_act_orb,n_core_inact_act_orb,ao_num,ao_num) )
-  
-  !$OMP PARALLEL DEFAULT(NONE)                                       &
-  !$OMP SHARED(mo_num,ao_num,T,n_core_inact_act_orb, mo_coef_transp, &
-  !$OMP   mo_integrals_threshold,mo_coef,mo_integrals_map,           &
-  !$OMP   list_core_inact_act,T2,ao_integrals_map)            &
-  !$OMP PRIVATE(i,j,k,l,p,q,r,s,idx,values,n_integrals,              &
-  !$OMP   f,f2,d,c)
-  allocate(f(ao_num,ao_num,ao_num), f2(ao_num,ao_num,ao_num), d(mo_num,mo_num), &
-           idx(mo_num*mo_num), values(mo_num*mo_num) )
-  
-  ! <aa|vv>
-  !$OMP DO
-  do s=1,ao_num
-    do r=1,ao_num
-      do q=1,ao_num
-        do p=1,r
-          f (p,q,r) = get_ao_two_e_integral(p,q,r,s,ao_integrals_map)
-          f (r,q,p) = f(p,q,r)
-        enddo
-      enddo
-    enddo
-    do r=1,ao_num
-      do q=1,ao_num
-        do p=1,ao_num
-          f2(p,q,r) = f(p,r,q)
-        enddo
-      enddo
-    enddo
-    ! f (p,q,r) = <pq|rs>
-    ! f2(p,q,r) = <pr|qs>
-
-    do r=1,ao_num
-      call ao_to_mo_novirt(f (1,1,r),size(f ,1),T (1,1,r,s),size(T,1))
-      call ao_to_mo_novirt(f2(1,1,r),size(f2,1),T2(1,1,r,s),size(T,1))
-    enddo
-    ! T (i,j,p,q) = <ij|rs>
-    ! T2(i,j,p,q) = <ir|js>
-
-  enddo
-  !$OMP END DO 
-
-  !$OMP DO
-  do j=1,n_core_inact_act_orb
-    do i=1,n_core_inact_act_orb
-      do s=1,ao_num
-        do r=1,ao_num
-          f (r,s,1) = T (i,j,r,s)
-          f2(r,s,1) = T2(i,j,r,s)
-        enddo
-      enddo
-      call ao_to_mo(f ,size(f ,1),d,size(d,1))
-      n_integrals = 0
-      do l=1,mo_num
-        do k=1,mo_num
-          n_integrals+=1
-          call two_e_integrals_index(list_core_inact_act(i),list_core_inact_act(j),k,l,idx(n_integrals))
-          values(n_integrals) = d(k,l)
-        enddo
-      enddo
-      call map_append(mo_integrals_map, idx, values, n_integrals)
-
-      call ao_to_mo(f2,size(f2,1),d,size(d,1))
-      n_integrals = 0
-      do l=1,mo_num
-        do k=1,mo_num
-          n_integrals+=1
-          call two_e_integrals_index(list_core_inact_act(i),k,list_core_inact_act(j),l,idx(n_integrals))
-          values(n_integrals) = d(k,l)
-        enddo
-      enddo
-      call map_append(mo_integrals_map, idx, values, n_integrals)
-    enddo
-  enddo
-  !$OMP END DO
-  deallocate(f,f2,d,idx,values)
-  
-  !$OMP END PARALLEL
-
-  deallocate(T,T2)
-  
-  
-  call map_sort(mo_integrals_map)
-  call map_unique(mo_integrals_map)
-  call map_shrink(mo_integrals_map,real(mo_integrals_threshold,integral_kind))
-  
-end
-
-subroutine four_idx_novvvv2
-  use bitmasks
-  implicit none
-  integer                        :: i
-  integer(bit_kind)              :: mask_ijkl(N_int,4)
-
-    print*, '<ix|ix>'
-    do i = 1,N_int
-      mask_ijkl(i,1) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,2) =  full_ijkl_bitmask_4(i,1)
-      mask_ijkl(i,3) =  core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,4) =  full_ijkl_bitmask_4(i,1)
-    enddo
-    call add_integrals_to_map(mask_ijkl)
-
-    print*, '<ii|vv>'
-    do i = 1,N_int
-      mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,2) = core_inact_act_bitmask_4(i,1)
-      mask_ijkl(i,3) = virt_bitmask(i,1)
-      mask_ijkl(i,4) = virt_bitmask(i,1)
-    enddo
-    call add_integrals_to_map(mask_ijkl)
-
-end
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 571de573..90257bbd 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -48,6 +48,8 @@ end
    mo_integrals_cache_shift = 7  ! 7 = log(128). Max 7
  endif
 
+!mo_integrals_cache_shift = 2  ! 5 = log(32).
+
  mo_integrals_cache_size  = 2**mo_integrals_cache_shift
 
  mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
@@ -112,6 +114,24 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:mo_integrals_cache_siz
 END_PROVIDER
 
 
+double precision function get_two_e_integral_cache(i,j,k,l)
+  use map_module
+  implicit none
+  BEGIN_DOC
+  ! Returns one integral <ij|kl> in the MO basis taken from the cache
+  END_DOC
+  integer, intent(in)            :: i,j,k,l
+  integer                        :: ii
+
+  ii = l-mo_integrals_cache_min
+  ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+  ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
+  ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
+  get_two_e_integral_cache = mo_integrals_cache(ii)
+
+end
+
+
 double precision function get_two_e_integral(i,j,k,l,map)
   use map_module
   implicit none
@@ -123,7 +143,6 @@ double precision function get_two_e_integral(i,j,k,l,map)
   integer                        :: ii
   type(map_type), intent(inout)  :: map
   real(integral_kind)            :: tmp
-  integer                        :: kk
 
   PROVIDE mo_two_e_integrals_in_map mo_integrals_cache do_mo_cholesky
 
@@ -145,13 +164,9 @@ double precision function get_two_e_integral(i,j,k,l,map)
   ii = ior(ii, i-mo_integrals_cache_min)
 
   if (iand(ii, -mo_integrals_cache_size) == 0) then
-    ! Integrals is in the cache
 
-    ii = l-mo_integrals_cache_min
-    ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
-    ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
-    ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
-    get_two_e_integral = mo_integrals_cache(ii)
+    double precision, external :: get_two_e_integral_cache
+    get_two_e_integral = get_two_e_integral_cache(i,j,k,l)
 
   else
 
@@ -199,7 +214,6 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   double precision, intent(out)  :: out_val(sze)
   type(map_type), intent(inout)  :: map
   integer                        :: i
-  double precision, external :: get_two_e_integral
 
   integer                        :: ii
   real(integral_kind)            :: tmp
@@ -256,13 +270,7 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
 
     endif
 
-
-    ii = l-mo_integrals_cache_min
-    ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
-    ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
-    ii = shiftl(ii, mo_integrals_cache_shift)
-    out_val(mo_integrals_cache_min:mo_integrals_cache_max) = &
-       mo_integrals_cache(ii:ii+mo_integrals_cache_max-mo_integrals_cache_min)
+    call get_mo_two_e_integrals_cache(j,k,l,sze,out_val)
 
     if (mo_integrals_cache_max < mo_num) then
 
@@ -333,6 +341,26 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
 
 end
 
+subroutine get_mo_two_e_integrals_cache(j,k,l,sze,out_val)
+  use map_module
+  implicit none
+  BEGIN_DOC
+  ! Returns multiple integrals <ij|kl> in the MO basis, all
+  ! i for j,k,l fixed, all integrals from the cache
+  END_DOC
+  integer, intent(in)            :: j,k,l, sze
+  double precision, intent(out)  :: out_val(sze)
+  integer                        :: ii
+
+  ii = l-mo_integrals_cache_min
+  ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
+  ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
+  ii = shiftl(ii, mo_integrals_cache_shift)
+  out_val(mo_integrals_cache_min:mo_integrals_cache_max) = &
+       mo_integrals_cache(ii:ii+mo_integrals_cache_max-mo_integrals_cache_min)
+
+end
+
 subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map)
   use map_module
   implicit none
@@ -347,16 +375,32 @@ subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map)
   integer                        :: j
   real(integral_kind), allocatable :: tmp_val(:)
 
-  if (do_mo_cholesky) then
-    call dgemm('T', 'N', mo_num, mo_num, cholesky_mo_num, 1.d0, &
-       cholesky_mo_transp(1,1,k), cholesky_mo_num, &
-       cholesky_mo_transp(1,1,l), cholesky_mo_num, 0.d0, &
-       out_array, sze)
+  if ( (mo_integrals_cache_min>1).or.(mo_integrals_cache_max<mo_num) ) then
+
+    if (do_mo_cholesky) then
+
+      call dgemm('T', 'N', mo_num, mo_num, cholesky_mo_num, 1.d0, &
+         cholesky_mo_transp(1,1,k), cholesky_mo_num, &
+         cholesky_mo_transp(1,1,l), cholesky_mo_num, 0.d0, &
+         out_array, sze)
+
+    else
+
+      do j=1,sze
+        call get_mo_two_e_integrals(j,k,l,sze,out_array(1,j),map)
+      enddo
+
+    endif
+
   else
+
+    double precision, external :: get_two_e_integral_cache
     do j=1,sze
-      call get_mo_two_e_integrals(j,k,l,sze,out_array(1,j),map)
+      call get_mo_two_e_integrals_cache(j,k,l,sze,out_array(1,j))
     enddo
+
   endif
+
 end
 
 subroutine get_mo_two_e_integrals_i1j1(k,l,sze,out_array,map)
@@ -373,17 +417,28 @@ subroutine get_mo_two_e_integrals_i1j1(k,l,sze,out_array,map)
   integer                        :: j
   PROVIDE mo_two_e_integrals_in_map
 
-  if (do_mo_cholesky) then
+  if ( (mo_integrals_cache_min>1).or.(mo_integrals_cache_max<mo_num) ) then
 
-    call dgemv('T', cholesky_mo_num, mo_num*mo_num, 1.d0, &
-       cholesky_mo_transp(1,1,1), cholesky_mo_num, &
-       cholesky_mo_transp(1,k,l), 1, 0.d0, &
-       out_array, 1)
+    if (do_mo_cholesky) then
+
+      call dgemv('T', cholesky_mo_num, mo_num*mo_num, 1.d0, &
+         cholesky_mo_transp(1,1,1), cholesky_mo_num, &
+         cholesky_mo_transp(1,k,l), 1, 0.d0, &
+         out_array, 1)
+
+    else
+
+      do j=1,sze
+        call get_mo_two_e_integrals(k,j,l,sze,out_array(1,j),map)
+      enddo
+
+    endif
 
   else
 
+    double precision, external :: get_two_e_integral_cache
     do j=1,sze
-      call get_mo_two_e_integrals(k,j,l,sze,out_array(1,j),map)
+      call get_mo_two_e_integrals_cache(k,j,l,sze,out_array(1,j))
     enddo
 
   endif
@@ -406,21 +461,33 @@ subroutine get_mo_two_e_integrals_coulomb_ii(k,l,sze,out_val,map)
   double precision, external     :: get_two_e_integral
   PROVIDE mo_two_e_integrals_in_map
 
-  if (do_mo_cholesky) then
+  if ( (mo_integrals_cache_min>1).or.(mo_integrals_cache_max<mo_num) ) then
 
-    call dgemv('T', cholesky_mo_num, mo_num, 1.d0, &
-       cholesky_mo_transp(1,1,1), cholesky_mo_num*(mo_num+1), &
-       cholesky_mo_transp(1,k,l), 1, 0.d0, &
-       out_val, 1)
+    if (do_mo_cholesky) then
+
+      call dgemv('T', cholesky_mo_num, mo_num, 1.d0, &
+         cholesky_mo_transp(1,1,1), cholesky_mo_num*(mo_num+1), &
+         cholesky_mo_transp(1,k,l), 1, 0.d0, &
+         out_val, 1)
+
+    else
+
+      do i=1,sze
+        out_val(i) = get_two_e_integral(i,k,i,l,map)
+      enddo
+
+    endif
 
   else
 
+    double precision, external :: get_two_e_integral_cache
     do i=1,sze
-      out_val(i) = get_two_e_integral(k,i,l,i,map)
+      out_val(i) = get_two_e_integral_cache(i,k,i,l)
     enddo
 
   endif
 
+
 end
 
 subroutine get_mo_two_e_integrals_exch_ii(k,l,sze,out_val,map)
@@ -438,9 +505,33 @@ subroutine get_mo_two_e_integrals_exch_ii(k,l,sze,out_val,map)
   double precision, external     :: get_two_e_integral
   PROVIDE mo_two_e_integrals_in_map
 
-  do i=1,sze
-    out_val(i) = get_two_e_integral(k,i,i,l,map)
-  enddo
+  if ( (mo_integrals_cache_min>1).or.(mo_integrals_cache_max<mo_num) ) then
+
+    if (do_mo_cholesky) then
+
+      double precision, external :: ddot
+      do i=1,sze
+        out_val(i) = ddot(cholesky_mo_num, cholesky_mo_transp(1,i,k), 1, &
+                                           cholesky_mo_transp(1,i,l), 1)
+      enddo
+
+    else
+
+      do i=1,sze
+        out_val(i) = get_two_e_integral(i,i,k,l,map)
+      enddo
+
+    endif
+
+  else
+
+    double precision, external :: get_two_e_integral_cache
+    do i=1,sze
+      out_val(i) = get_two_e_integral_cache(i,i,k,l)
+    enddo
+
+  endif
+
 
 end
 
diff --git a/src/mo_two_e_ints/mo_bi_integrals.irp.f b/src/mo_two_e_ints/mo_bi_integrals.irp.f
index 04e6c3e6..549bbed2 100644
--- a/src/mo_two_e_ints/mo_bi_integrals.irp.f
+++ b/src/mo_two_e_ints/mo_bi_integrals.irp.f
@@ -15,10 +15,6 @@
 ! - All integrals are stored in the map or cache map
 ! - 1,2,3-index arrays are built from the map
 !
-! TODO:
-! - get_mo_integrals using cholesky
-! - get_mo_integralss using cholesky
-! - get_mo_integralss in PT2
 
 subroutine mo_two_e_integrals_index(i,j,k,l,i1)
   use map_module

From 7ecc086cac4f9104e6df05dd46d1df4010bb208d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 12 Jun 2024 14:59:26 +0200
Subject: [PATCH 081/131] Introduce hij_cache in PT2

---
 src/cipsi/selection.irp.f             | 72 ++++++++++++++-------------
 src/mo_two_e_ints/map_integrals.irp.f | 16 +++++-
 2 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 0281a1d4..88a2cbdc 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -178,7 +178,7 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc
   integer(bit_kind), allocatable        :: minilist(:, :, :), fullminilist(:, :, :)
   logical, allocatable                  :: banned(:,:,:), bannedOrb(:,:)
   double precision, allocatable         :: coef_fullminilist_rev(:,:)
-  double precision, allocatable         :: mat(:,:,:)
+  double precision, allocatable         :: mat(:,:,:), hij_cache(:,:,:)
 
 
   PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
@@ -205,7 +205,7 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc
   ! Removed to avoid introducing determinants already presents in the wf
   !double precision, parameter :: norm_thr = 1.d-16
 
-  allocate (indices(N_det),                                          &
+  allocate (indices(N_det), hij_cache(mo_num,mo_num,2),          &
       exc_degree(max(N_det_alpha_unique,N_det_beta_unique)))
 
   ! Pre-compute excitation degrees wrt alpha determinants
@@ -511,11 +511,15 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc
 
           maskInd = maskInd + 1
           if(mod(maskInd, csubset) == (subset-1)) then
+            call get_mo_two_e_integrals_ij(h2,h1,mo_num,hij_cache(1,1,1),mo_integrals_map)
+            if (sp /= 3) then  ! AA or BB
+              call get_mo_two_e_integrals_ij(h1,h2,mo_num,hij_cache(1,1,2),mo_integrals_map)
+            endif
 
             call spot_isinwf(mask, fullminilist, i_generator, fullinteresting(0), banned, fullMatch, fullinteresting)
             if(fullMatch) cycle
 
-            call splash_pq(mask, sp, minilist, i_generator, interesting(0), bannedOrb, banned, mat, interesting)
+            call splash_pq(mask, sp, minilist, i_generator, interesting(0), bannedOrb, banned, mat, interesting, hij_cache)
 
             call fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_diag_tmp, E0, pt2_data, mat, buf)
           end if
@@ -531,7 +535,7 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc
     enddo
   enddo
   deallocate(preinteresting, prefullinteresting, interesting, fullinteresting)
-  deallocate(banned, bannedOrb,mat)
+  deallocate(banned, bannedOrb, mat, hij_cache)
 end subroutine
 
 BEGIN_TEMPLATE
@@ -914,7 +918,7 @@ single ; do p1=1,mo_num ; enddo ; p2=1            ;       ; .False. ;;
 
 END_TEMPLATE
 
-subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, interesting)
+subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, interesting, hij_cache)
   use bitmasks
   implicit none
   BEGIN_DOC
@@ -926,6 +930,7 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
   integer, intent(in)            :: sp, i_gen, N_sel
   integer, intent(in)            :: interesting(0:N_sel)
   integer(bit_kind),intent(in)   :: mask(N_int, 2), det(N_int, 2, N_sel)
+  double precision, intent(in)   :: hij_cache(mo_num, mo_num, 2)
   logical, intent(inout)         :: bannedOrb(mo_num, 2), banned(mo_num, mo_num, 2)
   double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
 
@@ -995,9 +1000,9 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
         if(nt == 4) then
           call get_d2(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)))
         else if(nt == 3) then
-          call get_d1(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)))
+          call get_d1(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i))) !, hij_cache)
         else
-          call get_d0(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)))
+          call get_d0(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)), hij_cache)
         end if
     else if(nt == 4) then
         call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int)
@@ -1190,6 +1195,8 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
   double precision, intent(in)   :: coefs(N_states)
   double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
   integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
+!  double precision, intent(in)   :: hij_cache(mo_num, mo_num, 2)
+
   double precision, external     :: get_phase_bi, mo_two_e_integral
   logical                        :: ok
 
@@ -1201,12 +1208,12 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
   integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
 
   integer                        :: bant
-  double precision, allocatable :: hij_cache(:,:)
+  double precision, allocatable :: hij_cache1(:,:)
   double precision               :: hij, tmp_row(N_states, mo_num), tmp_row2(N_states, mo_num)
   PROVIDE mo_integrals_map N_int
 
   allocate (lbanned(mo_num, 2))
-  allocate (hij_cache(mo_num,2))
+  allocate (hij_cache1(mo_num,2))
   lbanned = bannedOrb
 
   do i=1, p(0,1)
@@ -1230,13 +1237,13 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     p1 = p(1,ma)
     p2 = p(2,ma)
     if(.not. bannedOrb(puti, mi)) then
-      call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map)
-      call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map)
+      call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache1(1,1),mo_integrals_map)
+      call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache1(1,2),mo_integrals_map)
       tmp_row = 0d0
       do putj=1, hfix-1
         if(lbanned(putj, ma)) cycle
         if(banned(putj, puti,bant)) cycle
-        hij = hij_cache(putj,1) - hij_cache(putj,2)
+        hij = hij_cache1(putj,1) - hij_cache1(putj,2)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
           !DIR$ LOOP COUNT AVG(4)
@@ -1248,7 +1255,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       do putj=hfix+1, mo_num
         if(lbanned(putj, ma)) cycle
         if(banned(putj, puti,bant)) cycle
-        hij = hij_cache(putj,2) - hij_cache(putj,1)
+        hij = hij_cache1(putj,2) - hij_cache1(putj,1)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
           !DIR$ LOOP COUNT AVG(4)
@@ -1274,15 +1281,15 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     pfix = p(1,mi)
     tmp_row = 0d0
     tmp_row2 = 0d0
-    call get_mo_two_e_integrals(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map)
-    call get_mo_two_e_integrals(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map)
+    call get_mo_two_e_integrals(hfix,pfix,p1,mo_num,hij_cache1(1,1),mo_integrals_map)
+    call get_mo_two_e_integrals(hfix,pfix,p2,mo_num,hij_cache1(1,2),mo_integrals_map)
     putj = p1
     do puti=1,mo_num !HOT
       if(lbanned(puti,mi)) cycle
       !p1 fixed
       putj = p1
       if(.not. banned(putj,puti,bant)) then
-        hij = hij_cache(puti,2)
+        hij = hij_cache1(puti,2)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
           !DIR$ LOOP COUNT AVG(4)
@@ -1296,7 +1303,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       putj = p2
 !    do puti=1,mo_num !HOT
       if(.not. banned(putj,puti,bant)) then
-        hij = hij_cache(puti,1)
+        hij = hij_cache1(puti,1)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
           do k=1,N_states
@@ -1327,13 +1334,13 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         puti = p(i, ma)
         p1 = p(turn3(1,i), ma)
         p2 = p(turn3(2,i), ma)
-        call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map)
-        call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map)
+        call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache1(1,1),mo_integrals_map)
+        call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache1(1,2),mo_integrals_map)
         tmp_row = 0d0
         do putj=1,hfix-1
           if(banned(putj,puti,1)) cycle
           if(lbanned(putj,ma)) cycle
-          hij = hij_cache(putj,1) - hij_cache(putj,2)
+          hij = hij_cache1(putj,1) - hij_cache1(putj,2)
           if (hij /= 0.d0) then
             hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
             tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:)
@@ -1342,7 +1349,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         do putj=hfix+1,mo_num
           if(banned(putj,puti,1)) cycle
           if(lbanned(putj,ma)) cycle
-          hij = hij_cache(putj,2) - hij_cache(putj,1)
+          hij = hij_cache1(putj,2) - hij_cache1(putj,1)
           if (hij /= 0.d0) then
             hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
             tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:)
@@ -1364,14 +1371,14 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       p2 = p(2,ma)
       tmp_row = 0d0
       tmp_row2 = 0d0
-      call get_mo_two_e_integrals(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map)
-      call get_mo_two_e_integrals(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map)
+      call get_mo_two_e_integrals(hfix,p1,pfix,mo_num,hij_cache1(1,1),mo_integrals_map)
+      call get_mo_two_e_integrals(hfix,p2,pfix,mo_num,hij_cache1(1,2),mo_integrals_map)
       putj = p2
       do puti=1,mo_num
         if(lbanned(puti,ma)) cycle
         putj = p2
         if(.not. banned(puti,putj,1)) then
-          hij = hij_cache(puti,1)
+          hij = hij_cache1(puti,1)
           if (hij /= 0.d0) then
             hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
             !DIR$ LOOP COUNT AVG(4)
@@ -1383,7 +1390,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
 
         putj = p1
         if(.not. banned(puti,putj,1)) then
-          hij = hij_cache(puti,2)
+          hij = hij_cache1(puti,2)
           if (hij /= 0.d0) then
             hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
             do k=1,N_states
@@ -1408,7 +1415,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       enddo
     end if
   end if
-  deallocate(lbanned,hij_cache)
+  deallocate(lbanned,hij_cache1)
 
  !! MONO
     if(sp == 3) then
@@ -1439,7 +1446,7 @@ end
 
 
 
-subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
+subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs, hij_cache)
   use bitmasks
   implicit none
 
@@ -1450,6 +1457,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
   double precision, intent(in) :: coefs(N_states)
   double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
   integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+  double precision, intent(in)   :: hij_cache(mo_num, mo_num, 2)
 
   integer :: i, j, k, s, h1, h2, p1, p2, puti, putj
   double precision :: hij, phase
@@ -1457,8 +1465,6 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
   logical :: ok
 
   integer, parameter :: bant=1
-  double precision, allocatable :: hij_cache1(:), hij_cache2(:)
-  allocate (hij_cache1(mo_num),hij_cache2(mo_num))
 
 
   if(sp == 3) then ! AB
@@ -1466,7 +1472,6 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     h2 = p(1,2)
     do p1=1, mo_num
       if(bannedOrb(p1, 1)) cycle
-      call get_mo_two_e_integrals(p1,h2,h1,mo_num,hij_cache1,mo_integrals_map)
       do p2=1, mo_num
         if(bannedOrb(p2,2)) cycle
         if(banned(p1, p2, bant)) cycle ! rentable?
@@ -1475,7 +1480,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           call i_h_j(gen, det, N_int, hij)
         else
           phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
-          hij = hij_cache1(p2) * phase
+          hij = hij_cache(p2,p1,1) * phase
         end if
         if (hij == 0.d0) cycle
         !DIR$ LOOP COUNT AVG(4)
@@ -1490,8 +1495,6 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     p2 = p(2,sp)
     do puti=1, mo_num
       if (bannedOrb(puti, sp)) cycle
-      call get_mo_two_e_integrals(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map)
-      call get_mo_two_e_integrals(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map)
       do putj=puti+1, mo_num
         if(bannedOrb(putj, sp)) cycle
         if(banned(puti, putj, bant)) cycle ! rentable?
@@ -1500,7 +1503,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           call i_h_j(gen, det, N_int, hij)
           if (hij == 0.d0) cycle
         else
-          hij = hij_cache1(putj) - hij_cache2(putj)
+          hij = hij_cache(putj,puti,1) - hij_cache(putj,puti,2)
           if (hij == 0.d0) cycle
           hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
         end if
@@ -1512,7 +1515,6 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     end do
   end if
 
-  deallocate(hij_cache1,hij_cache2)
 end
 
 
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 90257bbd..516851b9 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -48,7 +48,7 @@ end
    mo_integrals_cache_shift = 7  ! 7 = log(128). Max 7
  endif
 
-!mo_integrals_cache_shift = 2  ! 5 = log(32).
+mo_integrals_cache_shift = 2  ! 5 = log(32).
 
  mo_integrals_cache_size  = 2**mo_integrals_cache_shift
 
@@ -176,6 +176,8 @@ double precision function get_two_e_integral(i,j,k,l,map)
 
       double precision, external :: ddot
       get_two_e_integral = ddot(cholesky_mo_num, cholesky_mo_transp(1,i,k), 1, cholesky_mo_transp(1,j,l), 1)
+!       double precision, external :: get_from_mo_cholesky_cache
+!       get_two_e_integral = get_from_mo_cholesky_cache(i,j,k,l,.False.)
 
     else
 
@@ -227,6 +229,11 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
       out_val(1:sze) = 0.d0
       return
   endif
+!
+!  if (do_mo_cholesky) then
+!    call get_from_mo_cholesky_caches(j,k,l,out_val)
+!    return
+!  endif
 
   ii = l-mo_integrals_cache_min
   ii = ior(ii, k-mo_integrals_cache_min)
@@ -239,6 +246,7 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
 
       if (do_mo_cholesky) then
 
+        !TODO: here
         call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, &
            cholesky_mo_transp(1,1,k), cholesky_mo_num, &
            cholesky_mo_transp(1,j,l), 1, 0.d0, &
@@ -276,6 +284,7 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
 
       if (do_mo_cholesky) then
 
+        !TODO: here
         call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, &
            cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, &
            cholesky_mo_transp(1,j,l), 1, 0.d0, &
@@ -311,6 +320,7 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
 
     if (do_mo_cholesky) then
 
+      !TODO: here
       call dgemv('T', cholesky_mo_num, mo_num, 1.d0, &
            cholesky_mo_transp(1,1,k), cholesky_mo_num, &
            cholesky_mo_transp(1,j,l), 1, 0.d0, &
@@ -425,6 +435,10 @@ subroutine get_mo_two_e_integrals_i1j1(k,l,sze,out_array,map)
          cholesky_mo_transp(1,1,1), cholesky_mo_num, &
          cholesky_mo_transp(1,k,l), 1, 0.d0, &
          out_array, 1)
+!
+!      do j=1,sze
+!        call get_from_mo_cholesky_caches(k,j,l,out_array(1,j))
+!      enddo
 
     else
 

From acc0b97fbad1589c5453ec9e077668483b419759 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 13 Jun 2024 13:29:38 +0200
Subject: [PATCH 082/131] Inline past_d1 and past_d2

---
 src/cipsi/selection.irp.f             | 94 ++++++++++-----------------
 src/mo_two_e_ints/map_integrals.irp.f | 29 ++++-----
 2 files changed, 50 insertions(+), 73 deletions(-)

diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f
index 88a2cbdc..517220a8 100644
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@@ -560,7 +560,7 @@ subroutine fill_buffer_$DOUBLE(i_generator, sp, h1, h2, bannedOrb, banned, fock_
   double precision, external :: diag_H_mat_elem_fock
   double precision :: E_shift
   double precision :: s_weight(N_states,N_states)
-  PROVIDE dominant_dets_of_cfgs N_dominant_dets_of_cfgs
+  PROVIDE dominant_dets_of_cfgs N_dominant_dets_of_cfgs thresh_sym excitation_ref hf_bitmask elec_alpha_num
   do jstate=1,N_states
     do istate=1,N_states
       s_weight(istate,jstate) = dsqrt(selection_weight(istate)*selection_weight(jstate))
@@ -746,7 +746,7 @@ subroutine fill_buffer_$DOUBLE(i_generator, sp, h1, h2, bannedOrb, banned, fock_
       do istate=1,N_states
         delta_E = E0(istate) - Hii + E_shift
         alpha_h_psi = mat(istate, p1, p2)
-        if (alpha_h_psi == 0.d0) cycle
+        if (dabs(alpha_h_psi) < mo_integrals_threshold) cycle
 
         val = alpha_h_psi + alpha_h_psi
         tmp = dsqrt(delta_E * delta_E + val * val)
@@ -1000,18 +1000,36 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
         if(nt == 4) then
           call get_d2(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)))
         else if(nt == 3) then
-          call get_d1(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i))) !, hij_cache)
+          call get_d1(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)), hij_cache)
         else
           call get_d0(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)), hij_cache)
         end if
     else if(nt == 4) then
         call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int)
         call bitstring_to_list_in_selection(mobMask(1,2), p(1,2), p(0,2), N_int)
-        call past_d2(banned, p, sp)
+        if(sp == 3) then
+          do j=1,p(0,2)
+            do ii=1,p(0,1)
+              banned(p(ii,1), p(j,2),1) = .true.
+            end do
+          end do
+        else
+          do ii=1,p(0, sp)
+            do j=1,ii-1
+              banned(p(j,sp), p(ii,sp),1) = .true.
+              banned(p(ii,sp), p(j,sp),1) = .true.
+            end do
+          end do
+        end if
     else if(nt == 3) then
         call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int)
         call bitstring_to_list_in_selection(mobMask(1,2), p(1,2), p(0,2), N_int)
-        call past_d1(bannedOrb, p)
+        do ii = 1, p(0, 1)
+          bannedOrb(p(ii, 1), 1) = .true.
+        end do
+        do ii = 1, p(0, 2)
+          bannedOrb(p(ii, 2), 2) = .true.
+        end do
     end if
   end do
 
@@ -1042,6 +1060,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
   integer :: bant
   bant = 1
 
+  PROVIDE mo_integrals_threshold
   tip = p(0,1) * p(0,2)
 
   ma = sp
@@ -1067,7 +1086,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         p2 = p(i2, ma)
 
         hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2, p1, h1, h2)
-        if (hij == 0.d0) cycle
+        if (dabs(hij) < mo_integrals_threshold) cycle
 
         hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
 
@@ -1097,7 +1116,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           p1 = p(turn2(i), 1)
 
           hij = mo_two_e_integral(p1, p2, h1, h2)
-          if (hij /= 0.d0) then
+          if (dabs(hij) > mo_integrals_threshold) then
             hij = hij * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
             !DIR$ LOOP COUNT AVG(4)
             do k=1,N_states
@@ -1125,7 +1144,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
           p1 = p(i1, ma)
           p2 = p(i2, ma)
           hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2)
-          if (hij == 0.d0) cycle
+          if (dabs(hij) < mo_integrals_threshold) cycle
 
           hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
           !DIR$ LOOP COUNT AVG(4)
@@ -1147,7 +1166,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
         p2 = p(i, ma)
 
         hij = mo_two_e_integral(p1, p2, h1, h2)
-        if (hij == 0.d0) cycle
+        if (dabs(hij) < mo_integrals_threshold) cycle
 
         hij = hij * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
         if (puti < putj) then
@@ -1184,7 +1203,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
 end
 
 
-subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
+subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs, hij_cache)
   use bitmasks
   implicit none
 
@@ -1195,7 +1214,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
   double precision, intent(in)   :: coefs(N_states)
   double precision, intent(inout) :: mat(N_states, mo_num, mo_num)
   integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
-!  double precision, intent(in)   :: hij_cache(mo_num, mo_num, 2)
+  double precision, intent(in)   :: hij_cache(mo_num, mo_num, 2)
 
   double precision, external     :: get_phase_bi, mo_two_e_integral
   logical                        :: ok
@@ -1237,13 +1256,11 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
     p1 = p(1,ma)
     p2 = p(2,ma)
     if(.not. bannedOrb(puti, mi)) then
-      call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache1(1,1),mo_integrals_map)
-      call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache1(1,2),mo_integrals_map)
       tmp_row = 0d0
       do putj=1, hfix-1
         if(lbanned(putj, ma)) cycle
         if(banned(putj, puti,bant)) cycle
-        hij = hij_cache1(putj,1) - hij_cache1(putj,2)
+        hij = hij_cache(hfix,putj,1) - hij_cache(putj,hfix,1)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
           !DIR$ LOOP COUNT AVG(4)
@@ -1255,7 +1272,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs)
       do putj=hfix+1, mo_num
         if(lbanned(putj, ma)) cycle
         if(banned(putj, puti,bant)) cycle
-        hij = hij_cache1(putj,2) - hij_cache1(putj,1)
+        hij = hij_cache(putj,hfix,1) - hij_cache(hfix,putj,1)
         if (hij /= 0.d0) then
           hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
           !DIR$ LOOP COUNT AVG(4)
@@ -1466,6 +1483,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs,
 
   integer, parameter :: bant=1
 
+  PROVIDE mo_integrals_threshold
 
   if(sp == 3) then ! AB
     h1 = p(1,1)
@@ -1482,7 +1500,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs,
           phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
           hij = hij_cache(p2,p1,1) * phase
         end if
-        if (hij == 0.d0) cycle
+        if (dabs(hij) < mo_integrals_threshold) cycle
         !DIR$ LOOP COUNT AVG(4)
         do k=1,N_states
           mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij  ! HOTSPOT
@@ -1501,10 +1519,10 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs,
         if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
           call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
           call i_h_j(gen, det, N_int, hij)
-          if (hij == 0.d0) cycle
+          if (dabs(hij) < mo_integrals_threshold) cycle
         else
           hij = hij_cache(putj,puti,1) - hij_cache(putj,puti,2)
-          if (hij == 0.d0) cycle
+          if (dabs(hij) < mo_integrals_threshold) cycle
           hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
         end if
         !DIR$ LOOP COUNT AVG(4)
@@ -1518,46 +1536,6 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs,
 end
 
 
-subroutine past_d1(bannedOrb, p)
-  use bitmasks
-  implicit none
-
-  logical, intent(inout) :: bannedOrb(mo_num, 2)
-  integer, intent(in) :: p(0:4, 2)
-  integer :: i,s
-
-  do s = 1, 2
-    do i = 1, p(0, s)
-      bannedOrb(p(i, s), s) = .true.
-    end do
-  end do
-end
-
-
-subroutine past_d2(banned, p, sp)
-  use bitmasks
-  implicit none
-
-  logical, intent(inout) :: banned(mo_num, mo_num)
-  integer, intent(in) :: p(0:4, 2), sp
-  integer :: i,j
-
-  if(sp == 3) then
-    do j=1,p(0,2)
-      do i=1,p(0,1)
-        banned(p(i,1), p(j,2)) = .true.
-      end do
-    end do
-  else
-    do i=1,p(0, sp)
-      do j=1,i-1
-        banned(p(j,sp), p(i,sp)) = .true.
-        banned(p(i,sp), p(j,sp)) = .true.
-      end do
-    end do
-  end if
-end
-
 subroutine spot_isinwf(mask, det, i_gen, N, banned, fullMatch, interesting)
   use bitmasks
   implicit none
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 516851b9..13fcc19a 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -192,19 +192,6 @@ double precision function get_two_e_integral(i,j,k,l,map)
 end
 
 
-double precision function mo_two_e_integral(i,j,k,l)
-  implicit none
-  BEGIN_DOC
-  ! Returns one integral <ij|kl> in the MO basis
-  END_DOC
-  integer, intent(in)            :: i,j,k,l
-  double precision               :: get_two_e_integral
-  PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
-  !DIR$ FORCEINLINE
-  mo_two_e_integral = get_two_e_integral(i,j,k,l,mo_integrals_map)
-  return
-end
-
 subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   use map_module
   implicit none
@@ -223,8 +210,6 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
   integer(key_kind)              :: p,q,r,s,i2
   PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
 
-
-
   if (banned_excitation(j,l)) then
       out_val(1:sze) = 0.d0
       return
@@ -351,6 +336,20 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
 
 end
 
+double precision function mo_two_e_integral(i,j,k,l)
+  implicit none
+  BEGIN_DOC
+  ! Returns one integral <ij|kl> in the MO basis
+  END_DOC
+  integer, intent(in)            :: i,j,k,l
+  double precision               :: get_two_e_integral
+  PROVIDE mo_two_e_integrals_in_map mo_integrals_cache
+  !DIR$ FORCEINLINE
+  mo_two_e_integral = get_two_e_integral(i,j,k,l,mo_integrals_map)
+  return
+end
+
+
 subroutine get_mo_two_e_integrals_cache(j,k,l,sze,out_val)
   use map_module
   implicit none

From 70317b2a158d69404ac9047bde280f29fa8ec82f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 13 Jun 2024 14:54:32 +0200
Subject: [PATCH 083/131] Put mo_integrals_cache_shift in EZFIO

---
 src/mo_two_e_ints/EZFIO.cfg           |  6 +++
 src/mo_two_e_ints/map_integrals.irp.f | 59 +++++++++------------------
 2 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/src/mo_two_e_ints/EZFIO.cfg b/src/mo_two_e_ints/EZFIO.cfg
index c967969f..da9d8fc9 100644
--- a/src/mo_two_e_ints/EZFIO.cfg
+++ b/src/mo_two_e_ints/EZFIO.cfg
@@ -10,6 +10,12 @@ doc: Read/Write |MO| integrals from/to disk [ Write | Read | None ]
 interface: ezfio,provider,ocaml
 default: None
 
+[mo_integrals_cache_shift]
+type: integer
+doc: Adjusts the size of the MO integrals cache. 2: 2KB, 3: 32KB, 4: 512KB, 5: 8MB, 6: 128MB, 7: 2GB, 8: 32GB, 9: 512GB
+interface: ezfio, provider, ocaml
+default: 7
+
 [mo_integrals_threshold]
 type: Threshold
 doc: If | <ij|kl> | < `mo_integrals_threshold` then <ij|kl> is zero
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 13fcc19a..168c34b4 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -34,38 +34,28 @@ end
 
  BEGIN_PROVIDER [ integer, mo_integrals_cache_min ]
 &BEGIN_PROVIDER [ integer, mo_integrals_cache_max ]
-&BEGIN_PROVIDER [ integer, mo_integrals_cache_shift]
 &BEGIN_PROVIDER [ integer, mo_integrals_cache_size ]
  implicit none
  BEGIN_DOC
  ! Min and max values of the MOs for which the integrals are in the cache
  END_DOC
- if (qp_max_mem < 1) then
-   mo_integrals_cache_shift = 5  ! 5 = log(32).
- else if (qp_max_mem < 2) then
-   mo_integrals_cache_shift = 6  ! 6 = log(64).
- else
-   mo_integrals_cache_shift = 7  ! 7 = log(128). Max 7
- endif
 
-mo_integrals_cache_shift = 2  ! 5 = log(32).
-
- mo_integrals_cache_size  = 2**mo_integrals_cache_shift
+ mo_integrals_cache_size  = 2_8**mo_integrals_cache_shift
 
  mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
  mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
-print *, 'mo_integrals_cache: (', mo_integrals_cache_min, ', ', mo_integrals_cache_max, ')'
+ print *, 'MO integrals cache: (', mo_integrals_cache_min, ', ', mo_integrals_cache_max, ')'
 
 END_PROVIDER
 
-BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:mo_integrals_cache_size**4) ]
+BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:(1_8*mo_integrals_cache_size)**4) ]
  implicit none
  BEGIN_DOC
  ! Cache of MO integrals for fast access
  END_DOC
  PROVIDE mo_two_e_integrals_in_map
  integer                        :: i,j,k,l
- integer                        :: ii
+ integer*8                      :: ii
  integer(key_kind)              :: idx
  real(integral_kind)            :: integral
  FREE ao_integrals_cache
@@ -75,8 +65,8 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:mo_integrals_cache_siz
    !$OMP PARALLEL DO PRIVATE (k,l,ii)
    do l=mo_integrals_cache_min,mo_integrals_cache_max
      do k=mo_integrals_cache_min,mo_integrals_cache_max
-         ii = l-mo_integrals_cache_min
-         ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
+         ii = int(l-mo_integrals_cache_min,8)
+         ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8))
          ii = shiftl(ii,mo_integrals_cache_shift)
          ii = shiftl(ii,mo_integrals_cache_shift)
          call dgemm('T','N', mo_integrals_cache_max-mo_integrals_cache_min+1, &
@@ -99,10 +89,10 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0:mo_integrals_cache_siz
            call two_e_integrals_index(i,j,k,l,idx)
            !DIR$ FORCEINLINE
            call map_get(mo_integrals_map,idx,integral)
-           ii = l-mo_integrals_cache_min
-           ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
-           ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
-           ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
+           ii = int(l-mo_integrals_cache_min,8)
+           ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8))
+           ii = ior( shiftl(ii,mo_integrals_cache_shift), int(j-mo_integrals_cache_min,8))
+           ii = ior( shiftl(ii,mo_integrals_cache_shift), int(i-mo_integrals_cache_min,8))
            mo_integrals_cache(ii) = integral
          enddo
        enddo
@@ -121,12 +111,12 @@ double precision function get_two_e_integral_cache(i,j,k,l)
   ! Returns one integral <ij|kl> in the MO basis taken from the cache
   END_DOC
   integer, intent(in)            :: i,j,k,l
-  integer                        :: ii
+  integer*8                      :: ii
 
-  ii = l-mo_integrals_cache_min
-  ii = ior( shiftl(ii,mo_integrals_cache_shift), k-mo_integrals_cache_min)
-  ii = ior( shiftl(ii,mo_integrals_cache_shift), j-mo_integrals_cache_min)
-  ii = ior( shiftl(ii,mo_integrals_cache_shift), i-mo_integrals_cache_min)
+  ii = int(l-mo_integrals_cache_min,8)
+  ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8))
+  ii = ior( shiftl(ii,mo_integrals_cache_shift), int(j-mo_integrals_cache_min,8))
+  ii = ior( shiftl(ii,mo_integrals_cache_shift), int(i-mo_integrals_cache_min,8))
   get_two_e_integral_cache = mo_integrals_cache(ii)
 
 end
@@ -215,11 +205,6 @@ subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map)
       return
   endif
 !
-!  if (do_mo_cholesky) then
-!    call get_from_mo_cholesky_caches(j,k,l,out_val)
-!    return
-!  endif
-
   ii = l-mo_integrals_cache_min
   ii = ior(ii, k-mo_integrals_cache_min)
   ii = ior(ii, j-mo_integrals_cache_min)
@@ -359,14 +344,14 @@ subroutine get_mo_two_e_integrals_cache(j,k,l,sze,out_val)
   END_DOC
   integer, intent(in)            :: j,k,l, sze
   double precision, intent(out)  :: out_val(sze)
-  integer                        :: ii
+  integer*8                      :: ii
 
-  ii = l-mo_integrals_cache_min
-  ii = ior( shiftl(ii, mo_integrals_cache_shift), k-mo_integrals_cache_min)
-  ii = ior( shiftl(ii, mo_integrals_cache_shift), j-mo_integrals_cache_min)
+  ii = int(l-mo_integrals_cache_min,8)
+  ii = ior( shiftl(ii, mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8))
+  ii = ior( shiftl(ii, mo_integrals_cache_shift), int(j-mo_integrals_cache_min,8))
   ii = shiftl(ii, mo_integrals_cache_shift)
   out_val(mo_integrals_cache_min:mo_integrals_cache_max) = &
-       mo_integrals_cache(ii:ii+mo_integrals_cache_max-mo_integrals_cache_min)
+       mo_integrals_cache(ii:ii+int(mo_integrals_cache_max-mo_integrals_cache_min,8))
 
 end
 
@@ -434,10 +419,6 @@ subroutine get_mo_two_e_integrals_i1j1(k,l,sze,out_array,map)
          cholesky_mo_transp(1,1,1), cholesky_mo_num, &
          cholesky_mo_transp(1,k,l), 1, 0.d0, &
          out_array, 1)
-!
-!      do j=1,sze
-!        call get_from_mo_cholesky_caches(k,j,l,out_array(1,j))
-!      enddo
 
     else
 

From d89682cb7ee8beeb0e01d6129afd18d8cd9c78ee Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 13 Jun 2024 17:50:27 +0200
Subject: [PATCH 084/131] Improved disk access in Cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 98 +++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 9 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 41cdb80d..05f7115d 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -41,7 +41,7 @@ END_PROVIDER
    integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
    logical, allocatable           :: computed(:)
 
-   integer                        :: i,j,k,m,p,q, dj, p2, q2
+   integer                        :: i,j,k,m,p,q, dj, p2, q2, ii, jj
    integer*8                      :: i8, j8, p8, qj8, rank_max, np8
    integer                        :: N, np, nq
 
@@ -65,6 +65,8 @@ END_PROVIDER
    type(c_ptr)                    :: c_pointer(2)
    integer                        :: fd(2)
    logical                        :: delta_on_disk
+   integer                        :: dgemm_block_size, nqq
+   double precision, allocatable  :: dgemm_buffer1(:,:), dgemm_buffer2(:,:)
 
    PROVIDE nproc
    PROVIDE nucl_coord ao_two_e_integral_schwartz
@@ -300,17 +302,58 @@ END_PROVIDER
        !$OMP BARRIER
        !$OMP END PARALLEL
 
+       PROVIDE nproc
        if (N>0) then
-           call dgemm('N','T', np, nq, N, -1.d0,                       &
-               Ltmp_p, np, Ltmp_q, nq, 0.d0, Delta, np)
-       else
-         !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) SCHEDULE(static,1)
-         do q=1,nq
-           do j=1,np
-             Delta(j,q) = 0.d0
+
+         if (delta_on_disk) then
+           ! Blocking improves I/O performance
+
+           dgemm_block_size = nproc*4
+
+           allocate (dgemm_buffer1(np,dgemm_block_size))
+           allocate (dgemm_buffer2(dgemm_block_size,N))
+
+           do jj=1,nq,dgemm_block_size
+
+             nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
+
+             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii)
+             do ii=1,N
+               do q=jj,jj+nqq-1
+                 dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
+               enddo
+             enddo
+             !$OMP END PARALLEL DO
+
+             call dgemm('N', 'T', np, nqq, N, 1.d0,                       &
+                   Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
+
+             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
+             do q=jj,jj+nqq-1
+               Delta(:,q) = - dgemm_buffer1(:, q-jj+1)
+             enddo
+             !$OMP END PARALLEL DO
+
            enddo
+
+           deallocate(dgemm_buffer1, dgemm_buffer2)
+
+         else
+
+           call dgemm('N', 'T', np, nq, N, -1.d0,                       &
+                  Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np)
+
+         endif
+
+
+       else
+
+         !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j)
+         do q=1,nq
+           Delta(:,q) = 0.d0
          enddo
          !$OMP END PARALLEL DO
+
        endif
 
        ! f.
@@ -329,9 +372,46 @@ END_PROVIDER
          rank = N+j
 
          if (iblock == block_size) then
-           call dgemm('N','T',np,nq,block_size,-1.d0,                &
+
+           if (delta_on_disk) then
+             ! Blocking improves I/O performance
+
+             dgemm_block_size = nproc*4
+             allocate (dgemm_buffer1(np,dgemm_block_size))
+             allocate (dgemm_buffer2(dgemm_block_size,block_size))
+
+             do jj=1,nq,dgemm_block_size
+               nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
+
+               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii)
+               do ii=1,block_size
+                 do q=jj,jj+nqq-1
+                   dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
+                 enddo
+               enddo
+               !$OMP END PARALLEL DO
+
+               call dgemm('N', 'T', np, nqq, block_size, 1.d0,                       &
+                     Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
+
+               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
+               do q=jj,jj+nqq-1
+                 Delta(:,q) = Delta(:,q) - dgemm_buffer1(:, q-jj+1)
+               enddo
+               !$OMP END PARALLEL DO
+
+             enddo
+             deallocate(dgemm_buffer1, dgemm_buffer2)
+
+           else
+
+             call dgemm('N','T',np,nq,block_size,-1.d0,                &
                  Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
+
+           endif
+
            iblock = 0
+
          endif
 
          ! ii.

From e876f635d636b1cb878cdb9baf9e7b3906bf955f Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 14 Jun 2024 16:26:23 +0200
Subject: [PATCH 085/131] Asyc Fortran I/O

---
 src/ao_two_e_ints/cholesky.irp.f | 161 ++++++++++++++++++++++---------
 src/utils/fortran_mmap.c         |   7 +-
 2 files changed, 118 insertions(+), 50 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 05f7115d..d731ef04 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -31,12 +31,12 @@ END_PROVIDER
    integer*8                      :: ndim8
    integer                        :: rank
    double precision               :: tau, tau2
-   double precision, pointer      :: L(:,:), Delta(:,:)
+   double precision, pointer      :: L(:,:)
 
    double precision               :: s
    double precision               :: dscale, dscale_tmp
 
-   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:)
+   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:)
    integer, allocatable           :: addr1(:), addr2(:)
    integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
    logical, allocatable           :: computed(:)
@@ -66,7 +66,7 @@ END_PROVIDER
    integer                        :: fd(2)
    logical                        :: delta_on_disk
    integer                        :: dgemm_block_size, nqq
-   double precision, allocatable  :: dgemm_buffer1(:,:), dgemm_buffer2(:,:)
+   double precision, allocatable  :: dgemm_buffer1(:,:), dgemm_buffer2(:,:), dgemm_buffer3(:,:)
 
    PROVIDE nproc
    PROVIDE nucl_coord ao_two_e_integral_schwartz
@@ -230,7 +230,7 @@ END_PROVIDER
            stop -1
          endif
 
-         if (s > 0.1d0) then
+         if (s > 0.3d0) then
            exit
          endif
 
@@ -245,13 +245,16 @@ END_PROVIDER
              + np*memory_of_double(nq)       &! Delta(np,nq)
              + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
-       if (mem > qp_max_mem) then
-         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2))
-         call c_f_pointer(c_pointer(2), Delta, (/ np, nq /))
-         ! Deleting the file while it is open makes the file invisible on the filesystem,
-         ! and automatically deleted, even if the program crashes
+       if (1.1*mem > qp_max_mem) then
+!         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2))
+!         call c_f_pointer(c_pointer(2), Delta, (/ np, nq /))
+
+!         ! Deleting the file while it is open makes the file invisible on the filesystem,
+!         ! and automatically deleted, even if the program crashes
          iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R')
          close(iunit,status='delete')
+         open(unit=iunit, access='DIRECT', form='UNFORMATTED', RECL=(np+1)*8, &
+            ASYNCHRONOUS='YES', file=trim(ezfio_work_dir)//'cholesky_delta')
          delta_on_disk = .True.
        else
          allocate(Delta(np,nq))
@@ -303,15 +306,18 @@ END_PROVIDER
        !$OMP END PARALLEL
 
        PROVIDE nproc
-       if (N>0) then
 
-         if (delta_on_disk) then
+       if (delta_on_disk) then
+
+         dgemm_block_size = nproc*4
+
+         allocate (dgemm_buffer1(np,dgemm_block_size))
+         allocate (dgemm_buffer3(np,dgemm_block_size))
+         allocate (dgemm_buffer2(dgemm_block_size,N))
+
+         if (N>0) then
            ! Blocking improves I/O performance
 
-           dgemm_block_size = nproc*4
-
-           allocate (dgemm_buffer1(np,dgemm_block_size))
-           allocate (dgemm_buffer2(dgemm_block_size,N))
 
            do jj=1,nq,dgemm_block_size
 
@@ -325,34 +331,55 @@ END_PROVIDER
              enddo
              !$OMP END PARALLEL DO
 
-             call dgemm('N', 'T', np, nqq, N, 1.d0,                       &
+print *, np, nq, jj, nqq, N
+             call dgemm('N', 'T', np, nqq, N, -1.d0,                       &
                    Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
 
-             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
+             wait(iunit)
+             dgemm_buffer3(:,:) = dgemm_buffer1(:,:)
+!             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
              do q=jj,jj+nqq-1
-               Delta(:,q) = - dgemm_buffer1(:, q-jj+1)
+               write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(1:np, q-jj+1)
              enddo
-             !$OMP END PARALLEL DO
+!             !$OMP END PARALLEL DO
 
            enddo
-
-           deallocate(dgemm_buffer1, dgemm_buffer2)
+print *, 'ok1'
 
          else
 
-           call dgemm('N', 'T', np, nq, N, -1.d0,                       &
-                  Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np)
+
+           dgemm_buffer1(1:np,1) = 0.d0
+
+!           !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
+           do q=1,nq
+             write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer1(1:np, 1)
+           enddo
+!           !$OMP END PARALLEL DO
 
          endif
 
+         deallocate(dgemm_buffer1, dgemm_buffer2)
+         if (delta_on_disk) wait(iunit)
+         deallocate(dgemm_buffer3)
+
 
        else
 
-         !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j)
-         do q=1,nq
-           Delta(:,q) = 0.d0
-         enddo
-         !$OMP END PARALLEL DO
+         if (N>0) then
+
+           call dgemm('N', 'T', np, nq, N, -1.d0,                       &
+                  Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np)
+
+         else
+
+           !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j)
+           do q=1,nq
+             Delta(:,q) = 0.d0
+           enddo
+           !$OMP END PARALLEL DO
+
+         endif
 
        endif
 
@@ -383,25 +410,40 @@ END_PROVIDER
              do jj=1,nq,dgemm_block_size
                nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
 
-               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii)
+               !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(q,ii)
+               !$OMP DO
                do ii=1,block_size
                  do q=jj,jj+nqq-1
                    dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
                  enddo
                enddo
-               !$OMP END PARALLEL DO
+               !$OMP END DO
+               !$OMP END PARALLEL
 
-               call dgemm('N', 'T', np, nqq, block_size, 1.d0,                       &
-                     Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
-
-               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
+!               !$OMP DO
                do q=jj,jj+nqq-1
-                 Delta(:,q) = Delta(:,q) - dgemm_buffer1(:, q-jj+1)
+                 read(iunit, rec=q) dgemm_buffer1(1:np, q-jj+1)
                enddo
-               !$OMP END PARALLEL DO
+!               !$OMP END DO
+
+print *, np, nq, jj, nqq, block_size
+               call dgemm('N', 'T', np, nqq, block_size, -1.d0,                       &
+                     Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 1.d0, dgemm_buffer1, np)
+
+               wait(iunit)
+               dgemm_buffer3 = dgemm_buffer1
+
+!               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
+               do q=jj,jj+nqq-1
+                 write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(:, q-jj+1)
+               enddo
+!               !$OMP END PARALLEL DO
 
              enddo
+print *, 'ok'
              deallocate(dgemm_buffer1, dgemm_buffer2)
+             wait(iunit)
+             deallocate(dgemm_buffer3)
 
            else
 
@@ -427,11 +469,20 @@ END_PROVIDER
          enddo
 
          iblock = iblock+1
-         !$OMP PARALLEL DO PRIVATE(p)
-         do p=1,np
-           Ltmp_p(p,iblock) = Delta(p,dj)
-         enddo
-         !$OMP END PARALLEL DO
+
+         if (delta_on_disk) then
+
+           read(iunit,rec=dj) Ltmp_p(1:np,iblock)
+
+         else
+
+           !$OMP PARALLEL DO PRIVATE(p)
+           do p=1,np
+             Ltmp_p(p,iblock) = Delta(p,dj)
+           enddo
+           !$OMP END PARALLEL DO
+
+         endif
 
          if (.not.computed(dj)) then
            m = dj
@@ -463,12 +514,26 @@ END_PROVIDER
                !$OMP END PARALLEL DO
            endif
 
-           !$OMP PARALLEL DO PRIVATE(p)
-           do p=1,np
-             Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
-             Delta(p,dj) =  Ltmp_p(p,iblock)
-           enddo
-           !$OMP END PARALLEL DO
+           if (delta_on_disk) then
+
+             !$OMP PARALLEL DO PRIVATE(p)
+             do p=1,np
+               Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
+             enddo
+             !$OMP END PARALLEL DO
+
+             write(iunit, ASYNCHRONOUS='YES', rec=dj) Ltmp_p(1:np,iblock)
+
+           else
+
+             !$OMP PARALLEL DO PRIVATE(p)
+             do p=1,np
+               Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
+               Delta(p,dj) =  Ltmp_p(p,iblock)
+             enddo
+             !$OMP END PARALLEL DO
+
+           endif
 
            computed(dj) = .True.
          endif
@@ -512,7 +577,7 @@ END_PROVIDER
        deallocate(Ltmp_q)
        deallocate(computed)
        if (delta_on_disk) then
-         call munmap( (/ np*1_8, nq*1_8 /), 8, fd(2), c_pointer(2) )
+         close(iunit, status='delete')
        else
          deallocate(Delta)
        endif
diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c
index 711a9c34..2dbe42b8 100644
--- a/src/utils/fortran_mmap.c
+++ b/src/utils/fortran_mmap.c
@@ -40,7 +40,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
             exit(EXIT_FAILURE);
         }
 
-        result = write(fd, "", 1);
+        result = write(fd, " ", 1);
         if (result != 1) {
             close(fd);
             printf("%s:\n", filename);
@@ -49,7 +49,10 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
         }
 
         if (single_node == 1) {
-          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK, fd, 0);
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0);
+          if (map == MAP_FAILED) {
+             map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+          }
         } else {
           map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
         }

From f671c669f8cae460911c1e016e9e44297e817d79 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 17 Jun 2024 14:57:48 +0200
Subject: [PATCH 086/131] Use less memory in Cholesky

---
 src/ao_two_e_ints/cholesky.irp.f      | 378 +++++++-------------------
 src/hartree_fock/fock_matrix_hf.irp.f |  21 +-
 2 files changed, 119 insertions(+), 280 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index d731ef04..a680e7ee 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -31,14 +31,14 @@ END_PROVIDER
    integer*8                      :: ndim8
    integer                        :: rank
    double precision               :: tau, tau2
-   double precision, pointer      :: L(:,:)
+   double precision, pointer      :: L(:,:), Delta(:,:)
 
    double precision               :: s
    double precision               :: dscale, dscale_tmp
 
-   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:)
+   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:)
    integer, allocatable           :: addr1(:), addr2(:)
-   integer*8, allocatable         :: Lset(:), Dset(:), addr3(:)
+   integer*8, allocatable         :: Lset(:), Dset(:)
    logical, allocatable           :: computed(:)
 
    integer                        :: i,j,k,m,p,q, dj, p2, q2, ii, jj
@@ -64,11 +64,8 @@ END_PROVIDER
 
    type(c_ptr)                    :: c_pointer(2)
    integer                        :: fd(2)
-   logical                        :: delta_on_disk
-   integer                        :: dgemm_block_size, nqq
-   double precision, allocatable  :: dgemm_buffer1(:,:), dgemm_buffer2(:,:), dgemm_buffer3(:,:)
 
-   PROVIDE nproc
+   PROVIDE nproc ao_cholesky_threshold do_direct_integrals qp_max_mem
    PROVIDE nucl_coord ao_two_e_integral_schwartz
    call set_multiple_levels_omp(.False.)
 
@@ -88,19 +85,8 @@ END_PROVIDER
 
    else
 
-     PROVIDE nucl_coord ao_two_e_integral_schwartz
      call set_multiple_levels_omp(.False.)
 
-     call resident_memory(mem0)
-
-     rank_max = min(ndim8,(qp_max_mem*1024_8*1024_8*1024_8/8_8)/ndim8)
-     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
-     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
-     ! Deleting the file while it is open makes the file invisible on the filesystem,
-     ! and automatically deleted, even if the program crashes
-     iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R')
-     close(iunit,status='delete')
-
      if (do_direct_integrals) then
        if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then
          ! Trigger providers inside ao_two_e_integral
@@ -113,8 +99,12 @@ END_PROVIDER
      tau = ao_cholesky_threshold
      tau2 = tau*tau
 
-     mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8)
-     call check_mem(mem, irp_here)
+     rank = 0
+
+     allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8))
+     allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8) )
+
+     call resident_memory(mem0)
 
      call print_memory_usage()
 
@@ -127,46 +117,35 @@ END_PROVIDER
      print *,  '============ ============='
 
 
-     rank = 0
-
-     allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8))
-     allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) )
-!print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8)
-!print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8)
-!print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8)
-!print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8)
-
      ! 1.
-     k=0
+     i8=0
      do j=1,ao_num
        do i=1,ao_num
-         k = k+1
-         addr1(k) = i
-         addr2(k) = j
-         addr3(k) = (i-1)*ao_num + j
+         i8 = i8+1
+         addr1(i8) = i
+         addr2(i8) = j
        enddo
      enddo
 
      if (do_direct_integrals) then
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21)
        do i8=ndim8,1,-1
          D(i8) = ao_two_e_integral(addr1(i8), addr2(i8),              &
              addr1(i8), addr2(i8))
        enddo
        !$OMP END PARALLEL DO
      else
-       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16)
+       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21)
        do i8=ndim8,1,-1
          D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8),          &
-             addr2(i8), addr2(i8),                                   &
-             ao_integrals_map)
+             addr2(i8), addr2(i8), ao_integrals_map)
        enddo
        !$OMP END PARALLEL DO
      endif
+
      D_sorted(:) = -D(:)
      call dsort_noidx_big(D_sorted,ndim8)
-     D_sorted(:) = dabs(D_sorted(:))
-
+     D_sorted(:) = -D_sorted(:)
      Dmax = D_sorted(1)
 
      ! 2.
@@ -174,12 +153,24 @@ END_PROVIDER
      dscale_tmp = dscale*dscale*Dmax
      np8=0_8
      do p8=1,ndim8
-       if ( dscale_tmp*D(p8) > tau2 ) then
+       if ( dscale_tmp*D(p8) >= tau2 ) then
          np8 = np8+1_8
          Lset(np8) = p8
        endif
      enddo
      np = np8
+     if (np <= 0) stop 'np<=0'
+     if (np > ndim8) stop 'np>ndim8'
+
+     rank_max = min(np,20*elec_num*elec_num)
+     call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
+     call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
+
+     ! Deleting the file while it is open makes the file invisible on the filesystem,
+     ! and automatically deleted, even if the program crashes
+     iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R')
+     close(iunit,status='delete')
+
 
      ! 3.
      N = 0
@@ -187,85 +178,59 @@ END_PROVIDER
      ! 4.
      i = 0
 
+     mem = memory_of_double(np)                & ! Delta(np,nq)
+         + (np+1)*memory_of_double(block_size)   ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
+
+!     call check_mem(mem)
+
      ! 5.
-     do while ( (Dmax > tau).and.(rank*1_8 < min(ndim8,rank_max)) )
+     do while ( (Dmax > tau).and.(np > 0) )
        ! a.
        i = i+1
 
 
-       ! Inrease s until the arrays fit in memory
-       s = 0.01d0
        block_size = max(N,24)
+
+       ! Determine nq so that Delta fits in memory
+
+       s = 0.1d0
+       Dmin = max(s*Dmax,tau)
+       do nq=2,np-1
+         if (D_sorted(nq) < Dmin) exit
+       enddo
+
        do while (.True.)
 
-         ! b.
-         Dmin = max(s*Dmax,tau)
+         mem = mem0                                 &
+             + np*memory_of_double(nq)              & ! Delta(np,nq)
+             + (np+nq)*memory_of_double(block_size) & ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
+             + memory_of_int(nq)                      ! computed(nq)
 
-         ! c.
-         nq=0
-         do p=1,np
-           if ( D(Lset(p)) > Dmin ) then
-             nq = nq+1
-             Dset(nq) = Lset(p)
-           endif
-         enddo
-
-
-         mem = mem0                                &
-             + np*memory_of_double(nq)
-
-!print *, 'mem = ', mem
-         if (mem > qp_max_mem/2) then
-           s = s*2.d0
+         if (mem > qp_max_mem*0.5d0) then
+           nq = nq/2
          else
            exit
          endif
 
-         if ((s > 1.d0).or.(nq == 0)) then
-           call print_memory_usage()
-           print *, 'Required peak memory: ', mem, 'Gb'
-           call resident_memory(mem)
-           print *, 'Already used  memory: ', mem, 'Gb'
-           print *, 'Not enough memory. Reduce cholesky threshold'
-           stop -1
-         endif
-
-         if (s > 0.3d0) then
-           exit
-         endif
-
        enddo
 
-       ! d., e.
-       mem = mem0                            &
-             + memory_of_int(nq)             &! computed(nq)
-             + np*memory_of_int(nq)          &! computed(nq)
-             + memory_of_double(np)          &! Delta_col(np)
-             + 7*memory_of_double8(ndim8)     &! D, Lset, Dset, D_sorted, addr[1-3]
-             + np*memory_of_double(nq)       &! Delta(np,nq)
-             + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
-
-       if (1.1*mem > qp_max_mem) then
-!         call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2))
-!         call c_f_pointer(c_pointer(2), Delta, (/ np, nq /))
-
-!         ! Deleting the file while it is open makes the file invisible on the filesystem,
-!         ! and automatically deleted, even if the program crashes
-         iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R')
-         close(iunit,status='delete')
-         open(unit=iunit, access='DIRECT', form='UNFORMATTED', RECL=(np+1)*8, &
-            ASYNCHRONOUS='YES', file=trim(ezfio_work_dir)//'cholesky_delta')
-         delta_on_disk = .True.
-       else
-         allocate(Delta(np,nq))
-         delta_on_disk = .False.
+       if (nq <= 0) then
+         print *, nq
+         stop 'bug in cholesky: nq <= 0'
        endif
-!print *, delta_on_disk
 
-       allocate(Delta_col(np))
+       Dmin = D_sorted(nq)
+       nq=0
+       do p=1,np
+         if ( D(Lset(p)) >= Dmin ) then
+           nq = nq+1
+           Dset(nq) = Lset(p)
+         endif
+       enddo
 
+
+       allocate(Delta(np,nq))
        allocate(Ltmp_p(np,block_size), stat=ierr)
-!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -274,7 +239,6 @@ END_PROVIDER
        endif
 
        allocate(Ltmp_q(nq,block_size), stat=ierr)
-!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8), nq, block_size
 
        if (ierr /= 0) then
          call print_memory_usage()
@@ -287,7 +251,6 @@ END_PROVIDER
        computed(:) = .False.
 
 
-!print *, 'N, rank, block_size', N, rank, block_size
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q)
        do k=1,N
          !$OMP DO
@@ -305,81 +268,18 @@ END_PROVIDER
        !$OMP BARRIER
        !$OMP END PARALLEL
 
-       PROVIDE nproc
-
-       if (delta_on_disk) then
-
-         dgemm_block_size = nproc*4
-
-         allocate (dgemm_buffer1(np,dgemm_block_size))
-         allocate (dgemm_buffer3(np,dgemm_block_size))
-         allocate (dgemm_buffer2(dgemm_block_size,N))
-
-         if (N>0) then
-           ! Blocking improves I/O performance
-
-
-           do jj=1,nq,dgemm_block_size
-
-             nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
-
-             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii)
-             do ii=1,N
-               do q=jj,jj+nqq-1
-                 dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
-               enddo
-             enddo
-             !$OMP END PARALLEL DO
-
-print *, np, nq, jj, nqq, N
-             call dgemm('N', 'T', np, nqq, N, -1.d0,                       &
-                   Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np)
-
-             wait(iunit)
-             dgemm_buffer3(:,:) = dgemm_buffer1(:,:)
-!             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
-             do q=jj,jj+nqq-1
-               write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(1:np, q-jj+1)
-             enddo
-!             !$OMP END PARALLEL DO
-
-           enddo
-print *, 'ok1'
-
-         else
-
-
-           dgemm_buffer1(1:np,1) = 0.d0
-
-!           !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
-           do q=1,nq
-             write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer1(1:np, 1)
-           enddo
-!           !$OMP END PARALLEL DO
-
-         endif
-
-         deallocate(dgemm_buffer1, dgemm_buffer2)
-         if (delta_on_disk) wait(iunit)
-         deallocate(dgemm_buffer3)
-
-
-       else
-
-         if (N>0) then
+       if (N>0) then
 
            call dgemm('N', 'T', np, nq, N, -1.d0,                       &
                   Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np)
 
-         else
+       else
 
-           !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j)
-           do q=1,nq
-             Delta(:,q) = 0.d0
-           enddo
-           !$OMP END PARALLEL DO
-
-         endif
+         !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j)
+         do q=1,nq
+           Delta(:,q) = 0.d0
+         enddo
+         !$OMP END PARALLEL DO
 
        endif
 
@@ -395,64 +295,20 @@ print *, 'ok1'
        do j=1,nq
 
          if ( (Qmax <= Dmin).or.(N+j*1_8 > ndim8) ) exit
+
          ! i.
          rank = N+j
+         if (rank == rank_max) then
+           print *, 'cholesky: rank_max reached'
+           exit
+         endif
 
          if (iblock == block_size) then
 
-           if (delta_on_disk) then
-             ! Blocking improves I/O performance
-
-             dgemm_block_size = nproc*4
-             allocate (dgemm_buffer1(np,dgemm_block_size))
-             allocate (dgemm_buffer2(dgemm_block_size,block_size))
-
-             do jj=1,nq,dgemm_block_size
-               nqq = min(nq, jj+dgemm_block_size-1) - jj + 1
-
-               !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(q,ii)
-               !$OMP DO
-               do ii=1,block_size
-                 do q=jj,jj+nqq-1
-                   dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii)
-                 enddo
-               enddo
-               !$OMP END DO
-               !$OMP END PARALLEL
-
-!               !$OMP DO
-               do q=jj,jj+nqq-1
-                 read(iunit, rec=q) dgemm_buffer1(1:np, q-jj+1)
-               enddo
-!               !$OMP END DO
-
-print *, np, nq, jj, nqq, block_size
-               call dgemm('N', 'T', np, nqq, block_size, -1.d0,                       &
-                     Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 1.d0, dgemm_buffer1, np)
-
-               wait(iunit)
-               dgemm_buffer3 = dgemm_buffer1
-
-!               !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q)
-               do q=jj,jj+nqq-1
-                 write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(:, q-jj+1)
-               enddo
-!               !$OMP END PARALLEL DO
-
-             enddo
-print *, 'ok'
-             deallocate(dgemm_buffer1, dgemm_buffer2)
-             wait(iunit)
-             deallocate(dgemm_buffer3)
-
-           else
-
-             call dgemm('N','T',np,nq,block_size,-1.d0,                &
+            call dgemm('N','T',np,nq,block_size,-1.d0,                &
                  Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np)
 
-           endif
-
-           iblock = 0
+            iblock = 0
 
          endif
 
@@ -469,71 +325,47 @@ print *, 'ok'
          enddo
 
          iblock = iblock+1
-
-         if (delta_on_disk) then
-
-           read(iunit,rec=dj) Ltmp_p(1:np,iblock)
-
-         else
-
-           !$OMP PARALLEL DO PRIVATE(p)
-           do p=1,np
-             Ltmp_p(p,iblock) = Delta(p,dj)
-           enddo
-           !$OMP END PARALLEL DO
-
-         endif
+         !$OMP PARALLEL DO PRIVATE(p)
+         do p=1,np
+           Ltmp_p(p,iblock) = Delta(p,dj)
+         enddo
+         !$OMP END PARALLEL DO
 
          if (.not.computed(dj)) then
            m = dj
            if (do_direct_integrals) then
                !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21)
                do k=1,np
+                 Delta_col(k) = 0.d0
                  if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
                        addr2(Lset(k)), addr2(Dset(m)) ) ) then
                      Delta_col(k) = &
                          ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),&
                          addr1(Dset(m)), addr2(Dset(m)))
-                 else
-                     Delta_col(k) = 0.d0
                  endif
                enddo
                !$OMP END PARALLEL DO
            else
+               PROVIDE ao_integrals_map
                !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21)
                do k=1,np
+                 Delta_col(k) = 0.d0
                  if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),&
                        addr2(Lset(k)), addr2(Dset(m)) ) ) then
                      Delta_col(k) = &
                          get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),&
                          addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map)
-                 else
-                     Delta_col(k) = 0.d0
                  endif
                enddo
                !$OMP END PARALLEL DO
            endif
 
-           if (delta_on_disk) then
-
-             !$OMP PARALLEL DO PRIVATE(p)
-             do p=1,np
-               Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
-             enddo
-             !$OMP END PARALLEL DO
-
-             write(iunit, ASYNCHRONOUS='YES', rec=dj) Ltmp_p(1:np,iblock)
-
-           else
-
-             !$OMP PARALLEL DO PRIVATE(p)
-             do p=1,np
-               Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
-               Delta(p,dj) =  Ltmp_p(p,iblock)
-             enddo
-             !$OMP END PARALLEL DO
-
-           endif
+           !$OMP PARALLEL DO PRIVATE(p)
+           do p=1,np
+             Ltmp_p(p,iblock) =  Ltmp_p(p,iblock) + Delta_col(p)
+             Delta(p,dj) =  Ltmp_p(p,iblock)
+           enddo
+           !$OMP END PARALLEL DO
 
            computed(dj) = .True.
          endif
@@ -572,30 +404,26 @@ print *, 'ok'
 
        print '(I10, 4X, ES12.3)', rank, Qmax
 
-       deallocate(Delta_col)
        deallocate(Ltmp_p)
        deallocate(Ltmp_q)
        deallocate(computed)
-       if (delta_on_disk) then
-         close(iunit, status='delete')
-       else
-         deallocate(Delta)
-       endif
+       deallocate(Delta)
 
        ! i.
        N = rank
 
        ! j.
-       Dmax = D(Lset(1))
-       do p=1,np
-         Dmax = max(Dmax, D(Lset(p)))
-       enddo
+       D_sorted(:) = -D(:)
+       call dsort_noidx_big(D_sorted,ndim8)
+       D_sorted(:) = -D_sorted(:)
+
+       Dmax = D_sorted(1)
 
        dscale = 1.d0
        dscale_tmp = dscale*dscale*Dmax
        np8=0_8
        do p8=1,ndim8
-         if ( dscale_tmp*D(p8) > tau2 ) then
+         if ( dscale_tmp*D(p8) >= tau2 ) then
            np8 = np8+1_8
            Lset(np8) = p8
          endif
@@ -609,7 +437,6 @@ print *, 'ok'
      print *,  ''
 
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
-!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8)
 
      if (ierr /= 0) then
        call print_memory_usage()
@@ -621,7 +448,7 @@ print *, 'ok'
      !$OMP PARALLEL DO PRIVATE(k,j)
      do k=1,rank
        do j=1,ao_num
-           cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k)
+           cholesky_ao(1:ao_num,j,k) = L((j-1_8)*ao_num+1_8:1_8*j*ao_num,k)
        enddo
      enddo
      !$OMP END PARALLEL DO
@@ -646,5 +473,6 @@ print *, 'ok'
    call wall_time(wall1)
    print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min'
 
+
 END_PROVIDER
 
diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f
index 65b3d63c..6d917322 100644
--- a/src/hartree_fock/fock_matrix_hf.irp.f
+++ b/src/hartree_fock/fock_matrix_hf.irp.f
@@ -194,17 +194,28 @@ END_PROVIDER
  endif
 
 
- double precision :: rss
+ double precision :: rss, mem0, mem
  double precision :: memory_of_double
 
  integer :: iblock
- integer, parameter :: block_size = 32
+ integer :: block_size
+
+ call resident_memory(mem0)
+
+ block_size = 1024
+
+ rss = memory_of_double(2.d0*ao_num*ao_num)
+ do
+   mem = mem0 + block_size*rss
+   if ( (block_size < 2).or.(mem < qp_max_mem) ) exit
+   block_size = block_size/2
+ enddo
+
+ call check_mem(block_size*rss, irp_here)
 
- rss = memory_of_double(ao_num*ao_num)
- call check_mem(2.d0*block_size*rss, irp_here)
  allocate(X2(ao_num,ao_num,block_size,2))
  allocate(X3(ao_num,block_size,ao_num,2))
-    
+
 ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j)
 
  do iblock=1,cholesky_ao_num,block_size

From 4b578d9849df7fa548a0b8627f714df6248a8440 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 20 Jun 2024 13:43:46 +0200
Subject: [PATCH 087/131] mmap is now shared in cholesky

---
 src/ao_two_e_ints/cholesky.irp.f | 36 ++++++++++++++++++--------------
 src/utils/fortran_mmap.c         |  3 +++
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index a680e7ee..063cc365 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -31,12 +31,11 @@ END_PROVIDER
    integer*8                      :: ndim8
    integer                        :: rank
    double precision               :: tau, tau2
-   double precision, pointer      :: L(:,:), Delta(:,:)
+   double precision, pointer      :: L(:,:)
 
    double precision               :: s
-   double precision               :: dscale, dscale_tmp
 
-   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:)
+   double precision, allocatable  :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:)
    integer, allocatable           :: addr1(:), addr2(:)
    integer*8, allocatable         :: Lset(:), Dset(:)
    logical, allocatable           :: computed(:)
@@ -102,7 +101,7 @@ END_PROVIDER
      rank = 0
 
      allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8))
-     allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8) )
+     allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8), computed(ndim8) )
 
      call resident_memory(mem0)
 
@@ -149,11 +148,9 @@ END_PROVIDER
      Dmax = D_sorted(1)
 
      ! 2.
-     dscale = 1.d0
-     dscale_tmp = dscale*dscale*Dmax
      np8=0_8
      do p8=1,ndim8
-       if ( dscale_tmp*D(p8) >= tau2 ) then
+       if ( Dmax*D(p8) >= tau2 ) then
          np8 = np8+1_8
          Lset(np8) = p8
        endif
@@ -203,16 +200,23 @@ END_PROVIDER
 
          mem = mem0                                 &
              + np*memory_of_double(nq)              & ! Delta(np,nq)
-             + (np+nq)*memory_of_double(block_size) & ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
-             + memory_of_int(nq)                      ! computed(nq)
+             + (np+nq)*memory_of_double(block_size)   ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size)
 
          if (mem > qp_max_mem*0.5d0) then
-           nq = nq/2
+           Dmin = D_sorted(nq/2)
+           do ii=nq/2,np-1
+             if (D_sorted(ii) < Dmin) then
+               nq = ii
+               exit
+             endif
+           enddo
          else
            exit
          endif
 
        enddo
+!call print_memory_usage
+!print *, 'np, nq, Predicted memory: ', np, nq, mem
 
        if (nq <= 0) then
          print *, nq
@@ -247,8 +251,7 @@ END_PROVIDER
        endif
 
 
-       allocate(computed(nq))
-       computed(:) = .False.
+       computed(1:nq) = .False.
 
 
        !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q)
@@ -406,7 +409,6 @@ END_PROVIDER
 
        deallocate(Ltmp_p)
        deallocate(Ltmp_q)
-       deallocate(computed)
        deallocate(Delta)
 
        ! i.
@@ -419,11 +421,9 @@ END_PROVIDER
 
        Dmax = D_sorted(1)
 
-       dscale = 1.d0
-       dscale_tmp = dscale*dscale*Dmax
        np8=0_8
        do p8=1,ndim8
-         if ( dscale_tmp*D(p8) >= tau2 ) then
+         if ( Dmax*D(p8) >= tau2 ) then
            np8 = np8+1_8
            Lset(np8) = p8
          endif
@@ -436,6 +436,10 @@ END_PROVIDER
      print *,  '============ ============='
      print *,  ''
 
+     deallocate( D, Lset, Dset, D_sorted )
+     deallocate( addr1, addr2, Delta_col, computed )
+
+
      allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr)
 
      if (ierr /= 0) then
diff --git a/src/utils/fortran_mmap.c b/src/utils/fortran_mmap.c
index 2dbe42b8..0306f64f 100644
--- a/src/utils/fortran_mmap.c
+++ b/src/utils/fortran_mmap.c
@@ -49,10 +49,13 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only,
         }
 
         if (single_node == 1) {
+          map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+/*
           map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0);
           if (map == MAP_FAILED) {
              map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
           }
+*/
         } else {
           map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
         }

From b6b169c1cd5ad510364cf5b33800cad8a6b5272c Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 20 Jun 2024 17:27:21 +0200
Subject: [PATCH 088/131] Updated documentation

---
 src/ao_two_e_ints/cholesky.irp.f | 3 +++
 src/utils/linear_algebra.irp.f   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 41cdb80d..cdd64a8c 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -25,7 +25,10 @@ END_PROVIDER
    ! Last dimension of cholesky_ao is cholesky_ao_num
    !
    ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
+   !
    ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
+   !
+   ! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf
    END_DOC
 
    integer*8                      :: ndim8
diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f
index 20386b30..4e7ca87d 100644
--- a/src/utils/linear_algebra.irp.f
+++ b/src/utils/linear_algebra.irp.f
@@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U)
 !
 ! matrix A is destroyed inside this subroutine
 ! Cholesky vectors are stored in U
-! dimension of U: U(1:rank, 1:n)
+! dimension of U: U(1:n, 1:rank)
 ! U is allocated inside this subroutine
 ! rank is the number of Cholesky vectors depending on tol
 !

From 7e45c517d981adcd77883a39a361be38b470ff20 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 25 Jun 2024 18:32:44 +0200
Subject: [PATCH 089/131] Introducing gpu_x86

---
 src/gpu_x86/NEED           |   1 +
 src/gpu_x86/README.rst     |   5 +
 src/gpu_x86/gpu.c          | 506 +++++++++++++++++++++++++++++++++++++
 src/gpu_x86/gpu.h          |  41 +++
 src/gpu_x86/gpu_module.F90 | 141 +++++++++++
 5 files changed, 694 insertions(+)
 create mode 100644 src/gpu_x86/NEED
 create mode 100644 src/gpu_x86/README.rst
 create mode 100644 src/gpu_x86/gpu.c
 create mode 100644 src/gpu_x86/gpu.h
 create mode 100644 src/gpu_x86/gpu_module.F90

diff --git a/src/gpu_x86/NEED b/src/gpu_x86/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/gpu_x86/NEED
@@ -0,0 +1 @@
+
diff --git a/src/gpu_x86/README.rst b/src/gpu_x86/README.rst
new file mode 100644
index 00000000..f530bf29
--- /dev/null
+++ b/src/gpu_x86/README.rst
@@ -0,0 +1,5 @@
+=======
+gpu_x86
+=======
+
+x86 implementation of GPU routines. For use when GPUs are not available.
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
new file mode 100644
index 00000000..71505dbe
--- /dev/null
+++ b/src/gpu_x86/gpu.c
@@ -0,0 +1,506 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t i) {
+  return;
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t n) {
+  *ptr = malloc((size_t) n);
+  if (*ptr == NULL) {
+    perror("Allocation failed");
+  }
+}
+
+void gpu_free(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  memcpy(gpu_ptr, cpu_ptr, n);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  memcpy(cpu_ptr, gpu_ptr, n);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  memcpy(gpu_ptr_dest, gpu_ptr_src, n);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  *ptr = (void*) 2;
+}
+
+void gpu_stream_destroy(void** ptr) {
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void* handle, void* stream) {
+  return;
+}
+
+void gpu_synchronize() {
+  return;
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  *handle = (void*) 1;
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  *handle = NULL;
+}
+
+
+double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = ddot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = sdot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
+            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+}
+
+
+void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
+               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+}
+
+
+void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
+            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+}
+
+
+
+void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
+            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+}
+
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  if (handle == NULL) {
+    perror("NULL handle");
+    exit(-1);
+  }
+
+  if ( (transa == 'N' && transb == 'N') ||
+       (transa == 'n' && transb == 'N') ||
+       (transa == 'N' && transb == 'n') ||
+       (transa == 'n' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'N' && transb == 'T') ||
+              (transa == 'n' && transb == 'T') ||
+              (transa == 'N' && transb == 't') ||
+              (transa == 'n' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'N') ||
+              (transa == 't' && transb == 'N') ||
+              (transa == 'T' && transb == 'n') ||
+              (transa == 't' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'T') ||
+              (transa == 't' && transb == 'T') ||
+              (transa == 'T' && transb == 't') ||
+              (transa == 't' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
+
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  if (handle == NULL) {
+    perror("NULL handle");
+    exit(-1);
+  }
+
+  if ( (transa == 'N' && transb == 'N') ||
+       (transa == 'n' && transb == 'N') ||
+       (transa == 'N' && transb == 'n') ||
+       (transa == 'n' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'N' && transb == 'T') ||
+              (transa == 'n' && transb == 'T') ||
+              (transa == 'N' && transb == 't') ||
+              (transa == 'n' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'N') ||
+              (transa == 't' && transb == 'N') ||
+              (transa == 'T' && transb == 'n') ||
+              (transa == 't' && transb == 'n') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (transa == 'T' && transb == 'T') ||
+              (transa == 't' && transb == 'T') ||
+              (transa == 'T' && transb == 't') ||
+              (transa == 't' && transb == 't') ) {
+
+     if (alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<n ; ++i) {
+           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
diff --git a/src/gpu_x86/gpu.h b/src/gpu_x86/gpu.h
new file mode 100644
index 00000000..03aeeab7
--- /dev/null
+++ b/src/gpu_x86/gpu.h
@@ -0,0 +1,41 @@
+#include <stdint.h>
+
+int  gpu_ndevices();
+void gpu_set_device(int32_t i);
+
+void gpu_allocate(void** ptr, const int64_t n);
+void gpu_free(void** ptr);
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n);
+
+void gpu_stream_create(void** ptr);
+void gpu_stream_destroy(void** ptr);
+void gpu_set_stream(void* handle, void* stream);
+void gpu_synchronize();
+
+void gpu_blas_create(void** handle);
+void gpu_blas_destroy(void** handle);
+
+void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result);
+
+void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
+
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu_x86/gpu_module.F90
new file mode 100644
index 00000000..86ba3926
--- /dev/null
+++ b/src/gpu_x86/gpu_module.F90
@@ -0,0 +1,141 @@
+module gpu
+  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
+  implicit none
+
+  interface
+    integer function gpu_ndevices() bind(C)
+    end function
+
+    subroutine gpu_set_device(id) bind(C)
+      import
+      integer(c_int32_t), value :: id
+    end subroutine
+
+    subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
+      import
+      type(c_ptr) :: ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
+      import
+      type(c_ptr) :: ptr
+    end subroutine
+
+    subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
+      import
+      type(c_ptr), value :: cpu_ptr
+      type(c_ptr), value :: gpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
+      import
+      type(c_ptr), value :: gpu_ptr
+      type(c_ptr), value :: cpu_ptr
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
+      import
+      type(c_ptr), value :: gpu_ptr_src
+      type(c_ptr), value :: gpu_ptr_dest
+      integer(c_int64_t), value :: n
+    end subroutine
+
+    subroutine gpu_stream_create(stream) bind(C)
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream) bind(C)
+      import
+      type(c_ptr) :: stream
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream) bind(C)
+      import
+      type(c_ptr) :: handle, stream
+    end subroutine
+
+    subroutine gpu_synchronize()
+    end subroutine
+
+    subroutine gpu_blas_create(handle) bind(C)
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle) bind(C)
+      import
+      type(c_ptr) :: handle
+    end subroutine
+
+    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
+      import
+      type(c_ptr), intent(in)     :: handle
+      integer(c_int64_t), value   :: n, incx, incy
+      real(c_double), intent(in)  :: dx(*), dy(*)
+      real(c_double), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
+      import
+      type(c_ptr), intent(in)     :: handle
+      integer(c_int64_t), value   :: n, incx, incy
+      real(c_float), intent(in)   :: dx(*), dy(*)
+      real(c_float), intent(out)  :: res
+    end subroutine
+
+  end interface
+
+end module
+
+subroutine gpu_allocate_double(ptr, s)
+  use gpu
+  implicit none
+  double precision, pointer, intent(inout) :: ptr
+  integer*8, intent(in) :: s(*)
+  type(c_ptr) :: cptr
+
+  call gpu_allocate_c(cptr, sum(s)*8_8)
+  call c_f_pointer(cptr, ptr, s)
+end subroutine
+
+subroutine gpu_free_double(ptr)
+  use gpu
+  implicit none
+  double precision, pointer, intent(inout) :: ptr
+  type(c_ptr) :: cptr
+  cptr = cloc(ptr)
+  call gpu_free(cptr)
+  NULLIFY(ptr)
+end subroutine
+
+subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: cpu_ptr(*)
+  double precision, intent(out)  :: gpu_ptr(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
+end subroutine
+
+subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: gpu_ptr(*)
+  double precision, intent(out)  :: cpu_ptr(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
+end subroutine
+
+subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
+  use gpu
+  implicit none
+  double precision, intent(in)   :: gpu_ptr_src(*)
+  double precision, intent(out)  :: gpu_ptr_dest(*)
+  integer(c_int64_t), intent(in) :: n
+  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
+end subroutine
+

From 646607ada4a0a58c9cd5e0593c04bce7bc9bd02e Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 26 Jun 2024 11:15:30 +0200
Subject: [PATCH 090/131] 1st commit

---
 plugins/local/tc_int/NEED                 |   5 +
 plugins/local/tc_int/README.rst           |   4 +
 plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ++++++++++++++++++++++
 plugins/local/tc_int/jast_grad_2e.irp.f   | 102 ++++++++++++++++
 plugins/local/tc_int/jast_grad_full.irp.f |  51 ++++++++
 plugins/local/tc_int/jast_utils_bh.irp.f  |  35 ++++++
 plugins/local/tc_int/write_tc_int.irp.f   |  58 ++++++++++
 7 files changed, 389 insertions(+)
 create mode 100644 plugins/local/tc_int/NEED
 create mode 100644 plugins/local/tc_int/README.rst
 create mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f
 create mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f
 create mode 100644 plugins/local/tc_int/jast_grad_full.irp.f
 create mode 100644 plugins/local/tc_int/jast_utils_bh.irp.f
 create mode 100644 plugins/local/tc_int/write_tc_int.irp.f

diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED
new file mode 100644
index 00000000..8a4caf5b
--- /dev/null
+++ b/plugins/local/tc_int/NEED
@@ -0,0 +1,5 @@
+tc_keywords
+jastrow
+qmckl
+becke_numerical_grid
+dft_utils_in_r
diff --git a/plugins/local/tc_int/README.rst b/plugins/local/tc_int/README.rst
new file mode 100644
index 00000000..bc9e8483
--- /dev/null
+++ b/plugins/local/tc_int/README.rst
@@ -0,0 +1,4 @@
+======
+tc_int
+======
+
diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f
new file mode 100644
index 00000000..0cf0d775
--- /dev/null
+++ b/plugins/local/tc_int/int2_grad1_u12.irp.f
@@ -0,0 +1,134 @@
+
+! ---
+
+subroutine provide_int2_grad1_u12_ao()
+
+  implicit none
+  integer                       :: ipoint, i, j, m, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: time0, time1
+  double precision              :: mem, n_double
+  double precision, allocatable :: tmp(:,:,:)
+  double precision, allocatable :: tmp_grad1_u12(:,:,:)
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
+
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+
+  ! int2_grad1_u12_ao(i,j,ipoint,1) =        \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,2) =        \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,3) =        \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2  \chi_i(r2) \chi_j(r2) 
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+
+
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (j, i, jpoint) &
+  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+  !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+  
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+  
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+  enddo
+  
+  deallocate(tmp_grad1_u12)
+
+  
+  if(n_rest .gt. 0) then
+  
+    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
+  
+    ii = n_pass*n_blocks + 1
+
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_rest = 1, n_rest
+      ipoint = ii - 1 + i_rest ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+  
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+
+    deallocate(tmp_grad1_u12)
+  endif
+
+  deallocate(tmp)
+
+
+  ! ---
+
+  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
+  call ezfio_set_work_empty(.False.)
+    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  close(11)
+
+  deallocate(int2_grad1_u12_ao)
+
+  call wall_time(time1)
+  print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
+  call print_memory_usage()
+
+end
+
+! ---
+
+
diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f
new file mode 100644
index 00000000..b18b9d62
--- /dev/null
+++ b/plugins/local/tc_int/jast_grad_2e.irp.f
@@ -0,0 +1,102 @@
+
+! ---
+
+subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz)
+
+  BEGIN_DOC
+  !
+  !  d/dx1 j_2e(1,2)
+  !  d/dy1 j_2e(1,2)
+  !  d/dz1 j_2e(1,2)
+  !
+  END_DOC
+
+  include 'constants.include.F'
+
+  implicit none
+  integer         , intent(in)  :: n_grid2
+  double precision, intent(in)  :: r1(3)
+  double precision, intent(out) :: gradx(n_grid2)
+  double precision, intent(out) :: grady(n_grid2)
+  double precision, intent(out) :: gradz(n_grid2)
+
+  integer                       :: jpoint
+  integer                       :: i_nucl, p, mpA, npA, opA
+  integer                       :: powmax1, powmax, powmax2
+  double precision              :: r2(3)
+  double precision              :: tmp, tmp1, tmp2
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
+
+
+  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+  powmax2 = maxval(jBH_o)
+  powmax  = max(powmax1, powmax2)
+
+  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+  do p = 0, powmax
+    double_p(p) = dble(p)
+  enddo
+
+  f1A_power(-1) = 0.d0
+  f2A_power(-1) = 0.d0
+  g12_power(-1) = 0.d0
+
+  f1A_power(0) = 1.d0
+  f2A_power(0) = 1.d0
+  g12_power(0) = 1.d0
+
+  do jpoint = 1, n_points_extra_final_grid ! r2
+
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+
+    gradx(jpoint) = 0.d0 
+    grady(jpoint) = 0.d0 
+    gradz(jpoint) = 0.d0 
+    do i_nucl = 1, nucl_num 
+
+      rn(1) = nucl_coord(i_nucl,1)
+      rn(2) = nucl_coord(i_nucl,2)
+      rn(3) = nucl_coord(i_nucl,3)
+
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
+      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+
+      ! Compute powers of f1A and f2A
+      do p = 1, powmax1
+        f1A_power(p) = f1A_power(p-1) * f1A
+        f2A_power(p) = f2A_power(p-1) * f2A
+      enddo
+      do p = 1, powmax2
+        g12_power(p) = g12_power(p-1) * g12
+      enddo
+
+      do p = 1, jBH_size
+        mpA = jBH_m(p,i_nucl)
+        npA = jBH_n(p,i_nucl)
+        opA = jBH_o(p,i_nucl)
+        tmp = jBH_c(p,i_nucl)
+        if(mpA .eq. npA) then
+          tmp = tmp * 0.5d0
+        endif
+
+        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
+        tmp1 = tmp1 * g12_power(opA) * tmp
+        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
+
+        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
+      enddo ! p
+    enddo ! i_nucl
+  enddo ! jpoint
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f
new file mode 100644
index 00000000..f63ee3e4
--- /dev/null
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@@ -0,0 +1,51 @@
+
+! ---
+
+subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
+
+  BEGIN_DOC
+  ! 
+  ! resx(ipoint) =      [grad1 u(r1,r2)]_x1
+  ! resy(ipoint) =      [grad1 u(r1,r2)]_y1
+  ! resz(ipoint) =      [grad1 u(r1,r2)]_z1
+  ! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2
+  !
+  ! We use:
+  !       grid for r1
+  ! extra_grid for r2
+  !
+  END_DOC
+
+  implicit none
+  integer,          intent(in)  :: ipoint, n_grid2
+  double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
+
+  integer                       :: jpoint
+  double precision              :: env_r1, tmp
+  double precision              :: grad1_env(3), r1(3)
+  double precision, allocatable :: env_r2(:)
+  double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:)
+  double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:)
+  double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:)
+  double precision, external    :: env_nucl
+
+  r1(1) = final_grid_points(1,ipoint)
+  r1(2) = final_grid_points(2,ipoint)
+  r1(3) = final_grid_points(3,ipoint)
+
+
+  ! j2e_type .eq. "Boys_Handy"
+  ! env_type .eq. "None"
+  ! j1e_type .eq "None"
+
+  call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1))
+
+  do jpoint = 1, n_points_extra_final_grid
+    res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
+  enddo
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f
new file mode 100644
index 00000000..750ce90b
--- /dev/null
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@@ -0,0 +1,35 @@
+
+! ---
+
+subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
+
+  implicit none
+  double precision, intent(in)  :: alpha, r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dist, tmp1, tmp2
+
+  dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+              + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+              + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+
+
+  if(dist .ge. 1d-10) then
+    tmp1 = 1.d0 / (1.d0 + alpha * dist)
+
+    fct = alpha * dist * tmp1
+    tmp2 = alpha * tmp1 * tmp1 / dist
+    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
+    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
+    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+  else
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
+  endif
+
+  return
+end 
+
+! ---
+
diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f
new file mode 100644
index 00000000..ebdce6f2
--- /dev/null
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@@ -0,0 +1,58 @@
+! ---
+
+program write_tc_int
+
+  implicit none
+
+  print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
+
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
+  call write_int(6, my_n_pt_a_grid, 'angular external grid over')
+
+  if(tc_integ_type .eq. "numeric") then
+    my_extra_grid_becke  = .True.
+    PROVIDE tc_grid2_a tc_grid2_r
+    my_n_pt_r_extra_grid = tc_grid2_r
+    my_n_pt_a_extra_grid = tc_grid2_a
+    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
+    call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+    call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
+  endif
+
+  call main()
+
+end
+
+! ---
+
+subroutine main()
+
+  implicit none
+
+  PROVIDE io_tc_integ
+
+  print*, 'io_tc_integ = ', io_tc_integ
+
+  if(io_tc_integ .ne. "Write") then
+    print*, 'io_tc_integ != Write'
+    print*, io_tc_integ
+    stop
+  endif
+
+  call provide_int2_grad1_u12_ao()
+
+  call ezfio_set_tc_keywords_io_tc_integ('Read')
+
+end
+
+! ---
+

From a2f4bc218d207d4c588b3d0d1c1d4c5f7448b334 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 26 Jun 2024 13:44:45 +0200
Subject: [PATCH 091/131] GPU acceleration selection in configure

---
 configure | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index 41c0123d..014275eb 100755
--- a/configure
+++ b/configure
@@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>    
   $(basename $0) -h           
   $(basename $0) -i <package> 
+  $(basename $0) -g [nvidia|none]
 
 Options:
-  -c  <file>    Define a COMPILATION configuration file,
-                in "${QP_ROOT}/config/".
-  -h            Print the HELP message
-  -i <package>  INSTALL <package>. Use at your OWN RISK:
-                no support will be provided for the installation of
-                dependencies.
+  -c  <file>        Define a COMPILATION configuration file,
+                    in "${QP_ROOT}/config/".
+  -h                Print the HELP message
+  -i <package>      INSTALL <package>. Use at your OWN RISK:
+                    no support will be provided for the installation of
+                    dependencies.
+  -g [nvidia|none]  Choose GPU acceleration (experimental)
 
 Example:
   ./$(basename $0) -c config/gfortran.cfg
@@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""
 
 
-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
     case "$c" in
         c)
             case "$OPTARG" in
@@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
                 "") help ; break;;
                 *) PACKAGES="${PACKAGE} $OPTARG"
             esac;;
+        g)
+            GPU=$OPTARG;
+            break;;
         h)
             help
             exit 0;;
@@ -109,6 +114,23 @@ while getopts "d:c:i:h" c ; do
     esac
 done
 
+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu
+case "$GPU" in
+  amd) # Nvidia
+     echo "Activating AMD GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ;;
+  nvidia) # Nvidia
+     echo "Activating Nvidia GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ;;
+  *) # No Acceleration
+     echo "Disabling GPU acceleration"
+     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)
 

From 1d0bac25d081d76177c6efeba88117f397c5de3c Mon Sep 17 00:00:00 2001
From: Abdallah Ammar <abd.ammar.phys@gmail.com>
Date: Wed, 26 Jun 2024 15:31:44 +0200
Subject: [PATCH 092/131] v0

---
 plugins/local/tc_int/compute_tc_int.irp.f | 295 ++++++++++++++++++++++
 plugins/local/tc_int/int2_grad1_u12.irp.f | 134 ----------
 plugins/local/tc_int/jast_grad_2e.irp.f   | 102 --------
 plugins/local/tc_int/jast_grad_full.irp.f | 113 +++++++--
 plugins/local/tc_int/write_tc_int.irp.f   |  18 +-
 5 files changed, 401 insertions(+), 261 deletions(-)
 create mode 100644 plugins/local/tc_int/compute_tc_int.irp.f
 delete mode 100644 plugins/local/tc_int/int2_grad1_u12.irp.f
 delete mode 100644 plugins/local/tc_int/jast_grad_2e.irp.f

diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f
new file mode 100644
index 00000000..02f21570
--- /dev/null
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@@ -0,0 +1,295 @@
+
+! ---
+
+subroutine provide_int2_grad1_u12_ao()
+
+  BEGIN_DOC
+  !
+  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2         [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2         [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) 
+  !
+  !
+  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) 
+  !                       = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator 
+  !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
+  ! where:
+  !
+  ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
+  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2      \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
+  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
+  !
+  ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
+  !
+  ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j )
+  !
+  END_DOC
+
+  implicit none
+
+  integer                       :: i, j, k, l, m, ipoint, jpoint
+  integer                       :: n_blocks, n_rest, n_pass
+  integer                       :: i_blocks, i_rest, i_pass, ii
+  double precision              :: mem, n_double
+  double precision              :: weight1, ao_k_r, ao_i_r
+  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
+  double precision              :: time0, time1, time2, tc1, tc2, tc
+  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
+  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+
+  double precision, external    :: get_ao_two_e_integral
+
+
+  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+  PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp
+
+
+
+  print*, ' start provide_int2_grad1_u12_ao ...'
+  call wall_time(time0)
+
+  call total_memory(mem)
+  mem      = max(1.d0, qp_max_mem - mem)
+  n_double = mem * 1.d8
+  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+  n_rest   = int(mod(n_points_final_grid, n_blocks))
+  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+
+  call write_int(6, n_pass, 'Number of passes')
+  call write_int(6, n_blocks, 'Size of the blocks')
+  call write_int(6, n_rest, 'Size of the last block')
+
+  ! ---
+  ! ---
+  ! ---
+
+  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+
+  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (j, i, jpoint) &
+  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+  !$OMP DO SCHEDULE (static)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      do jpoint = 1, n_points_extra_final_grid
+        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+
+  tc = 0.d0
+  
+  do i_pass = 1, n_pass
+    ii = (i_pass-1)*n_blocks + 1
+  
+    call wall_time(tc1)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_blocks = 1, n_blocks
+      ipoint = ii - 1 + i_blocks ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1 
+
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+  enddo
+  
+  deallocate(tmp_grad1_u12)
+
+  
+  if(n_rest .gt. 0) then
+  
+    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
+  
+    ii = n_pass*n_blocks + 1
+
+    call wall_time(tc1)
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
+    !$OMP DO 
+    do i_rest = 1, n_rest
+      ipoint = ii - 1 + i_rest ! r1
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+    call wall_time(tc2)
+    tc = tc + tc2 - tc1 
+  
+    do m = 1, 4
+      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+    enddo
+
+    deallocate(tmp_grad1_u12)
+  endif
+
+  deallocate(tmp)
+
+
+  call wall_time(time1)
+  print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
+  print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
+  call print_memory_usage()
+
+  ! ---
+  ! ---
+  ! ---
+
+
+  allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num))
+
+  call wall_time(time1)
+
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, k, ipoint) &
+  !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+  !$OMP DO SCHEDULE (static)
+  do i = 1, ao_num
+    do k = 1, ao_num
+      do ipoint = 1, n_points_final_grid
+        c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+  call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
+            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+  
+  allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+  do m = 1, 3
+    !$OMP PARALLEL                                                              &
+    !$OMP DEFAULT (NONE)                                                        &
+    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & 
+    !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
+    !$OMP DO SCHEDULE (static)
+    do i = 1, ao_num
+      do k = 1, ao_num
+        do ipoint = 1, n_points_final_grid
+  
+          weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+          ao_i_r  = aos_in_r_array_transp(ipoint,i)
+          ao_k_r  = aos_in_r_array_transp(ipoint,k)
+  
+          c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
+              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+  enddo
+  deallocate(c_mat)
+
+  call wall_time(time2)
+  print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+
+  call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
+
+  call wall_time(time2)
+  print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)  
+
+  PROVIDE ao_integrals_map
+  !$OMP PARALLEL DEFAULT(NONE)                         &
+  !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) &
+  !$OMP PRIVATE(i, j, k, l)
+  !$OMP DO COLLAPSE(3)
+  do j = 1, ao_num
+    do l = 1, ao_num
+      do i = 1, ao_num
+        do k = 1, ao_num
+          !                                               < 1:i, 2:j | 1:k, 2:l > 
+          tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+        enddo
+      enddo
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  call wall_time(time2)
+  print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
+  call ezfio_set_work_empty(.False.)
+    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  close(11)
+
+  print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
+  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
+  call ezfio_set_work_empty(.False.)
+  do i = 1, ao_num
+    write(11) tc_int_2e_ao(:,:,:,i)
+  enddo
+  close(11)
+
+  ! ----
+
+  deallocate(int2_grad1_u12_ao)
+  deallocate(tc_int_2e_ao)
+
+  call wall_time(time2)
+  print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0
+  call print_memory_usage()
+
+  ! ---
+
+  call wall_time(time1)
+  print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0
+
+  return
+end
+
+! ---
+
diff --git a/plugins/local/tc_int/int2_grad1_u12.irp.f b/plugins/local/tc_int/int2_grad1_u12.irp.f
deleted file mode 100644
index 0cf0d775..00000000
--- a/plugins/local/tc_int/int2_grad1_u12.irp.f
+++ /dev/null
@@ -1,134 +0,0 @@
-
-! ---
-
-subroutine provide_int2_grad1_u12_ao()
-
-  implicit none
-  integer                       :: ipoint, i, j, m, jpoint
-  integer                       :: n_blocks, n_rest, n_pass
-  integer                       :: i_blocks, i_rest, i_pass, ii
-  double precision              :: time0, time1
-  double precision              :: mem, n_double
-  double precision, allocatable :: tmp(:,:,:)
-  double precision, allocatable :: tmp_grad1_u12(:,:,:)
-  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:)
-
-  PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
-
-  print*, ' start provide_int2_grad1_u12_ao ...'
-  call wall_time(time0)
-
-
-  ! int2_grad1_u12_ao(i,j,ipoint,1) =        \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,2) =        \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,3) =        \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,4) = -(1/2) \int dr2 [\grad1 u(r1,r2)]^2  \chi_i(r2) \chi_j(r2) 
-  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
-
-
-
-  call total_memory(mem)
-  mem      = max(1.d0, qp_max_mem - mem)
-  n_double = mem * 1.d8
-  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
-  n_rest   = int(mod(n_points_final_grid, n_blocks))
-  n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
-
-  call write_int(6, n_pass, 'Number of passes')
-  call write_int(6, n_blocks, 'Size of the blocks')
-  call write_int(6, n_rest, 'Size of the last block')
-
-
-  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
-  !$OMP PARALLEL               &
-  !$OMP DEFAULT (NONE)         &
-  !$OMP PRIVATE (j, i, jpoint) &
-  !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
-  !$OMP DO SCHEDULE (static)
-  do j = 1, ao_num
-    do i = 1, ao_num
-      do jpoint = 1, n_points_extra_final_grid
-        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-  
-  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
-  
-  do i_pass = 1, n_pass
-    ii = (i_pass-1)*n_blocks + 1
-  
-    !$OMP PARALLEL                   &
-    !$OMP DEFAULT (NONE)             &
-    !$OMP PRIVATE (i_blocks, ipoint) &
-    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
-    !$OMP DO 
-    do i_blocks = 1, n_blocks
-      ipoint = ii - 1 + i_blocks ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
-    enddo
-  enddo
-  
-  deallocate(tmp_grad1_u12)
-
-  
-  if(n_rest .gt. 0) then
-  
-    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
-  
-    ii = n_pass*n_blocks + 1
-
-    !$OMP PARALLEL                 &
-    !$OMP DEFAULT (NONE)           &
-    !$OMP PRIVATE (i_rest, ipoint) &
-    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
-    !$OMP DO 
-    do i_rest = 1, n_rest
-      ipoint = ii - 1 + i_rest ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
-  
-    do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
-    enddo
-
-    deallocate(tmp_grad1_u12)
-  endif
-
-  deallocate(tmp)
-
-
-  ! ---
-
-  print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
-  open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
-  call ezfio_set_work_empty(.False.)
-    write(11) int2_grad1_u12_ao(:,:,:,1:3)
-  close(11)
-
-  deallocate(int2_grad1_u12_ao)
-
-  call wall_time(time1)
-  print*, ' wall time for provide_int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0
-  call print_memory_usage()
-
-end
-
-! ---
-
-
diff --git a/plugins/local/tc_int/jast_grad_2e.irp.f b/plugins/local/tc_int/jast_grad_2e.irp.f
deleted file mode 100644
index b18b9d62..00000000
--- a/plugins/local/tc_int/jast_grad_2e.irp.f
+++ /dev/null
@@ -1,102 +0,0 @@
-
-! ---
-
-subroutine get_grad1_u12_r1_2e(r1, n_grid2, gradx, grady, gradz)
-
-  BEGIN_DOC
-  !
-  !  d/dx1 j_2e(1,2)
-  !  d/dy1 j_2e(1,2)
-  !  d/dz1 j_2e(1,2)
-  !
-  END_DOC
-
-  include 'constants.include.F'
-
-  implicit none
-  integer         , intent(in)  :: n_grid2
-  double precision, intent(in)  :: r1(3)
-  double precision, intent(out) :: gradx(n_grid2)
-  double precision, intent(out) :: grady(n_grid2)
-  double precision, intent(out) :: gradz(n_grid2)
-
-  integer                       :: jpoint
-  integer                       :: i_nucl, p, mpA, npA, opA
-  integer                       :: powmax1, powmax, powmax2
-  double precision              :: r2(3)
-  double precision              :: tmp, tmp1, tmp2
-  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
-  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
-
-
-  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
-  powmax2 = maxval(jBH_o)
-  powmax  = max(powmax1, powmax2)
-
-  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
-
-  do p = 0, powmax
-    double_p(p) = dble(p)
-  enddo
-
-  f1A_power(-1) = 0.d0
-  f2A_power(-1) = 0.d0
-  g12_power(-1) = 0.d0
-
-  f1A_power(0) = 1.d0
-  f2A_power(0) = 1.d0
-  g12_power(0) = 1.d0
-
-  do jpoint = 1, n_points_extra_final_grid ! r2
-
-    r2(1) = final_grid_points_extra(1,jpoint)
-    r2(2) = final_grid_points_extra(2,jpoint)
-    r2(3) = final_grid_points_extra(3,jpoint)
-
-    gradx(jpoint) = 0.d0 
-    grady(jpoint) = 0.d0 
-    gradz(jpoint) = 0.d0 
-    do i_nucl = 1, nucl_num 
-
-      rn(1) = nucl_coord(i_nucl,1)
-      rn(2) = nucl_coord(i_nucl,2)
-      rn(3) = nucl_coord(i_nucl,3)
-
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
-      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
-
-      ! Compute powers of f1A and f2A
-      do p = 1, powmax1
-        f1A_power(p) = f1A_power(p-1) * f1A
-        f2A_power(p) = f2A_power(p-1) * f2A
-      enddo
-      do p = 1, powmax2
-        g12_power(p) = g12_power(p-1) * g12
-      enddo
-
-      do p = 1, jBH_size
-        mpA = jBH_m(p,i_nucl)
-        npA = jBH_n(p,i_nucl)
-        opA = jBH_o(p,i_nucl)
-        tmp = jBH_c(p,i_nucl)
-        if(mpA .eq. npA) then
-          tmp = tmp * 0.5d0
-        endif
-
-        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
-        tmp1 = tmp1 * g12_power(opA) * tmp
-        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
-
-        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
-        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
-        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
-      enddo ! p
-    enddo ! i_nucl
-  enddo ! jpoint
-
-  return
-end
-
-! ---
-
diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f
index f63ee3e4..78ed1edf 100644
--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@@ -16,31 +16,26 @@ subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
   !
   END_DOC
 
+  include 'constants.include.F'
+
   implicit none
   integer,          intent(in)  :: ipoint, n_grid2
   double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2)
 
-  integer                       :: jpoint
-  double precision              :: env_r1, tmp
-  double precision              :: grad1_env(3), r1(3)
-  double precision, allocatable :: env_r2(:)
-  double precision, allocatable :: u2b_r12(:), gradx1_u2b(:), grady1_u2b(:), gradz1_u2b(:)
-  double precision, allocatable :: u2b_mu(:), gradx1_mu(:), grady1_mu(:), gradz1_mu(:)
-  double precision, allocatable :: u2b_nu(:), gradx1_nu(:), grady1_nu(:), gradz1_nu(:)
-  double precision, external    :: env_nucl
+  integer                       :: jpoint, i_nucl, p, mpA, npA, opA, pp
+  integer                       :: powmax1, powmax, powmax2
+  double precision              :: r1(3), r2(3)
+  double precision              :: tmp, tmp1, tmp2, tmp11, tmp22
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
 
   r1(1) = final_grid_points(1,ipoint)
   r1(2) = final_grid_points(2,ipoint)
   r1(3) = final_grid_points(3,ipoint)
 
+  call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz)
 
-  ! j2e_type .eq. "Boys_Handy"
-  ! env_type .eq. "None"
-  ! j1e_type .eq "None"
-
-  call get_grad1_u12_r1_2e(r1, n_grid2, resx(1), resy(1), resz(1))
-
-  do jpoint = 1, n_points_extra_final_grid
+  do jpoint = 1, n_grid2 ! r2
     res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint))
   enddo
 
@@ -49,3 +44,91 @@ end
 
 ! ---
 
+subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
+
+  include 'constants.include.F'
+
+  implicit none
+  integer         , intent(in)  :: n_grid2
+  double precision, intent(in)  :: r1(3)
+  double precision, intent(out) :: gradx(n_grid2)
+  double precision, intent(out) :: grady(n_grid2)
+  double precision, intent(out) :: gradz(n_grid2)
+
+  integer                       :: jpoint, i_nucl, p, mpA, npA, opA
+  double precision              :: r2(3)
+  double precision              :: dx, dy, dz, r12, tmp
+  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
+  double precision              :: tmp1, tmp2
+  integer                       :: powmax1, powmax, powmax2
+  double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
+
+  powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+  powmax2 = maxval(jBH_o)
+  powmax  = max(powmax1, powmax2)
+
+  allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+  do p = 0, powmax
+    double_p(p) = dble(p)
+  enddo
+
+  f1A_power(-1) = 0.d0
+  f2A_power(-1) = 0.d0
+  g12_power(-1) = 0.d0
+
+  f1A_power(0) = 1.d0
+  f2A_power(0) = 1.d0
+  g12_power(0) = 1.d0
+
+  do jpoint = 1, n_grid2 ! r2
+
+    r2(1) = final_grid_points_extra(1,jpoint)
+    r2(2) = final_grid_points_extra(2,jpoint)
+    r2(3) = final_grid_points_extra(3,jpoint)
+
+    gradx(jpoint) = 0.d0
+    grady(jpoint) = 0.d0
+    gradz(jpoint) = 0.d0
+    do i_nucl = 1, nucl_num
+
+      rn(1) = nucl_coord(i_nucl,1)
+      rn(2) = nucl_coord(i_nucl,2)
+      rn(3) = nucl_coord(i_nucl,3)
+
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
+      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
+      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+
+      ! Compute powers of f1A and f2A
+      do p = 1, powmax1
+        f1A_power(p) = f1A_power(p-1) * f1A
+        f2A_power(p) = f2A_power(p-1) * f2A
+      enddo
+      do p = 1, powmax2
+        g12_power(p) = g12_power(p-1) * g12
+      enddo
+
+      do p = 1, jBH_size
+        mpA = jBH_m(p,i_nucl)
+        npA = jBH_n(p,i_nucl)
+        opA = jBH_o(p,i_nucl)
+        tmp = jBH_c(p,i_nucl)
+        if(mpA .eq. npA) then
+          tmp = tmp * 0.5d0
+        endif
+
+        tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
+        tmp1 = tmp1 * g12_power(opA) * tmp
+        tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp
+
+        gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+        grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+        gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
+      enddo ! p
+    enddo ! i_nucl
+  enddo ! jpoint
+
+  return
+end
+
diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f
index ebdce6f2..9f25a6fd 100644
--- a/plugins/local/tc_int/write_tc_int.irp.f
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@@ -14,19 +14,17 @@ program write_tc_int
   my_n_pt_a_grid = tc_grid1_a
   touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
 
+  my_extra_grid_becke  = .True.
+  PROVIDE tc_grid2_a tc_grid2_r
+  my_n_pt_r_extra_grid = tc_grid2_r
+  my_n_pt_a_extra_grid = tc_grid2_a
+  touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+
   call write_int(6, my_n_pt_r_grid, 'radial  external grid over')
   call write_int(6, my_n_pt_a_grid, 'angular external grid over')
 
-  if(tc_integ_type .eq. "numeric") then
-    my_extra_grid_becke  = .True.
-    PROVIDE tc_grid2_a tc_grid2_r
-    my_n_pt_r_extra_grid = tc_grid2_r
-    my_n_pt_a_extra_grid = tc_grid2_a
-    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
-
-    call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
-    call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
-  endif
+  call write_int(6, my_n_pt_r_extra_grid, 'radial  internal grid over')
+  call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over')
 
   call main()
 

From a9d2f0e188cdc88e1cfe1387de4f1c118bb17a5d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 26 Jun 2024 17:55:56 +0200
Subject: [PATCH 093/131] Working on TC ints. Not well tested

---
 plugins/local/tc_int/jast_grad_full.irp.f | 127 ++++++++++++++++++++--
 plugins/local/tc_int/jast_utils_bh.irp.f  |  22 ++--
 2 files changed, 134 insertions(+), 15 deletions(-)

diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f
index 78ed1edf..599d3779 100644
--- a/plugins/local/tc_int/jast_grad_full.irp.f
+++ b/plugins/local/tc_int/jast_grad_full.irp.f
@@ -4,7 +4,7 @@
 subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res)
 
   BEGIN_DOC
-  ! 
+  !
   ! resx(ipoint) =      [grad1 u(r1,r2)]_x1
   ! resy(ipoint) =      [grad1 u(r1,r2)]_y1
   ! resz(ipoint) =      [grad1 u(r1,r2)]_z1
@@ -59,7 +59,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
   double precision              :: r2(3)
   double precision              :: dx, dy, dz, r12, tmp
   double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
-  double precision              :: tmp1, tmp2
+  double precision              :: tmp1, tmp2, dist
   integer                       :: powmax1, powmax, powmax2
   double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
 
@@ -90,30 +90,105 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
     gradx(jpoint) = 0.d0
     grady(jpoint) = 0.d0
     gradz(jpoint) = 0.d0
+
+    call jBH_elem_fct_grad_alpha1(r1, r2, g12, grad1_g12)
+
+!    dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+!           + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+!           + (r1(3) - r2(3)) * (r1(3) - r2(3))
+!
+!    if(dist .ge. 1d-15) then
+!      dist = dsqrt( dist )
+!
+!      tmp1 = 1.d0 / (1.d0 + dist)
+!
+!      g12 = dist * tmp1
+!      tmp2 = tmp1 * tmp1 / dist
+!      grad1_g12(1) = tmp2 * (r1(1) - r2(1))
+!      grad1_g12(2) = tmp2 * (r1(2) - r2(2))
+!      grad1_g12(3) = tmp2 * (r1(3) - r2(3))
+!
+!    else
+!
+!      grad1_g12(1) = 0.d0
+!      grad1_g12(2) = 0.d0
+!      grad1_g12(3) = 0.d0
+!      g12 = 0.d0
+!
+!    endif
+!
+    do p = 1, powmax2
+      g12_power(p) = g12_power(p-1) * g12
+    enddo
+
     do i_nucl = 1, nucl_num
 
       rn(1) = nucl_coord(i_nucl,1)
       rn(2) = nucl_coord(i_nucl,2)
       rn(3) = nucl_coord(i_nucl,3)
 
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r1, rn, f1A, grad1_f1A)
-      call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
-      call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)
+        call jBH_elem_fct_grad_alpha1(r1, rn, f1A, grad1_f1A)
+!      dist =   (r1(1) - rn(1)) * (r1(1) - rn(1)) &
+!             + (r1(2) - rn(2)) * (r1(2) - rn(2)) &
+!             + (r1(3) - rn(3)) * (r1(3) - rn(3))
+!      if (dist > 1.d-15) then
+!        dist = dsqrt( dist )
+!
+!        tmp1 = 1.d0 / (1.d0 + dist)
+!
+!        f1A = dist * tmp1
+!        tmp2 = tmp1 * tmp1 / dist
+!        grad1_f1A(1) = tmp2 * (r1(1) - rn(1))
+!        grad1_f1A(2) = tmp2 * (r1(2) - rn(2))
+!        grad1_f1A(3) = tmp2 * (r1(3) - rn(3))
+!
+!      else
+!
+!        grad1_f1A(1) = 0.d0
+!        grad1_f1A(2) = 0.d0
+!        grad1_f1A(3) = 0.d0
+!        f1A = 0.d0
+!
+!      endif
+
+        call jBH_elem_fct_grad_alpha1(r2, rn, f2A, grad2_f2A)
+!      dist =   (r2(1) - rn(1)) * (r2(1) - rn(1)) &
+!             + (r2(2) - rn(2)) * (r2(2) - rn(2)) &
+!             + (r2(3) - rn(3)) * (r2(3) - rn(3))
+!
+!      if (dist > 1.d-15) then
+!        dist = dsqrt( dist )
+!
+!        tmp1 = 1.d0 / (1.d0 + dist)
+!
+!        f2A = dist * tmp1
+!        tmp2 = tmp1 * tmp1 / dist
+!        grad2_f2A(1) = tmp2 * (r2(1) - rn(1))
+!        grad2_f2A(2) = tmp2 * (r2(2) - rn(2))
+!        grad2_f2A(3) = tmp2 * (r2(3) - rn(3))
+!
+!      else
+!
+!        grad2_f2A(1) = 0.d0
+!        grad2_f2A(2) = 0.d0
+!        grad2_f2A(3) = 0.d0
+!        f2A = 0.d0
+!
+!      endif
 
       ! Compute powers of f1A and f2A
       do p = 1, powmax1
         f1A_power(p) = f1A_power(p-1) * f1A
         f2A_power(p) = f2A_power(p-1) * f2A
       enddo
-      do p = 1, powmax2
-        g12_power(p) = g12_power(p-1) * g12
-      enddo
 
       do p = 1, jBH_size
         mpA = jBH_m(p,i_nucl)
         npA = jBH_n(p,i_nucl)
         opA = jBH_o(p,i_nucl)
         tmp = jBH_c(p,i_nucl)
+!        if (dabs(tmp) <= 1.d-10) cycle
+!
         if(mpA .eq. npA) then
           tmp = tmp * 0.5d0
         endif
@@ -132,3 +207,39 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
   return
 end
 
+subroutine jBH_elem_fct_grad_alpha1(r1, r2, fct, grad1_fct)
+
+  implicit none
+  double precision, intent(in)  :: r1(3), r2(3)
+  double precision, intent(out) :: fct, grad1_fct(3)
+  double precision              :: dist, tmp1, tmp2
+
+  dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+         + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+         + (r1(3) - r2(3)) * (r1(3) - r2(3))
+
+
+  if(dist .ge. 1d-15) then
+    dist = dsqrt( dist )
+
+    tmp1 = 1.d0 / (1.d0 + dist)
+
+    fct = dist * tmp1
+    tmp2 = tmp1 * tmp1 / dist
+    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
+    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
+    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+
+  else
+
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
+
+  endif
+
+  return
+end
+
+! ---
diff --git a/plugins/local/tc_int/jast_utils_bh.irp.f b/plugins/local/tc_int/jast_utils_bh.irp.f
index 750ce90b..200bc5ff 100644
--- a/plugins/local/tc_int/jast_utils_bh.irp.f
+++ b/plugins/local/tc_int/jast_utils_bh.irp.f
@@ -1,35 +1,43 @@
 
 ! ---
 
+
+
 subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
 
   implicit none
   double precision, intent(in)  :: alpha, r1(3), r2(3)
   double precision, intent(out) :: fct, grad1_fct(3)
-  double precision              :: dist, tmp1, tmp2
+  double precision              :: dist, tmp1, tmp2, dist_inv
 
-  dist = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
-              + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
-              + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+  dist =   (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+         + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+         + (r1(3) - r2(3)) * (r1(3) - r2(3))
 
 
-  if(dist .ge. 1d-10) then
+  if(dist .ge. 1d-15) then
+    dist_inv = 1.d0/dsqrt( dist )
+    dist = dist_inv * dist
+
     tmp1 = 1.d0 / (1.d0 + alpha * dist)
 
     fct = alpha * dist * tmp1
-    tmp2 = alpha * tmp1 * tmp1 / dist
+    tmp2 = alpha * tmp1 * tmp1 * dist_inv
     grad1_fct(1) = tmp2 * (r1(1) - r2(1))
     grad1_fct(2) = tmp2 * (r1(2) - r2(2))
     grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+
   else
+
     grad1_fct(1) = 0.d0
     grad1_fct(2) = 0.d0
     grad1_fct(3) = 0.d0
     fct = 0.d0
+
   endif
 
   return
-end 
+end
 
 ! ---
 

From 5d80cb7b2dd53bdd9eb713e507912e6fce3cadd7 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Jun 2024 12:06:06 +0200
Subject: [PATCH 094/131] Separated gpu and gpu_arch

---
 configure                           |  8 ++--
 src/ccsd/NEED                       |  1 +
 src/ccsd/ccsd_space_orb_sub.irp.f   | 14 +++++--
 src/gpu/NEED                        |  1 +
 src/gpu/README.rst                  |  6 +++
 src/{gpu_x86 => gpu}/gpu.h          |  0
 src/{gpu_x86 => gpu}/gpu_module.F90 | 59 +++++++++++++++--------------
 src/gpu_x86/gpu.c                   |  2 +-
 8 files changed, 54 insertions(+), 37 deletions(-)
 create mode 100644 src/gpu/NEED
 create mode 100644 src/gpu/README.rst
 rename src/{gpu_x86 => gpu}/gpu.h (100%)
 rename src/{gpu_x86 => gpu}/gpu_module.F90 (74%)

diff --git a/configure b/configure
index 014275eb..db158966 100755
--- a/configure
+++ b/configure
@@ -115,19 +115,19 @@ while getopts "d:c:i:g:h" c ; do
 done
 
 # Handle GPU acceleration
-rm -f ${QP_ROOT}/src/gpu
+rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
   amd) # Nvidia
      echo "Activating AMD GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch
      ;;
   nvidia) # Nvidia
      echo "Activating Nvidia GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch
      ;;
   *) # No Acceleration
      echo "Disabling GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch
      ;;
 esac
 
diff --git a/src/ccsd/NEED b/src/ccsd/NEED
index e6e6bc59..8298f28e 100644
--- a/src/ccsd/NEED
+++ b/src/ccsd/NEED
@@ -1,2 +1,3 @@
+gpu
 hartree_fock
 utils_cc
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 9d4ae7f9..84aab08a 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -1,4 +1,5 @@
 subroutine run_ccsd_space_orb
+  use gpu
 
   implicit none
 
@@ -11,7 +12,7 @@ subroutine run_ccsd_space_orb
 
   double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
   double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
+  double precision, pointer     :: H_oo, H_vv, H_vo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -55,7 +56,10 @@ subroutine run_ccsd_space_orb
   allocate(tau(nO,nO,nV,nV))
   allocate(tau_x(nO,nO,nV,nV))
   allocate(t1(nO,nV), r1(nO,nV))
-  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
+
+  call gpu_allocate_double(H_oo, (/ nO, nO /) )
+  call gpu_allocate_double(H_vv, (/ nV, nV /) )
+  call gpu_allocate_double(H_vo, (/ nV, nO /) )
 
   if (cc_update_method == 'diis') then
     double precision :: rss, diis_mem, extra_mem
@@ -191,7 +195,11 @@ subroutine run_ccsd_space_orb
     deallocate(all_err,all_t)
   endif
 
-  deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
+  call gpu_deallocate_double(H_oo)
+  call gpu_deallocate_double(H_vv)
+  call gpu_deallocate_double(H_vo)
+
+  deallocate(r1,r2,tau)
 
   ! CCSD(T)
   double precision :: e_t, e_t_err
diff --git a/src/gpu/NEED b/src/gpu/NEED
new file mode 100644
index 00000000..c2af78d2
--- /dev/null
+++ b/src/gpu/NEED
@@ -0,0 +1 @@
+gpu_arch
diff --git a/src/gpu/README.rst b/src/gpu/README.rst
new file mode 100644
index 00000000..17ee28a0
--- /dev/null
+++ b/src/gpu/README.rst
@@ -0,0 +1,6 @@
+===
+gpu
+===
+
+Bindings for GPU routines (architecture independent).
+Architecture-dependent files are in gpu_arch.
diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h
similarity index 100%
rename from src/gpu_x86/gpu.h
rename to src/gpu/gpu.h
diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu/gpu_module.F90
similarity index 74%
rename from src/gpu_x86/gpu_module.F90
rename to src/gpu/gpu_module.F90
index 86ba3926..f35ebc97 100644
--- a/src/gpu_x86/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -1,5 +1,5 @@
 module gpu
-  use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
+  use, intrinsic :: iso_c_binding
   implicit none
 
   interface
@@ -17,7 +17,7 @@ module gpu
       integer(c_int64_t), value :: n
     end subroutine
 
-    subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
+    subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
       import
       type(c_ptr) :: ptr
     end subroutine
@@ -89,53 +89,54 @@ module gpu
 
   end interface
 
+  contains
+
+
+    subroutine gpu_allocate_double(ptr, s)
+      implicit none
+      double precision, pointer, intent(inout) :: ptr
+      integer, intent(in) :: s(:)
+      type(c_ptr) :: cptr
+
+      call gpu_allocate_c(cptr, sum(s*1_8)*8_8)
+      call c_f_pointer(cptr, ptr, s)
+    end subroutine
+
+    subroutine gpu_deallocate_double(ptr)
+      implicit none
+      double precision, pointer, intent(inout) :: ptr
+      type(c_ptr) :: cptr
+      cptr = c_loc(ptr)
+      call gpu_deallocate(cptr)
+      NULLIFY(ptr)
+    end subroutine
+
 end module
 
-subroutine gpu_allocate_double(ptr, s)
-  use gpu
-  implicit none
-  double precision, pointer, intent(inout) :: ptr
-  integer*8, intent(in) :: s(*)
-  type(c_ptr) :: cptr
-
-  call gpu_allocate_c(cptr, sum(s)*8_8)
-  call c_f_pointer(cptr, ptr, s)
-end subroutine
-
-subroutine gpu_free_double(ptr)
-  use gpu
-  implicit none
-  double precision, pointer, intent(inout) :: ptr
-  type(c_ptr) :: cptr
-  cptr = cloc(ptr)
-  call gpu_free(cptr)
-  NULLIFY(ptr)
-end subroutine
-
 subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
   use gpu
   implicit none
   double precision, intent(in)   :: cpu_ptr(*)
-  double precision, intent(out)  :: gpu_ptr(*)
+  double precision, intent(in)   :: gpu_ptr(*)
   integer(c_int64_t), intent(in) :: n
-  call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
+  call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
 end subroutine
 
 subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
   use gpu
   implicit none
   double precision, intent(in)   :: gpu_ptr(*)
-  double precision, intent(out)  :: cpu_ptr(*)
+  double precision, intent(in)   :: cpu_ptr(*)
   integer(c_int64_t), intent(in) :: n
-  call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
+  call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
 end subroutine
 
 subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
   use gpu
   implicit none
   double precision, intent(in)   :: gpu_ptr_src(*)
-  double precision, intent(out)  :: gpu_ptr_dest(*)
+  double precision, intent(in)   :: gpu_ptr_dest(*)
   integer(c_int64_t), intent(in) :: n
-  call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
+  call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
 end subroutine
 
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 71505dbe..41ede396 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) {
   }
 }
 
-void gpu_free(void** ptr) {
+void gpu_deallocate(void** ptr) {
   free(*ptr);
   *ptr = NULL;
 }

From 6c02ac0f0b05ea3cc16e0fde66e23c9a0de14246 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Jun 2024 12:07:48 +0200
Subject: [PATCH 095/131] Separated gpu and gpu_arch

---
 src/gpu/gpu_module.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index f35ebc97..43754454 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -107,7 +107,7 @@ module gpu
       double precision, pointer, intent(inout) :: ptr
       type(c_ptr) :: cptr
       cptr = c_loc(ptr)
-      call gpu_deallocate(cptr)
+      call gpu_deallocate_c(cptr)
       NULLIFY(ptr)
     end subroutine
 

From fa6d1419496d271a4715efc776790ce7fc152064 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 27 Jun 2024 15:45:52 +0200
Subject: [PATCH 096/131] Introducing GPU in CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f      | 224 +++++++-----
 src/ccsd/ccsd_space_orb_sub_chol.irp.f |  12 +-
 src/gpu/gpu_module.F90                 | 450 ++++++++++++++++++++++---
 src/gpu_x86/gpu.c                      |  48 +--
 4 files changed, 570 insertions(+), 164 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 84aab08a..455d62f7 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -10,9 +10,9 @@ subroutine run_ccsd_space_orb
   double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
   logical :: not_converged
 
-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
-  double precision, allocatable :: t1(:,:), r1(:,:)
-  double precision, pointer     :: H_oo, H_vv, H_vo
+  type(gpu_double4) :: t2, r2, tau, tau_x
+  type(gpu_double2) :: t1, r1
+  type(gpu_double2) :: H_oo, H_vv, H_vo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -52,14 +52,15 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir
 
-  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
-  allocate(tau(nO,nO,nV,nV))
-  allocate(tau_x(nO,nO,nV,nV))
-  allocate(t1(nO,nV), r1(nO,nV))
-
-  call gpu_allocate_double(H_oo, (/ nO, nO /) )
-  call gpu_allocate_double(H_vv, (/ nV, nV /) )
-  call gpu_allocate_double(H_vo, (/ nV, nO /) )
+  call gpu_allocate(t2, nO,nO,nV,nV)
+  call gpu_allocate(r2, nO,nO,nV,nV)
+  call gpu_allocate(tau, nO,nO,nV,nV)
+  call gpu_allocate(tau_x, nO,nO,nV,nV)
+  call gpu_allocate(t1, nO,nV)
+  call gpu_allocate(r1, nO,nV)
+  call gpu_allocate(H_oo, nO, nO)
+  call gpu_allocate(H_vo, nV, nO)
+  call gpu_allocate(H_vv, nV, nV)
 
   if (cc_update_method == 'diis') then
     double precision :: rss, diis_mem, extra_mem
@@ -101,14 +102,21 @@ subroutine run_ccsd_space_orb
   endif
 
   ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
-  call update_tau_space(nO,nV,t1,t2,tau)
+  double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
+  allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
+
+  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
+  call gpu_upload(h_t1, t1)
+
+  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
+  call gpu_upload(h_t2, t2)
+
+  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
   call update_tau_x_space(nO,nV,tau,tau_x)
   !print*,'hf_energy', hf_energy
   call det_energy(det,uncorr_energy)
   print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+  call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
   print*,'Guess energy', uncorr_energy+energy, energy
 
   nb_iter = 0
@@ -127,40 +135,38 @@ subroutine run_ccsd_space_orb
     if (do_ao_cholesky) then
 !    if (.False.) then
       call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
-      call compute_H_vo_chol(nO,nV,t1,H_vo)
+      call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
+      call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
 
-      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
+      call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
     else
-      call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
-      call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
-      call compute_H_vo(nO,nV,t1,t2,H_vo)
+      call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
+      call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
+      call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
 
-      call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-      call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
+      call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
+      call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
     endif
     max_r = max(max_r1,max_r2)
 
     ! Update
     if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
+      call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
 
     ! Standard update as T = T - Delta
     elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
-      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
+      call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
+      call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
     else
       print*,'Unkown cc_method_method: '//cc_update_method
     endif
 
-    call update_tau_space(nO,nV,t1,t2,tau)
+    call update_tau_space(nO,nV,t1%f,t1,t2,tau)
     call update_tau_x_space(nO,nV,tau,tau_x)
 
     ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
     write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
 
     nb_iter = nb_iter + 1
@@ -185,8 +191,8 @@ subroutine run_ccsd_space_orb
   print*,''
 
   if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
-    call write_t2(nO,nV,t2)
+    call write_t1(nO,nV,t1%f)
+    call write_t2(nO,nV,t2%f)
     call ezfio_set_utils_cc_io_amplitudes('Read')
   endif
 
@@ -195,11 +201,14 @@ subroutine run_ccsd_space_orb
     deallocate(all_err,all_t)
   endif
 
-  call gpu_deallocate_double(H_oo)
-  call gpu_deallocate_double(H_vv)
-  call gpu_deallocate_double(H_vo)
+  call gpu_deallocate(H_oo)
+  call gpu_deallocate(H_vv)
+  call gpu_deallocate(H_vo)
 
-  deallocate(r1,r2,tau)
+  call gpu_deallocate(r1)
+  call gpu_deallocate(r2)
+  call gpu_deallocate(tau)
+  call gpu_deallocate(tau_x)
 
   ! CCSD(T)
   double precision :: e_t, e_t_err
@@ -207,28 +216,14 @@ subroutine run_ccsd_space_orb
 
   if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
 
-    ! Dumb way
-    !call wall_time(ta)
-    !call ccsd_par_t_space(nO,nV,t1,t2,e_t)
-    !call wall_time(tb)
-    !print*,'Time: ',tb-ta, ' s'
-
-    !print*,''
-    !write(*,'(A15,F18.12,A3)') ' E(CCSD(T))  = ', uncorr_energy + energy + e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' E(T)        = ', e_t, ' Ha'
-    !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
-    !print*,''
-
     ! New
     e_t = uncorr_energy + energy ! For print in (T) call
     e_t_err = 0.d0
 
     print*,'Computing (T) correction...'
     call wall_time(ta)
-!    call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
-!         ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
 
-    call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
+    call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
          ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
 
     call wall_time(tb)
@@ -243,7 +238,9 @@ subroutine run_ccsd_space_orb
 
   call save_energy(uncorr_energy + energy, e_t)
 
-  deallocate(t1,t2)
+  deallocate(h_t1, h_t2)
+  call gpu_deallocate(t1)
+  call gpu_deallocate(t2)
 
 end
 
@@ -341,70 +338,139 @@ end
 
 ! Tau
 
-subroutine update_tau_space(nO,nV,t1,t2,tau)
-
+subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
+  use gpu
   implicit none
 
   ! in
   integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  double precision, intent(in)  :: h_t1(nO,nV)
+  type(gpu_double2), intent(in) :: t1
+  type(gpu_double4), intent(in) :: t2
 
   ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  type(gpu_double4) :: tau
 
   ! internal
   integer                       :: i,j,a,b
 
+!  !$OMP PARALLEL &
+!  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
+!  !$OMP PRIVATE(i,j,a,b) &
+!  !$OMP DEFAULT(NONE)
+!  !$OMP DO
+!  do b = 1, nV
+!    do a = 1, nV
+!      do j = 1, nO
+!        do i = 1, nO
+!          tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END DO
+!  !$OMP END PARALLEL
+
+
+  type(gpu_blas) :: blas
+  type(gpu_stream) :: stream(nV)
+
+  call gpu_blas_create(blas)
+  do b=1,nV
+    call gpu_stream_create(stream(b))
+  enddo
+
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do j=1,nO
+    !$OMP DO
+    do b=1,nV
+      call gpu_set_stream(blas,stream(b))
+      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
+         1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
+         h_t1(j,b), t1%c, nO*1_8, &
+         c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
     enddo
+    !$OMP END DO
   enddo
-  !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+  call gpu_blas_destroy(blas)
+
 end
 
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
+  use gpu
   implicit none
 
   ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
+  integer, intent(in)         :: nO, nV
+  type(gpu_double4), intent(in)  :: tau
 
   ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
+  type(gpu_double4) :: tau_x
 
   ! internal
   integer                       :: i,j,a,b
 
+!  !$OMP PARALLEL &
+!  !$OMP SHARED(nO,nV,tau,tau_x) &
+!  !$OMP PRIVATE(i,j,a,b) &
+!  !$OMP DEFAULT(NONE)
+!  !$OMP DO
+!  do b = 1, nV
+!    do a = 1, nV
+!      do j = 1, nO
+!        do i = 1, nO
+!          tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$OMP END DO
+!  !$OMP END PARALLEL
+
+  type(gpu_blas) :: blas
+  type(gpu_stream) :: stream(nV)
+
+  call gpu_blas_create(blas)
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
+
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
-        enddo
-      enddo
+  do b=1,nV
+    do a=1,nV
+      call gpu_set_stream(blas,stream(a))
+      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
+          2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
+         -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
+         c_loc(tau_x%f(1,1,a,b)), nO*1_8)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_synchronize()
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+  call gpu_blas_destroy(blas)
+
 end
 
 ! R1
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index b59dc0bb..9b161001 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -294,12 +294,12 @@ end
 ! H_oo
 
 subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
-
+  use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_oo(nO, nO)
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_oo
 
   integer :: a,b,i,j,u,k
 
@@ -315,7 +315,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
     do b=1,nV
       do j=1,nO
         do a=1,nV
-          tmp_vov(a,j,b) = tau_x(u,j,a,b)
+          tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
         enddo
       enddo
     enddo
@@ -328,7 +328,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
   !$omp do
   do i = 1, nO
     do u = 1, nO
-      H_oo(u,i) = cc_space_f_oo(u,i)
+      H_oo%f(u,i) = cc_space_f_oo(u,i)
     enddo
   enddo
   !$omp end do nowait
@@ -336,7 +336,7 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
   !$omp end  parallel
   call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
     tau_kau, cholesky_mo_num*nV,  cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
+    1.d0, H_oo%f, nO)
 
 end
 
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 43754454..51f80ac0 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -2,6 +2,52 @@ module gpu
   use, intrinsic :: iso_c_binding
   implicit none
 
+! Data types
+! ----------
+
+  type gpu_double1
+    type(c_ptr) :: c
+    double precision, pointer :: f(:)
+  end type
+
+  type gpu_double2
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:)
+  end type
+
+  type gpu_double3
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:)
+  end type
+
+  type gpu_double4
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:)
+  end type
+
+  type gpu_double5
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:)
+  end type
+
+  type gpu_double6
+    type(c_ptr) :: c
+    double precision, pointer :: f(:,:,:,:,:,:)
+  end type
+
+
+  type gpu_blas
+    type(c_ptr) :: c
+  end type
+
+  type gpu_stream
+    type(c_ptr) :: c
+  end type
+
+
+! C interfaces
+! ------------
+
   interface
     integer function gpu_ndevices() bind(C)
     end function
@@ -43,100 +89,394 @@ module gpu
       integer(c_int64_t), value :: n
     end subroutine
 
-    subroutine gpu_stream_create(stream) bind(C)
+    subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
       import
       type(c_ptr) :: stream
     end subroutine
 
-    subroutine gpu_stream_destroy(stream) bind(C)
+    subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
       import
       type(c_ptr) :: stream
     end subroutine
 
-    subroutine gpu_set_stream(handle, stream) bind(C)
+    subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
       import
       type(c_ptr) :: handle, stream
     end subroutine
 
-    subroutine gpu_synchronize()
+    subroutine gpu_synchronize() bind(C)
+      import
     end subroutine
 
-    subroutine gpu_blas_create(handle) bind(C)
+    subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
       import
       type(c_ptr) :: handle
     end subroutine
 
-    subroutine gpu_blas_destroy(handle) bind(C)
+    subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
       import
       type(c_ptr) :: handle
     end subroutine
 
-    subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
+    subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in)     :: handle
-      integer(c_int64_t), value   :: n, incx, incy
-      real(c_double), intent(in)  :: dx(*), dy(*)
-      real(c_double), intent(out) :: res
+      type(c_ptr), intent(in), value :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
+      real(c_double), intent(out)    :: res
     end subroutine
 
-    subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
+    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in)     :: handle
-      integer(c_int64_t), value   :: n, incx, incy
-      real(c_float), intent(in)   :: dx(*), dy(*)
+      type(c_ptr), intent(in), value :: handle
+      integer(c_int64_t), value      :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
     end subroutine
 
+    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_dgeam')
+      import
+      type(c_ptr), intent(in), value :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_double), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
   end interface
 
+
+! Polymorphic interfaces
+! ----------------------
+
+  interface gpu_allocate
+    procedure gpu_allocate_double1  &
+             ,gpu_allocate_double2  &
+             ,gpu_allocate_double3  &
+             ,gpu_allocate_double4  &
+             ,gpu_allocate_double5  &
+             ,gpu_allocate_double6
+  end interface gpu_allocate
+
+  interface gpu_deallocate
+    procedure gpu_deallocate_double1  &
+             ,gpu_deallocate_double2  &
+             ,gpu_deallocate_double3  &
+             ,gpu_deallocate_double4  &
+             ,gpu_deallocate_double5  &
+             ,gpu_deallocate_double6
+  end interface gpu_deallocate
+
+  interface gpu_upload
+    procedure gpu_upload_double1  &
+             ,gpu_upload_double2  &
+             ,gpu_upload_double3  &
+             ,gpu_upload_double4  &
+             ,gpu_upload_double5  &
+             ,gpu_upload_double6
+  end interface gpu_upload
+
+  interface gpu_download
+    procedure gpu_download_double1  &
+             ,gpu_download_double2  &
+             ,gpu_download_double3  &
+             ,gpu_download_double4  &
+             ,gpu_download_double5  &
+             ,gpu_download_double6
+  end interface gpu_download
+
+  interface gpu_copy
+    procedure gpu_copy_double1  &
+             ,gpu_copy_double2  &
+             ,gpu_copy_double3  &
+             ,gpu_copy_double4  &
+             ,gpu_copy_double5  &
+             ,gpu_copy_double6
+  end interface gpu_copy
+
+
   contains
 
 
-    subroutine gpu_allocate_double(ptr, s)
-      implicit none
-      double precision, pointer, intent(inout) :: ptr
-      integer, intent(in) :: s(:)
-      type(c_ptr) :: cptr
+! gpu_allocate
+! ------------
 
-      call gpu_allocate_c(cptr, sum(s*1_8)*8_8)
-      call c_f_pointer(cptr, ptr, s)
+    subroutine gpu_allocate_double1(ptr, s)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
     end subroutine
 
-    subroutine gpu_deallocate_double(ptr)
+    subroutine gpu_allocate_double2(ptr, s1, s2)
       implicit none
-      double precision, pointer, intent(inout) :: ptr
-      type(c_ptr) :: cptr
-      cptr = c_loc(ptr)
-      call gpu_deallocate_c(cptr)
-      NULLIFY(ptr)
+      type(gpu_double2), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, s1*s2*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
+! gpu_deallocate
+! --------------
+
+    subroutine gpu_deallocate_double1(ptr)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double2(ptr)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double3(ptr)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double4(ptr)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double5(ptr)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+    subroutine gpu_deallocate_double6(ptr)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      call gpu_deallocate_c(ptr%c)
+      NULLIFY(ptr%f)
+    end subroutine
+
+
+! gpu_upload
+! ----------
+
+    subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:)
+      type(gpu_double1), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
+    end subroutine
+
+    subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:)
+      type(gpu_double2), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:)
+      type(gpu_double3), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:,:)
+      type(gpu_double4), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:)
+      type(gpu_double5), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+    subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
+      implicit none
+      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
+      type(gpu_double6), intent(in)    :: gpu_ptr
+      call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
+    end subroutine
+
+
+! gpu_download
+! ------------
+
+    subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double1), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
+    end subroutine
+
+    subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double2), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double3), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double4), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double5), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+    subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
+      implicit none
+      type(gpu_double6), intent(in)  :: gpu_ptr
+      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
+      call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
+    end subroutine
+
+! gpu_copy
+! --------
+
+    subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double1), intent(in)        :: gpu_ptr_src
+      type(gpu_double1), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
+    end subroutine
+
+    subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double2), intent(in)        :: gpu_ptr_src
+      type(gpu_double2), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double3), intent(in)        :: gpu_ptr_src
+      type(gpu_double3), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double4), intent(in)        :: gpu_ptr_src
+      type(gpu_double4), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double5), intent(in)        :: gpu_ptr_src
+      type(gpu_double5), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+    subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
+      implicit none
+      type(gpu_double6), intent(in)        :: gpu_ptr_src
+      type(gpu_double6), intent(in)        :: gpu_ptr_dest
+      call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
+    end subroutine
+
+
+! gpu_stream
+! ----------
+
+    subroutine gpu_stream_create(stream)
+      import
+      type(gpu_stream) :: stream
+      call gpu_stream_create_c(stream%c)
+    end subroutine
+
+    subroutine gpu_stream_destroy(stream)
+      import
+      type(gpu_stream) :: stream
+      call gpu_stream_destroy_c(stream%c)
+    end subroutine
+
+    subroutine gpu_set_stream(handle, stream)
+      import
+      type(gpu_blas)   :: handle
+      type(gpu_stream) :: stream
+      call gpu_set_stream_c(handle%c, stream%c)
+    end subroutine
+
+
+! gpu_blas
+! --------
+
+    subroutine gpu_blas_create(handle)
+      import
+      type(gpu_blas) :: handle
+      call gpu_blas_create_c(handle%c)
+    end subroutine
+
+    subroutine gpu_blas_destroy(handle)
+      import
+      type(gpu_blas) :: handle
+      call gpu_blas_destroy_c(handle%c)
     end subroutine
 
 end module
 
-subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: cpu_ptr(*)
-  double precision, intent(in)   :: gpu_ptr(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
-end subroutine
-
-subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: gpu_ptr(*)
-  double precision, intent(in)   :: cpu_ptr(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
-end subroutine
-
-subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
-  use gpu
-  implicit none
-  double precision, intent(in)   :: gpu_ptr_src(*)
-  double precision, intent(in)   :: gpu_ptr_dest(*)
-  integer(c_int64_t), intent(in) :: n
-  call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
-end subroutine
-
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 41ede396..5f42cb0d 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -251,7 +251,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -259,7 +259,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -267,7 +267,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
          }
        }
@@ -282,7 +282,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -290,7 +290,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -298,7 +298,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
          }
        }
@@ -313,7 +313,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -321,7 +321,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -329,7 +329,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
          }
        }
@@ -344,7 +344,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -352,7 +352,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -360,7 +360,7 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
          }
        }
@@ -386,7 +386,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -394,7 +394,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -402,7 +402,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
          }
        }
@@ -417,7 +417,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -425,7 +425,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i];
          }
        }
@@ -433,7 +433,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
          }
        }
@@ -448,7 +448,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[j*ldb+i];
          }
        }
@@ -456,7 +456,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -464,7 +464,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
          }
        }
@@ -479,7 +479,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      if (alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = beta * b[i*ldb+j];
          }
        }
@@ -487,7 +487,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else if (beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j];
          }
        }
@@ -495,7 +495,7 @@ void gpu_sgeam(const void* handle, const char transa, const char transb, const i
      } else {
 
        for (int64_t j=0 ; j<n ; ++j) {
-         for (int64_t i=0 ; i<n ; ++i) {
+         for (int64_t i=0 ; i<m ; ++i) {
            c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
          }
        }

From caec5183841840a7933e7fa4cf46bcb0ff3f14d9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 11:00:58 +0200
Subject: [PATCH 097/131] Added Nvidia module

---
 src/ccsd/ccsd_space_orb_sub.irp.f |  10 +-
 src/gpu/gpu_module.F90            |   6 +-
 src/gpu_nvidia/LIB                |   1 +
 src/gpu_nvidia/NEED               |   1 +
 src/gpu_nvidia/README.rst         |   5 +
 src/gpu_nvidia/gpu.c              | 327 ++++++++++++++++++++++++++++++
 src/gpu_x86/gpu.c                 |  40 ++--
 7 files changed, 359 insertions(+), 31 deletions(-)
 create mode 100644 src/gpu_nvidia/LIB
 create mode 100644 src/gpu_nvidia/NEED
 create mode 100644 src/gpu_nvidia/README.rst
 create mode 100644 src/gpu_nvidia/gpu.c

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 455d62f7..e7c9b1ab 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -384,17 +384,17 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  do j=1,nO
-    !$OMP DO
-    do b=1,nV
-      call gpu_set_stream(blas,stream(b))
+  !$OMP DO
+  do b=1,nV
+    call gpu_set_stream(blas,stream(b))
+    do j=1,nO
       call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
          1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
          h_t1(j,b), t1%c, nO*1_8, &
          c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
     enddo
-    !$OMP END DO
   enddo
+  !$OMP END DO
   !$OMP END PARALLEL
 
   call gpu_synchronize()
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 51f80ac0..d1ddad4c 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -120,7 +120,7 @@ module gpu
 
     subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_double), intent(out)    :: res
@@ -128,7 +128,7 @@ module gpu
 
     subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
@@ -137,7 +137,7 @@ module gpu
     subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_dgeam')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       character(c_char), intent(in), value :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_double), intent(in), value :: alpha, beta
diff --git a/src/gpu_nvidia/LIB b/src/gpu_nvidia/LIB
new file mode 100644
index 00000000..91f54e91
--- /dev/null
+++ b/src/gpu_nvidia/LIB
@@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
diff --git a/src/gpu_nvidia/NEED b/src/gpu_nvidia/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/gpu_nvidia/NEED
@@ -0,0 +1 @@
+
diff --git a/src/gpu_nvidia/README.rst b/src/gpu_nvidia/README.rst
new file mode 100644
index 00000000..5dcfca92
--- /dev/null
+++ b/src/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
diff --git a/src/gpu_nvidia/gpu.c b/src/gpu_nvidia/gpu.c
new file mode 100644
index 00000000..f0bd247a
--- /dev/null
+++ b/src/gpu_nvidia/gpu.c
@@ -0,0 +1,327 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice(igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+    size_t free, total;
+    cudaError_t rc = cudaMemGetInfo( &free, &total );
+    if (rc != cudaSuccess) {
+      free = INT64_MAX;
+    }
+
+    /* Use managed memory if it does not fit on the GPU */
+    if (size < free && size < total/2) {
+//      rc= cudaMalloc(ptr, size);
+      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+    } else {
+      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+    }
+    assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  cudaStream_t stream;
+  cudaError_t rc = cudaStreamCreate(&stream);
+  assert (rc == cudaSuccess);
+  *ptr = (void*) stream;
+}
+
+void gpu_stream_destroy(void** ptr) {
+  assert (*ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void** handle, void** stream) {
+  cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  cublasHandle_t cublas_handle;
+  cublasStatus_t rc = cublasCreate(&cublas_handle);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *handle = (void*) cublas_handle;
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  assert (*handle != NULL);
+  cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *handle = NULL;
+}
+
+
+void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+}
+
+
+
+void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+}
+
+
+
+void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+}
+
+
+
+void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+}
+
+
+void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+}
+
+
+
+void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+}
+
+
+void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+
+}
+
+
+void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+
+}
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 5f42cb0d..ac7c3620 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -56,7 +56,7 @@ void gpu_stream_destroy(void** ptr) {
   *ptr = NULL;
 }
 
-void gpu_set_stream(void* handle, void* stream) {
+void gpu_set_stream(void** handle, void** stream) {
   return;
 }
 
@@ -79,8 +79,8 @@ void gpu_blas_destroy(void** handle) {
 
 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
 
-void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (handle != NULL);
+void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -100,8 +100,8 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
 
 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
 
-void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (handle != NULL);
+void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -122,10 +122,10 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -150,10 +150,10 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -178,10 +178,10 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -209,10 +209,10 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -236,12 +236,9 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i
 }
 
 
-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+  assert (*handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
@@ -371,12 +368,9 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
 }
 
 
-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+  assert (*handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||

From d3d89022bc8092ab0c6131904f85475f160dfa53 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 16:50:52 +0200
Subject: [PATCH 098/131] Move GPU modules to plugins

---
 configure                                    | 6 +++---
 {src => plugins/local}/gpu_nvidia/LIB        | 0
 {src => plugins/local}/gpu_nvidia/NEED       | 0
 {src => plugins/local}/gpu_nvidia/README.rst | 0
 {src => plugins/local}/gpu_nvidia/gpu.c      | 0
 {src => plugins/local}/gpu_x86/NEED          | 0
 {src => plugins/local}/gpu_x86/README.rst    | 0
 {src => plugins/local}/gpu_x86/gpu.c         | 0
 8 files changed, 3 insertions(+), 3 deletions(-)
 rename {src => plugins/local}/gpu_nvidia/LIB (100%)
 rename {src => plugins/local}/gpu_nvidia/NEED (100%)
 rename {src => plugins/local}/gpu_nvidia/README.rst (100%)
 rename {src => plugins/local}/gpu_nvidia/gpu.c (100%)
 rename {src => plugins/local}/gpu_x86/NEED (100%)
 rename {src => plugins/local}/gpu_x86/README.rst (100%)
 rename {src => plugins/local}/gpu_x86/gpu.c (100%)

diff --git a/configure b/configure
index db158966..08dac444 100755
--- a/configure
+++ b/configure
@@ -119,15 +119,15 @@ rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
   amd) # Nvidia
      echo "Activating AMD GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu_arch
+     ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
      ;;
   nvidia) # Nvidia
      echo "Activating Nvidia GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+     ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
      ;;
   *) # No Acceleration
      echo "Disabling GPU acceleration"
-     ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu_arch
+     ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
      ;;
 esac
 
diff --git a/src/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB
similarity index 100%
rename from src/gpu_nvidia/LIB
rename to plugins/local/gpu_nvidia/LIB
diff --git a/src/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED
similarity index 100%
rename from src/gpu_nvidia/NEED
rename to plugins/local/gpu_nvidia/NEED
diff --git a/src/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst
similarity index 100%
rename from src/gpu_nvidia/README.rst
rename to plugins/local/gpu_nvidia/README.rst
diff --git a/src/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
similarity index 100%
rename from src/gpu_nvidia/gpu.c
rename to plugins/local/gpu_nvidia/gpu.c
diff --git a/src/gpu_x86/NEED b/plugins/local/gpu_x86/NEED
similarity index 100%
rename from src/gpu_x86/NEED
rename to plugins/local/gpu_x86/NEED
diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst
similarity index 100%
rename from src/gpu_x86/README.rst
rename to plugins/local/gpu_x86/README.rst
diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
similarity index 100%
rename from src/gpu_x86/gpu.c
rename to plugins/local/gpu_x86/gpu.c

From 85b1035cfba778559e629045961cb542631841bd Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 17:33:08 +0200
Subject: [PATCH 099/131] Working on CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f | 117 +++++++++++-------------------
 src/gpu/gpu_module.F90            |  62 ++++++++++++++++
 2 files changed, 103 insertions(+), 76 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index e7c9b1ab..1329f172 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -14,6 +14,9 @@ subroutine run_ccsd_space_orb
   type(gpu_double2) :: t1, r1
   type(gpu_double2) :: H_oo, H_vv, H_vo
 
+  type(gpu_double2) :: d_cc_space_f_vo
+  type(gpu_double4) :: d_cc_space_v_oovv
+
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
   integer(bit_kind)             :: det(N_int,2)
@@ -52,6 +55,12 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir
 
+  call gpu_allocate(d_cc_space_f_vo, nV, nO)
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+
+
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
   call gpu_allocate(tau, nO,nO,nV,nV)
@@ -116,7 +125,8 @@ subroutine run_ccsd_space_orb
   !print*,'hf_energy', hf_energy
   call det_energy(det,uncorr_energy)
   print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
+
+  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
   print*,'Guess energy', uncorr_energy+energy, energy
 
   nb_iter = 0
@@ -166,7 +176,7 @@ subroutine run_ccsd_space_orb
     call update_tau_x_space(nO,nV,tau,tau_x)
 
     ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x%f,t1%f,energy)
+    call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
     write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
 
     nb_iter = nb_iter + 1
@@ -239,6 +249,8 @@ subroutine run_ccsd_space_orb
   call save_energy(uncorr_energy + energy, e_t)
 
   deallocate(h_t1, h_t2)
+  call gpu_deallocate(d_cc_space_f_vo)
+  call gpu_deallocate(d_cc_space_v_oovv)
   call gpu_deallocate(t1)
   call gpu_deallocate(t2)
 
@@ -246,59 +258,14 @@ end
 
 ! Energy
 
-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
-
+subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
+  use gpu
   implicit none
 
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
-
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
-
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
-
-end
-
-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
-
-  implicit none
-
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)            :: nO, nV
+  type(gpu_double4), intent(in)  :: tau_x, d_cc_space_v_oovv
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  double precision, intent(out)  :: energy
 
   ! internal
   integer :: i,j,a,b
@@ -307,14 +274,14 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
   energy = 0d0
   !$omp parallel &
   !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp cc_space_f_vo,cc_space_v_oovv) &
+  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
   !$omp private(i,j,a,b,e) &
   !$omp default(none)
   e = 0d0
   !$omp do
   do a = 1, nV
     do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
+      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
     enddo
   enddo
   !$omp end do nowait
@@ -323,7 +290,7 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
-          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
+          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
        enddo
       enddo
     enddo
@@ -333,6 +300,12 @@ subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
   energy = energy + e
   !$omp end critical
   !$omp end parallel
+!
+!
+!   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
+!   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
+!   energy = energy + 2.d0*e
+
 
 end
 
@@ -372,26 +345,24 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
 !  !$OMP END PARALLEL
 
 
-  type(gpu_blas) :: blas
   type(gpu_stream) :: stream(nV)
 
-  call gpu_blas_create(blas)
   do b=1,nV
     call gpu_stream_create(stream(b))
   enddo
 
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
+  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
-    call gpu_set_stream(blas,stream(b))
+    call gpu_set_stream(blas_handle,stream(b))
     do j=1,nO
-      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
-         1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
-         h_t1(j,b), t1%c, nO*1_8, &
-         c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, &
+         1.d0, t2%f(1,j,1,b), nO*nO*1_8, &
+         h_t1(j,b), t1%f, nO*1_8, &
+         tau%f(1,j,1,b), nO*nO*1_8)
     enddo
   enddo
   !$OMP END DO
@@ -403,8 +374,6 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
     call gpu_stream_destroy(stream(b))
   enddo
 
-  call gpu_blas_destroy(blas)
-
 end
 
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
@@ -438,26 +407,24 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
 !  !$OMP END DO
 !  !$OMP END PARALLEL
 
-  type(gpu_blas) :: blas
   type(gpu_stream) :: stream(nV)
 
-  call gpu_blas_create(blas)
   do a=1,nV
     call gpu_stream_create(stream(a))
   enddo
 
   !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas) &
+  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
     do a=1,nV
-      call gpu_set_stream(blas,stream(a))
-      call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nO*1_8, &
-          2.d0, c_loc(tau%f(1,1,a,b)), nO*1_8, &
-         -1.d0, c_loc(tau%f(1,1,b,a)), nO*1_8, &
-         c_loc(tau_x%f(1,1,a,b)), nO*1_8)
+      call gpu_set_stream(blas_handle,stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, &
+          2.d0, tau%f(1,1,a,b), nO*1_8, &
+         -1.d0, tau%f(1,1,b,a), nO*1_8, &
+         tau_x%f(1,1,a,b), nO*1_8)
     enddo
   enddo
   !$OMP END DO
@@ -469,8 +436,6 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
     call gpu_stream_destroy(stream(b))
   enddo
 
-  call gpu_blas_destroy(blas)
-
 end
 
 ! R1
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index d1ddad4c..2057d1eb 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -144,6 +144,16 @@ module gpu
       type(c_ptr), value :: a, b, c
     end subroutine
 
+    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+      b, ldb, c, ldc) bind(C, name='gpu_sgeam')
+      import
+      type(c_ptr), intent(in)        :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_float), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
   end interface
 
 
@@ -478,5 +488,57 @@ module gpu
       call gpu_blas_destroy_c(handle%c)
     end subroutine
 
+
 end module
 
+
+
+! dot
+! ---
+
+subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  double precision, intent(in)   :: dx(*), dy(*)
+  double precision, intent(out)    :: res
+  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+end subroutine
+
+subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  real, intent(in)               :: dx(*), dy(*)
+  real, intent(out)              :: res
+  call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+end subroutine
+
+
+! geam
+! ----
+
+subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a(lda,*), b(ldb,*), c(ldc,*)
+  call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
+        c_loc(b), ldb, c_loc(c), ldc)
+end subroutine
+
+subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+ use gpu 
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  real, intent(in)             :: alpha, beta
+  real                         :: a(lda,*), b(ldb,*), c(ldc,*)
+  call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
+        c_loc(b), ldb, c_loc(c), ldc)
+end subroutine
+

From a5f4f0516eec9f17438474529616368a6f9e5de4 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 17:39:43 +0200
Subject: [PATCH 100/131] Fixing compile

---
 src/ccsd/ccsd_space_orb_sub.irp.f |  1 -
 src/gpu/gpu_module.F90            | 23 +++++++++--------------
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 1329f172..4e06e31d 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -306,7 +306,6 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
 !   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
 !   energy = energy + 2.d0*e
 
-
 end
 
 ! Tau
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 2057d1eb..d7c26ba6 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -365,42 +365,42 @@ module gpu
     subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double1), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:)
+      double precision, target, intent(in)   :: cpu_ptr(:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
     end subroutine
 
     subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double2), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double3), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double4), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double5), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
     subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
       implicit none
       type(gpu_double6), intent(in)  :: gpu_ptr
-      double precision, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
+      double precision, target, intent(in)   :: cpu_ptr(:,:,:,:,:,:)
       call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
     end subroutine
 
@@ -454,19 +454,16 @@ module gpu
 ! ----------
 
     subroutine gpu_stream_create(stream)
-      import
       type(gpu_stream) :: stream
       call gpu_stream_create_c(stream%c)
     end subroutine
 
     subroutine gpu_stream_destroy(stream)
-      import
       type(gpu_stream) :: stream
       call gpu_stream_destroy_c(stream%c)
     end subroutine
 
     subroutine gpu_set_stream(handle, stream)
-      import
       type(gpu_blas)   :: handle
       type(gpu_stream) :: stream
       call gpu_set_stream_c(handle%c, stream%c)
@@ -477,13 +474,11 @@ module gpu
 ! --------
 
     subroutine gpu_blas_create(handle)
-      import
       type(gpu_blas) :: handle
       call gpu_blas_create_c(handle%c)
     end subroutine
 
     subroutine gpu_blas_destroy(handle)
-      import
       type(gpu_blas) :: handle
       call gpu_blas_destroy_c(handle%c)
     end subroutine
@@ -500,7 +495,7 @@ subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  double precision, intent(in)   :: dx(*), dy(*)
+  double precision, target, intent(in)   :: dx(*), dy(*)
   double precision, intent(out)    :: res
   call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine
@@ -525,7 +520,7 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  double precision             :: a(lda,*), b(ldb,*), c(ldc,*)
+  double precision, target     :: a(lda,*), b(ldb,*), c(ldc,*)
   call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
         c_loc(b), ldb, c_loc(c), ldc)
 end subroutine
@@ -537,7 +532,7 @@ subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
   real, intent(in)             :: alpha, beta
-  real                         :: a(lda,*), b(ldb,*), c(ldc,*)
+  real, target                 :: a(lda,*), b(ldb,*), c(ldc,*)
   call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
         c_loc(b), ldb, c_loc(c), ldc)
 end subroutine

From c7df9a72cc68a7f5dfded36aa94ac50d5188a5a1 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 21:32:04 +0200
Subject: [PATCH 101/131] Fixing again actions

---
 src/gpu/gpu_module.F90 | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index d7c26ba6..ecf79c83 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -318,42 +318,42 @@ module gpu
 
     subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:)
+      double precision, target, intent(in)     :: cpu_ptr(*)
       type(gpu_double1), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
     end subroutine
 
     subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:)
       type(gpu_double2), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:)
       type(gpu_double3), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:)
       type(gpu_double4), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:)
       type(gpu_double5), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
 
     subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
       implicit none
-      double precision, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
+      double precision, target, intent(in)     :: cpu_ptr(:,:,:,:,:,:)
       type(gpu_double6), intent(in)    :: gpu_ptr
       call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
     end subroutine
@@ -504,7 +504,7 @@ subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  real, intent(in)               :: dx(*), dy(*)
+  real, target, intent(in)       :: dx(*), dy(*)
   real, intent(out)              :: res
   call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine

From b467bef6dd1e14c5914cc6508aa898d5f1665e3a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 21:37:14 +0200
Subject: [PATCH 102/131] Forgot file

---
 src/ccsd/ccsd_space_orb_sub.irp.f | 68 +++++++++++++++----------------
 src/gpu/gpu.irp.f                 | 11 +++++
 2 files changed, 45 insertions(+), 34 deletions(-)
 create mode 100644 src/gpu/gpu.irp.f

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 4e06e31d..5c2daa05 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -271,40 +271,40 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
   integer :: i,j,a,b
   double precision :: e
 
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
-!
-!
-!   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
-!   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
-!   energy = energy + 2.d0*e
+!  energy = 0d0
+!  !$omp parallel &
+!  !$omp shared(nO,nV,energy,tau_x,t1,&
+!  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
+!  !$omp private(i,j,a,b,e) &
+!  !$omp default(none)
+!  e = 0d0
+!  !$omp do
+!  do a = 1, nV
+!    do i = 1, nO
+!      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!  !$omp do
+!  do b = 1, nV
+!    do a = 1, nV
+!      do j = 1, nO
+!        do i = 1, nO
+!          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
+!       enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!  !$omp critical
+!  energy = energy + e
+!  !$omp end critical
+!  !$omp end parallel
+
+
+   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
+   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
+   energy = energy + 2.d0*e
 
 end
 
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
new file mode 100644
index 00000000..e91d66f5
--- /dev/null
+++ b/src/gpu/gpu.irp.f
@@ -0,0 +1,11 @@
+use gpu
+
+BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
+ implicit none
+ BEGIN_DOC
+ ! Handle for cuBLAS or RocBLAS
+ END_DOC
+ call gpu_blas_create(blas_handle)
+END_PROVIDER
+
+

From 860121d404f7ae255790cd12136139103bdc48d0 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Sat, 29 Jun 2024 02:27:50 +0200
Subject: [PATCH 103/131] H_oo on GPU

---
 plugins/local/gpu_nvidia/gpu.c         | 224 +++++++++++----------
 plugins/local/gpu_x86/gpu.c            |  38 ++--
 src/ccsd/ccsd_space_orb_sub.irp.f      | 112 ++++++++---
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 127 ++++++++----
 src/gpu/gpu.irp.f                      |   7 +
 src/gpu/gpu_module.F90                 | 260 +++++++++++++++++++++----
 6 files changed, 540 insertions(+), 228 deletions(-)

diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
index f0bd247a..189de64c 100644
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -1,5 +1,6 @@
 #include <stdint.h>
 #include <stdio.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
@@ -10,6 +11,10 @@
 
 /* Generic functions */
 
+bool no_gpu() {
+ return false;
+}
+
 int gpu_ndevices() {
   int ngpus;
   cudaGetDeviceCount(&ngpus);
@@ -17,7 +22,7 @@ int gpu_ndevices() {
 }
 
 void gpu_set_device(int32_t igpu) {
-  cudaSetDevice(igpu);
+  cudaSetDevice((int) igpu);
 }
 
 
@@ -64,22 +69,20 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
 
 /* Streams */
 
-void gpu_stream_create(void** ptr) {
-  cudaStream_t stream;
-  cudaError_t rc = cudaStreamCreate(&stream);
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
   assert (rc == cudaSuccess);
-  *ptr = (void*) stream;
 }
 
-void gpu_stream_destroy(void** ptr) {
-  assert (*ptr != NULL);
-  cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr);
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
   assert (rc == cudaSuccess);
   *ptr = NULL;
 }
 
-void gpu_set_stream(void** handle, void** stream) {
-  cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream);
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
 }
 
 void gpu_synchronize() {
@@ -89,75 +92,80 @@ void gpu_synchronize() {
 
 /* BLAS functions */
 
-void gpu_blas_create(void** handle) {
-  cublasHandle_t cublas_handle;
-  cublasStatus_t rc = cublasCreate(&cublas_handle);
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
   assert (rc == CUBLAS_STATUS_SUCCESS);
-  *handle = (void*) cublas_handle;
 }
 
 
-void gpu_blas_destroy(void** handle) {
-  assert (*handle != NULL);
-  cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle);
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
   assert (rc == CUBLAS_STATUS_SUCCESS);
-  *handle = NULL;
+  ptr = NULL;
 }
 
 
-void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (*handle != NULL);
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
 
-  /* Convert to int32_t */
-  int32_t n_, incx_, incy_;
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
-  n_    = (int32_t) n;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
+/*
+  double alpha = 1.0;
+  double beta = 0.0;
+  cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);
+*/
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
   /* Check for integer overflows */
   assert ( (int64_t)    n_ == n   );
   assert ( (int64_t) incx_ == incx);
   assert ( (int64_t) incy_ == incy);
 
-  cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+  float result_ = 0.;
+  cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *result = result_;
 }
 
 
 
-void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (*handle != NULL);
-
-  /* Convert to int32_t */
-  int32_t n_, incx_, incy_;
-
-  n_    = (int32_t) n;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
-
-  /* Check for integer overflows */
-  assert ( (int64_t)    n_ == n   );
-  assert ( (int64_t) incx_ == incx);
-  assert ( (int64_t) incy_ == incy);
-
-  cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
-}
-
-
-
-void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, incx_, incy_;
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
 
-  m_    = (int32_t) m;
-  n_    = (int32_t) n;
-  lda_  = (int32_t) lda;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
   /* Check for integer overflows */
   assert ( (int64_t)    m_ == m   );
@@ -169,24 +177,24 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t
   cublasOperation_t transa_ = CUBLAS_OP_N;
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
 
-  cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
 }
 
 
 
-void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, incx_, incy_;
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
 
-  m_    = (int32_t) m;
-  n_    = (int32_t) n;
-  lda_  = (int32_t) lda;
-  incx_ = (int32_t) incx;
-  incy_ = (int32_t) incy;
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
 
   /* Check for integer overflows */
   assert ( (int64_t)    m_ == m   );
@@ -198,24 +206,24 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t
   cublasOperation_t transa_ = CUBLAS_OP_N;
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
 
-  cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
 }
 
 
-void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  k_   = (int32_t) k;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -230,25 +238,25 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
 }
 
 
 
-void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  k_   = (int32_t) k;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -263,22 +271,22 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
 }
 
 
-void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -292,23 +300,23 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
 
 }
 
 
-void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
-  /* Convert to int32_t */
-  int32_t m_, n_, lda_, ldb_, ldc_;
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
 
-  m_   = (int32_t) m;
-  n_   = (int32_t) n;
-  lda_ = (int32_t) lda;
-  ldb_ = (int32_t) ldb;
-  ldc_ = (int32_t) ldc;
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
 
   /* Check for integer overflows */
   assert ( (int64_t)   m_ == m  );
@@ -322,6 +330,6 @@ void gpu_sgeam(void** handle, const char transa, const char transb, const int64_
   if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
   if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
 
 }
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
index ac7c3620..53267a7c 100644
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@@ -2,8 +2,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 #include <assert.h>
 
+bool no_gpu() {
+ return true;
+}
 
 /* Generic functions */
 
@@ -56,7 +60,7 @@ void gpu_stream_destroy(void** ptr) {
   *ptr = NULL;
 }
 
-void gpu_set_stream(void** handle, void** stream) {
+void gpu_set_stream(void* handle, void* stream) {
   return;
 }
 
@@ -79,8 +83,8 @@ void gpu_blas_destroy(void** handle) {
 
 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
 
-void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (*handle != NULL);
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -100,8 +104,8 @@ void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t inc
 
 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
 
-void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (*handle != NULL);
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -122,10 +126,10 @@ void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -150,10 +154,10 @@ void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -178,10 +182,10 @@ void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -209,10 +213,10 @@ void gpu_dgemm(void** handle, const char transa, const char transb, const int64_
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -236,9 +240,9 @@ void gpu_sgemm(void** handle, const char transa, const char transb, const int64_
 }
 
 
-void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
@@ -368,9 +372,9 @@ void gpu_dgeam(void** handle, const char transa, const char transb, const int64_
 }
 
 
-void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  assert (*handle != NULL);
+  assert (handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 5c2daa05..5ee7366e 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -14,9 +14,15 @@ subroutine run_ccsd_space_orb
   type(gpu_double2) :: t1, r1
   type(gpu_double2) :: H_oo, H_vv, H_vo
 
-  type(gpu_double2) :: d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
+  type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
+
+  type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
+  type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+
   type(gpu_double4) :: d_cc_space_v_oovv
 
+
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
   integer(bit_kind)             :: det(N_int,2)
@@ -24,7 +30,7 @@ subroutine run_ccsd_space_orb
 
   call set_multiple_levels_omp(.False.)
 
-  if (do_ao_cholesky) then
+  if (do_mo_cholesky) then
     PROVIDE cholesky_mo_transp
     FREE cholesky_ao
   else
@@ -55,11 +61,36 @@ subroutine run_ccsd_space_orb
   !print*,'occ',list_occ
   !print*,'vir',list_vir
 
+  ! GPU arrays
+  call gpu_allocate(d_cc_space_f_oo, nO, nO)
   call gpu_allocate(d_cc_space_f_vo, nV, nO)
-  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
-  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
-  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+  call gpu_allocate(d_cc_space_f_ov, nO, nV)
+  call gpu_allocate(d_cc_space_f_vv, nV, nV)
 
+  call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
+  call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
+
+!  FREE cc_space_f_oo
+!  FREE cc_space_f_vo
+!  FREE cc_space_f_vv
+
+  if (do_mo_cholesky) then
+    call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
+    call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
+    call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
+    call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
+
+    call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
+    call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
+    call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
+    call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
+
+!    FREE cc_space_v_oo_chol
+!    FREE cc_space_v_ov_chol
+!    FREE cc_space_v_vo_chol
+!    FREE cc_space_v_vv_chol
+  endif
 
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
@@ -120,6 +151,13 @@ subroutine run_ccsd_space_orb
   call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
   call gpu_upload(h_t2, t2)
 
+
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
+
+!  FREE cc_space_v_oovv
+
+
   call update_tau_space(nO,nV,h_t1,t1,t2,tau)
   call update_tau_x_space(nO,nV,tau,tau_x)
   !print*,'hf_energy', hf_energy
@@ -142,10 +180,10 @@ subroutine run_ccsd_space_orb
   do while (not_converged)
 
     ! Residue
-    if (do_ao_cholesky) then
-!    if (.False.) then
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x%f,H_vv%f)
+    if (do_mo_cholesky) then
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+              d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
       call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
 
       call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
@@ -249,6 +287,12 @@ subroutine run_ccsd_space_orb
   call save_energy(uncorr_energy + energy, e_t)
 
   deallocate(h_t1, h_t2)
+  if (do_mo_cholesky) then
+    call gpu_deallocate(d_cc_space_v_oo_chol)
+    call gpu_deallocate(d_cc_space_v_ov_chol)
+    call gpu_deallocate(d_cc_space_v_vo_chol)
+    call gpu_deallocate(d_cc_space_v_vv_chol)
+  endif
   call gpu_deallocate(d_cc_space_f_vo)
   call gpu_deallocate(d_cc_space_v_oovv)
   call gpu_deallocate(t1)
@@ -302,8 +346,21 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
 !  !$omp end parallel
 
 
-   call gpu_ddot(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1, d_cc_space_v_oovv, 1, energy)
-   call gpu_ddot(blas_handle, nO*nV*1_8, d_cc_space_f_vo, 1, t1, 1, e)
+   type(gpu_stream) :: s1, s2
+   call gpu_stream_create(s1)
+   call gpu_stream_create(s2)
+
+   call gpu_set_stream(blas_handle,s1)
+   call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
+
+   call gpu_set_stream(blas_handle,s2)
+   call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+   call gpu_synchronize()
+   call gpu_set_stream(blas_handle,gpu_default_stream)
+
+   call gpu_stream_destroy(s1)
+   call gpu_stream_destroy(s2)
+
    energy = energy + 2.d0*e
 
 end
@@ -346,32 +403,29 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
 
   type(gpu_stream) :: stream(nV)
 
-  do b=1,nV
-    call gpu_stream_create(stream(b))
-  enddo
-
-  !$OMP PARALLEL &
+  !$OMP PARALLEL if (no_gpu()) &
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
+    call gpu_stream_create(stream(b))
     call gpu_set_stream(blas_handle,stream(b))
     do j=1,nO
-      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nV*1_8, &
-         1.d0, t2%f(1,j,1,b), nO*nO*1_8, &
-         h_t1(j,b), t1%f, nO*1_8, &
-         tau%f(1,j,1,b), nO*nO*1_8)
+      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, &
+         1.d0, t2%f(1,j,1,b), nO*nO, &
+         h_t1(j,b), t1%f, nO, &
+         tau%f(1,j,1,b), nO*nO)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call gpu_synchronize()
-
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
 
 end
 
@@ -412,7 +466,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
     call gpu_stream_create(stream(a))
   enddo
 
-  !$OMP PARALLEL &
+  !$OMP PARALLEL if (no_gpu()) &
   !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
@@ -420,20 +474,20 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   do b=1,nV
     do a=1,nV
       call gpu_set_stream(blas_handle,stream(a))
-      call gpu_dgeam(blas_handle, 'N', 'N', nO*1_8, nO*1_8, &
-          2.d0, tau%f(1,1,a,b), nO*1_8, &
-         -1.d0, tau%f(1,1,b,a), nO*1_8, &
-         tau_x%f(1,1,a,b), nO*1_8)
+      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, &
+          2.d0, tau%f(1,1,a,b), nO, &
+         -1.d0, tau%f(1,1,b,a), nO, &
+         tau_x%f(1,1,a,b), nO)
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
-  call gpu_synchronize()
-
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
 
 end
 
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 9b161001..288724f3 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -293,62 +293,115 @@ end
 
 ! H_oo
 
-subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
+subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
   use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
   type(gpu_double4), intent(in)    :: tau_x
   type(gpu_double2), intent(out)   :: H_oo
 
   integer :: a,b,i,j,u,k
 
-  double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
+  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
 
-  allocate(tau_kau(cholesky_mo_num,nV,nO))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,u,j,k,a,b,tmp_vov)
-  allocate(tmp_vov(nV,nO,nV) )
-  !$omp do
-  do u = 1, nO
+  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
+
+!  !$omp parallel &
+!  !$omp default(shared) &
+!  !$omp private(i,u,j,k,a,b,tmp_vov)
+!  call gpu_allocate(tmp_vov, nV, nO, nV)
+!  !$omp do
+!  do u = 1, nO
+!    do b=1,nV
+!      do j=1,nO
+!        do a=1,nV
+!          tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b)
+!        enddo
+!      enddo
+!    enddo
+!    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+!      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, &
+!      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
+!  enddo
+!  !$omp end do nowait
+!  call gpu_deallocate(tmp_vov)
+!  !$omp do
+!  do i = 1, nO
+!    do u = 1, nO
+!      H_oo%f(u,i) = d_cc_space_f_oo%f(u,i)
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!
+!  !$omp barrier
+!  !$omp end  parallel
+!  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+!    tau_kau%f(1,1,1), cholesky_mo_num*nV,  d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
+!    1.d0, H_oo%f(1,1), nO)
+!
+
+  type(gpu_stream) :: stream(nV)
+
+  do b=1,nV
+    call gpu_stream_create(stream(b))
+  enddo
+
+  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv)
+
+  call gpu_allocate(tmp_vov, nV, nO, nV)
+  call gpu_allocate(tmp_ovv, nO, nV, nV)
+
+  !$OMP DO
+  do u=1,nO
+    call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
     do b=1,nV
-      do j=1,nO
-        do a=1,nV
-          tmp_vov(a,j,b) = tau_x%f(u,j,a,b)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle,stream(b))
+      call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
+           tmp_ovv%f(1,1,b), nO, 0.d0, &
+           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
-    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
-      0.d0, tau_kau(1,1,u), cholesky_mo_num)
+    call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
+    call gpu_synchronize()
   enddo
-  !$omp end do nowait
-  deallocate(tmp_vov)
-  !$omp do
-  do i = 1, nO
-    do u = 1, nO
-      H_oo%f(u,i) = cc_space_f_oo(u,i)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end  parallel
-  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo%f, nO)
+  !$OMP END DO
 
+  call gpu_deallocate(tmp_vov)
+  call gpu_deallocate(tmp_ovv)
+  !$OMP END PARALLEL
+
+  do b=1,nV
+    call gpu_stream_destroy(stream(b))
+  enddo
+
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+
+  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+  call gpu_deallocate(tau_kau)
 end
 
 ! H_vv
 
 subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
-
+  use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_vv(nV, nV)
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_vv
 
   integer :: a,b,i,j,u,k, beta
 
@@ -364,7 +417,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
     do b=1,nV
       do j=1,nO
         do i=1,nO
-          tmp_oov(i,j,b) = tau_x(i,j,a,b)
+          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
         enddo
       enddo
     enddo
@@ -378,7 +431,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
   !$omp do
   do beta = 1, nV
     do a = 1, nV
-      H_vv(a,beta) = cc_space_f_vv(a,beta)
+      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
     enddo
   enddo
   !$omp end do nowait
@@ -386,7 +439,7 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
   !$omp end  parallel
   call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
     tau_kia, cholesky_mo_num*nO,  cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
+    1.d0, H_vv%f, nV)
 
 end
 
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
index e91d66f5..6ad0a075 100644
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@@ -8,4 +8,11 @@ BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
  call gpu_blas_create(blas_handle)
 END_PROVIDER
 
+BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
+ implicit none
+ BEGIN_DOC
+ ! Default stream
+ END_DOC
+ gpu_default_stream%c = C_NULL_PTR
+END_PROVIDER
 
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index ecf79c83..2676b339 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -49,7 +49,12 @@ module gpu
 ! ------------
 
   interface
+    logical(c_bool) function no_gpu() bind(C)
+      import
+    end function
+
     integer function gpu_ndevices() bind(C)
+      import
     end function
 
     subroutine gpu_set_device(id) bind(C)
@@ -101,7 +106,7 @@ module gpu
 
     subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
       import
-      type(c_ptr) :: handle, stream
+      type(c_ptr), value :: handle, stream
     end subroutine
 
     subroutine gpu_synchronize() bind(C)
@@ -120,15 +125,15 @@ module gpu
 
     subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in)        :: handle
+      type(c_ptr), value, intent(in) :: handle
       integer(c_int64_t), value      :: n, incx, incy
-      type(c_ptr), intent(in), value :: dx, dy
+      type(c_ptr), value             :: dx, dy
       real(c_double), intent(out)    :: res
     end subroutine
 
     subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in)        :: handle
+      type(c_ptr), value, intent(in) :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
@@ -137,8 +142,8 @@ module gpu
     subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_dgeam')
       import
-      type(c_ptr), intent(in)        :: handle
-      character(c_char), intent(in), value :: transa, transb
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_double), intent(in), value :: alpha, beta
       type(c_ptr), value :: a, b, c
@@ -147,13 +152,33 @@ module gpu
     subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_sgeam')
       import
-      type(c_ptr), intent(in)        :: handle
-      character(c_char), intent(in), value :: transa, transb
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_float), intent(in), value :: alpha, beta
       type(c_ptr), value :: a, b, c
     end subroutine
 
+    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_double), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+      b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in), value  :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_float), intent(in), value :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
   end interface
 
 
@@ -161,20 +186,26 @@ module gpu
 ! ----------------------
 
   interface gpu_allocate
-    procedure gpu_allocate_double1  &
-             ,gpu_allocate_double2  &
-             ,gpu_allocate_double3  &
-             ,gpu_allocate_double4  &
-             ,gpu_allocate_double5  &
-             ,gpu_allocate_double6
+    procedure gpu_allocate_double1     &
+             ,gpu_allocate_double2     &
+             ,gpu_allocate_double3     &
+             ,gpu_allocate_double4     &
+             ,gpu_allocate_double5     &
+             ,gpu_allocate_double6     &
+             ,gpu_allocate_double1_64  &
+             ,gpu_allocate_double2_64  &
+             ,gpu_allocate_double3_64  &
+             ,gpu_allocate_double4_64  &
+             ,gpu_allocate_double5_64  &
+             ,gpu_allocate_double6_64
   end interface gpu_allocate
 
   interface gpu_deallocate
-    procedure gpu_deallocate_double1  &
-             ,gpu_deallocate_double2  &
-             ,gpu_deallocate_double3  &
-             ,gpu_deallocate_double4  &
-             ,gpu_deallocate_double5  &
+    procedure gpu_deallocate_double1     &
+             ,gpu_deallocate_double2     &
+             ,gpu_deallocate_double3     &
+             ,gpu_deallocate_double4     &
+             ,gpu_deallocate_double5     &
              ,gpu_deallocate_double6
   end interface gpu_deallocate
 
@@ -267,6 +298,61 @@ module gpu
     end subroutine
 
 
+    subroutine gpu_allocate_double1_64(ptr, s)
+      implicit none
+      type(gpu_double1), intent(inout) :: ptr
+      integer*8, intent(in) :: s
+
+      call gpu_allocate_c(ptr%c, s)
+      call c_f_pointer(ptr%c, ptr%f, (/ s /))
+    end subroutine
+
+    subroutine gpu_allocate_double2_64(ptr, s1, s2)
+      implicit none
+      type(gpu_double2), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2
+
+      call gpu_allocate_c(ptr%c, s1*s2*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+    end subroutine
+
+    subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+      implicit none
+      type(gpu_double3), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+    end subroutine
+
+    subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
+      implicit none
+      type(gpu_double4), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+    end subroutine
+
+    subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
+      implicit none
+      type(gpu_double5), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+    end subroutine
+
+    subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
+      implicit none
+      type(gpu_double6), intent(inout) :: ptr
+      integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
+
+      call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+      call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+    end subroutine
+
+
 ! gpu_deallocate
 ! --------------
 
@@ -494,19 +580,38 @@ end module
 subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
-  integer*8                      :: n, incx, incy
-  double precision, target, intent(in)   :: dx(*), dy(*)
-  double precision, intent(out)    :: res
-  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+  integer*4                      :: n, incx, incy
+  type(gpu_double1), intent(in)  :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res)
 end subroutine
 
-subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
+subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*4                      :: n, incx, incy
+  double precision, target       :: dx(*), dy(*)
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
+end subroutine
+
+
+subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
   use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  real, target, intent(in)       :: dx(*), dy(*)
-  real, intent(out)              :: res
-  call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
+  type(gpu_double1), intent(in)  :: dx, dy
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res)
+end subroutine
+
+subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res)
+  use gpu
+  type(gpu_blas), intent(in)     :: handle
+  integer*8                      :: n, incx, incy
+  double precision, target       :: dx(*), dy(*)
+  double precision, intent(out)  :: res
+  call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine
 
 
@@ -518,22 +623,103 @@ subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
-  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  double precision, target     :: a(lda,*), b(ldb,*), c(ldc,*)
-  call gpu_dgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
-        c_loc(b), ldb, c_loc(c), ldc)
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
+        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
 end subroutine
 
-subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
+
+subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, &
   b, ldb, c, ldc)
- use gpu 
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
-  real, intent(in)             :: alpha, beta
-  real, target                 :: a(lda,*), b(ldb,*), c(ldc,*)
-  call gpu_sgeam_c(handle%c, transa, transb, m, n, alpha, c_loc(a), lda, beta, &
-        c_loc(b), ldb, c_loc(c), ldc)
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
+        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
+end subroutine
+
+
+subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
+  b, ldb, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
+        c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+
+! gemm
+! ----
+
+subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, a%c, int(lda,c_int64_t), &
+        b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  type(gpu_double2)            :: a, b, c
+  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
+        alpha, a%c, lda, b%c, ldb, beta, c%c, ldc)
+end subroutine
+
+subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
+        alpha, c_loc(a), int(lda,c_int64_t), &
+        c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, &
+  b, ldb, beta, c, ldc)
+  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa, transb
+  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
+  double precision, intent(in) :: alpha, beta
+  double precision, target     :: a(*), b(*), c(*)
+  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
+        alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc)
 end subroutine
 

From d3c1994c64ed9ae9914ce605a6b7c364ac518d9b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 1 Jul 2024 18:04:48 +0200
Subject: [PATCH 104/131] H_vv

---
 plugins/local/gpu_nvidia/gpu.c         |  16 +--
 plugins/local/gpu_x86/gpu.c            |   6 +-
 src/ccsd/ccsd_space_orb_sub.irp.f      |   7 +-
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 146 ++++++++++++++++---------
 src/gpu/gpu.irp.f                      |   8 ++
 5 files changed, 114 insertions(+), 69 deletions(-)

diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
index 189de64c..39a82984 100644
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -11,10 +11,6 @@
 
 /* Generic functions */
 
-bool no_gpu() {
- return false;
-}
-
 int gpu_ndevices() {
   int ngpus;
   cudaGetDeviceCount(&ngpus);
@@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) {
       free = INT64_MAX;
     }
 
-    /* Use managed memory if it does not fit on the GPU */
-    if (size < free && size < total/2) {
+    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    /* Use managed memory if it does not fit on the GPU */
+//    if (size < free && size < total/2) {
 //      rc= cudaMalloc(ptr, size);
-      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
-    } else {
-      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
-    }
+//    } else {
+//      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    }
     assert (rc == cudaSuccess);
 }
 
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
index 53267a7c..dab23a25 100644
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@@ -5,14 +5,10 @@
 #include <stdbool.h>
 #include <assert.h>
 
-bool no_gpu() {
- return true;
-}
-
 /* Generic functions */
 
 int gpu_ndevices() {
-  return 1;
+  return 0;
 }
 
 void gpu_set_device(int32_t i) {
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 5ee7366e..0b3636ac 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb
     if (do_mo_cholesky) then
       call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
               d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+              d_cc_space_v_ov_chol,H_vv)
       call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
 
       call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
@@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
 
   type(gpu_stream) :: stream(nV)
 
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL if (gpu_num == 0) &
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
@@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
     call gpu_stream_create(stream(a))
   enddo
 
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL if (gpu_num == 0) &
   !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 288724f3..458016fb 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
 !    1.d0, H_oo%f(1,1), nO)
 !
 
-  type(gpu_stream) :: stream(nV)
+  type(gpu_blas) :: blas
 
-  do b=1,nV
-    call gpu_stream_create(stream(b))
-  enddo
 
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL  &
   !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv)
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+  !$OMP END TASK
+  !$OMP END SINGLE
 
-  call gpu_allocate(tmp_vov, nV, nO, nV)
   call gpu_allocate(tmp_ovv, nO, nV, nV)
+  call gpu_allocate(tmp_vov, nV, nO, nV)
+
+  call gpu_blas_create(blas)
 
   !$OMP DO
   do u=1,nO
-    call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
            tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
     do b=1,nV
-      call gpu_set_stream(blas_handle,stream(b))
-      call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
            tmp_ovv%f(1,1,b), nO, 0.d0, &
            tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
-    call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
       d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
       0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-    call gpu_synchronize()
   enddo
   !$OMP END DO
 
+  call gpu_blas_destroy(blas)
+
   call gpu_deallocate(tmp_vov)
   call gpu_deallocate(tmp_ovv)
+
+  !$OMP TASKWAIT
   !$OMP END PARALLEL
 
-  do b=1,nV
-    call gpu_stream_destroy(stream(b))
-  enddo
-
-  call gpu_set_stream(blas_handle,gpu_default_stream)
-
-  call gpu_copy(d_cc_space_f_oo, H_oo)
-
   call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
     tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
     1.d0, H_oo, nO)
@@ -395,52 +394,97 @@ end
 
 ! H_vv
 
-subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+         d_cc_space_v_ov_chol,H_vv)
   use gpu
   implicit none
 
-  integer, intent(in)           :: nO,nV
+  integer, intent(in)              :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
   type(gpu_double4), intent(in)    :: tau_x
   type(gpu_double2), intent(out)   :: H_vv
 
   integer :: a,b,i,j,u,k, beta
 
-  double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
+  type(gpu_double3) :: tau_kia, tmp_oov
 
-  allocate(tau_kia(cholesky_mo_num,nO,nV))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,beta,j,k,a,b,tmp_oov)
-  allocate(tmp_oov(nO,nO,nV) )
-  !$omp do
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
+
+!  !$omp parallel &
+!  !$omp default(shared) &
+!  !$omp private(i,beta,j,k,a,b,tmp_oov)
+!  allocate(tmp_oov(nO,nO,nV) )
+!  !$omp do
+!  do a = 1, nV
+!    do b=1,nV
+!      do j=1,nO
+!        do i=1,nO
+!          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
+!        enddo
+!      enddo
+!    enddo
+!    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+!      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
+!      0.d0, tau_kia(1,1,a), cholesky_mo_num)
+!  enddo
+!  !$omp end do nowait
+!  deallocate(tmp_oov)
+
+!  !$omp do
+!  do beta = 1, nV
+!    do a = 1, nV
+!      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
+!    enddo
+!  enddo
+!  !$omp end do nowait
+!  !$omp barrier
+!  !$omp end  parallel
+!  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+!    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
+!    1.d0, H_vv%f, nV)
+
+  type(gpu_blas) :: blas
+
+
+  PROVIDE gpu_num
+  !$OMP PARALLEL &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(a,b,tmp_oov,blas)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_vv, H_vv)
+  !$OMP END TASK
+  !$OMP END SINGLE
+
+  call gpu_blas_create(blas)
+  call gpu_allocate(tmp_oov, nO, nO, nV)
+
+  !$OMP DO
   do a = 1, nV
     do b=1,nV
-      do j=1,nO
-        do i=1,nO
-          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
-        enddo
-      enddo
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+        tau_x%f(1,1,a,b), nO, 0.d0, &
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
     enddo
-    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
-      0.d0, tau_kia(1,1,a), cholesky_mo_num)
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
   enddo
-  !$omp end do nowait
-  deallocate(tmp_oov)
+  !$OMP END DO
 
-  !$omp do
-  do beta = 1, nV
-    do a = 1, nV
-      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end  parallel
-  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv%f, nV)
+  call gpu_blas_destroy(blas)
 
+  call gpu_deallocate(tmp_oov)
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
+    1.d0, H_vv, nV)
+
+  call gpu_deallocate(tau_kia)
 end
 
 ! H_vo
diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f
index 6ad0a075..3b2feeb6 100644
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
  gpu_default_stream%c = C_NULL_PTR
 END_PROVIDER
 
+BEGIN_PROVIDER [ integer, gpu_num ]
+ implicit none
+ BEGIN_DOC
+ ! Number of usable GPUs
+ END_DOC
+ gpu_num = gpu_ndevices()
+END_PROVIDER
+

From 44a7729f65a37cc3a7c35ae55f462bb1d61e411b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 1 Jul 2024 19:00:27 +0200
Subject: [PATCH 105/131] H_ finished in CCSD

---
 src/ccsd/ccsd_space_orb_sub.irp.f      | 108 ++----
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 482 +++++++++----------------
 2 files changed, 200 insertions(+), 390 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 0b3636ac..13b974be 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -181,11 +181,9 @@ subroutine run_ccsd_space_orb
 
     ! Residue
     if (do_mo_cholesky) then
-      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
-              d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
-              d_cc_space_v_ov_chol,H_vv)
-      call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
+      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
+      call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
 
       call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
       call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
@@ -316,51 +314,20 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
   integer :: i,j,a,b
   double precision :: e
 
-!  energy = 0d0
-!  !$omp parallel &
-!  !$omp shared(nO,nV,energy,tau_x,t1,&
-!  !$omp d_cc_space_f_vo,d_cc_space_v_oovv) &
-!  !$omp private(i,j,a,b,e) &
-!  !$omp default(none)
-!  e = 0d0
-!  !$omp do
-!  do a = 1, nV
-!    do i = 1, nO
-!      e = e + 2d0 * d_cc_space_f_vo%f(a,i) * t1%f(i,a)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp do
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          e = e + tau_x%f(i,j,a,b) * d_cc_space_v_oovv%f(i,j,a,b)
-!       enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp critical
-!  energy = energy + e
-!  !$omp end critical
-!  !$omp end parallel
+  type(gpu_stream) :: s1, s2
+  call gpu_stream_create(s1)
+  call gpu_stream_create(s2)
 
+  call gpu_set_stream(blas_handle,s1)
+  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
 
-   type(gpu_stream) :: s1, s2
-   call gpu_stream_create(s1)
-   call gpu_stream_create(s2)
+  call gpu_set_stream(blas_handle,s2)
+  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+  call gpu_set_stream(blas_handle,gpu_default_stream)
 
-   call gpu_set_stream(blas_handle,s1)
-   call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
-
-   call gpu_set_stream(blas_handle,s2)
-   call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
-   call gpu_synchronize()
-   call gpu_set_stream(blas_handle,gpu_default_stream)
-
-   call gpu_stream_destroy(s1)
-   call gpu_stream_destroy(s2)
+  call gpu_synchronize()
+  call gpu_stream_destroy(s1)
+  call gpu_stream_destroy(s2)
 
    energy = energy + 2.d0*e
 
@@ -384,27 +351,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   ! internal
   integer                       :: i,j,a,b
 
-!  !$OMP PARALLEL &
-!  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1) &
-!  !$OMP PRIVATE(i,j,a,b) &
-!  !$OMP DEFAULT(NONE)
-!  !$OMP DO
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          tau%f(i,j,a,b) = t2%f(i,j,a,b) + t1%f(i,a) * h_t1(j,b)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
-
   type(gpu_stream) :: stream(nV)
 
-  !$OMP PARALLEL if (gpu_num == 0) &
+  !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
@@ -422,6 +371,8 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_synchronize()
+
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
@@ -444,32 +395,15 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   ! internal
   integer                       :: i,j,a,b
 
-!  !$OMP PARALLEL &
-!  !$OMP SHARED(nO,nV,tau,tau_x) &
-!  !$OMP PRIVATE(i,j,a,b) &
-!  !$OMP DEFAULT(NONE)
-!  !$OMP DO
-!  do b = 1, nV
-!    do a = 1, nV
-!      do j = 1, nO
-!        do i = 1, nO
-!          tau_x%f(i,j,a,b) = 2.d0*tau%f(i,j,a,b) - tau%f(i,j,b,a)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$OMP END DO
-!  !$OMP END PARALLEL
-
   type(gpu_stream) :: stream(nV)
 
   do a=1,nV
     call gpu_stream_create(stream(a))
   enddo
 
-  !$OMP PARALLEL if (gpu_num == 0) &
+  !$OMP PARALLEL &
   !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
-  !$OMP PRIVATE(i,j,a,b) &
+  !$OMP PRIVATE(a,b) &
   !$OMP DEFAULT(NONE)
   !$OMP DO
   do b=1,nV
@@ -484,10 +418,12 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   !$OMP END DO
   !$OMP END PARALLEL
 
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
   do b=1,nV
     call gpu_stream_destroy(stream(b))
   enddo
-  call gpu_set_stream(blas_handle,gpu_default_stream)
 
 
 end
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 458016fb..5eb95a06 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -1,81 +1,200 @@
-subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy)
+! H_oo
 
+subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
+    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
+  use gpu
   implicit none
 
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
+  integer, intent(in)           :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_oo
 
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
+  integer :: a,b,i,j,u,k
 
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
+  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
 
-end
+  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
 
-! Tau
+  type(gpu_blas) :: blas
 
-subroutine update_tau_space_chol(nO,nV,t1,t2,tau)
 
-  implicit none
+  !$OMP PARALLEL  &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
 
-  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_oo, H_oo)
+  !$OMP END TASK
+  !$OMP END SINGLE
 
-  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
+  call gpu_allocate(tmp_ovv, nO, nV, nV)
+  call gpu_allocate(tmp_vov, nV, nO, nV)
 
-  ! internal
-  integer                       :: i,j,a,b
+  call gpu_blas_create(blas)
 
-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
-  !$OMP PRIVATE(i,j,a,b) &
-  !$OMP DEFAULT(NONE)
   !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
+  do u=1,nO
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
+    do b=1,nV
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
+           tmp_ovv%f(1,1,b), nO, 0.d0, &
+           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
   enddo
   !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_vov)
+  call gpu_deallocate(tmp_ovv)
+
+  !$OMP TASKWAIT
   !$OMP END PARALLEL
 
+  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kau)
+end
+
+! H_vv
+
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
+         d_cc_space_v_ov_chol,H_vv)
+  use gpu
+  implicit none
+
+  integer, intent(in)              :: nO,nV
+  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
+  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
+  type(gpu_double4), intent(in)    :: tau_x
+  type(gpu_double2), intent(out)   :: H_vv
+
+  integer :: a,b,i,j,u,k, beta
+
+  type(gpu_double3) :: tau_kia, tmp_oov
+
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
+
+  type(gpu_blas) :: blas
+
+  !$OMP PARALLEL &
+  !$OMP DEFAULT(SHARED) &
+  !$OMP PRIVATE(a,b,tmp_oov,blas)
+
+  !$OMP SINGLE
+  !$OMP TASK
+  call gpu_copy(d_cc_space_f_vv, H_vv)
+  !$OMP END TASK
+  !$OMP END SINGLE
+
+  call gpu_blas_create(blas)
+  call gpu_allocate(tmp_oov, nO, nO, nV)
+
+  !$OMP DO
+  do a = 1, nV
+    do b=1,nV
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+        tau_x%f(1,1,a,b), nO, 0.d0, &
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
+    enddo
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
+  enddo
+  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(tmp_oov)
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
+    1.d0, H_vv, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tau_kia)
+end
+
+! H_vo
+subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
+         d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
+  use gpu
+  implicit none
+
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1, d_cc_space_f_vo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
+  type(gpu_double2), intent(out) :: H_vo
+
+  integer :: a,b,i,j,u,k
+
+  type(gpu_double1) :: tmp_k
+  type(gpu_double3) :: tmp, tmp2
+
+  call gpu_copy(d_cc_space_f_vo, H_vo)
+
+  call gpu_allocate(tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
+     d_cc_space_v_ov_chol, cholesky_mo_num, &
+     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, &
+      d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
+      H_vo, nV*nO)
+
+  call gpu_deallocate(tmp_k)
+
+
+  call gpu_allocate(tmp,  cholesky_mo_num, nO, nO)
+
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
+
+  call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
+
+  type(gpu_stream) :: stream(nO)
+  do i=1,nO
+    call gpu_stream_create(stream(i))
+  enddo
+
+  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j)
+  do i=1,nO
+    do j=1,nO
+      call gpu_set_stream(blas_handle,stream(j))
+      call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
+        tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
+    enddo
+  enddo
+  !$OMP END PARALLEL DO
+
+  call gpu_set_stream(blas_handle,gpu_default_stream)
+  call gpu_synchronize()
+
+  do i=1,nO
+    call gpu_stream_destroy(stream(i))
+  enddo
+  call gpu_deallocate(tmp)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
+    d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
+    1.d0, H_vo, nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp2)
 end
 
 ! R1
@@ -291,251 +410,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
 
 end
 
-! H_oo
-
-subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
-    d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-  use gpu
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  type(gpu_double2), intent(in)    :: d_cc_space_f_oo
-  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
-  type(gpu_double4), intent(in)    :: tau_x
-  type(gpu_double2), intent(out)   :: H_oo
-
-  integer :: a,b,i,j,u,k
-
-  type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
-
-  call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
-
-!  !$omp parallel &
-!  !$omp default(shared) &
-!  !$omp private(i,u,j,k,a,b,tmp_vov)
-!  call gpu_allocate(tmp_vov, nV, nO, nV)
-!  !$omp do
-!  do u = 1, nO
-!    do b=1,nV
-!      do j=1,nO
-!        do a=1,nV
-!          tmp_vov%f(a,j,b) = tau_x%f(u,j,a,b)
-!        enddo
-!      enddo
-!    enddo
-!    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-!      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f, nV, &
-!      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-!  enddo
-!  !$omp end do nowait
-!  call gpu_deallocate(tmp_vov)
-!  !$omp do
-!  do i = 1, nO
-!    do u = 1, nO
-!      H_oo%f(u,i) = d_cc_space_f_oo%f(u,i)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!
-!  !$omp barrier
-!  !$omp end  parallel
-!  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-!    tau_kau%f(1,1,1), cholesky_mo_num*nV,  d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
-!    1.d0, H_oo%f(1,1), nO)
-!
-
-  type(gpu_blas) :: blas
-
-
-  !$OMP PARALLEL  &
-  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
-
-  !$OMP SINGLE
-  !$OMP TASK
-  call gpu_copy(d_cc_space_f_oo, H_oo)
-  !$OMP END TASK
-  !$OMP END SINGLE
-
-  call gpu_allocate(tmp_ovv, nO, nV, nV)
-  call gpu_allocate(tmp_vov, nV, nO, nV)
-
-  call gpu_blas_create(blas)
-
-  !$OMP DO
-  do u=1,nO
-    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
-           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
-    do b=1,nV
-      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
-           tmp_ovv%f(1,1,b), nO, 0.d0, &
-           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
-    enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
-      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
-  enddo
-  !$OMP END DO
-
-  call gpu_blas_destroy(blas)
-
-  call gpu_deallocate(tmp_vov)
-  call gpu_deallocate(tmp_ovv)
-
-  !$OMP TASKWAIT
-  !$OMP END PARALLEL
-
-  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
-
-  call gpu_deallocate(tau_kau)
-end
-
-! H_vv
-
-subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
-         d_cc_space_v_ov_chol,H_vv)
-  use gpu
-  implicit none
-
-  integer, intent(in)              :: nO,nV
-  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
-  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
-  type(gpu_double4), intent(in)    :: tau_x
-  type(gpu_double2), intent(out)   :: H_vv
-
-  integer :: a,b,i,j,u,k, beta
-
-  type(gpu_double3) :: tau_kia, tmp_oov
-
-  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
-
-!  !$omp parallel &
-!  !$omp default(shared) &
-!  !$omp private(i,beta,j,k,a,b,tmp_oov)
-!  allocate(tmp_oov(nO,nO,nV) )
-!  !$omp do
-!  do a = 1, nV
-!    do b=1,nV
-!      do j=1,nO
-!        do i=1,nO
-!          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
-!        enddo
-!      enddo
-!    enddo
-!    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-!      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
-!      0.d0, tau_kia(1,1,a), cholesky_mo_num)
-!  enddo
-!  !$omp end do nowait
-!  deallocate(tmp_oov)
-
-!  !$omp do
-!  do beta = 1, nV
-!    do a = 1, nV
-!      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
-!    enddo
-!  enddo
-!  !$omp end do nowait
-!  !$omp barrier
-!  !$omp end  parallel
-!  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-!    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
-!    1.d0, H_vv%f, nV)
-
-  type(gpu_blas) :: blas
-
-
-  PROVIDE gpu_num
-  !$OMP PARALLEL &
-  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(a,b,tmp_oov,blas)
-
-  !$OMP SINGLE
-  !$OMP TASK
-  call gpu_copy(d_cc_space_f_vv, H_vv)
-  !$OMP END TASK
-  !$OMP END SINGLE
-
-  call gpu_blas_create(blas)
-  call gpu_allocate(tmp_oov, nO, nO, nV)
-
-  !$OMP DO
-  do a = 1, nV
-    do b=1,nV
-      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
-        tau_x%f(1,1,a,b), nO, 0.d0, &
-        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
-    enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
-      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
-  enddo
-  !$OMP END DO
-
-  call gpu_blas_destroy(blas)
-
-  call gpu_deallocate(tmp_oov)
-  !$OMP TASKWAIT
-  !$OMP END PARALLEL
-
-  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
-
-  call gpu_deallocate(tau_kia)
-end
-
-! H_vo
-subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(out) :: H_vo(nV, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:)
-  do i=1,nO
-    do a=1,nV
-      H_vo(a,i) = cc_space_f_vo(a,i)
-    enddo
-  enddo
-
-  allocate(tmp_k(cholesky_mo_num))
-  call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
-     cc_space_v_ov_chol, cholesky_mo_num, &
-     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
-
-  call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, &
-      cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
-      H_vo, nV*nO)
-  deallocate(tmp_k)
-
-  allocate(tmp(cholesky_mo_num,nO,nO))
-  allocate(tmp2(cholesky_mo_num,nO,nO))
-
-  call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
-
-  do i=1,nO
-    do j=1,nO
-      do k=1,cholesky_mo_num
-        tmp2(k,j,i) = tmp(k,i,j)
-      enddo
-    enddo
-  enddo
-  deallocate(tmp)
-
-  call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
-    1.d0, H_vo, nV)
-
-end
-
 
 ! R2
 

From 2bead959d0eee7790162df656e3781e4dcdedb7d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 2 Jul 2024 13:58:19 +0200
Subject: [PATCH 106/131] Fxied GPU interface for gfortran

---
 plugins/local/gpu_x86/gpu.c            |  90 ++++++++++----------
 src/ccsd/ccsd_space_orb_sub.irp.f      |  10 +--
 src/ccsd/ccsd_space_orb_sub_chol.irp.f |  46 +++++------
 src/gpu/gpu_module.F90                 | 109 +++++--------------------
 4 files changed, 94 insertions(+), 161 deletions(-)

diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
index dab23a25..fe3cadc5 100644
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
 /* Streams */
 
 void gpu_stream_create(void** ptr) {
-  *ptr = (void*) 2;
+  *ptr = (void*) malloc(sizeof(char));
 }
 
 void gpu_stream_destroy(void** ptr) {
+  free(*ptr);
   *ptr = NULL;
 }
 
@@ -68,11 +69,12 @@ void gpu_synchronize() {
 /* BLAS functions */
 
 void gpu_blas_create(void** handle) {
-  *handle = (void*) 1;
+  *handle = (void*) malloc(sizeof(char));
 }
 
 
 void gpu_blas_destroy(void** handle) {
+  free(*handle);
   *handle = NULL;
 }
 
@@ -122,7 +124,7 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx,
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
   assert (handle != NULL);
@@ -143,14 +145,14 @@ void gpu_dgemv(void* handle, const char transa, const int64_t m, const int64_t n
   assert ( (int64_t) incx_ == incx);
   assert ( (int64_t) incy_ == incy);
 
-  dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
 }
 
 
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
   assert (handle != NULL);
@@ -171,14 +173,14 @@ void gpu_sgemv(void* handle, const char transa, const int64_t m, const int64_t n
   assert ( (int64_t) incx_ == incx);
   assert ( (int64_t) incy_ == incy);
 
-  sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
 }
 
 
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
   assert (handle != NULL);
@@ -201,7 +203,7 @@ void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t
   assert ( (int64_t) ldb_ == ldb);
   assert ( (int64_t) ldc_ == ldc);
 
-  dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
 }
 
 
@@ -209,7 +211,7 @@ void gpu_dgemm(void* handle, const char transa, const char transb, const int64_t
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
   assert (handle != NULL);
@@ -232,18 +234,18 @@ void gpu_sgemm(void* handle, const char transa, const char transb, const int64_t
   assert ( (int64_t) ldb_ == ldb);
   assert ( (int64_t) ldc_ == ldc);
 
-  sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
 }
 
 
-void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
   assert (handle != NULL);
 
-  if ( (transa == 'N' && transb == 'N') ||
-       (transa == 'n' && transb == 'N') ||
-       (transa == 'N' && transb == 'n') ||
-       (transa == 'n' && transb == 'n') ) {
+  if ( (*transa == 'N' && *transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {
 
      if (alpha == 0.) {
 
@@ -271,10 +273,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t
 
      }
 
-  } else if ( (transa == 'N' && transb == 'T') ||
-              (transa == 'n' && transb == 'T') ||
-              (transa == 'N' && transb == 't') ||
-              (transa == 'n' && transb == 't') ) {
+  } else if ( (*transa == 'N' && *transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
+              (*transa == 'N' && *transb == 't') ||
+              (*transa == 'n' && *transb == 't') ) {
 
      if (alpha == 0.) {
 
@@ -302,10 +304,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t
 
      }
 
-  } else if ( (transa == 'T' && transb == 'N') ||
-              (transa == 't' && transb == 'N') ||
-              (transa == 'T' && transb == 'n') ||
-              (transa == 't' && transb == 'n') ) {
+  } else if ( (*transa == 'T' && *transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
+              (*transa == 'T' && *transb == 'n') ||
+              (*transa == 't' && *transb == 'n') ) {
 
      if (alpha == 0.) {
 
@@ -333,10 +335,10 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t
 
      }
 
-  } else if ( (transa == 'T' && transb == 'T') ||
-              (transa == 't' && transb == 'T') ||
-              (transa == 'T' && transb == 't') ||
-              (transa == 't' && transb == 't') ) {
+  } else if ( (*transa == 'T' && *transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
+              (*transa == 'T' && *transb == 't') ||
+              (*transa == 't' && *transb == 't') ) {
 
      if (alpha == 0.) {
 
@@ -368,14 +370,14 @@ void gpu_dgeam(void* handle, const char transa, const char transb, const int64_t
 }
 
 
-void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
   assert (handle != NULL);
 
-  if ( (transa == 'N' && transb == 'N') ||
-       (transa == 'n' && transb == 'N') ||
-       (transa == 'N' && transb == 'n') ||
-       (transa == 'n' && transb == 'n') ) {
+  if ( (*transa == 'N' && *transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {
 
      if (alpha == 0.) {
 
@@ -403,10 +405,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t
 
      }
 
-  } else if ( (transa == 'N' && transb == 'T') ||
-              (transa == 'n' && transb == 'T') ||
-              (transa == 'N' && transb == 't') ||
-              (transa == 'n' && transb == 't') ) {
+  } else if ( (*transa == 'N' && *transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
+              (*transa == 'N' && *transb == 't') ||
+              (*transa == 'n' && *transb == 't') ) {
 
      if (alpha == 0.) {
 
@@ -434,10 +436,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t
 
      }
 
-  } else if ( (transa == 'T' && transb == 'N') ||
-              (transa == 't' && transb == 'N') ||
-              (transa == 'T' && transb == 'n') ||
-              (transa == 't' && transb == 'n') ) {
+  } else if ( (*transa == 'T' && *transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
+              (*transa == 'T' && *transb == 'n') ||
+              (*transa == 't' && *transb == 'n') ) {
 
      if (alpha == 0.) {
 
@@ -465,10 +467,10 @@ void gpu_sgeam(void* handle, const char transa, const char transb, const int64_t
 
      }
 
-  } else if ( (transa == 'T' && transb == 'T') ||
-              (transa == 't' && transb == 'T') ||
-              (transa == 'T' && transb == 't') ||
-              (transa == 't' && transb == 't') ) {
+  } else if ( (*transa == 'T' && *transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
+              (*transa == 'T' && *transb == 't') ||
+              (*transa == 't' && *transb == 't') ) {
 
      if (alpha == 0.) {
 
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 13b974be..de109cea 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -319,10 +319,10 @@ subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,
   call gpu_stream_create(s2)
 
   call gpu_set_stream(blas_handle,s1)
-  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo, 1, t1, 1, e)
+  call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e)
 
   call gpu_set_stream(blas_handle,s2)
-  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x, 1_8, d_cc_space_v_oovv, 1_8, energy)
+  call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy)
   call gpu_set_stream(blas_handle,gpu_default_stream)
 
   call gpu_synchronize()
@@ -362,9 +362,9 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
     call gpu_stream_create(stream(b))
     call gpu_set_stream(blas_handle,stream(b))
     do j=1,nO
-      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nV, &
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, &
          1.d0, t2%f(1,j,1,b), nO*nO, &
-         h_t1(j,b), t1%f, nO, &
+         h_t1(j,b), t1%f(1,1), nO, &
          tau%f(1,j,1,b), nO*nO)
     enddo
   enddo
@@ -409,7 +409,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
   do b=1,nV
     do a=1,nV
       call gpu_set_stream(blas_handle,stream(a))
-      call gpu_dgeam_f(blas_handle, 'N', 'N', nO, nO, &
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, &
           2.d0, tau%f(1,1,a,b), nO, &
          -1.d0, tau%f(1,1,b,a), nO, &
          tau_x%f(1,1,a,b), nO)
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 5eb95a06..a3490589 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -37,15 +37,15 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
 
   !$OMP DO
   do u=1,nO
-    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
-           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
+    call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1)
     do b=1,nV
-      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
+      call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, &
            tmp_ovv%f(1,1,b), nO, 0.d0, &
            tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
     enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
+    call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, &
       0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
   enddo
   !$OMP END DO
@@ -59,8 +59,8 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
   !$OMP END PARALLEL
 
   call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
+    tau_kau%f(1,1,1), cholesky_mo_num*nV,  d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
+    1.d0, H_oo%f(1,1), nO)
 
   call gpu_synchronize()
   call gpu_deallocate(tau_kau)
@@ -103,12 +103,12 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
   !$OMP DO
   do a = 1, nV
     do b=1,nV
-      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
+      call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, &
         tau_x%f(1,1,a,b), nO, 0.d0, &
         tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
     enddo
-    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
+    call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, &
+      d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, &
       0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
   enddo
   !$OMP END DO
@@ -119,9 +119,9 @@ subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
   !$OMP TASKWAIT
   !$OMP END PARALLEL
 
-  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
+  call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
+    tau_kia%f(1,1,1), cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, &
+    1.d0, H_vv%f(1,1), nV)
 
   call gpu_synchronize()
   call gpu_deallocate(tau_kia)
@@ -148,20 +148,20 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
   call gpu_allocate(tmp_k, cholesky_mo_num)
 
   call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
-     d_cc_space_v_ov_chol, cholesky_mo_num, &
-     t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
+     d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, &
+     t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num)
 
-  call gpu_dgemm(blas_handle, 'T','N',nV*nO,1,cholesky_mo_num,1.d0, &
-      d_cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
-      H_vo, nV*nO)
+  call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, &
+      d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, &
+      H_vo%f(1,1), nV*nO)
 
   call gpu_deallocate(tmp_k)
 
 
   call gpu_allocate(tmp,  cholesky_mo_num, nO, nO)
 
-  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
-    d_cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
+  call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, &
+    d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO)
 
   call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
 
@@ -174,7 +174,7 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
   do i=1,nO
     do j=1,nO
       call gpu_set_stream(blas_handle,stream(j))
-      call gpu_dgeam_f(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
+      call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
         tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
         tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
     enddo
@@ -190,8 +190,8 @@ subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
   call gpu_deallocate(tmp)
 
   call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
-    d_cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
-    1.d0, H_vo, nV)
+    d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, &
+    1.d0, H_vo%f(1,1), nV)
 
   call gpu_synchronize()
   call gpu_deallocate(tmp2)
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 2676b339..20d99ede 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -156,17 +156,17 @@ module gpu
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_float), intent(in), value :: alpha, beta
-      type(c_ptr), value :: a, b, c
+      real(c_float) :: a, b, c
     end subroutine
 
     subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
       b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
       import
       type(c_ptr), value, intent(in)        :: handle
-      character(c_char), intent(in), value  :: transa, transb
+      character(c_char), intent(in)         :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
       real(c_double), intent(in), value :: alpha, beta
-      type(c_ptr), value :: a, b, c
+      real(c_double) :: a, b, c
     end subroutine
 
     subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
@@ -176,7 +176,7 @@ module gpu
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
       real(c_float), intent(in), value :: alpha, beta
-      type(c_ptr), value :: a, b, c
+      real(c_float) :: a, b, c
     end subroutine
 
   end interface
@@ -570,7 +570,6 @@ module gpu
     end subroutine
 
 
-end module
 
 
 
@@ -578,38 +577,20 @@ end module
 ! ---
 
 subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
-  use gpu
+!  use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*4                      :: n, incx, incy
-  type(gpu_double1), intent(in)  :: dx, dy
-  double precision, intent(out)  :: res
-  call gpu_ddot_c(handle%c, int(n,c_int64_t), dx%c, int(incx,c_int64_t), dy%c, int(incy,c_int64_t), res)
-end subroutine
-
-subroutine gpu_ddot_f(handle, n, dx, incx, dy, incy, res)
-  use gpu
-  type(gpu_blas), intent(in)     :: handle
-  integer*4                      :: n, incx, incy
-  double precision, target       :: dx(*), dy(*)
+  double precision, target       :: dx, dy
   double precision, intent(out)  :: res
   call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
 end subroutine
 
 
 subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
-  use gpu
+!  use gpu
   type(gpu_blas), intent(in)     :: handle
   integer*8                      :: n, incx, incy
-  type(gpu_double1), intent(in)  :: dx, dy
-  double precision, intent(out)  :: res
-  call gpu_ddot_c(handle%c, n, dx%c, incx, dy%c, incy, res)
-end subroutine
-
-subroutine gpu_ddot_f_64(handle, n, dx, incx, dy, incy, res)
-  use gpu
-  type(gpu_blas), intent(in)     :: handle
-  integer*8                      :: n, incx, incy
-  double precision, target       :: dx(*), dy(*)
+  double precision, target       :: dx, dy
   double precision, intent(out)  :: res
   call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
 end subroutine
@@ -620,25 +601,12 @@ end subroutine
 
 subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
   b, ldb, c, ldc)
-  use gpu
+!  use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
   integer*4, intent(in)        :: m, n, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  type(gpu_double2)            :: a, b, c
-  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
-        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
-end subroutine
-
-
-subroutine gpu_dgeam_f(handle, transa, transb, m, n, alpha, a, lda, beta, &
-  b, ldb, c, ldc)
-  use gpu
-  type(gpu_blas), intent(in)   :: handle
-  character, intent(in)        :: transa, transb
-  integer*4, intent(in)        :: m, n, lda, ldb, ldc
-  double precision, intent(in) :: alpha, beta
-  double precision, target     :: a(*), b(*), c(*)
+  double precision, target     :: a, b, c
   call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
         c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
 end subroutine
@@ -646,25 +614,12 @@ end subroutine
 
 subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
   b, ldb, c, ldc)
-  use gpu
+!  use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  type(gpu_double2)            :: a, b, c
-  call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a%c, int(lda,c_int64_t), beta, &
-        b%c, int(ldb,c_int64_t), c%c, int(ldc,c_int64_t))
-end subroutine
-
-
-subroutine gpu_dgeam_f_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
-  b, ldb, c, ldc)
-  use gpu
-  type(gpu_blas), intent(in)   :: handle
-  character, intent(in)        :: transa, transb
-  integer*8, intent(in)        :: m, n, lda, ldb, ldc
-  double precision, intent(in) :: alpha, beta
-  double precision, target     :: a(*), b(*), c(*)
+  double precision, target     :: a, b, c
   call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
         c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
 end subroutine
@@ -675,51 +630,27 @@ end subroutine
 
 subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
   b, ldb, beta, c, ldc)
-  use gpu
+!  use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
   integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  type(gpu_double2)            :: a, b, c
+  double precision             :: a, b, c
   call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
-        alpha, a%c, int(lda,c_int64_t), &
-        b%c, int(ldb,c_int64_t), beta, c%c, int(ldc,c_int64_t))
+        alpha, a, int(lda,c_int64_t), &
+        b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
 end subroutine
 
 subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
   b, ldb, beta, c, ldc)
-  use gpu
+!  use gpu
   type(gpu_blas), intent(in)   :: handle
   character, intent(in)        :: transa, transb
   integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
   double precision, intent(in) :: alpha, beta
-  type(gpu_double2)            :: a, b, c
-  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
-        alpha, a%c, lda, b%c, ldb, beta, c%c, ldc)
-end subroutine
-
-subroutine gpu_dgemm_f(handle, transa, transb, m, n, k, alpha, a, lda, &
-  b, ldb, beta, c, ldc)
-  use gpu
-  type(gpu_blas), intent(in)   :: handle
-  character, intent(in)        :: transa, transb
-  integer*4, intent(in)        :: m, n, k, lda, ldb, ldc
-  double precision, intent(in) :: alpha, beta
-  double precision, target     :: a(*), b(*), c(*)
+  double precision             :: a, b, c
   call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
-        alpha, c_loc(a), int(lda,c_int64_t), &
-        c_loc(b), int(ldb,c_int64_t), beta, c_loc(c), int(ldc,c_int64_t))
-end subroutine
-
-subroutine gpu_dgemm_f_64(handle, transa, transb, m, n, k, alpha, a, lda, &
-  b, ldb, beta, c, ldc)
-  use gpu
-  type(gpu_blas), intent(in)   :: handle
-  character, intent(in)        :: transa, transb
-  integer*8, intent(in)        :: m, n, k, lda, ldb, ldc
-  double precision, intent(in) :: alpha, beta
-  double precision, target     :: a(*), b(*), c(*)
-  call gpu_dgemm_c(handle%c, transa, transb, m, n, k, &
-        alpha, c_loc(a), lda, c_loc(b), ldb, beta, c_loc(c), ldc)
+        alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
 end subroutine
 
+end module

From 447cdcd907dd864252777423763ed6947efc32d8 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 2 Jul 2024 17:22:41 +0200
Subject: [PATCH 107/131] Working on r1

---
 plugins/local/gpu_nvidia/gpu.c         |  32 ++---
 src/ccsd/ccsd_space_orb_sub.irp.f      |  31 ++++-
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 176 ++++++++++++-------------
 src/gpu/gpu_module.F90                 |  57 +++++++-
 4 files changed, 182 insertions(+), 114 deletions(-)

diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
index 39a82984..e77847a6 100644
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -149,7 +149,7 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6
 
 
 
-void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
   assert (handle != NULL);
@@ -171,14 +171,14 @@ void gpu_dgemv(cublasHandle_t handle, const char transa, const int64_t m, const
   assert ( (int64_t) incy_ == incy);
 
   cublasOperation_t transa_ = CUBLAS_OP_N;
-  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
 
   cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
 }
 
 
 
-void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
   assert (handle != NULL);
@@ -200,13 +200,13 @@ void gpu_sgemv(cublasHandle_t handle, const char transa, const int64_t m, const
   assert ( (int64_t) incy_ == incy);
 
   cublasOperation_t transa_ = CUBLAS_OP_N;
-  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
 
   cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
 }
 
 
-void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
   assert (handle != NULL);
@@ -231,15 +231,15 @@ void gpu_dgemm(cublasHandle_t handle, const char transa, const char transb, cons
 
   cublasOperation_t transa_ = CUBLAS_OP_N;
   cublasOperation_t transb_ = CUBLAS_OP_N;
-  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
-  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
   cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
 }
 
 
 
-void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
   assert (handle != NULL);
@@ -264,14 +264,14 @@ void gpu_sgemm(cublasHandle_t handle, const char transa, const char transb, cons
 
   cublasOperation_t transa_ = CUBLAS_OP_N;
   cublasOperation_t transb_ = CUBLAS_OP_N;
-  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
-  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
   cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
 }
 
 
-void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
   assert (handle != NULL);
 
@@ -293,15 +293,15 @@ void gpu_dgeam(cublasHandle_t handle, const char transa, const char transb, cons
 
   cublasOperation_t transa_ = CUBLAS_OP_N;
   cublasOperation_t transb_ = CUBLAS_OP_N;
-  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
-  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
   cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
 
 }
 
 
-void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
   assert (handle != NULL);
 
@@ -323,8 +323,8 @@ void gpu_sgeam(cublasHandle_t handle, const char transa, const char transb, cons
 
   cublasOperation_t transa_ = CUBLAS_OP_N;
   cublasOperation_t transb_ = CUBLAS_OP_N;
-  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
-  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
   cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
 
diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index de109cea..256f743b 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -20,8 +20,8 @@ subroutine run_ccsd_space_orb
   type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
   type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
 
-  type(gpu_double4) :: d_cc_space_v_oovv
-
+  type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
+  type(gpu_double4) :: d_cc_space_v_oovo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -69,6 +69,7 @@ subroutine run_ccsd_space_orb
 
   call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
   call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
+  call gpu_upload(cc_space_f_ov, d_cc_space_f_ov)
   call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
 
 !  FREE cc_space_f_oo
@@ -92,6 +93,18 @@ subroutine run_ccsd_space_orb
 !    FREE cc_space_v_vv_chol
   endif
 
+  call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
+  call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
+  call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
+
+  call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
+  call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
+  call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
+
+!  FREE cc_space_v_voov
+!  FREE cc_space_v_ovov
+!  FREE cc_space_v_oovo
+
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
   call gpu_allocate(tau, nO,nO,nV,nV)
@@ -185,7 +198,8 @@ subroutine run_ccsd_space_orb
       call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
       call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
 
-      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
+      call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
+           d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
       call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
     else
       call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
@@ -292,8 +306,17 @@ subroutine run_ccsd_space_orb
     call gpu_deallocate(d_cc_space_v_vo_chol)
     call gpu_deallocate(d_cc_space_v_vv_chol)
   endif
-  call gpu_deallocate(d_cc_space_f_vo)
+
   call gpu_deallocate(d_cc_space_v_oovv)
+  call gpu_deallocate(d_cc_space_v_voov)
+  call gpu_deallocate(d_cc_space_v_ovov)
+  call gpu_deallocate(d_cc_space_v_oovo)
+
+  call gpu_deallocate(d_cc_space_f_oo)
+  call gpu_deallocate(d_cc_space_f_vo)
+  call gpu_deallocate(d_cc_space_f_ov)
+  call gpu_deallocate(d_cc_space_f_vv)
+
   call gpu_deallocate(t1)
   call gpu_deallocate(t2)
 
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index a3490589..6190e985 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -199,59 +199,52 @@ end
 
 ! R1
 
-subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-
+subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
+    d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
+  use gpu
   implicit none
 
   ! in
   integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
+  type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo
+  type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol
+  type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo
 
   ! out
-  double precision, intent(out) :: r1(nO,nV), max_r1
+  type(gpu_double2), intent(out) :: r1
+  double precision, intent(out)  :: max_r1
 
   ! internal
   integer                       :: u,i,j,beta,a,b
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r1,cc_space_f_ov) &
-  !$omp private(u,beta) &
-  !$omp default(none)
-  !$omp do
-  do beta = 1, nV
-    do u = 1, nO
-      r1(u,beta) = cc_space_f_ov(u,beta)
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
+  call gpu_copy(d_cc_space_f_ov, r1)
 
-  double precision, allocatable :: X_oo(:,:)
-  allocate(X_oo(nO,nO))
-  call dgemm('N','N', nO, nO, nV, &
-             -2d0, t1    , size(t1,1), &
-                   cc_space_f_vo, size(cc_space_f_vo,1), &
-              0d0, X_oo  , size(X_oo,1))
+  type(gpu_double2) :: X_oo
+  call gpu_allocate(X_oo,nO,nO)
 
-  call dgemm('T','N', nO, nV, nO, &
-             1d0, X_oo, size(X_oo,2), &
-                  t1  , size(t1,1), &
-             1d0, r1  , size(r1,1))
-  deallocate(X_oo)
+  call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, &
+             -2d0, t1%f(1,1), size(t1%f,1), &
+                   d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
+              0d0, X_oo%f(1,1), size(X_oo%f,1))
 
-  call dgemm('N','N', nO, nV, nV, &
-             1d0, t1  , size(t1,1), &
-                  H_vv, size(H_vv,1), &
-             1d0, r1  , size(r1,1))
+  call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, &
+             1d0, X_oo%f(1,1), size(X_oo%f,2), &
+                  t1%f(1,1)  , size(t1%f,1), &
+             1d0, r1%f(1,1)  , size(r1%f,1))
 
-  call dgemm('N','N', nO, nV, nO, &
-             -1d0, H_oo, size(H_oo,1), &
-                   t1  , size(t1,1), &
-              1d0, r1, size(r1,1))
+  call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, &
+             1d0, t1%f(1,1)  , size(t1%f,1), &
+                  H_vv%f(1,1), size(H_vv%f,1), &
+             1d0, r1%f(1,1)  , size(r1%f,1))
+
+  call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, &
+             -1d0, H_oo%f(1,1), size(H_oo%f,1), &
+                   t1%f(1,1)  , size(t1%f,1), &
+              1d0, r1%f(1,1), size(r1%f,1))
+
+  type(gpu_double4) :: X_voov
+  call gpu_allocate(X_voov, nV, nO, nO, nV)
 
-  double precision, allocatable :: X_voov(:,:,:,:)
-  allocate(X_voov(nV, nO, nO, nV))
 
   !$omp parallel &
   !$omp shared(nO,nV,X_voov,t2,t1) &
@@ -262,7 +255,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     do u = 1, nO
       do i = 1, nO
         do a = 1, nV
-          X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
+          X_voov%f(a,i,u,beta) = 2d0 * t2%f(i,u,a,beta) - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta)
         enddo
       enddo
     enddo
@@ -270,18 +263,20 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp end do
   !$omp end parallel
 
-  call dgemv('T', nV*nO, nO*nV, &
-             1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
-                  H_vo  , 1, &
-             1d0, r1    , 1)
+  call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, &
+             1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), &
+                  H_vo%f(1,1)  , 1, &
+             1d0, r1%f(1,1)    , 1)
 
-  deallocate(X_voov)
+  call gpu_synchronize()
+  call gpu_deallocate(X_oo)
+  call gpu_deallocate(X_voov)
 
-  double precision, allocatable :: X_ovov(:,:,:,:)
-  allocate(X_ovov(nO, nV, nO, nV))
+  type(gpu_double4) :: X_ovov
+  call gpu_allocate(X_ovov, nO, nV, nO, nV)
 
   !$omp parallel &
-  !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
+  !$omp shared(nO,nV,d_cc_space_v_ovov,d_cc_space_v_voov,X_ovov) &
   !$omp private(u,beta,i,a) &
   !$omp default(none)
   !$omp do
@@ -289,7 +284,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     do u = 1, nO
       do a = 1, nv
         do i = 1, nO
-          X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)
+          X_ovov%f(i,a,u,beta) = 2d0 * d_cc_space_v_voov%f(a,u,i,beta) - d_cc_space_v_ovov%f(u,a,i,beta)
         enddo
       enddo
     enddo
@@ -297,17 +292,25 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   !$omp end do
   !$omp end parallel
 
-  call dgemv('T', nO*nV, nO*nV, &
-             1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
-                  t1     , 1, &
-             1d0, r1     , 1)
-
-  deallocate(X_ovov)
+!  call dgemv('T', nO*nV, nO*nV, &
+!             1d0, X_ovov%f, size(X_ovov%f,1) * size(X_ovov%f,2), &
+!                  t1%f, 1, &
+!             1d0, r1%f, 1)
+  call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, &
+             1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
+                  t1%f(1,1), 1, &
+             1d0, r1%f(1,1), 1)
 
   integer :: iblock, block_size, nVmax
-  double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:)
+  type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo
+
   block_size = 16
-  allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO))
+  call gpu_allocate(W_vvov,nV, nV,nO,block_size)
+  call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size)
+  call gpu_allocate(T_vvoo, nV,nV,nO,nO)
+
+  call gpu_synchronize()
+  call gpu_deallocate(X_ovov)
 
   !$omp parallel &
   !$omp private(u,i,b,a) &
@@ -317,7 +320,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     do i = 1, nO
       do b = 1, nV
         do a = 1, nV
-          T_vvoo(a,b,i,u) = tau(i,u,a,b)
+          T_vvoo%f(a,b,i,u) = tau%f(i,u,a,b)
         enddo
       enddo
     enddo
@@ -328,11 +331,12 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   do iblock = 1, nV, block_size
     nVmax = min(block_size,nV-iblock+1)
 
-    call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
-      cc_space_v_vo_chol            , cholesky_mo_num, &
-      cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
-      0.d0, W_vvov_tmp, nV*nO)
+    call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
+      d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, &
+      d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, &
+      0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO)
 
+    call gpu_synchronize()
     !$omp parallel &
     !$omp private(b,i,a,beta) &
     !$omp default(shared)
@@ -341,7 +345,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
         !$omp do
         do b = 1, nV
           do a = 1, nV
-            W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta)
+            W_vvov%f(a,b,i,beta) = 2d0 * W_vvov_tmp%f(a,i,b,beta) - W_vvov_tmp%f(b,i,a,beta)
           enddo
         enddo
         !$omp end do nowait
@@ -350,20 +354,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     !$omp barrier
     !$omp end parallel
 
-    call dgemm('T','N',nO,nVmax,nO*nV*nV, &
-             1d0, T_vvoo, nV*nV*nO, &
-                  W_vvov, nO*nV*nV, &
-             1d0, r1(1,iblock), nO)
+    call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, &
+             1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, &
+                  W_vvov%f(1,1,1,1), nO*nV*nV, &
+             1d0, r1%f(1,iblock), nO)
   enddo
 
-  deallocate(W_vvov,T_vvoo)
+  call gpu_synchronize()
+  call gpu_deallocate(W_vvov)
+  call gpu_deallocate(T_vvoo)
 
 
-  double precision, allocatable :: W_oovo(:,:,:,:)
-  allocate(W_oovo(nO,nO,nV,nO))
+  type(gpu_double4) :: W_oovo
+  call gpu_allocate(W_oovo, nO,nO,nV,nO)
 
   !$omp parallel &
-  !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) &
+  !$omp shared(nO,nV,d_cc_space_v_oovo,W_oovo) &
   !$omp private(u,a,i,j) &
   !$omp default(none)
   do u = 1, nO
@@ -371,8 +377,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
-!          W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)
-          W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u)
+          W_oovo%f(i,j,a,u) = 2d0 * d_cc_space_v_oovo%f(i,j,a,u) - d_cc_space_v_oovo%f(j,i,a,u)
         enddo
       enddo
     enddo
@@ -380,33 +385,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   enddo
   !$omp end parallel
 
-  call dgemm('T','N', nO, nV, nO*nO*nV, &
-             -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
-                   tau   , size(tau,1) * size(tau,2) * size(tau,3), &
-              1d0, r1    , size(r1,1))
+  ! Change the sign for consistency with the code in spin orbitals
+  call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, &
+              1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), &
+                   tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), &
+             -1d0, r1%f(1,1), size(r1%f,1))
 
-  deallocate(W_oovo)
+  call gpu_synchronize()
+  call gpu_deallocate(W_oovo)
 
   max_r1 = 0d0
   do a = 1, nV
     do i = 1, nO
-      max_r1 = max(dabs(r1(i,a)), max_r1)
+      max_r1 = max(dabs(r1%f(i,a)), max_r1)
     enddo
   enddo
 
-  ! Change the sign for consistency with the code in spin orbitals
-  !$omp parallel &
-  !$omp shared(nO,nV,r1) &
-  !$omp private(a,i) &
-  !$omp default(none)
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      r1(i,a) = -r1(i,a)
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
 
 end
 
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 20d99ede..949ae4fc 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -136,7 +136,7 @@ module gpu
       type(c_ptr), value, intent(in) :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
-      real(c_float), intent(out)  :: res
+      real(c_float), intent(out)     :: res
     end subroutine
 
     subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
@@ -145,7 +145,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
-      real(c_double), intent(in), value :: alpha, beta
+      real(c_double), intent(in), value      :: alpha, beta
       type(c_ptr), value :: a, b, c
     end subroutine
 
@@ -155,10 +155,31 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
-      real(c_float), intent(in), value :: alpha, beta
+      real(c_float), intent(in), value      :: alpha, beta
       real(c_float) :: a, b, c
     end subroutine
 
+    subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
+      x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in)         :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_double), intent(in), value     :: alpha, beta
+      real(c_double)                        :: a, x, y
+    end subroutine
+
+    subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
+      x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
+      import
+      type(c_ptr), value, intent(in)        :: handle
+      character(c_char), intent(in)         :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_float), intent(in), value      :: alpha, beta
+      real(c_float)                         :: a, x, y
+    end subroutine
+
+
     subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
       b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
       import
@@ -625,6 +646,36 @@ subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
 end subroutine
 
 
+! gemv
+! ----
+
+subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, &
+  x, incx, beta, y, incy)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa
+  integer*4, intent(in)        :: m, n, lda, incx, incy
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a, x, y
+  call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
+        alpha, a, int(lda,c_int64_t), &
+        x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
+end subroutine
+
+subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, &
+  x, incx, beta, y, incy)
+!  use gpu
+  type(gpu_blas), intent(in)   :: handle
+  character, intent(in)        :: transa
+  integer*8, intent(in)        :: m, n, lda, incx, incy
+  double precision, intent(in) :: alpha, beta
+  double precision             :: a, x, y
+  call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
+        alpha, a, int(lda,c_int64_t), &
+        x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
+end subroutine
+
+
 ! gemm
 ! ----
 

From 92fe3a6f84b0af99bf554602528969699c206cde Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 2 Jul 2024 18:36:19 +0200
Subject: [PATCH 108/131] Working on r1

---
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 37 ++++++++++++++++++++------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 6190e985..e0048637 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -245,23 +245,44 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
   type(gpu_double4) :: X_voov
   call gpu_allocate(X_voov, nV, nO, nO, nV)
 
+  type(gpu_stream) :: stream(nV)
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
+
+  call gpu_synchronize()
+
+!  do i=1,nO
+!    do beta=1,nV
+!      call gpu_set_stream(blas_handle, stream(beta))
+!      call gpu_dgeam(blas_handle, 'T', 'T', nV, nO,  -1.d0, t2%f(1,i,1,beta), &
+!         nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV)
+!    enddo
+!  enddo
 
-  !$omp parallel &
-  !$omp shared(nO,nV,X_voov,t2,t1) &
-  !$omp private(u,beta,i,a) &
-  !$omp default(none)
-  !$omp do
   do beta = 1, nV
     do u = 1, nO
       do i = 1, nO
         do a = 1, nV
-          X_voov%f(a,i,u,beta) = 2d0 * t2%f(i,u,a,beta) - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta)
+          X_voov%f(a,i,u,beta) = - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta)
         enddo
       enddo
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
+  call gpu_synchronize()
+
+  do beta=1,nV
+    call gpu_set_stream(blas_handle, stream(beta))
+    call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO,  1.d0, X_voov%f(1,1,1,beta), &
+         nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV)
+  enddo
+
+  call gpu_synchronize()
+  do a=1,nV
+    call gpu_stream_destroy(stream(a))
+  enddo
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
   call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, &
              1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), &

From cc09f8c61a0e8a29e7d2a2933d9659c1ad70a7b5 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 3 Jul 2024 14:52:11 +0200
Subject: [PATCH 109/131] Minor changes in Cholesky

---
 src/ao_two_e_ints/cholesky.irp.f      | 6 +++---
 src/mo_two_e_ints/map_integrals.irp.f | 2 +-
 src/tools/four_idx_transform.irp.f    | 3 +++
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index acb0872b..a2d9d043 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -158,9 +158,9 @@ END_PROVIDER
          Lset(np8) = p8
        endif
      enddo
-     np = np8
+     if (np8 > ndim8) stop 'np>ndim8'
+     np = int(np8,4)
      if (np <= 0) stop 'np<=0'
-     if (np > ndim8) stop 'np>ndim8'
 
      rank_max = min(np,20*elec_num*elec_num)
      call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
@@ -431,7 +431,7 @@ END_PROVIDER
            Lset(np8) = p8
          endif
        enddo
-       np = np8
+       np = int(np8,4)
 
      enddo
 
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 168c34b4..eeb4279f 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -40,7 +40,7 @@ end
  ! Min and max values of the MOs for which the integrals are in the cache
  END_DOC
 
- mo_integrals_cache_size  = 2_8**mo_integrals_cache_shift
+ mo_integrals_cache_size  = 2**mo_integrals_cache_shift
 
  mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
  mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f
index 92e87cad..fc6bface 100644
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@@ -12,6 +12,9 @@ program four_idx_transform
 !
   END_DOC
 
+  if (do_mo_cholesky) then
+    stop 'Not implemented with Cholesky integrals'
+  endif
   io_mo_two_e_integrals = 'Write'
   SOFT_TOUCH io_mo_two_e_integrals
   if (.true.) then

From 2f8e7bd4f79108476bcc1b04165912d629d7d924 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 3 Jul 2024 15:32:38 +0200
Subject: [PATCH 110/131] Updated to read CHolesky MO integrals from TREXIO

---
 src/trexio/export_trexio_routines.irp.f  |   2 +-
 src/trexio/import_trexio_integrals.irp.f | 226 ++++++++++++++++-------
 2 files changed, 159 insertions(+), 69 deletions(-)

diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f
index 63630243..0eec68bd 100644
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path)
     do k=1,cholesky_ao_num
      do j=1,mo_num
       do i=1,mo_num
-         integral = cholesky_mo(i,j,k)
+         integral = cholesky_mo_transp(k,i,j)
          if (integral == 0.d0) cycle
          icount += 1_8
          chol_buffer(icount) = integral
diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f
index 5a6b3c03..556ed7bc 100644
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@@ -28,7 +28,7 @@ subroutine run(f)
   integer(trexio_t), intent(in)  :: f ! TREXIO file handle
   integer(trexio_exit_code)      :: rc
 
-  integer ::i,j,k,l
+  integer :: i,j,k,l, iunit
   integer(8) :: m, n_integrals
   double precision :: integral
 
@@ -41,10 +41,12 @@ subroutine run(f)
   integer         , allocatable :: Vi(:,:)
   double precision              :: s
 
-! TODO:
-! - If Cholesky AO in trexio file, read cholesky ao vectors
-! - If Cholesky MO in trexio file, read cholesky mo vectors
-! - If Cholesky MO not in trexio file, force do_cholesky_mo to False
+  integer*4 :: BUFSIZE
+  integer :: rank
+  double precision, allocatable :: tmp(:,:,:)
+  integer*8 :: offset, icount
+
+  integer, external :: getUnitAndOpen
 
   if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then
     rc = trexio_read_nucleus_repulsion(f, s)
@@ -120,45 +122,88 @@ subroutine run(f)
   rc = trexio_has_ao_2e_int(f)
   PROVIDE ao_num
   if (rc /= TREXIO_HAS_NOT) then
-      PROVIDE ao_integrals_map
 
-      integer*4 :: BUFSIZE
-      BUFSIZE=ao_num**2
-      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
-      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+      rc = trexio_has_ao_2e_int_eri_cholesky(f)
+      if (rc /= TREXIO_HAS_NOT) then
 
-      integer*8 :: offset, icount
+          rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank)
+          call trexio_assert(rc, TREXIO_SUCCESS)
 
-      offset = 0_8
-      icount = BUFSIZE
-      rc = TREXIO_SUCCESS
-      do while (icount == size(V))
-        rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V)
-        do m=1,icount
-          i = Vi(1,m)
-          j = Vi(2,m)
-          k = Vi(3,m)
-          l = Vi(4,m)
-          integral = V(m)
-          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
-          buffer_values(m) = integral
-        enddo
-        call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values)
-        offset = offset + icount
-        if (rc /= TREXIO_SUCCESS) then
-            exit
-        endif
-      end do
-      n_integrals = offset
+          allocate(tmp(ao_num,ao_num,rank))
+          tmp(:,:,:) = 0.d0
 
-      call map_sort(ao_integrals_map)
-      call map_unique(ao_integrals_map)
+          BUFSIZE=ao_num**2
+          allocate(Vi(3,BUFSIZE), V(BUFSIZE))
 
-      call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
-      call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
 
-      deallocate(buffer_i, buffer_values, Vi, V)
-      print *, 'AO integrals read from TREXIO file'
+          offset = 0_8
+          icount = BUFSIZE
+          rc = TREXIO_SUCCESS
+          do while (icount == size(V))
+            rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V)
+            do m=1,icount
+              i = Vi(1,m)
+              j = Vi(2,m)
+              k = Vi(3,m)
+              integral = V(m)
+              tmp(i,j,k) = integral
+            enddo
+            offset = offset + icount
+            if (rc /= TREXIO_SUCCESS) then
+                exit
+            endif
+          end do
+
+          print *,  'Writing Cholesky AO vectors to disk...'
+          iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W')
+          write(iunit) rank
+          write(iunit) tmp(:,:,:)
+          close(iunit)
+          call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read')
+
+          deallocate(Vi, V, tmp)
+          print *, 'Cholesky AO integrals read from TREXIO file'
+      endif
+
+      rc = trexio_has_ao_2e_int_eri(f)
+      if (rc /= TREXIO_HAS_NOT) then
+          PROVIDE ao_integrals_map
+
+          BUFSIZE=ao_num**2
+          allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
+          allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+
+          offset = 0_8
+          icount = BUFSIZE
+          rc = TREXIO_SUCCESS
+          do while (icount == size(V))
+            rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V)
+            do m=1,icount
+              i = Vi(1,m)
+              j = Vi(2,m)
+              k = Vi(3,m)
+              l = Vi(4,m)
+              integral = V(m)
+              call two_e_integrals_index(i, j, k, l, buffer_i(m) )
+              buffer_values(m) = integral
+            enddo
+            call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values)
+            offset = offset + icount
+            if (rc /= TREXIO_SUCCESS) then
+                exit
+            endif
+          end do
+          n_integrals = offset
+
+          call map_sort(ao_integrals_map)
+          call map_unique(ao_integrals_map)
+
+          call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map)
+          call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read')
+
+          deallocate(buffer_i, buffer_values, Vi, V)
+          print *, 'AO integrals read from TREXIO file'
+      endif
   else
       print *, 'AO integrals not found in TREXIO file'
   endif
@@ -186,40 +231,85 @@ subroutine run(f)
   rc = trexio_has_mo_2e_int(f)
   if (rc /= TREXIO_HAS_NOT) then
 
-      BUFSIZE=mo_num**2
-      allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
-      allocate(Vi(4,BUFSIZE), V(BUFSIZE))
+      rc = trexio_has_mo_2e_int_eri_cholesky(f)
+      if (rc /= TREXIO_HAS_NOT) then
+
+          rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank)
+          call trexio_assert(rc, TREXIO_SUCCESS)
+
+          allocate(tmp(rank,mo_num,mo_num))
+          tmp(:,:,:) = 0.d0
+
+          BUFSIZE=mo_num**2
+          allocate(Vi(3,BUFSIZE), V(BUFSIZE))
+
+          offset = 0_8
+          icount = BUFSIZE
+          rc = TREXIO_SUCCESS
+          do while (icount == size(V))
+            rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V)
+            do m=1,icount
+              i = Vi(1,m)
+              j = Vi(2,m)
+              k = Vi(3,m)
+              integral = V(m)
+              tmp(k,i,j) = integral
+            enddo
+            offset = offset + icount
+            if (rc /= TREXIO_SUCCESS) then
+                exit
+            endif
+          end do
+
+          print *,  'Writing Cholesky MO vectors to disk...'
+          iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W')
+          write(iunit) rank
+          write(iunit) tmp(:,:,:)
+          close(iunit)
+          call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read')
+
+          deallocate(Vi, V, tmp)
+          print *, 'Cholesky MO integrals read from TREXIO file'
+      endif
+
+      rc = trexio_has_mo_2e_int_eri(f)
+      if (rc /= TREXIO_HAS_NOT) then
+          BUFSIZE=mo_num**2
+          allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE))
+          allocate(Vi(4,BUFSIZE), V(BUFSIZE))
 
 
-      offset = 0_8
-      icount = BUFSIZE
-      rc = TREXIO_SUCCESS
-      do while (icount == size(V))
-        rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V)
-        do m=1,icount
-          i = Vi(1,m)
-          j = Vi(2,m)
-          k = Vi(3,m)
-          l = Vi(4,m)
-          integral = V(m)
-          call two_e_integrals_index(i, j, k, l, buffer_i(m) )
-          buffer_values(m) = integral
-        enddo
-        call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4))
-        offset = offset + icount
-        if (rc /= TREXIO_SUCCESS) then
-            exit
-        endif
-      end do
-      n_integrals = offset
+          offset = 0_8
+          icount = BUFSIZE
+          rc = TREXIO_SUCCESS
+          do while (icount == size(V))
+            rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V)
+            do m=1,icount
+              i = Vi(1,m)
+              j = Vi(2,m)
+              k = Vi(3,m)
+              l = Vi(4,m)
+              integral = V(m)
+              call two_e_integrals_index(i, j, k, l, buffer_i(m) )
+              buffer_values(m) = integral
+            enddo
+            call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4))
+            offset = offset + icount
+            if (rc /= TREXIO_SUCCESS) then
+                exit
+            endif
+          end do
+          n_integrals = offset
 
-      call map_sort(mo_integrals_map)
-      call map_unique(mo_integrals_map)
+          call map_sort(mo_integrals_map)
+          call map_unique(mo_integrals_map)
+
+          call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map)
+          call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
+          deallocate(buffer_i, buffer_values, Vi, V)
+          print *, 'MO integrals read from TREXIO file'
+      endif
 
-      call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map)
-      call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read')
-      deallocate(buffer_i, buffer_values, Vi, V)
-      print *, 'MO integrals read from TREXIO file'
   else
       print *, 'MO integrals not found in TREXIO file'
   endif

From 7ceb8fdcca5cd7bff3984c816eef8e47aa681a0b Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 3 Jul 2024 18:24:13 +0200
Subject: [PATCH 111/131] Finished r1

---
 plugins/local/gpu_nvidia/gpu.c         |  41 +++---
 plugins/local/gpu_x86/gpu.c            | 112 ++++++++--------
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 177 ++++++++++---------------
 src/gpu/gpu.h                          |  24 ++--
 src/gpu/gpu_module.F90                 |  12 +-
 5 files changed, 162 insertions(+), 204 deletions(-)

diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
index e77847a6..a775ab95 100644
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -116,11 +116,6 @@ void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int
   assert ( (int64_t) incy_ == incy);
 
   cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
-/*
-  double alpha = 1.0;
-  double beta = 0.0;
-  cublasStatus_t rc = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, n_, &alpha, x, 1, y, n_, &beta, &result_, 1);
-*/
   assert (rc == CUBLAS_STATUS_SUCCESS);
 }
 
@@ -149,8 +144,8 @@ void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int6
 
 
 
-void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
 
   assert (handle != NULL);
 
@@ -173,13 +168,13 @@ void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const
   cublasOperation_t transa_ = CUBLAS_OP_N;
   if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
 
-  cublasDgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
 }
 
 
 
-void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
 
   assert (handle != NULL);
 
@@ -202,12 +197,12 @@ void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const
   cublasOperation_t transa_ = CUBLAS_OP_N;
   if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
 
-  cublasSgemv(handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+  cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
 }
 
 
-void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
 
   assert (handle != NULL);
 
@@ -234,13 +229,13 @@ void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, co
   if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
   if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasDgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
 }
 
 
 
-void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
 
   assert (handle != NULL);
 
@@ -267,12 +262,12 @@ void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, co
   if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
   if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasSgemm(handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+  cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
 }
 
 
-void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
   assert (handle != NULL);
 
   /* Convert to int */
@@ -296,13 +291,13 @@ void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, co
   if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
   if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasDgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
 
 }
 
 
-void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
   assert (handle != NULL);
 
   /* Convert to int */
@@ -326,6 +321,6 @@ void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, co
   if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
   if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
 
-  cublasSgeam(handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+  cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
 
 }
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c
index fe3cadc5..49aec9d3 100644
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@@ -124,8 +124,8 @@ void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx,
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
 
   assert (handle != NULL);
 
@@ -145,15 +145,15 @@ void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t
   assert ( (int64_t) incx_ == incx);
   assert ( (int64_t) incy_ == incy);
 
-  dgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }
 
 
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
 
   assert (handle != NULL);
 
@@ -173,15 +173,15 @@ void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t
   assert ( (int64_t) incx_ == incx);
   assert ( (int64_t) incy_ == incy);
 
-  sgemv_(transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
+  sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
 }
 
 
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
 
   assert (handle != NULL);
 
@@ -203,7 +203,7 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64
   assert ( (int64_t) ldb_ == ldb);
   assert ( (int64_t) ldc_ == ldc);
 
-  dgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }
 
 
@@ -211,8 +211,8 @@ void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
 
   assert (handle != NULL);
 
@@ -234,12 +234,12 @@ void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64
   assert ( (int64_t) ldb_ == ldb);
   assert ( (int64_t) ldc_ == ldc);
 
-  sgemm_(transa, transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
+  sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
 }
 
 
-void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
   assert (handle != NULL);
 
   if ( (*transa == 'N' && *transb == 'N') ||
@@ -247,19 +247,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
        (*transa == 'N' && *transb == 'n') ||
        (*transa == 'n' && *transb == 'n') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
          }
        }
 
@@ -267,7 +267,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
          }
        }
 
@@ -278,19 +278,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
               (*transa == 'N' && *transb == 't') ||
               (*transa == 'n' && *transb == 't') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
          }
        }
 
@@ -298,7 +298,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
          }
        }
 
@@ -309,19 +309,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
               (*transa == 'T' && *transb == 'n') ||
               (*transa == 't' && *transb == 'n') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
          }
        }
 
@@ -329,7 +329,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
          }
        }
 
@@ -340,19 +340,19 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
               (*transa == 'T' && *transb == 't') ||
               (*transa == 't' && *transb == 't') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
          }
        }
 
@@ -360,7 +360,7 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
          }
        }
 
@@ -370,8 +370,8 @@ void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64
 }
 
 
-void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
   assert (handle != NULL);
 
   if ( (*transa == 'N' && *transb == 'N') ||
@@ -379,19 +379,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
        (*transa == 'N' && *transb == 'n') ||
        (*transa == 'n' && *transb == 'n') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
          }
        }
 
@@ -399,7 +399,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
          }
        }
 
@@ -410,19 +410,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
               (*transa == 'N' && *transb == 't') ||
               (*transa == 'n' && *transb == 't') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i];
+           c[j*ldc+i] = *alpha * a[j*lda+i];
          }
        }
 
@@ -430,7 +430,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
          }
        }
 
@@ -441,19 +441,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
               (*transa == 'T' && *transb == 'n') ||
               (*transa == 't' && *transb == 'n') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[j*ldb+i];
+           c[j*ldc+i] = *beta * b[j*ldb+i];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
          }
        }
 
@@ -461,7 +461,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
          }
        }
 
@@ -472,19 +472,19 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
               (*transa == 'T' && *transb == 't') ||
               (*transa == 't' && *transb == 't') ) {
 
-     if (alpha == 0.) {
+     if (*alpha == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = beta * b[i*ldb+j];
+           c[j*ldc+i] = *beta * b[i*ldb+j];
          }
        }
 
-     } else if (beta == 0.) {
+     } else if (*beta == 0.) {
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j];
          }
        }
 
@@ -492,7 +492,7 @@ void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64
 
        for (int64_t j=0 ; j<n ; ++j) {
          for (int64_t i=0 ; i<m ; ++i) {
-           c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
          }
        }
 
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index e0048637..c34b390b 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -217,21 +217,18 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
   ! internal
   integer                       :: u,i,j,beta,a,b
 
-  call gpu_copy(d_cc_space_f_ov, r1)
+  type(gpu_stream) :: stream(nV)
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
 
   type(gpu_double2) :: X_oo
   call gpu_allocate(X_oo,nO,nO)
 
-  call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, &
-             -2d0, t1%f(1,1), size(t1%f,1), &
-                   d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
-              0d0, X_oo%f(1,1), size(X_oo%f,1))
-
-  call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, &
-             1d0, X_oo%f(1,1), size(X_oo%f,2), &
-                  t1%f(1,1)  , size(t1%f,1), &
-             1d0, r1%f(1,1)  , size(r1%f,1))
+  call gpu_copy(d_cc_space_f_ov, r1)
 
+  call gpu_set_stream(blas_handle, stream(1))
   call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, &
              1d0, t1%f(1,1)  , size(t1%f,1), &
                   H_vv%f(1,1), size(H_vv%f,1), &
@@ -242,35 +239,32 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
                    t1%f(1,1)  , size(t1%f,1), &
               1d0, r1%f(1,1), size(r1%f,1))
 
+  call gpu_set_stream(blas_handle, stream(nV))
+  call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, &
+             -2d0, t1%f(1,1), size(t1%f,1), &
+                   d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
+              0d0, X_oo%f(1,1), size(X_oo%f,1))
+
+  call gpu_synchronize()
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, &
+             1d0, X_oo%f(1,1), size(X_oo%f,2), &
+                  t1%f(1,1)  , size(t1%f,1), &
+             1d0, r1%f(1,1)  , size(r1%f,1))
+
+
+
   type(gpu_double4) :: X_voov
   call gpu_allocate(X_voov, nV, nO, nO, nV)
 
-  type(gpu_stream) :: stream(nV)
-
-  do a=1,nV
-    call gpu_stream_create(stream(a))
-  enddo
-
-  call gpu_synchronize()
-
-!  do i=1,nO
-!    do beta=1,nV
-!      call gpu_set_stream(blas_handle, stream(beta))
-!      call gpu_dgeam(blas_handle, 'T', 'T', nV, nO,  -1.d0, t2%f(1,i,1,beta), &
-!         nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV)
-!    enddo
-!  enddo
-
-  do beta = 1, nV
-    do u = 1, nO
-      do i = 1, nO
-        do a = 1, nV
-          X_voov%f(a,i,u,beta) = - t2%f(u,i,a,beta) + t1%f(u,a) * t1%f(i,beta)
-        enddo
-      enddo
+  do i=1,nO
+    do beta=1,nV
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'T', 'T', nV, nO,  -1.d0, t2%f(1,i,1,beta), &
+         nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO)
     enddo
   enddo
-  call gpu_synchronize()
 
   do beta=1,nV
     call gpu_set_stream(blas_handle, stream(beta))
@@ -279,9 +273,8 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
   enddo
 
   call gpu_synchronize()
-  do a=1,nV
-    call gpu_stream_destroy(stream(a))
-  enddo
+  call gpu_deallocate(X_oo)
+
   call gpu_set_stream(blas_handle, gpu_default_stream)
 
   call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, &
@@ -289,65 +282,38 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
                   H_vo%f(1,1)  , 1, &
              1d0, r1%f(1,1)    , 1)
 
-  call gpu_synchronize()
-  call gpu_deallocate(X_oo)
-  call gpu_deallocate(X_voov)
-
   type(gpu_double4) :: X_ovov
   call gpu_allocate(X_ovov, nO, nV, nO, nV)
 
-  !$omp parallel &
-  !$omp shared(nO,nV,d_cc_space_v_ovov,d_cc_space_v_voov,X_ovov) &
-  !$omp private(u,beta,i,a) &
-  !$omp default(none)
-  !$omp do
   do beta = 1, nV
-    do u = 1, nO
-      do a = 1, nv
-        do i = 1, nO
-          X_ovov%f(i,a,u,beta) = 2d0 * d_cc_space_v_voov%f(a,u,i,beta) - d_cc_space_v_ovov%f(u,a,i,beta)
-        enddo
-      enddo
+    call gpu_set_stream(blas_handle, stream(beta))
+    do u=1,nO
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), &
+      nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
 
-!  call dgemv('T', nO*nV, nO*nV, &
-!             1d0, X_ovov%f, size(X_ovov%f,1) * size(X_ovov%f,2), &
-!                  t1%f, 1, &
-!             1d0, r1%f, 1)
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+  call gpu_synchronize()
+  call gpu_deallocate(X_voov)
+
   call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, &
              1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
                   t1%f(1,1), 1, &
              1d0, r1%f(1,1), 1)
 
+
   integer :: iblock, block_size, nVmax
   type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo
 
   block_size = 16
-  call gpu_allocate(W_vvov,nV, nV,nO,block_size)
-  call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size)
   call gpu_allocate(T_vvoo, nV,nV,nO,nO)
 
-  call gpu_synchronize()
-  call gpu_deallocate(X_ovov)
+  call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), &
+    nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV)
 
-  !$omp parallel &
-  !$omp private(u,i,b,a) &
-  !$omp default(shared)
-  !$omp do
-  do u = 1, nO
-    do i = 1, nO
-      do b = 1, nV
-        do a = 1, nV
-          T_vvoo%f(a,b,i,u) = tau%f(i,u,a,b)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
+  call gpu_allocate(W_vvov,nV, nV,nO,block_size)
+  call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size)
 
   do iblock = 1, nV, block_size
     nVmax = min(block_size,nV-iblock+1)
@@ -358,22 +324,22 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
       0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO)
 
     call gpu_synchronize()
-    !$omp parallel &
-    !$omp private(b,i,a,beta) &
-    !$omp default(shared)
-    do beta = 1,  nVmax
-      do i = 1, nO
-        !$omp do
-        do b = 1, nV
-          do a = 1, nV
-            W_vvov%f(a,b,i,beta) = 2d0 * W_vvov_tmp%f(a,i,b,beta) - W_vvov_tmp%f(b,i,a,beta)
-          enddo
-        enddo
-        !$omp end do nowait
+    do b=1,nV
+      call gpu_set_stream(blas_handle, stream(b))
+      do i=1,nO
+        call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax,  2.d0, W_vvov_tmp%f(1,i,b,1), &
+         nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO)
       enddo
     enddo
-    !$omp barrier
-    !$omp end parallel
+
+    call gpu_synchronize()
+
+    do beta = 1,  nVmax
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO,  1.d0, W_vvov%f(1,1,1,beta), &
+         nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV)
+    enddo
+    call gpu_synchronize()
 
     call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, &
              1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, &
@@ -381,30 +347,24 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
              1d0, r1%f(1,iblock), nO)
   enddo
 
-  call gpu_synchronize()
-  call gpu_deallocate(W_vvov)
-  call gpu_deallocate(T_vvoo)
-
+  call gpu_deallocate(X_ovov)
 
   type(gpu_double4) :: W_oovo
   call gpu_allocate(W_oovo, nO,nO,nV,nO)
 
-  !$omp parallel &
-  !$omp shared(nO,nV,d_cc_space_v_oovo,W_oovo) &
-  !$omp private(u,a,i,j) &
-  !$omp default(none)
   do u = 1, nO
-    !$omp do
     do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          W_oovo%f(i,j,a,u) = 2d0 * d_cc_space_v_oovo%f(i,j,a,u) - d_cc_space_v_oovo%f(j,i,a,u)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle, stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nO,  2.d0, d_cc_space_v_oovo%f(1,1,a,u), &
+         nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO)
     enddo
-    !$omp end do nowait
   enddo
-  !$omp end parallel
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+  call gpu_synchronize()
+
+  call gpu_deallocate(W_vvov)
+  call gpu_deallocate(T_vvoo)
 
   ! Change the sign for consistency with the code in spin orbitals
   call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, &
@@ -422,6 +382,9 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_s
     enddo
   enddo
 
+  do a=1,nV
+    call gpu_stream_destroy(stream(a))
+  enddo
 
 end
 
diff --git a/src/gpu/gpu.h b/src/gpu/gpu.h
index 03aeeab7..ac70e21a 100644
--- a/src/gpu/gpu.h
+++ b/src/gpu/gpu.h
@@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
 
 void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
 
-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
+void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy);
 
-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
+void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy);
 
-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
-               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
+void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc);
 
-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
-               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
+void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc);
 
-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
-               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
+void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
 
-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
-               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
+void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 949ae4fc..6050075f 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -145,7 +145,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
-      real(c_double), intent(in), value      :: alpha, beta
+      real(c_double), intent(in)            :: alpha, beta
       type(c_ptr), value :: a, b, c
     end subroutine
 
@@ -155,7 +155,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
-      real(c_float), intent(in), value      :: alpha, beta
+      real(c_float), intent(in)             :: alpha, beta
       real(c_float) :: a, b, c
     end subroutine
 
@@ -165,7 +165,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in)         :: transa
       integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
-      real(c_double), intent(in), value     :: alpha, beta
+      real(c_double), intent(in)            :: alpha, beta
       real(c_double)                        :: a, x, y
     end subroutine
 
@@ -175,7 +175,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in)         :: transa
       integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
-      real(c_float), intent(in), value      :: alpha, beta
+      real(c_float), intent(in)             :: alpha, beta
       real(c_float)                         :: a, x, y
     end subroutine
 
@@ -186,7 +186,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in)         :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
-      real(c_double), intent(in), value :: alpha, beta
+      real(c_double), intent(in)            :: alpha, beta
       real(c_double) :: a, b, c
     end subroutine
 
@@ -196,7 +196,7 @@ module gpu
       type(c_ptr), value, intent(in)        :: handle
       character(c_char), intent(in), value  :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
-      real(c_float), intent(in), value :: alpha, beta
+      real(c_float), intent(in)             :: alpha, beta
       real(c_float) :: a, b, c
     end subroutine
 

From a788426aa563e4649a045a2b7a22fa97c29b74c8 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 4 Jul 2024 12:01:16 +0200
Subject: [PATCH 112/131] Working on r2

---
 src/ccsd/ccsd_space_orb_sub.irp.f      |  27 +-
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 353 +++++++++++++------------
 2 files changed, 200 insertions(+), 180 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 256f743b..59b9ebd2 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -21,7 +21,8 @@ subroutine run_ccsd_space_orb
   type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
 
   type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
-  type(gpu_double4) :: d_cc_space_v_oovo
+  type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo
+  type(gpu_double4) :: d_cc_space_v_vvoo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -93,17 +94,29 @@ subroutine run_ccsd_space_orb
 !    FREE cc_space_v_vv_chol
   endif
 
+  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
   call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
   call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
   call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
+  call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO)
+  call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO)
+  call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO)
 
+  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
   call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
   call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
   call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
+  call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo)
+  call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo)
+  call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo)
 
 !  FREE cc_space_v_voov
 !  FREE cc_space_v_ovov
 !  FREE cc_space_v_oovo
+!  FREE cc_space_v_oovv
+!  FREE cc_space_v_vooo
+!  FREE cc_space_v_oooo
+!  FREE cc_space_v_vvoo
 
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
@@ -165,15 +178,8 @@ subroutine run_ccsd_space_orb
   call gpu_upload(h_t2, t2)
 
 
-  call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
-  call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
-
-!  FREE cc_space_v_oovv
-
-
   call update_tau_space(nO,nV,h_t1,t1,t2,tau)
   call update_tau_x_space(nO,nV,tau,tau_x)
-  !print*,'hf_energy', hf_energy
   call det_energy(det,uncorr_energy)
   print*,'Det energy', uncorr_energy
 
@@ -200,7 +206,10 @@ subroutine run_ccsd_space_orb
 
       call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
            d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
-      call compute_r2_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r2%f,max_r2)
+      call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
+           d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, &
+           d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+           r2, max_r2)
     else
       call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
       call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index c34b390b..0474dcec 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -391,168 +391,162 @@ end
 
 ! R2
 
-subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
-
+subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
+    d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, &
+    d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+    r2,max_r2)
+  use gpu
   implicit none
 
   ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
+  integer, intent(in)            :: nO, nV
+  type(gpu_double2), intent(in)  :: t1, H_oo, H_vv
+  type(gpu_double4), intent(in)  :: t2, tau, d_cc_space_v_oovv
+  type(gpu_double4), intent(in)  :: d_cc_space_v_vooo, d_cc_space_v_oooo
+  type(gpu_double4), intent(in)  :: d_cc_space_v_vvoo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+  type(gpu_double3), intent(in)  :: d_cc_space_v_vo_chol
 
   ! out
-  double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2
+  double precision, intent(out)  :: max_r2
+  type(gpu_double4), intent(out) :: r2
 
   ! internal
   integer                       :: u,v,i,j,beta,gam,a,b
   double precision              :: max_r2_local
 
+  type(gpu_stream) :: stream(nV)
+
   call set_multiple_levels_omp(.False.)
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,cc_space_v_oovv) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
-  do gam = 1, nV
-    do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-         r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
+  call gpu_copy(d_cc_space_v_oovv, r2)
 
-  double precision, allocatable :: A1(:,:,:,:)
-  allocate(A1(nO,nO,nO,nO))
-  call compute_A1_chol(nO,nV,t1,t2,tau,A1)
-  call dgemm('N','N',nO*nO,nV*nV,nO*nO, &
-             1d0, A1, size(A1,1) * size(A1,2), &
-                  tau, size(tau,1) * size(tau,2), &
-             1d0, r2, size(r2,1) * size(r2,2))
+  type(gpu_double4) :: A1
+  call gpu_allocate(A1,nO,nO,nO,nO)
+  call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, &
+      d_cc_space_v_oooo, d_cc_space_v_vvoo, A1)
+
+  call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, &
+             1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), &
+                  tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), &
+             1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2))
+
+  call gpu_deallocate(A1)
 
-  deallocate(A1)
   integer :: block_size, iblock, k
   block_size = 16
-  double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1
-  double precision, dimension(:,:), allocatable :: tmp_cc2
+  type(gpu_double3) :: tmp_cc, B1, tmpB1
+  type(gpu_double2) :: tmp_cc2
 
-  allocate(tmp_cc(cholesky_mo_num,nV,nV))
-  call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, &
-      cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV)
+  call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV)
+  call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, &
+      d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV)
 
   call set_multiple_levels_omp(.False.)
+  call gpu_synchronize()
+
+  type(gpu_blas) :: blas
+
+  !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas)
+  call gpu_allocate(B1,nV,nV,block_size)
+  call gpu_allocate(tmpB1,nV,block_size,nV)
+  call gpu_allocate(tmp_cc2,cholesky_mo_num,nV)
+
+  call gpu_blas_create(blas)
 
-  !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a)
-  allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV))
   !$OMP DO
   do gam = 1, nV
 
-    do a=1,nV
-      do k=1,cholesky_mo_num
-        tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam)
-      enddo
-    enddo
+    call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), &
+         cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num)
 
     do iblock = 1, nV, block_size
 
-        call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, &
-                -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, &
-                cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, &
-                0.d0, tmpB1, nV*block_size)
+        call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, &
+                -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, &
+                d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, &
+                0.d0, tmpB1%f(1,1,1), nV*block_size)
 
-        call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, &
-                1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
-                tmp_cc2, cholesky_mo_num, &
-                1.d0, tmpB1, nV*block_size)
+        call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, &
+                1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, &
+                tmp_cc2%f(1,1), cholesky_mo_num, &
+                1.d0, tmpB1%f(1,1,1), nV*block_size)
 
         do beta = iblock, min(nV, iblock+block_size-1)
-          do b = 1, nV
-            do a = 1, nV
-              B1(a,b,beta-iblock+1) = tmpB1(a,beta-iblock+1,b)
-            enddo
-          enddo
+          call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), &
+             nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV)
         enddo
 
-        call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, &
-              1d0, tau, size(tau,1) * size(tau,2), &
-                   B1 , size(B1 ,1) * size(B1 ,2), &
-              1d0, r2(1,1,iblock,gam),  size(r2 ,1) * size(r2 ,2))
+        call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, &
+              1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), &
+                   B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), &
+              1d0, r2%f(1,1,iblock,gam),  size(r2%f ,1) * size(r2%f ,2))
       enddo
 
   enddo
   !$OMP ENDDO
 
-  deallocate(B1, tmpB1, tmp_cc2)
+  call gpu_blas_destroy(blas)
+
+  call gpu_deallocate(B1)
+  call gpu_deallocate(tmpB1)
+  call gpu_deallocate(tmp_cc2)
   !$OMP END PARALLEL
 
-  deallocate(tmp_cc)
+  call gpu_deallocate(tmp_cc)
 
+  type(gpu_double4) :: X_oovv
+  call gpu_allocate(X_oovv,nO,nO,nV,nV)
+  call gpu_copy(t2,X_oovv)
 
-  double precision, allocatable :: X_oovv(:,:,:,:)
-  allocate(X_oovv(nO,nO,nV,nV))
-  !$omp parallel &
-  !$omp shared(nO,nV,t2,X_oovv) &
-  !$omp private(u,v,gam,a) &
-  !$omp default(none)
-  !$omp do
-  do a = 1, nV
-    do gam = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-          X_oovv(u,v,gam,a) = t2(u,v,gam,a)
-        enddo
-      enddo
-    enddo
+  type(gpu_double2) :: g_vir
+  call gpu_allocate(g_vir,nV,nV)
+  call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f)
+
+  type(gpu_double4) :: Y_oovv
+  call gpu_allocate(Y_oovv,nO,nO,nV,nV)
+
+  call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, &
+             1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), &
+                  g_vir%f(1,1), size(g_vir%f,1), &
+             0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
+
+  call gpu_synchronize()
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
   enddo
-  !$omp end do
-  !$omp end parallel
 
-  double precision, allocatable :: g_vir(:,:)
-  allocate(g_vir(nV,nV))
-  call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir)
-
-  double precision, allocatable :: Y_oovv(:,:,:,:)
-  allocate(Y_oovv(nO,nO,nV,nV))
-
-  call dgemm('N','N',nO*nO*nV,nV,nV, &
-             1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), &
-                  g_vir, size(g_vir,1), &
-             0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3))
-  deallocate(g_vir)
-  deallocate(X_oovv)
-
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,Y_oovv) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
   do gam = 1, nV
     do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-          r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), &
+           nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
+           nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
-  deallocate(Y_oovv)
 
-  double precision, allocatable :: g_occ(:,:)
-  allocate(g_occ(nO,nO))
-  call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ)
+  call gpu_deallocate(g_vir)
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  allocate(X_oovv(nO,nO,nV,nV))
-  call dgemm('N','N',nO,nO*nV*nV,nO, &
-             1d0, g_occ , size(g_occ,1), &
-                  t2    , size(t2,1), &
-             0d0, X_oovv, size(X_oovv,1))
-  deallocate(g_occ)
+  do a=1,nV
+    call gpu_stream_destroy(stream(a))
+  enddo
+
+  call gpu_deallocate(Y_oovv)
+
+  type(gpu_double2) :: g_occ
+  call gpu_allocate(g_occ,nO,nO)
+
+  call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f)
+
+  call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, &
+             1d0, g_occ%f(1,1), size(g_occ%f,1), &
+                  t2%f(1,1,1,1)    , size(t2%f,1), &
+             0d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1))
+
+  call gpu_synchronize()
 
   !$omp parallel &
   !$omp shared(nO,nV,r2,X_oovv) &
@@ -563,7 +557,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-          r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta)
+          r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta)
         enddo
       enddo
     enddo
@@ -571,27 +565,39 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   !$omp end do
   !$omp end parallel
 
-  deallocate(X_oovv)
+  call gpu_deallocate(g_occ)
+  call gpu_deallocate(X_oovv)
 
-  double precision, allocatable :: X_vovv(:,:,:,:)
+  type(gpu_double4) :: X_vovv
+
+  call gpu_allocate(X_vovv,nV,nO,nV,block_size)
+  call gpu_allocate(Y_oovv,nO,nO,nV,nV)
 
-  allocate(X_vovv(nV,nO,nV,block_size))
-  allocate(Y_oovv(nO,nO,nV,nV))
 
   do iblock = 1, nV, block_size
     do gam = iblock, min(nV, iblock+block_size-1)
-      call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
-        cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, &
-        cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV)
+      call gpu_stream_create(stream(gam))
+      call gpu_set_stream(blas_handle, stream(gam))
+      call gpu_dgemm(blas_handle, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
+        d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), &
+        cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV)
 
     enddo
+    do gam = iblock, min(nV, iblock+block_size-1)
+      call gpu_stream_destroy(stream(gam))
+    enddo
+
+    call gpu_synchronize()
+
+    call gpu_set_stream(blas_handle, gpu_default_stream)
     call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, &
-             1d0, t1    , size(t1,1), &
-                  X_vovv, size(X_vovv,1), &
-             0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1))
+             1d0, t1%f    , size(t1%f,1), &
+                  X_vovv%f, size(X_vovv%f,1), &
+             0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1))
 
   enddo
-  deallocate(X_vovv)
+  call gpu_synchronize()
+  call gpu_deallocate(X_vovv)
 
   !$omp parallel &
   !$omp shared(nO,nV,r2,Y_oovv) &
@@ -602,14 +608,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-          r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta)
+          r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(v,u,beta,gam) + Y_oovv%f(u,v,gam,beta)
         enddo
       enddo
     enddo
   enddo
   !$omp end do
   !$omp end parallel
-  deallocate(Y_oovv)
+  call gpu_deallocate(Y_oovv)
 
   double precision, allocatable :: X_ovvo(:,:,:,:)
   double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:)
@@ -617,11 +623,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   allocate(tcc(cholesky_mo_num,nO,nV))
 
   call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
-     cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, &
+     d_cc_space_v_vv_chol%f, cholesky_mo_num*nV, t1%f, nO, &
      0.d0, tcc2, cholesky_mo_num*nV)
 
   call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, &
-     cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, &
+     cc_space_v_oo_chol, cholesky_mo_num*nO, t1%f, nO, &
      0.d0, tcc, cholesky_mo_num*nO)
 
   call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, &
@@ -639,7 +645,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-          r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v)
+          r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_ovvo(u,beta,gam,v)
         enddo
       enddo
     enddo
@@ -650,7 +656,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do gam = 1, nV
       do v = 1, nO
         do u = 1, nO
-          r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v)
+          r2%f(v,u,gam,beta) = r2%f(v,u,gam,beta) - X_ovvo(u,beta,gam,v)
         enddo
       enddo
     enddo
@@ -661,12 +667,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   deallocate(X_ovvo)
   !-----
 
-  allocate(X_oovv(nO,nO,nV,nV))
+  call gpu_allocate(X_oovv,nO,nO,nV,nV)
 
   call dgemm('N','N',nO*nO*nV,nV,nO, &
              1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), &
-                  t1 , size(t1,1), &
-             0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3))
+                  t1%f , size(t1%f,1), &
+             0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3))
 
   !$omp parallel &
   !$omp shared(nO,nV,r2,X_oovv) &
@@ -677,14 +683,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta)
+           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta)
         enddo
       enddo
     enddo
   enddo
   !$omp end do
   !$omp end parallel
-  deallocate(X_oovv)
+  call gpu_deallocate(X_oovv)
 
   double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:)
   allocate(X_vovo(nV,nO,nV,nO))
@@ -708,16 +714,16 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
 
   allocate(Y_oovo(nO,nO,nV,nO))
   call dgemm('N','N',nO,nO*nV*nO,nV, &
-             1d0, t1, size(t1,1), &
+             1d0, t1%f, size(t1%f,1), &
                   X_vovo, size(X_vovo,1), &
              0d0, Y_oovo, size(Y_oovo,1))
 
   deallocate(X_vovo)
-  allocate(X_oovv(nO,nO,nV,nV))
+  call gpu_allocate(X_oovv,nO,nO,nV,nV)
   call dgemm('N','N',nO*nO*nV, nV, nO, &
              1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), &
-                  t1    , size(t1,1), &
-             0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3))
+                  t1%f    , size(t1%f,1), &
+             0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3))
   deallocate(Y_oovo)
 
   !$omp parallel &
@@ -729,24 +735,24 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam)
+           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,gam,beta) - X_oovv%f(v,u,beta,gam)
         enddo
       enddo
     enddo
   enddo
   !$omp end do
   !$omp end parallel
-  deallocate(X_oovv)
+  call gpu_deallocate(X_oovv)
 
 
   double precision, allocatable :: J1(:,:,:,:)
   allocate(J1(nO,nV,nV,nO))
-  call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, &
+  call compute_J1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovvo,cc_space_v_ovoo, &
        cc_space_v_vvoo,J1)
 
   double precision, allocatable :: K1(:,:,:,:)
   allocate(K1(nO,nV,nO,nV))
-  call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, &
+  call compute_K1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovoo,cc_space_v_vvoo, &
        cc_space_v_ovov,K1)
 
   allocate(X_ovvo(nO,nV,nV,nO))
@@ -778,7 +784,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do v = 1, nO
       do i = 1, nO
         do a = 1, nV
-          Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a)
+          Y_voov(a,i,v,gam) = 2d0 * t2%f(i,v,a,gam) - t2%f(i,v,gam,a)
         enddo
       enddo
     enddo
@@ -805,7 +811,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta)
+           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta)
         enddo
       enddo
     enddo
@@ -820,7 +826,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
   allocate(Y_ovov(nO,nV,nO,nV))
 
   !$omp parallel &
-  !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) &
+  !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) &
   !$omp private(u,a,i,beta,gam) &
   !$omp default(none)
   !$omp do
@@ -840,7 +846,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do v = 1, nO
       do a = 1, nV
         do i = 1, nO
-          Y_ovov(i,a,v,gam) = t2(i,v,gam,a)
+          Y_ovov(i,a,v,gam) = t2%f(i,v,gam,a)
         enddo
       enddo
     enddo
@@ -864,7 +870,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta)
+           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta)
         enddo
       enddo
     enddo
@@ -895,7 +901,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do v = 1, nO
       do a = 1, nV
         do i = 1, nO
-          Y_ovov(i,a,v,beta) = t2(i,v,beta,a)
+          Y_ovov(i,a,v,beta) = t2%f(i,v,beta,a)
         enddo
       enddo
     enddo
@@ -922,7 +928,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam)
+           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam)
         enddo
       enddo
     enddo
@@ -945,8 +951,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
-          r2(i,j,a,b) = -r2(i,j,a,b)
-          max_r2_local = max(r2(i,j,a,b), max_r2_local)
+          r2%f(i,j,a,b) = -r2%f(i,j,a,b)
+          max_r2_local = max(r2%f(i,j,a,b), max_r2_local)
         enddo
       enddo
     enddo
@@ -961,28 +967,29 @@ end
 
 ! A1
 
-subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1)
-
+subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, &
+  d_cc_space_v_oooo, d_cc_space_v_vvoo, A1)
+  use gpu
   implicit none
 
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(in)  :: t2(nO, nO, nV, nV)
-  double precision, intent(in)  :: tau(nO, nO, nV, nV)
-  double precision, intent(out) :: A1(nO, nO, nO, nO)
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1
+  type(gpu_double4), intent(in)  :: t2, tau
+  type(gpu_double4), intent(in)  :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo
+  type(gpu_double4), intent(out) :: A1
 
   integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta
 
-  double precision, allocatable :: Y_oooo(:,:,:,:)
-  allocate(Y_oooo(nO,nO,nO,nO))
+  type(gpu_double4) :: Y_oooo
+  call gpu_allocate(Y_oooo,nO,nO,nO,nO)
 
   ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j)
   ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) &
 
   call dgemm('N','N', nO, nO*nO*nO, nV, &
-             1d0, t1    , size(t1,1), &
-                  cc_space_v_vooo, size(cc_space_v_vooo,1), &
-             0d0, Y_oooo, size(Y_oooo,1))
+             1d0, t1%f  , size(t1%f,1), &
+                  d_cc_space_v_vooo%f, size(d_cc_space_v_vooo%f,1), &
+             0d0, Y_oooo%f, size(Y_oooo%f,1))
 
   !$omp parallel &
   !$omp private(u,v,i,j) &
@@ -992,7 +999,7 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1)
     do i = 1, nO
       do v = 1, nO
         do u = 1, nO
-          A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j)
+          A1%f(u,v,i,j) = d_cc_space_v_oooo%f(u,v,i,j) + Y_oooo%f(v,u,j,i) + Y_oooo%f(u,v,i,j)
         enddo
       enddo
     enddo
@@ -1000,19 +1007,20 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1)
   !$omp end do
   !$omp end parallel
 
-  deallocate(Y_oooo)
+  call gpu_deallocate(Y_oooo)
 
   ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b)
   call dgemm('N','N', nO*nO, nO*nO, nV*nV, &
-             1d0, tau     , size(tau,1) * size(tau,2), &
-                  cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), &
-             1d0, A1      , size(A1,1) * size(A1,2))
+             1d0, tau%f     , size(tau%f,1) * size(tau%f,2), &
+                  d_cc_space_v_vvoo%f, size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), &
+             1d0, A1%f      , size(A1%f,1) * size(A1%f,2))
 
 end
 
 ! g_occ
 
 subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ)
+  use gpu
 
   implicit none
 
@@ -1048,6 +1056,7 @@ end
 ! g_vir
 
 subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir)
+  use gpu
 
   implicit none
 
@@ -1102,6 +1111,7 @@ end
 ! J1
 
 subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1)
+  use gpu
   implicit none
 
   integer, intent(in)           :: nO,nV
@@ -1305,6 +1315,7 @@ end
 ! K1
 
 subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1)
+  use gpu
 
   implicit none
 

From 5b1e5f84e6defc8857b5d29c54449e3b8d35cb67 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Thu, 4 Jul 2024 14:52:09 +0200
Subject: [PATCH 113/131] Working on r2

---
 src/ccsd/ccsd_space_orb_sub.irp.f      |  17 +-
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 469 +++++++++++--------------
 2 files changed, 220 insertions(+), 266 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 59b9ebd2..e97c2325 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -22,7 +22,7 @@ subroutine run_ccsd_space_orb
 
   type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
   type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo
-  type(gpu_double4) :: d_cc_space_v_vvoo
+  type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo
 
   double precision, allocatable :: all_err(:,:), all_t(:,:)
   integer, allocatable          :: list_occ(:), list_vir(:)
@@ -98,17 +98,21 @@ subroutine run_ccsd_space_orb
   call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
   call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
   call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
+  call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO)
   call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO)
   call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO)
   call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO)
+  call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO)
 
   call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
   call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
   call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
   call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
+  call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo)
   call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo)
   call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo)
   call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo)
+  call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo)
 
 !  FREE cc_space_v_voov
 !  FREE cc_space_v_ovov
@@ -117,6 +121,8 @@ subroutine run_ccsd_space_orb
 !  FREE cc_space_v_vooo
 !  FREE cc_space_v_oooo
 !  FREE cc_space_v_vvoo
+!  FREE cc_space_v_ovvo
+!  FREE cc_space_v_ovoo
 
   call gpu_allocate(t2, nO,nO,nV,nV)
   call gpu_allocate(r2, nO,nO,nV,nV)
@@ -207,8 +213,8 @@ subroutine run_ccsd_space_orb
       call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
            d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
       call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
-           d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, &
-           d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+           d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
+           d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
            r2, max_r2)
     else
       call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
@@ -320,6 +326,11 @@ subroutine run_ccsd_space_orb
   call gpu_deallocate(d_cc_space_v_voov)
   call gpu_deallocate(d_cc_space_v_ovov)
   call gpu_deallocate(d_cc_space_v_oovo)
+  call gpu_deallocate(d_cc_space_v_ovvo)
+  call gpu_deallocate(d_cc_space_v_vooo)
+  call gpu_deallocate(d_cc_space_v_oooo)
+  call gpu_deallocate(d_cc_space_v_vvoo)
+  call gpu_deallocate(d_cc_space_v_ovoo)
 
   call gpu_deallocate(d_cc_space_f_oo)
   call gpu_deallocate(d_cc_space_f_vo)
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 0474dcec..abb9909b 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -392,8 +392,8 @@ end
 ! R2
 
 subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
-    d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, &
-    d_cc_space_v_vvoo, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+    d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
+    d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
     r2,max_r2)
   use gpu
   implicit none
@@ -403,9 +403,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   type(gpu_double2), intent(in)  :: t1, H_oo, H_vv
   type(gpu_double4), intent(in)  :: t2, tau, d_cc_space_v_oovv
   type(gpu_double4), intent(in)  :: d_cc_space_v_vooo, d_cc_space_v_oooo
-  type(gpu_double4), intent(in)  :: d_cc_space_v_vvoo
-  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
-  type(gpu_double3), intent(in)  :: d_cc_space_v_vo_chol
+  type(gpu_double4), intent(in)  :: d_cc_space_v_vvoo, d_cc_space_v_oovo
+  type(gpu_double4), intent(in)  :: d_cc_space_v_ovvo, d_cc_space_v_ovoo
+  type(gpu_double4), intent(in)  :: d_cc_space_v_ovov
+  type(gpu_double3), intent(in)  :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol
+  type(gpu_double3), intent(in)  :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol
 
   ! out
   double precision, intent(out)  :: max_r2
@@ -499,9 +501,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   call gpu_allocate(X_oovv,nO,nO,nV,nV)
   call gpu_copy(t2,X_oovv)
 
-  type(gpu_double2) :: g_vir
+  type(gpu_double2) :: g_occ, g_vir
   call gpu_allocate(g_vir,nV,nV)
+  call gpu_allocate(g_occ,nO,nO)
   call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f)
+  call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f)
 
   type(gpu_double4) :: Y_oovv
   call gpu_allocate(Y_oovv,nO,nO,nV,nV)
@@ -511,7 +515,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
                   g_vir%f(1,1), size(g_vir%f,1), &
              0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
 
+  call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, &
+             -1d0, g_occ%f(1,1), size(g_occ%f,1), &
+                  t2%f(1,1,1,1)    , size(t2%f,1), &
+             1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1))
+
+  call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, &
+            -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), &
+                  t1%f(1,1) , size(t1%f,1), &
+             1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
+
   call gpu_synchronize()
+  call gpu_deallocate(X_oovv)
+
+  call gpu_deallocate(g_vir)
+  call gpu_deallocate(g_occ)
+
+  type(gpu_double4) :: X_vovo, Y_oovo
+  call gpu_allocate(X_vovo,nV,nO,nV,nO)
+
+!  !$omp parallel &
+!  !$omp shared(nO,nV,r2,Y_oovv) &
+!  !$omp private(u,v,gam,beta) &
+!  !$omp default(none)
+!  !$omp do
+!  do gam = 1, nV
+!    do beta = 1, nV
+!      do v = 1, nO
+!        do u = 1, nO
+!           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,beta,gam) + Y_oovv%f(v,u,gam,beta)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$omp end do
+!  !$omp end parallel
 
   do a=1,nV
     call gpu_stream_create(stream(a))
@@ -527,52 +565,24 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     enddo
   enddo
 
-  call gpu_deallocate(g_vir)
-  call gpu_set_stream(blas_handle, gpu_default_stream)
+  do i = 1, nO
+    do gam = 1, nV
+      call gpu_set_stream(blas_handle, stream(gam))
+      call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), &
+           nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV)
+    enddo
+  enddo
 
   do a=1,nV
     call gpu_stream_destroy(stream(a))
   enddo
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  call gpu_deallocate(Y_oovv)
-
-  type(gpu_double2) :: g_occ
-  call gpu_allocate(g_occ,nO,nO)
-
-  call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f)
-
-  call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, &
-             1d0, g_occ%f(1,1), size(g_occ%f,1), &
-                  t2%f(1,1,1,1)    , size(t2%f,1), &
-             0d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1))
-
-  call gpu_synchronize()
-
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,X_oovv) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
-  do gam = 1, nV
-    do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-          r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
-
-  call gpu_deallocate(g_occ)
-  call gpu_deallocate(X_oovv)
 
   type(gpu_double4) :: X_vovv
 
   call gpu_allocate(X_vovv,nV,nO,nV,block_size)
-  call gpu_allocate(Y_oovv,nO,nO,nV,nV)
-
+  call gpu_allocate(Y_oovo,nO,nO,nV,nO)
 
   do iblock = 1, nV, block_size
     do gam = iblock, min(nV, iblock+block_size-1)
@@ -590,241 +600,176 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     call gpu_synchronize()
 
     call gpu_set_stream(blas_handle, gpu_default_stream)
-    call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, &
-             1d0, t1%f    , size(t1%f,1), &
-                  X_vovv%f, size(X_vovv%f,1), &
+    call gpu_dgemm(blas_handle, 'N','N', nO, &
+             nO*nV*min(block_size, nV-iblock+1),nV, &
+             1.d0, t1%f(1,1)    , size(t1%f,1), &
+             X_vovv%f(1,1,1,1), size(X_vovv%f,1), &
              0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1))
 
   enddo
+
+  call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, &
+             1d0, t1%f(1,1), size(t1%f,1), &
+                  X_vovo%f(1,1,1,1), size(X_vovo%f,1), &
+             0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1))
+
+  call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, &
+            -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), &
+                  t1%f(1,1)    , size(t1%f,1), &
+             1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
+
   call gpu_synchronize()
   call gpu_deallocate(X_vovv)
+  call gpu_deallocate(X_vovo)
+  call gpu_deallocate(Y_oovo)
+
+!  !$omp parallel &
+!  !$omp shared(nO,nV,r2,Y_oovv) &
+!  !$omp private(u,v,gam,beta) &
+!  !$omp default(none)
+!  !$omp do
+!  do gam = 1, nV
+!    do beta = 1, nV
+!      do v = 1, nO
+!        do u = 1, nO
+!           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,gam,beta) + Y_oovv%f(v,u,beta,gam)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$omp end do
+!  !$omp end parallel
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,Y_oovv) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
   do gam = 1, nV
     do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-          r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(v,u,beta,gam) + Y_oovv%f(u,v,gam,beta)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), &
+           nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
+           nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  do a=1,nV
+    call gpu_stream_destroy(stream(a))
+  enddo
+
+
+  call gpu_synchronize()
   call gpu_deallocate(Y_oovv)
 
-  double precision, allocatable :: X_ovvo(:,:,:,:)
-  double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:)
-  allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO))
-  allocate(tcc(cholesky_mo_num,nO,nV))
+  type(gpu_double4) :: X_ovvo
+  type(gpu_double3) :: tcc, tcc2
+  call gpu_allocate(tcc2,cholesky_mo_num,nV,nO)
+  call gpu_allocate(X_ovvo,nO,nV,nV,nO)
+  call gpu_allocate(tcc,cholesky_mo_num,nO,nV)
 
-  call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
-     d_cc_space_v_vv_chol%f, cholesky_mo_num*nV, t1%f, nO, &
-     0.d0, tcc2, cholesky_mo_num*nV)
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
+     d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, &
+     0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV)
 
-  call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, &
-     cc_space_v_oo_chol, cholesky_mo_num*nO, t1%f, nO, &
-     0.d0, tcc, cholesky_mo_num*nO)
+  call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, &
+     d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, &
+     0.d0, tcc%f(1,1,1), cholesky_mo_num*nO)
 
-  call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, &
-              tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, &
-              X_ovvo, nO*nV)
+  call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, &
+              tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, &
+              X_ovvo%f(1,1,1,1), nO*nV)
 
-  deallocate(tcc, tcc2)
+  call gpu_synchronize()
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,X_ovvo) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
   do gam = 1, nV
     do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-          r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_ovvo(u,beta,gam,v)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, -1.d0, X_ovvo%f(1,beta,gam,1), &
+           nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
+      call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), &
+           nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp do
-  do beta = 1, nV
-    do gam = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-          r2%f(v,u,gam,beta) = r2%f(v,u,gam,beta) - X_ovvo(u,beta,gam,v)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
 
-  deallocate(X_ovvo)
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  do a=1,nV
+    call gpu_stream_destroy(stream(a))
+  enddo
+
+  call gpu_synchronize()
+  call gpu_deallocate(tcc)
+  call gpu_deallocate(tcc2)
+  call gpu_deallocate(X_ovvo)
+
   !-----
 
-  call gpu_allocate(X_oovv,nO,nO,nV,nV)
+  type(gpu_double4) :: J1, K1
+  type(gpu_double4) :: Y_voov, Z_ovov
 
-  call dgemm('N','N',nO*nO*nV,nV,nO, &
-             1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), &
-                  t1%f , size(t1%f,1), &
-             0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3))
+  call gpu_allocate(J1,nO,nV,nV,nO)
+  call compute_J1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovvo%f,d_cc_space_v_ovoo%f, &
+       d_cc_space_v_vvoo%f,J1%f)
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,X_oovv) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
-  do gam = 1, nV
-    do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,beta,gam) - X_oovv%f(v,u,gam,beta)
-        enddo
-      enddo
+  call gpu_allocate(K1,nO,nV,nO,nV)
+  call compute_K1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovoo%f,d_cc_space_v_vvoo%f, &
+       d_cc_space_v_ovov%f,K1%f)
+
+
+  call gpu_allocate(X_ovvo,nO,nV,nV,nO)
+  call gpu_allocate(Y_voov,nV,nO,nO,nV)
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
+
+  do i=1, nO
+    do a=1, nV
+      call gpu_set_stream(blas_handle, stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), &
+           nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
-  call gpu_deallocate(X_oovv)
 
-  double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:)
-  allocate(X_vovo(nV,nO,nV,nO))
-
-  !$omp parallel &
-  !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) &
-  !$omp private(a,v,gam,i) &
-  !$omp default(none)
-  do i = 1, nO
-    !$omp do
-    do gam = 1, nV
-      do v = 1, nO
-        do a = 1, nV
-          X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i)
-        enddo
-      enddo
-    enddo
-    !$omp end do nowait
-  enddo
-  !$omp end parallel
-
-  allocate(Y_oovo(nO,nO,nV,nO))
-  call dgemm('N','N',nO,nO*nV*nO,nV, &
-             1d0, t1%f, size(t1%f,1), &
-                  X_vovo, size(X_vovo,1), &
-             0d0, Y_oovo, size(Y_oovo,1))
-
-  deallocate(X_vovo)
-  call gpu_allocate(X_oovv,nO,nO,nV,nV)
-  call dgemm('N','N',nO*nO*nV, nV, nO, &
-             1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), &
-                  t1%f    , size(t1%f,1), &
-             0d0, X_oovv%f, size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3))
-  deallocate(Y_oovo)
-
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,X_oovv) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
-  do gam = 1, nV
-    do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - X_oovv%f(u,v,gam,beta) - X_oovv%f(v,u,beta,gam)
-        enddo
-      enddo
+  do gam=1, nV
+    call gpu_set_stream(blas_handle, stream(gam))
+    do v=1, nO
+      call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,v,1,gam), &
+           nO*nO, -1.d0, t2%f(1,v,gam,1), nO*nO*nV, Y_voov%f(1,1,v,gam), nV)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
-  call gpu_deallocate(X_oovv)
 
+  call gpu_allocate(Z_ovov,nO,nV,nO,nV)
 
-  double precision, allocatable :: J1(:,:,:,:)
-  allocate(J1(nO,nV,nV,nO))
-  call compute_J1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovvo,cc_space_v_ovoo, &
-       cc_space_v_vvoo,J1)
-
-  double precision, allocatable :: K1(:,:,:,:)
-  allocate(K1(nO,nV,nO,nV))
-  call compute_K1_chol(nO,nV,t1%f,t2%f,cc_space_v_ovoo,cc_space_v_vvoo, &
-       cc_space_v_ovov,K1)
-
-  allocate(X_ovvo(nO,nV,nV,nO))
-  !$omp parallel &
-  !$omp private(u,v,gam,beta,i,a) &
-  !$omp default(shared)
-  do i = 1, nO
-    !$omp do
-    do a = 1, nV
-      do beta = 1, nV
-        do u = 1, nO
-          X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta))
-        enddo
-      enddo
-    enddo
-    !$omp end do nowait
+  do a=1,nV
+    call gpu_stream_destroy(stream(a))
   enddo
-  !$omp end parallel
-  deallocate(J1)
 
-  double precision, allocatable :: Y_voov(:,:,:,:)
-  allocate(Y_voov(nV,nO,nO,nV))
+  call gpu_deallocate(J1)
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  !$omp parallel &
-  !$omp private(u,v,gam,beta,i,a) &
-  !$omp default(shared)
-  !$omp do
-  do gam = 1, nV
-    do v = 1, nO
-      do i = 1, nO
-        do a = 1, nV
-          Y_voov(a,i,v,gam) = 2d0 * t2%f(i,v,a,gam) - t2%f(i,v,gam,a)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
 
-  double precision, allocatable :: Z_ovov(:,:,:,:)
-  allocate(Z_ovov(nO,nV,nO,nV))
+  call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, &
+             1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), &
+                  Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), &
+             0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2))
 
-  call dgemm('N','N', nO*nV,nO*nV,nV*nO, &
-             1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), &
-                  Y_voov, size(Y_voov,1) * size(Y_voov,2), &
-             0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2))
+  call gpu_synchronize()
+  call gpu_deallocate(Y_voov)
+  call gpu_deallocate(X_ovvo)
 
-  deallocate(X_ovvo,Y_voov)
-
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,Z_ovov) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
-  do gam = 1, nV
-    do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
-
-  deallocate(Z_ovov)
-
-  double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:)
-  allocate(X_ovov(nO,nV,nO,nV))
-  allocate(Y_ovov(nO,nV,nO,nV))
+  type(gpu_double4) :: Y_ovov, X_ovov
+  call gpu_allocate(X_ovov,nO,nV,nO,nV)
+  call gpu_allocate(Y_ovov,nO,nV,nO,nV)
 
+!TODO 
   !$omp parallel &
   !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) &
   !$omp private(u,a,i,beta,gam) &
@@ -834,7 +779,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do u = 1, nO
       do a = 1, nV
         do i = 1, nO
-          X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta)
+          X_ovov%f(i,a,u,beta) = 0.5d0 * K1%f(u,a,i,beta)
         enddo
       enddo
     enddo
@@ -846,7 +791,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do v = 1, nO
       do a = 1, nV
         do i = 1, nO
-          Y_ovov(i,a,v,gam) = t2%f(i,v,gam,a)
+          Y_ovov%f(i,a,v,gam) = t2%f(i,v,gam,a)
         enddo
       enddo
     enddo
@@ -854,12 +799,12 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   !$omp end do
   !$omp end parallel
 
-  allocate(Z_ovov(nO,nV,nO,nV))
-  call dgemm('T','N',nO*nV,nO*nV,nO*nV, &
-             1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
-                  Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
-             0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2))
-  deallocate(X_ovov, Y_ovov)
+  call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, &
+            -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
+                  Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), &
+             1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2))
+
+  call gpu_synchronize()
 
   !$omp parallel &
   !$omp shared(nO,nV,r2,Z_ovov) &
@@ -870,16 +815,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta)
+           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov%f(u,beta,v,gam) + Z_ovov%f(v,gam,u,beta)
         enddo
       enddo
     enddo
   enddo
   !$omp end do
   !$omp end parallel
-  deallocate(Z_ovov)
 
-  allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV))
   !$omp parallel &
   !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) &
   !$omp private(u,v,gam,beta,i,a) &
@@ -889,7 +832,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do i = 1, nO
       do gam = 1, nV
         do u = 1, nO
-          X_ovov(u,gam,i,a) = K1(u,a,i,gam)
+          X_ovov%f(u,gam,i,a) = K1%f(u,a,i,gam)
         enddo
       enddo
     enddo
@@ -901,7 +844,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do v = 1, nO
       do a = 1, nV
         do i = 1, nO
-          Y_ovov(i,a,v,beta) = t2%f(i,v,beta,a)
+          Y_ovov%f(i,a,v,beta) = t2%f(i,v,beta,a)
         enddo
       enddo
     enddo
@@ -909,16 +852,19 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   !$omp end do
   !$omp end parallel
 
-  deallocate(K1)
+  call gpu_deallocate(K1)
 
-  allocate(Z_ovov(nO,nV,nO,nV))
-  call dgemm('N','N',nO*nV,nO*nV,nO*nV, &
-             1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
-                  Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
-             0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2))
+  call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, &
+             1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
+                  Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), &
+             0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2))
 
-  deallocate(X_ovov,Y_ovov)
+  call gpu_synchronize()
 
+  call gpu_deallocate(X_ovov)
+  call gpu_deallocate(Y_ovov)
+
+  ! Change the sign for consistency with the code in spin orbitals
   !$omp parallel &
   !$omp shared(nO,nV,r2,Z_ovov) &
   !$omp private(u,v,gam,beta) &
@@ -928,7 +874,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do beta = 1, nV
       do v = 1, nO
         do u = 1, nO
-           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam)
+           r2%f(u,v,beta,gam) = -r2%f(u,v,beta,gam) + Z_ovov%f(u,gam,v,beta) + Z_ovov%f(v,beta,u,gam)
         enddo
       enddo
     enddo
@@ -936,9 +882,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   !$omp end do
   !$omp end parallel
 
-  deallocate(Z_ovov)
-
-  ! Change the sign for consistency with the code in spin orbitals
+  call gpu_deallocate(Z_ovov)
 
   max_r2 = 0d0
   !$omp parallel &
@@ -951,7 +895,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     do a = 1, nV
       do j = 1, nO
         do i = 1, nO
-          r2%f(i,j,a,b) = -r2%f(i,j,a,b)
           max_r2_local = max(r2%f(i,j,a,b), max_r2_local)
         enddo
       enddo

From f09e91cb2296b3c1fb5cf854dc06372c879c4104 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 8 Jul 2024 12:44:32 +0200
Subject: [PATCH 114/131] Working on CCSD GPU

---
 src/ccsd/ccsd_space_orb_sub.irp.f      |   1 +
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 874 +++++++++++--------------
 2 files changed, 392 insertions(+), 483 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index e97c2325..d8131a9c 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -215,6 +215,7 @@ subroutine run_ccsd_space_orb
       call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
            d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
            d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+           d_cc_space_f_vo, &
            r2, max_r2)
     else
       call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index abb9909b..a185df13 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -394,13 +394,14 @@ end
 subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
     d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
     d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
+    d_cc_space_f_vo, &
     r2,max_r2)
   use gpu
   implicit none
 
   ! in
   integer, intent(in)            :: nO, nV
-  type(gpu_double2), intent(in)  :: t1, H_oo, H_vv
+  type(gpu_double2), intent(in)  :: t1, H_oo, H_vv, d_cc_space_f_vo
   type(gpu_double4), intent(in)  :: t2, tau, d_cc_space_v_oovv
   type(gpu_double4), intent(in)  :: d_cc_space_v_vooo, d_cc_space_v_oooo
   type(gpu_double4), intent(in)  :: d_cc_space_v_vvoo, d_cc_space_v_oovo
@@ -504,7 +505,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   type(gpu_double2) :: g_occ, g_vir
   call gpu_allocate(g_vir,nV,nV)
   call gpu_allocate(g_occ,nO,nO)
-  call compute_g_vir_chol(nO,nV,t1%f,t2%f,H_vv%f,g_vir%f)
+  call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, &
+    d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir)
   call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f)
 
   type(gpu_double4) :: Y_oovv
@@ -525,6 +527,10 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
                   t1%f(1,1) , size(t1%f,1), &
              1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
 
+
+  call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), &
+         nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO)
+
   call gpu_synchronize()
   call gpu_deallocate(X_oovv)
 
@@ -534,32 +540,13 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   type(gpu_double4) :: X_vovo, Y_oovo
   call gpu_allocate(X_vovo,nV,nO,nV,nO)
 
-!  !$omp parallel &
-!  !$omp shared(nO,nV,r2,Y_oovv) &
-!  !$omp private(u,v,gam,beta) &
-!  !$omp default(none)
-!  !$omp do
-!  do gam = 1, nV
-!    do beta = 1, nV
-!      do v = 1, nO
-!        do u = 1, nO
-!           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,beta,gam) + Y_oovv%f(v,u,gam,beta)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$omp end do
-!  !$omp end parallel
-
   do a=1,nV
     call gpu_stream_create(stream(a))
   enddo
 
   do gam = 1, nV
+    call gpu_set_stream(blas_handle, stream(gam))
     do beta = 1, nV
-      call gpu_set_stream(blas_handle, stream(beta))
-      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), &
-           nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
       call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
            nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO)
     enddo
@@ -579,34 +566,33 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   call gpu_set_stream(blas_handle, gpu_default_stream)
 
 
-  type(gpu_double4) :: X_vovv
 
-  call gpu_allocate(X_vovv,nV,nO,nV,block_size)
   call gpu_allocate(Y_oovo,nO,nO,nV,nO)
 
+  !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv)
+  call gpu_blas_create(blas)
+  type(gpu_double4) :: X_vovv
+  call gpu_allocate(X_vovv,nV,nO,nV,block_size)
+  !$OMP DO
   do iblock = 1, nV, block_size
     do gam = iblock, min(nV, iblock+block_size-1)
-      call gpu_stream_create(stream(gam))
-      call gpu_set_stream(blas_handle, stream(gam))
-      call gpu_dgemm(blas_handle, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
+      call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
         d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), &
         cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV)
 
     enddo
-    do gam = iblock, min(nV, iblock+block_size-1)
-      call gpu_stream_destroy(stream(gam))
-    enddo
 
-    call gpu_synchronize()
-
-    call gpu_set_stream(blas_handle, gpu_default_stream)
-    call gpu_dgemm(blas_handle, 'N','N', nO, &
+    call gpu_dgemm(blas, 'N','N', nO, &
              nO*nV*min(block_size, nV-iblock+1),nV, &
              1.d0, t1%f(1,1)    , size(t1%f,1), &
              X_vovv%f(1,1,1,1), size(X_vovv%f,1), &
              0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1))
-
   enddo
+  !$OMP END DO
+
+  call gpu_blas_destroy(blas)
+  call gpu_deallocate(X_vovv)
+  !$OMP END PARALLEL
 
   call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, &
              1d0, t1%f(1,1), size(t1%f,1), &
@@ -619,47 +605,27 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
              1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
 
   call gpu_synchronize()
-  call gpu_deallocate(X_vovv)
   call gpu_deallocate(X_vovo)
   call gpu_deallocate(Y_oovo)
 
-!  !$omp parallel &
-!  !$omp shared(nO,nV,r2,Y_oovv) &
-!  !$omp private(u,v,gam,beta) &
-!  !$omp default(none)
-!  !$omp do
-!  do gam = 1, nV
-!    do beta = 1, nV
-!      do v = 1, nO
-!        do u = 1, nO
-!           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Y_oovv%f(u,v,gam,beta) + Y_oovv%f(v,u,beta,gam)
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  !$omp end do
-!  !$omp end parallel
-
   do a=1,nV
     call gpu_stream_create(stream(a))
   enddo
 
   do gam = 1, nV
+    call gpu_set_stream(blas_handle, stream(gam))
     do beta = 1, nV
-      call gpu_set_stream(blas_handle, stream(beta))
       call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), &
            nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
-      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
-           nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO)
+    enddo
+    do j=1,nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), &
+           nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO)
     enddo
   enddo
 
   call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  do a=1,nV
-    call gpu_stream_destroy(stream(a))
-  enddo
-
 
   call gpu_synchronize()
   call gpu_deallocate(Y_oovv)
@@ -684,15 +650,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
 
   call gpu_synchronize()
 
-  do a=1,nV
-    call gpu_stream_create(stream(a))
-  enddo
 
   do gam = 1, nV
+    call gpu_set_stream(blas_handle, stream(gam))
+    do j=1,nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), &
+           nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO)
+    enddo
     do beta = 1, nV
-      call gpu_set_stream(blas_handle, stream(beta))
-      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, -1.d0, X_ovvo%f(1,beta,gam,1), &
-           nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
       call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), &
            nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
     enddo
@@ -700,58 +665,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
 
   call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  do a=1,nV
-    call gpu_stream_destroy(stream(a))
-  enddo
-
-  call gpu_synchronize()
+  call gpu_synchronize
   call gpu_deallocate(tcc)
   call gpu_deallocate(tcc2)
   call gpu_deallocate(X_ovvo)
 
-  !-----
 
   type(gpu_double4) :: J1, K1
   type(gpu_double4) :: Y_voov, Z_ovov
 
+
   call gpu_allocate(J1,nO,nV,nV,nO)
-  call compute_J1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovvo%f,d_cc_space_v_ovoo%f, &
-       d_cc_space_v_vvoo%f,J1%f)
+  call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, &
+       d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1)
 
   call gpu_allocate(K1,nO,nV,nO,nV)
-  call compute_K1_chol(nO,nV,t1%f,t2%f,d_cc_space_v_ovoo%f,d_cc_space_v_vvoo%f, &
-       d_cc_space_v_ovov%f,K1%f)
+  call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, &
+       d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1)
 
 
   call gpu_allocate(X_ovvo,nO,nV,nV,nO)
   call gpu_allocate(Y_voov,nV,nO,nO,nV)
 
-  do a=1,nV
-    call gpu_stream_create(stream(a))
-  enddo
-
-  do i=1, nO
-    do a=1, nV
-      call gpu_set_stream(blas_handle, stream(a))
+  do a=1, nV
+    call gpu_set_stream(blas_handle, stream(a))
+    do i=1, nO
       call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), &
            nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO)
-    enddo
-  enddo
-
-  do gam=1, nV
-    call gpu_set_stream(blas_handle, stream(gam))
-    do v=1, nO
-      call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,v,1,gam), &
-           nO*nO, -1.d0, t2%f(1,v,gam,1), nO*nO*nV, Y_voov%f(1,1,v,gam), nV)
+      call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), &
+           nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV)
     enddo
   enddo
 
   call gpu_allocate(Z_ovov,nO,nV,nO,nV)
 
-  do a=1,nV
-    call gpu_stream_destroy(stream(a))
-  enddo
-
+  call gpu_synchronize()
   call gpu_deallocate(J1)
   call gpu_set_stream(blas_handle, gpu_default_stream)
 
@@ -769,35 +717,20 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   call gpu_allocate(X_ovov,nO,nV,nO,nV)
   call gpu_allocate(Y_ovov,nO,nV,nO,nV)
 
-!TODO 
-  !$omp parallel &
-  !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) &
-  !$omp private(u,a,i,beta,gam) &
-  !$omp default(none)
-  !$omp do
-  do beta = 1, nV
-    do u = 1, nO
-      do a = 1, nV
-        do i = 1, nO
-          X_ovov%f(i,a,u,beta) = 0.5d0 * K1%f(u,a,i,beta)
-        enddo
-      enddo
+  do a=1, nV
+    call gpu_set_stream(blas_handle, stream(a))
+    do j=1,nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), &
+           nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO)
+    enddo
+    do beta=1, nV
+      call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), &
+           nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV)
     enddo
   enddo
-  !$omp end do nowait
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  !$omp do
-  do gam = 1, nV
-    do v = 1, nO
-      do a = 1, nV
-        do i = 1, nO
-          Y_ovov%f(i,a,v,gam) = t2%f(i,v,gam,a)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
+  call gpu_synchronize()
 
   call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, &
             -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
@@ -806,51 +739,23 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
 
   call gpu_synchronize()
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,Z_ovov) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
-  do gam = 1, nV
-    do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-           r2%f(u,v,beta,gam) = r2%f(u,v,beta,gam) + Z_ovov%f(u,beta,v,gam) + Z_ovov%f(v,gam,u,beta)
-        enddo
-      enddo
+  do gam=1, nV
+    call gpu_set_stream(blas_handle, stream(gam))
+    do j=1,nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), &
+           nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), &
+           nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), &
+           nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO)
+    enddo
+    do beta=1, nV
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
+           nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
 
-  !$omp parallel &
-  !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) &
-  !$omp private(u,v,gam,beta,i,a) &
-  !$omp default(none)
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      do gam = 1, nV
-        do u = 1, nO
-          X_ovov%f(u,gam,i,a) = K1%f(u,a,i,gam)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-
-  !$omp do
-  do beta = 1, nV
-    do v = 1, nO
-      do a = 1, nV
-        do i = 1, nO
-          Y_ovov%f(i,a,v,beta) = t2%f(i,v,beta,a)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
   call gpu_deallocate(K1)
 
@@ -865,22 +770,17 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   call gpu_deallocate(Y_ovov)
 
   ! Change the sign for consistency with the code in spin orbitals
-  !$omp parallel &
-  !$omp shared(nO,nV,r2,Z_ovov) &
-  !$omp private(u,v,gam,beta) &
-  !$omp default(none)
-  !$omp do
   do gam = 1, nV
+    call gpu_set_stream(blas_handle, stream(gam))
+    do j=1,nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV,  1.d0, r2%f(1,j,1,gam), &
+           nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO)
+    enddo
     do beta = 1, nV
-      do v = 1, nO
-        do u = 1, nO
-           r2%f(u,v,beta,gam) = -r2%f(u,v,beta,gam) + Z_ovov%f(u,gam,v,beta) + Z_ovov%f(v,beta,u,gam)
-        enddo
-      enddo
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), &
+           nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
 
   call gpu_deallocate(Z_ovov)
 
@@ -929,34 +829,42 @@ subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, &
   ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j)
   ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) &
 
-  call dgemm('N','N', nO, nO*nO*nO, nV, &
-             1d0, t1%f  , size(t1%f,1), &
-                  d_cc_space_v_vooo%f, size(d_cc_space_v_vooo%f,1), &
-             0d0, Y_oooo%f, size(Y_oooo%f,1))
+  call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, &
+             1d0, t1%f(1,1)  , size(t1%f,1), &
+                  d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), &
+             0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1))
 
-  !$omp parallel &
-  !$omp private(u,v,i,j) &
-  !$omp default(shared)
-  !$omp do collapse(2)
-  do j = 1, nO
-    do i = 1, nO
-      do v = 1, nO
-        do u = 1, nO
-          A1%f(u,v,i,j) = d_cc_space_v_oooo%f(u,v,i,j) + Y_oooo%f(v,u,j,i) + Y_oooo%f(u,v,i,j)
-        enddo
-      enddo
-    enddo
+  type(gpu_stream) :: stream(nO)
+
+  do i=1, nO
+    call gpu_stream_create(stream(i))
+  enddo
+
+  call gpu_synchronize()
+
+  do j = 1, nO
+    call gpu_set_stream(blas_handle, stream(j))
+    do i = 1, nO
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), &
+           nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO)
+    enddo
+    call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), &
+         nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO)
+  enddo
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+  do i=1, nO
+    call gpu_stream_destroy(stream(i))
   enddo
-  !$omp end do
-  !$omp end parallel
 
   call gpu_deallocate(Y_oooo)
 
   ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b)
-  call dgemm('N','N', nO*nO, nO*nO, nV*nV, &
-             1d0, tau%f     , size(tau%f,1) * size(tau%f,2), &
-                  d_cc_space_v_vvoo%f, size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), &
-             1d0, A1%f      , size(A1%f,1) * size(A1%f,2))
+  call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, &
+             1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), &
+                  d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), &
+             1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2))
+  call gpu_synchronize()
 
 end
 
@@ -998,364 +906,364 @@ end
 
 ! g_vir
 
-subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir)
+subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, &
+  d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir)
   use gpu
 
   implicit none
 
   integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV), H_vv(nV, nV)
-  double precision, intent(in)  :: t2(nO, nO, nV, nV)
-  double precision, intent(out) :: g_vir(nV, nV)
+  type(gpu_double2), intent(in)  :: t1, H_vv, d_cc_space_f_vo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+  type(gpu_double4), intent(in)  :: t2
+  type(gpu_double2), intent(out) :: g_vir
 
   integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
 
-  call dgemm('N','N',nV,nV,nO, &
-             -1d0, cc_space_f_vo , size(cc_space_f_vo,1), &
-                   t1   , size(t1,1), &
-              0d0, g_vir, size(g_vir,1))
+  type(gpu_stream) :: stream(max(nO,4))
 
-  double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:)
-  allocate(tmp_k(cholesky_mo_num))
-  call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
-
-  call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, &
-    cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
-    g_vir, nV*nV)
-  deallocate(tmp_k)
-
-  allocate(tmp_vo(cholesky_mo_num,nV,nO))
-  call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, &
-    cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV)
-
-  allocate(tmp_vo2(cholesky_mo_num,nO,nV))
-  do beta=1,nV
-    do i=1,nO
-      do k=1,cholesky_mo_num
-        tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i)
-      enddo
-    enddo
-  enddo
-  deallocate(tmp_vo)
-
-  do beta = 1, nV
-    do a = 1, nV
-      g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta)
-    enddo
+  do i=1,max(nO,4)
+    call gpu_stream_create(stream(i))
   enddo
 
-  call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, &
-     cc_space_v_ov_chol, cholesky_mo_num*nO, &
-     tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV)
+  call gpu_set_stream(blas_handle, stream(1))
+  call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, &
+             -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), &
+                   t1%f(1,1)   , size(t1%f,1), &
+              0d0, g_vir%f(1,1), size(g_vir%f,1))
+
+  type(gpu_double1) :: tmp_k
+  type(gpu_double3) :: tmp_vo, tmp_vo2
+
+  call gpu_allocate(tmp_k,cholesky_mo_num)
+
+  call gpu_set_stream(blas_handle, stream(2))
+  call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, &
+    d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, &
+    d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, &
+    g_vir%f(1,1), nV*nV)
+
+  call gpu_set_stream(blas_handle, stream(3))
+  call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO)
+
+  call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, &
+    d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV)
+
+  call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp_k)
+
+  do i=1,nO
+    call gpu_set_stream(blas_handle, stream(i))
+    call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), &
+         cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO)
+  enddo
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  do i=1,max(nO,4)
+    call gpu_stream_destroy(stream(i))
+  enddo
+  call gpu_deallocate(tmp_vo)
+
+  call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), &
+         nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, &
+     d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, &
+     tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp_vo2)
 
 end
 
 ! J1
 
-subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1)
+subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1)
   use gpu
   implicit none
 
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(in)  :: t2(nO, nO, nV, nV)
-  double precision, intent(in)  :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO)
-  double precision, intent(in)  :: v_vvoo(nV,nV,nO,nO)
-  double precision, intent(out) :: J1(nO, nV, nV, nO)
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1
+  type(gpu_double4), intent(in)  :: t2, v_ovvo, v_ovoo, v_vvoo
+  type(gpu_double4), intent(out) :: J1
+  type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol
 
   integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
 
-  double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:)
-  allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV))
+  type(gpu_double4) :: X_ovoo, Y_ovov
 
-  !$omp parallel &
-  !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) &
-  !$omp private(i,j,a,u,beta) &
-  !$omp default(none)
-  do i = 1, nO
-    !$omp do
-    do beta = 1, nV
-      do a = 1, nV
-        do u = 1, nO
-          J1(u,a,beta,i) = v_ovvo(u,a,beta,i)
-        enddo
-      enddo
-    enddo
-    !$omp end do nowait
-  enddo
+  call gpu_allocate(X_ovoo,nO,nV,nO,nO)
 
-  !$omp do collapse(2)
-  do j = 1, nO
-    do i = 1, nO
-      do a = 1, nV
-        do u = 1, nO
-          X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
+  type(gpu_stream) :: stream(nV)
 
-  call dgemm('N','N',nO*nV*nO,nV,nO, &
-            -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), &
-                  t1    , size(t1,1), &
-             0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3))
-
-  !$omp parallel &
-  !$omp shared(nO,nV,J1,Y_ovov) &
-  !$omp private(i,beta,a,u) &
-  !$omp default(none)
-  do i = 1, nO
-    !$omp do
-    do beta = 1, nV
-      do a = 1, nV
-        do u = 1, nO
-          J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta)
-        enddo
-      enddo
-    enddo
-    !$omp end do nowait
-  enddo
-  !$omp end parallel
-  deallocate(X_ovoo)
-
-  double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:)
-  allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO))
-
-  call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
-      cc_space_v_vv_chol, cholesky_mo_num*nV, &
-      t1, nO, &
-      0.d0, tmp_cc, cholesky_mo_num*nV)
-
-  call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, &
-      tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, &
-      0.d0, J1_tmp, nV*nO)
-
-  deallocate(tmp_cc)
 
   do i=1,nO
-    do b=1,nV
-      do a=1,nV
-        do u=1,nO
-          J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i)
-        enddo
-      enddo
-    enddo
-  enddo
-
-  deallocate(J1_tmp)
-
-  !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) &
-  double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:)
-  allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO))
-  !$omp parallel &
-  !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) &
-  !$omp private(i,beta,a,u,b,j) &
-  !$omp default(none)
-  !$omp do
-  do b = 1, nV
-    do j = 1, nO
-      do beta = 1, nV
-        do u = 1, nO
-          Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-
-  !$omp do
-  do b = 1, nV
-    do j = 1, nO
-      do i = 1, nO
-        do a = 1, nV
-          X_voov(a,i,j,b) = v_vvoo(a,b,i,j)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
-
-  call dgemm('N','T',nO*nV,nV*nO,nO*nV, &
-             -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
-                   X_voov, size(X_voov,1) * size(X_voov,2), &
-              0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2))
-  deallocate(X_voov)
-
-  double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:)
-  allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO))
-  !$omp parallel &
-  !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) &
-  !$omp private(i,beta,a,u,j,b) &
-  !$omp default(none)
-  do i = 1, nO
-    !$omp do
-    do beta = 1, nV
-      do a = 1, nV
-        do u = 1, nO
-          J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i)
-        enddo
-      enddo
-    enddo
-    !$omp end do nowait
-  enddo
-
-  !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b)
-  do j = 1, nO
-    !$omp do
-    do b = 1, nV
-      do i = 1, nO
-        do a = 1, nV
-          Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j))
-        enddo
-      enddo
-    enddo
-    !$omp end do nowait
+    call gpu_stream_create(stream(i))
   enddo
 
   do j = 1, nO
-    !$omp do
-    do b = 1, nV
-      do beta = 1, nV
-        do u = 1, nO
-          X_ovvo(u,beta,b,j) = t2(u,j,beta,b)
-        enddo
-      enddo
+    call gpu_set_stream(blas_handle, stream(j))
+    do i = 1, nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), &
+         nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO)
     enddo
-    !$omp end do nowait
   enddo
-  !$omp end parallel
 
-  call dgemm('N','T',nO*nV,nV*nO,nV*nO, &
-             1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), &
-                  Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), &
-             0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2))
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  do i=1,nO
+    call gpu_stream_destroy(stream(i))
+  enddo
+
+  call gpu_allocate(Y_ovov,nO,nV,nO,nV)
+
+  call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, &
+            -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), &
+                  t1%f(1,1)    , size(t1%f,1), &
+             0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3))
+
+
+  call gpu_copy(v_ovvo, J1)
+
+  call gpu_synchronize()
+
+  do a=1,nV
+    call gpu_stream_create(stream(a))
+  enddo
 
-  !$omp parallel &
-  !$omp shared(nO,nV,J1,Z_ovvo) &
-  !$omp private(i,beta,a,u) &
-  !$omp default(none)
   do i = 1, nO
-    !$omp do
     do beta = 1, nV
-      do a = 1, nV
-        do u = 1, nO
-          J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), &
+         nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO)
     enddo
-    !$omp end do nowait
   enddo
-  !$omp end parallel
 
-  deallocate(X_ovvo,Z_ovvo,Y_ovov)
+  call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO)
+  call gpu_allocate(J1_tmp,nV,nO,nV,nO)
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  type(gpu_double4) :: J1_tmp
+  type(gpu_double3) :: tmp_cc
+
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
+      d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, &
+      t1%f(1,1), nO, &
+      0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV)
+
+  call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, &
+      tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, &
+      0.d0, J1_tmp%f(1,1,1,1), nV*nO)
+
+
+  call gpu_deallocate(X_ovoo)
+
+  call gpu_synchronize()
+  call gpu_deallocate(tmp_cc)
+
+  do i = 1, nO
+    do a = 1, nV
+      call gpu_set_stream(blas_handle, stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), &
+         nO*nV, 1.d0, J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV)
+    enddo
+  enddo
+
+  type(gpu_double4) :: X_voov, Z_ovvo
+
+  call gpu_allocate(X_voov,nV,nO,nO,nV)
+  call gpu_allocate(Z_ovvo,nO,nV,nV,nO)
+
+  do j = 1, nO
+    do beta = 1, nV
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), &
+         nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO)
+    enddo
+  enddo
+
+  do b = 1, nV
+    call gpu_set_stream(blas_handle, stream(b))
+    call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), &
+         nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV)
+  enddo
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+
+  call gpu_synchronize()
+  call gpu_deallocate(J1_tmp)
+
+  call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, &
+             -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), &
+                   X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), &
+              0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2))
+
+  call gpu_synchronize()
+
+  do i = 1, nO
+    do a = 1, nV
+      call gpu_set_stream(blas_handle, stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), &
+          nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV)
+    enddo
+  enddo
+
+  type(gpu_double4) :: X_ovvo, Y_vovo
+  call gpu_allocate(Y_vovo,nV,nO,nV,nO)
+
+  do j = 1, nO
+    do i = 1, nO
+      call gpu_set_stream(blas_handle, stream(i))
+      call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), &
+          nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV)
+    enddo
+  enddo
+
+  call gpu_allocate(X_ovvo,nO,nV,nV,nO)
+
+  do j = 1, nO
+    do b = 1, nV
+      call gpu_set_stream(blas_handle, stream(b))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), &
+          nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO)
+    enddo
+  enddo
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+  call gpu_synchronize()
+  call gpu_deallocate(X_voov)
+
+  call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, &
+             1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), &
+                  Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), &
+             0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2))
+
+  call gpu_synchronize()
+
+  do i = 1, nO
+    do beta = 1, nV
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), &
+          nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO)
+    enddo
+  enddo
+
+  call gpu_set_stream(blas_handle, gpu_default_stream)
+  call gpu_deallocate(Y_ovov)
+  call gpu_deallocate(X_ovvo)
+
+  do a = 1, nV
+    call gpu_stream_destroy(stream(a))
+  enddo
+
+  call gpu_deallocate(Z_ovvo)
 
 end
 
 ! K1
 
-subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1)
+subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, &
+   d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1)
   use gpu
 
   implicit none
 
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV)
-  double precision, intent(in)  :: t2(nO, nO, nV, nV)
-  double precision, intent(in)  :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV)
-  double precision, intent(in)  :: v_ovoo(nO,nV,nO,nO)
-  double precision, intent(out) :: K1(nO, nV, nO, nV)
+  integer, intent(in)            :: nO,nV
+  type(gpu_double2), intent(in)  :: t1
+  type(gpu_double4), intent(in)  :: t2, v_vvoo, v_ovov, v_ovoo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
+  type(gpu_double4), intent(out) :: K1
 
-  double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:)
+  type(gpu_double4) :: X, Y, Z
 
   integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
 
-  allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO))
 
-  !$omp parallel &
-  !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) &
-  !$omp private(i,beta,a,u,j,b) &
-  !$omp default(none)
-  !$omp do
-  do beta = 1, nV
-    do i = 1, nO
-      do a = 1, nV
-        do u = 1, nO
-          K1(u,a,i,beta) = v_ovov(u,a,i,beta)
-        enddo
-      enddo
-    enddo
+  call gpu_copy(v_ovov, K1)
+
+  type(gpu_stream) :: stream(nV)
+  do a = 1, nV
+    call gpu_stream_create(stream(a))
   enddo
-  !$omp end do nowait
+
+  call gpu_allocate(X,nV,nO,nV,nO)
 
   do i = 1, nO
-    !$omp do
     do a = 1, nV
-      do j = 1, nO
-        do b = 1, nV
-          X(b,j,a,i) = - v_vvoo(b,a,i,j)
-        enddo
-      enddo
+      call gpu_set_stream(blas_handle, stream(a))
+      call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), &
+          nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV)
     enddo
-    !$omp end do nowait
   enddo
 
+  call gpu_allocate(Y,nO,nV,nV,nO)
+
   do j = 1, nO
-    !$omp do
-    do b = 1, nV
-      do beta = 1, nV
-        do u = 1, nO
-          Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)
-        enddo
-      enddo
+    do beta = 1, nV
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), &
+          nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y%f(1,beta,1,j), nO*nV)
     enddo
-    !$omp end do
   enddo
-  !$omp end parallel
 
-  call dgemm('N','N',nO*nV*nO,nV,nO, &
-            -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), &
-                  t1    , size(t1,1), &
-            1d0, K1    , size(K1,1) * size(K1,2) * size(K1,3))
+  call gpu_set_stream(blas_handle, gpu_default_stream)
 
-  double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:)
-  allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO))
+  call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, &
+            -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), &
+                  t1%f(1,1)    , size(t1%f,1), &
+            1d0, K1%f(1,1,1,1)    , size(K1%f,1) * size(K1%f,2) * size(K1%f,3))
 
-  call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, &
-    t1v, cholesky_mo_num*nO)
+  type(gpu_double4) :: K1tmp
+  type(gpu_double3) :: t1v
 
-  call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, &
-    t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, &
-    K1tmp, nO*nO)
+  call gpu_allocate(t1v,cholesky_mo_num,nO,nO)
+
+  call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
+    d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, &
+    t1v%f(1,1,1), cholesky_mo_num*nO)
+
+  call gpu_allocate(K1tmp,nO,nO,nV,nV)
+
+  call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, &
+    t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, &
+    K1tmp%f(1,1,1,1), nO*nO)
+
+  call gpu_allocate(Z,nO,nV,nV,nO)
+  call gpu_synchronize()
 
-  deallocate(t1v)
   ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i)
-  call dgemm('N','N',nV*nO,nO*nV,nV*nO, &
-             1d0, Y, size(Y,1) * size(Y,2), &
-                  X, size(X,1) * size(X,2), &
-             0d0, Z, size(Z,1) * size(Z,2))
+  call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, &
+             1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), &
+                  X%f(1,1,1,1), size(X%f,1) * size(X%f,2), &
+             0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2))
 
-  !$omp parallel &
-  !$omp shared(nO,nV,K1,Z,K1tmp) &
-  !$omp private(i,beta,a,u) &
-  !$omp default(none)
-  !$omp do
-   do beta = 1, nV
-    do i = 1, nO
-      do a = 1, nV
-        do u = 1, nO
-          K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i)
-        enddo
-      enddo
+  call gpu_synchronize()
+  call gpu_deallocate(t1v)
+
+   do i = 1, nO
+    do beta = 1, nV
+      call gpu_set_stream(blas_handle, stream(beta))
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), &
+          nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), &
+          nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO)
     enddo
   enddo
-  !$omp end do
-  !$omp end parallel
 
-  deallocate(K1tmp,X,Y,Z)
+  call gpu_deallocate(X)
+  call gpu_deallocate(Y)
+
+  do a = 1, nV
+    call gpu_stream_destroy(stream(a))
+  enddo
+
+  call gpu_deallocate(K1tmp)
+  call gpu_deallocate(Z)
 
 end

From 9ad69bb27dc4195ae2ae2c9ea2f280156c20366e Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 Jul 2024 03:27:54 +0200
Subject: [PATCH 115/131] GPU accelerated CCSD

---
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 75 +++++++++++++-------------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index a185df13..24fcc5af 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -507,7 +507,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
   call gpu_allocate(g_occ,nO,nO)
   call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, &
     d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir)
-  call compute_g_occ_chol(nO,nV,t1%f,t2%f,H_oo%f,g_occ%f)
+  call compute_g_occ_chol(nO,nV,t1,t2,H_oo, &
+    d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ)
 
   type(gpu_double4) :: Y_oovv
   call gpu_allocate(Y_oovv,nO,nO,nV,nV)
@@ -870,37 +871,42 @@ end
 
 ! g_occ
 
-subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ)
+subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, &
+  d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ)
   use gpu
 
   implicit none
 
   integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: t1(nO, nV), H_oo(nO, nO)
-  double precision, intent(in)  :: t2(nO, nO, nV, nV)
-  double precision, intent(out) :: g_occ(nO, nO)
+  type(gpu_double2), intent(in)  :: t1, H_oo, d_cc_space_f_vo
+  type(gpu_double3), intent(in)  :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol
+  type(gpu_double4), intent(in)  :: t2, d_cc_space_v_ovoo
+  type(gpu_double2), intent(out) :: g_occ
 
-  g_occ = H_oo
+  call gpu_copy(H_oo, g_occ)
 
-  call dgemm('N','N',nO,nO,nV, &
-             1d0, t1, size(t1,1), &
-                  cc_space_f_vo, size(cc_space_f_vo,1), &
-             1d0, g_occ, size(g_occ,1))
+  call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, &
+             1d0, t1%f(1,1), size(t1%f,1), &
+                  d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
+             1d0, g_occ%f(1,1), size(g_occ%f,1))
 
-  double precision, allocatable :: X(:)
-  allocate(X(cholesky_mo_num))
-  call dgemv('N',cholesky_mo_num,nO*nV,2.d0, &
-    cc_space_v_ov_chol, cholesky_mo_num, &
-    t1, 1, 0.d0, X, 1)
+  type(gpu_double1) :: X
+  call gpu_allocate(X,cholesky_mo_num)
 
-  call dgemv('T',cholesky_mo_num,nO*nO,1.d0, &
-    cc_space_v_oo_chol, cholesky_mo_num, &
-    X, 1, 1.d0, g_occ, 1)
-  deallocate(X)
+  call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, &
+    d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, &
+    t1%f(1,1), 1, 0.d0, X%f(1), 1)
 
-  call dgemv('T',nO*nV,nO*nO,-1.d0, &
-    cc_space_v_ovoo, nO*nV, &
-    t1, 1, 1.d0, g_occ, 1)
+  call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, &
+    d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, &
+    X%f(1), 1, 1.d0, g_occ%f(1,1), 1)
+
+  call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, &
+    d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, &
+    t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1)
+
+  call gpu_synchronize()
+  call gpu_deallocate(X)
 
 end
 
@@ -1193,22 +1199,15 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, &
   enddo
 
   call gpu_allocate(X,nV,nO,nV,nO)
-
-  do i = 1, nO
-    do a = 1, nV
-      call gpu_set_stream(blas_handle, stream(a))
-      call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), &
-          nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV)
-    enddo
-  enddo
-
   call gpu_allocate(Y,nO,nV,nV,nO)
 
-  do j = 1, nO
-    do beta = 1, nV
-      call gpu_set_stream(blas_handle, stream(beta))
-      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), &
-          nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y%f(1,beta,1,j), nO*nV)
+  do a = 1, nV
+    call gpu_set_stream(blas_handle, stream(a))
+    do i = 1, nO
+      call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), &
+          nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV)
+      call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), &
+          nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV)
     enddo
   enddo
 
@@ -1246,9 +1245,9 @@ subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, &
   call gpu_synchronize()
   call gpu_deallocate(t1v)
 
+  do beta = 1, nV
+   call gpu_set_stream(blas_handle, stream(beta))
    do i = 1, nO
-    do beta = 1, nV
-      call gpu_set_stream(blas_handle, stream(beta))
       call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), &
           nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO)
       call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), &

From dd9c6dcc03e6d24a78ed5651e5c825e81d68a9c6 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 Jul 2024 21:11:13 +0200
Subject: [PATCH 116/131] Introducing dpcpp

---
 configure                          |   8 +-
 plugins/local/gpu_intel/LIB        |   1 +
 plugins/local/gpu_intel/NEED       |   1 +
 plugins/local/gpu_intel/README.rst |   8 +
 plugins/local/gpu_intel/gpu.sycl   | 266 +++++++++++++++++++++++++++++
 5 files changed, 282 insertions(+), 2 deletions(-)
 create mode 100644 plugins/local/gpu_intel/LIB
 create mode 100644 plugins/local/gpu_intel/NEED
 create mode 100644 plugins/local/gpu_intel/README.rst
 create mode 100644 plugins/local/gpu_intel/gpu.sycl

diff --git a/configure b/configure
index 08dac444..3e3390e1 100755
--- a/configure
+++ b/configure
@@ -40,7 +40,7 @@ Usage:
   $(basename $0) -c <file>    
   $(basename $0) -h           
   $(basename $0) -i <package> 
-  $(basename $0) -g [nvidia|none]
+  $(basename $0) -g [nvidia|intel|none]
 
 Options:
   -c  <file>        Define a COMPILATION configuration file,
@@ -49,7 +49,7 @@ Options:
   -i <package>      INSTALL <package>. Use at your OWN RISK:
                     no support will be provided for the installation of
                     dependencies.
-  -g [nvidia|none]  Choose GPU acceleration (experimental)
+  -g [nvidia|intel|none]  Choose GPU acceleration
 
 Example:
   ./$(basename $0) -c config/gfortran.cfg
@@ -121,6 +121,10 @@ case "$GPU" in
      echo "Activating AMD GPU acceleration"
      ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
      ;;
+  intel) # Intel
+     echo "Activating Intel GPU acceleration"
+     ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
+     ;;
   nvidia) # Nvidia
      echo "Activating Nvidia GPU acceleration"
      ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB
new file mode 100644
index 00000000..027c35b0
--- /dev/null
+++ b/plugins/local/gpu_intel/LIB
@@ -0,0 +1 @@
+-lmkl_sycl -lsycl
diff --git a/plugins/local/gpu_intel/NEED b/plugins/local/gpu_intel/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/plugins/local/gpu_intel/NEED
@@ -0,0 +1 @@
+
diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst
new file mode 100644
index 00000000..3a4653de
--- /dev/null
+++ b/plugins/local/gpu_intel/README.rst
@@ -0,0 +1,8 @@
+=========
+gpu_intel
+=========
+
+Intel implementation of GPU routines. Uses MKL and SYCL.
+```bash
+dpcpp -O3 -c gpu.o gpu.sycl
+```
diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl
new file mode 100644
index 00000000..7b589490
--- /dev/null
+++ b/plugins/local/gpu_intel/gpu.sycl
@@ -0,0 +1,266 @@
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <limits>
+#include <oneapi/mkl/blas.hpp>
+
+extern "C" {
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t igpu) {
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, int64_t size) {
+    auto queue = sycl::queue(sycl::default_selector{});
+
+    try {
+        *ptr = sycl::malloc_shared(size, queue);
+        assert(*ptr != nullptr);
+    } catch (const sycl::exception& e) {
+        std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+        *ptr = nullptr; // If allocation fails, set pointer to nullptr
+    }
+}
+
+void gpu_deallocate(void** ptr) {
+    assert(*ptr != nullptr);
+    sycl::free(*ptr, sycl::queue(sycl::default_selector{}));
+    *ptr = nullptr;
+}
+
+/* Upload data from host to device */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+    sycl::queue queue(sycl::default_selector{});
+    queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
+}
+
+/* Download data from device to host */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+    sycl::queue queue(sycl::default_selector{});
+    queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
+}
+
+/* Copy data from one GPU memory location to another */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+    sycl::queue queue(sycl::default_selector{});
+    queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
+}
+
+/* Queues */
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+    *ptr = new sycl::queue(sycl::default_selector{});
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+    assert(*ptr != nullptr);
+    delete *ptr;
+    *ptr = nullptr;
+}
+
+To translate the CUDA functions related to stream management to SYCL, you will need to adapt to SYCL's approach to command groups and queues. SYCL uses queues to manage execution order and parallelism, similar to CUDA streams but integrated within the SYCL ecosystem.
+
+### Original CUDA Code
+
+```c
+/* Create a CUDA stream */
+void gpu_stream_create(cudaStream_t* ptr) {
+    cudaError_t rc = cudaStreamCreate(ptr);
+    assert(rc == cudaSuccess);
+}
+
+/* Destroy a CUDA stream */
+void gpu_stream_destroy(cudaStream_t* ptr) {
+    assert(ptr != NULL);
+    cudaError_t rc = cudaStreamDestroy(*ptr);
+    assert(rc == cudaSuccess);
+    *ptr = NULL;
+}
+
+/* Set a specific stream for cuBLAS operations */
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+    cublasSetStream(handle, stream);
+}
+
+/* Synchronize all streams */
+void gpu_synchronize() {
+    cudaDeviceSynchronize();
+}
+```
+
+### Translated SYCL Code
+
+```cpp
+#include <CL/sycl.hpp>
+#include <cassert>
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+    *ptr = new sycl::queue(sycl::default_selector{});
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+    *ptr->wait_and_throw();
+    assert(*ptr != nullptr);
+    delete *ptr;
+    *ptr = nullptr;
+}
+
+/* SYCL does not need an equivalent for setting a stream on a cuBLAS handle,
+   because each SYCL queue acts independently and can be used directly. */
+
+void gpu_synchronize() {
+    sycl::queue queue(sycl::default_selector{});
+    queue.wait_and_throw();
+}
+
+/* BLAS functions */
+
+typedef struct {
+  sycl::queue* queue;
+} blasHandle_t;
+
+void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
+  handle->queue = ptr;
+}
+
+void gpu_blas_create(blasHandle_t* ptr) {
+    *ptr = new blasHandle_t;
+    assert(*ptr != nullptr);
+    ptr->queue = new sycl::queue(sycl::default_selector{});
+    assert(ptr->queue != nullptr);
+}
+
+void gpu_blas_destroy(blasHandle_t* ptr) {
+    assert(*ptr != nullptr);
+    delete ptr->queue;
+    delete *ptr;
+    *ptr = nullptr;
+}
+
+
+void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, 
+              const double* y, const int64_t incy, double* result) {
+    // Ensure input parameters are valid
+    assert(handle != nullptr);
+    assert(handle->queue != nullptr);
+    assert(n > 0);
+    assert(incx > 0);
+    assert(incy > 0);
+    assert(x != nullptr);
+    assert(y != nullptr);
+    assert(result != nullptr);
+
+    // SYCL buffer for the result
+    sycl::buffer<double, 1> result_buf(result, sycl::range<1>(1));
+
+    sycl::queue& queue = handle->queue;
+
+    // Perform the dot product operation
+    queue.submit([&](sycl::handler& cgh) {
+        // Accessors for the buffers
+        auto result_acc = result_buf.get_access<sycl::access::mode::write>(cgh);
+
+        // This is an asynchronous call to compute dot product
+        cgh.single_task([=]() {
+            result_acc[0] = oneapi::mkl::blas::dot(cgh, n, x, incx, y, incy);
+        });
+    });
+
+}
+
+void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+    assert(handle != nullptr);
+    assert(handle->queue != nullptr);
+
+    // Validate matrix dimensions and increments to be positive
+    assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
+    assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Determine the operation type
+    oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
+    if (*transa == 'T' || *transa == 't') {
+        transa_ = oneapi::mkl::transpose::trans;
+    }
+
+    // Perform DGEMV operation using oneMKL
+    handle->queue->submit([&](sycl::handler& cgh) {
+        // Use accessors to ensure data consistency and dependency resolution
+        auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh);
+        auto x_acc = sycl::accessor(x, sycl::range(n * incx), sycl::read_only, cgh);
+        auto y_acc = sycl::accessor(y, sycl::range(m * incy), sycl::read_write, cgh);
+
+        cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) {
+            oneapi::mkl::blas::gemv(*handle->queue, transa_, m, n, *alpha, a_acc.get_pointer(), lda, x_acc.get_pointer(), incx, *beta, y_acc.get_pointer(), incy);
+        });
+    });
+
+}
+
+void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+    assert(handle != nullptr && handle->queue != nullptr);
+    assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
+    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Transpose operations
+    auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+    auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+
+    // Ensure queue is ready
+    handle->queue->submit([&](sycl::handler& cgh) {
+        // Accessors for matrices
+        auto a_acc = sycl::accessor(a, sycl::range<1>(m * lda), sycl::read_only, cgh);
+        auto b_acc = sycl::accessor(b, sycl::range<1>(k * ldb), sycl::read_only, cgh);
+        auto c_acc = sycl::accessor(c, sycl::range<1>(m * ldc), sycl::read_write, cgh);
+
+        cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) {
+            oneapi::mkl::blas::gemm(*handle->queue, transa_, transb_, m, n, k,
+                                    *alpha, a_acc.get_pointer(), lda,
+                                    b_acc.get_pointer(), ldb,
+                                    *beta, c_acc.get_pointer(), ldc);
+        });
+    });
+
+}
+
+void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+    assert(handle != nullptr && handle->queue != nullptr);
+    assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
+    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Determine transpose operations
+    bool transA = (*transa == 'T' || *transa == 't');
+    bool transB = (*transb == 'T' || *transb == 't');
+
+    handle->queue->submit([&](sycl::handler& cgh) {
+        auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh);
+        auto b_acc = sycl::accessor(b, sycl::range(n * ldb), sycl::read_only, cgh);
+        auto c_acc = sycl::accessor(c, sycl::range(m * ldc), sycl::read_write, cgh);
+
+        cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
+            int i = idx[0];
+            int j = idx[1];
+            int ai = transA ? j * lda + i : i * lda + j;
+            int bi = transB ? j * ldb + i : i * ldb + j;
+            int ci = i * ldc + j;
+
+            c_acc[ci] = (*alpha) * a_acc[ai] + (*beta) * b_acc[bi];
+        });
+    });
+
+}
+
+}  // extern C

From 44b8e22e7aebf4dd89874549eac8bb8aef2fb16d Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 Jul 2024 22:02:13 +0200
Subject: [PATCH 117/131] Fixed sycl

---
 plugins/local/gpu_intel/README.rst |   2 +-
 plugins/local/gpu_intel/gpu.sycl   | 139 ++++++-----------------------
 2 files changed, 26 insertions(+), 115 deletions(-)

diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst
index 3a4653de..d42e2557 100644
--- a/plugins/local/gpu_intel/README.rst
+++ b/plugins/local/gpu_intel/README.rst
@@ -4,5 +4,5 @@ gpu_intel
 
 Intel implementation of GPU routines. Uses MKL and SYCL.
 ```bash
-dpcpp -O3 -c gpu.o gpu.sycl
+icpx -fsycl gpu.cxx -c -qmkl=sequential
 ```
diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl
index 7b589490..1f9f89ce 100644
--- a/plugins/local/gpu_intel/gpu.sycl
+++ b/plugins/local/gpu_intel/gpu.sycl
@@ -18,7 +18,7 @@ void gpu_set_device(int32_t igpu) {
 /* Allocation functions */
 
 void gpu_allocate(void** ptr, int64_t size) {
-    auto queue = sycl::queue(sycl::default_selector{});
+    auto queue = sycl::queue(sycl::default_selector_v);
 
     try {
         *ptr = sycl::malloc_shared(size, queue);
@@ -31,25 +31,25 @@ void gpu_allocate(void** ptr, int64_t size) {
 
 void gpu_deallocate(void** ptr) {
     assert(*ptr != nullptr);
-    sycl::free(*ptr, sycl::queue(sycl::default_selector{}));
+    sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
     *ptr = nullptr;
 }
 
 /* Upload data from host to device */
 void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
-    sycl::queue queue(sycl::default_selector{});
+    sycl::queue queue(sycl::default_selector_v);
     queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
 }
 
 /* Download data from device to host */
 void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
-    sycl::queue queue(sycl::default_selector{});
+    sycl::queue queue(sycl::default_selector_v);
     queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
 }
 
 /* Copy data from one GPU memory location to another */
 void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
-    sycl::queue queue(sycl::default_selector{});
+    sycl::queue queue(sycl::default_selector_v);
     queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
 }
 
@@ -57,7 +57,7 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
 
 /* SYCL queue as a replacement for CUDA stream */
 void gpu_stream_create(sycl::queue** ptr) {
-    *ptr = new sycl::queue(sycl::default_selector{});
+    *ptr = new sycl::queue(sycl::default_selector_v);
 }
 
 void gpu_stream_destroy(sycl::queue** ptr) {
@@ -66,59 +66,8 @@ void gpu_stream_destroy(sycl::queue** ptr) {
     *ptr = nullptr;
 }
 
-To translate the CUDA functions related to stream management to SYCL, you will need to adapt to SYCL's approach to command groups and queues. SYCL uses queues to manage execution order and parallelism, similar to CUDA streams but integrated within the SYCL ecosystem.
-
-### Original CUDA Code
-
-```c
-/* Create a CUDA stream */
-void gpu_stream_create(cudaStream_t* ptr) {
-    cudaError_t rc = cudaStreamCreate(ptr);
-    assert(rc == cudaSuccess);
-}
-
-/* Destroy a CUDA stream */
-void gpu_stream_destroy(cudaStream_t* ptr) {
-    assert(ptr != NULL);
-    cudaError_t rc = cudaStreamDestroy(*ptr);
-    assert(rc == cudaSuccess);
-    *ptr = NULL;
-}
-
-/* Set a specific stream for cuBLAS operations */
-void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
-    cublasSetStream(handle, stream);
-}
-
-/* Synchronize all streams */
 void gpu_synchronize() {
-    cudaDeviceSynchronize();
-}
-```
-
-### Translated SYCL Code
-
-```cpp
-#include <CL/sycl.hpp>
-#include <cassert>
-
-/* SYCL queue as a replacement for CUDA stream */
-void gpu_stream_create(sycl::queue** ptr) {
-    *ptr = new sycl::queue(sycl::default_selector{});
-}
-
-void gpu_stream_destroy(sycl::queue** ptr) {
-    *ptr->wait_and_throw();
-    assert(*ptr != nullptr);
-    delete *ptr;
-    *ptr = nullptr;
-}
-
-/* SYCL does not need an equivalent for setting a stream on a cuBLAS handle,
-   because each SYCL queue acts independently and can be used directly. */
-
-void gpu_synchronize() {
-    sycl::queue queue(sycl::default_selector{});
+    sycl::queue queue(sycl::default_selector_v);
     queue.wait_and_throw();
 }
 
@@ -132,17 +81,17 @@ void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
   handle->queue = ptr;
 }
 
-void gpu_blas_create(blasHandle_t* ptr) {
-    *ptr = new blasHandle_t;
+void gpu_blas_create(blasHandle_t** ptr) {
+    *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
     assert(*ptr != nullptr);
-    ptr->queue = new sycl::queue(sycl::default_selector{});
-    assert(ptr->queue != nullptr);
+    (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
+    assert((*ptr)->queue != nullptr);
 }
 
-void gpu_blas_destroy(blasHandle_t* ptr) {
+void gpu_blas_destroy(blasHandle_t** ptr) {
     assert(*ptr != nullptr);
-    delete ptr->queue;
-    delete *ptr;
+    delete (*ptr)->queue;
+    free(*ptr);
     *ptr = nullptr;
 }
 
@@ -159,21 +108,7 @@ void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int6
     assert(y != nullptr);
     assert(result != nullptr);
 
-    // SYCL buffer for the result
-    sycl::buffer<double, 1> result_buf(result, sycl::range<1>(1));
-
-    sycl::queue& queue = handle->queue;
-
-    // Perform the dot product operation
-    queue.submit([&](sycl::handler& cgh) {
-        // Accessors for the buffers
-        auto result_acc = result_buf.get_access<sycl::access::mode::write>(cgh);
-
-        // This is an asynchronous call to compute dot product
-        cgh.single_task([=]() {
-            result_acc[0] = oneapi::mkl::blas::dot(cgh, n, x, incx, y, incy);
-        });
-    });
+    oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
 
 }
 
@@ -194,16 +129,7 @@ void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const
     }
 
     // Perform DGEMV operation using oneMKL
-    handle->queue->submit([&](sycl::handler& cgh) {
-        // Use accessors to ensure data consistency and dependency resolution
-        auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh);
-        auto x_acc = sycl::accessor(x, sycl::range(n * incx), sycl::read_only, cgh);
-        auto y_acc = sycl::accessor(y, sycl::range(m * incy), sycl::read_write, cgh);
-
-        cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) {
-            oneapi::mkl::blas::gemv(*handle->queue, transa_, m, n, *alpha, a_acc.get_pointer(), lda, x_acc.get_pointer(), incx, *beta, y_acc.get_pointer(), incy);
-        });
-    });
+    oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
 
 }
 
@@ -218,23 +144,12 @@ void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, con
     auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
     auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
 
-    // Ensure queue is ready
-    handle->queue->submit([&](sycl::handler& cgh) {
-        // Accessors for matrices
-        auto a_acc = sycl::accessor(a, sycl::range<1>(m * lda), sycl::read_only, cgh);
-        auto b_acc = sycl::accessor(b, sycl::range<1>(k * ldb), sycl::read_only, cgh);
-        auto c_acc = sycl::accessor(c, sycl::range<1>(m * ldc), sycl::read_write, cgh);
-
-        cgh.parallel_for(sycl::range(1), [=](sycl::id<1>) {
-            oneapi::mkl::blas::gemm(*handle->queue, transa_, transb_, m, n, k,
-                                    *alpha, a_acc.get_pointer(), lda,
-                                    b_acc.get_pointer(), ldb,
-                                    *beta, c_acc.get_pointer(), ldc);
-        });
-    });
+    oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
+                                    *alpha, a, lda, b, ldb, *beta, c, ldc);
 
 }
 
+
 void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
                const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
     assert(handle != nullptr && handle->queue != nullptr);
@@ -246,18 +161,14 @@ void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, con
     bool transB = (*transb == 'T' || *transb == 't');
 
     handle->queue->submit([&](sycl::handler& cgh) {
-        auto a_acc = sycl::accessor(a, sycl::range(m * lda), sycl::read_only, cgh);
-        auto b_acc = sycl::accessor(b, sycl::range(n * ldb), sycl::read_only, cgh);
-        auto c_acc = sycl::accessor(c, sycl::range(m * ldc), sycl::read_write, cgh);
-
         cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
-            int i = idx[0];
-            int j = idx[1];
-            int ai = transA ? j * lda + i : i * lda + j;
-            int bi = transB ? j * ldb + i : i * ldb + j;
-            int ci = i * ldc + j;
+            const int i = idx[0];
+            const int j = idx[1];
+            const int ai = transA ? j * lda + i : i * lda + j;
+            const int bi = transB ? j * ldb + i : i * ldb + j;
+            const int ci = i * ldc + j;
 
-            c_acc[ci] = (*alpha) * a_acc[ai] + (*beta) * b_acc[bi];
+            c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
         });
     });
 

From 6c275d54ef050ec8d210a35aa4bbb2c93d176f34 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 Jul 2024 22:14:19 +0200
Subject: [PATCH 118/131] Fix intent

---
 src/ccsd/ccsd_space_orb_sub_chol.irp.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
index 24fcc5af..6f65ea79 100644
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@@ -996,8 +996,8 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol
   integer, intent(in)            :: nO,nV
   type(gpu_double2), intent(in)  :: t1
   type(gpu_double4), intent(in)  :: t2, v_ovvo, v_ovoo, v_vvoo
+  type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol
   type(gpu_double4), intent(out) :: J1
-  type(gpu_double3), intent(out) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol
 
   integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
 

From f5cf674d7b4eb98637bde7eb07d1119cfeccc557 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 9 Jul 2024 23:04:22 +0200
Subject: [PATCH 119/131] Fix link stage for intel gpus

---
 configure                   | 4 ++--
 plugins/local/gpu_intel/LIB | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 3e3390e1..43ca9f6d 100755
--- a/configure
+++ b/configure
@@ -117,12 +117,12 @@ done
 # Handle GPU acceleration
 rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
-  amd) # Nvidia
+  amd) # AMD
      echo "Activating AMD GPU acceleration"
      ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
      ;;
   intel) # Intel
-     echo "Activating Intel GPU acceleration"
+     echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
      ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
      ;;
   nvidia) # Nvidia
diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB
index 027c35b0..199b0f1c 100644
--- a/plugins/local/gpu_intel/LIB
+++ b/plugins/local/gpu_intel/LIB
@@ -1 +1,2 @@
--lmkl_sycl -lsycl
+-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ 
+

From d219dc10267c0fe86fbae4683c00f5051229c8c0 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 11 Jul 2024 13:57:28 +0200
Subject: [PATCH 120/131] beginning to put cholesky in CASSCF

---
 plugins/local/spher_harm/spher_harm.irp.f     |   4 +-
 .../local/spher_harm/spher_harm_func.irp.f    |  13 +++
 src/casscf_cipsi/chol_bielec.irp.f            |  93 ++++++++++++++++
 src/casscf_cipsi/test_chol.irp.f              |  23 ++++
 src/mo_two_e_ints/cholesky.irp.f              |  31 ++++++
 src/mu_of_r/f_hf_cholesky.irp.f               | 100 ++++++++++++++++++
 src/mu_of_r/test_proj_op.irp.f                |  20 +++-
 7 files changed, 280 insertions(+), 4 deletions(-)
 create mode 100644 src/casscf_cipsi/chol_bielec.irp.f
 create mode 100644 src/casscf_cipsi/test_chol.irp.f

diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f
index 7a2eea06..e8deafb9 100644
--- a/plugins/local/spher_harm/spher_harm.irp.f
+++ b/plugins/local/spher_harm/spher_harm.irp.f
@@ -1,7 +1,7 @@
 program spher_harm
  implicit none
-! call test_spher_harm
+ call test_spher_harm
 ! call test_cart
- call test_brutal_spheric
+! call test_brutal_spheric
 end
 
diff --git a/plugins/local/spher_harm/spher_harm_func.irp.f b/plugins/local/spher_harm/spher_harm_func.irp.f
index 825bd8ac..f12c8fb9 100644
--- a/plugins/local/spher_harm/spher_harm_func.irp.f
+++ b/plugins/local/spher_harm/spher_harm_func.irp.f
@@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm)
  double precision :: theta, phi,r_abs
  call cartesian_to_spherical(r,theta,phi,r_abs)
  call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm)
+! call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
 end
 
 
@@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
   tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
   re_ylm = tmp * dcos(phi)
   im_ylm = tmp * dsin(phi)
+ else if (l==1.and.m==-1)then
+  tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) 
+  re_ylm = tmp * dcos(phi)
+  im_ylm = -tmp * dsin(phi)
  else if(l==1.and.m==0)then
   tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) 
   re_ylm = tmp 
@@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
   tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
   re_ylm = tmp * dcos(2.d0*phi)
   im_ylm = tmp * dsin(2.d0*phi)
+ else if(l==2.and.m==-2)then
+  tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
+  re_ylm = tmp * dcos(2.d0*phi)
+  im_ylm =-tmp * dsin(2.d0*phi)
  else if(l==2.and.m==1)then
   tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
   re_ylm = tmp * dcos(phi)
   im_ylm = tmp * dsin(phi)
+ else if(l==2.and.m==-1)then
+  tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
+  re_ylm = tmp * dcos(phi)
+  im_ylm =-tmp * dsin(phi)
  else if(l==2.and.m==0)then
   tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0)
   re_ylm = tmp
diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f
new file mode 100644
index 00000000..1fe985ad
--- /dev/null
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@@ -0,0 +1,93 @@
+
+BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)]
+ BEGIN_DOC
+ ! Cholesky vectors with ONE orbital on the active natural orbital basis 
+ END_DOC
+ implicit none
+ integer :: i_chol,i_act,i_mo,jj_act
+ double precision, allocatable :: chol_tmp(:,:)
+ allocate(chol_tmp(cholesky_mo_num,n_act_orb))
+ cholesky_no_1_idx_transp = 0.D0
+ do i_mo = 1, mo_num
+  ! Get all the integrals corresponding to the "i_mo"
+  do i_act = 1, n_act_orb
+   jj_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo)
+   enddo
+  enddo
+!  ! Do the matrix product 
+!  do i_act = 1, n_act_orb
+!   do jj_act = 1, n_act_orb
+!    do i_chol = 1, cholesky_mo_num
+!     cholesky_no_1_idx_transp(i_chol, i_act, i_mo) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) 
+!    enddo
+!   enddo
+!  enddo
+  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
+        chol_tmp, size(chol_tmp,1),                              &
+        natorbsCI, size(natorbsCI,1),                                              &
+        0.d0,                                                      &
+        cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1))
+ enddo
+
+END_PROVIDER 
+
+
+BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
+ BEGIN_DOC
+ ! Cholesky vectors with TWO orbital on the active natural orbital basis 
+ END_DOC
+ implicit none
+ integer :: i_chol,i_act,j_act,jj_act
+ double precision, allocatable :: chol_tmp(:,:)
+ allocate(chol_tmp(cholesky_mo_num,n_act_orb))
+ cholesky_no_2_idx_transp = 0.D0
+ do j_act = 1, n_act_orb
+  do i_act = 1, n_act_orb
+   do jj_act = 1, n_act_orb
+    do i_chol = 1, cholesky_mo_num
+     cholesky_no_2_idx_transp(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jj_act) * natorbsCI(jj_act,i_act) 
+    enddo
+   enddo
+  enddo
+ enddo
+
+END_PROVIDER 
+
+
+BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_dgemm, (cholesky_mo_num, n_act_orb, n_act_orb)]
+ BEGIN_DOC
+ ! Cholesky vectors with TWO orbital on the active natural orbital basis 
+ END_DOC
+ implicit none
+ integer :: i_chol,i_act,j_act,jj_act
+ double precision, allocatable :: chol_tmp(:,:)
+ allocate(chol_tmp(cholesky_mo_num,n_act_orb))
+ cholesky_no_2_idx_transp_dgemm = 0.D0
+ do j_act = 1, n_act_orb
+  ! Get all the integrals corresponding to the "j_act"
+  do i_act = 1, n_act_orb
+   jj_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    chol_tmp(i_chol, i_act) = cholesky_no_1_idx_transp(i_chol, j_act, jj_act)
+   enddo
+  enddo
+!  ! Do the matrix product 
+!  do i_act = 1, n_act_orb
+!   do jj_act = 1, n_act_orb
+!    do i_chol = 1, cholesky_mo_num
+!     cholesky_no_1_idx_transp(i_chol, i_act, j_act) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) 
+!    enddo
+!   enddo
+!  enddo
+  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
+        chol_tmp, size(chol_tmp,1),                              &
+        natorbsCI, size(natorbsCI,1),                                              &
+        0.d0,                                                      &
+        cholesky_no_2_idx_transp_dgemm(1,1,j_act), size(cholesky_no_2_idx_transp_dgemm,1))
+ enddo
+
+END_PROVIDER 
+
+
diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f
new file mode 100644
index 00000000..b94851f9
--- /dev/null
+++ b/src/casscf_cipsi/test_chol.irp.f
@@ -0,0 +1,23 @@
+program test_chol
+ implicit none
+ read_wf= .True.
+ touch read_wf 
+ call routine
+
+end
+
+subroutine routine
+ implicit none
+ integer :: i_chol, i_act, i_mo
+ double precision :: accu
+ accu = 0.d0
+ do i_mo = 1, n_act_orb
+  do i_act = 1, n_act_orb
+   do i_chol = 1, cholesky_mo_num
+    accu += dabs(cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp(i_chol,i_act,i_mo))
+    print*,cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp(i_chol,i_act,i_mo)
+   enddo
+  enddo
+ enddo
+ print*,'accu =', accu
+end
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 7e2c8b37..1fed949d 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
 
 END_PROVIDER
 
+
+BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors in MO basis
+ END_DOC
+
+ double precision, allocatable :: X(:,:,:)
+ double precision :: wall0, wall1
+ integer :: ierr
+ print *, 'Semi AO->MO Transformation of Cholesky vectors'
+  call wall_time(wall0)
+
+ allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
+ if (ierr /= 0) then
+   print *, irp_here, ': Allocation failed'
+ endif
+ integer :: i_chol, i_mo, j_mo, i_ao 
+ cholesky_semi_mo_transp_simple = 0.d0
+ do i_mo = 1, mo_num
+  do i_ao = 1, ao_num
+   do j_mo = 1, mo_num
+    do i_chol = 1, cholesky_mo_num
+     cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
+    enddo
+   enddo
+  enddo
+ enddo
+
+END_PROVIDER
+
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 5dd69eb6..179b80dd 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
  endif
 END_PROVIDER 
 
+BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint,m,mm,i,ii,p
+ !!f(R) =  \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
+ !!     =  \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
+ !!     =  \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
+ !!     =  \sum_A V_AR G_AR 
+ !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
+ double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
+ double precision :: thresh_1,thresh_2
+ double precision, allocatable :: accu_vec(:),delta_vec(:)
+ thresh_2 = ao_cholesky_threshold * 100.d0
+ thresh_1 = dsqrt(thresh_2)
+ provide cholesky_mo_transp
+ if(elec_alpha_num == elec_beta_num)then
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num) 
+  allocate(accu_vec(cholesky_mo_num))
+  !$OMP DO 
+   do ipoint = 1, n_points_final_grid
+    f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+     accu_vec = 0.d0
+     do ii = 1, n_occ_val_orb_for_hf(1)
+      i = list_valence_orb_for_hf(ii,1)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
+      do mm = 1, ao_num ! electron 1 
+       mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1
+       if(dabs(mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i)
+       enddo
+      enddo
+     enddo
+     do p = 1, cholesky_mo_num
+      f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p)
+     enddo
+    f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+   enddo
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
+  
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ else
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE)                                      &
+  !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & 
+  !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & 
+  !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb) 
+  allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
+  !$OMP DO 
+   do ipoint = 1, n_points_final_grid
+    f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+     accu_vec = 0.d0
+     do ii = 1, n_occ_val_orb_for_hf(2)
+      i = list_valence_orb_for_hf(ii,2)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
+      enddo
+     enddo
+     delta_vec = 0.d0
+     do ii =  n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
+      i = list_valence_orb_for_hf(ii,1)
+      mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+      if(dabs(mo_i_r1).lt.thresh_1)cycle
+      do mm = 1, n_basis_orb ! electron 1 
+       m = list_basis(mm)
+       mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+       if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+       do p = 1, cholesky_mo_num
+        delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+       enddo
+      enddo
+     enddo
+     do p = 1, cholesky_mo_num
+      f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
+     enddo
+    f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+   enddo
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ endif
+END_PROVIDER 
+
+
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
  implicit none
  integer :: ipoint,i,ii
diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f
index f9aba094..fd5e976b 100644
--- a/src/mu_of_r/test_proj_op.irp.f
+++ b/src/mu_of_r/test_proj_op.irp.f
@@ -15,7 +15,23 @@ program projected_operators
 !  call test_f_HF_valence_ab
 !  call routine_full_mos
 !   call test_f_ii_valence_ab
-   call test_f_ia_valence_ab
-  call test_f_ii_ia_aa_valence_ab
+!   call test_f_ia_valence_ab
+!  call test_f_ii_ia_aa_valence_ab
+ call test
 end
 
+
+subroutine test
+ implicit none
+ integer :: i_point
+ double precision :: ref, new, accu, weight
+ accu = 0.d0
+ do i_point = 1, n_points_final_grid
+  ref = f_hf_cholesky_sparse(i_point)
+  new = f_hf_cholesky_sparse_bis(i_point)
+  weight = final_weight_at_r_vector(i_point)
+  accu += dabs(ref - new) * weight
+ enddo
+ print*,'accu = ',accu
+
+end

From 31ec3ace0540177c2476e478d763a068d73bd41b Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 11 Jul 2024 14:22:27 +0200
Subject: [PATCH 121/131] correct transformation of cholesky vectors on the NO
 basis

---
 src/casscf_cipsi/chol_bielec.irp.f | 41 +++++++++++++-----------------
 src/casscf_cipsi/test_chol.irp.f   | 14 +++++++---
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f
index 1fe985ad..3104fe5f 100644
--- a/src/casscf_cipsi/chol_bielec.irp.f
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@@ -34,20 +34,21 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_
 END_PROVIDER 
 
 
-BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
+BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_old, (cholesky_mo_num, n_act_orb, n_act_orb)]
  BEGIN_DOC
  ! Cholesky vectors with TWO orbital on the active natural orbital basis 
  END_DOC
  implicit none
- integer :: i_chol,i_act,j_act,jj_act
+ integer :: i_chol,i_act,j_act,jj_act,jjj_act
  double precision, allocatable :: chol_tmp(:,:)
  allocate(chol_tmp(cholesky_mo_num,n_act_orb))
- cholesky_no_2_idx_transp = 0.D0
- do j_act = 1, n_act_orb
-  do i_act = 1, n_act_orb
-   do jj_act = 1, n_act_orb
+ cholesky_no_2_idx_transp_old = 0.D0
+ do jj_act = 1, n_act_orb
+  jjj_act = list_act(jj_act)
+  do j_act = 1, n_act_orb
+   do i_act = 1, n_act_orb
     do i_chol = 1, cholesky_mo_num
-     cholesky_no_2_idx_transp(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jj_act) * natorbsCI(jj_act,i_act) 
+     cholesky_no_2_idx_transp_old(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jjj_act) * natorbsCI(jj_act,j_act) 
     enddo
    enddo
   enddo
@@ -56,36 +57,28 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_
 END_PROVIDER 
 
 
-BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_dgemm, (cholesky_mo_num, n_act_orb, n_act_orb)]
+BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
  BEGIN_DOC
  ! Cholesky vectors with TWO orbital on the active natural orbital basis 
  END_DOC
  implicit none
  integer :: i_chol,i_act,j_act,jj_act
- double precision, allocatable :: chol_tmp(:,:)
- allocate(chol_tmp(cholesky_mo_num,n_act_orb))
- cholesky_no_2_idx_transp_dgemm = 0.D0
- do j_act = 1, n_act_orb
+ double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:)
+ allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb))
+ cholesky_no_2_idx_transp = 0.D0
+ do i_act = 1, n_act_orb
   ! Get all the integrals corresponding to the "j_act"
-  do i_act = 1, n_act_orb
-   jj_act = list_act(i_act)
+  do j_act = 1, n_act_orb
+   jj_act = list_act(j_act)
    do i_chol = 1, cholesky_mo_num
-    chol_tmp(i_chol, i_act) = cholesky_no_1_idx_transp(i_chol, j_act, jj_act)
+    chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act)
    enddo
   enddo
-!  ! Do the matrix product 
-!  do i_act = 1, n_act_orb
-!   do jj_act = 1, n_act_orb
-!    do i_chol = 1, cholesky_mo_num
-!     cholesky_no_1_idx_transp(i_chol, i_act, j_act) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) 
-!    enddo
-!   enddo
-!  enddo
   call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
         chol_tmp, size(chol_tmp,1),                              &
         natorbsCI, size(natorbsCI,1),                                              &
         0.d0,                                                      &
-        cholesky_no_2_idx_transp_dgemm(1,1,j_act), size(cholesky_no_2_idx_transp_dgemm,1))
+        cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
  enddo
 
 END_PROVIDER 
diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f
index b94851f9..8d978817 100644
--- a/src/casscf_cipsi/test_chol.irp.f
+++ b/src/casscf_cipsi/test_chol.irp.f
@@ -9,15 +9,21 @@ end
 subroutine routine
  implicit none
  integer :: i_chol, i_act, i_mo
- double precision :: accu
+ double precision :: accu,error,exact
  accu = 0.d0
  do i_mo = 1, n_act_orb
   do i_act = 1, n_act_orb
    do i_chol = 1, cholesky_mo_num
-    accu += dabs(cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp(i_chol,i_act,i_mo))
-    print*,cholesky_no_2_idx_transp_dgemm(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp(i_chol,i_act,i_mo)
+    error = dabs(cholesky_no_2_idx_transp(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo))
+    exact = dabs(cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo))
+    accu += error
+    if(exact.gt.1.d-10)then
+     if(error/exact.gt.1.d-7)then
+       write(*,'(4(E16.10,X))')cholesky_no_2_idx_transp(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo),error,error/exact
+     endif
+   endif
    enddo
   enddo
  enddo
- print*,'accu =', accu
+ print*,'accu =', accu/(dble(n_act_orb*n_act_orb*cholesky_mo_num))
 end

From 56450ed0436c4c1be26f33757ea3a4ca35238b57 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Thu, 11 Jul 2024 19:09:20 +0200
Subject: [PATCH 122/131] introduced functions mimicking the arrays

---
 src/casscf_cipsi/chol_bielec.irp.f | 158 ++++++++++++++++++++++++++++-
 src/casscf_cipsi/test_chol.irp.f   |  92 ++++++++++++++---
 2 files changed, 234 insertions(+), 16 deletions(-)

diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f
index 3104fe5f..94a76453 100644
--- a/src/casscf_cipsi/chol_bielec.irp.f
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@@ -33,7 +33,6 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_
 
 END_PROVIDER 
 
-
 BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_old, (cholesky_mo_num, n_act_orb, n_act_orb)]
  BEGIN_DOC
  ! Cholesky vectors with TWO orbital on the active natural orbital basis 
@@ -83,4 +82,161 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_
 
 END_PROVIDER 
 
+BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors defined on all basis including the NO basis
+ END_DOC
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt
+ ! Block when two orbitals belong to the core/inact 
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact)
+   enddo
+  enddo
+ enddo
 
+ ! Block when one orbitals belongs to the core/inact and one belongs to the active
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num 
+    cholesky_no_total_transp(i_chol,ii_act,j_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num 
+    cholesky_no_total_transp(i_chol,j_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+ 
+ ! Block when two orbitals belong to the active 
+ do j_act = 1, n_act_orb
+  jj_act = list_act(j_act)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num 
+    cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act)
+   enddo
+  enddo
+ enddo
+
+ ! Block when two orbitals belong to the virtuals 
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do j_virt = 1, n_virt_orb
+   jj_virt = list_virt(j_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt)
+   enddo
+  enddo
+ enddo
+ 
+ ! Block when one orbital is in active and the other in the virtuals 
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
+ ! Block when one orbital is in the virtual and one in the core-inact 
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_core_inact = 1, n_core_inact_orb
+  ii_core_inact = list_core_inact(i_core_inact)
+  do i_virt = 1, n_virt_orb
+   ii_virt = list_virt(i_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact)
+   enddo
+  enddo
+ enddo
+END_PROVIDER 
+
+
+double precision function bielec_no_basis_chol(i_1,j_1,i_2,j_2)
+ implicit none
+ integer, intent(in) :: i_1,j_1,i_2,j_2
+  BEGIN_DOC
+  ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs
+  ! 
+  END_DOC
+  integer :: i_chol 
+ bielec_no_basis_chol = 0.d0
+ do i_chol = 1, cholesky_mo_num
+  bielec_no_basis_chol += cholesky_no_total_transp(i_chol,i_1, j_1) * cholesky_no_total_transp(i_chol,i_2,j_2)
+ enddo
+end
+
+double precision function bielec_PQxx_no_chol(i_mo, j_mo, i_ca, j_ca)
+ implicit none
+ BEGIN_DOC
+ ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition 
+ ! 
+ ! indices are unshifted orbital numbers
+ END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis_chol
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx_no_chol = bielec_no_basis_chol(i_mo,j_mo,ii_ca,jj_ca)
+
+end
+
+double precision function bielec_PxxQ_no_chol(i_mo, j_ca, i_ca, j_mo)
+ implicit none 
+  BEGIN_DOC
+  ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition 
+  ! 
+  ! indices are unshifted orbital numbers
+  END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis_chol
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ_no_chol = bielec_no_basis_chol(i_mo, jj_ca, ii_ca, j_mo)
+
+end
+
+double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
+ implicit none 
+ integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
+ integer :: ii_ca, jj_ca, kk_ca
+ double precision :: bielec_no_basis_chol
+ ii_ca = list_act(i_ca)
+ jj_ca = list_act(j_ca)
+ kk_ca = list_act(k_ca)
+ bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
+
+end
diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f
index 8d978817..87c5c352 100644
--- a/src/casscf_cipsi/test_chol.irp.f
+++ b/src/casscf_cipsi/test_chol.irp.f
@@ -2,28 +2,90 @@ program test_chol
  implicit none
  read_wf= .True.
  touch read_wf 
- call routine
+! call routine_bielec_PxxQ_no
+ call routine_bielecCI_no
 
 end
 
-subroutine routine
+subroutine routine_bielec_PQxx_no
  implicit none
- integer :: i_chol, i_act, i_mo
- double precision :: accu,error,exact
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
+ double precision :: exact, new, error, accu, bielec_no_basis_chol
+ double precision :: bielec_PQxx_no_chol
+ 
  accu = 0.d0
- do i_mo = 1, n_act_orb
-  do i_act = 1, n_act_orb
-   do i_chol = 1, cholesky_mo_num
-    error = dabs(cholesky_no_2_idx_transp(i_chol,i_act,i_mo) - cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo))
-    exact = dabs(cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo))
-    accu += error
-    if(exact.gt.1.d-10)then
-     if(error/exact.gt.1.d-7)then
-       write(*,'(4(E16.10,X))')cholesky_no_2_idx_transp(i_chol,i_act,i_mo) , cholesky_no_2_idx_transp_old(i_chol,i_act,i_mo),error,error/exact
+ do i_core_inact = 1, n_core_inact_act_orb
+  ii_core_inact = list_core_inact_act(i_core_inact)
+  do j_core_inact = 1, n_core_inact_act_orb
+   jj_core_inact = list_core_inact_act(j_core_inact)
+   do i_mo = 1, mo_num
+    do j_mo = 1, mo_num
+     exact = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) 
+!     new   = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) 
+     new   = bielec_PQxx_no_chol(j_mo,i_mo, j_core_inact, i_core_inact) 
+     error = dabs(exact-new)
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
      endif
-   endif
+     accu += error
+    enddo
    enddo
   enddo
  enddo
- print*,'accu =', accu/(dble(n_act_orb*n_act_orb*cholesky_mo_num))
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
+
+subroutine routine_bielec_PxxQ_no
+ implicit none
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
+ double precision :: exact, new, error, accu, bielec_no_basis_chol
+ double precision :: bielec_PxxQ_no_chol
+ 
+ accu = 0.d0
+ do i_mo = 1, mo_num
+  do i_core_inact = 1, n_core_inact_act_orb
+  ii_core_inact = list_core_inact_act(i_core_inact)
+   do j_core_inact = 1, n_core_inact_act_orb
+   jj_core_inact = list_core_inact_act(j_core_inact)
+    do j_mo = 1, mo_num
+     exact = bielec_PxxQ_no(j_mo, j_core_inact,  i_core_inact,i_mo) 
+!     new   = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) 
+     new   = bielec_PxxQ_no_chol(j_mo, j_core_inact,  i_core_inact,i_mo) 
+     error = dabs(exact-new)
+     accu += error
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
+
+subroutine routine_bielecCI_no
+ implicit none
+ integer :: i_ca, j_ca, k_ca, i_mo
+ double precision :: exact, new, error, accu, bielec_no_basis_chol
+ double precision :: bielecCI_no_chol
+ 
+ accu = 0.d0
+ do i_mo = 1, mo_num
+  do i_ca = 1, n_act_orb
+   do j_ca = 1, n_act_orb
+    do k_ca = 1, n_act_orb 
+     exact =bielecCI_no(k_ca, j_ca, i_ca, i_mo)
+     new   = bielecCI_no_chol(k_ca, j_ca, i_ca, i_mo)
+     error = dabs(exact-new)
+     accu += error
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
 end

From 505d10084c8331c36ea6a2244bdc47ea8fb33a81 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 12 Jul 2024 16:19:53 +0200
Subject: [PATCH 123/131] Choleskization of the CASSCF

---
 src/casscf_cipsi/bielec.irp.f        |  61 +++++++++-----
 src/casscf_cipsi/bielec_natorb.irp.f |  68 ++++++++++-----
 src/casscf_cipsi/casscf.irp.f        |   2 +-
 src/casscf_cipsi/chol_bielec.irp.f   | 118 ++++++++++++++-------------
 src/casscf_cipsi/chol_garb.irp.f     |  34 ++++++++
 src/casscf_cipsi/gradient.irp.f      |   1 +
 src/casscf_cipsi/hessian.irp.f       |   6 ++
 src/casscf_cipsi/mcscf_fock.irp.f    |   2 +
 src/casscf_cipsi/test_chol.irp.f     |  63 +++++++++-----
 src/casscf_cipsi/tot_en.irp.f        |   1 +
 10 files changed, 238 insertions(+), 118 deletions(-)
 create mode 100644 src/casscf_cipsi/chol_garb.irp.f

diff --git a/src/casscf_cipsi/bielec.irp.f b/src/casscf_cipsi/bielec.irp.f
index 0a44f994..a4901985 100644
--- a/src/casscf_cipsi/bielec.irp.f
+++ b/src/casscf_cipsi/bielec.irp.f
@@ -1,18 +1,25 @@
-BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
+BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
   BEGIN_DOC
-  ! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PQxx
+  !
+  ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active
   ! indices are unshifted orbital numbers
   END_DOC
   implicit none
   integer                        :: i,j,ii,jj,p,q,i3,j3,t3,v3
   real*8                         :: mo_two_e_integral
+  print*,''
+  print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
   
-  bielec_PQxx(:,:,:,:) = 0.d0
+  bielec_PQxx_array(:,:,:,:) = 0.d0
   PROVIDE mo_two_e_integrals_in_map
   
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP PRIVATE(i,ii,j,jj,i3,j3) &
-  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, &
+  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, &
   !$OMP  n_act_orb,mo_integrals_map,list_act)
 
   !$OMP DO
@@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
     ii=list_core_inact(i)
     do j=i,n_core_inact_orb
       jj=list_core_inact(j)
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map)
-      bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map)
+      bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j)
     end do
     do j=1,n_act_orb
       jj=list_act(j)
       j3=j+n_core_inact_orb
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map)
-      bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map)
+      bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3)
     end do
   end do
   !$OMP END DO
@@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
     do j=i,n_act_orb
       jj=list_act(j)
       j3=j+n_core_inact_orb
-      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map)
-      bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3)
+      call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map)
+      bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3)
     end do
   end do
   !$OMP END DO
@@ -52,9 +59,13 @@ END_PROVIDER
 
 
 
-BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
+BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
   BEGIN_DOC
-  ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PxxQ
+  !
+  ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active
   ! indices are unshifted orbital numbers
   END_DOC
   implicit none
@@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
   double precision, allocatable  :: integrals_array(:,:)
   real*8                         :: mo_two_e_integral
   
+  print*,''
+  print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
   PROVIDE mo_two_e_integrals_in_map
-  bielec_PxxQ = 0.d0
+  bielec_PxxQ_array = 0.d0
 
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) &
-  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, &
+  !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, &
   !$OMP  n_act_orb,mo_integrals_map,list_act)
 
   allocate(integrals_array(mo_num,mo_num))
@@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
       call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
       do q=1,mo_num
         do p=1,mo_num
-          bielec_PxxQ(p,i,j,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j,i,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p)
         end do
       end do
     end do
@@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
       call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
       do q=1,mo_num
         do p=1,mo_num
-          bielec_PxxQ(p,i,j3,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j3,i,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p)
         end do
       end do
     end do
@@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
       call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
       do q=1,mo_num
         do p=1,mo_num
-          bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q)
-          bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p)
+          bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q)
+          bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p)
         end do
       end do
     end do
@@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
   BEGIN_DOC
   ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active
   ! index p runs over the whole basis, t,u,v only over the active orbitals
+  ! 
+  ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
   END_DOC
   implicit none
   integer                        :: i,j,k,p,t,u,v
   double precision, external     :: mo_two_e_integral
+  double precision :: wall0, wall1 
+  call wall_time(wall0)
+  print*,'Providing bielecCI'
   PROVIDE mo_two_e_integrals_in_map
   
   !$OMP PARALLEL DO DEFAULT(NONE) &
@@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
     end do
   end do
   !$OMP END PARALLEL DO
+  call wall_time(wall1)
+  print*,'Time to provide bielecCI = ',wall1 - wall0
 
 END_PROVIDER
diff --git a/src/casscf_cipsi/bielec_natorb.irp.f b/src/casscf_cipsi/bielec_natorb.irp.f
index 9968530c..99734a0b 100644
--- a/src/casscf_cipsi/bielec_natorb.irp.f
+++ b/src/casscf_cipsi/bielec_natorb.irp.f
@@ -1,30 +1,38 @@
- BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
+ BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
   BEGIN_DOC
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PQxx_no
+  !
   ! integral (pq|xx) in the basis of natural MOs
   ! indices are unshifted orbital numbers
+  ! 
   END_DOC
   implicit none
   integer                        :: i,j,k,l,t,u,p,q
   double precision, allocatable  :: f(:,:,:), d(:,:,:)
 
+  print*,''
+  print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
 
 
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP PRIVATE(j,k,l,p,d,f) &
   !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
-  !$OMP   bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI)
+  !$OMP   bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI)
 
   allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), &
       d(n_act_orb,mo_num,n_core_inact_act_orb))
 
   !$OMP DO
   do l=1,n_core_inact_act_orb
-    bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l)
+    bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l)
 
     do k=1,n_core_inact_act_orb
       do j=1,mo_num
         do p=1,n_act_orb
-          f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l)
+          f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l)
         end do
       end do
     end do
@@ -36,13 +44,13 @@
     do k=1,n_core_inact_act_orb
       do j=1,mo_num
         do p=1,n_act_orb
-          bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k)
+          bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k)
         end do
       end do
 
       do j=1,mo_num
         do p=1,n_act_orb
-          f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l)
+          f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l)
         end do
       end do
     end do
@@ -54,7 +62,7 @@
     do k=1,n_core_inact_act_orb
       do p=1,n_act_orb
         do j=1,mo_num
-          bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k)
+          bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k)
         end do
       end do
     end do
@@ -71,7 +79,7 @@
     do p=1,n_act_orb
       do k=1,mo_num
         do j=1,mo_num
-          f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l)
+          f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)
         end do
       end do
     end do
@@ -83,7 +91,7 @@
     do p=1,n_act_orb
       do k=1,mo_num
         do j=1,mo_num
-          bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p)
+          bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p)
         end do
       end do
     end do
@@ -97,7 +105,7 @@
     do p=1,n_act_orb
       do k=1,mo_num
         do j=1,mo_num
-          f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p)
+          f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)
         end do
       end do
     end do
@@ -109,7 +117,7 @@
     do p=1,n_act_orb
       do k=1,mo_num
         do j=1,mo_num
-          bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
+          bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
         end do
       end do
     end do
@@ -123,8 +131,12 @@ END_PROVIDER
 
 
 
-BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
+BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
   BEGIN_DOC
+  ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! 
+  ! 
+  ! Replaced by the Cholesky-based function bielec_PxxQ_no
+  !
   ! integral (px|xq) in the basis of natural MOs
   ! indices are unshifted orbital numbers
   END_DOC
@@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
   integer                        :: i,j,k,l,t,u,p,q
   double precision, allocatable  :: f(:,:,:), d(:,:,:)
 
+  print*,''
+  print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
+  print*,''
+
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP PRIVATE(j,k,l,p,d,f) &
   !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
-  !$OMP   bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI)
+  !$OMP   bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI)
 
 
   allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), &
@@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
 
   !$OMP DO
   do j=1,mo_num
-    bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j)
+    bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j)
     do l=1,n_core_inact_act_orb
       do k=1,n_core_inact_act_orb
         do p=1,n_act_orb
-            f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j)
+            f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j)
         end do
       end do
     end do
@@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do l=1,n_core_inact_act_orb
       do k=1,n_core_inact_act_orb
         do p=1,n_act_orb
-          bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l)
+          bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l)
         end do
       end do
     end do
@@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do l=1,n_core_inact_act_orb
       do j=1,mo_num
         do p=1,n_act_orb
-          f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)
+          f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)
         end do
       end do
     end do
@@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do l=1,n_core_inact_act_orb
       do j=1,mo_num
         do p=1,n_act_orb
-          bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l)
+          bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l)
         end do
       end do
     end do
@@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do p=1,n_act_orb
       do l=1,n_core_inact_act_orb
         do j=1,mo_num
-          f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)
+          f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)
         end do
       end do
     end do
@@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do p=1,n_act_orb
       do l=1,n_core_inact_act_orb
         do j=1,mo_num
-          bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p)
+          bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p)
         end do
       end do
     end do
@@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do p=1,n_act_orb
       do k=1,n_core_inact_act_orb
         do j=1,mo_num
-          f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)
+          f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)
         end do
       end do
     end do
@@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
     do p=1,n_act_orb
       do k=1,n_core_inact_act_orb
         do j=1,mo_num
-          bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
+          bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
         end do
       end do
     end do
@@ -259,10 +275,16 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
   BEGIN_DOC
   ! integrals (tu|vp) in the basis of natural MOs
   ! index p runs over the whole basis, t,u,v only over the active orbitals
+  ! 
+  ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
   END_DOC
   implicit none
   integer                        :: i,j,k,l,t,u,p,q
   double precision, allocatable  :: f(:,:,:), d(:,:,:)
+ 
+  double precision :: wall0, wall1
+  call wall_time(wall0)
+  print*,'Providing bielecCI_no'
   
   !$OMP PARALLEL DEFAULT(NONE) &
   !$OMP PRIVATE(j,k,l,p,d,f) &
@@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
   
   deallocate(d,f)
   !$OMP END PARALLEL
+  call wall_time(wall1)
+  print*,'Time to provide bielecCI_no = ',wall1-wall0
 
 
 END_PROVIDER
diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f
index d0a26d36..dc3e2245 100644
--- a/src/casscf_cipsi/casscf.irp.f
+++ b/src/casscf_cipsi/casscf.irp.f
@@ -11,7 +11,7 @@ program casscf
   if(small_active_space)then
    pt2_relative_error = 0.00001
   else
-   thresh_scf = 1.d-4
+   thresh_scf = max(1.d-4,thresh_scf)
    pt2_relative_error = 0.04
   endif
   touch pt2_relative_error 
diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f
index 94a76453..f69832c1 100644
--- a/src/casscf_cipsi/chol_bielec.irp.f
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@@ -6,6 +6,9 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_
  implicit none
  integer :: i_chol,i_act,i_mo,jj_act
  double precision, allocatable :: chol_tmp(:,:)
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_1_idx_transp'
  allocate(chol_tmp(cholesky_mo_num,n_act_orb))
  cholesky_no_1_idx_transp = 0.D0
  do i_mo = 1, mo_num
@@ -16,46 +19,17 @@ BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_
     chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo)
    enddo
   enddo
-!  ! Do the matrix product 
-!  do i_act = 1, n_act_orb
-!   do jj_act = 1, n_act_orb
-!    do i_chol = 1, cholesky_mo_num
-!     cholesky_no_1_idx_transp(i_chol, i_act, i_mo) += chol_tmp(i_chol, jj_act) * natorbsCI(jj_act,i_act) 
-!    enddo
-!   enddo
-!  enddo
   call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0,  &
         chol_tmp, size(chol_tmp,1),                              &
         natorbsCI, size(natorbsCI,1),                                              &
         0.d0,                                                      &
         cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1))
  enddo
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0
 
 END_PROVIDER 
 
-BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp_old, (cholesky_mo_num, n_act_orb, n_act_orb)]
- BEGIN_DOC
- ! Cholesky vectors with TWO orbital on the active natural orbital basis 
- END_DOC
- implicit none
- integer :: i_chol,i_act,j_act,jj_act,jjj_act
- double precision, allocatable :: chol_tmp(:,:)
- allocate(chol_tmp(cholesky_mo_num,n_act_orb))
- cholesky_no_2_idx_transp_old = 0.D0
- do jj_act = 1, n_act_orb
-  jjj_act = list_act(jj_act)
-  do j_act = 1, n_act_orb
-   do i_act = 1, n_act_orb
-    do i_chol = 1, cholesky_mo_num
-     cholesky_no_2_idx_transp_old(i_chol, i_act, j_act) += cholesky_no_1_idx_transp(i_chol, i_act,jjj_act) * natorbsCI(jj_act,j_act) 
-    enddo
-   enddo
-  enddo
- enddo
-
-END_PROVIDER 
-
-
 BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
  BEGIN_DOC
  ! Cholesky vectors with TWO orbital on the active natural orbital basis 
@@ -64,6 +38,9 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_
  integer :: i_chol,i_act,j_act,jj_act
  double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:)
  allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb))
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_2_idx_transp'
  cholesky_no_2_idx_transp = 0.D0
  do i_act = 1, n_act_orb
   ! Get all the integrals corresponding to the "j_act"
@@ -79,6 +56,8 @@ BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_
         0.d0,                                                      &
         cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
  enddo
+ call wall_time(wall1)
+ print*,'Time to provide  cholesky_no_2_idx_transp = ', wall1 - wall0
 
 END_PROVIDER 
 
@@ -89,6 +68,9 @@ BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, m
  END_DOC
  integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
  integer :: i_virt, ii_virt, j_virt, jj_virt
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_total_transp '
  ! Block when two orbitals belong to the core/inact 
  do j_core_inact = 1, n_core_inact_orb
   jj_core_inact = list_core_inact(j_core_inact)
@@ -180,63 +162,87 @@ BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, m
    enddo
   enddo
  enddo
+
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0
 END_PROVIDER 
 
 
-double precision function bielec_no_basis_chol(i_1,j_1,i_2,j_2)
+double precision function bielec_no_basis(i_1,j_1,i_2,j_2)
  implicit none
  integer, intent(in) :: i_1,j_1,i_2,j_2
   BEGIN_DOC
   ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs
   ! 
   END_DOC
-  integer :: i_chol 
- bielec_no_basis_chol = 0.d0
- do i_chol = 1, cholesky_mo_num
-  bielec_no_basis_chol += cholesky_no_total_transp(i_chol,i_1, j_1) * cholesky_no_total_transp(i_chol,i_2,j_2)
+  integer :: i 
+ bielec_no_basis = 0.d0
+ do i = 1, cholesky_mo_num
+  bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2)
  enddo
 end
 
-double precision function bielec_PQxx_no_chol(i_mo, j_mo, i_ca, j_ca)
+double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
  implicit none
  BEGIN_DOC
- ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition 
+ ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition  on the NO basis for active orbitals 
  ! 
- ! indices are unshifted orbital numbers
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
  END_DOC
  integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
  integer :: ii_ca, jj_ca
- double precision :: bielec_no_basis_chol
+ double precision :: bielec_no_basis
  ii_ca = list_core_inact_act(i_ca)
  jj_ca = list_core_inact_act(j_ca)
- bielec_PQxx_no_chol = bielec_no_basis_chol(i_mo,j_mo,ii_ca,jj_ca)
-
+ bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
 end
 
-double precision function bielec_PxxQ_no_chol(i_mo, j_ca, i_ca, j_mo)
+double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
  implicit none 
   BEGIN_DOC
-  ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition 
+  ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition  on the NO basis for active orbitals 
   ! 
-  ! indices are unshifted orbital numbers
+  ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
   END_DOC
  integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
  integer :: ii_ca, jj_ca
- double precision :: bielec_no_basis_chol
+ double precision :: bielec_no_basis
  ii_ca = list_core_inact_act(i_ca)
  jj_ca = list_core_inact_act(j_ca)
- bielec_PxxQ_no_chol = bielec_no_basis_chol(i_mo, jj_ca, ii_ca, j_mo)
+ bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
 
 end
 
-double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
+
+double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+  BEGIN_DOC
+  ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition 
+  ! 
+  ! indices are unshifted orbital numbers
+  ! 
+  ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+  END_DOC
  implicit none 
- integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
- integer :: ii_ca, jj_ca, kk_ca
- double precision :: bielec_no_basis_chol
- ii_ca = list_act(i_ca)
- jj_ca = list_act(j_ca)
- kk_ca = list_act(k_ca)
- bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
-
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca)
 end
+
+double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
+  BEGIN_DOC
+  ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition 
+  ! 
+  ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+  END_DOC
+ implicit none
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo)
+end
+
diff --git a/src/casscf_cipsi/chol_garb.irp.f b/src/casscf_cipsi/chol_garb.irp.f
new file mode 100644
index 00000000..c4a8fa59
--- /dev/null
+++ b/src/casscf_cipsi/chol_garb.irp.f
@@ -0,0 +1,34 @@
+
+!!!!! FUNCTIONS THAT WORK BUT WHICH ARE USELESS AS THE ARRAYS CAN ALWAYS BE STORED
+!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo)
+!  BEGIN_DOC
+!  ! function that computes (i_a j_a |k_a j_mo) with Cholesky decomposition 
+!  ! 
+!  ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE 
+!  END_DOC
+! implicit none
+! integer, intent(in) :: i_a, j_a, k_a, i_mo
+! integer :: ii_a, jj_a, kk_a
+! double precision :: mo_two_e_integral
+! ii_a = list_act(i_a)
+! jj_a = list_act(j_a)
+! kk_a = list_act(k_a)
+! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo)
+!end
+
+!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
+!  BEGIN_DOC
+!  ! function that computes (i_ca j_ca |k_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals 
+!  ! 
+!  ! where i_ca, j_ca, k_ca are in [1:n_core_inact_act_orb]
+!  END_DOC
+! implicit none 
+! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
+! integer :: ii_ca, jj_ca, kk_ca
+! double precision :: bielec_no_basis_chol
+! ii_ca = list_act(i_ca)
+! jj_ca = list_act(j_ca)
+! kk_ca = list_act(k_ca)
+! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
+!
+!end
diff --git a/src/casscf_cipsi/gradient.irp.f b/src/casscf_cipsi/gradient.irp.f
index a1c5e947..961d260d 100644
--- a/src/casscf_cipsi/gradient.irp.f
+++ b/src/casscf_cipsi/gradient.irp.f
@@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t)
   
   integer                        :: ii,tt,v,vv,x,y
   integer                        :: x3,y3
+  double precision :: bielec_PQxx_no
   
   ii=list_core_inact(i)
   tt=list_act(t)
diff --git a/src/casscf_cipsi/hessian.irp.f b/src/casscf_cipsi/hessian.irp.f
index 458c6aa6..9a7a9031 100644
--- a/src/casscf_cipsi/hessian.irp.f
+++ b/src/casscf_cipsi/hessian.irp.f
@@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u)
   implicit none
   integer                        :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj
   real*8                         :: term,t2
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
   
   ii=list_core_inact(i)
   tt=list_act(t)
@@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a)
   implicit none
   integer                        :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y
   real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
   
   ! it/ja
   ii=list_core_inact(i)
@@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a)
   implicit none
   integer                        :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3
   real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
   
   ii=list_core_inact(i)
   tt=list_act(t)
@@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b)
   implicit none
   integer                        :: i,a,j,b,ii,aa,jj,bb
   real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
   
   ii=list_core_inact(i)
   aa=list_virt(a)
@@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b)
   implicit none
   integer                        :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3
   real*8                         :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
   
   ii=list_core_inact(i)
   aa=list_virt(a)
@@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b)
   integer                        :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y
   integer                        :: v3,x3
   real*8                         :: term,t1,t2,t3
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
   
   tt=list_act(t)
   aa=list_virt(a)
diff --git a/src/casscf_cipsi/mcscf_fock.irp.f b/src/casscf_cipsi/mcscf_fock.irp.f
index 0f4b7a99..82b710a7 100644
--- a/src/casscf_cipsi/mcscf_fock.irp.f
+++ b/src/casscf_cipsi/mcscf_fock.irp.f
@@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ]
    END_DOC
    implicit none
    integer                        :: p,q,k,kk,t,tt,u,uu
+   double precision :: bielec_pxxq_no, bielec_pqxx_no
    
    do q=1,mo_num
      do p=1,mo_num
@@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ]
    END_DOC
    implicit none
    integer                        :: p,q,k,kk,t,tt,u,uu
+   double precision :: bielec_pxxq_no, bielec_pqxx_no
    
    Fapq = 0.d0
    
diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f
index 87c5c352..bcce7cf7 100644
--- a/src/casscf_cipsi/test_chol.irp.f
+++ b/src/casscf_cipsi/test_chol.irp.f
@@ -3,7 +3,9 @@ program test_chol
  read_wf= .True.
  touch read_wf 
 ! call routine_bielec_PxxQ_no
- call routine_bielecCI_no
+! call routine_bielecCI_no
+! call test_bielec_PxxQ_chol
+! call test_bielecCI
 
 end
 
@@ -12,7 +14,7 @@ subroutine routine_bielec_PQxx_no
  integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
  integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
  double precision :: exact, new, error, accu, bielec_no_basis_chol
- double precision :: bielec_PQxx_no_chol
+ double precision :: bielec_PQxx_no
  
  accu = 0.d0
  do i_core_inact = 1, n_core_inact_act_orb
@@ -21,9 +23,8 @@ subroutine routine_bielec_PQxx_no
    jj_core_inact = list_core_inact_act(j_core_inact)
    do i_mo = 1, mo_num
     do j_mo = 1, mo_num
-     exact = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) 
-!     new   = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) 
-     new   = bielec_PQxx_no_chol(j_mo,i_mo, j_core_inact, i_core_inact) 
+     exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) 
+     new   = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) 
      error = dabs(exact-new)
      if(dabs(exact).gt.1.d-10)then
       print*,exact,new,error
@@ -36,12 +37,12 @@ subroutine routine_bielec_PQxx_no
  print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
 end
 
-subroutine routine_bielec_PxxQ_no
+subroutine routine_bielec_PxxQ_no_array
  implicit none
  integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
  integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
  double precision :: exact, new, error, accu, bielec_no_basis_chol
- double precision :: bielec_PxxQ_no_chol
+ double precision :: bielec_PxxQ_no
  
  accu = 0.d0
  do i_mo = 1, mo_num
@@ -50,9 +51,9 @@ subroutine routine_bielec_PxxQ_no
    do j_core_inact = 1, n_core_inact_act_orb
    jj_core_inact = list_core_inact_act(j_core_inact)
     do j_mo = 1, mo_num
-     exact = bielec_PxxQ_no(j_mo, j_core_inact,  i_core_inact,i_mo) 
+     exact = bielec_PxxQ_no_array(j_mo, j_core_inact,  i_core_inact,i_mo) 
 !     new   = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) 
-     new   = bielec_PxxQ_no_chol(j_mo, j_core_inact,  i_core_inact,i_mo) 
+     new   = bielec_PxxQ_no(j_mo, j_core_inact,  i_core_inact,i_mo) 
      error = dabs(exact-new)
      accu += error
      if(dabs(exact).gt.1.d-10)then
@@ -65,19 +66,43 @@ subroutine routine_bielec_PxxQ_no
  print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
 end
 
-subroutine routine_bielecCI_no
+subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
  implicit none
- integer :: i_ca, j_ca, k_ca, i_mo
- double precision :: exact, new, error, accu, bielec_no_basis_chol
- double precision :: bielecCI_no_chol
+ integer :: i_mo, j_mo, i_ca, j_ca 
+ double precision :: exact, new, error, accu
+ double precision :: bielec_PQxx
  
  accu = 0.d0
- do i_mo = 1, mo_num
-  do i_ca = 1, n_act_orb
-   do j_ca = 1, n_act_orb
-    do k_ca = 1, n_act_orb 
-     exact =bielecCI_no(k_ca, j_ca, i_ca, i_mo)
-     new   = bielecCI_no_chol(k_ca, j_ca, i_ca, i_mo)
+ do j_ca = 1, n_core_inact_act_orb
+  do i_ca = 1, n_core_inact_act_orb
+   do j_mo = 1, mo_num
+    do i_mo = 1, mo_num
+     exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca)
+     new   = bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+     error = dabs(exact-new)
+     accu += error
+     if(dabs(exact).gt.1.d-10)then
+      print*,exact,new,error
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+ print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
+end
+
+subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo)
+ implicit none
+ integer :: i_mo, i_ca, j_ca, j_mo
+ double precision :: exact, new, error, accu
+ double precision :: bielec_PxxQ
+ accu = 0.d0
+ do j_mo = 1, mo_num
+  do j_ca = 1, n_core_inact_act_orb
+   do i_ca =1, n_core_inact_act_orb
+    do i_mo = 1, mo_num
+     exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo)
+     new   = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
      error = dabs(exact-new)
      accu += error
      if(dabs(exact).gt.1.d-10)then
diff --git a/src/casscf_cipsi/tot_en.irp.f b/src/casscf_cipsi/tot_en.irp.f
index 1d70e087..37ceac05 100644
--- a/src/casscf_cipsi/tot_en.irp.f
+++ b/src/casscf_cipsi/tot_en.irp.f
@@ -8,6 +8,7 @@
    implicit none
    integer                        :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3
    real*8                         :: e_one_all,e_two_all
+   double precision :: bielec_PQxx,bielec_PxxQ
    e_one_all=0.D0
    e_two_all=0.D0
    do i=1,n_core_inact_orb

From 4a9a11c630c94528fb429255d5d9767d9b2edefc Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 12 Jul 2024 17:32:41 +0200
Subject: [PATCH 124/131] GPU acceleration compute_tc_int

---
 plugins/local/tc_int/NEED                 |   1 +
 plugins/local/tc_int/compute_tc_int.irp.f | 117 +++++++++++++---------
 2 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED
index 8a4caf5b..34d2e183 100644
--- a/plugins/local/tc_int/NEED
+++ b/plugins/local/tc_int/NEED
@@ -1,3 +1,4 @@
+gpu
 tc_keywords
 jastrow
 qmckl
diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f
index 02f21570..92c90d03 100644
--- a/plugins/local/tc_int/compute_tc_int.irp.f
+++ b/plugins/local/tc_int/compute_tc_int.irp.f
@@ -2,23 +2,23 @@
 ! ---
 
 subroutine provide_int2_grad1_u12_ao()
-
+  use gpu
   BEGIN_DOC
   !
-  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2         [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2         [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) 
-  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) 
+  ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2         [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2         [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2         [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
+  ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
   !
   !
-  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) 
-  !                       = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator 
+  ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
+  !                       = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator
   !                       = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
   ! where:
   !
   ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
-  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2      \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
-  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) 
+  !                              = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2      \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
+  !                              =  1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
   !
   ! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
   !
@@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao()
   double precision              :: weight1, ao_k_r, ao_i_r
   double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
   double precision              :: time0, time1, time2, tc1, tc2, tc
-  double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
-  double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
+  type(gpu_double4)             :: int2_grad1_u12_ao
+  type(gpu_double3)             :: tmp_grad1_u12, tmp_grad1_u12p, tmp
+  double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:)
 
   double precision, external    :: get_ao_two_e_integral
 
@@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao()
 
   call total_memory(mem)
   mem      = max(1.d0, qp_max_mem - mem)
+  mem = 6
   n_double = mem * 1.d8
   n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
   n_rest   = int(mod(n_points_final_grid, n_blocks))
@@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao()
   ! ---
   ! ---
 
-  allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
+  call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4)
 
-  allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+  call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num)
   !$OMP PARALLEL               &
   !$OMP DEFAULT (NONE)         &
   !$OMP PRIVATE (j, i, jpoint) &
@@ -75,49 +77,55 @@ subroutine provide_int2_grad1_u12_ao()
   do j = 1, ao_num
     do i = 1, ao_num
       do jpoint = 1, n_points_extra_final_grid
-        tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+        tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
       enddo
     enddo
   enddo
   !$OMP END DO
   !$OMP END PARALLEL
 
-  allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
+  call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4)
+  call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4)
 
   tc = 0.d0
-  
+
+  type(gpu_stream) :: stream(4)
+  do i=1,4
+    call gpu_stream_create(stream(i))
+  enddo
+
   do i_pass = 1, n_pass
     ii = (i_pass-1)*n_blocks + 1
-  
+
     call wall_time(tc1)
+
     !$OMP PARALLEL                   &
     !$OMP DEFAULT (NONE)             &
     !$OMP PRIVATE (i_blocks, ipoint) &
     !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
-    !$OMP DO 
+    !$OMP DO
     do i_blocks = 1, n_blocks
       ipoint = ii - 1 + i_blocks ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), &
+        tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4))
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
     call wall_time(tc2)
-    tc = tc + tc2 - tc1 
+    tc = tc + tc2 - tc1
 
+    call gpu_synchronize()
+    call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p)
     do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+      call gpu_set_stream(blas_handle, stream(m))
+      call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
     enddo
   enddo
-  
-  deallocate(tmp_grad1_u12)
 
-  
   if(n_rest .gt. 0) then
-  
-    allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
-  
+
     ii = n_pass*n_blocks + 1
 
     call wall_time(tc1)
@@ -125,26 +133,35 @@ subroutine provide_int2_grad1_u12_ao()
     !$OMP DEFAULT (NONE)           &
     !$OMP PRIVATE (i_rest, ipoint) &
     !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
-    !$OMP DO 
+    !$OMP DO
     do i_rest = 1, n_rest
       ipoint = ii - 1 + i_rest ! r1
-      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
+      call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), &
+        tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4))
     enddo
     !$OMP END DO
     !$OMP END PARALLEL
     call wall_time(tc2)
-    tc = tc + tc2 - tc1 
-  
+    tc = tc + tc2 - tc1
+
     do m = 1, 4
-      call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
-                , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
-                , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) 
+      call gpu_set_stream(blas_handle, stream(m))
+      call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid &
+                , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
     enddo
 
-    deallocate(tmp_grad1_u12)
   endif
+  call gpu_synchronize()
+  call gpu_deallocate(tmp_grad1_u12)
+  call gpu_deallocate(tmp_grad1_u12p)
 
-  deallocate(tmp)
+  do i=1,4
+    call gpu_stream_destroy(stream(i))
+  enddo
+
+
+  call gpu_deallocate(tmp)
 
 
   call wall_time(time1)
@@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao()
   print*, ' wall time Jastrow derivatives   (min) = ', tc / 60.d0
   call print_memory_usage()
 
+!TODO
+stop
   ! ---
   ! ---
   ! ---
@@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao()
   !$OMP END DO
   !$OMP END PARALLEL
   call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0            &
-            , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+            , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
             , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
   deallocate(c_mat)
 
@@ -188,23 +207,23 @@ subroutine provide_int2_grad1_u12_ao()
   ! ---
 
   call wall_time(time1)
-  
+
   allocate(c_mat(n_points_final_grid,ao_num,ao_num))
   do m = 1, 3
     !$OMP PARALLEL                                                              &
     !$OMP DEFAULT (NONE)                                                        &
-    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
-    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & 
+    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       &
+    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
     !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
     !$OMP DO SCHEDULE (static)
     do i = 1, ao_num
       do k = 1, ao_num
         do ipoint = 1, n_points_final_grid
-  
+
           weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
           ao_i_r  = aos_in_r_array_transp(ipoint,i)
           ao_k_r  = aos_in_r_array_transp(ipoint,k)
-  
+
           c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
         enddo
       enddo
@@ -213,7 +232,7 @@ subroutine provide_int2_grad1_u12_ao()
     !$OMP END PARALLEL
 
     call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
-              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
               , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
   enddo
   deallocate(c_mat)
@@ -234,7 +253,7 @@ subroutine provide_int2_grad1_u12_ao()
 
   ! ---
 
-  call wall_time(time1)  
+  call wall_time(time1)
 
   PROVIDE ao_integrals_map
   !$OMP PARALLEL DEFAULT(NONE)                         &
@@ -245,7 +264,7 @@ subroutine provide_int2_grad1_u12_ao()
     do l = 1, ao_num
       do i = 1, ao_num
         do k = 1, ao_num
-          !                                               < 1:i, 2:j | 1:k, 2:l > 
+          !                                               < 1:i, 2:j | 1:k, 2:l >
           tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
         enddo
       enddo
@@ -263,7 +282,7 @@ subroutine provide_int2_grad1_u12_ao()
   print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
   open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
   call ezfio_set_work_empty(.False.)
-    write(11) int2_grad1_u12_ao(:,:,:,1:3)
+    write(11) int2_grad1_u12_ao%f(:,:,:,1:3)
   close(11)
 
   print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
@@ -276,7 +295,7 @@ subroutine provide_int2_grad1_u12_ao()
 
   ! ----
 
-  deallocate(int2_grad1_u12_ao)
+  call gpu_deallocate(int2_grad1_u12_ao)
   deallocate(tc_int_2e_ao)
 
   call wall_time(time2)

From 228796cff525bc6420b2b961c803fda2aa0094bc Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 12 Jul 2024 17:41:15 +0200
Subject: [PATCH 125/131] split the mo_optimization into mo_optimizatio_utils
 and mo_optimization

---
 src/casscf_cipsi/NEED                         |  1 +
 src/mo_optimization/NEED                      |  6 +-
 src/mo_optimization/cipsi_orb_opt.irp.f       | 82 +------------------
 .../EZFIO.cfg                                 |  0
 src/mo_optimization_utils/NEED                |  5 ++
 src/mo_optimization_utils/README.md           | 74 +++++++++++++++++
 .../class.irp.f                               |  0
 .../constants.h                               |  0
 .../diagonal_hessian_list_opt.irp.f           |  0
 .../diagonal_hessian_opt.irp.f                |  0
 .../diagonalization_hessian.irp.f             |  0
 .../first_diagonal_hessian_list_opt.irp.f     |  0
 .../first_diagonal_hessian_opt.irp.f          |  0
 .../first_gradient_list_opt.irp.f             |  0
 .../first_gradient_opt.irp.f                  |  0
 .../first_hessian_list_opt.irp.f              |  0
 .../first_hessian_opt.irp.f                   |  0
 .../gradient_list_opt.irp.f                   |  0
 .../gradient_opt.irp.f                        |  0
 .../hessian_list_opt.irp.f                    |  0
 .../hessian_opt.irp.f                         |  0
 .../org/TODO.org                              |  0
 .../org/debug_gradient_list_opt.org           |  0
 .../org/debug_gradient_opt.org                |  0
 .../org/debug_hessian_list_opt.org            |  0
 .../org/debug_hessian_opt.org                 |  0
 .../org/diagonal_hessian_list_opt.org         |  0
 .../org/diagonal_hessian_opt.org              |  0
 .../org/diagonalization_hessian.org           |  0
 .../org/first_diagonal_hessian_list_opt.org   |  0
 .../org/first_diagonal_hessian_opt.org        |  0
 .../org/first_gradient_list_opt.org           |  0
 .../org/first_gradient_opt.org                |  0
 .../org/first_hessian_list_opt.org            |  0
 .../org/first_hessian_opt.org                 |  0
 .../org/gradient_list_opt.org                 |  0
 .../org/gradient_opt.org                      |  0
 .../org/hessian_list_opt.org                  |  0
 .../org/hessian_opt.org                       |  0
 .../org/my_providers.org                      |  0
 .../org/optimization.org                      |  0
 .../org/orb_opt_trust_v2.org                  |  0
 .../org/state_average_energy.org              |  0
 .../org/state_weight_normalization.org        |  0
 .../org/update_parameters.org                 |  0
 .../org/update_st_av_ci_energy.org            |  0
 .../routine_opt_mos.irp.f                     | 81 ++++++++++++++++++
 .../run_orb_opt_trust_v2.irp.f                |  0
 .../save_energy.irp.f                         |  0
 .../state_average_energy.irp.f                |  0
 .../state_weight_normalization.irp.f          |  0
 .../update_parameters.irp.f                   |  0
 .../update_st_av_ci_energy.irp.f              |  0
 53 files changed, 163 insertions(+), 86 deletions(-)
 rename src/{mo_optimization => mo_optimization_utils}/EZFIO.cfg (100%)
 create mode 100644 src/mo_optimization_utils/NEED
 create mode 100644 src/mo_optimization_utils/README.md
 rename src/{mo_optimization => mo_optimization_utils}/class.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/constants.h (100%)
 rename src/{mo_optimization => mo_optimization_utils}/diagonal_hessian_list_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/diagonal_hessian_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/diagonalization_hessian.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/first_diagonal_hessian_list_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/first_diagonal_hessian_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/first_gradient_list_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/first_gradient_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/first_hessian_list_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/first_hessian_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/gradient_list_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/gradient_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/hessian_list_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/hessian_opt.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/TODO.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/debug_gradient_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/debug_gradient_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/debug_hessian_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/debug_hessian_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/diagonal_hessian_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/diagonal_hessian_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/diagonalization_hessian.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/first_diagonal_hessian_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/first_diagonal_hessian_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/first_gradient_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/first_gradient_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/first_hessian_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/first_hessian_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/gradient_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/gradient_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/hessian_list_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/hessian_opt.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/my_providers.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/optimization.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/orb_opt_trust_v2.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/state_average_energy.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/state_weight_normalization.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/update_parameters.org (100%)
 rename src/{mo_optimization => mo_optimization_utils}/org/update_st_av_ci_energy.org (100%)
 create mode 100644 src/mo_optimization_utils/routine_opt_mos.irp.f
 rename src/{mo_optimization => mo_optimization_utils}/run_orb_opt_trust_v2.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/save_energy.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/state_average_energy.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/state_weight_normalization.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/update_parameters.irp.f (100%)
 rename src/{mo_optimization => mo_optimization_utils}/update_st_av_ci_energy.irp.f (100%)

diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED
index dd91c7bd..11d1a78c 100644
--- a/src/casscf_cipsi/NEED
+++ b/src/casscf_cipsi/NEED
@@ -3,3 +3,4 @@ selectors_full
 generators_cas
 two_body_rdm
 dav_general_mat
+mo_optimization
diff --git a/src/mo_optimization/NEED b/src/mo_optimization/NEED
index 91f41ee3..33f770c3 100644
--- a/src/mo_optimization/NEED
+++ b/src/mo_optimization/NEED
@@ -1,7 +1,3 @@
-two_body_rdm
-hartree_fock
-cipsi
-davidson_undressed
+mo_optimization_utils
 selectors_full
 generators_full
-utils_trust_region
diff --git a/src/mo_optimization/cipsi_orb_opt.irp.f b/src/mo_optimization/cipsi_orb_opt.irp.f
index 7e3a79eb..19b3e9db 100644
--- a/src/mo_optimization/cipsi_orb_opt.irp.f
+++ b/src/mo_optimization/cipsi_orb_opt.irp.f
@@ -2,87 +2,7 @@ program optimization
   
   read_wf = .true. ! must be True for the orbital optimization !!!
   TOUCH read_wf 
-  call run_optimization
+  call run_optimization_mos_CIPSI
 
 end
 
-subroutine run_optimization
-
-  implicit none
-
-  double precision :: e_cipsi, e_opt, delta_e
-  double precision, allocatable :: Ev(:),PT2(:)
-  integer :: nb_iter,i
-  logical :: not_converged
-  character (len=100) :: filename
-
-  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
-  allocate(Ev(N_states),PT2(N_states))
-
-  not_converged = .True.
-  nb_iter = 0
-
-  ! To start from the wf
-  N_det_max = max(n_det,5)
-  TOUCH N_det_max
-
-  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
-  write(10,*) "   Ndet        E_cipsi         E_opt          Delta_e"
-  call state_average_energy(e_cipsi)
-  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
-  close(10)
-
-  do while (not_converged)
-      print*,''
-      print*,'======================'
-      print*,' Cipsi step:', nb_iter
-      print*,'======================'
-      print*,'' 
-      print*,'********** cipsi step **********'
-      ! cispi calculation
-      call run_stochastic_cipsi(Ev,PT2)
-
-      ! State average energy after the cipsi step
-      call state_average_energy(e_cipsi)
-
-      print*,''
-      print*,'********** optimization step **********'
-      ! orbital optimization
-      call run_orb_opt_trust_v2
-
-      ! State average energy after the orbital optimization
-      call state_average_energy(e_opt)
-
-      print*,''
-      print*,'********** diff step **********'
-      ! Gain in energy
-      delta_e = e_opt - e_cipsi
-      print*, 'Gain in energy during the orbital optimization:', delta_e
-
-      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
-      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
-      close(10)
-
-      ! Exit
-      if (delta_e > 1d-12) then
-          print*, 'WARNING, something wrong happened'
-          print*, 'The gain (delta_e) in energy during the optimization process'
-          print*, 'is > 0, but it must be < 0'
-          print*, 'The program will exit'
-          exit
-      endif
-
-      if (n_det > n_det_max_opt) then
-          print*, 'The number of determinants in the wf > n_det_max_opt'
-          print*, 'The program will exit'
-          exit
-      endif
-      
-      ! To double the number of determinants in the wf
-      N_det_max = int(dble(n_det * 2)*0.9)
-      TOUCH N_det_max
-
-      nb_iter = nb_iter + 1
-  enddo
-
-end
diff --git a/src/mo_optimization/EZFIO.cfg b/src/mo_optimization_utils/EZFIO.cfg
similarity index 100%
rename from src/mo_optimization/EZFIO.cfg
rename to src/mo_optimization_utils/EZFIO.cfg
diff --git a/src/mo_optimization_utils/NEED b/src/mo_optimization_utils/NEED
new file mode 100644
index 00000000..1a78a17f
--- /dev/null
+++ b/src/mo_optimization_utils/NEED
@@ -0,0 +1,5 @@
+two_body_rdm
+hartree_fock
+cipsi
+davidson_undressed
+utils_trust_region
diff --git a/src/mo_optimization_utils/README.md b/src/mo_optimization_utils/README.md
new file mode 100644
index 00000000..94f29aee
--- /dev/null
+++ b/src/mo_optimization_utils/README.md
@@ -0,0 +1,74 @@
+# Orbital optimization
+
+## Methods  
+Different methods are available:  
+- full hessian  
+``` 
+qp set orbital_optimization optimization_method full  
+```  
+- diagonal hessian  
+``` 
+qp set orbital_optimization optimization_method diag  
+``` 
+- identity matrix  
+``` 
+qp set orbital_optimization optimization_method none  
+``` 
+
+After the optimization the ezfio contains the optimized orbitals
+ 
+## For a fixed number of determinants
+To optimize the MOs for the actual determinants:  
+``` 
+qp run orb_opt
+``` 
+ 
+## For a complete optimization, i.e, with a larger and larger wave function
+To optimize the MOs with a larger and larger wave function:  
+``` 
+qp run optimization  
+``` 
+
+The results are stored in the EZFIO in "mo_optimization/result_opt",
+with the following format:  
+(1) (2) (3) (4)  
+1: Number of determinants in the wf,  
+2: Cispi energy before the optimization,   
+3: Cipsi energy after the optimization,  
+4: Energy difference between (2) and (3).  
+ 
+The optimization process if the following: 
+- we do a first cipsi step to obtain a small number of determinants in the wf 
+- we run an orbital optimization for this wf 
+- we do a new cipsi step to double the number of determinants in the wf 
+- we run an orbital optimization for this wf 
+- ... 
+- we do that until the energy difference between (2) and (3) is  
+  smaller than the targeted accuracy for the cispi (targeted_accuracy_cipsi in qp edit) 
+  or the wf is larger than a given size (n_det_max_opt in qp_edit) 
+- after that you can reset your determinants (qp reset -d) and run a clean Cispi calculation  
+  
+### End of the optimization
+You can choos the number of determinants after what the 
+optimization will stop:
+```
+qp set orbital_optimization n_det_max_opt 1e5 # or any number
+```
+## Weight of the states
+You can change the weights of the differents states directly in qp edit.  
+It will affect ths weights used in the orbital optimization.
+
+# Tests
+To run the tests:  
+``` 
+qp test
+``` 
+
+# Org files
+The org files are stored in the directory org in order to avoid overwriting on user changes.
+The org files can be modified, to export the change to the source code, run
+```
+./TANGLE_org_mode.sh
+mv *.irp.f ../.
+```
+
diff --git a/src/mo_optimization/class.irp.f b/src/mo_optimization_utils/class.irp.f
similarity index 100%
rename from src/mo_optimization/class.irp.f
rename to src/mo_optimization_utils/class.irp.f
diff --git a/src/mo_optimization/constants.h b/src/mo_optimization_utils/constants.h
similarity index 100%
rename from src/mo_optimization/constants.h
rename to src/mo_optimization_utils/constants.h
diff --git a/src/mo_optimization/diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f
similarity index 100%
rename from src/mo_optimization/diagonal_hessian_list_opt.irp.f
rename to src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f
diff --git a/src/mo_optimization/diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f
similarity index 100%
rename from src/mo_optimization/diagonal_hessian_opt.irp.f
rename to src/mo_optimization_utils/diagonal_hessian_opt.irp.f
diff --git a/src/mo_optimization/diagonalization_hessian.irp.f b/src/mo_optimization_utils/diagonalization_hessian.irp.f
similarity index 100%
rename from src/mo_optimization/diagonalization_hessian.irp.f
rename to src/mo_optimization_utils/diagonalization_hessian.irp.f
diff --git a/src/mo_optimization/first_diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f
similarity index 100%
rename from src/mo_optimization/first_diagonal_hessian_list_opt.irp.f
rename to src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f
diff --git a/src/mo_optimization/first_diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f
similarity index 100%
rename from src/mo_optimization/first_diagonal_hessian_opt.irp.f
rename to src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f
diff --git a/src/mo_optimization/first_gradient_list_opt.irp.f b/src/mo_optimization_utils/first_gradient_list_opt.irp.f
similarity index 100%
rename from src/mo_optimization/first_gradient_list_opt.irp.f
rename to src/mo_optimization_utils/first_gradient_list_opt.irp.f
diff --git a/src/mo_optimization/first_gradient_opt.irp.f b/src/mo_optimization_utils/first_gradient_opt.irp.f
similarity index 100%
rename from src/mo_optimization/first_gradient_opt.irp.f
rename to src/mo_optimization_utils/first_gradient_opt.irp.f
diff --git a/src/mo_optimization/first_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_hessian_list_opt.irp.f
similarity index 100%
rename from src/mo_optimization/first_hessian_list_opt.irp.f
rename to src/mo_optimization_utils/first_hessian_list_opt.irp.f
diff --git a/src/mo_optimization/first_hessian_opt.irp.f b/src/mo_optimization_utils/first_hessian_opt.irp.f
similarity index 100%
rename from src/mo_optimization/first_hessian_opt.irp.f
rename to src/mo_optimization_utils/first_hessian_opt.irp.f
diff --git a/src/mo_optimization/gradient_list_opt.irp.f b/src/mo_optimization_utils/gradient_list_opt.irp.f
similarity index 100%
rename from src/mo_optimization/gradient_list_opt.irp.f
rename to src/mo_optimization_utils/gradient_list_opt.irp.f
diff --git a/src/mo_optimization/gradient_opt.irp.f b/src/mo_optimization_utils/gradient_opt.irp.f
similarity index 100%
rename from src/mo_optimization/gradient_opt.irp.f
rename to src/mo_optimization_utils/gradient_opt.irp.f
diff --git a/src/mo_optimization/hessian_list_opt.irp.f b/src/mo_optimization_utils/hessian_list_opt.irp.f
similarity index 100%
rename from src/mo_optimization/hessian_list_opt.irp.f
rename to src/mo_optimization_utils/hessian_list_opt.irp.f
diff --git a/src/mo_optimization/hessian_opt.irp.f b/src/mo_optimization_utils/hessian_opt.irp.f
similarity index 100%
rename from src/mo_optimization/hessian_opt.irp.f
rename to src/mo_optimization_utils/hessian_opt.irp.f
diff --git a/src/mo_optimization/org/TODO.org b/src/mo_optimization_utils/org/TODO.org
similarity index 100%
rename from src/mo_optimization/org/TODO.org
rename to src/mo_optimization_utils/org/TODO.org
diff --git a/src/mo_optimization/org/debug_gradient_list_opt.org b/src/mo_optimization_utils/org/debug_gradient_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/debug_gradient_list_opt.org
rename to src/mo_optimization_utils/org/debug_gradient_list_opt.org
diff --git a/src/mo_optimization/org/debug_gradient_opt.org b/src/mo_optimization_utils/org/debug_gradient_opt.org
similarity index 100%
rename from src/mo_optimization/org/debug_gradient_opt.org
rename to src/mo_optimization_utils/org/debug_gradient_opt.org
diff --git a/src/mo_optimization/org/debug_hessian_list_opt.org b/src/mo_optimization_utils/org/debug_hessian_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/debug_hessian_list_opt.org
rename to src/mo_optimization_utils/org/debug_hessian_list_opt.org
diff --git a/src/mo_optimization/org/debug_hessian_opt.org b/src/mo_optimization_utils/org/debug_hessian_opt.org
similarity index 100%
rename from src/mo_optimization/org/debug_hessian_opt.org
rename to src/mo_optimization_utils/org/debug_hessian_opt.org
diff --git a/src/mo_optimization/org/diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/diagonal_hessian_list_opt.org
rename to src/mo_optimization_utils/org/diagonal_hessian_list_opt.org
diff --git a/src/mo_optimization/org/diagonal_hessian_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_opt.org
similarity index 100%
rename from src/mo_optimization/org/diagonal_hessian_opt.org
rename to src/mo_optimization_utils/org/diagonal_hessian_opt.org
diff --git a/src/mo_optimization/org/diagonalization_hessian.org b/src/mo_optimization_utils/org/diagonalization_hessian.org
similarity index 100%
rename from src/mo_optimization/org/diagonalization_hessian.org
rename to src/mo_optimization_utils/org/diagonalization_hessian.org
diff --git a/src/mo_optimization/org/first_diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/first_diagonal_hessian_list_opt.org
rename to src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org
diff --git a/src/mo_optimization/org/first_diagonal_hessian_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org
similarity index 100%
rename from src/mo_optimization/org/first_diagonal_hessian_opt.org
rename to src/mo_optimization_utils/org/first_diagonal_hessian_opt.org
diff --git a/src/mo_optimization/org/first_gradient_list_opt.org b/src/mo_optimization_utils/org/first_gradient_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/first_gradient_list_opt.org
rename to src/mo_optimization_utils/org/first_gradient_list_opt.org
diff --git a/src/mo_optimization/org/first_gradient_opt.org b/src/mo_optimization_utils/org/first_gradient_opt.org
similarity index 100%
rename from src/mo_optimization/org/first_gradient_opt.org
rename to src/mo_optimization_utils/org/first_gradient_opt.org
diff --git a/src/mo_optimization/org/first_hessian_list_opt.org b/src/mo_optimization_utils/org/first_hessian_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/first_hessian_list_opt.org
rename to src/mo_optimization_utils/org/first_hessian_list_opt.org
diff --git a/src/mo_optimization/org/first_hessian_opt.org b/src/mo_optimization_utils/org/first_hessian_opt.org
similarity index 100%
rename from src/mo_optimization/org/first_hessian_opt.org
rename to src/mo_optimization_utils/org/first_hessian_opt.org
diff --git a/src/mo_optimization/org/gradient_list_opt.org b/src/mo_optimization_utils/org/gradient_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/gradient_list_opt.org
rename to src/mo_optimization_utils/org/gradient_list_opt.org
diff --git a/src/mo_optimization/org/gradient_opt.org b/src/mo_optimization_utils/org/gradient_opt.org
similarity index 100%
rename from src/mo_optimization/org/gradient_opt.org
rename to src/mo_optimization_utils/org/gradient_opt.org
diff --git a/src/mo_optimization/org/hessian_list_opt.org b/src/mo_optimization_utils/org/hessian_list_opt.org
similarity index 100%
rename from src/mo_optimization/org/hessian_list_opt.org
rename to src/mo_optimization_utils/org/hessian_list_opt.org
diff --git a/src/mo_optimization/org/hessian_opt.org b/src/mo_optimization_utils/org/hessian_opt.org
similarity index 100%
rename from src/mo_optimization/org/hessian_opt.org
rename to src/mo_optimization_utils/org/hessian_opt.org
diff --git a/src/mo_optimization/org/my_providers.org b/src/mo_optimization_utils/org/my_providers.org
similarity index 100%
rename from src/mo_optimization/org/my_providers.org
rename to src/mo_optimization_utils/org/my_providers.org
diff --git a/src/mo_optimization/org/optimization.org b/src/mo_optimization_utils/org/optimization.org
similarity index 100%
rename from src/mo_optimization/org/optimization.org
rename to src/mo_optimization_utils/org/optimization.org
diff --git a/src/mo_optimization/org/orb_opt_trust_v2.org b/src/mo_optimization_utils/org/orb_opt_trust_v2.org
similarity index 100%
rename from src/mo_optimization/org/orb_opt_trust_v2.org
rename to src/mo_optimization_utils/org/orb_opt_trust_v2.org
diff --git a/src/mo_optimization/org/state_average_energy.org b/src/mo_optimization_utils/org/state_average_energy.org
similarity index 100%
rename from src/mo_optimization/org/state_average_energy.org
rename to src/mo_optimization_utils/org/state_average_energy.org
diff --git a/src/mo_optimization/org/state_weight_normalization.org b/src/mo_optimization_utils/org/state_weight_normalization.org
similarity index 100%
rename from src/mo_optimization/org/state_weight_normalization.org
rename to src/mo_optimization_utils/org/state_weight_normalization.org
diff --git a/src/mo_optimization/org/update_parameters.org b/src/mo_optimization_utils/org/update_parameters.org
similarity index 100%
rename from src/mo_optimization/org/update_parameters.org
rename to src/mo_optimization_utils/org/update_parameters.org
diff --git a/src/mo_optimization/org/update_st_av_ci_energy.org b/src/mo_optimization_utils/org/update_st_av_ci_energy.org
similarity index 100%
rename from src/mo_optimization/org/update_st_av_ci_energy.org
rename to src/mo_optimization_utils/org/update_st_av_ci_energy.org
diff --git a/src/mo_optimization_utils/routine_opt_mos.irp.f b/src/mo_optimization_utils/routine_opt_mos.irp.f
new file mode 100644
index 00000000..fceba2c5
--- /dev/null
+++ b/src/mo_optimization_utils/routine_opt_mos.irp.f
@@ -0,0 +1,81 @@
+
+subroutine run_optimization_mos_CIPSI
+
+  implicit none
+
+  double precision :: e_cipsi, e_opt, delta_e
+  double precision, allocatable :: Ev(:),PT2(:)
+  integer :: nb_iter,i
+  logical :: not_converged
+  character (len=100) :: filename
+
+  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
+  allocate(Ev(N_states),PT2(N_states))
+
+  not_converged = .True.
+  nb_iter = 0
+
+  ! To start from the wf
+  N_det_max = max(n_det,5)
+  TOUCH N_det_max
+
+  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
+  write(10,*) "   Ndet        E_cipsi         E_opt          Delta_e"
+  call state_average_energy(e_cipsi)
+  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
+  close(10)
+
+  do while (not_converged)
+      print*,''
+      print*,'======================'
+      print*,' Cipsi step:', nb_iter
+      print*,'======================'
+      print*,'' 
+      print*,'********** cipsi step **********'
+      ! cispi calculation
+      call run_stochastic_cipsi(Ev,PT2)
+
+      ! State average energy after the cipsi step
+      call state_average_energy(e_cipsi)
+
+      print*,''
+      print*,'********** optimization step **********'
+      ! orbital optimization
+      call run_orb_opt_trust_v2
+
+      ! State average energy after the orbital optimization
+      call state_average_energy(e_opt)
+
+      print*,''
+      print*,'********** diff step **********'
+      ! Gain in energy
+      delta_e = e_opt - e_cipsi
+      print*, 'Gain in energy during the orbital optimization:', delta_e
+
+      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
+      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
+      close(10)
+
+      ! Exit
+      if (delta_e > 1d-12) then
+          print*, 'WARNING, something wrong happened'
+          print*, 'The gain (delta_e) in energy during the optimization process'
+          print*, 'is > 0, but it must be < 0'
+          print*, 'The program will exit'
+          exit
+      endif
+
+      if (n_det > n_det_max_opt) then
+          print*, 'The number of determinants in the wf > n_det_max_opt'
+          print*, 'The program will exit'
+          exit
+      endif
+      
+      ! To double the number of determinants in the wf
+      N_det_max = int(dble(n_det * 2)*0.9)
+      TOUCH N_det_max
+
+      nb_iter = nb_iter + 1
+  enddo
+
+end
diff --git a/src/mo_optimization/run_orb_opt_trust_v2.irp.f b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
similarity index 100%
rename from src/mo_optimization/run_orb_opt_trust_v2.irp.f
rename to src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
diff --git a/src/mo_optimization/save_energy.irp.f b/src/mo_optimization_utils/save_energy.irp.f
similarity index 100%
rename from src/mo_optimization/save_energy.irp.f
rename to src/mo_optimization_utils/save_energy.irp.f
diff --git a/src/mo_optimization/state_average_energy.irp.f b/src/mo_optimization_utils/state_average_energy.irp.f
similarity index 100%
rename from src/mo_optimization/state_average_energy.irp.f
rename to src/mo_optimization_utils/state_average_energy.irp.f
diff --git a/src/mo_optimization/state_weight_normalization.irp.f b/src/mo_optimization_utils/state_weight_normalization.irp.f
similarity index 100%
rename from src/mo_optimization/state_weight_normalization.irp.f
rename to src/mo_optimization_utils/state_weight_normalization.irp.f
diff --git a/src/mo_optimization/update_parameters.irp.f b/src/mo_optimization_utils/update_parameters.irp.f
similarity index 100%
rename from src/mo_optimization/update_parameters.irp.f
rename to src/mo_optimization_utils/update_parameters.irp.f
diff --git a/src/mo_optimization/update_st_av_ci_energy.irp.f b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
similarity index 100%
rename from src/mo_optimization/update_st_av_ci_energy.irp.f
rename to src/mo_optimization_utils/update_st_av_ci_energy.irp.f

From 6985d4d5493a6204f95a8d1d1cccbccc80c12071 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Fri, 12 Jul 2024 18:25:17 +0200
Subject: [PATCH 126/131] the casscf does not work with mo optimization ...

---
 src/casscf_cipsi/EZFIO.cfg                    |   6 +
 src/casscf_cipsi/NEED                         |   2 +-
 src/casscf_cipsi/casscf.irp.f                 | 166 +++++++++---------
 .../class.irp.f                               |   0
 4 files changed, 94 insertions(+), 80 deletions(-)
 rename src/{mo_optimization_utils => mo_optimization}/class.irp.f (100%)

diff --git a/src/casscf_cipsi/EZFIO.cfg b/src/casscf_cipsi/EZFIO.cfg
index 18e0b6b1..5b72d906 100644
--- a/src/casscf_cipsi/EZFIO.cfg
+++ b/src/casscf_cipsi/EZFIO.cfg
@@ -79,3 +79,9 @@ type: logical
 doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change
 interface: ezfio,provider,ocaml
 default: False
+
+[act_mos_opt]
+type: logical
+doc: If |true|, the active orbitals are also optimized variationally
+interface: ezfio,provider,ocaml
+default: False
diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED
index 11d1a78c..32f5ae90 100644
--- a/src/casscf_cipsi/NEED
+++ b/src/casscf_cipsi/NEED
@@ -3,4 +3,4 @@ selectors_full
 generators_cas
 two_body_rdm
 dav_general_mat
-mo_optimization
+mo_optimization_utils
diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f
index dc3e2245..b64a9d8f 100644
--- a/src/casscf_cipsi/casscf.irp.f
+++ b/src/casscf_cipsi/casscf.irp.f
@@ -46,94 +46,101 @@ subroutine run
   do while (.not.converged)
     print*,'pt2_max = ',pt2_max
     call run_stochastic_cipsi(Ev,PT2)
-    print*,'Ev,PT2',Ev(1),PT2(1)
-    E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states)
-    energy_old = energy
-    energy = eone+etwo+ecore
-    pt2_max_before = pt2_max
-
-    call write_time(6)
-    call write_int(6,iteration,'CAS-SCF iteration = ')
-    call write_double(6,energy,'State-average CAS-SCF energy = ')
-!    if(n_states == 1)then
-!     call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
-!     call ezfio_get_casscf_cipsi_energy(PT2)
-     double precision :: delta_E_istate, e_av
-     e_av = 0.d0
-     do istate=1,N_states
-      e_av += state_average_weight(istate) * Ev(istate) 
-      if(istate.gt.1)then
-       delta_E_istate = E_PT2(istate)  - E_PT2(1)
-      write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2    = ',delta_E_istate
-      endif
-      write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate)
-      write(*,'(A6,I2,A18,F16.10)')'state ',istate,'     PT2 energy = ',PT2(istate)
-!      call write_double(6,E_PT2(istate),'E + PT2 energy = ')
-!      call write_double(6,PT2(istate),'  PT2          = ')
-     enddo
-     call write_double(6,e_av,'State-average CAS-SCF energy bis = ')
-     call write_double(6,pt2_max,' PT2_MAX       = ')
+!    if(act_mos_opt)then DOES NOT WORK
+!     call run_orb_opt_trust_v2
+!     call run_stochastic_cipsi(Ev,PT2)
 !    endif
-
-    print*,''
-    call write_double(6,norm_grad_vec2,'Norm of gradients = ')
-    call write_double(6,norm_grad_vec2_tab(1), '     Core-active  gradients = ')
-    call write_double(6,norm_grad_vec2_tab(2), '     Core-virtual gradients = ')
-    call write_double(6,norm_grad_vec2_tab(3), '   Active-virtual gradients = ')
-    print*,''
-    call write_double(6,energy_improvement, 'Predicted energy improvement = ')
-
-    if(criterion_casscf == "energy")then
-     converged = dabs(energy_improvement) < thresh_scf
-    else if (criterion_casscf == "gradients")then
-     converged = norm_grad_vec2 < thresh_scf
-    else if (criterion_casscf == "e_pt2")then
-     delta_E = 0.d0
-     do istate = 1, N_states
-      delta_E += dabs(E_PT2(istate) - ept2_before(istate))
-     enddo
-     converged = dabs(delta_E) < thresh_casscf
-    endif
-    ept2_before = E_PT2
-    if(.not.small_active_space)then
-     if(adaptive_pt2_max)then
-      pt2_max = dabs(energy_improvement / (pt2_relative_error))
-      pt2_max = min(pt2_max, pt2_max_before)
-      if(n_act_orb.ge.n_big_act_orb)then
-       pt2_max = max(pt2_max,pt2_min_casscf)
-      endif
+    if(.True.)then
+     print*,'Ev,PT2',Ev(1),PT2(1)
+     E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states)
+     energy_old = energy
+     energy = eone+etwo+ecore
+     pt2_max_before = pt2_max
+ 
+     call write_time(6)
+     call write_int(6,iteration,'CAS-SCF iteration = ')
+     call write_double(6,energy,'State-average CAS-SCF energy = ')
+!!    if(n_states == 1)then
+!!     call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
+!!     call ezfio_get_casscf_cipsi_energy(PT2)
+      double precision :: delta_E_istate, e_av
+      e_av = 0.d0
+      do istate=1,N_states
+       e_av += state_average_weight(istate) * Ev(istate) 
+       if(istate.gt.1)then
+        delta_E_istate = E_PT2(istate)  - E_PT2(1)
+       write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2    = ',delta_E_istate
+       endif
+       write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate)
+       write(*,'(A6,I2,A18,F16.10)')'state ',istate,'     PT2 energy = ',PT2(istate)
+!!      call write_double(6,E_PT2(istate),'E + PT2 energy = ')
+!!      call write_double(6,PT2(istate),'  PT2          = ')
+      enddo
+      call write_double(6,e_av,'State-average CAS-SCF energy bis = ')
+      call write_double(6,pt2_max,' PT2_MAX       = ')
+!!    endif
+ 
+     print*,''
+     call write_double(6,norm_grad_vec2,'Norm of gradients = ')
+     call write_double(6,norm_grad_vec2_tab(1), '     Core-active  gradients = ')
+     call write_double(6,norm_grad_vec2_tab(2), '     Core-virtual gradients = ')
+     call write_double(6,norm_grad_vec2_tab(3), '   Active-virtual gradients = ')
+     print*,''
+     call write_double(6,energy_improvement, 'Predicted energy improvement = ')
+ 
+     if(criterion_casscf == "energy")then
+      converged = dabs(energy_improvement) < thresh_scf
+     else if (criterion_casscf == "gradients")then
+      converged = norm_grad_vec2 < thresh_scf
+     else if (criterion_casscf == "e_pt2")then
+      delta_E = 0.d0
+      do istate = 1, N_states
+       delta_E += dabs(E_PT2(istate) - ept2_before(istate))
+      enddo
+      converged = dabs(delta_E) < thresh_casscf
      endif
-    endif
-    print*,''
-    call write_double(6,pt2_max, 'PT2_MAX for next iteration = ')
-
-    mo_coef = NewOrbs
-    mo_occ  = occnum
-    if(.not.converged)then
-     call save_mos
-     iteration += 1
-     if(norm_grad_vec2.gt.0.01d0)then
-      N_det = N_states
-     else
-      N_det = max(N_det/8 ,N_states)
-     endif
-     psi_det = psi_det_sorted
-     psi_coef = psi_coef_sorted
-     read_wf = .True.
-     call clear_mo_map
-     SOFT_TOUCH mo_coef N_det psi_det psi_coef
+     ept2_before = E_PT2
      if(.not.small_active_space)then
       if(adaptive_pt2_max)then
-        SOFT_TOUCH pt2_max  
+       pt2_max = dabs(energy_improvement / (pt2_relative_error))
+       pt2_max = min(pt2_max, pt2_max_before)
+       if(n_act_orb.ge.n_big_act_orb)then
+        pt2_max = max(pt2_max,pt2_min_casscf)
+       endif
       endif
      endif
-     if(iteration .gt. 3)then
-      state_following_casscf = state_following_casscf_cipsi_save
-      soft_touch state_following_casscf
+     print*,''
+     call write_double(6,pt2_max, 'PT2_MAX for next iteration = ')
+ 
+     mo_coef = NewOrbs
+     mo_occ  = occnum
+     if(.not.converged)then
+      call save_mos
+      iteration += 1
+      if(norm_grad_vec2.gt.0.01d0)then
+       N_det = N_states
+      else
+       N_det = max(N_det/8 ,N_states)
+      endif
+      psi_det = psi_det_sorted
+      psi_coef = psi_coef_sorted
+      read_wf = .True.
+      call clear_mo_map
+      SOFT_TOUCH mo_coef N_det psi_det psi_coef
+      if(.not.small_active_space)then
+       if(adaptive_pt2_max)then
+         SOFT_TOUCH pt2_max  
+       endif
+      endif
+      if(iteration .gt. 3)then
+       state_following_casscf = state_following_casscf_cipsi_save
+       soft_touch state_following_casscf
+      endif
      endif
     endif
-
+ 
   enddo
+  if(.True.)then
      integer :: i
     print*,'Converged CASSCF '
     print*,'--------------------------'
@@ -153,6 +160,7 @@ subroutine run
 !   write(*,*)mcscf_fock_alpha_mo(i,i)
   enddo
 
+ endif
 
 end
 
diff --git a/src/mo_optimization_utils/class.irp.f b/src/mo_optimization/class.irp.f
similarity index 100%
rename from src/mo_optimization_utils/class.irp.f
rename to src/mo_optimization/class.irp.f

From 31028f8979189d1cc2822d4dc60eb527ed639932 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Tue, 16 Jul 2024 17:44:48 +0200
Subject: [PATCH 127/131] fixed some weird dependencies in TC, introduced an AO
 cholesky 2e function

---
 .../cipsi_tc_bi_ortho/stochastic_cipsi.irp.f  | 32 +++++++++++++++++++
 plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f | 20 ++++++++++--
 .../local/non_h_ints_mu/total_tc_int.irp.f    | 18 +++++++----
 plugins/local/slater_tc/slater_tc_opt.irp.f   |  2 --
 plugins/local/slater_tc/tc_hmat.irp.f         |  4 ++-
 src/ao_two_e_ints/cholesky.irp.f              | 15 ++++++++-
 6 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
index e363830d..6b8f3b42 100644
--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@@ -1,4 +1,36 @@
 
+
+! ---
+
+subroutine run_pouet
+
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+
+  use selection_types
+  implicit none
+  integer                       :: i, j, k, ndet
+  integer                       :: to_select
+  logical                       :: has
+  type(pt2_type)                :: pt2_data, pt2_data_err
+  double precision              :: rss
+  double precision              :: correlation_energy_ratio
+  double precision              :: hf_energy_ref
+  double precision              :: relative_error
+  double precision, allocatable :: zeros(:),E_tc(:), norm(:)
+
+  logical,          external    :: qp_stop
+  double precision, external    :: memory_of_double
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE H_apply_buffer_allocated distributed_davidson 
+
+  print*, ' Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
+  enddo
+end
 ! ---
 
 subroutine run_stochastic_cipsi
diff --git a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
index 1c1c0411..f1de0fe3 100644
--- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -65,7 +65,15 @@ subroutine run_cipsi_tc()
 
   if (.not. is_zmq_slave) then
 
-    PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
+     PROVIDE Fock_matrix_tc_mo_tot 
+     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
+     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
+    endif
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !! 
+     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e 
+    endif
 
     if((elec_alpha_num+elec_beta_num) .ge. 3) then
       if(three_body_h_tc) then
@@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
     call json_close
 
   else
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
+     PROVIDE Fock_matrix_tc_mo_tot 
+     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
+     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
+    endif
 
-    PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    endif
 
     if((elec_alpha_num+elec_beta_num) .ge. 3) then
       if(three_body_h_tc) then
diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
index 656f5f16..fb09168e 100644
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@@ -288,25 +288,31 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
       !$OMP END DO
       !$OMP END PARALLEL
     else
-      print*, ' ao_integrals_map will be used'
-      PROVIDE ao_integrals_map
+!      print*, ' ao_integrals_map will be used'
+!      PROVIDE ao_integrals_map
+      print*,'Cholesky vectors will be used '
+      double precision :: get_ao_integ_chol,eri
+      eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP 
       !$OMP PARALLEL DEFAULT(NONE)                            &
-      !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
-      !$OMP PRIVATE(i, j, k, l)
+!!!    !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot) &
+      !$OMP PRIVATE(i, j, k, l,eri)
       !$OMP DO COLLAPSE(3)
       do j = 1, ao_num
         do l = 1, ao_num
           do i = 1, ao_num
             do k = 1, ao_num
               !                                                     < 1:i, 2:j | 1:k, 2:l > 
-              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+!              eri =  get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+               eri = get_ao_integ_chol(i,k,j,l)
+              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri
             enddo
           enddo
         enddo
       enddo
       !$OMP END DO
       !$OMP END PARALLEL
-      FREE ao_integrals_map
+!      FREE ao_integrals_map
     endif
 
     if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then
diff --git a/plugins/local/slater_tc/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f
index 5651a299..3c4421f8 100644
--- a/plugins/local/slater_tc/slater_tc_opt.irp.f
+++ b/plugins/local/slater_tc/slater_tc_opt.irp.f
@@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho()
   implicit none
   double precision :: t1, t2
 
-  PROVIDE ao_two_e_integrals_in_map
-
   print *, ' start provide_all_three_ints_bi_ortho'
   call wall_time(t1)
 
diff --git a/plugins/local/slater_tc/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f
index cc780364..6323d129 100644
--- a/plugins/local/slater_tc/tc_hmat.irp.f
+++ b/plugins/local/slater_tc/tc_hmat.irp.f
@@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]
   print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...'
   call wall_time(t1)
 
-  call provide_all_three_ints_bi_ortho()
+  if(three_body_h_tc)then
+   call provide_all_three_ints_bi_ortho()
+  endif
 
   i = 1
   j = 1
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index a2d9d043..bfa6bd0a 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -1,3 +1,15 @@
+double precision function get_ao_integ_chol(i,j,k,l)
+ implicit none
+  BEGIN_DOC
+  !  CHOLESKY representation of the integral of the AO basis <ik|jl> or (ij|kl)
+  !     i(r1) j(r1) 1/r12 k(r2) l(r2)
+  END_DOC
+ integer, intent(in) :: i,j,k,l
+ double precision, external :: ddot                                                                                  
+ get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1)
+
+end
+
 BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ]
  implicit none
  BEGIN_DOC
@@ -162,7 +174,8 @@ END_PROVIDER
      np = int(np8,4)
      if (np <= 0) stop 'np<=0'
 
-     rank_max = min(np,20*elec_num*elec_num)
+!     rank_max = min(np,20*elec_num*elec_num)
+     rank_max = np
      call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
      call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
 

From a0140b9b0af59712494de5b443846c70a432465b Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 24 Jul 2024 12:20:16 +0200
Subject: [PATCH 128/131] added mu_of_r_mean_field.irp.f

---
 src/mu_of_r/mu_of_r_mean_field.irp.f | 132 +++++++++++++++++++++++++++
 src/mu_of_r/test_proj_op.irp.f       |  96 ++++++++++++++++++-
 2 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 src/mu_of_r/mu_of_r_mean_field.irp.f

diff --git a/src/mu_of_r/mu_of_r_mean_field.irp.f b/src/mu_of_r/mu_of_r_mean_field.irp.f
new file mode 100644
index 00000000..9b9c2e20
--- /dev/null
+++ b/src/mu_of_r/mu_of_r_mean_field.irp.f
@@ -0,0 +1,132 @@
+BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
+ implicit none
+ integer :: i,j,k,l 
+ double precision :: get_two_e_integral
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map) 
+    enddo
+   enddo
+  enddo
+ enddo
+END_PROVIDER 
+
+subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ implicit none
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b
+ double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_alpha_num))
+ call give_all_mos_at_r(r,mos_array_r) 
+ do i = 1, elec_alpha_num
+  mos_array_a(i) = mos_array_r(i)
+ enddo
+ do i = 1, elec_beta_num
+  mos_array_b(i) = mos_array_r(i)
+ enddo
+
+ dm_a = 0.d0
+ do i = 1, elec_alpha_num
+  dm_a += mos_array_a(i) * mos_array_a(i) 
+ enddo
+
+ dm_b = 0.d0
+ do i = 1, elec_beta_num
+  dm_b += mos_array_b(i) * mos_array_b(i) 
+ enddo
+ two_bod_dens = dm_a * dm_b
+ 
+ f_mf_ab = 0.d0
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l)
+    enddo
+   enddo
+  enddo
+ enddo
+ ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm
+ f_mf_ab *= 2.d0 
+ two_bod_dens *= 2.d0
+
+end
+
+subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+ implicit none
+ double precision, intent(in)  :: r(3)
+ double precision, intent(out) :: f_mf_ab, two_bod_dens
+ double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3)
+ double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3)
+
+ double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:)
+ double precision, allocatable :: mos_array_a(:), mos_array_b(:)
+ double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:)
+ double precision :: mo_i, mo_j, mo_k, mo_l
+ double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3)
+ 
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num))
+ allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
+ allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num))
+ call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r)
+ do i = 1, elec_alpha_num
+  mos_array_a(i) = mos_array_r(i)
+  mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i)
+ enddo
+ do i = 1, elec_beta_num
+  mos_array_b(i) = mos_array_r(i)
+  mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i)
+ enddo
+
+ ! ALPHA DENSITY AND GRADIENT 
+ dm_a = 0.d0
+ grad_dm_a = 0.d0
+ do i = 1, elec_alpha_num
+  dm_a += mos_array_a(i) * mos_array_a(i) 
+  grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i)  
+ enddo
+
+ ! BETA DENSITY AND GRADIENT 
+ dm_b = 0.d0
+ grad_dm_b = 0.d0
+ do i = 1, elec_beta_num
+  dm_b += mos_array_b(i) * mos_array_b(i) 
+  grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i)  
+ enddo
+ ! TWO-BODY DENSITY AND GRADIENT 
+ two_bod_dens = dm_a * dm_b
+ grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3)
+
+ ! F_MF and GRADIENT 
+ grad_f_mf_ab = 0.d0
+ f_mf_ab  = 0.d0
+ do i = 1, elec_alpha_num
+  mo_i = mos_array_a(i)
+  grad_mo_i(1:3) = mos_grad_array_a(1:3,i)
+  do j = 1, elec_beta_num
+   mo_j = mos_array_b(j)
+   grad_mo_j(1:3) = mos_grad_array_b(1:3,j)
+   do k = 1, elec_alpha_num
+    mo_k = mos_array_a(k)
+    grad_mo_k(1:3) = mos_grad_array_a(1:3,k)
+    do l = 1, elec_beta_num
+     mo_l = mos_array_b(l)
+     grad_mo_l(1:3) = mos_grad_array_b(1:3,l)
+     f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l
+     grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & 
+     (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & 
+     +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l)
+    enddo
+   enddo
+  enddo
+ enddo
+
+ f_mf_ab *= 2.d0 
+ two_bod_dens *= 2.d0
+ grad_f_mf_ab *= 2.D0
+ grad_two_bod_dens *= 2.d0
+end
diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f
index fd5e976b..bd2f3b4f 100644
--- a/src/mu_of_r/test_proj_op.irp.f
+++ b/src/mu_of_r/test_proj_op.irp.f
@@ -17,7 +17,9 @@ program projected_operators
 !   call test_f_ii_valence_ab
 !   call test_f_ia_valence_ab
 !  call test_f_ii_ia_aa_valence_ab
- call test
+! call test
+!  call test_f_mean_field
+ call test_grad_f_mean_field
 end
 
 
@@ -35,3 +37,95 @@ subroutine test
  print*,'accu = ',accu
 
 end
+
+subroutine test_f_mean_field
+ implicit none
+ integer :: i_point
+ double precision :: weight,r(3)
+ double precision :: ref_f, new_f, accu_f
+ double precision :: ref_two_dens, new_two_dens, accu_two_dens, dm_a, dm_b
+ accu_f = 0.d0
+ accu_two_dens = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call get_f_mf_ab(r,new_f,new_two_dens, dm_a, dm_b)
+  call f_HF_valence_ab(r,r,ref_f,ref_two_dens)
+  accu_f += weight * dabs(new_f- ref_f)
+  accu_two_dens += weight * dabs(new_two_dens - ref_two_dens)
+ enddo
+ print*,'accu_f        = ',accu_f
+ print*,'accu_two_dens = ',accu_two_dens
+
+end
+
+subroutine test_grad_f_mean_field
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3)
+ double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3)
+ double precision :: grad_dm_a(3), grad_dm_b(3)
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+
+ double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3)
+ double precision :: num_grad_dm_a(3), num_grad_dm_b(3)
+ double precision :: f_mf_ab_p,f_mf_ab_m
+ double precision :: two_bod_dens_p, two_bod_dens_m
+ double precision :: dm_a_p, dm_a_m
+ double precision :: dm_b_p, dm_b_m
+ double precision :: rbis(3), dr
+ double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3)
+ double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3)
+ double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b
+ dr = 0.00001d0
+ accu_f_mf_ab = 0.d0 
+ accu_two_bod_dens = 0.d0 
+ accu_dm_a = 0.d0 
+ accu_dm_b = 0.d0
+
+ accu_grad_f_mf_ab = 0.d0
+ accu_grad_two_bod_dens = 0.d0
+ accu_grad_dm_a = 0.d0
+ accu_grad_dm_b = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+  call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
+  accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p)
+  accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p)
+  accu_dm_a += weight*dabs(dm_a - dm_a_p)
+  accu_dm_b += weight*dabs(dm_b - dm_b_p)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
+   rbis = r
+   rbis(k) -= dr
+   call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m)
+   num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr)
+   num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr)
+   num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr)
+   num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k))
+   accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k))
+   accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k))
+   accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k))
+  enddo
+ enddo
+ print*,'accu_f_mf_ab = ',accu_f_mf_ab
+ print*,'accu_two_bod_dens = ',accu_two_bod_dens
+ print*,'accu_dm_a = ',accu_dm_a
+ print*,'accu_dm_b = ',accu_dm_b
+ print*,'accu_grad_f_mf_ab = '
+ print*,accu_grad_f_mf_ab
+ print*,'accu_grad_two_bod_dens = '
+ print*,accu_grad_two_bod_dens
+ print*,'accu_dm_a = '
+ print*,accu_grad_dm_a
+ print*,'accu_dm_b = '
+ print*,accu_grad_dm_b
+
+end

From cb8bef2ecda20f5e8b6076ece8cd922aa063db65 Mon Sep 17 00:00:00 2001
From: eginer <giner.emmanuel@gmail.com>
Date: Wed, 24 Jul 2024 12:43:20 +0200
Subject: [PATCH 129/131] added gradients of mu_mf

---
 src/mu_of_r/mu_of_r_mean_field.irp.f | 39 +++++++++++++++++++++++
 src/mu_of_r/test_proj_op.irp.f       | 47 +++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/src/mu_of_r/mu_of_r_mean_field.irp.f b/src/mu_of_r/mu_of_r_mean_field.irp.f
index 9b9c2e20..6abc7e4f 100644
--- a/src/mu_of_r/mu_of_r_mean_field.irp.f
+++ b/src/mu_of_r/mu_of_r_mean_field.irp.f
@@ -130,3 +130,42 @@ subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_de
  grad_f_mf_ab *= 2.D0
  grad_two_bod_dens *= 2.d0
 end
+
+subroutine mu_of_r_mean_field(r,mu_mf, dm)
+ implicit none
+  include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ dm = dm_a + dm_b
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+ else
+  mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
+ endif
+end
+
+subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+ implicit none
+ include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: grad_mu_mf(3), grad_dm(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3)
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+
+ dm = dm_a + dm_b
+ grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3)
+
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+  grad_mu_mf = 0.d0
+ else
+  mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
+  grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& 
+                                   /(two_bod_dens*two_bod_dens)
+ endif 
+
+end
diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f
index bd2f3b4f..cf53c772 100644
--- a/src/mu_of_r/test_proj_op.irp.f
+++ b/src/mu_of_r/test_proj_op.irp.f
@@ -19,7 +19,8 @@ program projected_operators
 !  call test_f_ii_ia_aa_valence_ab
 ! call test
 !  call test_f_mean_field
- call test_grad_f_mean_field
+! call test_grad_f_mean_field
+ call test_grad_mu_mf
 end
 
 
@@ -129,3 +130,47 @@ subroutine test_grad_f_mean_field
  print*,accu_grad_dm_b
 
 end
+
+subroutine test_grad_mu_mf
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3),rbis(3)
+ double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3)
+ double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3)
+ double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3)
+ dr = 0.00001d0
+ accu_grad_mu_mf = 0.d0
+ accu_mu = 0.d0
+ accu_grad_dm = 0.d0
+ accu_dm = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+  call mu_of_r_mean_field(r,mu_mf_p, dm_p)
+  accu_mu += weight*dabs(mu_mf_p - mu_mf)
+  accu_dm += weight*dabs(dm_p - dm)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call mu_of_r_mean_field(rbis,mu_mf_p, dm_p)
+   rbis = r
+   rbis(k) -= dr
+   call mu_of_r_mean_field(rbis,mu_mf_m, dm_m)
+
+   num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr)
+   num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k))
+   accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k))
+  enddo
+ enddo
+ print*,'accu_mu = ',accu_mu
+ print*,'accu_dm = ',accu_dm
+ print*,'accu_grad_dm = '
+ print*, accu_grad_dm
+ print*,'accu_grad_mu_mf = '
+ print*, accu_grad_mu_mf
+
+end

From edf3a27534e531a1866eac30202bf26ca305123a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Mon, 29 Jul 2024 16:15:48 +0200
Subject: [PATCH 130/131] rank_max

---
 src/ao_two_e_ints/cholesky.irp.f | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index bfa6bd0a..ccaa7239 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -174,8 +174,11 @@ END_PROVIDER
      np = int(np8,4)
      if (np <= 0) stop 'np<=0'
 
-!     rank_max = min(np,20*elec_num*elec_num)
      rank_max = np
+     ! Avoid too large arrays when there are many electrons
+     if (elec_num > 10) then
+       rank_max = min(np,20*elec_num*elec_num)
+     endif
      call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
      call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
 

From 4d5467218de414d76e81a5080f762519e49f54f4 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Wed, 31 Jul 2024 16:36:44 +0200
Subject: [PATCH 131/131] Fix state following when N_states_diag is too small

---
 .../diagonalization_hs2_dressed.irp.f         | 50 +++----------------
 src/davidson/diagonalize_ci.irp.f             |  5 +-
 2 files changed, 10 insertions(+), 45 deletions(-)

diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f
index fb04b29b..191e0021 100644
--- a/src/davidson/diagonalization_hs2_dressed.irp.f
+++ b/src/davidson/diagonalization_hs2_dressed.irp.f
@@ -594,6 +594,13 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
           state(l) = idx
         enddo
 
+        ! Check if all states are attributed. If not, exit and N_st_diag will be increased.
+        do l=1,N_st
+          if (state(l) == 0) then
+            return
+          endif
+        enddo
+
         ! tmp array before setting state_ok
         ok = .False.
         do l = 1, N_st
@@ -627,47 +634,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
       ! Swapped eigenvectors
       prev_y = y
 
-!      if (state_following) then
-!
-!        overlap = -1.d0
-!        do k=1,shift2
-!          do i=1,shift2
-!            overlap(k,i) = dabs(y(k,i))
-!          enddo
-!        enddo
-!        do k=1,N_st
-!          cmax = -1.d0
-!          do i=1,N_st
-!            if (overlap(i,k) > cmax) then
-!              cmax = overlap(i,k)
-!              order(k) = i
-!            endif
-!          enddo
-!          do i=1,N_st_diag
-!            overlap(order(k),i) = -1.d0
-!          enddo
-!        enddo
-!        overlap = y
-!        do k=1,N_st
-!          l = order(k)
-!          if (k /= l) then
-!            y(1:shift2,k) = overlap(1:shift2,l)
-!          endif
-!        enddo
-!        do k=1,N_st
-!          overlap(k,1) = lambda(k)
-!          overlap(k,2) = s2(k)
-!        enddo
-!        do k=1,N_st
-!          l = order(k)
-!          if (k /= l) then
-!            lambda(k) = overlap(l,1)
-!            s2(k) = overlap(l,2)
-!          endif
-!        enddo
-!
-!      endif
-
 
       ! Express eigenvectors of h in the determinant basis
       ! --------------------------------------------------
@@ -703,7 +669,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_
 
 
       if ((itertot>1).and.(iter == 1)) then
-        !don't print
+        ! Don't print
         continue
       else
         write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,ES11.3))') iter-1, to_print(1:3,1:N_st)
diff --git a/src/davidson/diagonalize_ci.irp.f b/src/davidson/diagonalize_ci.irp.f
index 59c8313a..6b852905 100644
--- a/src/davidson/diagonalize_ci.irp.f
+++ b/src/davidson/diagonalize_ci.irp.f
@@ -282,9 +282,8 @@ END_PROVIDER
          print*,'  Within the ',N_det,'determinants selected'
          print*,'  and the ',N_states_diag,'states requested'
          print*,'  We did not find only states with S^2 values close to ',expected_s2
-         print*,'  We will then set the first N_states eigenvectors of the H matrix'
-         print*,'  as the CI_eigenvectors'
-         print*,'  You should consider more states and maybe ask for s2_eig to be .True. or just enlarge the CI space'
+         print*,'  You should consider more states, or change s2_eig, or just enlarge the CI space'
+         print*,'!!!!!!!!   WARNING  !!!!!!!!!'
          print*,''
 
          do j=1,min(N_states_diag,N_det)