diff --git a/bin/zcat b/bin/zcat deleted file mode 100755 index 7ccecf07..00000000 --- a/bin/zcat +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# On Darwin: try gzcat if available, otherwise use Python - -if [[ $(uname -s) = Darwin ]] ; then - which gzcat &> /dev/null - if [[ $? -eq 0 ]] ; then - exec gzcat $@ - else - - exec python3 << EOF -import sys -import gzip -with gzip.open("$1", "rt") as f: - print(f.read()) -EOF - fi -else - SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" - command=$(which -a zcat | grep -v "$SCRIPTPATH/" | head -1) - exec $command $@ -fi - diff --git a/config/gfortran_debug_mkl.cfg b/config/gfortran_debug_mkl.cfg new file mode 100644 index 00000000..1dc3f2b2 --- /dev/null +++ b/config/gfortran_debug_mkl.cfg @@ -0,0 +1,63 @@ +# Common flags +############## +# +# -ffree-line-length-none : Needed for IRPF90 which produces long lines +# -lblas -llapack : Link with libblas and liblapack libraries provided by the system +# -I . : Include the current directory (Mandatory) +# +# --ninja : Allow the use of ninja. (Mandatory) +# --align=32 : Align all provided arrays on a 32-byte boundary +# +# +[COMMON] +FC : gfortran -g -ffree-line-length-none -I . -fPIC -std=legacy +LAPACK_LIB : -I${MKLROOT}/include -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_gf_lp64 -lmkl_core -lpthread -lm -ldl -lmkl_gnu_thread -lgomp -fopenmp +IRPF90 : irpf90 +IRPF90_FLAGS : --ninja --align=32 --assert -DSET_NESTED + +# Global options +################ +# +# 1 : Activate +# 0 : Deactivate +# +[OPTION] +MODE : DEBUG ; [ OPT | PROFILE | DEBUG ] : Chooses the section below +CACHE : 0 ; Enable cache_compile.py +OPENMP : 1 ; Append OpenMP flags + +# Optimization flags +#################### +# +# -Ofast : Disregard strict standards compliance. Enables all -O3 optimizations. +# It also enables optimizations that are not valid +# for all standard-compliant programs. It turns on +# -ffast-math and the Fortran-specific +# -fno-protect-parens and -fstack-arrays. +[OPT] +FCFLAGS : -Ofast + +# Profiling flags +################# +# +[PROFILE] +FC : -p -g +FCFLAGS : -Ofast + +# Debugging flags +################# +# +# -fcheck=all : Checks uninitialized variables, array subscripts, etc... +# -g : Extra debugging information +# +[DEBUG] +#FCFLAGS : -g -msse4.2 -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant -Wuninitialized -fbacktrace -ffpe-trap=zero,overflow,underflow -finit-real=nan +FCFLAGS : -g -mavx -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant -Wuninitialized -fbacktrace -ffpe-trap=zero,overflow -finit-real=nan + +# OpenMP flags +################# +# +[OPENMP] +FC : -fopenmp +IRPF90_FLAGS : --openmp + diff --git a/configure b/configure index 41c0123d..43ca9f6d 100755 --- a/configure +++ b/configure @@ -40,14 +40,16 @@ Usage: $(basename $0) -c $(basename $0) -h $(basename $0) -i + $(basename $0) -g [nvidia|intel|none] Options: - -c Define a COMPILATION configuration file, - in "${QP_ROOT}/config/". - -h Print the HELP message - -i INSTALL . Use at your OWN RISK: - no support will be provided for the installation of - dependencies.
+ -g [nvidia|intel|none] Choose GPU acceleration Example: ./$(basename $0) -c config/gfortran.cfg @@ -83,7 +85,7 @@ function execute () { PACKAGES="" -while getopts "d:c:i:h" c ; do +while getopts "d:c:i:g:h" c ; do case "$c" in c) case "$OPTARG" in @@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do "") help ; break;; *) PACKAGES="${PACKAGE} $OPTARG" esac;; + g) + GPU=$OPTARG; + break;; h) help exit 0;; @@ -109,6 +114,27 @@ while getopts "d:c:i:h" c ; do esac done +# Handle GPU acceleration +rm -f ${QP_ROOT}/src/gpu_arch +case "$GPU" in + amd) # AMD + echo "Activating AMD GPU acceleration" + ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch + ;; + intel) # Intel + echo "Activating Intel GPU acceleration (EXPERIMENTAL)" + ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch + ;; + nvidia) # Nvidia + echo "Activating Nvidia GPU acceleration" + ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch + ;; + *) # No Acceleration + echo "Disabling GPU acceleration" + ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch + ;; +esac + # Trim leading and trailing spaces PACKAGES=$(echo $PACKAGES | xargs) diff --git a/etc/paths.rc b/etc/paths.rc index dc1741e8..843ec383 100644 --- a/etc/paths.rc +++ b/etc/paths.rc @@ -28,6 +28,15 @@ function qp_prepend_export () { fi } +function qp_append_export () { + eval "value_1="\${$1}"" + if [[ -z $value_1 ]] ; then + echo "${2}:" + else + echo "${value_1}:${2}" + fi +} + export PYTHONPATH=$(qp_prepend_export "PYTHONPATH" "${QP_EZFIO}/Python":"${QP_PYTHON}") export PATH=$(qp_prepend_export "PATH" "${QP_PYTHON}":"${QP_ROOT}"/bin:"${QP_ROOT}"/ocaml) diff --git a/external/irpf90 b/external/irpf90 index beac6153..4ab1b175 160000 --- a/external/irpf90 +++ b/external/irpf90 @@ -1 +1 @@ -Subproject commit beac615343f421bd6c0571a408ba389a6d5a32ac +Subproject commit 4ab1b175fc7ed0d96c1912f13dc53579b24157a6 diff --git a/plugins/local/basis_correction/51.basis_c.bats b/plugins/local/basis_correction/51.basis_c.bats index 914b482b..1e20bae3 100644 --- a/plugins/local/basis_correction/51.basis_c.bats +++ b/plugins/local/basis_correction/51.basis_c.bats @@ -37,14 +37,6 @@ function run_sd() { eq $energy1 $1 $thresh } -@test "O2 CAS" { - qp set_file o2_cas.gms.ezfio - qp set_mo_class -c "[1-2]" -a "[3-10]" -d "[11-46]" - run -149.72435425 3.e-4 10000 - qp set_mo_class -c "[1-2]" -a "[3-10]" -v "[11-46]" - run_md -0.1160222327 1.e-6 -} - @test "LiF RHF" { qp set_file lif.ezfio diff --git a/plugins/local/basis_correction/basis_correction.irp.f b/plugins/local/basis_correction/basis_correction.irp.f index a7ea7244..f17b5d5b 100644 --- a/plugins/local/basis_correction/basis_correction.irp.f +++ b/plugins/local/basis_correction/basis_correction.irp.f @@ -7,10 +7,6 @@ program basis_correction touch read_wf no_core_density = .True. touch no_core_density - if(io_mo_two_e_integrals .ne. 
"Read")then - provide ao_two_e_integrals_in_map - endif - provide mo_two_e_integrals_in_map call print_basis_correction end diff --git a/plugins/local/basis_correction/print_routine.irp.f b/plugins/local/basis_correction/print_routine.irp.f index 96faba30..b3b38673 100644 --- a/plugins/local/basis_correction/print_routine.irp.f +++ b/plugins/local/basis_correction/print_routine.irp.f @@ -22,7 +22,7 @@ subroutine print_basis_correction print*, '****************************************' print*, '****************************************' print*, 'mu_of_r_potential = ',mu_of_r_potential - if(mu_of_r_potential.EQ."hf")then + if(mu_of_r_potential.EQ."hf".or.mu_of_r_potential.EQ."hf_old".or.mu_of_r_potential.EQ."hf_sparse")then print*, '' print*,'Using a HF-like two-body density to define mu(r)' print*,'This assumes that HF is a qualitative representation of the wave function ' diff --git a/plugins/local/basis_correction/test_chol_bas.irp.f b/plugins/local/basis_correction/test_chol_bas.irp.f new file mode 100644 index 00000000..076d888c --- /dev/null +++ b/plugins/local/basis_correction/test_chol_bas.irp.f @@ -0,0 +1,18 @@ +program pouet + implicit none + call test +end +subroutine test + implicit none +! provide mos_times_cholesky_r1 +! provide mos_times_cholesky_r2 + integer :: ipoint + double precision :: accu,weight + accu = 0.d0 + do ipoint = 1, n_points_final_grid + weight = final_weight_at_r_vector(ipoint) +! accu += dabs(mu_of_r_hf(ipoint) - mu_of_r_hf_old(ipoint)) * weight + accu += dabs(f_hf_cholesky_sparse(ipoint) - f_hf_cholesky(ipoint)) * weight + enddo + print*,'accu = ',accu +end diff --git a/plugins/local/bi_ort_ints/bi_ort_ints.irp.f b/plugins/local/bi_ort_ints/bi_ort_ints.irp.f index 0349c731..0398a18f 100644 --- a/plugins/local/bi_ort_ints/bi_ort_ints.irp.f +++ b/plugins/local/bi_ort_ints/bi_ort_ints.irp.f @@ -17,12 +17,15 @@ program bi_ort_ints ! call test_3e ! call test_5idx ! call test_5idx2 - call test_4idx() +! call test_4idx() !call test_4idx_n4() !call test_4idx2() !call test_5idx2 !call test_5idx + call test_mos_in_r() + call test_int2_grad1_u12_bimo_t() + end subroutine test_5idx2 @@ -472,4 +475,94 @@ subroutine test_4idx() return end +! 
--- + +subroutine test_mos_in_r() + + implicit none + + integer :: i, j + double precision :: err_tot, nrm_tot, err_loc, acc_thr + + PROVIDE mos_l_in_r_array_transp_old mos_r_in_r_array_transp_old + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + acc_thr = 1d-13 + + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do j = 1, n_points_final_grid + err_loc = dabs(mos_l_in_r_array_transp_old(j,i) - mos_l_in_r_array_transp(j,i)) + if(err_loc > acc_thr) then + print*, " error on", j, i + print*, " old res", mos_l_in_r_array_transp_old(j,i) + print*, " new res", mos_l_in_r_array_transp (j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(mos_l_in_r_array_transp_old(j,i)) + enddo + enddo + print *, ' absolute accuracy on mos_l_in_r_array_transp (%) =', 100.d0 * err_tot / nrm_tot + + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do j = 1, n_points_final_grid + err_loc = dabs(mos_r_in_r_array_transp_old(j,i) - mos_r_in_r_array_transp(j,i)) + if(err_loc > acc_thr) then + print*, " error on", j, i + print*, " old res", mos_r_in_r_array_transp_old(j,i) + print*, " new res", mos_r_in_r_array_transp (j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(mos_r_in_r_array_transp_old(j,i)) + enddo + enddo + print *, ' absolute accuracy on mos_r_in_r_array_transp (%) =', 100.d0 * err_tot / nrm_tot + + return +end + +! --- + +subroutine test_int2_grad1_u12_bimo_t() + + implicit none + integer :: i, j, ipoint, m + double precision :: err_tot, nrm_tot, err_loc, acc_thr + + PROVIDE int2_grad1_u12_bimo_t_old + PROVIDE int2_grad1_u12_bimo_t + + acc_thr = 1d-13 + + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do j = 1, mo_num + do m = 1, 3 + do ipoint = 1, n_points_final_grid + err_loc = dabs(int2_grad1_u12_bimo_t_old(ipoint,m,j,i) - int2_grad1_u12_bimo_t(ipoint,m,j,i)) + if(err_loc > acc_thr) then + print*, " error on", ipoint, m, j, i + print*, " old res", int2_grad1_u12_bimo_t_old(ipoint,m,j,i) + print*, " new res", int2_grad1_u12_bimo_t (ipoint,m,j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(int2_grad1_u12_bimo_t_old(ipoint,m,j,i)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on int2_grad1_u12_bimo_t (%) =', 100.d0 * err_tot / nrm_tot + + return +end + +! --- + diff --git a/plugins/local/bi_ort_ints/no_dressing.irp.f b/plugins/local/bi_ort_ints/no_dressing.irp.f index bd225274..3bbf31f9 100644 --- a/plugins/local/bi_ort_ints/no_dressing.irp.f +++ b/plugins/local/bi_ort_ints/no_dressing.irp.f @@ -322,6 +322,12 @@ END_PROVIDER BEGIN_PROVIDER [double precision, noL_0e] + BEGIN_DOC + ! + ! < Phi_left | L | Phi_right > + ! + END_DOC + implicit none integer :: i, j, k, ipoint double precision :: t0, t1 @@ -330,10 +336,6 @@ BEGIN_PROVIDER [double precision, noL_0e] double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:) double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:) - - call wall_time(t0) - print*, " Providing noL_0e ..." - if(elec_alpha_num .eq. 
elec_beta_num) then allocate(tmp(elec_beta_num)) @@ -708,10 +710,7 @@ BEGIN_PROVIDER [double precision, noL_0e] endif - call wall_time(t1) - print*, " Wall time for noL_0e (min) = ", (t1 - t0)/60.d0 - - print*, " noL_0e = ", noL_0e + print*, " noL_0e =", noL_0e END_PROVIDER diff --git a/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f b/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f index 51f0cba4..dc7642b0 100644 --- a/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f +++ b/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f @@ -1,350 +1,54 @@ ! --- -! TODO :: optimization : transform into a DGEMM - -BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, n_points_final_grid)] - - BEGIN_DOC - ! - ! mo_v_ki_bi_ortho_erf_rk_cst_mu(k,i,ip) = int dr chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1 )/(2|r - R_ip|) on the BI-ORTHO MO basis - ! - ! where phi_k(r) is a LEFT MOs and phi_i(r) is a RIGHT MO - ! - ! R_ip = the "ip"-th point of the DFT Grid - ! - END_DOC +BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)] implicit none - integer :: ipoint - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint) & - !$OMP SHARED (n_points_final_grid,v_ij_erf_rk_cst_mu,mo_v_ki_bi_ortho_erf_rk_cst_mu) - !$OMP DO SCHEDULE (dynamic) - do ipoint = 1, n_points_final_grid - call ao_to_mo_bi_ortho( v_ij_erf_rk_cst_mu (1,1,ipoint), size(v_ij_erf_rk_cst_mu, 1) & - , mo_v_ki_bi_ortho_erf_rk_cst_mu(1,1,ipoint), size(mo_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) - enddo - !$OMP END DO - !$OMP END PARALLEL - - mo_v_ki_bi_ortho_erf_rk_cst_mu = mo_v_ki_bi_ortho_erf_rk_cst_mu * 0.5d0 - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, mo_num, mo_num)] - - BEGIN_DOC - ! - ! int dr phi_i(r) phi_j(r) (erf(mu(R) |r - R|) - 1)/(2|r - R|) on the BI-ORTHO MO basis - ! - END_DOC - - implicit none - integer :: ipoint, i, j - - do i = 1, mo_num - do j = 1, mo_num - do ipoint = 1, n_points_final_grid - mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,j,i) = mo_v_ki_bi_ortho_erf_rk_cst_mu(j,i,ipoint) - enddo - enddo - enddo - - !FREE mo_v_ki_bi_ortho_erf_rk_cst_mu - -END_PROVIDER - -! --- - -! TODO :: optimization : transform into a DGEMM - -BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, 3, n_points_final_grid)] - - BEGIN_DOC - ! - ! mo_x_v_ki_bi_ortho_erf_rk_cst_mu(k,i,m,ip) = int dr x(m) * chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1)/2|r - R_ip| on the BI-ORTHO MO basis - ! - ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => x(m) = x, m=2 => x(m) = y, m=3 => x(m) = z, - ! - ! R_ip = the "ip"-th point of the DFT Grid - ! 
- END_DOC - - implicit none - integer :: ipoint - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint) & - !$OMP SHARED (n_points_final_grid,x_v_ij_erf_rk_cst_mu_transp,mo_x_v_ki_bi_ortho_erf_rk_cst_mu) - !$OMP DO SCHEDULE (dynamic) - do ipoint = 1, n_points_final_grid - - call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp (1,1,1,ipoint), size(x_v_ij_erf_rk_cst_mu_transp, 1) & - , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,1,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) - call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp (1,1,2,ipoint), size(x_v_ij_erf_rk_cst_mu_transp, 1) & - , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,2,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) - call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp (1,1,3,ipoint), size(x_v_ij_erf_rk_cst_mu_transp, 1) & - , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,3,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) - - enddo - !$OMP END DO - !$OMP END PARALLEL - - mo_x_v_ki_bi_ortho_erf_rk_cst_mu = 0.5d0 * mo_x_v_ki_bi_ortho_erf_rk_cst_mu - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3, n_points_final_grid)] - - implicit none - integer :: i, j, ipoint - double precision :: wall0, wall1 - - print *, ' providing int2_grad1_u12_ao_transp ...' - call wall_time(wall0) - - if(test_cycle_tc) then - - PROVIDE int2_grad1_u12_ao_test - - do ipoint = 1, n_points_final_grid - do i = 1, ao_num - do j = 1, ao_num - int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,1) - int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,2) - int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,3) - enddo - enddo - enddo - - FREE int2_grad1_u12_ao_test - - else - - PROVIDE int2_grad1_u12_ao - - do ipoint = 1, n_points_final_grid - do i = 1, ao_num - do j = 1, ao_num - int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(j,i,ipoint,1) - int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(j,i,ipoint,2) - int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(j,i,ipoint,3) - enddo - enddo - enddo - - endif - - call wall_time(wall1) - print *, ' wall time for int2_grad1_u12_ao_transp ', wall1 - wall0 - call print_memory_usage() - -END_PROVIDER - -! 
--- - -BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)] - - implicit none - integer :: ipoint - double precision :: wall0, wall1 + integer :: i, j, ipoint + double precision :: tt1, tt2 + double precision, allocatable :: tmp(:,:,:,:) PROVIDE mo_l_coef mo_r_coef - PROVIDE int2_grad1_u12_ao_transp + PROVIDE int2_grad1_u12_ao - !print *, ' providing int2_grad1_u12_bimo_transp' - !call wall_time(wall0) + call wall_time(tt1) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (ipoint) & - !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao_transp,int2_grad1_u12_bimo_transp) + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) !$OMP DO SCHEDULE (dynamic) - do ipoint = 1, n_points_final_grid - call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp (1,1,1,ipoint), size(int2_grad1_u12_ao_transp , 1) & - , int2_grad1_u12_bimo_transp(1,1,1,ipoint), size(int2_grad1_u12_bimo_transp, 1) ) - call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp (1,1,2,ipoint), size(int2_grad1_u12_ao_transp , 1) & - , int2_grad1_u12_bimo_transp(1,1,2,ipoint), size(int2_grad1_u12_bimo_transp, 1) ) - call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp (1,1,3,ipoint), size(int2_grad1_u12_ao_transp , 1) & - , int2_grad1_u12_bimo_transp(1,1,3,ipoint), size(int2_grad1_u12_bimo_transp, 1) ) - enddo + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo !$OMP END DO !$OMP END PARALLEL - !call wall_time(wall1) - !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0 - !call print_memory_usage() - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)] - - implicit none - integer :: i, j, ipoint - double precision :: wall0, wall1 - - !call wall_time(wall0) - !print *, ' Providing int2_grad1_u12_bimo_t ...' - - PROVIDE mo_l_coef mo_r_coef - PROVIDE int2_grad1_u12_bimo_transp - + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) do ipoint = 1, n_points_final_grid do i = 1, mo_num do j = 1, mo_num - int2_grad1_u12_bimo_t(ipoint,1,j,i) = int2_grad1_u12_bimo_transp(j,i,1,ipoint) - int2_grad1_u12_bimo_t(ipoint,2,j,i) = int2_grad1_u12_bimo_transp(j,i,2,ipoint) - int2_grad1_u12_bimo_t(ipoint,3,j,i) = int2_grad1_u12_bimo_transp(j,i,3,ipoint) + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) enddo enddo enddo + !$OMP END DO + !$OMP END PARALLEL - FREE int2_grad1_u12_bimo_transp + deallocate(tmp) - !call wall_time(wall1) - !print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0 - !call print_memory_usage() - -END_PROVIDER - -! 
--- - -BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3, ao_num, ao_num)] - - implicit none - integer :: i, j, ipoint - - PROVIDE int2_grad1_u12_ao - - do ipoint = 1, n_points_final_grid - do i = 1, ao_num - do j = 1, ao_num - int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(j,i,ipoint,1) - int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(j,i,ipoint,2) - int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(j,i,ipoint,3) - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, 3, mo_num, mo_num)] - - implicit none - integer :: i, j, ipoint - - do i = 1, mo_num - do j = 1, mo_num - do ipoint = 1, n_points_final_grid - mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,1,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,1,ipoint) - mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,2,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,2,ipoint) - mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,3,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,3,ipoint) - enddo - enddo - enddo -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk, (n_points_final_grid, 3, mo_num, mo_num)] - - BEGIN_DOC - ! - ! x_W_ki_bi_ortho_erf_rk(ip,m,k,i) = \int dr chi_k(r) \frac{(1 - erf(mu |r-R_ip|))}{2|r-R_ip|} (x(m)-R_ip(m)) phi_i(r) ON THE BI-ORTHO MO BASIS - ! - ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z, - ! - ! R_ip = the "ip"-th point of the DFT Grid - END_DOC - - implicit none - include 'constants.include.F' - - integer :: ipoint, m, i, k - double precision :: xyz - double precision :: wall0, wall1 - - print*, ' providing x_W_ki_bi_ortho_erf_rk ...' - call wall_time(wall0) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint,m,i,k,xyz) & - !$OMP SHARED (x_W_ki_bi_ortho_erf_rk,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) - !$OMP DO SCHEDULE (dynamic) - do i = 1, mo_num - do k = 1, mo_num - do m = 1, 3 - do ipoint = 1, n_points_final_grid - xyz = final_grid_points(m,ipoint) - x_W_ki_bi_ortho_erf_rk(ipoint,m,k,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,k,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,k,i) - enddo - enddo - enddo - enddo - - !$OMP END DO - !$OMP END PARALLEL - - ! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu_transp - ! FREE mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp - - call wall_time(wall1) - print *, ' time to provide x_W_ki_bi_ortho_erf_rk = ', wall1 - wall0 - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk_diag, (n_points_final_grid, 3, mo_num)] - BEGIN_DOC - ! x_W_ki_bi_ortho_erf_rk_diag(ip,m,i) = \int dr chi_i(r) (1 - erf(mu |r-R_ip|)) (x(m)-X(m)_ip) phi_i(r) ON THE BI-ORTHO MO BASIS -! -! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z, -! -! R_ip = the "ip"-th point of the DFT Grid - END_DOC - - implicit none - include 'constants.include.F' - - integer :: ipoint, m, i - double precision :: xyz - double precision :: wall0, wall1 - - print*,'providing x_W_ki_bi_ortho_erf_rk_diag ...' 
- call wall_time(wall0) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint,m,i,xyz) & - !$OMP SHARED (x_W_ki_bi_ortho_erf_rk_diag,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) - !$OMP DO SCHEDULE (dynamic) - do i = 1, mo_num - do m = 1, 3 - do ipoint = 1, n_points_final_grid - xyz = final_grid_points(m,ipoint) - x_W_ki_bi_ortho_erf_rk_diag(ipoint,m,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,i,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,i,i) - enddo - enddo - enddo - - !$OMP END DO - !$OMP END PARALLEL - - call wall_time(wall1) - print*,'time to provide x_W_ki_bi_ortho_erf_rk_diag = ',wall1 - wall0 + call wall_time(tt2) + write(*,"(A,2X,F15.7)") ' wall time for int2_grad1_u12_bimo_t (sec) = ', (tt2 - tt1) END_PROVIDER diff --git a/plugins/local/bi_ort_ints/semi_num_ints_mo_old.irp.f b/plugins/local/bi_ort_ints/semi_num_ints_mo_old.irp.f new file mode 100644 index 00000000..c2b9ad6d --- /dev/null +++ b/plugins/local/bi_ort_ints/semi_num_ints_mo_old.irp.f @@ -0,0 +1,362 @@ + +! --- + +! TODO :: optimization : transform into a DGEMM + +BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, n_points_final_grid)] + + BEGIN_DOC + ! + ! mo_v_ki_bi_ortho_erf_rk_cst_mu(k,i,ip) = int dr chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1 )/(2|r - R_ip|) on the BI-ORTHO MO basis + ! + ! where phi_k(r) is a LEFT MOs and phi_i(r) is a RIGHT MO + ! + ! R_ip = the "ip"-th point of the DFT Grid + ! + END_DOC + + implicit none + integer :: ipoint + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid,v_ij_erf_rk_cst_mu,mo_v_ki_bi_ortho_erf_rk_cst_mu) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho( v_ij_erf_rk_cst_mu (1,1,ipoint), size(v_ij_erf_rk_cst_mu, 1) & + , mo_v_ki_bi_ortho_erf_rk_cst_mu(1,1,ipoint), size(mo_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) + enddo + !$OMP END DO + !$OMP END PARALLEL + + mo_v_ki_bi_ortho_erf_rk_cst_mu = mo_v_ki_bi_ortho_erf_rk_cst_mu * 0.5d0 + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, mo_num, mo_num)] + + BEGIN_DOC + ! + ! int dr phi_i(r) phi_j(r) (erf(mu(R) |r - R|) - 1)/(2|r - R|) on the BI-ORTHO MO basis + ! + END_DOC + + implicit none + integer :: ipoint, i, j + + do i = 1, mo_num + do j = 1, mo_num + do ipoint = 1, n_points_final_grid + mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,j,i) = mo_v_ki_bi_ortho_erf_rk_cst_mu(j,i,ipoint) + enddo + enddo + enddo + + !FREE mo_v_ki_bi_ortho_erf_rk_cst_mu + +END_PROVIDER + +! --- + +! TODO :: optimization : transform into a DGEMM + +BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, 3, n_points_final_grid)] + + BEGIN_DOC + ! + ! mo_x_v_ki_bi_ortho_erf_rk_cst_mu(k,i,m,ip) = int dr x(m) * chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1)/2|r - R_ip| on the BI-ORTHO MO basis + ! + ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => x(m) = x, m=2 => x(m) = y, m=3 => x(m) = z, + ! + ! R_ip = the "ip"-th point of the DFT Grid + ! 
+ END_DOC + + implicit none + integer :: ipoint + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid,x_v_ij_erf_rk_cst_mu_transp,mo_x_v_ki_bi_ortho_erf_rk_cst_mu) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + + call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp (1,1,1,ipoint), size(x_v_ij_erf_rk_cst_mu_transp, 1) & + , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,1,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) + call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp (1,1,2,ipoint), size(x_v_ij_erf_rk_cst_mu_transp, 1) & + , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,2,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) + call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp (1,1,3,ipoint), size(x_v_ij_erf_rk_cst_mu_transp, 1) & + , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,3,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) ) + + enddo + !$OMP END DO + !$OMP END PARALLEL + + mo_x_v_ki_bi_ortho_erf_rk_cst_mu = 0.5d0 * mo_x_v_ki_bi_ortho_erf_rk_cst_mu + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3, n_points_final_grid)] + + implicit none + integer :: i, j, ipoint + double precision :: wall0, wall1 + + !print *, ' providing int2_grad1_u12_ao_transp ...' + !call wall_time(wall0) + + if(test_cycle_tc) then + + PROVIDE int2_grad1_u12_ao_test + + do ipoint = 1, n_points_final_grid + do i = 1, ao_num + do j = 1, ao_num + int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,1) + int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,2) + int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,3) + enddo + enddo + enddo + + FREE int2_grad1_u12_ao_test + + else + + PROVIDE int2_grad1_u12_ao + + do ipoint = 1, n_points_final_grid + do i = 1, ao_num + do j = 1, ao_num + int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(j,i,ipoint,1) + int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(j,i,ipoint,2) + int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(j,i,ipoint,3) + enddo + enddo + enddo + + endif + + !call wall_time(wall1) + !print *, ' wall time for int2_grad1_u12_ao_transp (min) = ', (wall1 - wall0) / 60.d0 + !call print_memory_usage() + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)] + + implicit none + integer :: ipoint + double precision :: wall0, wall1 + + PROVIDE mo_l_coef mo_r_coef + PROVIDE int2_grad1_u12_ao_transp + + !print *, ' providing int2_grad1_u12_bimo_transp ...' 
+ !call wall_time(wall0) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao_transp,int2_grad1_u12_bimo_transp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp (1,1,1,ipoint), size(int2_grad1_u12_ao_transp , 1) & + , int2_grad1_u12_bimo_transp(1,1,1,ipoint), size(int2_grad1_u12_bimo_transp, 1) ) + call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp (1,1,2,ipoint), size(int2_grad1_u12_ao_transp , 1) & + , int2_grad1_u12_bimo_transp(1,1,2,ipoint), size(int2_grad1_u12_bimo_transp, 1) ) + call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp (1,1,3,ipoint), size(int2_grad1_u12_ao_transp , 1) & + , int2_grad1_u12_bimo_transp(1,1,3,ipoint), size(int2_grad1_u12_bimo_transp, 1) ) + enddo + !$OMP END DO + !$OMP END PARALLEL + + !FREE int2_grad1_u12_ao_transp + + !call wall_time(wall1) + !print *, ' wall time for int2_grad1_u12_bimo_transp (min) =', (wall1 - wall0) / 60.d0 + !call print_memory_usage() + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_t_old, (n_points_final_grid, 3, mo_num, mo_num)] + + implicit none + integer :: i, j, ipoint + double precision :: wall0, wall1 + + !call wall_time(wall0) + !print *, ' providing int2_grad1_u12_bimo_t_old ...' + + PROVIDE mo_l_coef mo_r_coef + PROVIDE int2_grad1_u12_bimo_transp + + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t_old(ipoint,1,j,i) = int2_grad1_u12_bimo_transp(j,i,1,ipoint) + int2_grad1_u12_bimo_t_old(ipoint,2,j,i) = int2_grad1_u12_bimo_transp(j,i,2,ipoint) + int2_grad1_u12_bimo_t_old(ipoint,3,j,i) = int2_grad1_u12_bimo_transp(j,i,3,ipoint) + enddo + enddo + enddo + + FREE int2_grad1_u12_bimo_transp + + !call wall_time(wall1) + !print *, ' wall time for int2_grad1_u12_bimo_t_old (min) =', (wall1 - wall0) / 60.d0 + !call print_memory_usage() + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3, ao_num, ao_num)] + + implicit none + integer :: i, j, ipoint + double precision :: wall0, wall1 + + !call wall_time(wall0) + !print *, ' providing int2_grad1_u12_ao_t ...' + + PROVIDE int2_grad1_u12_ao + + do ipoint = 1, n_points_final_grid + do i = 1, ao_num + do j = 1, ao_num + int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(j,i,ipoint,1) + int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(j,i,ipoint,2) + int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(j,i,ipoint,3) + enddo + enddo + enddo + + !call wall_time(wall1) + !print *, ' wall time for int2_grad1_u12_ao_t (min) =', (wall1 - wall0) / 60.d0 + !call print_memory_usage() + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, 3, mo_num, mo_num)] + + implicit none + integer :: i, j, ipoint + + do i = 1, mo_num + do j = 1, mo_num + do ipoint = 1, n_points_final_grid + mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,1,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,1,ipoint) + mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,2,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,2,ipoint) + mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,3,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,3,ipoint) + enddo + enddo + enddo +END_PROVIDER + +! --- + +BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk, (n_points_final_grid, 3, mo_num, mo_num)] + + BEGIN_DOC + ! + ! 
x_W_ki_bi_ortho_erf_rk(ip,m,k,i) = \int dr chi_k(r) \frac{(1 - erf(mu |r-R_ip|))}{2|r-R_ip|} (x(m)-R_ip(m)) phi_i(r) ON THE BI-ORTHO MO BASIS + ! + ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z, + ! + ! R_ip = the "ip"-th point of the DFT Grid + END_DOC + + implicit none + include 'constants.include.F' + + integer :: ipoint, m, i, k + double precision :: xyz + double precision :: wall0, wall1 + + !print*, ' providing x_W_ki_bi_ortho_erf_rk ...' + !call wall_time(wall0) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,m,i,k,xyz) & + !$OMP SHARED (x_W_ki_bi_ortho_erf_rk,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) + !$OMP DO SCHEDULE (dynamic) + do i = 1, mo_num + do k = 1, mo_num + do m = 1, 3 + do ipoint = 1, n_points_final_grid + xyz = final_grid_points(m,ipoint) + x_W_ki_bi_ortho_erf_rk(ipoint,m,k,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,k,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,k,i) + enddo + enddo + enddo + enddo + + !$OMP END DO + !$OMP END PARALLEL + + ! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu_transp + ! FREE mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp + + !call wall_time(wall1) + !print *, ' time to provide x_W_ki_bi_ortho_erf_rk = ', wall1 - wall0 + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk_diag, (n_points_final_grid, 3, mo_num)] + BEGIN_DOC + ! x_W_ki_bi_ortho_erf_rk_diag(ip,m,i) = \int dr chi_i(r) (1 - erf(mu |r-R_ip|)) (x(m)-X(m)_ip) phi_i(r) ON THE BI-ORTHO MO BASIS +! +! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z, +! +! R_ip = the "ip"-th point of the DFT Grid + END_DOC + + implicit none + include 'constants.include.F' + + integer :: ipoint, m, i + double precision :: xyz + double precision :: wall0, wall1 + + !print*,'providing x_W_ki_bi_ortho_erf_rk_diag ...' + !call wall_time(wall0) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,m,i,xyz) & + !$OMP SHARED (x_W_ki_bi_ortho_erf_rk_diag,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) + !$OMP DO SCHEDULE (dynamic) + do i = 1, mo_num + do m = 1, 3 + do ipoint = 1, n_points_final_grid + xyz = final_grid_points(m,ipoint) + x_W_ki_bi_ortho_erf_rk_diag(ipoint,m,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,i,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,i,i) + enddo + enddo + enddo + + !$OMP END DO + !$OMP END PARALLEL + + !call wall_time(wall1) + !print*,'time to provide x_W_ki_bi_ortho_erf_rk_diag = ',wall1 - wall0 + +END_PROVIDER + +! --- + diff --git a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f index 726e48ba..73e5a611 100644 --- a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f +++ b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f @@ -123,7 +123,7 @@ subroutine give_integrals_3_body_bi_ort_spin( n, sigma_n, l, sigma_l, k, sigma_k endif return -end subroutine give_integrals_3_body_bi_ort_spin +end ! --- @@ -168,7 +168,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral) integral = integral + tmp * final_weight_at_r_vector(ipoint) enddo -end subroutine give_integrals_3_body_bi_ort +end ! 
--- diff --git a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f index 5e6a24e9..e27fdb7f 100644 --- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f +++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f @@ -16,10 +16,10 @@ double precision function bi_ortho_mo_ints(l, k, j, i) integer :: m, n, p, q bi_ortho_mo_ints = 0.d0 - do m = 1, ao_num - do p = 1, ao_num - do n = 1, ao_num - do q = 1, ao_num + do p = 1, ao_num + do m = 1, ao_num + do q = 1, ao_num + do n = 1, ao_num ! p1h1p2h2 l1 l2 r1 r2 bi_ortho_mo_ints += ao_two_e_tc_tot(n,q,m,p) * mo_l_coef(m,l) * mo_l_coef(n,k) * mo_r_coef(p,j) * mo_r_coef(q,i) enddo @@ -27,7 +27,7 @@ double precision function bi_ortho_mo_ints(l, k, j, i) enddo enddo -end function bi_ortho_mo_ints +end ! --- @@ -40,38 +40,106 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num, END_DOC implicit none - integer :: i, j, k, l, m, n, p, q + integer :: i, j, k, l, m, n, p, q, s, r + double precision :: t1, t2, tt1, tt2 double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:) + double precision, allocatable :: a_jkp(:,:,:), a_kpq(:,:,:), ao_two_e_tc_tot_tmp(:,:,:) + + print *, ' PROVIDING mo_bi_ortho_tc_two_e_chemist ...' + call wall_time(t1) + call print_memory_usage() PROVIDE mo_r_coef mo_l_coef - allocate(a2(ao_num,ao_num,ao_num,mo_num)) + if(ao_to_mo_tc_n3) then - call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0 & - , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num & - , 0.d0 , a2(1,1,1,1), ao_num*ao_num*ao_num) + print*, ' memory scale of TC ao -> mo: O(N3) ' - allocate(a1(ao_num,ao_num,mo_num,mo_num)) + if(.not.read_tc_integ) then + stop 'read_tc_integ needs to be set to true' + endif - call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 & - , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num & - , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num) + allocate(a_jkp(ao_num,ao_num,mo_num)) + allocate(a_kpq(ao_num,mo_num,mo_num)) + allocate(ao_two_e_tc_tot_tmp(ao_num,ao_num,ao_num)) - deallocate(a2) - allocate(a2(ao_num,mo_num,mo_num,mo_num)) + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read") - call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 & - , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num & - , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num) + call wall_time(tt1) - deallocate(a1) + mo_bi_ortho_tc_two_e_chemist(:,:,:,:) = 0.d0 + do l = 1, ao_num + read(11) ao_two_e_tc_tot_tmp(:,:,:) - call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 & - , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num & - , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num) + do s = 1, mo_num - deallocate(a2) + call dgemm( 'T', 'N', ao_num*ao_num, mo_num, ao_num, 1.d0 & + , ao_two_e_tc_tot_tmp(1,1,1), ao_num, mo_l_coef(1,1), ao_num & + , 0.d0, a_jkp(1,1,1), ao_num*ao_num) + call dgemm( 'T', 'N', ao_num*mo_num, mo_num, ao_num, 1.d0 & + , a_jkp(1,1,1), ao_num, mo_r_coef(1,1), ao_num & + , 0.d0, a_kpq(1,1,1), ao_num*mo_num) + + call dgemm( 'T', 'N', mo_num*mo_num, mo_num, ao_num, mo_r_coef(l,s) & + , a_kpq(1,1,1), ao_num, mo_l_coef(1,1), ao_num & + , 1.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,s), mo_num*mo_num) + + enddo ! 
s + + if(l == 2) then + call wall_time(tt2) + print*, ' 1 / mo_num done in (min)', (tt2-tt1)/60.d0 + print*, ' estimated time required (min)', dble(mo_num-1)*(tt2-tt1)/60.d0 + elseif(l == 11) then + call wall_time(tt2) + print*, ' 10 / mo_num done in (min)', (tt2-tt1)/60.d0 + print*, ' estimated time required (min)', dble(mo_num-10)*(tt2-tt1)/(60.d0*10.d0) + elseif(l == 101) then + call wall_time(tt2) + print*, ' 100 / mo_num done in (min)', (tt2-tt1)/60.d0 + print*, ' estimated time required (min)', dble(mo_num-100)*(tt2-tt1)/(60.d0*100.d0) + endif + enddo ! l + + close(11) + + deallocate(a_jkp, a_kpq, ao_two_e_tc_tot_tmp) + + else + + print*, ' memory scale of TC ao -> mo: O(N4) ' + + allocate(a2(ao_num,ao_num,ao_num,mo_num)) + + call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0 & + , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num & + , 0.d0, a2(1,1,1,1), ao_num*ao_num*ao_num) + + FREE ao_two_e_tc_tot + + allocate(a1(ao_num,ao_num,mo_num,mo_num)) + + call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 & + , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num & + , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num) + + deallocate(a2) + allocate(a2(ao_num,mo_num,mo_num,mo_num)) + + call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 & + , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num & + , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num) + + deallocate(a1) + + call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 & + , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num & + , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num) + + deallocate(a2) + + endif !allocate(a1(mo_num,ao_num,ao_num,ao_num)) !a1 = 0.d0 @@ -135,6 +203,10 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num, !enddo !deallocate(a1) + call wall_time(t2) + print *, ' WALL TIME for PROVIDING mo_bi_ortho_tc_two_e_chemist (min)', (t2-t1)/60.d0 + call print_memory_usage() + END_PROVIDER ! --- @@ -176,6 +248,34 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e, (mo_num, mo_num, mo_num, END_PROVIDER +BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_transp, (mo_num, mo_num, mo_num, mo_num)] + implicit none + BEGIN_DOC + ! + ! mo_bi_ortho_tc_two_e_transp(i,j,k,l) = = transpose of mo_bi_ortho_tc_two_e + ! + ! the potential V(r_12) contains ALL TWO-E CONTRIBUTION OF THE TC-HAMILTONIAN + ! + END_DOC + + integer :: i,j,k,l + print*,'Providing mo_bi_ortho_tc_two_e_transp' + double precision :: t0,t1 + call wall_time(t0) + do i = 1, mo_num + do j = 1, mo_num + do k = 1, mo_num + do l = 1, mo_num + mo_bi_ortho_tc_two_e_transp(i,j,k,l) = mo_bi_ortho_tc_two_e(k,l,i,j) + enddo + enddo + enddo + enddo + call wall_time(t1) + + print *, ' WALL TIME for PROVIDING mo_bi_ortho_tc_two_e_transp (min)', (t1-t0)/60.d0 + +END_PROVIDER ! --- BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj, (mo_num,mo_num)] @@ -232,3 +332,23 @@ END_PROVIDER ! --- + BEGIN_PROVIDER [double precision, tc_2e_3idx_coulomb_integrals_transp , (mo_num,mo_num,mo_num)] +&BEGIN_PROVIDER [double precision, tc_2e_3idx_exchange_integrals_transp, (mo_num,mo_num,mo_num)] + + BEGIN_DOC + ! tc_2e_3idx_coulomb_integrals_transp (j,k,i) = + ! 
tc_2e_3idx_exchange_integrals_transp(j,k,i) = + END_DOC + implicit none + integer :: i, j, k + + do i = 1, mo_num + do k = 1, mo_num + do j = 1, mo_num + tc_2e_3idx_coulomb_integrals_transp(j, k,i) = mo_bi_ortho_tc_two_e_transp(j ,k ,j ,i ) + tc_2e_3idx_exchange_integrals_transp(j,k,i) = mo_bi_ortho_tc_two_e_transp(k ,j ,j ,i ) + enddo + enddo + enddo + +END_PROVIDER diff --git a/plugins/local/bi_ortho_mos/bi_ort_mos_in_r.irp.f b/plugins/local/bi_ortho_mos/bi_ort_mos_in_r.irp.f index 25572854..b1c2dc87 100644 --- a/plugins/local/bi_ortho_mos/bi_ort_mos_in_r.irp.f +++ b/plugins/local/bi_ortho_mos/bi_ort_mos_in_r.irp.f @@ -1,135 +1,70 @@ -! TODO: left & right MO without duplicate AO calculation - -! --- - -BEGIN_PROVIDER[double precision, mos_r_in_r_array, (mo_num, n_points_final_grid)] + BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid, mo_num)] +&BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid, mo_num)] BEGIN_DOC - ! mos_in_r_array(i,j) = value of the ith RIGHT mo on the jth grid point + ! + ! mos_l_in_r_array_transp(i,j) = value of the jth left-mo on the ith grid point + ! mos_r_in_r_array_transp(i,j) = value of the jth right-mo on the ith grid point + ! END_DOC implicit none - integer :: i, j - double precision :: mos_array(mo_num), r(3) - !$OMP PARALLEL DO & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, j, r, mos_array) & - !$OMP SHARED (mos_r_in_r_array, n_points_final_grid, mo_num, final_grid_points) + integer :: i + double precision :: tt0, tt1, tt2, tt3 + double precision :: r(3) + double precision, allocatable :: aos_r(:,:) + + call wall_time(tt0) + + allocate(aos_r(ao_num,n_points_final_grid)) + + ! provide everything required before OpenMP + r(1) = final_grid_points(1,1) + r(2) = final_grid_points(2,1) + r(3) = final_grid_points(3,1) + call give_all_aos_at_r(r, aos_r(1,1)) + + + call wall_time(tt2) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, r) & + !$OMP SHARED(n_points_final_grid, final_grid_points, aos_r) + !$OMP DO do i = 1, n_points_final_grid r(1) = final_grid_points(1,i) r(2) = final_grid_points(2,i) r(3) = final_grid_points(3,i) - call give_all_mos_r_at_r(r, mos_array) - do j = 1, mo_num - mos_r_in_r_array(j,i) = mos_array(j) - enddo + call give_all_aos_at_r(r, aos_r(1,i)) enddo - !$OMP END PARALLEL DO - -END_PROVIDER + !$OMP END DO + !$OMP END PARALLEL -! --- + call wall_time(tt3) + write(*,"(A,2X,F15.7)") ' wall time for AOs on r (sec) = ', (tt3 - tt2) -BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid, mo_num)] - BEGIN_DOC - ! mos_r_in_r_array_transp(i,j) = value of the jth mo on the ith grid point - END_DOC + call dgemm("T", "N", n_points_final_grid, mo_num, ao_num, & + 1.d0, & + aos_r(1,1), ao_num, & + mo_l_coef(1,1), ao_num, & + 0.d0, & + mos_l_in_r_array_transp(1,1), n_points_final_grid) - implicit none - integer :: i,j + call dgemm("T", "N", n_points_final_grid, mo_num, ao_num, & + 1.d0, & + aos_r(1,1), ao_num, & + mo_r_coef(1,1), ao_num, & + 0.d0, & + mos_r_in_r_array_transp(1,1), n_points_final_grid) - do i = 1, n_points_final_grid - do j = 1, mo_num - mos_r_in_r_array_transp(i,j) = mos_r_in_r_array(j,i) - enddo - enddo - -END_PROVIDER - -! --- - -subroutine give_all_mos_r_at_r(r, mos_r_array) - - BEGIN_DOC - ! 
mos_r_array(i) = ith RIGHT MO function evaluated at "r" - END_DOC - - implicit none - double precision, intent(in) :: r(3) - double precision, intent(out) :: mos_r_array(mo_num) - double precision :: aos_array(ao_num) - - call give_all_aos_at_r(r, aos_array) - call dgemv('N', mo_num, ao_num, 1.d0, mo_r_coef_transp, mo_num, aos_array, 1, 0.d0, mos_r_array, 1) - -end subroutine give_all_mos_r_at_r - -! --- - -BEGIN_PROVIDER[double precision, mos_l_in_r_array, (mo_num, n_points_final_grid)] - - BEGIN_DOC - ! mos_in_r_array(i,j) = value of the ith LEFT mo on the jth grid point - END_DOC - - implicit none - integer :: i, j - double precision :: mos_array(mo_num), r(3) - - !$OMP PARALLEL DO & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i,r,mos_array,j) & - !$OMP SHARED(mos_l_in_r_array,n_points_final_grid,mo_num,final_grid_points) - do i = 1, n_points_final_grid - r(1) = final_grid_points(1,i) - r(2) = final_grid_points(2,i) - r(3) = final_grid_points(3,i) - call give_all_mos_l_at_r(r, mos_array) - do j = 1, mo_num - mos_l_in_r_array(j,i) = mos_array(j) - enddo - enddo - !$OMP END PARALLEL DO - -END_PROVIDER - -! --- - -subroutine give_all_mos_l_at_r(r, mos_l_array) - - BEGIN_DOC - ! mos_l_array(i) = ith LEFT MO function evaluated at "r" - END_DOC - - implicit none - double precision, intent(in) :: r(3) - double precision, intent(out) :: mos_l_array(mo_num) - double precision :: aos_array(ao_num) - - call give_all_aos_at_r(r, aos_array) - call dgemv('N', mo_num, ao_num, 1.d0, mo_l_coef_transp, mo_num, aos_array, 1, 0.d0, mos_l_array, 1) - -end subroutine give_all_mos_l_at_r - -! --- - -BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid,mo_num)] - - BEGIN_DOC - ! mos_l_in_r_array_transp(i,j) = value of the jth mo on the ith grid point - END_DOC - - implicit none - integer :: i, j - - do i = 1, n_points_final_grid - do j = 1, mo_num - mos_l_in_r_array_transp(i,j) = mos_l_in_r_array(j,i) - enddo - enddo + deallocate(aos_r) + + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for mos_l_in_r_array_transp & mos_r_in_r_array_transp (sec) = ', (tt1 - tt0) END_PROVIDER diff --git a/plugins/local/bi_ortho_mos/bi_ort_mos_in_r_old.irp.f b/plugins/local/bi_ortho_mos/bi_ort_mos_in_r_old.irp.f new file mode 100644 index 00000000..9fd671f8 --- /dev/null +++ b/plugins/local/bi_ortho_mos/bi_ort_mos_in_r_old.irp.f @@ -0,0 +1,137 @@ + +! TODO: left & right MO without duplicate AO calculation + +! --- + +BEGIN_PROVIDER[double precision, mos_r_in_r_array, (mo_num, n_points_final_grid)] + + BEGIN_DOC + ! mos_in_r_array(i,j) = value of the ith RIGHT mo on the jth grid point + END_DOC + + implicit none + integer :: i, j + double precision :: mos_array(mo_num), r(3) + + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, r, mos_array) & + !$OMP SHARED (mos_r_in_r_array, n_points_final_grid, mo_num, final_grid_points) + do i = 1, n_points_final_grid + r(1) = final_grid_points(1,i) + r(2) = final_grid_points(2,i) + r(3) = final_grid_points(3,i) + call give_all_mos_r_at_r(r, mos_array) + do j = 1, mo_num + mos_r_in_r_array(j,i) = mos_array(j) + enddo + enddo + !$OMP END PARALLEL DO + +END_PROVIDER + +! --- + +BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp_old, (n_points_final_grid, mo_num)] + + BEGIN_DOC + ! 
mos_r_in_r_array_transp_old(i,j) = value of the jth mo on the ith grid point + END_DOC + + implicit none + integer :: i,j + + do i = 1, n_points_final_grid + do j = 1, mo_num + mos_r_in_r_array_transp_old(i,j) = mos_r_in_r_array(j,i) + enddo + enddo + +END_PROVIDER + +! --- + +subroutine give_all_mos_r_at_r(r, mos_r_array) + + BEGIN_DOC + ! mos_r_array(i) = ith RIGHT MO function evaluated at "r" + END_DOC + + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: mos_r_array(mo_num) + double precision :: aos_array(ao_num) + + call give_all_aos_at_r(r, aos_array) + call dgemv('N', mo_num, ao_num, 1.d0, mo_r_coef_transp, mo_num, aos_array, 1, 0.d0, mos_r_array, 1) + +end subroutine give_all_mos_r_at_r + +! --- + +BEGIN_PROVIDER[double precision, mos_l_in_r_array, (mo_num, n_points_final_grid)] + + BEGIN_DOC + ! mos_in_r_array(i,j) = value of the ith LEFT mo on the jth grid point + END_DOC + + implicit none + integer :: i, j + double precision :: mos_array(mo_num), r(3) + + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i,r,mos_array,j) & + !$OMP SHARED(mos_l_in_r_array,n_points_final_grid,mo_num,final_grid_points) + do i = 1, n_points_final_grid + r(1) = final_grid_points(1,i) + r(2) = final_grid_points(2,i) + r(3) = final_grid_points(3,i) + call give_all_mos_l_at_r(r, mos_array) + do j = 1, mo_num + mos_l_in_r_array(j,i) = mos_array(j) + enddo + enddo + !$OMP END PARALLEL DO + +END_PROVIDER + +! --- + +subroutine give_all_mos_l_at_r(r, mos_l_array) + + BEGIN_DOC + ! mos_l_array(i) = ith LEFT MO function evaluated at "r" + END_DOC + + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: mos_l_array(mo_num) + double precision :: aos_array(ao_num) + + call give_all_aos_at_r(r, aos_array) + call dgemv('N', mo_num, ao_num, 1.d0, mo_l_coef_transp, mo_num, aos_array, 1, 0.d0, mos_l_array, 1) + +end subroutine give_all_mos_l_at_r + +! --- + +BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp_old, (n_points_final_grid,mo_num)] + + BEGIN_DOC + ! mos_l_in_r_array_transp_old(i,j) = value of the jth mo on the ith grid point + END_DOC + + implicit none + integer :: i, j + + do i = 1, n_points_final_grid + do j = 1, mo_num + mos_l_in_r_array_transp_old(i,j) = mos_l_in_r_array(j,i) + enddo + enddo + +END_PROVIDER + +! --- + diff --git a/plugins/local/bi_ortho_mos/overlap.irp.f b/plugins/local/bi_ortho_mos/overlap.irp.f index ff5d5c84..7f07929f 100644 --- a/plugins/local/bi_ortho_mos/overlap.irp.f +++ b/plugins/local/bi_ortho_mos/overlap.irp.f @@ -56,10 +56,10 @@ print*,'Average trace of overlap_bi_ortho is different from 1 by ', dabs(accu_d-1.d0) print*,'And bi orthogonality is off by an average of ',accu_nd print*,'****************' - print*,'Overlap matrix betwee mo_l_coef and mo_r_coef ' - do i = 1, mo_num - write(*,'(100(F16.10,X))')overlap_bi_ortho(i,:) - enddo + !print*,'Overlap matrix betwee mo_l_coef and mo_r_coef ' + !do i = 1, mo_num + ! 
write(*,'(100(F16.10,X))')overlap_bi_ortho(i,:) + !enddo endif print*,'Average trace of overlap_bi_ortho (should be 1.)' print*,'accu_d = ',accu_d diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f new file mode 100644 index 00000000..f149e7c6 --- /dev/null +++ b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f @@ -0,0 +1,108 @@ +subroutine get_d0_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs) + !todo: indices/conjg should be okay for complex + use bitmasks + implicit none + + integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2) + integer(bit_kind), intent(in) :: phasemask(N_int,2) + logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2) + integer(bit_kind) :: det(N_int, 2) + double precision, intent(in) :: coefs(N_states,2) + double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num) + double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num) + integer, intent(in) :: h(0:2,2), p(0:4,2), sp + + integer :: i, j, k, s, h1, h2, p1, p2, puti, putj, mm + double precision :: phase + double precision :: hij,hji + double precision, external :: get_phase_bi + logical :: ok + + integer, parameter :: bant=1 + double precision, allocatable :: hij_cache1(:), hij_cache2(:) + allocate (hij_cache1(mo_num),hij_cache2(mo_num)) + double precision, allocatable :: hji_cache1(:), hji_cache2(:) + allocate (hji_cache1(mo_num),hji_cache2(mo_num)) +! print*,'in get_d0_new' +! call debug_det(gen,N_int) +! print*,'coefs',coefs(1,:) + + if(sp == 3) then ! AB + h1 = p(1,1) + h2 = p(1,2) + do p1=1, mo_num + if(bannedOrb(p1, 1)) cycle +! call get_mo_two_e_integrals_complex(p1,h2,h1,mo_num,hij_cache1,mo_integrals_map) + do mm = 1, mo_num + hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,p1,h2,h1) + hji_cache1(mm) = mo_bi_ortho_tc_two_e_transp(mm,p1,h2,h1) + enddo + !!!!!!!!!! + do p2=1, mo_num + if(bannedOrb(p2,2)) cycle + if(banned(p1, p2, bant)) cycle ! rentable? + if(p1 == h1 .or. p2 == h2) then + call apply_particles(mask, 1,p1,2,p2, det, ok, N_int) + ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this +! call i_h_j_complex(det, gen, N_int, hij) + call htilde_mu_mat_opt_bi_ortho_no_3e_both(det,gen,N_int, hij,hji) + else + phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) + hij = hij_cache1(p2) * phase + hji = hji_cache1(p2) * phase + end if + if (hij == 0.d0.or.hji == 0.d0) cycle + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij ! HOTSPOT + mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji ! HOTSPOT + enddo + end do + end do + + else ! AA BB + p1 = p(1,sp) + p2 = p(2,sp) + do puti=1, mo_num + if(bannedOrb(puti, sp)) cycle +! call get_mo_two_e_integrals_complex(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map,mo_integrals_map_2) +! call get_mo_two_e_integrals_complex(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map,mo_integrals_map_2) + do mm = 1, mo_num + hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,puti,p2,p1) + hij_cache2(mm) = mo_bi_ortho_tc_two_e(mm,puti,p1,p2) + hji_cache1(mm) = mo_bi_ortho_tc_two_e_transp(mm,puti,p2,p1) + hji_cache2(mm) = mo_bi_ortho_tc_two_e_transp(mm,puti,p1,p2) + enddo + !!!!!!!!!! + do putj=puti+1, mo_num + if(bannedOrb(putj, sp)) cycle + if(banned(puti, putj, bant)) cycle ! rentable? + if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then + call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int) + !call i_h_j_complex(gen, det, N_int, hij) ! 
need to take conjugate of this +! call i_h_j_complex(det, gen, N_int, hij) + call htilde_mu_mat_opt_bi_ortho_no_3e_both(det,gen,N_int, hij,hji) + if (hij == 0.d0.or.hji == 0.d0) cycle + else +! hij = (mo_two_e_integral_complex(p1, p2, puti, putj) - mo_two_e_integral_complex(p2, p1, puti, putj)) +! hij = (mo_bi_ortho_tc_two_e(p1, p2, puti, putj) - mo_bi_ortho_tc_two_e(p2, p1, puti, putj)) + hij = (mo_bi_ortho_tc_two_e(puti, putj, p1, p2) - mo_bi_ortho_tc_two_e(puti, putj, p2, p1)) + hji = (mo_bi_ortho_tc_two_e_transp(puti, putj, p1, p2) - mo_bi_ortho_tc_two_e_transp(puti, putj, p2, p1)) + if (hij == 0.d0.or.hji == 0.d0) cycle + phase = get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int) + hij = (hij) * phase + hji = (hji) * phase + end if + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij + mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji + enddo + end do + + end do + end if + + deallocate(hij_cache1,hij_cache2) +end + diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f new file mode 100644 index 00000000..a3d7b076 --- /dev/null +++ b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f @@ -0,0 +1,358 @@ +subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs) + !todo: indices should be okay for complex? + use bitmasks + implicit none + + integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2) + integer(bit_kind), intent(in) :: phasemask(N_int,2) + logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2) + integer(bit_kind) :: det(N_int, 2) + double precision, intent(in) :: coefs(N_states,2) + double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num) + double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num) + integer, intent(in) :: h(0:2,2), p(0:4,2), sp + double precision, external :: get_phase_bi + double precision, external :: mo_two_e_integral_complex + logical :: ok + + logical, allocatable :: lbanned(:,:) + integer :: puti, putj, ma, mi, s1, s2, i, i1, i2, j, istate + integer :: hfix, pfix, h1, h2, p1, p2, ib, k, l, mm + + integer, parameter :: turn2(2) = (/2,1/) + integer, parameter :: turn3(2,3) = reshape((/2,3, 1,3, 1,2/), (/2,3/)) + + integer :: bant + double precision, allocatable :: hij_cache(:,:) + double precision :: hij, tmp_rowij(N_states, mo_num), tmp_rowij2(N_states, mo_num),phase + double precision, allocatable :: hji_cache(:,:) + double precision :: hji, tmp_rowji(N_states, mo_num), tmp_rowji2(N_states, mo_num) +! PROVIDE mo_integrals_map N_int +! print*,'in get_d1_new' +! call debug_det(gen,N_int) +! print*,'coefs',coefs(1,:) + + allocate (lbanned(mo_num, 2)) + allocate (hij_cache(mo_num,2)) + allocate (hji_cache(mo_num,2)) + lbanned = bannedOrb + + do i=1, p(0,1) + lbanned(p(i,1), 1) = .true. + end do + do i=1, p(0,2) + lbanned(p(i,2), 2) = .true. + end do + + ma = 1 + if(p(0,2) >= 2) ma = 2 + mi = turn2(ma) + + bant = 1 + + if(sp == 3) then + !move MA + if(ma == 2) bant = 2 + puti = p(1,mi) + hfix = h(1,ma) + p1 = p(1,ma) + p2 = p(2,ma) + if(.not. bannedOrb(puti, mi)) then +! call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2) +! 
call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2) + do mm = 1, mo_num + hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2) + hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1) + hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2) + hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1) + do istate = 1,N_states + tmp_rowij(istate,mm) = 0.d0 + tmp_rowji(istate,mm) = 0.d0 + enddo + enddo + !! + do putj=1, hfix-1 + if(lbanned(putj, ma)) cycle + if(banned(putj, puti,bant)) cycle + hij = hij_cache(putj,1) - hij_cache(putj,2) + hji = hji_cache(putj,1) - hji_cache(putj,2) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int) + hij = hij * phase + hji = hji * phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,2) + tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,1) + enddo + endif + end do + do putj=hfix+1, mo_num + if(lbanned(putj, ma)) cycle + if(banned(putj, puti,bant)) cycle + hij = hij_cache(putj,2) - hij_cache(putj,1) + hji = hji_cache(putj,2) - hji_cache(putj,1) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int) + hij = hij * phase + hji = hji * phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,2) + tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,1) + enddo + endif + end do + + if(ma == 1) then + mat_r(1:N_states,1:mo_num,puti) = mat_r(1:N_states,1:mo_num,puti) + tmp_rowij(1:N_states,1:mo_num) + mat_l(1:N_states,1:mo_num,puti) = mat_l(1:N_states,1:mo_num,puti) + tmp_rowji(1:N_states,1:mo_num) + else + do l=1,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k,puti,l) = mat_r(k,puti,l) + tmp_rowij(k,l) + mat_l(k,puti,l) = mat_l(k,puti,l) + tmp_rowji(k,l) + enddo + enddo + end if + + end if + + !MOVE MI + pfix = p(1,mi) +! call get_mo_two_e_integrals_complex(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2) +! call get_mo_two_e_integrals_complex(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2) + do mm = 1, mo_num + do istate = 1,N_states + tmp_rowij(istate,mm) = 0.d0 + tmp_rowij2(istate,mm) = 0.d0 + tmp_rowji(istate,mm) = 0.d0 + tmp_rowji2(istate,mm) = 0.d0 + enddo + hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p1) + hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p2) + hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p1) + hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p2) + enddo + putj = p1 + !! + do puti=1,mo_num !HOT + if(lbanned(puti,mi)) cycle + !p1 fixed + putj = p1 + if(.not. banned(putj,puti,bant)) then + hij = hij_cache(puti,2) + hji = hji_cache(puti,2) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int) + hij = hij * phase + hji = hji * phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,2) + tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,1) + enddo + endif + end if +! + putj = p2 + if(.not. 
banned(putj,puti,bant)) then + hij = hij_cache(puti,1) + hji = hji_cache(puti,1) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int) + hij = hij * phase + hji = hji * phase + do k=1,N_states + tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,2) + tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,1) + enddo + endif + end if + end do + + if(mi == 1) then + mat_r(:,:,p1) = mat_r(:,:,p1) + tmp_rowij(:,:) + mat_r(:,:,p2) = mat_r(:,:,p2) + tmp_rowij2(:,:) + mat_l(:,:,p1) = mat_l(:,:,p1) + tmp_rowji(:,:) + mat_l(:,:,p2) = mat_l(:,:,p2) + tmp_rowji2(:,:) + else + do l=1,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij(k,l) + mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij2(k,l) + mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji(k,l) + mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji2(k,l) + enddo + enddo + end if + + else ! sp /= 3 + + if(p(0,ma) == 3) then + do i=1,3 + hfix = h(1,ma) + puti = p(i, ma) + p1 = p(turn3(1,i), ma) + p2 = p(turn3(2,i), ma) +! call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2) +! call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2) + do mm = 1, mo_num + hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2) + hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1) + hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2) + hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1) + do istate = 1, N_states + tmp_rowij(istate,mm) = 0.d0 + tmp_rowji(istate,mm) = 0.d0 + enddo + enddo + !! + do putj=1,hfix-1 + if(banned(putj,puti,1)) cycle + if(lbanned(putj,ma)) cycle + hij = hij_cache(putj,1) - hij_cache(putj,2) + hji = hji_cache(putj,1) - hji_cache(putj,2) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int) + hij = hij * phase + hji = hji * phase + tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,2) + tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,1) + endif + end do + do putj=hfix+1,mo_num + if(banned(putj,puti,1)) cycle + if(lbanned(putj,ma)) cycle + hij = hij_cache(putj,2) - hij_cache(putj,1) + hji = hji_cache(putj,2) - hji_cache(putj,1) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int) + hij = hij * phase + hji = hji * phase + tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,2) + tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,1) + endif + end do + + mat_r(:, :puti-1, puti) = mat_r(:, :puti-1, puti) + tmp_rowij(:,:puti-1) + mat_l(:, :puti-1, puti) = mat_l(:, :puti-1, puti) + tmp_rowji(:,:puti-1) + do l=puti,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, l) = mat_r(k, puti,l) + tmp_rowij(k,l) + mat_l(k, puti, l) = mat_l(k, puti,l) + tmp_rowji(k,l) + enddo + enddo + end do + else + hfix = h(1,mi) + pfix = p(1,mi) + p1 = p(1,ma) + p2 = p(2,ma) +! call get_mo_two_e_integrals_complex(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2) +! 
call get_mo_two_e_integrals_complex(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2) + do mm = 1, mo_num + hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,pfix) + hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,pfix) + hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,pfix) + hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,pfix) + do istate = 1,N_states + tmp_rowij (istate,mm) = 0.d0 + tmp_rowij2(istate,mm) = 0.d0 + tmp_rowji (istate,mm) = 0.d0 + tmp_rowji2(istate,mm) = 0.d0 + enddo + enddo + putj = p2 + !! + do puti=1,mo_num + if(lbanned(puti,ma)) cycle + putj = p2 + if(.not. banned(puti,putj,1)) then + hij = hij_cache(puti,1) + hji = hji_cache(puti,1) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int) + hij = hij * phase + hji = hji * phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,2) + tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,1) + enddo + endif + end if + + putj = p1 + if(.not. banned(puti,putj,1)) then + hij = hij_cache(puti,2) + hji = hji_cache(puti,2) + if (hij /= 0.d0.and.hji/=0.d0) then + phase = get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int) + hij = hij * phase + hji = hji * phase + do k=1,N_states + tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,2) + tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,1) + enddo + endif + end if + end do + mat_r(:,:p2-1,p2) = mat_r(:,:p2-1,p2) + tmp_rowij(:,:p2-1) + mat_l(:,:p2-1,p2) = mat_l(:,:p2-1,p2) + tmp_rowji(:,:p2-1) + do l=p2,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij(k,l) + mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji(k,l) + enddo + enddo + mat_r(:,:p1-1,p1) = mat_r(:,:p1-1,p1) + tmp_rowij2(:,:p1-1) + mat_l(:,:p1-1,p1) = mat_l(:,:p1-1,p1) + tmp_rowji2(:,:p1-1) + do l=p1,mo_num + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij2(k,l) + mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji2(k,l) + enddo + enddo + end if + end if + deallocate(lbanned,hij_cache, hji_cache) + + !! MONO + if(sp == 3) then + s1 = 1 + s2 = 2 + else + s1 = sp + s2 = sp + end if + + do i1=1,p(0,s1) + ib = 1 + if(s1 == s2) ib = i1+1 + do i2=ib,p(0,s2) + p1 = p(i1,s1) + p2 = p(i2,s2) + if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle + call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int) + ! gen is a selector; mask is ionized generator; det is alpha + ! hij is contribution to +! call i_h_j_complex(gen, det, N_int, hij) + call htilde_mu_mat_opt_bi_ortho_no_3e_both(det, gen, N_int, hij,hji) +! call htilde_mu_mat_opt_bi_ortho_no_3e(gen, det, N_int, hji) + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + ! take conjugate to get contribution to instead of +! mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,1) * dconjg(hij) + mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij + mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji + enddo + end do + end do +end + diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f index d01ed433..86922ae9 100644 --- a/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f +++ b/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f @@ -25,9 +25,6 @@ subroutine get_d2_new(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, integer :: bant bant = 1 -! print*, 'in get_d2_new' -! call debug_det(gen,N_int) -! print*,'coefs',coefs(1,:) tip = p(0,1) * p(0,2) ! 
number of alpha particles times number of beta particles diff --git a/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f b/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f new file mode 100644 index 00000000..b2a7ea31 --- /dev/null +++ b/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f @@ -0,0 +1,235 @@ + +subroutine get_d2_new_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs) + !todo: indices/conjg should be correct for complex + use bitmasks + implicit none + + integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2) + integer(bit_kind), intent(in) :: phasemask(N_int,2) + logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2) + double precision, intent(in) :: coefs(N_states,2) + double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num) + double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num) + integer, intent(in) :: h(0:2,2), p(0:4,2), sp + + double precision, external :: get_phase_bi + + integer :: i, j, k, tip, ma, mi, puti, putj + integer :: h1, h2, p1, p2, i1, i2 + double precision :: phase + double precision :: hij,hji + + integer, parameter:: turn2d(2,3,4) = reshape((/0,0, 0,0, 0,0, 3,4, 0,0, 0,0, 2,4, 1,4, 0,0, 2,3, 1,3, 1,2 /), (/2,3,4/)) + integer, parameter :: turn2(2) = (/2, 1/) + integer, parameter :: turn3(2,3) = reshape((/2,3, 1,3, 1,2/), (/2,3/)) + + integer :: bant + bant = 1 + + tip = p(0,1) * p(0,2) ! number of alpha particles times number of beta particles + + ma = sp !1:(alpha,alpha); 2:(b,b); 3:(a,b) + if(p(0,1) > p(0,2)) ma = 1 ! more alpha particles than beta particles + if(p(0,1) < p(0,2)) ma = 2 ! fewer alpha particles than beta particles + mi = mod(ma, 2) + 1 + + if(sp == 3) then ! if one alpha and one beta xhole + !(where xholes refer to the ionizations from the generator, not the holes occupied in the ionized generator) + if(ma == 2) bant = 2 ! if more beta particles than alpha particles + + if(tip == 3) then ! if 3 of one particle spin and 1 of the other particle spin + puti = p(1, mi) + if(bannedOrb(puti, mi)) return + h1 = h(1, ma) + h2 = h(2, ma) + + !! + do i = 1, 3 ! loop over all 3 combinations of 2 particles with spin ma + putj = p(i, ma) + if(banned(putj,puti,bant)) cycle + i1 = turn3(1,i) + i2 = turn3(2,i) + p1 = p(i1, ma) + p2 = p(i2, ma) + + ! |G> = |psi_{gen,i}> + ! |G'> = a_{x1} a_{x2} |G> + ! |alpha> = a_{puti}^{\dagger} a_{putj}^{\dagger} |G'> + ! |alpha> = t_{x1,x2}^{puti,putj} |G> + ! hij = + ! |alpha> = t_{p1,p2}^{h1,h2}|psi_{selectors,i}> + !todo: = ( - ) * phase + ! += dconjg(c_i) * + ! = ( - ) * phase + ! += * c_i + +!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! + ! take the transpose of what's written above because later use the complex conjugate + +! hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e( h1, h2, p2, p1) +! hji = mo_bi_ortho_tc_two_e_transp(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e_transp( h1, h2, p2, p1) + hij = mo_bi_ortho_tc_two_e_transp(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e_transp( p1, p2, h2, h1) + hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e( p1, p2, h2, h1) + if (hij == 0.d0.or.hji==0.d0) cycle + + ! take conjugate to get contribution to instead of +! hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + phase = get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + hij = hij * phase + hji = hji * phase + + if(ma == 1) then ! 
if particle spins are (alpha,alpha,alpha,beta), then puti is beta and putj is alpha + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,2) * hij + mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,1) * hji + enddo + else ! if particle spins are (beta,beta,beta,alpha), then puti is alpha and putj is beta + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij + mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji + enddo + end if + end do + else ! if 2 alpha and 2 beta particles + h1 = h(1,1) + h2 = h(1,2) + !! + do j = 1,2 ! loop over all 4 combinations of one alpha and one beta particle + putj = p(j, 2) + if(bannedOrb(putj, 2)) cycle + p2 = p(turn2(j), 2) + do i = 1,2 + puti = p(i, 1) + if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle + p1 = p(turn2(i), 1) + ! hij = +! hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) +!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! + ! take the transpose of what's written above because later use the complex conjugate +! hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2 ) +! hji = mo_bi_ortho_tc_two_e_transp(h1, h2, p1, p2 ) + hij = mo_bi_ortho_tc_two_e_transp(p1, p2 ,h1, h2 ) + hji = mo_bi_ortho_tc_two_e( p1, p2, h1, h2) + if (hij /= 0.d0.or.hji==0.d0) then + ! take conjugate to get contribution to instead of +! hij = dconjg(hij) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) + phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) + hij = hij * phase + hji = hji * phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij + mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji + enddo + endif + end do + end do + end if + + else ! if holes are (a,a) or (b,b) + if(tip == 0) then ! if particles are (a,a,a,a) or (b,b,b,b) + h1 = h(1, ma) + h2 = h(2, ma) + !! + do i=1,3 + puti = p(i, ma) + if(bannedOrb(puti,ma)) cycle + do j=i+1,4 + putj = p(j, ma) + if(bannedOrb(putj,ma)) cycle + if(banned(puti,putj,1)) cycle + + i1 = turn2d(1, i, j) + i2 = turn2d(2, i, j) + p1 = p(i1, ma) + p2 = p(i2, ma) +! hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2) +!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! + ! take the transpose of what's written above because later use the complex conjugate + hij = mo_bi_ortho_tc_two_e_transp(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e_transp(p1, p2, h2,h1 ) + hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p1, p2, h2,h1 ) + if (hij == 0.d0.or.hji == 0.d0) cycle + + ! take conjugate to get contribution to instead of +! hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + phase = get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) + hij = hij * phase + hji = hji * phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, putj) = mat_r(k, puti, putj) +coefs(k,2) * hij + mat_l(k, puti, putj) = mat_l(k, puti, putj) +coefs(k,1) * hji + enddo + end do + end do + else if(tip == 3) then ! if particles are (a,a,a,b) (ma=1,mi=2) or (a,b,b,b) (ma=2,mi=1) + h1 = h(1, mi) + h2 = h(1, ma) + p1 = p(1, mi) + !! + do i=1,3 + puti = p(turn3(1,i), ma) + if(bannedOrb(puti,ma)) cycle + putj = p(turn3(2,i), ma) + if(bannedOrb(putj,ma)) cycle + if(banned(puti,putj,1)) cycle + p2 = p(i, ma) + +! hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) +!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! + ! 
take the transpose of what's written above because later use the complex conjugate + hij = mo_bi_ortho_tc_two_e_transp(p1, p2 ,h1, h2) + hji = mo_bi_ortho_tc_two_e(p1, p2,h1, h2 ) + if (hij == 0.d0) cycle + + ! take conjugate to get contribution to instead of +! hij = dconjg(hij) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int) + phase = get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int) + hij = hij * phase + hji = hji * phase + if (puti < putj) then + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij + mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji + enddo + else + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,2) * hij + mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,1) * hji + enddo + endif + end do + else ! tip == 4 (a,a,b,b) + puti = p(1, sp) + putj = p(2, sp) + if(.not. banned(puti,putj,1)) then + p1 = p(1, mi) + p2 = p(2, mi) + h1 = h(1, mi) + h2 = h(2, mi) + !! +! hij = (mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2)) +!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! + ! take the transpose of what's written above because later use the complex conjugate + hij = (mo_bi_ortho_tc_two_e_transp(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e_transp(p2,p1,h1, h2)) + hji = (mo_bi_ortho_tc_two_e(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e(p2,p1,h1, h2)) + if (hij /= 0.d0.or.hji==0.d0) then + ! take conjugate to get contribution to instead of +! hij = dconjg(hij) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int) + phase = get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int) + hij = hij * phase + hji = hji* phase + !DIR$ LOOP COUNT AVG(4) + do k=1,N_states + mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij + mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji + enddo + end if + end if + end if + end if +end diff --git a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f index 833cc0ea..22381991 100644 --- a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f +++ b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f @@ -65,8 +65,12 @@ subroutine tc_pt2 call pt2_dealloc(pt2_data_err) call pt2_alloc(pt2_data, N_states) call pt2_alloc(pt2_data_err, N_states) + if(transpose_two_e_int)then + provide mo_bi_ortho_tc_two_e_transp tc_2e_3idx_coulomb_integrals_transp + endif call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2) + call print_summary_tc(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2) end diff --git a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f index 12163e06..72ccf9c4 100644 --- a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f +++ b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f @@ -636,10 +636,7 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere negMask(i,2) = not(mask(i,2)) end do -! print*,'in selection ' do i = 1, N_sel -! call debug_det(det(1,1,i),N_int) -! 
print*,i,dabs(psi_selectors_coef_transp_tc(1,2,i) * psi_selectors_coef_transp_tc(1,1,i)) if(interesting(i) < 0) then stop 'prefetch interesting(i) and det(i)' endif @@ -691,11 +688,23 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere call get_mask_phase(psi_det_sorted_tc(1,1,interesting(i)), phasemask,N_int) if(nt == 4) then - call get_d2_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + if(transpose_two_e_int)then + call get_d2_new_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + else + call get_d2_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + endif elseif(nt == 3) then - call get_d1_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + if(transpose_two_e_int)then + call get_d1_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + else + call get_d1_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + endif else - call get_d0_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + if(transpose_two_e_int)then + call get_d0_transp (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + else + call get_d0_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i))) + endif endif elseif(nt == 4) then call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int) @@ -887,79 +896,11 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d call diag_htilde_mu_mat_fock_bi_ortho(N_int, det, hmono, htwoe, hthree, hii) do istate = 1,N_states delta_E = E0(istate) - Hii + E_shift - double precision :: alpha_h_psi_tmp, psi_h_alpha_tmp, error - if(debug_tc_pt2 == 1)then !! Using the old version - psi_h_alpha = 0.d0 - alpha_h_psi = 0.d0 - do iii = 1, N_det_selectors - call htilde_mu_mat_bi_ortho_tot_slow(psi_selectors(1,1,iii), det, N_int, i_h_alpha) - call htilde_mu_mat_bi_ortho_tot_slow(det, psi_selectors(1,1,iii), N_int, alpha_h_i) - call get_excitation_degree(psi_selectors(1,1,iii), det,degree,N_int) - if(degree == 0)then - print*,'problem !!!' - print*,'a determinant is already in the wave function !!' - print*,'it corresponds to the selector number ',iii - call debug_det(det,N_int) - stop - endif -! call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha) -! call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i) - psi_h_alpha += i_h_alpha * psi_selectors_coef_tc(iii,2,1) ! left function - alpha_h_psi += alpha_h_i * psi_selectors_coef_tc(iii,1,1) ! right function - enddo - else if(debug_tc_pt2 == 2)then !! debugging the new version -! psi_h_alpha_tmp = 0.d0 -! alpha_h_psi_tmp = 0.d0 -! do iii = 1, N_det_selectors ! old version -! call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha) -! call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i) -! psi_h_alpha_tmp += i_h_alpha * psi_selectors_coef_tc(iii,1,1) ! left function -! 
alpha_h_psi_tmp += alpha_h_i * psi_selectors_coef_tc(iii,2,1) ! right function -! enddo - psi_h_alpha_tmp = mat_l(istate, p1, p2) ! new version - alpha_h_psi_tmp = mat_r(istate, p1, p2) ! new version - psi_h_alpha = 0.d0 - alpha_h_psi = 0.d0 - do iii = 1, N_det ! old version - call htilde_mu_mat_opt_bi_ortho_no_3e(psi_det(1,1,iii), det, N_int, i_h_alpha) - call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_det(1,1,iii), N_int, alpha_h_i) - psi_h_alpha += i_h_alpha * psi_l_coef_bi_ortho(iii,1) ! left function - alpha_h_psi += alpha_h_i * psi_r_coef_bi_ortho(iii,1) ! right function - enddo - if(dabs(psi_h_alpha*alpha_h_psi/delta_E).gt.1.d-10)then - error = dabs(psi_h_alpha * alpha_h_psi - psi_h_alpha_tmp * alpha_h_psi_tmp)/dabs(psi_h_alpha * alpha_h_psi) - if(error.gt.1.d-2)then - call debug_det(det, N_int) - print*,'error =',error,psi_h_alpha * alpha_h_psi/delta_E,psi_h_alpha_tmp * alpha_h_psi_tmp/delta_E - print*,psi_h_alpha , alpha_h_psi - print*,psi_h_alpha_tmp , alpha_h_psi_tmp - print*,'selectors ' - do iii = 1, N_det_selectors ! old version - print*,'iii',iii,psi_selectors_coef_tc(iii,1,1),psi_selectors_coef_tc(iii,2,1) - call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha) - call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i) - print*,i_h_alpha,alpha_h_i - call debug_det(psi_selectors(1,1,iii),N_int) - enddo -! print*,'psi_det ' -! do iii = 1, N_det! old version -! print*,'iii',iii,psi_l_coef_bi_ortho(iii,1),psi_r_coef_bi_ortho(iii,1) -! call debug_det(psi_det(1,1,iii),N_int) -! enddo - stop - endif - endif - else - psi_h_alpha = mat_l(istate, p1, p2) - alpha_h_psi = mat_r(istate, p1, p2) - endif + psi_h_alpha = mat_l(istate, p1, p2) + alpha_h_psi = mat_r(istate, p1, p2) val = 4.d0 * psi_h_alpha * alpha_h_psi tmp = dsqrt(delta_E * delta_E + val) -! if (delta_E < 0.d0) then -! tmp = -tmp -! endif e_pert(istate) = 0.25 * val / delta_E -! e_pert(istate) = 0.5d0 * (tmp - delta_E) if(dsqrt(tmp).gt.1.d-4.and.dabs(psi_h_alpha).gt.1.d-4)then coef(istate) = e_pert(istate) / psi_h_alpha else @@ -976,15 +917,6 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d if(e_pert(istate).gt.0.d0)e_pert(istate)=0.d0 endif -! if(selection_tc == 1 )then -! if(e_pert(istate).lt.0.d0)then -! e_pert(istate) = 0.d0 -! endif -! else if(selection_tc == -1)then -! if(e_pert(istate).gt.0.d0)then -! e_pert(istate) = 0.d0 -! endif -! endif enddo diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f index 99a8de7e..0c61e38f 100644 --- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f +++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f @@ -1,4 +1,36 @@ + +! --- + +subroutine run_pouet + + BEGIN_DOC + ! Selected Full Configuration Interaction with Stochastic selection and PT2. + END_DOC + + use selection_types + implicit none + integer :: i, j, k, ndet + integer :: to_select + logical :: has + type(pt2_type) :: pt2_data, pt2_data_err + double precision :: rss + double precision :: correlation_energy_ratio + double precision :: hf_energy_ref + double precision :: relative_error + double precision, allocatable :: zeros(:),E_tc(:), norm(:) + + logical, external :: qp_stop + double precision, external :: memory_of_double + + PROVIDE mo_l_coef mo_r_coef + PROVIDE H_apply_buffer_allocated distributed_davidson + + print*, ' Diagonal elements of the Fock matrix ' + do i = 1, mo_num + write(*,*) i, Fock_matrix_tc_mo_tot(i,i) + enddo +end ! 
--- subroutine run_stochastic_cipsi @@ -88,6 +120,9 @@ subroutine run_stochastic_cipsi call pt2_dealloc(pt2_data_err) call pt2_alloc(pt2_data, N_states) call pt2_alloc(pt2_data_err, N_states) + if(transpose_two_e_int)then + provide mo_bi_ortho_tc_two_e_transp tc_2e_3idx_coulomb_integrals_transp + endif call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection ! stop @@ -120,6 +155,7 @@ subroutine run_stochastic_cipsi call pt2_alloc(pt2_data_err, N_states) call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm) + call print_summary_tc(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2) call pt2_dealloc(pt2_data) call pt2_dealloc(pt2_data_err) diff --git a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f index 1c1c0411..f1de0fe3 100644 --- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f +++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f @@ -65,7 +65,15 @@ subroutine run_cipsi_tc() if (.not. is_zmq_slave) then - PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e + if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! + ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot + PROVIDE Fock_matrix_tc_mo_tot + ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot + ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided + endif + if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !! + PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e + endif if((elec_alpha_num+elec_beta_num) .ge. 3) then if(three_body_h_tc) then @@ -90,8 +98,16 @@ subroutine run_cipsi_tc() call json_close else + if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! + ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot + PROVIDE Fock_matrix_tc_mo_tot + ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot + ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided + endif - PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks + if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! + PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks + endif if((elec_alpha_num+elec_beta_num) .ge. 3) then if(three_body_h_tc) then diff --git a/plugins/local/fci_tc_bi/pt2_tc.irp.f b/plugins/local/fci_tc_bi/pt2_tc.irp.f index 390042bf..3c07e367 100644 --- a/plugins/local/fci_tc_bi/pt2_tc.irp.f +++ b/plugins/local/fci_tc_bi/pt2_tc.irp.f @@ -13,6 +13,8 @@ program tc_pt2_prog pruning = -1.d0 touch pruning + read_wf = .True. + touch read_wf ! pt2_relative_error = 0.01d0 ! touch pt2_relative_error diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB new file mode 100644 index 00000000..199b0f1c --- /dev/null +++ b/plugins/local/gpu_intel/LIB @@ -0,0 +1,2 @@ +-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ + diff --git a/plugins/local/gpu_intel/NEED b/plugins/local/gpu_intel/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_intel/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst new file mode 100644 index 00000000..d42e2557 --- /dev/null +++ b/plugins/local/gpu_intel/README.rst @@ -0,0 +1,8 @@ +========= +gpu_intel +========= + +Intel implementation of GPU routines. Uses MKL and SYCL. 
+```bash +icpx -fsycl gpu.cxx -c -qmkl=sequential +``` diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl new file mode 100644 index 00000000..1f9f89ce --- /dev/null +++ b/plugins/local/gpu_intel/gpu.sycl @@ -0,0 +1,177 @@ +#include +#include +#include +#include + +extern "C" { + +/* Generic functions */ + +int gpu_ndevices() { + return 1; +} + +void gpu_set_device(int32_t igpu) { +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, int64_t size) { + auto queue = sycl::queue(sycl::default_selector_v); + + try { + *ptr = sycl::malloc_shared(size, queue); + assert(*ptr != nullptr); + } catch (const sycl::exception& e) { + std::cerr << "SYCL exception caught: " << e.what() << std::endl; + *ptr = nullptr; // If allocation fails, set pointer to nullptr + } +} + +void gpu_deallocate(void** ptr) { + assert(*ptr != nullptr); + sycl::free(*ptr, sycl::queue(sycl::default_selector_v)); + *ptr = nullptr; +} + +/* Upload data from host to device */ +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + sycl::queue queue(sycl::default_selector_v); + queue.memcpy(gpu_ptr, cpu_ptr, n).wait(); +} + +/* Download data from device to host */ +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + sycl::queue queue(sycl::default_selector_v); + queue.memcpy(cpu_ptr, gpu_ptr, n).wait(); +} + +/* Copy data from one GPU memory location to another */ +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + sycl::queue queue(sycl::default_selector_v); + queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait(); +} + +/* Queues */ + +/* SYCL queue as a replacement for CUDA stream */ +void gpu_stream_create(sycl::queue** ptr) { + *ptr = new sycl::queue(sycl::default_selector_v); +} + +void gpu_stream_destroy(sycl::queue** ptr) { + assert(*ptr != nullptr); + delete *ptr; + *ptr = nullptr; +} + +void gpu_synchronize() { + sycl::queue queue(sycl::default_selector_v); + queue.wait_and_throw(); +} + +/* BLAS functions */ + +typedef struct { + sycl::queue* queue; +} blasHandle_t; + +void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) { + handle->queue = ptr; +} + +void gpu_blas_create(blasHandle_t** ptr) { + *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t)); + assert(*ptr != nullptr); + (*ptr)->queue = new sycl::queue(sycl::default_selector_v); + assert((*ptr)->queue != nullptr); +} + +void gpu_blas_destroy(blasHandle_t** ptr) { + assert(*ptr != nullptr); + delete (*ptr)->queue; + free(*ptr); + *ptr = nullptr; +} + + +void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, + const double* y, const int64_t incy, double* result) { + // Ensure input parameters are valid + assert(handle != nullptr); + assert(handle->queue != nullptr); + assert(n > 0); + assert(incx > 0); + assert(incy > 0); + assert(x != nullptr); + assert(y != nullptr); + assert(result != nullptr); + + oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result); + +} + +void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert(handle != nullptr); + assert(handle->queue != nullptr); + + // Validate matrix dimensions and increments to be positive + assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0); + assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr); + + // Determine the 
operation type + oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans; + if (*transa == 'T' || *transa == 't') { + transa_ = oneapi::mkl::transpose::trans; + } + + // Perform DGEMV operation using oneMKL + oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy); + +} + +void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Transpose operations + auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; + + oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k, + *alpha, a, lda, b, ldb, *beta, c, ldc); + +} + + +void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert(handle != nullptr && handle->queue != nullptr); + assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0); + assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr); + + // Determine transpose operations + bool transA = (*transa == 'T' || *transa == 't'); + bool transB = (*transb == 'T' || *transb == 't'); + + handle->queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) { + const int i = idx[0]; + const int j = idx[1]; + const int ai = transA ? j * lda + i : i * lda + j; + const int bi = transB ? j * ldb + i : i * ldb + j; + const int ci = i * ldc + j; + + c[ci] = (*alpha) * a[ai] + (*beta) * b[bi]; + }); + }); + +} + +} // extern C diff --git a/plugins/local/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB new file mode 100644 index 00000000..91f54e91 --- /dev/null +++ b/plugins/local/gpu_nvidia/LIB @@ -0,0 +1 @@ +-lcudart -lcublas -lcublasLt diff --git a/plugins/local/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_nvidia/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst new file mode 100644 index 00000000..5dcfca92 --- /dev/null +++ b/plugins/local/gpu_nvidia/README.rst @@ -0,0 +1,5 @@ +========== +gpu_nvidia +========== + +Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries. 
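All three GPU plugins (gpu_x86, gpu_nvidia, gpu_intel) export the same C entry points (gpu_blas_create, gpu_allocate, gpu_upload, gpu_dgemm, gpu_download, ...), differing only in the concrete handle type behind them. As a rough, hypothetical illustration of the intended call sequence — handle creation, device allocation, upload, gemm, download, cleanup — the stand-alone driver below declares the prototypes with opaque void* handles as in the gpu_x86 variant; the driver itself and its test matrices are illustrative only and are not part of this patch.

```c
/* Hypothetical stand-alone driver: links against any one of the gpu_x86 /
 * gpu_nvidia / gpu_intel objects and exercises the common wrapper API.     */
#include <stdint.h>
#include <stdio.h>

/* Prototypes as exposed by gpu_x86/gpu.c; the CUDA and SYCL back ends use
 * back-end-specific handle types but the same names and argument order.    */
void gpu_blas_create(void** handle);
void gpu_blas_destroy(void** handle);
void gpu_allocate(void** ptr, const int64_t n);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n);
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n);
void gpu_dgemm(void* handle, const char* transa, const char* transb,
               const int64_t m, const int64_t n, const int64_t k, const double* alpha,
               const double* a, const int64_t lda, const double* b, const int64_t ldb,
               const double* beta, double* c, const int64_t ldc);

int main(void) {
  const int64_t n = 2;
  double a[4] = {1., 2., 3., 4.};   /* 2x2, column-major: [[1,3],[2,4]] */
  double b[4] = {1., 0., 0., 1.};   /* identity                          */
  double c[4] = {0., 0., 0., 0.};
  double alpha = 1., beta = 0.;

  void *handle, *d_a, *d_b, *d_c;
  gpu_blas_create(&handle);
  gpu_allocate(&d_a, n*n*sizeof(double));
  gpu_allocate(&d_b, n*n*sizeof(double));
  gpu_allocate(&d_c, n*n*sizeof(double));

  gpu_upload(a, d_a, n*n*sizeof(double));
  gpu_upload(b, d_b, n*n*sizeof(double));

  /* C := alpha * A.B + beta * C, column-major as in Fortran */
  gpu_dgemm(handle, "N", "N", n, n, n, &alpha, d_a, n, d_b, n, &beta, d_c, n);

  gpu_download(d_c, c, n*n*sizeof(double));
  printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]);   /* expect 1 2 3 4 */

  gpu_deallocate(&d_a); gpu_deallocate(&d_b); gpu_deallocate(&d_c);
  gpu_blas_destroy(&handle);
  return 0;
}
```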
diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c new file mode 100644 index 00000000..a775ab95 --- /dev/null +++ b/plugins/local/gpu_nvidia/gpu.c @@ -0,0 +1,326 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + + +/* Generic functions */ + +int gpu_ndevices() { + int ngpus; + cudaGetDeviceCount(&ngpus); + return ngpus; +} + +void gpu_set_device(int32_t igpu) { + cudaSetDevice((int) igpu); +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t size) { + size_t free, total; + cudaError_t rc = cudaMemGetInfo( &free, &total ); + if (rc != cudaSuccess) { + free = INT64_MAX; + } + + rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// /* Use managed memory if it does not fit on the GPU */ +// if (size < free && size < total/2) { +// rc= cudaMalloc(ptr, size); +// } else { +// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); +// } + assert (rc == cudaSuccess); +} + +void gpu_deallocate(void** ptr) { + assert (*ptr != NULL); + cudaFree(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice); +} + + +/* Streams */ + +void gpu_stream_create(cudaStream_t* ptr) { + cudaError_t rc = cudaStreamCreate(ptr); + assert (rc == cudaSuccess); +} + +void gpu_stream_destroy(cudaStream_t* ptr) { + assert (ptr != NULL); + cudaError_t rc = cudaStreamDestroy(*ptr); + assert (rc == cudaSuccess); + *ptr = NULL; +} + +void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) { + cublasSetStream(handle, stream); +} + +void gpu_synchronize() { + cudaDeviceSynchronize(); +} + + +/* BLAS functions */ + +void gpu_blas_create(cublasHandle_t* ptr) { + cublasStatus_t rc = cublasCreate(ptr); + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + +void gpu_blas_destroy(cublasHandle_t* ptr) { + assert (ptr != NULL); + cublasStatus_t rc = cublasDestroy(*ptr); + assert (rc == CUBLAS_STATUS_SUCCESS); + ptr = NULL; +} + + +void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; + + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; +} + + + +void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, 
const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + + +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + + +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + 
cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} diff --git a/plugins/local/gpu_x86/NEED b/plugins/local/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst new file mode 100644 index 00000000..f530bf29 --- /dev/null +++ b/plugins/local/gpu_x86/README.rst @@ -0,0 +1,5 @@ +======= +gpu_x86 +======= + +x86 implementation of GPU routines. For use when GPUs are not available. 
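Unlike gemv/gemm, geam is not a reference-BLAS routine (the Nvidia back end maps it onto the cublasDgeam extension), so its contract is worth spelling out: C = alpha*op(A) + beta*op(B), element by element. The snippet below is a hypothetical reference implementation of that contract, assuming column-major storage as in the BLAS-style wrappers; the function name and the small driver are illustrative only, not code from the plugin.

```c
#include <stdint.h>
#include <stdio.h>

/* C(i,j) = alpha*op(A)(i,j) + beta*op(B)(i,j); column-major storage with
 * leading dimensions lda, ldb, ldc (op = transpose when the flag is 'T'/'t'). */
void dgeam_reference(char transa, char transb, int64_t m, int64_t n,
                     double alpha, const double* a, int64_t lda,
                     double beta,  const double* b, int64_t ldb,
                     double* c, int64_t ldc) {
  const int ta = (transa == 'T' || transa == 't');
  const int tb = (transb == 'T' || transb == 't');
  for (int64_t j = 0; j < n; ++j)
    for (int64_t i = 0; i < m; ++i) {
      /* op(A)(i,j) is A(i,j) or A(j,i) depending on the transpose flag */
      const double aij = ta ? a[j + i*lda] : a[i + j*lda];
      const double bij = tb ? b[j + i*ldb] : b[i + j*ldb];
      c[i + j*ldc] = alpha * aij + beta * bij;
    }
}

int main(void) {
  double a[4] = {1., 2., 3., 4.};   /* 2x2, column-major */
  double b[4] = {1., 1., 1., 1.};
  double c[4];
  dgeam_reference('N', 'T', 2, 2, 1.0, a, 2, -1.0, b, 2, c, 2);  /* C = A - B^T */
  printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);               /* expect 0 1 2 3 */
  return 0;
}
```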
diff --git a/plugins/local/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c new file mode 100644 index 00000000..49aec9d3 --- /dev/null +++ b/plugins/local/gpu_x86/gpu.c @@ -0,0 +1,502 @@ +#include +#include +#include +#include +#include +#include + +/* Generic functions */ + +int gpu_ndevices() { + return 0; +} + +void gpu_set_device(int32_t i) { + return; +} + + +/* Allocation functions */ + +void gpu_allocate(void** ptr, const int64_t n) { + *ptr = malloc((size_t) n); + if (*ptr == NULL) { + perror("Allocation failed"); + } +} + +void gpu_deallocate(void** ptr) { + free(*ptr); + *ptr = NULL; +} + + +/* Memory transfer functions */ + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) { + memcpy(gpu_ptr, cpu_ptr, n); +} + +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) { + memcpy(cpu_ptr, gpu_ptr, n); +} + +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { + memcpy(gpu_ptr_dest, gpu_ptr_src, n); +} + + +/* Streams */ + +void gpu_stream_create(void** ptr) { + *ptr = (void*) malloc(sizeof(char)); +} + +void gpu_stream_destroy(void** ptr) { + free(*ptr); + *ptr = NULL; +} + +void gpu_set_stream(void* handle, void* stream) { + return; +} + +void gpu_synchronize() { + return; +} + + +/* BLAS functions */ + +void gpu_blas_create(void** handle) { + *handle = (void*) malloc(sizeof(char)); +} + + +void gpu_blas_destroy(void** handle) { + free(*handle); + *handle = NULL; +} + + +double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); + +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = ddot_(&n_, x, &incx_, y, &incy_); +} + + +float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); + +void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t n_, incx_, incy_; + + n_ = (int32_t) n; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + *result = sdot_(&n_, x, &incx_, y, &incy_); +} + + +void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, + const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); + +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + 
dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); +} + + +void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, + const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); + +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, lda_, incx_, incy_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + lda_ = (int32_t) lda; + incx_ = (int32_t) incx; + incy_ = (int32_t) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); +} + + +void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, + const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); + +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); +} + + + +void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, + const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); + +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int32_t */ + int32_t m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int32_t) m; + n_ = (int32_t) n; + k_ = (int32_t) k; + lda_ = (int32_t) lda; + ldb_ = (int32_t) ldb; + ldc_ = (int32_t) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); +} + + +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); + + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && 
*transb == 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { + + if (*alpha == 0.) { + + for (int64_t j=0 ; j 0 + jmu = - inv_sq_pi_2 * inv_mu ! initialization when r12 --> 0 + + dx = r1(1) - r2(1) + dy = r1(2) - r2(2) + dz = r1(3) - r2(3) + r12 = dsqrt(dx * dx + dy * dy + dz * dz) + if(r12 .lt. 1d-10) return + erfc_mur12 = (1.d0 - derf(mu_tmp)) + mu_tmp = mu * r12 + tmp = 0.5d0 * erfc_mur12 / r12 ! d/dx1 j(mu(r1),r12) + grad(1) = tmp * dx + grad(2) = tmp * dy + grad(3) = tmp * dz + + grad_jmu = grad + + jmu= 0.5d0 * r12 * erfc_mur12 - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) * inv_mu + + +end diff --git a/plugins/local/non_h_ints_mu/jast_deriv_utils.irp.f b/plugins/local/non_h_ints_mu/jast_deriv_utils.irp.f index 79822508..f13990de 100644 --- a/plugins/local/non_h_ints_mu/jast_deriv_utils.irp.f +++ b/plugins/local/non_h_ints_mu/jast_deriv_utils.irp.f @@ -1,8 +1,73 @@ +subroutine get_deriv_r12_j12(x,mu,d_dx_j) + implicit none + include 'constants.include.F' + BEGIN_DOC + ! d/dr12 j(mu,r12) + END_DOC + double precision, intent(in) :: x,mu + double precision, intent(out) :: d_dx_j + + d_dx_j = 0.d0 + if(x .lt. 1d-10) return + if(j2e_type .eq. "Mu" .or. j2e_type .eq. "Mur") then + d_dx_j = 0.5d0 * (1.d0 - derf(mu * x)) + else if(j2e_type .eq. "Mugauss" .or. j2e_type .eq. "Murgauss" ) then + double precision :: x_tmp + x_tmp = mu * x + ! gradient of j(mu,x) + d_dx_j = 0.5d0 * (1.d0 - derf(x_tmp)) + + ! gradient of gaussian additional term + x_tmp *= alpha_mu_gauss + x_tmp *= x_tmp + d_dx_j += -0.5d0 * mu * c_mu_gauss * x * dexp(-x_tmp) + else + print *, ' Error in get_deriv_r12_j12: Unknown j2e_type = ', j2e_type + stop + endif +end + + +subroutine get_deriv_mu_j12(x,mu,d_d_mu) + implicit none + BEGIN_DOC + ! d/dmu j(mu,r12) + END_DOC + include 'constants.include.F' + double precision, intent(in) :: x,mu + double precision, intent(out) :: d_d_mu + double precision :: x_tmp,inv_mu_2,inv_alpha_2 + + d_d_mu = 0.d0 + if(x .lt. 1d-10) return + x_tmp = x*mu + if(mu.lt.1.d-10) return + inv_mu_2 = mu*mu + inv_mu_2 = 1.d0/inv_mu_2 + if(j2e_type .eq. "Mu" .or. j2e_type .eq. "Mur") then + ! e^{-(r12*mu)^2}/(2 sqrt(pi) * mu^2) + d_d_mu = dexp(-x_tmp*x_tmp) * inv_sq_pi_2 * inv_mu_2 + else if(j2e_type .eq. "Mugauss" .or. j2e_type .eq. "Murgauss" ) then + d_d_mu = dexp(-x_tmp*x_tmp) * inv_sq_pi_2 * inv_mu_2 + inv_alpha_2 = 1.d0/alpha_mu_gauss + inv_alpha_2 *= inv_alpha_2 + x_tmp *= alpha_mu_gauss + x_tmp *= x_tmp + d_d_mu += -0.25d0 * c_mu_gauss*inv_alpha_2*dexp(-x_tmp) * (1.d0 + 2.d0 * x_tmp) * inv_mu_2 + else + print *, ' Error in get_deriv_r12_j12: Unknown j2e_type = ', j2e_type + stop + endif +end + ! --- double precision function j12_mu(r1, r2) + BEGIN_DOC + ! j(mu,r12) = 1/2 r12 (1 - erf(mu r12)) - 1/2 (sqrt(pi) * mu) e^{-(mu*r12)^2} + END_DOC include 'constants.include.F' implicit none @@ -18,6 +83,18 @@ double precision function j12_mu(r1, r2) j12_mu = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_erf + else if(j2e_type .eq. 
"Mugauss") then + + r12 = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) + double precision :: r12_tmp + r12_tmp = mu_erf * r12 + + j12_mu = 0.5d0 * r12 * (1.d0 - derf(r12_tmp)) - inv_sq_pi_2 * dexp(-r12_tmp*r12_tmp) / mu_erf + r12_tmp *= alpha_mu_gauss + j12_mu += 0.25d0 * c_mu_gauss / (alpha_mu_gauss*alpha_mu_gauss*mu_erf) * dexp(-r12_tmp*r12_tmp) + else print *, ' Error in j12_mu: Unknown j2e_type = ', j2e_type @@ -57,7 +134,7 @@ subroutine grad1_j12_mu(r1, r2, grad) grad = 0.d0 - if(j2e_type .eq. "Mu") then + if(j2e_type .eq. "Mu".or.j2e_type .eq. "Mugauss") then dx = r1(1) - r2(1) dy = r1(2) - r2(2) @@ -66,31 +143,42 @@ subroutine grad1_j12_mu(r1, r2, grad) r12 = dsqrt(dx * dx + dy * dy + dz * dz) if(r12 .lt. 1d-10) return - tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12 + call get_deriv_r12_j12(r12,mu_erf,tmp) +! tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12 grad(1) = tmp * dx grad(2) = tmp * dy grad(3) = tmp * dz + grad *= 1.d0/r12 - elseif(j2e_type .eq. "Mur") then + elseif(j2e_type .eq. "Mur" .or. j2e_type .eq. "Murgauss") then + double precision :: jast + call grad_j_sum_mu_of_r(r1,r2,jast,grad) + + elseif(j2e_type .eq. "Bump") then + double precision ::grad_jast(3) + call get_grad_j_bump_mu_of_r(r1,r2,grad_jast) + dx = r1(1) - r2(1) + dy = r1(2) - r2(2) + dz = r1(3) - r2(3) - dx = r1(1) - r2(1) - dy = r1(2) - r2(2) - dz = r1(3) - r2(3) - r12 = dsqrt(dx * dx + dy * dy + dz * dz) + r12 = dsqrt(dx * dx + dy * dy + dz * dz) + if(r12 .lt. 1d-10) then + grad(1) = 0.d0 + grad(2) = 0.d0 + grad(3) = 0.d0 + return + endif - call mu_r_val_and_grad(r1, r2, mu_val, mu_der) - mu_tmp = mu_val * r12 - tmp = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val) - grad(1) = tmp * mu_der(1) - grad(2) = tmp * mu_der(2) - grad(3) = tmp * mu_der(3) + tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12 + + grad(1) = 0.5d0 * tmp * dx + grad(2) = 0.5d0 * tmp * dy + grad(3) = 0.5d0 * tmp * dz + grad(1) += 0.5d0 * grad_jast(1) + grad(2) += 0.5d0 * grad_jast(2) + grad(3) += 0.5d0 * grad_jast(3) - if(r12 .lt. 1d-10) return - tmp = 0.5d0 * (1.d0 - derf(mu_tmp)) / r12 - grad(1) = grad(1) + tmp * dx - grad(2) = grad(2) + tmp * dy - grad(3) = grad(3) + tmp * dz else @@ -369,7 +457,18 @@ end ! --- subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der) - + BEGIN_DOC +! various flavours of mu(r1,r2) +! depends on essentially the density and other related quantities +! +! change the variable "murho_type" to change type +! +! murho_type == -1 :: mu(r1,r2) = (rho(r1) mu_mf(r1) + rho(r2) mu_mf(r2))/[rho(r1)+rho(r2)] +! +! == 0 :: mu(r1,r2) = (sqrt(rho(r1)) mu_mf(r1) + sqrt(rho(r2)) mu_mf(r2))/[sqrt(rho(r1))+sqrt(rho(r2))] +! +! == -2 :: mu(r1,r2) = 0.5(mu_mf(r1) + mu_mf(r2)) + END_DOC implicit none double precision, intent(in) :: r1(3), r2(3) double precision, intent(out) :: mu_val, mu_der(3) @@ -379,11 +478,50 @@ subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der) double precision :: rho1, grad_rho1(3),rho2,rho_tot,inv_rho_tot double precision :: f_rho1, f_rho2, d_drho_f_rho1 double precision :: d_dx1_f_rho1(3),d_dx_rho_f_rho(3),nume + double precision :: mu_mf_r1, dm_r1, grad_mu_mf_r1(3), grad_dm_r1(3) + double precision :: mu_mf_r2, dm_r2, grad_mu_mf_r2(3), grad_dm_r2(3) + + double precision :: num, denom, grad_denom(3), grad_num(3) + double precision :: dsqrt_dm_r1 PROVIDE murho_type PROVIDE mu_r_ct mu_erf - if(murho_type .eq. 1) then + if(murho_type .eq. 
0) then + call grad_mu_of_r_mean_field(r1,mu_mf_r1, dm_r1, grad_mu_mf_r1, grad_dm_r1) + call grad_mu_of_r_mean_field(r2,mu_mf_r2, dm_r2, grad_mu_mf_r2, grad_dm_r2) + dsqrt_dm_r1 = dsqrt(dm_r1) + denom = (dsqrt_dm_r1 + dsqrt(dm_r2) ) + if(denom.lt.1.d-7)then + mu_val = 1.d+10 + mu_der = 0.d0 + return + endif + num = (dsqrt(dm_r1) * mu_mf_r1 + dsqrt(dm_r2) * mu_mf_r2) + mu_val = num / denom + grad_denom = grad_dm_r1/dsqrt_dm_r1 + grad_num = dsqrt(dm_r1) * grad_mu_mf_r1 + mu_mf_r1 * grad_dm_r1 + mu_der = (grad_num * denom - num * grad_denom)/(denom*denom) + else if(murho_type .eq. -1) then + call grad_mu_of_r_mean_field(r1,mu_mf_r1, dm_r1, grad_mu_mf_r1, grad_dm_r1) + call grad_mu_of_r_mean_field(r2,mu_mf_r2, dm_r2, grad_mu_mf_r2, grad_dm_r2) + denom = (dm_r1 + dm_r2 ) + if(denom.lt.1.d-7)then + mu_val = 1.d+10 + mu_der = 0.d0 + return + endif + num = (dm_r1 * mu_mf_r1 + dm_r2 * mu_mf_r2) + mu_val = num / denom + grad_denom = grad_dm_r1 + grad_num = dm_r1 * grad_mu_mf_r1 + mu_mf_r1 * grad_dm_r1 + mu_der = (grad_num * denom - num * grad_denom)/(denom*denom) + else if(murho_type .eq. -2) then + call grad_mu_of_r_mean_field(r1,mu_mf_r1, dm_r1, grad_mu_mf_r1, grad_dm_r1) + call grad_mu_of_r_mean_field(r2,mu_mf_r2, dm_r2, grad_mu_mf_r2, grad_dm_r2) + mu_val = 0.5d0 * (mu_mf_r1 + mu_mf_r2) + mu_der = 0.5d0 * grad_mu_mf_r1 + else if(murho_type .eq. 1) then ! ! r = 0.5 (r1 + r2) diff --git a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f index 33563102..d951db93 100644 --- a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f +++ b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f @@ -4,7 +4,7 @@ subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res) BEGIN_DOC - ! + ! ! grad_1 u(r1,r2) ! ! we use grid for r1 and extra_grid for r2 @@ -33,8 +33,12 @@ subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res) r1(2) = final_grid_points(2,ipoint) r1(3) = final_grid_points(3,ipoint) - if( (j2e_type .eq. "Mu") .or. & - (j2e_type .eq. "Mur") .or. & + if( (j2e_type .eq. "Mu") .or. & + (j2e_type .eq. "Mur") .or. & + (j2e_type .eq. "Jpsi") .or. & + (j2e_type .eq. "Mugauss") .or. & + (j2e_type .eq. "Murgauss") .or. & + (j2e_type .eq. "Bump") .or. & (j2e_type .eq. "Boys") ) then if(env_type .eq. "None") then @@ -167,7 +171,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) integer :: jpoint integer :: i_nucl, p, mpA, npA, opA double precision :: r2(3) - double precision :: dx, dy, dz, r12, tmp, r12_inv + double precision :: dx, dy, dz, r12, tmp double precision :: mu_val, mu_tmp, mu_der(3) double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) double precision :: tmp1, tmp2 @@ -181,7 +185,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) ! d/dy1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (y1 - y2) ! d/dz1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (z1 - z2) - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) @@ -191,66 +195,107 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) dy = r1(2) - r2(2) dz = r1(3) - r2(3) - r12 = dx * dx + dy * dy + dz * dz - - if(r12 .lt. 1d-20) then - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 + r12 = dsqrt(dx * dx + dy * dy + dz * dz) + if(r12 .lt. 
1d-10) then + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 cycle endif - r12_inv = 1.d0/dsqrt(r12) - r12 = r12*r12_inv - - tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) * r12_inv + tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12 gradx(jpoint) = tmp * dx grady(jpoint) = tmp * dy gradz(jpoint) = tmp * dz enddo - elseif(j2e_type .eq. "Mur") then + else if(j2e_type .eq. "Mugauss") then - ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 ) d/dx1 mu(r1,r2) - ! + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2) + ! d/dx1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12 - mu*c*r12*exp(-(mu*alpha*r12)^2] * (x1 - x2) - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) r2(3) = final_grid_points_extra(3,jpoint) - dx = r1(1) - r2(1) - dy = r1(2) - r2(2) - dz = r1(3) - r2(3) + dx = r1(1) - r2(1) + dy = r1(2) - r2(2) + dz = r1(3) - r2(3) - r12 = dx * dx + dy * dy + dz * dz - - if(r12 .lt. 1d-20) then - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 + r12 = dsqrt(dx * dx + dy * dy + dz * dz) + if(r12 .lt. 1d-10) then + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 cycle endif - r12_inv = 1.d0/dsqrt(r12) - r12 = r12*r12_inv + double precision :: r12_tmp + r12_tmp = mu_erf * r12 + ! gradient of j(mu,r12) + tmp = 0.5d0 * (1.d0 - derf(r12_tmp)) / r12 + ! gradient of gaussian additional term + r12_tmp *= alpha_mu_gauss + r12_tmp *= r12_tmp + tmp += -0.5d0 * mu_erf * c_mu_gauss * r12 * dexp(-r12_tmp)/r12 - call mu_r_val_and_grad(r1, r2, mu_val, mu_der) + gradx(jpoint) = tmp * dx + grady(jpoint) = tmp * dy + gradz(jpoint) = tmp * dz + enddo - mu_tmp = mu_val * r12 - tmp = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val) + elseif(j2e_type .eq. "Mur".or.j2e_type .eq. "Murgauss") then - gradx(jpoint) = tmp * mu_der(1) - grady(jpoint) = tmp * mu_der(2) - gradz(jpoint) = tmp * mu_der(3) + ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 ) d/dx1 mu(r1,r2) + ! + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2) - tmp = 0.5d0 * (1.d0 - derf(mu_tmp)) * r12_inv + do jpoint = 1, n_points_extra_final_grid ! r2 - gradx(jpoint) = gradx(jpoint) + tmp * dx - grady(jpoint) = grady(jpoint) + tmp * dy - gradz(jpoint) = gradz(jpoint) + tmp * dz + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + double precision :: jast, grad_jast(3) + call grad_j_sum_mu_of_r(r1,r2,jast,grad_jast) + gradx(jpoint) = grad_jast(1) + grady(jpoint) = grad_jast(2) + gradz(jpoint) = grad_jast(3) + enddo + elseif(j2e_type .eq. "Bump") then + + ! d/dx1 jbump(r1,r2) + + do jpoint = 1, n_points_extra_final_grid ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + call get_grad_j_bump_mu_of_r(r1,r2,grad_jast) + + dx = r1(1) - r2(1) + dy = r1(2) - r2(2) + dz = r1(3) - r2(3) + + r12 = dsqrt(dx * dx + dy * dy + dz * dz) + if(r12 .lt. 1d-10) then + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + cycle + endif + + tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12 + + gradx(jpoint) = 0.5d0 * tmp * dx + grady(jpoint) = 0.5d0 * tmp * dy + gradz(jpoint) = 0.5d0 * tmp * dz + gradx(jpoint) += 0.5d0 * grad_jast(1) + grady(jpoint) += 0.5d0 * grad_jast(2) + gradz(jpoint) += 0.5d0 * grad_jast(3) +! gradx(jpoint) = grad_jast(1) +! 
grady(jpoint) = grad_jast(2) +! gradz(jpoint) = grad_jast(3) enddo elseif(j2e_type .eq. "Boys") then @@ -264,7 +309,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) PROVIDE a_boys - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) @@ -273,17 +318,14 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) dx = r1(1) - r2(1) dy = r1(2) - r2(2) dz = r1(3) - r2(3) - r12 = dx * dx + dy * dy + dz * dz - + r12 = dsqrt(dx * dx + dy * dy + dz * dz) if(r12 .lt. 1d-10) then - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 cycle endif - r12 = dsqrt(r12) - tmp = 1.d0 + a_boys * r12 tmp = 0.5d0 / (r12 * tmp * tmp) @@ -294,13 +336,16 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) elseif(j2e_type .eq. "Boys_Handy") then - integer :: powmax - powmax = max(maxval(jBH_m),maxval(jBH_n)) - + integer :: powmax1, powmax, powmax2 double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) - allocate (f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) - do p=0,powmax + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax double_p(p) = dble(p) enddo @@ -318,11 +363,10 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) r2(2) = final_grid_points_extra(2,jpoint) r2(3) = final_grid_points_extra(3,jpoint) - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 - - do i_nucl = 1, nucl_num + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + do i_nucl = 1, nucl_num rn(1) = nucl_coord(i_nucl,1) rn(2) = nucl_coord(i_nucl,2) @@ -332,61 +376,59 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A) call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12) - ! Compute powers of f1A and f2A - - do p = 1, maxval(jBH_m(:,i_nucl)) + do p = 1, powmax1 f1A_power(p) = f1A_power(p-1) * f1A - enddo - - do p = 1, maxval(jBH_n(:,i_nucl)) f2A_power(p) = f2A_power(p-1) * f2A enddo - - do p = 1, maxval(jBH_o(:,i_nucl)) + do p = 1, powmax2 g12_power(p) = g12_power(p-1) * g12 enddo - - do p = 1, jBH_size mpA = jBH_m(p,i_nucl) npA = jBH_n(p,i_nucl) opA = jBH_o(p,i_nucl) tmp = jBH_c(p,i_nucl) - if(mpA .eq. npA) then - tmp = tmp * 0.5d0 - endif - -!TODO : Powers to optimize here - -! tmp1 = 0.d0 -! if(mpA .gt. 0) then -! tmp1 = tmp1 + dble(mpA) * f1A**(mpA-1) * f2A**npA -! endif -! if(npA .gt. 0) then -! tmp1 = tmp1 + dble(npA) * f1A**(npA-1) * f2A**mpA -! endif -! tmp1 = tmp1 * g12**(opA) -! -! tmp2 = 0.d0 -! if(opA .gt. 0) then -! tmp2 = tmp2 + dble(opA) * g12**(opA-1) * (f1A**(mpA) * f2A**(npA) + f1A**(npA) * f2A**(mpA)) -! endif tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) - tmp1 = tmp1 * g12_power(opA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp - tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) + !tmp1 = 0.d0 + !if(mpA .gt. 0) then + ! 
tmp1 = tmp1 + dble(mpA) * f1A**dble(mpA-1) * f2A**dble(npA) + !endif + !if(npA .gt. 0) then + ! tmp1 = tmp1 + dble(npA) * f1A**dble(npA-1) * f2A**dble(mpA) + !endif + !tmp1 = tmp1 * g12**dble(opA) + !tmp2 = 0.d0 + !if(opA .gt. 0) then + ! tmp2 = tmp2 + dble(opA) * g12**dble(opA-1) * (f1A**dble(mpA) * f2A**dble(npA) + f1A**dble(npA) * f2A**dble(mpA)) + !endif - - gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)) - grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)) - gradz(jpoint) = gradz(jpoint) + tmp * (tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)) +! gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)) +! grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)) +! gradz(jpoint) = gradz(jpoint) + tmp * (tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)) + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) enddo ! p enddo ! i_nucl enddo ! jpoint + elseif(j2e_type .eq. "Jpsi") then + double precision :: grad_j_psi_r1(3),jast_psi + do jpoint = 1, n_points_extra_final_grid ! r2 + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + call get_grad_r1_jastrow_psi(r1,r2,grad_j_psi_r1,jast_psi) + gradx(jpoint) = grad_j_psi_r1(1) + grady(jpoint) = grad_j_psi_r1(2) + gradz(jpoint) = grad_j_psi_r1(3) + enddo else print *, ' Error in grad1_j12_r1_seq: Unknown j2e_type = ', j2e_type @@ -418,10 +460,10 @@ subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz) integer :: jpoint double precision :: r2(3) - double precision :: dx, dy, dz, r12, r12_inv, tmp + double precision :: dx, dy, dz, r12, tmp - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) @@ -431,19 +473,15 @@ subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz) dy = r1(2) - r2(2) dz = r1(3) - r2(3) - r12 = dx * dx + dy * dy + dz * dz - - if(r12 .lt. 1d-20) then - gradx(jpoint) = 0.d0 - grady(jpoint) = 0.d0 - gradz(jpoint) = 0.d0 + r12 = dsqrt(dx * dx + dy * dy + dz * dz) + if(r12 .lt. 1d-10) then + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 cycle endif - r12_inv = 1.d0 / dsqrt(r12) - r12 = r12 * r12_inv - - tmp = 0.5d0 * (1.d0 - derf(mu * r12)) * r12_inv + tmp = 0.5d0 * (1.d0 - derf(mu * r12)) / r12 gradx(jpoint) = tmp * dx grady(jpoint) = tmp * dy @@ -467,7 +505,7 @@ subroutine j12_r1_seq(r1, n_grid2, res) integer :: jpoint double precision :: r2(3) double precision :: dx, dy, dz - double precision :: mu_tmp, r12, mu_erf_inv + double precision :: mu_tmp, r12 PROVIDE final_grid_points_extra @@ -475,21 +513,20 @@ subroutine j12_r1_seq(r1, n_grid2, res) PROVIDE mu_erf - mu_erf_inv = 1.d0 / mu_erf - do jpoint = 1, n_points_extra_final_grid ! r2 - + do jpoint = 1, n_points_extra_final_grid ! 
r2 + r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) r2(3) = final_grid_points_extra(3,jpoint) - + dx = r1(1) - r2(1) dy = r1(2) - r2(2) dz = r1(3) - r2(3) r12 = dsqrt(dx * dx + dy * dy + dz * dz) mu_tmp = mu_erf * r12 - - res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) * mu_erf_inv + + res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_erf enddo elseif(j2e_type .eq. "Boys") then @@ -498,7 +535,7 @@ subroutine j12_r1_seq(r1, n_grid2, res) PROVIDE a_boys - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) @@ -540,19 +577,19 @@ subroutine jmu_r1_seq(mu, r1, n_grid2, res) tmp1 = inv_sq_pi_2 / mu - do jpoint = 1, n_points_extra_final_grid ! r2 - + do jpoint = 1, n_points_extra_final_grid ! r2 + r2(1) = final_grid_points_extra(1,jpoint) r2(2) = final_grid_points_extra(2,jpoint) r2(3) = final_grid_points_extra(3,jpoint) - + dx = r1(1) - r2(1) dy = r1(2) - r2(2) dz = r1(3) - r2(3) r12 = dsqrt(dx * dx + dy * dy + dz * dz) tmp2 = mu * r12 - + res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(tmp2)) - tmp1 * dexp(-tmp2*tmp2) enddo @@ -579,7 +616,7 @@ subroutine env_nucl_r1_seq(n_grid2, res) res = 1.d0 - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r(1) = final_grid_points_extra(1,jpoint) r(2) = final_grid_points_extra(2,jpoint) r(3) = final_grid_points_extra(3,jpoint) @@ -598,7 +635,7 @@ subroutine env_nucl_r1_seq(n_grid2, res) res = 1.d0 - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r(1) = final_grid_points_extra(1,jpoint) r(2) = final_grid_points_extra(2,jpoint) r(3) = final_grid_points_extra(3,jpoint) @@ -618,7 +655,7 @@ subroutine env_nucl_r1_seq(n_grid2, res) res = 1.d0 - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r(1) = final_grid_points_extra(1,jpoint) r(2) = final_grid_points_extra(2,jpoint) r(3) = final_grid_points_extra(3,jpoint) @@ -636,7 +673,7 @@ subroutine env_nucl_r1_seq(n_grid2, res) res = 1.d0 - do jpoint = 1, n_points_extra_final_grid ! r2 + do jpoint = 1, n_points_extra_final_grid ! r2 r(1) = final_grid_points_extra(1,jpoint) r(2) = final_grid_points_extra(2,jpoint) r(3) = final_grid_points_extra(3,jpoint) @@ -666,7 +703,7 @@ end subroutine get_grad1_u12_2e_r1_seq(ipoint, n_grid2, resx, resy, resz) BEGIN_DOC - ! + ! ! grad_1 u_2e(r1,r2) ! ! we use grid for r1 and extra_grid for r2 @@ -695,8 +732,12 @@ subroutine get_grad1_u12_2e_r1_seq(ipoint, n_grid2, resx, resy, resz) r1(2) = final_grid_points(2,ipoint) r1(3) = final_grid_points(3,ipoint) - if( (j2e_type .eq. "Mu") .or. & - (j2e_type .eq. "Mur") .or. & + if( (j2e_type .eq. "Mu") .or. & + (j2e_type .eq. "Mugauss") .or. & + (j2e_type .eq. "Mur") .or. & + (j2e_type .eq. "Jpsi") .or. & + (j2e_type .eq. "Murgauss") .or. & + (j2e_type .eq. "Bump") .or. & (j2e_type .eq. "Boys") ) then if(env_type .eq. "None") then @@ -786,7 +827,7 @@ end subroutine get_u12_2e_r1_seq(ipoint, n_grid2, res) BEGIN_DOC - ! + ! ! u_2e(r1,r2) ! ! we use grid for r1 and extra_grid for r2 @@ -813,8 +854,11 @@ subroutine get_u12_2e_r1_seq(ipoint, n_grid2, res) r1(2) = final_grid_points(2,ipoint) r1(3) = final_grid_points(3,ipoint) - if( (j2e_type .eq. "Mu") .or. & - (j2e_type .eq. "Mur") .or. & + if( (j2e_type .eq. "Mu") .or. & + (j2e_type .eq. "Mur") .or. & + (j2e_type .eq. 
"Mugauss") .or. & + (j2e_type .eq. "Murgauss") .or. & + (j2e_type .eq. "Mugauss") .or. & (j2e_type .eq. "Boys") ) then if(env_type .eq. "None") then @@ -893,23 +937,24 @@ subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct) + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + (r1(3) - r2(3)) * (r1(3) - r2(3)) ) - tmp1 = 1.d0 / (1.d0 + alpha * dist) - fct = alpha * dist * tmp1 - - if(dist .lt. 1d-10) then - grad1_fct(1) = 0.d0 - grad1_fct(2) = 0.d0 - grad1_fct(3) = 0.d0 - else + if(dist .ge. 1d-10) then + tmp1 = 1.d0 / (1.d0 + alpha * dist) + + fct = alpha * dist * tmp1 tmp2 = alpha * tmp1 * tmp1 / dist grad1_fct(1) = tmp2 * (r1(1) - r2(1)) grad1_fct(2) = tmp2 * (r1(2) - r2(2)) grad1_fct(3) = tmp2 * (r1(3) - r2(3)) + else + grad1_fct(1) = 0.d0 + grad1_fct(2) = 0.d0 + grad1_fct(3) = 0.d0 + fct = 0.d0 endif return -end +end ! --- diff --git a/plugins/local/non_h_ints_mu/jastrow_psi.irp.f b/plugins/local/non_h_ints_mu/jastrow_psi.irp.f new file mode 100644 index 00000000..4c88e793 --- /dev/null +++ b/plugins/local/non_h_ints_mu/jastrow_psi.irp.f @@ -0,0 +1,124 @@ +BEGIN_PROVIDER [ double precision, c_ij_ab_jastrow, (mo_num, mo_num, elec_alpha_num, elec_beta_num)] + implicit none + integer :: iunit, getUnitAndOpen + c_ij_ab_jastrow = 0.d0 + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'c_ij_ab', 'R') + read(iunit) c_ij_ab_jastrow + close(iunit) + print*,'c_ij_ab_jastrow = ' + integer :: i,j,a,b + do i = 1, elec_beta_num ! r2 + do j = 1, elec_alpha_num ! r1 + do a = elec_beta_num+1, mo_num ! r2 + do b = elec_alpha_num+1, mo_num ! r1 +! print*,b,a,j,i + print*,c_ij_ab_jastrow(b,a,j,i),b,a,j,i + if(dabs(c_ij_ab_jastrow(b,a,j,i)).lt.1.d-12)then + c_ij_ab_jastrow(b,a,j,i) = 0.d0 + endif + enddo + enddo + enddo + enddo +END_PROVIDER + +double precision function jastrow_psi(r1,r2) + implicit none + double precision, intent(in) :: r1(3), r2(3) + integer :: i,j,a,b + double precision, allocatable :: mos_array_r1(:), mos_array_r2(:) + allocate(mos_array_r1(mo_num), mos_array_r2(mo_num)) + call give_all_mos_at_r(r1,mos_array_r1) + call give_all_mos_at_r(r2,mos_array_r2) + double precision :: eps,coef, numerator,denominator + double precision :: phi_i_phi_j + eps = a_boys + jastrow_psi= 0.d0 + do i = 1, elec_beta_num ! r1 + do j = 1, elec_alpha_num ! r2 + phi_i_phi_j = mos_array_r1(i) * mos_array_r2(j) + eps + denominator = 1.d0/phi_i_phi_j + do a = elec_beta_num+1, mo_num ! r1 + do b = elec_alpha_num+1, mo_num ! r2 + coef = c_ij_ab_jastrow(b,a,j,i) + numerator = mos_array_r2(b) * mos_array_r1(a) + jastrow_psi += coef * numerator*denominator + enddo + enddo + enddo + enddo +end + +subroutine get_grad_r1_jastrow_psi(r1,r2,grad_j_psi_r1,jast) + implicit none + double precision, intent(in) :: r1(3), r2(3) + double precision, intent(out):: grad_j_psi_r1(3),jast + integer :: i,j,a,b + double precision, allocatable :: mos_array_r1(:), mos_array_r2(:) + double precision, allocatable :: mos_grad_array_r1(:,:),mos_grad_array_r2(:,:) + double precision :: num_j, denom_j, num_j_grad(3), denom_j_grad(3),delta,coef + double precision :: inv_denom_j + allocate(mos_array_r1(mo_num), mos_array_r2(mo_num)) + allocate(mos_grad_array_r1(3,mo_num), mos_grad_array_r2(3,mo_num)) + delta = a_boys + call give_all_mos_and_grad_at_r(r1,mos_array_r1,mos_grad_array_r1) + call give_all_mos_and_grad_at_r(r2,mos_array_r2,mos_grad_array_r2) + grad_j_psi_r1 = 0.d0 + jast = 0.d0 + do i = 1, elec_beta_num ! r1 + do j = 1, elec_alpha_num ! 
r2 + call denom_jpsi(i,j,delta,mos_array_r1,mos_grad_array_r1,mos_array_r2,denom_j, denom_j_grad) + inv_denom_j = 1.d0/denom_j + do a = elec_beta_num+1, mo_num ! r1 + do b = elec_alpha_num+1, mo_num ! r2 + call numerator_psi(a,b,mos_array_r1,mos_grad_array_r1,mos_array_r2,num_j, num_j_grad) + coef = c_ij_ab_jastrow(b,a,j,i) + jast += coef * num_j * inv_denom_j + grad_j_psi_r1 += coef * (num_j_grad * denom_j - num_j * denom_j_grad) * inv_denom_j * inv_denom_j + enddo + enddo + enddo + enddo + if(jast.lt.-1.d0.or.dabs(jast).gt.1.d0)then + print*,'pb ! ' + print*,jast + print*,dsqrt(r1(1)**2+r1(2)**2+r1(3)**2),dsqrt(r2(1)**2+r2(2)**2+r2(3)**2) + print*,r1 +! print*,mos_array_r1(1:2) + print*,r2 +! print*,mos_array_r2(1:2) + stop + endif + if(log_jpsi)then + grad_j_psi_r1 = grad_j_psi_r1/(1.d0 + jast) + endif + +end + + +subroutine denom_jpsi(i,j,delta,mos_array_r1,mos_grad_array_r1,mos_array_r2,denom, grad_denom) + implicit none + integer, intent(in) :: i,j + double precision, intent(in) :: mos_array_r1(mo_num),mos_grad_array_r1(3,mo_num),mos_array_r2(mo_num),delta + double precision, intent(out) :: denom, grad_denom(3) + double precision :: coef,phi_i_phi_j,inv_phi_i_phi_j,inv_phi_i_phi_j_2 + phi_i_phi_j = mos_array_r1(i) * mos_array_r2(j) + if(phi_i_phi_j /= 0.d0)then + inv_phi_i_phi_j = 1.d0/phi_i_phi_j + inv_phi_i_phi_j_2 = 1.d0/(phi_i_phi_j * phi_i_phi_j) + else + inv_phi_i_phi_j = huge(1.0) + inv_phi_i_phi_j_2 = huge(1.d0) + endif + denom = phi_i_phi_j + delta * inv_phi_i_phi_j + grad_denom(:) = (1.d0 - delta*inv_phi_i_phi_j_2) * mos_array_r2(j) * mos_grad_array_r1(:,i) +end + +subroutine numerator_psi(a,b,mos_array_r1,mos_grad_array_r1,mos_array_r2,num, grad_num) + implicit none + integer, intent(in) :: a,b + double precision, intent(in) :: mos_array_r1(mo_num),mos_grad_array_r1(3,mo_num),mos_array_r2(mo_num) + double precision, intent(out) :: num, grad_num(3) + num = mos_array_r1(a) * mos_array_r2(b) + grad_num(:) = mos_array_r2(b) * mos_grad_array_r1(:,a) +end diff --git a/plugins/local/non_h_ints_mu/mu_of_r.irp.f b/plugins/local/non_h_ints_mu/mu_of_r.irp.f new file mode 100644 index 00000000..e9cf6c4a --- /dev/null +++ b/plugins/local/non_h_ints_mu/mu_of_r.irp.f @@ -0,0 +1,43 @@ + +subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm) + implicit none + BEGIN_DOC + ! returns the value and gradients of the mu(r) mean field, together with the HF density and its gradients. 
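+ !
+ ! mu_of_r_tc == "Standard"   : mu_mf(r) = sqrt(pi)/2 * f_mf_ab(r) / two_bod_dens(r)
+ ! mu_of_r_tc == "Erfmu"      : mu_mf(r) = sqrt(pi)/3 * ( f_mf_ab(r)/two_bod_dens(r) + 1/4 )
+ ! mu_of_r_tc == "Erfmugauss" : mu_mf(r) = ( f_mf_ab(r)/two_bod_dens(r) + 1/4 ) / c_mu_gauss_tot
+ !
+ ! f_mf_ab and two_bod_dens are the mean-field quantities returned by get_grad_f_mf_ab;
+ ! when |two_bod_dens(r)| < 1d-10 the routine returns mu_mf = 1d+10 with a zero gradient.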
+ END_DOC + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: grad_mu_mf(3), grad_dm(3) + double precision, intent(out):: mu_mf, dm + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + + double precision :: dist + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + + dm = dm_a + dm_b + grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3) + + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + grad_mu_mf = 0.d0 + else + if(mu_of_r_tc=="Erfmu")then + mu_mf = 0.3333333333d0 * sqpi * (f_mf_ab/two_bod_dens + 0.25d0) + grad_mu_mf(1:3) = 0.3333333333d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& + /(two_bod_dens*two_bod_dens) + else if(mu_of_r_tc=="Standard")then + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& + /(two_bod_dens*two_bod_dens) + else if(mu_of_r_tc=="Erfmugauss")then + mu_mf = (f_mf_ab/two_bod_dens + 0.25d0)/c_mu_gauss_tot + grad_mu_mf(1:3) = 1.d0/c_mu_gauss_tot* (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& + /(two_bod_dens*two_bod_dens) + else + print*,'Wrong value for mu_of_r_tc !' + stop + endif + endif + +end + diff --git a/plugins/local/non_h_ints_mu/mu_of_r_mean_field.irp.f b/plugins/local/non_h_ints_mu/mu_of_r_mean_field.irp.f new file mode 100644 index 00000000..295d58c2 --- /dev/null +++ b/plugins/local/non_h_ints_mu/mu_of_r_mean_field.irp.f @@ -0,0 +1,166 @@ +BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)] + implicit none + integer :: i,j,k,l + double precision :: get_two_e_integral + do i = 1, elec_alpha_num + do j = 1, elec_beta_num + do k = 1, elec_alpha_num + do l = 1, elec_beta_num + two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map) + enddo + enddo + enddo + enddo +END_PROVIDER + +subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b + double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:) + integer :: i,j,k,l + allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_alpha_num)) + call give_all_mos_at_r(r,mos_array_r) + do i = 1, elec_alpha_num + mos_array_a(i) = mos_array_r(i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + enddo + + dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + enddo + + dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + enddo + two_bod_dens = dm_a * dm_b + + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + do j = 1, elec_beta_num + do k = 1, elec_alpha_num + do l = 1, elec_beta_num + f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l) + enddo + enddo + enddo + enddo + ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + +end + +subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + implicit none + BEGIN_DOC + ! gradient of mu(r) mean field, together with the gradient of the one- and two-body HF density. 
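+ !
+ ! f_mf_ab(r)      = 2 * sum over i,k (occupied alpha) and j,l (occupied beta) of
+ !                   two_e_int_mf(l,k,j,i) * phi_i(r) * phi_k(r) * phi_j(r) * phi_l(r)
+ ! two_bod_dens(r) = 2 * dm_a(r) * dm_b(r)
+ !
+ ! the factor 2 adapts both quantities to the N(N-1) normalization of the active two-rdm,
+ ! as in get_f_mf_ab; all gradients are obtained by the product rule from the MO gradients.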
+ END_DOC + double precision, intent(in) :: r(3) + double precision, intent(out) :: f_mf_ab, two_bod_dens + double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3) + double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3) + + double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:) + double precision, allocatable :: mos_array_a(:), mos_array_b(:) + double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:) + double precision :: mo_i, mo_j, mo_k, mo_l + double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3) + + integer :: i,j,k,l + allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num)) + allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num)) + allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num)) + call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r) + do i = 1, elec_alpha_num + mos_array_a(i) = mos_array_r(i) + mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i) + enddo + + ! ALPHA DENSITY AND GRADIENT + dm_a = 0.d0 + grad_dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i) + enddo + + ! BETA DENSITY AND GRADIENT + dm_b = 0.d0 + grad_dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i) + enddo + ! TWO-BODY DENSITY AND GRADIENT + two_bod_dens = dm_a * dm_b + grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3) + + ! F_MF and GRADIENT + grad_f_mf_ab = 0.d0 + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + mo_i = mos_array_a(i) + grad_mo_i(1:3) = mos_grad_array_a(1:3,i) + do j = 1, elec_beta_num + mo_j = mos_array_b(j) + grad_mo_j(1:3) = mos_grad_array_b(1:3,j) + do k = 1, elec_alpha_num + mo_k = mos_array_a(k) + grad_mo_k(1:3) = mos_grad_array_a(1:3,k) + do l = 1, elec_beta_num + mo_l = mos_array_b(l) + grad_mo_l(1:3) = mos_grad_array_b(1:3,l) + f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l + grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & + (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & + +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l) + enddo + enddo + enddo + enddo + + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + grad_f_mf_ab *= 2.D0 + grad_two_bod_dens *= 2.d0 +end + +subroutine mu_of_r_mean_field(r,mu_mf, dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: mu_mf, dm + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + dm = dm_a + dm_b + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + endif +end + +subroutine mu_of_r_mean_field_tc(r,mu_mf, dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: mu_mf, dm + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + dm = dm_a + dm_b + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + else + mu_mf = 0.3333333333d0 * sqpi * (f_mf_ab/two_bod_dens + 0.25d0) + endif +end + diff --git a/plugins/local/non_h_ints_mu/numerical_integ.irp.f b/plugins/local/non_h_ints_mu/numerical_integ.irp.f index 5436b857..2737774a 
100644 --- a/plugins/local/non_h_ints_mu/numerical_integ.irp.f +++ b/plugins/local/non_h_ints_mu/numerical_integ.irp.f @@ -179,7 +179,7 @@ double precision function num_v_ij_erf_rk_cst_mu_env(i, j, ipoint) dx = r1(1) - r2(1) dy = r1(2) - r2(2) dz = r1(3) - r2(3) - r12 = dsqrt( dx * dx + dy * dy + dz * dz ) + r12 = dsqrt(dx*dx + dy*dy + dz*dz) if(r12 .lt. 1d-10) cycle tmp1 = (derf(mu_erf * r12) - 1.d0) / r12 @@ -228,7 +228,7 @@ subroutine num_x_v_ij_erf_rk_cst_mu_env(i, j, ipoint, integ) dx = r1(1) - r2(1) dy = r1(2) - r2(2) dz = r1(3) - r2(3) - r12 = dsqrt( dx * dx + dy * dy + dz * dz ) + r12 = dsqrt(dx*dx + dy*dy + dz*dz) if(r12 .lt. 1d-10) cycle tmp1 = (derf(mu_erf * r12) - 1.d0) / r12 @@ -530,7 +530,7 @@ subroutine num_int2_u_grad1u_total_env2(i, j, ipoint, integ) dx = r1(1) - r2(1) dy = r1(2) - r2(2) dz = r1(3) - r2(3) - r12 = dsqrt( dx * dx + dy * dy + dz * dz ) + r12 = dsqrt(dx*dx + dy*dy + dz*dz) if(r12 .lt. 1d-10) cycle tmp0 = env_nucl(r2) diff --git a/plugins/local/non_h_ints_mu/plot_j_gauss.irp.f b/plugins/local/non_h_ints_mu/plot_j_gauss.irp.f new file mode 100644 index 00000000..a4030d8c --- /dev/null +++ b/plugins/local/non_h_ints_mu/plot_j_gauss.irp.f @@ -0,0 +1,59 @@ +program plot_j_gauss + implicit none + double precision :: xmin, xmax, x, dx + double precision :: mu_min, mu_max, mu, d_mu + double precision :: pot_j_gauss,j_mu_simple,j_gauss_simple,pot_j_mu + double precision, allocatable :: mu_tab(:),j_mu(:),j_mu_gauss(:) + double precision, allocatable :: w_mu(:), w_mu_gauss(:) + + character*(128) :: output + integer :: getUnitAndOpen + integer :: i_unit_output_wee_gauss,i_unit_output_wee_mu + integer :: i_unit_output_j_gauss,i_unit_output_j_mu + output=trim(ezfio_filename)//'.w_ee_mu_gauss' + i_unit_output_wee_gauss = getUnitAndOpen(output,'w') + output=trim(ezfio_filename)//'.w_ee_mu' + i_unit_output_wee_mu = getUnitAndOpen(output,'w') + output=trim(ezfio_filename)//'.j_mu_gauss' + i_unit_output_j_gauss = getUnitAndOpen(output,'w') + output=trim(ezfio_filename)//'.j_mu' + i_unit_output_j_mu = getUnitAndOpen(output,'w') + + integer :: npt, i, j, n_mu + n_mu = 3 + allocate(mu_tab(n_mu),j_mu(n_mu),j_mu_gauss(n_mu),w_mu(n_mu), w_mu_gauss(n_mu)) + mu_min = 0.5d0 + mu_max = 2.d0 + d_mu = (mu_max - mu_min)/dble(n_mu) + mu = mu_min + do i = 1, n_mu + mu_tab(i) = mu + print*,'mu = ',mu + mu += d_mu + enddo + mu_tab(1) = 0.9d0 + mu_tab(2) = 0.95d0 + mu_tab(3) = 1.d0 + + xmin = 0.01d0 + xmax = 10.d0 + npt = 1000 + dx = (xmax - xmin)/dble(npt) + x = xmin + do i = 1, npt + do j = 1, n_mu + mu = mu_tab(j) + w_mu_gauss(j) = pot_j_gauss(x,mu) + w_mu(j) = pot_j_mu(x,mu) + j_mu(j) = j_mu_simple(x,mu) + j_mu_gauss(j) = j_gauss_simple(x,mu) + j_mu(j) + enddo + write(i_unit_output_wee_gauss,'(100(F16.10,X))')x,w_mu_gauss(:) + write(i_unit_output_wee_mu,'(100(F16.10,X))')x,w_mu(:) + write(i_unit_output_j_gauss,'(100(F16.10,X))')x,j_mu_gauss(:) + write(i_unit_output_j_mu,'(100(F16.10,X))')x,j_mu(:) + x += dx + enddo + + +end diff --git a/plugins/local/non_h_ints_mu/plot_mo.irp.f b/plugins/local/non_h_ints_mu/plot_mo.irp.f new file mode 100644 index 00000000..e1ecc783 --- /dev/null +++ b/plugins/local/non_h_ints_mu/plot_mo.irp.f @@ -0,0 +1,19 @@ +program plot_mo + implicit none + integer :: i,npt + double precision :: xmin,xmax,dx,r(3) + double precision,allocatable :: mos_array(:) + allocate(mos_array(mo_num)) + npt = 10000 + xmin =0.d0 + xmax =10.d0 + dx=(xmax-xmin)/dble(npt) + r=0.d0 + r(1) = xmin + do i = 1, npt + call give_all_mos_at_r(r,mos_array) + 
write(33,'(100(F16.10,X))')r(1),mos_array(1),mos_array(2),mos_array(3) + r(1) += dx + enddo + +end diff --git a/plugins/local/non_h_ints_mu/plot_mu_of_r.irp.f b/plugins/local/non_h_ints_mu/plot_mu_of_r.irp.f index 3a5984bd..4a3ec0d5 100644 --- a/plugins/local/non_h_ints_mu/plot_mu_of_r.irp.f +++ b/plugins/local/non_h_ints_mu/plot_mu_of_r.irp.f @@ -16,15 +16,16 @@ subroutine routine_print integer :: ipoint,nx,i double precision :: xmax,xmin,r(3),dx,sigma double precision :: mu_val, mu_der(3),dm_a,dm_b,grad,grad_dm_a(3), grad_dm_b(3) - xmax = 5.D0 - xmin = -5.D0 + xmax = 3.9D0 + xmin = -3.9D0 nx = 10000 dx = (xmax - xmin)/dble(nx) r = 0.d0 r(1) = xmin do ipoint = 1, nx - call mu_r_val_and_grad(r, r, mu_val, mu_der) - call density_and_grad_alpha_beta(r,dm_a,dm_b, grad_dm_a, grad_dm_b) +! call mu_r_val_and_grad(r, r, mu_val, mu_der) + call grad_mu_of_r_mean_field(r,mu_val, dm_a, mu_der, grad_dm_a) +! call density_and_grad_alpha_beta(r,dm_a,dm_b, grad_dm_a, grad_dm_b) sigma = 0.d0 do i = 1,3 sigma += grad_dm_a(i)**2 @@ -32,7 +33,8 @@ subroutine routine_print sigma=dsqrt(sigma) grad = mu_der(1)**2 + mu_der(2)**2 + mu_der(3)**2 grad = dsqrt(grad) - write(i_unit_output,'(100(F16.7,X))')r(1),mu_val,dm_a+dm_b,grad,sigma/dm_a + print*,r(1),mu_val + write(i_unit_output,'(100(F16.7,X))')r(1),mu_val,dm_a,grad,sigma/dm_a r(1) += dx enddo end diff --git a/plugins/local/non_h_ints_mu/pot_j_gauss.irp.f b/plugins/local/non_h_ints_mu/pot_j_gauss.irp.f new file mode 100644 index 00000000..f9a0a7bc --- /dev/null +++ b/plugins/local/non_h_ints_mu/pot_j_gauss.irp.f @@ -0,0 +1,146 @@ +double precision function j_simple(x,mu) + implicit none + double precision, intent(in) :: x,mu + double precision :: j_mu_simple,j_gauss_simple + if(j2e_type .eq. "Mu".or.j2e_type .eq. "Mur") then + j_simple = j_mu_simple(x,mu) + else if(j2e_type .eq. "Mugauss".or.j2e_type .eq. "Murgauss") then + j_simple = j_gauss_simple(x,mu) + j_mu_simple(x,mu) + endif +end + + +double precision function j_mu_simple(x,mu) + implicit none + double precision, intent(in):: x,mu + include 'constants.include.F' + BEGIN_DOC +! j_mu(mu,x) = 0.5 x (1 - erf(mu x)) - 1/[2 sqrt(pi)mu] exp(-(x*mu)^2) + END_DOC + j_mu_simple = 0.5d0 * x * (1.D0 - derf(mu*x)) - 0.5d0 * inv_sq_pi/mu * dexp(-x*mu*x*mu) + +end + +double precision function j_gauss_simple(x,mu) + implicit none + double precision, intent(in):: x,mu + include 'constants.include.F' + BEGIN_DOC +! j_mu(mu,x) = c/[4 alpha^2 mu] exp(-(alpha * mu * x)^2) +! with c = 27/(8 sqrt(pi)), alpha=3/2 + END_DOC + double precision :: x_tmp + x_tmp = alpha_mu_gauss * mu * x + j_gauss_simple = 0.25d0 * c_mu_gauss / (alpha_mu_gauss*alpha_mu_gauss*mu) * dexp(-x_tmp*x_tmp) + +end + +double precision function j_mu_deriv(x,mu) + implicit none + BEGIN_DOC +! d/dx j_mu(mu,x) = d/dx 0.5 x (1 - erf(mu x)) - 1/[2 sqrt(pi)mu] exp(-(x*mu)^2) +! = 0.5*(1 - erf(mu x)) + END_DOC + include 'constants.include.F' + double precision, intent(in) :: x,mu + j_mu_deriv = 0.5d0 * (1.d0 - derf(mu*x)) +end + +double precision function j_mu_deriv_2(x,mu) + implicit none + BEGIN_DOC +! d^2/dx^2 j_mu(mu,x) = d^2/dx^2 0.5 x (1 - erf(mu x)) - 1/[2 sqrt(pi)mu] exp(-(x*mu)^2) +! = -mu/sqrt(pi) * exp(-(mu x)^2) + END_DOC + include 'constants.include.F' + double precision, intent(in) :: x,mu + j_mu_deriv_2 = - mu * inv_sq_pi * dexp(-x*mu*x*mu) +end + +double precision function j_gauss_deriv(x,mu) + implicit none + include 'constants.include.F' + double precision, intent(in) :: x,mu + BEGIN_DOC +! 
d/dx j_gauss(mu,x) = d/dx c/[4 alpha^2 mu] exp(-(alpha * mu * x)^2) +! with c = 27/(8 sqrt(pi)), alpha=3/2 +! = -0.5 * mu * c * x * exp(-(alpha * mu * x)^2) + END_DOC + double precision :: x_tmp + x_tmp = alpha_mu_gauss * mu * x + j_gauss_deriv = -0.5d0 * mu * c_mu_gauss * x * exp(-x_tmp*x_tmp) +end + +double precision function j_gauss_deriv_2(x,mu) + implicit none + include 'constants.include.F' + double precision, intent(in) :: x,mu + BEGIN_DOC +! d/dx j_gauss(mu,x) = d/dx c/[4 alpha^2 mu] exp(-(alpha * mu * x)^2) +! with c = 27/(8 sqrt(pi)), alpha=3/2 +! = 0.5 * mu * c * exp(-(alpha * mu * x)^2) * (2 (alpha*mu*x)^2 - 1) + END_DOC + double precision :: x_tmp + x_tmp = alpha_mu_gauss * mu * x + x_tmp = x_tmp * x_tmp + j_gauss_deriv_2 = 0.5d0 * mu * c_mu_gauss * exp(-x_tmp) * (2.d0*x_tmp - 1.d0) +end + +double precision function j_erf_gauss_deriv(x,mu) + implicit none + double precision, intent(in) :: x,mu + BEGIN_DOC +! d/dx (j_gauss(mu,x)+j_mu(mu,x)) + END_DOC + double precision :: j_gauss_deriv,j_mu_deriv + j_erf_gauss_deriv = j_gauss_deriv(x,mu)+j_mu_deriv(x,mu) +end + +double precision function j_erf_gauss_deriv_2(x,mu) + implicit none + double precision, intent(in) :: x,mu + BEGIN_DOC +! d^2/dx^2 (j_gauss(mu,x)+j_mu(mu,x)) + END_DOC + double precision :: j_gauss_deriv_2,j_mu_deriv_2 + j_erf_gauss_deriv_2 = j_gauss_deriv_2(x,mu)+j_mu_deriv_2(x,mu) +end + + +double precision function pot_j_gauss(x,mu) + implicit none + double precision, intent(in) :: x,mu + BEGIN_DOC + ! effective scalar potential associated with the erf_gauss correlation factor + ! + ! 1/x( 1 - 2 * d/dx j_erf_gauss(x,mu)) - d^2/dx^2 j_erf_gauss(x,mu)) - d/dx d/dx (j_erf_gauss(x,mu))^2 + END_DOC + double precision :: j_erf_gauss_deriv_2,j_erf_gauss_deriv + double precision :: deriv_1, deriv_2 + pot_j_gauss = 0.d0 + if(x.ne.0.d0)then + deriv_1 = j_erf_gauss_deriv(x,mu) + deriv_2 = j_erf_gauss_deriv_2(x,mu) + pot_j_gauss = 1.d0/x * (1.d0 - 2.d0 * deriv_1) - deriv_1 * deriv_1 - deriv_2 + endif + +end + +double precision function pot_j_mu(x,mu) + implicit none + double precision, intent(in) :: x,mu + BEGIN_DOC + ! effective scalar potential associated with the correlation factor + ! + ! 1/x( 1 - 2 * d/dx j_erf(x,mu)) - d^2/dx^2 j_erf(x,mu)) - d/dx d/dx (j_erf(x,mu))^2 + END_DOC + double precision :: j_mu_deriv_2,j_mu_deriv + double precision :: deriv_1, deriv_2 + pot_j_mu = 0.d0 + if(x.ne.0.d0)then + deriv_1 = j_mu_deriv(x,mu) + deriv_2 = j_mu_deriv_2(x,mu) + pot_j_mu= 1.d0/x * (1.d0 - 2.d0 * deriv_1) - deriv_1 * deriv_1 - deriv_2 + endif + +end diff --git a/plugins/local/non_h_ints_mu/print_jastrow_psi.irp.f b/plugins/local/non_h_ints_mu/print_jastrow_psi.irp.f new file mode 100644 index 00000000..740743cb --- /dev/null +++ b/plugins/local/non_h_ints_mu/print_jastrow_psi.irp.f @@ -0,0 +1,15 @@ +program print_j_psi + implicit none + integer :: i,j,a,b + do i = 1, elec_beta_num ! r2 + do j = 1, elec_alpha_num ! r1 + do a = elec_beta_num+1, mo_num ! r2 + do b = elec_alpha_num+1, mo_num ! 
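A minimal, self-contained cross-check of the analytic derivatives defined in pot_j_gauss.irp.f above: it compares d/dx [ j_mu(mu,x) + j_gauss(mu,x) ] with a central finite difference, taking c = 27/(8 sqrt(pi)) and alpha = 3/2 from the BEGIN_DOC comments. All other names and values are illustrative sketch material, not part of the plugin.

program check_j_erf_gauss_deriv
  implicit none
  double precision, parameter :: sq_pi   = 1.7724538509055160d0
  double precision, parameter :: c_gauss = 27.d0 / (8.d0 * sq_pi)  ! c     of the BEGIN_DOC above
  double precision, parameter :: alpha   = 1.5d0                   ! alpha of the BEGIN_DOC above
  double precision :: x, mu, h, ana, num
  integer :: i
  mu = 0.95d0
  h  = 1.d-5
  do i = 1, 5
    x   = 0.5d0 * dble(i)
    ! analytic form : j_mu_deriv(x,mu) + j_gauss_deriv(x,mu)
    ana = 0.5d0 * (1.d0 - derf(mu*x)) - 0.5d0 * mu * c_gauss * x * dexp(-(alpha*mu*x)**2)
    ! central finite difference of j_mu_simple(x,mu) + j_gauss_simple(x,mu)
    num = (jtot(x+h) - jtot(x-h)) / (2.d0*h)
    print '(3(F16.10,X))', x, ana, num
  enddo
contains
  double precision function jtot(y)
    double precision, intent(in) :: y
    jtot = 0.5d0 * y * (1.d0 - derf(mu*y)) - dexp(-(mu*y)**2) / (2.d0*sq_pi*mu) &
         + 0.25d0 * c_gauss / (alpha*alpha*mu) * dexp(-(alpha*mu*y)**2)
  end function jtot
end program check_j_erf_gauss_deriv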
r1 + print*,b,a,j,i + print*,c_ij_ab_jastrow(b,a,j,i) + enddo + enddo + enddo + enddo + +end diff --git a/plugins/local/non_h_ints_mu/qmckl.irp.f b/plugins/local/non_h_ints_mu/qmckl.irp.f index 1df80457..de440f14 100644 --- a/plugins/local/non_h_ints_mu/qmckl.irp.f +++ b/plugins/local/non_h_ints_mu/qmckl.irp.f @@ -75,3 +75,107 @@ BEGIN_PROVIDER [ integer*8, qmckl_ctx_jastrow ] endif END_PROVIDER + + + BEGIN_PROVIDER [ double precision, aos_in_r_array_qmckl, (ao_num,n_points_final_grid)] +&BEGIN_PROVIDER [ double precision, aos_grad_in_r_array_qmckl, (ao_num,n_points_final_grid,3)] +&BEGIN_PROVIDER [ double precision, aos_lapl_in_r_array_qmckl, (ao_num, n_points_final_grid)] + implicit none + BEGIN_DOC + ! AOS computed with qmckl + END_DOC + use qmckl + + integer*8 :: qmckl_ctx + integer(qmckl_exit_code) :: rc + + qmckl_ctx = qmckl_context_create() + + rc = qmckl_trexio_read(qmckl_ctx, trexio_file, 1_8*len(trim(trexio_filename))) + if (rc /= QMCKL_SUCCESS) then + print *, irp_here, 'qmckl error in read_trexio' + rc = qmckl_check(qmckl_ctx, rc) + stop -1 + endif + + rc = qmckl_set_point(qmckl_ctx, 'N', n_points_final_grid*1_8, final_grid_points, n_points_final_grid*3_8) + if (rc /= QMCKL_SUCCESS) then + print *, irp_here, 'qmckl error in set_electron_point' + rc = qmckl_check(qmckl_ctx, rc) + stop -1 + endif + + double precision, allocatable :: vgl(:,:,:) + allocate( vgl(ao_num,5,n_points_final_grid)) + rc = qmckl_get_ao_basis_ao_vgl_inplace(qmckl_ctx, vgl, n_points_final_grid*ao_num*5_8) + if (rc /= QMCKL_SUCCESS) then + print *, irp_here, 'qmckl error in get_ao_vgl' + rc = qmckl_check(qmckl_ctx, rc) + stop -1 + endif + + integer :: i,k + do k=1,n_points_final_grid + do i=1,ao_num + aos_in_r_array_qmckl(i,k) = vgl(i,1,k) + aos_grad_in_r_array_qmckl(i,k,1) = vgl(i,2,k) + aos_grad_in_r_array_qmckl(i,k,2) = vgl(i,3,k) + aos_grad_in_r_array_qmckl(i,k,3) = vgl(i,4,k) + aos_lapl_in_r_array_qmckl(i,k) = vgl(i,5,k) + enddo + enddo + +END_PROVIDER + + + BEGIN_PROVIDER [ double precision, mos_in_r_array_qmckl, (mo_num,n_points_final_grid)] +&BEGIN_PROVIDER [ double precision, mos_grad_in_r_array_qmckl, (mo_num,n_points_final_grid,3)] +&BEGIN_PROVIDER [ double precision, mos_lapl_in_r_array_qmckl, (mo_num, n_points_final_grid)] + implicit none + BEGIN_DOC + ! 
moS computed with qmckl + END_DOC + use qmckl + + integer*8 :: qmckl_ctx + integer(qmckl_exit_code) :: rc + + qmckl_ctx = qmckl_context_create() + + rc = qmckl_trexio_read(qmckl_ctx, trexio_file, 1_8*len(trim(trexio_filename))) + if (rc /= QMCKL_SUCCESS) then + print *, irp_here, 'qmckl error in read_trexio' + rc = qmckl_check(qmckl_ctx, rc) + stop -1 + endif + + rc = qmckl_set_point(qmckl_ctx, 'N', n_points_final_grid*1_8, final_grid_points, n_points_final_grid*3_8) + if (rc /= QMCKL_SUCCESS) then + print *, irp_here, 'qmckl error in set_electron_point' + rc = qmckl_check(qmckl_ctx, rc) + stop -1 + endif + + double precision, allocatable :: vgl(:,:,:) + allocate( vgl(mo_num,5,n_points_final_grid)) + rc = qmckl_get_mo_basis_mo_vgl(qmckl_ctx, vgl, n_points_final_grid*mo_num*5_8) + if (rc /= QMCKL_SUCCESS) then + print *, irp_here, 'qmckl error in get_mo_vgl' + rc = qmckl_check(qmckl_ctx, rc) + stop -1 + endif + + integer :: i,k + do k=1,n_points_final_grid + do i=1,mo_num + mos_in_r_array_qmckl(i,k) = vgl(i,1,k) + mos_grad_in_r_array_qmckl(i,k,1) = vgl(i,2,k) + mos_grad_in_r_array_qmckl(i,k,2) = vgl(i,3,k) + mos_grad_in_r_array_qmckl(i,k,3) = vgl(i,4,k) + mos_lapl_in_r_array_qmckl(i,k) = vgl(i,5,k) + enddo + enddo + +END_PROVIDER + + diff --git a/plugins/local/non_h_ints_mu/tc_integ.irp.f b/plugins/local/non_h_ints_mu/tc_integ.irp.f index 775a9e4c..ab88dd41 100644 --- a/plugins/local/non_h_ints_mu/tc_integ.irp.f +++ b/plugins/local/non_h_ints_mu/tc_integ.irp.f @@ -44,14 +44,92 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f elseif(tc_integ_type .eq. "numeric") then print *, ' Numerical integration over r1 and r2 will be performed' - - ! TODO combine 1shot & int2_grad1_u12_ao_num - PROVIDE int2_grad1_u12_ao_num - int2_grad1_u12_ao = int2_grad1_u12_ao_num + if(tc_save_mem) then - !PROVIDE int2_grad1_u12_ao_num_1shot - !int2_grad1_u12_ao = int2_grad1_u12_ao_num_1shot + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: mem, n_double + double precision, allocatable :: tmp(:,:,:), xx(:) + double precision, allocatable :: tmp_grad1_u12(:,:,:) + + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + + allocate(tmp(n_points_extra_final_grid,ao_num,ao_num), xx(n_points_extra_final_grid)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO COLLAPSE(2) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,3)) + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, xx, tmp_grad1_u12) + !$OMP 
DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), xx(1)) + enddo + !$OMP END DO + !$OMP END PARALLEL + do m = 1, 3 + call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + enddo + deallocate(tmp_grad1_u12) + if(n_rest .gt. 0) then + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,3)) + ii = n_pass*n_blocks + 1 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, xx, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), xx(1)) + enddo + !$OMP END DO + !$OMP END PARALLEL + do m = 1, 3 + call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + deallocate(tmp_grad1_u12) + endif + deallocate(tmp,xx) + + else + ! TODO combine 1shot & int2_grad1_u12_ao_num + PROVIDE int2_grad1_u12_ao_num + int2_grad1_u12_ao = int2_grad1_u12_ao_num +! PROVIDE int2_grad1_u12_ao_num_1shot +! int2_grad1_u12_ao = int2_grad1_u12_ao_num_1shot + endif elseif(tc_integ_type .eq. "semi-analytic") then @@ -126,7 +204,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") - call ezfio_set_work_empty(.False.) + call ezfio_set_work_empty(.False.) write(11) int2_grad1_u12_ao close(11) call ezfio_set_tc_keywords_io_tc_integ('Read') @@ -177,13 +255,88 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p print *, ' Numerical integration over r1 and r2 will be performed' - ! 
TODO combine 1shot & int2_grad1_u12_square_ao_num + if(tc_save_mem) then - PROVIDE int2_grad1_u12_square_ao_num - int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: mem, n_double + double precision, allocatable :: tmp(:,:,:), xx(:,:,:) + double precision, allocatable :: tmp_grad1_u12_squared(:,:) - !PROVIDE int2_grad1_u12_square_ao_num_1shot - !int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num_1shot + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + + allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO COLLAPSE(2) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + allocate(tmp_grad1_u12_squared(n_points_extra_final_grid,n_blocks), xx(n_points_extra_final_grid,n_blocks,3)) + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, xx, final_grid_points, tmp_grad1_u12_squared) + !$OMP DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, xx(1,i_blocks,1), xx(1,i_blocks,2), xx(1,i_blocks,3), tmp_grad1_u12_squared(1,i_blocks)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, -0.5d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12_squared(1,1), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_square_ao(1,1,ii), ao_num*ao_num) + enddo + deallocate(tmp_grad1_u12_squared, xx) + if(n_rest .gt. 0) then + ii = n_pass*n_blocks + 1 + allocate(tmp_grad1_u12_squared(n_points_extra_final_grid,n_rest), xx(n_points_extra_final_grid,n_rest,3)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, xx, final_grid_points, tmp_grad1_u12_squared) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, xx(1,i_rest,1), xx(1,i_rest,2), xx(1,i_rest,3), tmp_grad1_u12_squared(1,i_rest)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, -0.5d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12_squared(1,1), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_square_ao(1,1,ii), ao_num*ao_num) + deallocate(tmp_grad1_u12_squared, xx) + endif + deallocate(tmp) + + else + + ! 
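The tc_save_mem branches above split the r1 grid into blocks so that the per-block work arrays fit in the remaining memory. A minimal sketch of that partition arithmetic, with made-up grid sizes and memory budget; nothing below is a QP provider, the names are reused only to mirror the code above.

program block_partition_sketch
  implicit none
  double precision, parameter :: mem = 4.d0                  ! made-up memory budget left for the buffers
  integer, parameter :: n_points_final_grid = 50300          ! made-up r1 grid size
  integer, parameter :: n_points_extra_final_grid = 200000   ! made-up r2 grid size
  double precision :: n_double
  integer :: n_blocks, n_rest, n_pass
  n_double = mem * 1.d8                                      ! same conversion factor as in the provider above
  ! four work columns of length n_points_extra_final_grid are kept per r1 point of a block
  n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
  n_rest   = int(mod(n_points_final_grid, n_blocks))
  n_pass   = (n_points_final_grid - n_rest) / n_blocks
  print *, 'number of passes   :', n_pass
  print *, 'size of the blocks :', n_blocks
  print *, 'size of last block :', n_rest
end program block_partition_sketch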
TODO combine 1shot & int2_grad1_u12_square_ao_num + PROVIDE int2_grad1_u12_square_ao_num + int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num + !PROVIDE int2_grad1_u12_square_ao_num_1shot + !int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num_1shot + endif elseif(tc_integ_type .eq. "semi-analytic") then diff --git a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f index e5d75c3d..9d9601c0 100644 --- a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f +++ b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f @@ -63,12 +63,10 @@ do i_pass = 1, n_pass ii = (i_pass-1)*n_blocks + 1 - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i_blocks, ipoint) & - !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, & - !$OMP final_grid_points, tmp_grad1_u12, & - !$OMP tmp_grad1_u12_squared) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12, tmp_grad1_u12_squared) !$OMP DO do i_blocks = 1, n_blocks ipoint = ii - 1 + i_blocks ! r1 @@ -99,12 +97,10 @@ ii = n_pass*n_blocks + 1 - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i_rest, ipoint) & - !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, & - !$OMP final_grid_points, tmp_grad1_u12, & - !$OMP tmp_grad1_u12_squared) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12, tmp_grad1_u12_squared) !$OMP DO do i_rest = 1, n_rest ipoint = ii - 1 + i_rest ! r1 @@ -131,7 +127,7 @@ deallocate(tmp) call wall_time(time1) - print*, ' wall time for int2_grad1_u12_ao_num & int2_grad1_u12_square_ao_num =', time1-time0 + print*, ' wall time for int2_grad1_u12_ao_num & int2_grad1_u12_square_ao_num = (min)', (time1-time0) / 60.d0 call print_memory_usage() END_PROVIDER diff --git a/plugins/local/non_h_ints_mu/test_mu_of_r_tc.irp.f b/plugins/local/non_h_ints_mu/test_mu_of_r_tc.irp.f new file mode 100644 index 00000000..79e21d1b --- /dev/null +++ b/plugins/local/non_h_ints_mu/test_mu_of_r_tc.irp.f @@ -0,0 +1,157 @@ +program test_mu_of_r_tc + implicit none + BEGIN_DOC +! TODO + END_DOC + ! You specify that you want to avoid any contribution from + ! 
orbitals coming from core + call test_grad_f_mean_field + call test_grad_mu_mf + call plot_mu_of_r_mf +end + + +subroutine test_grad_f_mean_field + implicit none + integer :: i_point,k + double precision :: weight,r(3) + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3) + double precision :: grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + + double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3) + double precision :: num_grad_dm_a(3), num_grad_dm_b(3) + double precision :: f_mf_ab_p,f_mf_ab_m + double precision :: two_bod_dens_p, two_bod_dens_m + double precision :: dm_a_p, dm_a_m + double precision :: dm_b_p, dm_b_m + double precision :: rbis(3), dr + double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3) + double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3) + double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b + dr = 0.00001d0 + accu_f_mf_ab = 0.d0 + accu_two_bod_dens = 0.d0 + accu_dm_a = 0.d0 + accu_dm_b = 0.d0 + + accu_grad_f_mf_ab = 0.d0 + accu_grad_two_bod_dens = 0.d0 + accu_grad_dm_a = 0.d0 + accu_grad_dm_b = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p) + accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p) + accu_dm_a += weight*dabs(dm_a - dm_a_p) + accu_dm_b += weight*dabs(dm_b - dm_b_p) + do k = 1, 3 + rbis = r + rbis(k) += dr + call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + rbis = r + rbis(k) -= dr + call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m) + num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr) + num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr) + num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr) + num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr) + enddo + do k = 1, 3 + accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k)) + accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k)) + accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k)) + accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k)) + enddo + enddo + print*,'accu_f_mf_ab = ',accu_f_mf_ab + print*,'accu_two_bod_dens = ',accu_two_bod_dens + print*,'accu_dm_a = ',accu_dm_a + print*,'accu_dm_b = ',accu_dm_b + print*,'accu_grad_f_mf_ab = ' + print*,accu_grad_f_mf_ab + print*,'accu_grad_two_bod_dens = ' + print*,accu_grad_two_bod_dens + print*,'accu_dm_a = ' + print*,accu_grad_dm_a + print*,'accu_dm_b = ' + print*,accu_grad_dm_b + +end + +subroutine test_grad_mu_mf + implicit none + integer :: i_point,k + double precision :: weight,r(3),rbis(3) + double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3) + double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3) + double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3) + dr = 0.00001d0 + accu_grad_mu_mf = 0.d0 + accu_mu = 0.d0 + accu_grad_dm = 0.d0 + accu_dm = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm) + call mu_of_r_mean_field(r,mu_mf_p, dm_p) + accu_mu += weight*dabs(mu_mf_p - mu_mf) + accu_dm += weight*dabs(dm_p - dm) + do k 
= 1, 3 + rbis = r + rbis(k) += dr + call mu_of_r_mean_field(rbis,mu_mf_p, dm_p) + rbis = r + rbis(k) -= dr + call mu_of_r_mean_field(rbis,mu_mf_m, dm_m) + + num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr) + num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr) + enddo + do k = 1, 3 + accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k)) + accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k)) + enddo + enddo + print*,'accu_mu = ',accu_mu + print*,'accu_dm = ',accu_dm + print*,'accu_grad_dm = ' + print*, accu_grad_dm + print*,'accu_grad_mu_mf = ' + print*, accu_grad_mu_mf + +end + +subroutine plot_mu_of_r_mf + implicit none + include 'constants.include.F' + integer :: ipoint,npoint + double precision :: dx,r(3),xmax,xmin + double precision :: accu_mu,accu_nelec,mu_mf, dm,mu_mf_tc + character*(128) :: output + integer :: i_unit_output,getUnitAndOpen + output=trim(ezfio_filename)//'.mu_mf' + i_unit_output = getUnitAndOpen(output,'w') + xmax = 5.D0 + xmin = 0.d0 + npoint = 10000 + dx = (xmax - xmin)/dble(npoint) + r = 0.d0 + r(1) = xmin + accu_mu = 0.d0 + accu_nelec = 0.d0 + do ipoint = 1, npoint + call mu_of_r_mean_field(r,mu_mf, dm) + call mu_of_r_mean_field_tc(r,mu_mf_tc, dm) + write(i_unit_output,'(100(F16.10,X))')r(1),mu_mf,mu_mf_tc,dm + accu_mu += mu_mf * dm * r(1)**2*dx*4.D0*pi + accu_nelec += dm * r(1)**2*dx*4.D0*pi + r(1) += dx + enddo + print*,'nelec = ',accu_nelec + print*,'mu average = ',accu_mu/accu_nelec +end diff --git a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f index 464a1c1f..4c63dec4 100644 --- a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f +++ b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f @@ -1125,6 +1125,7 @@ subroutine test_fit_coef_A1() double precision :: accu, norm, diff double precision, allocatable :: A1(:,:) double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:) + double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:) ! 
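The test routines added above all follow the same validation pattern: compare an analytic gradient against a central finite difference (f(r+dr) - f(r-dr)) / (2 dr) with dr = 1e-5, accumulating weighted absolute deviations over the grid. A minimal standalone sketch of that pattern, in plain Fortran (no IRPF90 extensions) and with a made-up function f(r) = exp(-|r|^2) whose gradient is known analytically, not a quantity from the patch:

program check_gradient
  ! Central-difference check in the spirit of test_grad_f_mean_field /
  ! test_grad_mu_mf: compare an analytic gradient with
  ! (f(r+dr) - f(r-dr)) / (2 dr), component by component.
  implicit none
  integer          :: k
  double precision :: r(3), rbis(3), dr
  double precision :: grad_ana(3), grad_num(3), fp, fm, accu

  r  = (/ 0.3d0, -0.1d0, 0.7d0 /)
  dr = 1.d-5

  ! analytic gradient of f(r) = exp(-|r|^2) is -2 r f(r)
  grad_ana = -2.d0 * r * f(r)

  accu = 0.d0
  do k = 1, 3
    rbis = r ; rbis(k) = rbis(k) + dr
    fp = f(rbis)
    rbis = r ; rbis(k) = rbis(k) - dr
    fm = f(rbis)
    grad_num(k) = (fp - fm) / (2.d0*dr)
    accu = accu + dabs(grad_ana(k) - grad_num(k))
  enddo

  print *, 'accu_grad = ', accu   ! central differences are O(dr**2), so ~1e-10 here

contains

  double precision function f(x)
    double precision, intent(in) :: x(3)
    f = dexp(-(x(1)**2 + x(2)**2 + x(3)**2))
  end function f

end program check_gradient

In the patch the same check is weighted by final_weight_at_r_vector and summed over the DFT grid, so the printed accumulators should stay of the order of dr**2 times the number of electrons.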
--- @@ -1165,16 +1166,17 @@ subroutine test_fit_coef_A1() call wall_time(t1) - allocate(tmp(ao_num,ao_num,n_points_final_grid)) + allocate(tmp1(ao_num,ao_num,n_points_final_grid), tmp2(ao_num,ao_num,n_points_final_grid)) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i, j, ipoint) & - !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp) + !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2) !$OMP DO COLLAPSE(2) do j = 1, ao_num do i = 1, ao_num do ipoint = 1, n_points_final_grid - tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + tmp1(i,j,ipoint) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + tmp2(i,j,ipoint) = aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) enddo enddo enddo @@ -1184,9 +1186,9 @@ subroutine test_fit_coef_A1() allocate(A2(ao_num,ao_num,ao_num,ao_num)) call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num & + , tmp1(1,1,1), ao_num*ao_num, tmp2(1,1,1), ao_num*ao_num & , 0.d0, A2(1,1,1,1), ao_num*ao_num) - deallocate(tmp) + deallocate(tmp1, tmp2) call wall_time(t2) print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0 @@ -1238,6 +1240,7 @@ subroutine test_fit_coef_inv() double precision, allocatable :: A1(:,:), A1_inv(:,:), A1_tmp(:,:) double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:), A2_inv(:,:,:,:) double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A2_tmp(:,:,:,:) + double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:) cutoff_svd = 5d-8 @@ -1286,16 +1289,17 @@ subroutine test_fit_coef_inv() call wall_time(t1) - allocate(tmp(n_points_final_grid,ao_num,ao_num)) + allocate(tmp1(n_points_final_grid,ao_num,ao_num), tmp2(n_points_final_grid,ao_num,ao_num)) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i, j, ipoint) & - !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp) + !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2) !$OMP DO COLLAPSE(2) do j = 1, ao_num do i = 1, ao_num do ipoint = 1, n_points_final_grid - tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + tmp1(ipoint,i,j) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) + tmp2(ipoint,i,j) = aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j) enddo enddo enddo @@ -1304,11 +1308,11 @@ subroutine test_fit_coef_inv() allocate(A2(ao_num,ao_num,ao_num,ao_num)) - call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid & + call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , tmp1(1,1,1), n_points_final_grid, tmp2(1,1,1), n_points_final_grid & , 0.d0, A2(1,1,1,1), ao_num*ao_num) - deallocate(tmp) + deallocate(tmp1, tmp2) call wall_time(t2) print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0 diff --git a/plugins/local/non_h_ints_mu/total_tc_int.irp.f b/plugins/local/non_h_ints_mu/total_tc_int.irp.f index 9d3cf565..634d7e87 100644 --- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f +++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f @@ -33,8 +33,10 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, 
(ao_num, ao_num, ao_num, ao_n double precision :: weight1, ao_k_r, ao_i_r double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq double precision :: time0, time1 - double precision, allocatable :: b_mat(:,:,:,:), c_mat(:,:,:) + double precision, allocatable :: c_mat(:,:,:) + logical, external :: ao_two_e_integral_zero double precision, external :: get_ao_two_e_integral + double precision, external :: ao_two_e_integral PROVIDe tc_integ_type PROVIDE env_type @@ -53,7 +55,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n print*, ' Reading ao_two_e_tc_tot from ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read") - read(11) ao_two_e_tc_tot + do i = 1, ao_num + read(11) ao_two_e_tc_tot(:,:,:,i) + enddo close(11) else @@ -65,27 +69,59 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n PROVIDE int2_grad1_u12_square_ao - allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + if(tc_save_mem_loops) then - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, k, ipoint) & - !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) - !$OMP DO SCHEDULE (static) - do i = 1, ao_num - do k = 1, ao_num - do ipoint = 1, n_points_final_grid - c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + print*, ' LOOPS are used to evaluate Hermitian part of ao_two_e_tc_tot ...' + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) & + !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & + !$OMP aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao) + !$OMP DO COLLAPSE(3) + do i = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + do j = 1, ao_num + ao_two_e_tc_tot(j,l,k,i) = 0.d0 + do ipoint = 1, n_points_final_grid + weight1 = final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) + int2_grad1_u12_square_ao(j,l,ipoint) * weight1 * ao_i_r * ao_k_r + enddo + enddo + enddo enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL - call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & - , 0.d0, ao_two_e_tc_tot, ao_num*ao_num) + else + print*, ' DGEMM are used to evaluate Hermitian part of ao_two_e_tc_tot ...' + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) + deallocate(c_mat) + endif + FREE int2_grad1_u12_square_ao if( (tc_integ_type .eq. "semi-analytic") .and. 
& @@ -96,6 +132,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n ! an additional term is added here directly instead of ! being added in int2_grad1_u12_square_ao for performance + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) PROVIDE int2_u2_env2 !$OMP PARALLEL & @@ -127,10 +164,13 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n , int2_u2_env2(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) + deallocate(c_mat) FREE int2_u2_env2 endif ! use_ipp - deallocate(c_mat) + call wall_time(time1) + print*, ' done with Hermitian part after (min) ', (time1 - time0) / 60.d0 + call print_memory_usage() ! --- @@ -138,39 +178,71 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n PROVIDE int2_grad1_u12_ao - allocate(b_mat(n_points_final_grid,ao_num,ao_num,3)) + if(tc_save_mem_loops) then - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & - !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & - !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector) - !$OMP DO SCHEDULE (static) - do i = 1, ao_num - do k = 1, ao_num - do ipoint = 1, n_points_final_grid + print*, ' LOOPS are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...' - weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) - ao_i_r = aos_in_r_array_transp(ipoint,i) - ao_k_r = aos_in_r_array_transp(ipoint,k) - - b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) - b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) - b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) & + !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, & + !$OMP aos_in_r_array_transp, final_weight_at_r_vector, & + !$OMP int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis) + !$OMP DO COLLAPSE(3) + do i = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + do j = 1, ao_num + do ipoint = 1, n_points_final_grid + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) & + - weight1 * int2_grad1_u12_ao(j,l,ipoint,1) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) & + - weight1 * int2_grad1_u12_ao(j,l,ipoint,2) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) & + - weight1 * int2_grad1_u12_ao(j,l,ipoint,3) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3)) + enddo + enddo + enddo enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL - do m = 1, 3 - call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & - , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid & - , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) - enddo - deallocate(b_mat) + else - FREE int2_grad1_u12_ao + print*, ' DGEMM are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...' 
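Both the LOOPS and the DGEMM branches above compute the same contraction: a grid index p is summed between an array indexed (j,l,p) and a weighted orbital-product array indexed (p,k,i). The DGEMM branch simply views both as matrices of shape (ao_num*ao_num, n_points) and (n_points, ao_num*ao_num) and performs one matrix product instead of five nested loops, at the cost of storing c_mat. A minimal sketch of that contraction on random data — n and np are placeholders, not quantities from the code; link with -lblas:

program grid_contraction
  ! DGEMM pattern used for ao_two_e_tc_tot:
  ! V(jl,ki) = sum_p A(jl,p) * C(p,ki), i.e. one matrix product over the
  ! grid index p, checked here against explicit loops.
  implicit none
  integer, parameter :: n = 3, np = 50
  integer            :: i, j, p
  double precision   :: A(n*n, np), C(np, n*n)
  double precision   :: V_gemm(n*n, n*n), V_loop(n*n, n*n)

  call random_number(A)
  call random_number(C)

  ! contraction over the grid index with a single DGEMM
  call dgemm('N', 'N', n*n, n*n, np, 1.d0, A, n*n, C, np, 0.d0, V_gemm, n*n)

  ! same contraction with explicit loops
  V_loop = 0.d0
  do j = 1, n*n
    do i = 1, n*n
      do p = 1, np
        V_loop(i,j) = V_loop(i,j) + A(i,p) * C(p,j)
      enddo
    enddo
  enddo

  print *, 'max |dgemm - loops| = ', maxval(dabs(V_gemm - V_loop))   ! ~1e-15

end program grid_contraction

This is why tc_save_mem_loops trades wall time for memory: the loop branch never allocates the (n_points, ao_num, ao_num) intermediate, while the DGEMM branch needs it but runs at BLAS-3 speed.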
+ + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + do m = 1, 3 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) + enddo + deallocate(c_mat) + + end if if(tc_integ_type .eq. "semi-analytic") then FREE int2_grad1_u2e_ao @@ -178,30 +250,72 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n endif ! var_tc + call wall_time(time1) + print*, ' done with non-Hermitian part after (min) ', (time1 - time0) / 60.d0 + call print_memory_usage() + ! --- call sum_A_At(ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num) - PROVIDE ao_integrals_map + ! --- + + logical :: integ_zero + double precision :: integ_val - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & - !$OMP PRIVATE(i, j, k, l) - !$OMP DO - do j = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do k = 1, ao_num - ! < 1:i, 2:j | 1:k, 2:l > - ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + print*, ' adding ERI to ao_two_e_tc_tot ...' + + if(tc_save_mem) then + print*, ' ao_integrals_map will not be used' + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & + !$OMP SHARED(ao_num, ao_two_e_tc_tot) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + integ_zero = ao_two_e_integral_zero(i,j,k,l) + if(.not. integ_zero) then + ! i,k : r1 j,l : r2 + integ_val = ao_two_e_integral(i,k,j,l) + ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + integ_val + endif + enddo enddo enddo enddo - enddo - !$OMP END DO - !$OMP END PARALLEL + !$OMP END DO + !$OMP END PARALLEL + else +! print*, ' ao_integrals_map will be used' +! PROVIDE ao_integrals_map + print*,'Cholesky vectors will be used ' + double precision :: get_ao_integ_chol,eri + eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP + !$OMP PARALLEL DEFAULT(NONE) & +!!! !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) & + !$OMP SHARED(ao_num, ao_two_e_tc_tot) & + !$OMP PRIVATE(i, j, k, l,eri) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > +! eri = get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + eri = get_ao_integ_chol(i,k,j,l) + ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL +! FREE ao_integrals_map + endif - if(tc_integ_type .eq. "numeric") then + if((tc_integ_type .eq. "numeric") .and. (.not. 
tc_save_mem)) then FREE int2_grad1_u12_ao_num int2_grad1_u12_square_ao_num endif @@ -211,7 +325,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n print*, ' Saving ao_two_e_tc_tot in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") call ezfio_set_work_empty(.False.) - write(11) ao_two_e_tc_tot + do i = 1, ao_num + write(11) ao_two_e_tc_tot(:,:,:,i) + enddo close(11) call ezfio_set_tc_keywords_io_tc_integ('Read') endif diff --git a/plugins/local/non_hermit_dav/biorthog.irp.f b/plugins/local/non_hermit_dav/biorthog.irp.f index 2229e17d..4b618228 100644 --- a/plugins/local/non_hermit_dav/biorthog.irp.f +++ b/plugins/local/non_hermit_dav/biorthog.irp.f @@ -1,254 +1,3 @@ -subroutine non_hrmt_diag_split_degen(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors - ! - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! - END_DOC - - implicit none - - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:) - - integer :: i, j, n_degen,k , iteration - integer :: n_good - double precision :: shift,shift_current - double precision :: r,thr - integer, allocatable :: list_good(:), iorder_origin(:),iorder(:) - double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:) - double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:) - double precision, allocatable :: im_part(:),re_part(:) - - - print*,'Computing the left/right eigenvectors ...' - print*,'Using the degeneracy splitting algorithm' - - - ! pre-processing the matrix :: sorting by diagonal elements - allocate(reigvec_tmp(n,n), leigvec_tmp(n,n)) - allocate(diag_elem(n),iorder_origin(n),A_save(n,n)) - do i = 1, n - iorder_origin(i) = i - diag_elem(i) = A(i,i) - enddo - call dsort(diag_elem, iorder_origin, n) - do i = 1, n - do j = 1, n - A_save(j,i) = A(iorder_origin(j),iorder_origin(i)) - enddo - enddo - - shift = 1.d-15 - shift_current = shift - iteration = 1 - logical :: good_ortho - good_ortho = .False. - do while(n_real_eigv.ne.n.or. .not.good_ortho) - if(shift.gt.1.d-3)then - print*,'shift > 1.d-3 !!' - print*,'Your matrix intrinsically contains complex eigenvalues' - stop - endif - print*,'***** iteration = ',iteration - print*,'shift = ',shift - allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n)) - Aw = A_save - do i = 1, n - do j = 1, n - if(dabs(Aw(j,i)).lt.shift)then - Aw(j,i) = 0.d0 - endif - enddo - enddo - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - allocate(im_part(n),iorder(n)) - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - - shift_current = max(10.d0 * dabs(im_part(1)),shift) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - Aw = A_save - call split_matrix_degen(Aw,n,shift_current) - deallocate( im_part, iorder ) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - ! 
You track the real eigenvalues - n_good = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_good += 1 - else - print*,'Found an imaginary component to eigenvalue' - print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - allocate( list_good(n_good), iorder(n_good) ) - n_good = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_good += 1 - list_good(n_good) = i - eigval(n_good) = WR(i) - endif - enddo - deallocate( WR, WI ) - - n_real_eigv = n_good - do i = 1, n_good - iorder(i) = i - enddo - - ! You sort the real eigenvalues - call dsort(eigval, iorder, n_good) - - reigvec(:,:) = 0.d0 - leigvec(:,:) = 0.d0 - do i = 1, n_real_eigv - do j = 1, n - reigvec_tmp(j,i) = VR(j,list_good(iorder(i))) - leigvec_tmp(j,i) = Vl(j,list_good(iorder(i))) - enddo - enddo - - if(n_real_eigv == n)then - allocate(S(n,n)) - call check_bi_ortho(reigvec_tmp,leigvec_tmp,n,S,accu_nd) - print*,'accu_nd = ',accu_nd - double precision :: accu_nd - good_ortho = accu_nd .lt. 1.d-10 - deallocate(S) - endif - - deallocate( list_good, iorder ) - deallocate( VL, VR, Aw) - shift *= 10.d0 - iteration += 1 - enddo - do i = 1, n - do j = 1, n - reigvec(iorder_origin(j),i) = reigvec_tmp(j,i) - leigvec(iorder_origin(j),i) = leigvec_tmp(j,i) - enddo - enddo - -end - -! --- - -subroutine non_hrmt_real_diag_new(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors - ! - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! - END_DOC - - implicit none - - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - - integer :: i, j - integer :: n_good - double precision :: shift,shift_current - double precision :: r,thr - integer, allocatable :: list_good(:), iorder(:) - double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:) - double precision, allocatable :: Aw(:,:) - double precision, allocatable :: im_part(:) - - - print*,'Computing the left/right eigenvectors ...' - - ! Eigvalue(n) = WR(n) + i * WI(n) - shift = 1.d-10 - do while(n_real_eigv.ne.n.or.shift.gt.1.d-3) - allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n)) - Aw = A - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - allocate(im_part(n), iorder(n)) - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - shift_current = max(10.d0 * dabs(im_part(1)),shift) - print*,'adding random number of magnitude ',shift_current - Aw = A - do i = 1, n - call RANDOM_NUMBER(r) - Aw(i,i) += shift_current * r - enddo - deallocate( im_part, iorder ) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - - ! You track the real eigenvalues - thr = 1.d-10 - n_good = 0 - do i = 1, n - if(dabs(WI(i)).lt.thr)then - n_good += 1 - else - print*,'Found an imaginary component to eigenvalue' - print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - - allocate( list_good(n_good), iorder(n_good) ) - n_good = 0 - do i = 1, n - if(dabs(WI(i)).lt.thr)then - n_good += 1 - list_good(n_good) = i - eigval(n_good) = WR(i) - endif - enddo - - deallocate( WR, WI ) - - n_real_eigv = n_good - do i = 1, n_good - iorder(i) = i - enddo - - ! 
You sort the real eigenvalues - call dsort(eigval, iorder, n_good) - - reigvec(:,:) = 0.d0 - leigvec(:,:) = 0.d0 - do i = 1, n_real_eigv - do j = 1, n - reigvec(j,i) = VR(j,list_good(iorder(i))) - leigvec(j,i) = Vl(j,list_good(iorder(i))) - enddo - enddo - - deallocate( list_good, iorder ) - deallocate( VL, VR, Aw) - shift *= 10.d0 - enddo - if(shift.gt.1.d-3)then - print*,'shift > 1.d-3 !!' - print*,'Your matrix intrinsically contains complex eigenvalues' - endif - -end ! --- @@ -282,126 +31,20 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei allocate(phi_1_tilde(n),phi_2_tilde(n),chi_1_tilde(n),chi_2_tilde(n)) - - ! ------------------------------------------------------------------------------------- - ! - - !print *, ' ' - !print *, ' Computing the left/right eigenvectors ...' - !print *, ' ' - allocate(WR(n), WI(n), VL(n,n), VR(n,n)) - - !print *, ' fock matrix' - !do i = 1, n - ! write(*, '(1000(F16.10,X))') A(i,:) - !enddo - !thr_cut = 1.d-15 - !call cancel_small_elmts(A, n, thr_cut) - - !call lapack_diag_non_sym_right(n, A, WR, WI, VR) call lapack_diag_non_sym(n, A, WR, WI, VL, VR) - !call lapack_diag_non_sym_new(n, A, WR, WI, VL, VR) - - - - !print *, ' ' - !print *, ' eigenvalues' - i = 1 - do while(i .le. n) - !write(*, '(I3,X,1000(F16.10,X))')i, WR(i), WI(i) - if(.false.)then - if(WI(i).ne.0.d0)then - print*,'*****************' - print*,'WARNING ! IMAGINARY EIGENVALUES !!!' - write(*, '(1000(F16.10,X))') WR(i), WI(i+1) - ! phi = VR(:,i), psi = VR(:,i+1), |Phi_i> = phi + j psi , |Phi_i+1> = phi - j psi - ! chi = VL(:,i), xhi = VL(:,i+1), |Chi_i> = chi + j xhi , |Chi_i+1> = chi - j xhi - ! - accu_chi_phi = 0.d0 - accu_xhi_psi = 0.d0 - accu_chi_psi = 0.d0 - accu_xhi_phi = 0.d0 - double precision :: accu_chi_phi, accu_xhi_psi, accu_chi_psi, accu_xhi_phi - double precision :: mat_ovlp(2,2),eigval_tmp(2),eigvec(2,2),mat_ovlp_orig(2,2) - do j = 1, n - accu_chi_phi += VL(j,i) * VR(j,i) - accu_xhi_psi += VL(j,i+1) * VR(j,i+1) - accu_chi_psi += VL(j,i) * VR(j,i+1) - accu_xhi_phi += VL(j,i+1) * VR(j,i) - enddo - mat_ovlp_orig(1,1) = accu_chi_phi - mat_ovlp_orig(2,1) = accu_xhi_phi - mat_ovlp_orig(1,2) = accu_chi_psi - mat_ovlp_orig(2,2) = accu_xhi_psi - print*,'old overlap matrix ' - write(*,'(100(F16.10,X))')mat_ovlp_orig(1:2,1) - write(*,'(100(F16.10,X))')mat_ovlp_orig(1:2,2) - - - mat_ovlp(1,1) = accu_xhi_phi - mat_ovlp(2,1) = accu_chi_phi - mat_ovlp(1,2) = accu_xhi_psi - mat_ovlp(2,2) = accu_chi_psi - !print*,'accu_chi_phi = ',accu_chi_phi - !print*,'accu_xhi_psi = ',accu_xhi_psi - !print*,'accu_chi_psi = ',accu_chi_psi - !print*,'accu_xhi_phi = ',accu_xhi_phi - print*,'new overlap matrix ' - write(*,'(100(F16.10,X))')mat_ovlp(1:2,1) - write(*,'(100(F16.10,X))')mat_ovlp(1:2,2) - call lapack_diag(eigval_tmp,eigvec,mat_ovlp,2,2) - print*,'eigval_tmp(1) = ',eigval_tmp(1) - print*,'eigvec(1) = ',eigvec(1:2,1) - print*,'eigval_tmp(2) = ',eigval_tmp(2) - print*,'eigvec(2) = ',eigvec(1:2,2) - print*,'*****************' - phi_1_tilde = 0.d0 - phi_2_tilde = 0.d0 - chi_1_tilde = 0.d0 - chi_2_tilde = 0.d0 - do j = 1, n - phi_1_tilde(j) += VR(j,i) * eigvec(1,1) + VR(j,i+1) * eigvec(2,1) - phi_2_tilde(j) += VR(j,i) * eigvec(1,2) + VR(j,i+1) * eigvec(2,2) - chi_1_tilde(j) += VL(j,i+1) * eigvec(1,1) + VL(j,i) * eigvec(2,1) - chi_2_tilde(j) += VL(j,i+1) * eigvec(1,2) + VL(j,i) * eigvec(2,2) - enddo - VR(1:n,i) = phi_1_tilde(1:n) - VR(1:n,i+1) = phi_2_tilde(1:n) -! Vl(1:n,i) = -chi_1_tilde(1:n) -! 
Vl(1:n,i+1) = chi_2_tilde(1:n) - i+=1 - endif - endif - i+=1 - enddo - !print *, ' right eigenvect bef' - !do i = 1, n - ! write(*, '(1000(F16.10,X))') VR(:,i) - !enddo - !print *, ' left eigenvect bef' - !do i = 1, n - ! write(*, '(1000(F16.10,X))') VL(:,i) - !enddo thr_diag = 1d-06 thr_norm = 1d+10 - !call check_EIGVEC(n, n, A, WR, VL, VR, thr_diag, thr_norm, .false.) - - ! - ! ------------------------------------------------------------------------------------- ! --- - ! ------------------------------------------------------------------------------------- - ! track & sort the real eigenvalues + ! track & sort the real eigenvalues n_good = 0 - !thr = 100d0 - thr = Im_thresh_tcscf + thr = Im_thresh_tc do i = 1, n - !print*, 'Re(i) + Im(i)', WR(i), WI(i) if(dabs(WI(i)) .lt. thr) then n_good += 1 else @@ -410,11 +53,12 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei endif enddo - if(n_good.ne.n)then - print*,'there are some imaginary eigenvalues ' - thr_diag = 1d-03 - n_good = n + if(n_good.ne.n) then + print*,'there are some imaginary eigenvalues ' + thr_diag = 1d-03 + n_good = n endif + allocate(list_good(n_good), iorder(n_good)) n_good = 0 @@ -446,26 +90,9 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei ASSERT(n==n_real_eigv) - !print *, ' eigenvalues' - !do i = 1, n - ! write(*, '(1000(F16.10,X))') eigval(i) - !enddo - !print *, ' right eigenvect aft ord' - !do i = 1, n - ! write(*, '(1000(F16.10,X))') reigvec(:,i) - !enddo - !print *, ' left eigenvect aft ord' - !do i = 1, n - ! write(*, '(1000(F16.10,X))') leigvec(:,i) - !enddo - - ! - ! ------------------------------------------------------------------------------------- - ! --- - ! ------------------------------------------------------------------------------------- - ! check bi-orthogonality + ! check bi-orthogonality thr_diag = 10.d0 thr_norm = 1d+10 @@ -495,8 +122,6 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei print *, ' lapack vectors are not normalized neither bi-orthogonalized' - ! --- - allocate(deg_num(n)) call reorder_degen_eigvec(n, deg_num, eigval, leigvec, reigvec) call impose_biorthog_degen_eigvec(n, deg_num, eigval, leigvec, reigvec) @@ -508,700 +133,36 @@ subroutine non_hrmt_bieig(n, A, thr_d, thr_nd, leigvec, reigvec, n_real_eigv, ei endif call check_biorthog(n, n_real_eigv, leigvec, reigvec, accu_d, accu_nd, S, thr_d, thr_nd, .true.) - !call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, thr_diag, thr_norm, .true.) - deallocate(S) endif - ! - ! ------------------------------------------------------------------------------------- - return end ! --- -subroutine non_hrmt_bieig_random_diag(n, A, leigvec, reigvec, n_real_eigv, eigval) +subroutine check_bi_ortho(reigvec, leigvec, n, S, accu_nd) BEGIN_DOC - ! - ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" + ! retunrs the overlap matrix S = Leigvec^T Reigvec ! + ! 
and the square root of the sum of the squared off-diagonal elements of S END_DOC implicit none integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) + double precision, intent(in) :: reigvec(n,n), leigvec(n,n) + double precision, intent(out) :: S(n,n), accu_nd - integer :: i, j - integer :: n_good - double precision :: thr - double precision :: accu_nd + integer :: i,j - integer, allocatable :: list_good(:), iorder(:) - double precision, allocatable :: Aw(:,:) - double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:) - double precision, allocatable :: S(:,:) - double precision :: r - - - ! ------------------------------------------------------------------------------------- - ! - - print *, 'Computing the left/right eigenvectors ...' - allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n) ) - - Aw(:,:) = A(:,:) - call lapack_diag_non_sym_new(n, Aw, WR, WI, VL, VR) - - thr = 1.d-12 - double precision, allocatable :: im_part(:) - n_good = 0 - do i = 1, n - if( dabs(WI(i)).lt.thr ) then - n_good += 1 - else - print*, 'Found an imaginary component to eigenvalue on i = ', i - print*, 'Re(i) + Im(i)', WR(i), WI(i) - endif - enddo - print*,'n_good = ',n_good - if(n_good .lt. n)then - print*,'Removing degeneracies to remove imaginary parts' - allocate(im_part(n),iorder(n)) - r = 0.d0 - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part,iorder,n) - thr = 10.d0 * dabs(im_part(1)) - print*,'adding random numbers on the diagonal of magnitude ',thr - Aw(:,:) = A(:,:) - do i = 1, n - call RANDOM_NUMBER(r) - print*,'r = ',r*thr - Aw(i,i) += thr * r - enddo - print*,'Rediagonalizing the matrix with random numbers' - call lapack_diag_non_sym_new(n, Aw, WR, WI, VL, VR) - deallocate(im_part,iorder) - endif - deallocate( Aw ) - - ! - ! ------------------------------------------------------------------------------------- - - ! --- - - ! ------------------------------------------------------------------------------------- - ! track & sort the real eigenvalues - - n_good = 0 - thr = 1.d-5 - do i = 1, n - if( dabs(WI(i)).lt.thr ) then - n_good += 1 - else - print*, 'Found an imaginary component to eigenvalue on i = ', i - print*, 'Re(i) + Im(i)', WR(i), WI(i) - endif - enddo - print*,'n_good = ',n_good - allocate( list_good(n_good), iorder(n_good) ) - - n_good = 0 - do i = 1, n - if( dabs(WI(i)).lt.thr ) then - n_good += 1 - list_good(n_good) = i - eigval(n_good) = WR(i) - endif - enddo - - deallocate( WR, WI ) - - n_real_eigv = n_good - do i = 1, n_good - iorder(i) = i - enddo - call dsort(eigval, iorder, n_good) - - reigvec(:,:) = 0.d0 - leigvec(:,:) = 0.d0 - do i = 1, n_real_eigv - do j = 1, n - reigvec(j,i) = VR(j,list_good(iorder(i))) - leigvec(j,i) = VL(j,list_good(iorder(i))) - enddo - enddo - - deallocate( list_good, iorder ) - deallocate( VL, VR ) - - ! - ! ------------------------------------------------------------------------------------- - - ! --- - - ! ------------------------------------------------------------------------------------- - ! check bi-orthogonality - - allocate( S(n_real_eigv,n_real_eigv) ) - - ! 
S = VL x VR - call dgemm( 'T', 'N', n_real_eigv, n_real_eigv, n, 1.d0 & - , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) & - , 0.d0, S, size(S, 1) ) - - accu_nd = 0.d0 - do i = 1, n_real_eigv - do j = 1, n_real_eigv - if(i==j) cycle - accu_nd = accu_nd + S(j,i) * S(j,i) - enddo - enddo - accu_nd = dsqrt(accu_nd) - - if(accu_nd .lt. thresh_biorthog_nondiag) then - ! L x R is already bi-orthogonal - - print *, ' L & T bi-orthogonality: ok' - deallocate( S ) - return - - else - ! impose bi-orthogonality - - print *, ' L & T bi-orthogonality: not imposed yet' - print *, ' accu_nd = ', accu_nd - call impose_biorthog_qr(n, n_real_eigv, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec) - deallocate( S ) - - endif - - ! - ! ------------------------------------------------------------------------------------- - - return - -end - -! --- - -subroutine non_hrmt_real_im(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine which returns the EIGENVALUES sorted the REAL part and corresponding LEFT/RIGHT eigenvetors - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! - END_DOC - - implicit none - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - - integer :: i, j - integer :: n_bad - double precision :: thr - double precision :: accu_nd - - integer, allocatable :: iorder(:) - double precision, allocatable :: Aw(:,:) - double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:) - double precision, allocatable :: S(:,:) - double precision :: r - - ! ------------------------------------------------------------------------------------- - ! - - print *, 'Computing the left/right eigenvectors ...' - allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n), iorder(n)) - - Aw(:,:) = A(:,:) - do i = 1, n - call RANDOM_NUMBER(r) - Aw(i,i) += 10.d-10* r - enddo - call lapack_diag_non_sym(n, Aw, WR, WI, VL, VR) - - ! ------------------------------------------------------------------------------------- - ! track & sort the real eigenvalues - - i = 1 - thr = 1.d-15 - n_real_eigv = 0 - do while (i.le.n) -! print*,i,dabs(WI(i)) - if( dabs(WI(i)).gt.thr ) then - print*, 'Found an imaginary component to eigenvalue on i = ', i - print*, 'Re(i) , Im(i) ', WR(i), WI(i) - iorder(i) = i - eigval(i) = WR(i) - i+=1 - print*, 'Re(i+1),Im(i+1)',WR(i), WI(i) - iorder(i) = i - eigval(i) = WR(i) - i+=1 - else - n_real_eigv += 1 - iorder(i) = i - eigval(i) = WR(i) - i+=1 - endif - enddo - call dsort(eigval, iorder, n) - reigvec(:,:) = 0.d0 - leigvec(:,:) = 0.d0 - do i = 1, n - do j = 1, n - reigvec(j,i) = VR(j,iorder(i)) - leigvec(j,i) = VL(j,iorder(i)) - enddo - enddo - - deallocate( iorder ) - deallocate( VL, VR ) - - ! - ! ------------------------------------------------------------------------------------- - - ! --- - - ! ------------------------------------------------------------------------------------- - ! check bi-orthogonality - - allocate( S(n,n) ) - - ! S = VL x VR - call dgemm( 'T', 'N', n, n, n, 1.d0 & - , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) & - , 0.d0, S, size(S, 1) ) - - accu_nd = 0.d0 - do i = 1, n - do j = 1, n - if(i==j) cycle - accu_nd = accu_nd + S(j,i) * S(j,i) - enddo - enddo - accu_nd = dsqrt(accu_nd) - - deallocate( S ) - -end - -! 
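Every "check bi-orthogonality" block in these routines measures the same quantity: the overlap S = L^T R is formed with DGEMM and the deviation from bi-orthonormality is taken as accu_nd = sqrt( sum over i /= j of S(i,j)**2 ). A minimal standalone sketch of that check, with a trivially bi-orthonormal pair (identity matrices) as made-up input; link with -lblas:

program biorthog_check
  ! Bi-orthogonality test used throughout biorthog.irp.f:
  ! build S = L^T R and measure the off-diagonal weight
  ! accu_nd = sqrt( sum_{i /= j} S(i,j)**2 ).
  implicit none
  integer, parameter :: n = 4
  integer            :: i, j
  double precision   :: L(n,n), R(n,n), S(n,n), accu_nd

  ! illustration only: L = R = identity is trivially bi-orthonormal
  L = 0.d0 ; R = 0.d0
  do i = 1, n
    L(i,i) = 1.d0
    R(i,i) = 1.d0
  enddo

  ! S = L^T R
  call dgemm('T', 'N', n, n, n, 1.d0, L, n, R, n, 0.d0, S, n)

  accu_nd = 0.d0
  do i = 1, n
    do j = 1, n
      if (i == j) cycle
      accu_nd = accu_nd + S(j,i)*S(j,i)
    enddo
  enddo
  accu_nd = dsqrt(accu_nd)

  print *, 'accu_nd = ', accu_nd   ! 0 for a bi-orthonormal pair

end program biorthog_check

When accu_nd exceeds thresh_biorthog_nondiag, the routines kept in this file re-impose bi-orthogonality (QR or degenerate-block treatment) before accepting the left/right eigenvectors.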
--- - -subroutine non_hrmt_generalized_real_im(n, A, B, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine which returns the EIGENVALUES sorted the REAL part and corresponding LEFT/RIGHT eigenvetors - ! for A R = lambda B R and A^\dagger L = lambda B^\dagger L - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! - END_DOC - - implicit none - integer, intent(in) :: n - double precision, intent(in) :: A(n,n),B(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - - integer :: i, j - integer :: n_bad - double precision :: thr - double precision :: accu_nd - - integer, allocatable :: iorder(:) - double precision, allocatable :: Aw(:,:),Bw(:,:) - double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:), beta(:) - double precision, allocatable :: S(:,:) - double precision :: r - - ! ------------------------------------------------------------------------------------- - ! - - print *, 'Computing the left/right eigenvectors ...' - allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n), Bw(n,n),iorder(n),beta(n)) - - Aw(:,:) = A(:,:) - Bw(:,:) = B(:,:) - call lapack_diag_general_non_sym(n,Aw,Bw,WR,beta,WI,VL,VR) - - ! ------------------------------------------------------------------------------------- - ! track & sort the real eigenvalues - - i = 1 - thr = 1.d-10 - n_real_eigv = 0 - do while (i.le.n) - if( dabs(WI(i)).gt.thr ) then - print*, 'Found an imaginary component to eigenvalue on i = ', i - print*, 'Re(i) , Im(i) ', WR(i), WI(i) - iorder(i) = i - eigval(i) = WR(i)/(beta(i) + 1.d-10) - i+=1 - print*, 'Re(i+1),Im(i+1)',WR(i), WI(i) - iorder(i) = i - eigval(i) = WR(i)/(beta(i) + 1.d-10) - i+=1 - else - n_real_eigv += 1 - iorder(i) = i - eigval(i) = WR(i)/(beta(i) + 1.d-10) - i+=1 - endif - enddo - call dsort(eigval, iorder, n) - reigvec(:,:) = 0.d0 - leigvec(:,:) = 0.d0 - do i = 1, n - do j = 1, n - reigvec(j,i) = VR(j,iorder(i)) - leigvec(j,i) = VL(j,iorder(i)) - enddo - enddo - - deallocate( iorder ) - deallocate( VL, VR ) - - ! - ! ------------------------------------------------------------------------------------- - - ! --- - - ! ------------------------------------------------------------------------------------- - ! check bi-orthogonality - - allocate( S(n,n) ) - - ! S = VL x VR - call dgemm( 'T', 'N', n, n, n, 1.d0 & - , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) & - , 0.d0, S, size(S, 1) ) - - accu_nd = 0.d0 - do i = 1, n - do j = 1, n - if(i==j) cycle - accu_nd = accu_nd + S(j,i) * S(j,i) - enddo - enddo - accu_nd = dsqrt(accu_nd) - - deallocate( S ) - -end - -! --- - -subroutine non_hrmt_bieig_fullvect(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! 
- END_DOC - - implicit none - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - - integer :: i, j - integer :: n_good - double precision :: thr - double precision :: accu_nd - - integer, allocatable :: iorder(:) - double precision, allocatable :: Aw(:,:) - double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:) - double precision, allocatable :: S(:,:) - double precision, allocatable :: eigval_sorted(:) - - - ! ------------------------------------------------------------------------------------- - ! - - print *, 'Computing the left/right eigenvectors ...' - - allocate( WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n) ) - Aw(:,:) = A(:,:) - - call lapack_diag_non_sym_new(n, Aw, WR, WI, VL, VR) - - deallocate( Aw ) - - ! - ! ------------------------------------------------------------------------------------- - - ! --- - - ! ------------------------------------------------------------------------------------- - ! track & sort the real eigenvalues - - allocate( eigval_sorted(n), iorder(n) ) - - n_good = 0 - thr = 1.d-10 - - do i = 1, n - - iorder(i) = i - eigval_sorted(i) = WR(i) - - if(dabs(WI(i)) .gt. thr) then - print*, ' Found an imaginary component to eigenvalue on i = ', i - print*, ' Re(i) + Im(i)', WR(i), WI(i) - else - n_good += 1 - endif - - enddo - - n_real_eigv = n_good - - call dsort(eigval_sorted, iorder, n) - - reigvec(:,:) = 0.d0 - leigvec(:,:) = 0.d0 - do i = 1, n - eigval(i) = WR(i) - do j = 1, n - reigvec(j,i) = VR(j,iorder(i)) - leigvec(j,i) = VL(j,iorder(i)) - enddo - enddo - - deallocate( eigval_sorted, iorder ) - deallocate( WR, WI ) - deallocate( VL, VR ) - - ! - ! ------------------------------------------------------------------------------------- - - ! --- - - ! ------------------------------------------------------------------------------------- - ! check bi-orthogonality - - allocate( S(n,n) ) - - ! S = VL x VR - call dgemm( 'T', 'N', n, n, n, 1.d0 & - , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) & - , 0.d0, S, size(S, 1) ) - - accu_nd = 0.d0 - do i = 1, n - do j = 1, n - if(i==j) cycle - accu_nd = accu_nd + S(j,i) * S(j,i) - enddo - enddo - accu_nd = dsqrt(accu_nd) - - if(accu_nd .lt. thresh_biorthog_nondiag) then - ! L x R is already bi-orthogonal - - !print *, ' L & T bi-orthogonality: ok' - deallocate( S ) - return - - else - ! impose bi-orthogonality - - !print *, ' L & T bi-orthogonality: not imposed yet' - !print *, ' accu_nd = ', accu_nd - call impose_biorthog_qr(n, n, thresh_biorthog_diag, thresh_biorthog_nondiag, leigvec, reigvec) - deallocate( S ) - - endif - - ! - ! ------------------------------------------------------------------------------------- - - return - -end - -! --- - - -subroutine split_matrix_degen(aw,n,shift) - implicit none - BEGIN_DOC - ! subroutines that splits the degeneracies of a matrix by adding a splitting of magnitude thr * n_degen/2 - ! - ! WARNING !! THE MATRIX IS ASSUMED TO BE PASSED WITH INCREASING DIAGONAL ELEMENTS - END_DOC - double precision,intent(inout) :: Aw(n,n) - double precision,intent(in) :: shift - integer, intent(in) :: n - integer :: i,j,n_degen - logical :: keep_on - i=1 - do while(i.lt.n) - if(dabs(Aw(i,i)-Aw(i+1,i+1)).lt.shift)then - j=1 - keep_on = .True. - do while(keep_on) - if(i+j.gt.n)then - keep_on = .False. - exit - endif - if(dabs(Aw(i,i)-Aw(i+j,i+j)).lt.shift)then - j+=1 - else - keep_on=.False. - exit - endif - enddo - n_degen = j - j=0 - keep_on = .True. 
- do while(keep_on) - if(i+j+1.gt.n)then - keep_on = .False. - exit - endif - if(dabs(Aw(i+j,i+j)-Aw(i+j+1,i+j+1)).lt.shift)then - Aw(i+j,i+j) += (j-n_degen/2) * shift - j+=1 - else - keep_on = .False. - exit - endif - enddo - Aw(i+n_degen-1,i+n_degen-1) += (n_degen-1-n_degen/2) * shift - i+=n_degen - else - i+=1 - endif - enddo - -end - -subroutine give_degen(a,n,shift,list_degen,n_degen_list) - implicit none - BEGIN_DOC - ! returns n_degen_list :: the number of degenerated SET of elements (i.e. with |A(i)-A(i+1)| below shift) - ! - ! for each of these sets, list_degen(1,i) = first degenerate element of the set i, - ! - ! list_degen(2,i) = last degenerate element of the set i. - END_DOC - double precision,intent(in) :: A(n) - double precision,intent(in) :: shift - integer, intent(in) :: n - integer, intent(out) :: list_degen(2,n),n_degen_list - integer :: i,j,n_degen,k - logical :: keep_on - double precision,allocatable :: Aw(:) - list_degen = -1 - allocate(Aw(n)) - Aw = A - i=1 - k = 0 - do while(i.lt.n) - if(dabs(Aw(i)-Aw(i+1)).lt.shift)then - k+=1 - j=1 - list_degen(1,k) = i - keep_on = .True. - do while(keep_on) - if(i+j.gt.n)then - keep_on = .False. - exit - endif - if(dabs(Aw(i)-Aw(i+j)).lt.shift)then - j+=1 - else - keep_on=.False. - exit - endif - enddo - n_degen = j - list_degen(2,k) = list_degen(1,k)-1 + n_degen - j=0 - keep_on = .True. - do while(keep_on) - if(i+j+1.gt.n)then - keep_on = .False. - exit - endif - if(dabs(Aw(i+j)-Aw(i+j+1)).lt.shift)then - Aw(i+j) += (j-n_degen/2) * shift - j+=1 - else - keep_on = .False. - exit - endif - enddo - Aw(i+n_degen-1) += (n_degen-1-n_degen/2) * shift - i+=n_degen - else - i+=1 - endif - enddo - n_degen_list = k - -end - -subroutine cancel_small_elmts(aw,n,shift) - implicit none - BEGIN_DOC - ! subroutines that splits the degeneracies of a matrix by adding a splitting of magnitude thr * n_degen/2 - ! - ! WARNING !! THE MATRIX IS ASSUMED TO BE PASSED WITH INCREASING DIAGONAL ELEMENTS - END_DOC - double precision,intent(inout) :: Aw(n,n) - double precision,intent(in) :: shift - integer, intent(in) :: n - integer :: i,j - do i = 1, n - do j = 1, n - if(dabs(Aw(j,i)).lt.shift)then - Aw(j,i) = 0.d0 - endif - enddo - enddo -end - -subroutine check_bi_ortho(reigvec,leigvec,n,S,accu_nd) - implicit none - integer, intent(in) :: n - double precision,intent(in) :: reigvec(n,n),leigvec(n,n) - double precision, intent(out) :: S(n,n),accu_nd - BEGIN_DOC -! retunrs the overlap matrix S = Leigvec^T Reigvec -! -! and the square root of the sum of the squared off-diagonal elements of S - END_DOC - integer :: i,j ! S = VL x VR call dgemm( 'T', 'N', n, n, n, 1.d0 & , leigvec, size(leigvec, 1), reigvec, size(reigvec, 1) & , 0.d0, S, size(S, 1) ) + accu_nd = 0.d0 do i = 1, n do j = 1, n @@ -1213,3 +174,5 @@ subroutine check_bi_ortho(reigvec,leigvec,n,S,accu_nd) accu_nd = dsqrt(accu_nd) end + + diff --git a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f index cb38347e..2c053ac8 100644 --- a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f +++ b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f @@ -273,60 +273,6 @@ end ! 
--- -subroutine lapack_diag_non_sym_right(n, A, WR, WI, VR) - - implicit none - - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - double precision, intent(out) :: WR(n), WI(n), VR(n,n) - - integer :: i, lda, ldvl, ldvr, LWORK, INFO - double precision, allocatable :: Atmp(:,:), WORK(:), VL(:,:) - - lda = n - ldvl = 1 - ldvr = n - - allocate( Atmp(n,n), VL(1,1) ) - Atmp(1:n,1:n) = A(1:n,1:n) - - allocate(WORK(1)) - LWORK = -1 - call dgeev('N', 'V', n, Atmp, lda, WR, WI, VL, ldvl, VR, ldvr, WORK, LWORK, INFO) - if(INFO.gt.0)then - print*,'dgeev failed !!',INFO - stop - endif - - LWORK = max(int(WORK(1)), 1) ! this is the optimal size of WORK - deallocate(WORK) - - allocate(WORK(LWORK)) - - ! Actual diagonalization - call dgeev('N', 'V', n, Atmp, lda, WR, WI, VL, ldvl, VR, ldvr, WORK, LWORK, INFO) - if(INFO.ne.0) then - print*,'dgeev failed !!', INFO - stop - endif - - deallocate(Atmp, WORK, VL) - -! print *, ' JOBL = F' -! print *, ' eigenvalues' -! do i = 1, n -! write(*, '(1000(F16.10,X))') WR(i), WI(i) -! enddo -! print *, ' right eigenvect' -! do i = 1, n -! write(*, '(1000(F16.10,X))') VR(:,i) -! enddo - -end - -! --- - subroutine non_hrmt_real_diag(n, A, leigvec, reigvec, n_real_eigv, eigval) BEGIN_DOC @@ -1780,70 +1726,6 @@ end ! --- -subroutine check_weighted_biorthog(n, m, W, Vl, Vr, thr_d, thr_nd, accu_d, accu_nd, S, stop_ifnot) - - implicit none - - integer, intent(in) :: n, m - double precision, intent(in) :: Vl(n,m), Vr(n,m), W(n,n) - double precision, intent(in) :: thr_d, thr_nd - logical, intent(in) :: stop_ifnot - double precision, intent(out) :: accu_d, accu_nd, S(m,m) - - integer :: i, j - double precision, allocatable :: SS(:,:), tmp(:,:) - - print *, ' check weighted bi-orthogonality' - - ! --- - - allocate(tmp(m,n)) - call dgemm( 'T', 'N', m, n, n, 1.d0 & - , Vl, size(Vl, 1), W, size(W, 1) & - , 0.d0, tmp, size(tmp, 1) ) - call dgemm( 'N', 'N', m, m, n, 1.d0 & - , tmp, size(tmp, 1), Vr, size(Vr, 1) & - , 0.d0, S, size(S, 1) ) - deallocate(tmp) - - !print *, ' overlap matrix:' - !do i = 1, m - ! write(*,'(1000(F16.10,X))') S(i,:) - !enddo - - accu_d = 0.d0 - accu_nd = 0.d0 - do i = 1, m - do j = 1, m - if(i==j) then - accu_d = accu_d + dabs(S(i,i)) - else - accu_nd = accu_nd + S(j,i) * S(j,i) - endif - enddo - enddo - accu_nd = dsqrt(accu_nd) - - print *, ' accu_nd = ', accu_nd - print *, ' accu_d = ', dabs(accu_d-dble(m))/dble(m) - - ! --- - - if( stop_ifnot .and. ((accu_nd .gt. thr_nd) .or. dabs(accu_d-dble(m))/dble(m) .gt. thr_d) ) then - print *, ' non bi-orthogonal vectors !' - print *, ' accu_nd = ', accu_nd - print *, ' accu_d = ', dabs(accu_d-dble(m))/dble(m) - !print *, ' overlap matrix:' - !do i = 1, m - ! write(*,'(1000(F16.10,X))') S(i,:) - !enddo - stop - endif - -end - -! --- - subroutine check_biorthog(n, m, Vl, Vr, accu_d, accu_nd, S, thr_d, thr_nd, stop_ifnot) implicit none @@ -2144,6 +2026,7 @@ subroutine impose_biorthog_degen_eigvec(n, deg_num, e0, L0, R0) enddo !print*,' accu_nd after = ', accu_nd if(accu_nd .gt. 1d-12) then + print*, ' accu_nd =', accu_nd print*, ' your strategy for degenerates orbitals failed !' print*, m, 'deg on', i stop diff --git a/plugins/local/non_hermit_dav/new_routines.irp.f b/plugins/local/non_hermit_dav/new_routines.irp.f deleted file mode 100644 index 8db044d3..00000000 --- a/plugins/local/non_hermit_dav/new_routines.irp.f +++ /dev/null @@ -1,670 +0,0 @@ -subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! 
routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors - ! - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! - END_DOC - - implicit none - - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:) - - integer :: i, j, n_degen,k , iteration - double precision :: shift_current - double precision :: r,thr,accu_d, accu_nd - integer, allocatable :: iorder_origin(:),iorder(:) - double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:) - double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:) - double precision, allocatable :: im_part(:),re_part(:) - double precision :: accu,thr_cut, thr_norm=1d0 - - - thr_cut = 1.d-15 - print*,'Computing the left/right eigenvectors ...' - print*,'Using the degeneracy splitting algorithm' - ! initialization - shift_current = 1.d-15 - iteration = 0 - print*,'***** iteration = ',iteration - - - ! pre-processing the matrix :: sorting by diagonal elements - allocate(reigvec_tmp(n,n), leigvec_tmp(n,n)) - allocate(diag_elem(n),iorder_origin(n),A_save(n,n)) -! print*,'Aw' - do i = 1, n - iorder_origin(i) = i - diag_elem(i) = A(i,i) -! write(*,'(100(F16.10,X))')A(:,i) - enddo - call dsort(diag_elem, iorder_origin, n) - do i = 1, n - do j = 1, n - A_save(j,i) = A(iorder_origin(j),iorder_origin(i)) - enddo - enddo - - allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n)) - allocate(im_part(n),iorder(n)) - allocate( S(n,n) ) - - - Aw = A_save - call cancel_small_elmts(aw,n,thr_cut) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - n_real_eigv = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_real_eigv += 1 - else -! print*,'Found an imaginary component to eigenvalue' -! print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - if(n_real_eigv.ne.n)then - shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - else - print*,'All eigenvalues are real !' - endif - - - do while(n_real_eigv.ne.n) - iteration += 1 - print*,'***** iteration = ',iteration - if(shift_current.gt.1.d-3)then - print*,'shift_current > 1.d-3 !!' - print*,'Your matrix intrinsically contains complex eigenvalues' - stop - endif - Aw = A_save - call cancel_small_elmts(Aw,n,thr_cut) - call split_matrix_degen(Aw,n,shift_current) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - n_real_eigv = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_real_eigv+= 1 - else -! print*,'Found an imaginary component to eigenvalue' -! print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - if(n_real_eigv.ne.n)then - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - else - print*,'All eigenvalues are real !' - endif - enddo - !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES - do i = 1, n - eigval(i) = WR(i) - iorder(i) = i - enddo - call dsort(eigval,iorder,n) - do i = 1, n -! 
print*,'eigval(i) = ',eigval(i) - reigvec_tmp(:,i) = VR(:,iorder(i)) - leigvec_tmp(:,i) = Vl(:,iorder(i)) - enddo - -!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY - ! check bi-orthogonality - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print *, ' accu_nd bi-orthog = ', accu_nd - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print *, ' ' - print *, ' bi-orthogonality: not imposed yet' - print *, ' ' - print *, ' ' - print *, ' orthog between degen eigenvect' - print *, ' ' - double precision, allocatable :: S_nh_inv_half(:,:) - allocate(S_nh_inv_half(n,n)) - logical :: complex_root - deallocate(S_nh_inv_half) - call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) - call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'New vectors not bi-orthonormals at ',accu_nd - call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'New vectors not bi-orthonormals at ',accu_nd - print*,'Must be a deep problem ...' - stop - endif - endif - endif - - !! EIGENVECTORS SORTED AND BI-ORTHONORMAL - do i = 1, n - do j = 1, n - VR(iorder_origin(j),i) = reigvec_tmp(j,i) - VL(iorder_origin(j),i) = leigvec_tmp(j,i) - enddo - enddo - - !! RECOMPUTING THE EIGENVALUES - eigval = 0.d0 - do i = 1, n - iorder(i) = i - accu = 0.d0 - do j = 1, n - accu += VL(j,i) * VR(j,i) - do k = 1, n - eigval(i) += VL(j,i) * A(j,k) * VR(k,i) - enddo - enddo - eigval(i) *= 1.d0/accu -! print*,'eigval(i) = ',eigval(i) - enddo - !! RESORT JUST TO BE SURE - call dsort(eigval, iorder, n) - do i = 1, n - do j = 1, n - reigvec(j,i) = VR(j,iorder(i)) - leigvec(j,i) = VL(j,iorder(i)) - enddo - enddo - print*,'Checking for final reigvec/leigvec' - shift_current = max(1.d-10,shift_current) - print*,'Thr for eigenvectors = ',shift_current - call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.) - call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print *, ' accu_nd bi-orthog = ', accu_nd - - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog' - print*,'Eigenvectors are not bi orthonormal ..' - print*,'accu_nd = ',accu_nd - stop - endif - -end - - - -subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors - ! - ! of a non hermitian matrix A(n,n) - ! - ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" - ! 
- END_DOC - - implicit none - - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:) - - integer :: i, j, n_degen,k , iteration - double precision :: shift_current - double precision :: r,thr,accu_d, accu_nd - integer, allocatable :: iorder_origin(:),iorder(:) - double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:) - double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:) - double precision, allocatable :: im_part(:),re_part(:) - double precision :: accu,thr_cut, thr_norm=1.d0 - double precision, allocatable :: S_nh_inv_half(:,:) - logical :: complex_root - - - thr_cut = 1.d-15 - print*,'Computing the left/right eigenvectors ...' - print*,'Using the degeneracy splitting algorithm' - ! initialization - shift_current = 1.d-15 - iteration = 0 - print*,'***** iteration = ',iteration - - - ! pre-processing the matrix :: sorting by diagonal elements - allocate(reigvec_tmp(n,n), leigvec_tmp(n,n)) - allocate(diag_elem(n),iorder_origin(n),A_save(n,n)) -! print*,'Aw' - do i = 1, n - iorder_origin(i) = i - diag_elem(i) = A(i,i) -! write(*,'(100(F16.10,X))')A(:,i) - enddo - call dsort(diag_elem, iorder_origin, n) - do i = 1, n - do j = 1, n - A_save(j,i) = A(iorder_origin(j),iorder_origin(i)) - enddo - enddo - - allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n)) - allocate(im_part(n),iorder(n)) - allocate( S(n,n) ) - allocate(S_nh_inv_half(n,n)) - - - Aw = A_save - call cancel_small_elmts(aw,n,thr_cut) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - n_real_eigv = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_real_eigv += 1 - else -! print*,'Found an imaginary component to eigenvalue' -! print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - if(n_real_eigv.ne.n)then - shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - else - print*,'All eigenvalues are real !' - endif - - - do while(n_real_eigv.ne.n) - iteration += 1 - print*,'***** iteration = ',iteration - if(shift_current.gt.1.d-3)then - print*,'shift_current > 1.d-3 !!' - print*,'Your matrix intrinsically contains complex eigenvalues' - stop - endif - Aw = A_save -! thr_cut = shift_current - call cancel_small_elmts(Aw,n,thr_cut) - call split_matrix_degen(Aw,n,shift_current) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - n_real_eigv = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_real_eigv+= 1 - else -! print*,'Found an imaginary component to eigenvalue' -! print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - if(n_real_eigv.ne.n)then - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - else - print*,'All eigenvalues are real !' - endif - enddo - !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES - do i = 1, n - eigval(i) = WR(i) - iorder(i) = i - enddo - call dsort(eigval,iorder,n) - do i = 1, n -! print*,'eigval(i) = ',eigval(i) - reigvec_tmp(:,i) = VR(:,iorder(i)) - leigvec_tmp(:,i) = Vl(:,iorder(i)) - enddo - -!!! 
ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY - ! check bi-orthogonality - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print *, ' accu_nd bi-orthog = ', accu_nd - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print *, ' ' - print *, ' bi-orthogonality: not imposed yet' - if(complex_root) then - print *, ' ' - print *, ' ' - print *, ' orthog between degen eigenvect' - print *, ' ' - ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right - call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) ! orthogonalization of reigvec - call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) ! orthogonalization of leigvec - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'New vectors not bi-orthonormals at ', accu_nd - call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half, complex_root) - if(complex_root)then - call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR - else - print*,'S^{-1/2} exists !!' - call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization - endif - endif - else ! the matrix S^{-1/2} exists - print*,'S^{-1/2} exists !!' - call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization - endif - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'New vectors not bi-orthonormals at ',accu_nd - print*,'Must be a deep problem ...' - stop - endif - endif - - !! EIGENVECTORS SORTED AND BI-ORTHONORMAL - do i = 1, n - do j = 1, n - VR(iorder_origin(j),i) = reigvec_tmp(j,i) - VL(iorder_origin(j),i) = leigvec_tmp(j,i) - enddo - enddo - - !! RECOMPUTING THE EIGENVALUES - eigval = 0.d0 - do i = 1, n - iorder(i) = i - accu = 0.d0 - do j = 1, n - accu += VL(j,i) * VR(j,i) - do k = 1, n - eigval(i) += VL(j,i) * A(j,k) * VR(k,i) - enddo - enddo - eigval(i) *= 1.d0/accu -! print*,'eigval(i) = ',eigval(i) - enddo - !! RESORT JUST TO BE SURE - call dsort(eigval, iorder, n) - do i = 1, n - do j = 1, n - reigvec(j,i) = VR(j,iorder(i)) - leigvec(j,i) = VL(j,iorder(i)) - enddo - enddo - print*,'Checking for final reigvec/leigvec' - shift_current = max(1.d-10,shift_current) - print*,'Thr for eigenvectors = ',shift_current - call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.) - call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print *, ' accu_nd bi-orthog = ', accu_nd - - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog' - print*,'Eigenvectors are not bi orthonormal ..' - print*,'accu_nd = ',accu_nd - stop - endif - -end - - -subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval) - - BEGIN_DOC - ! - ! routine returning the eigenvalues and left/right eigenvectors of the TC fock matrix - ! 
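When the overlap S = V_L^T V_R admits a well-behaved inverse square root, the branch above rescales the left vectors by (S^{-1/2})^T and the right vectors by S^{-1/2}, so that the new overlap becomes the identity. The NumPy sketch below illustrates that idea only; get_inv_half_nonsymmat_diago and bi_ortho_s_inv_half in the package handle more cases (complex roots, QR fallback, degeneracies) than shown here.

    # Sketch only: bi-orthonormalize left/right vectors through S^{-1/2}, with S = VL^T VR.
    import numpy as np

    def s_inv_half(S, tol=1e-10):
        """Principal inverse square root of a (possibly non-symmetric) overlap matrix."""
        w, U = np.linalg.eig(S)
        if np.any(w.real < tol):                 # no safe real S^{-1/2}: fall back to a QR scheme
            return None
        return (U @ np.diag(1.0 / np.sqrt(w)) @ np.linalg.inv(U)).real

    def bi_orthonormalize(VL, VR):
        """Rescale both sets so that VL^T VR becomes the identity (bi_ortho_s_inv_half idea)."""
        S = VL.T @ VR
        VL = VL * np.sign(np.diag(S))            # remove the arbitrary sign of each eigenvector pair
        S = VL.T @ VR
        Sm12 = s_inv_half(S)
        if Sm12 is None:
            raise RuntimeError("S^{-1/2} not available; a QR bi-orthogonalization would be needed")
        return VL @ Sm12.T, VR @ Sm12            # new overlap: S^{-1/2} S S^{-1/2} = identity

Applied to the VL, VR of the previous sketch, the off-diagonal residue should drop to machine precision and the diagonal of the overlap should become ones.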
- END_DOC - - implicit none - - integer, intent(in) :: n - double precision, intent(in) :: A(n,n) - integer, intent(out) :: n_real_eigv - double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n) - double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:) - - integer :: i, j, n_degen,k , iteration - double precision :: shift_current - double precision :: r,thr,accu_d, accu_nd - integer, allocatable :: iorder_origin(:),iorder(:) - double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:) - double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:) - double precision, allocatable :: im_part(:),re_part(:) - double precision :: accu,thr_cut - double precision, allocatable :: S_nh_inv_half(:,:) - logical :: complex_root - double precision :: thr_norm=1d0 - - - thr_cut = 1.d-15 - print*,'Computing the left/right eigenvectors ...' - print*,'Using the degeneracy splitting algorithm' - ! initialization - shift_current = 1.d-15 - iteration = 0 - print*,'***** iteration = ',iteration - - - ! pre-processing the matrix :: sorting by diagonal elements - allocate(reigvec_tmp(n,n), leigvec_tmp(n,n)) - allocate(diag_elem(n),iorder_origin(n),A_save(n,n)) -! print*,'Aw' - do i = 1, n - iorder_origin(i) = i - diag_elem(i) = A(i,i) -! write(*,'(100(F16.10,X))')A(:,i) - enddo - call dsort(diag_elem, iorder_origin, n) - do i = 1, n - do j = 1, n - A_save(j,i) = A(iorder_origin(j),iorder_origin(i)) - enddo - enddo - - allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n)) - allocate(im_part(n),iorder(n)) - allocate( S(n,n) ) - allocate(S_nh_inv_half(n,n)) - - - Aw = A_save - call cancel_small_elmts(aw,n,thr_cut) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - n_real_eigv = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_real_eigv += 1 - else -! print*,'Found an imaginary component to eigenvalue' -! print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - if(n_real_eigv.ne.n)then - shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - else - print*,'All eigenvalues are real !' - endif - - - do while(n_real_eigv.ne.n) - iteration += 1 - print*,'***** iteration = ',iteration - if(shift_current.gt.1.d-3)then - print*,'shift_current > 1.d-3 !!' - print*,'Your matrix intrinsically contains complex eigenvalues' - stop - endif - Aw = A_save -! thr_cut = shift_current - call cancel_small_elmts(Aw,n,thr_cut) - call split_matrix_degen(Aw,n,shift_current) - call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR) - n_real_eigv = 0 - do i = 1, n - if(dabs(WI(i)).lt.1.d-20)then - n_real_eigv+= 1 - else -! print*,'Found an imaginary component to eigenvalue' -! print*,'Re(i) + Im(i)',WR(i),WI(i) - endif - enddo - if(n_real_eigv.ne.n)then - do i = 1, n - im_part(i) = -dabs(WI(i)) - iorder(i) = i - enddo - call dsort(im_part, iorder, n) - shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0) - print*,'Largest imaginary part found in eigenvalues = ',im_part(1) - print*,'Splitting the degeneracies by ',shift_current - else - print*,'All eigenvalues are real !' - endif - enddo - !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES - do i = 1, n - eigval(i) = WR(i) - iorder(i) = i - enddo - call dsort(eigval,iorder,n) - do i = 1, n -! print*,'eigval(i) = ',eigval(i) - reigvec_tmp(:,i) = VR(:,iorder(i)) - leigvec_tmp(:,i) = Vl(:,iorder(i)) - enddo - -!!! 
ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY - ! check bi-orthogonality - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print *, ' accu_nd bi-orthog = ', accu_nd - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print *, ' ' - print *, ' bi-orthogonality: not imposed yet' - print *, ' ' - print *, ' ' - print *, ' Using impose_unique_biorthog_degen_eigvec' - print *, ' ' - ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right - call impose_unique_biorthog_degen_eigvec(n, eigval, mo_coef, leigvec_tmp, reigvec_tmp) - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print*,'accu_nd = ',accu_nd - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'New vectors not bi-orthonormals at ',accu_nd - call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half,complex_root) - if(complex_root)then - print*,'S^{-1/2} does not exits, using QR bi-orthogonalization' - call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR - else - print*,'S^{-1/2} exists !!' - call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization - endif - endif - call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'New vectors not bi-orthonormals at ',accu_nd - print*,'Must be a deep problem ...' - stop - endif - endif - - !! EIGENVECTORS SORTED AND BI-ORTHONORMAL - do i = 1, n - do j = 1, n - VR(iorder_origin(j),i) = reigvec_tmp(j,i) - VL(iorder_origin(j),i) = leigvec_tmp(j,i) - enddo - enddo - - !! RECOMPUTING THE EIGENVALUES - eigval = 0.d0 - do i = 1, n - iorder(i) = i - accu = 0.d0 - do j = 1, n - accu += VL(j,i) * VR(j,i) - do k = 1, n - eigval(i) += VL(j,i) * A(j,k) * VR(k,i) - enddo - enddo - eigval(i) *= 1.d0/accu -! print*,'eigval(i) = ',eigval(i) - enddo - !! RESORT JUST TO BE SURE - call dsort(eigval, iorder, n) - do i = 1, n - do j = 1, n - reigvec(j,i) = VR(j,iorder(i)) - leigvec(j,i) = VL(j,iorder(i)) - enddo - enddo - print*,'Checking for final reigvec/leigvec' - shift_current = max(1.d-10,shift_current) - print*,'Thr for eigenvectors = ',shift_current - call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.) - call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) - print *, ' accu_nd bi-orthog = ', accu_nd - - if(accu_nd .lt. thresh_biorthog_nondiag) then - print *, ' bi-orthogonality: ok' - else - print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog' - print*,'Eigenvectors are not bi orthonormal ..' 
- print*,'accu_nd = ',accu_nd - stop - endif - -end - - diff --git a/plugins/local/normal_order_old/NEED b/plugins/local/normal_order_old/NEED new file mode 100644 index 00000000..e8c8c478 --- /dev/null +++ b/plugins/local/normal_order_old/NEED @@ -0,0 +1 @@ +tc_scf diff --git a/plugins/local/normal_order_old/README.rst b/plugins/local/normal_order_old/README.rst new file mode 100644 index 00000000..a284fcfd --- /dev/null +++ b/plugins/local/normal_order_old/README.rst @@ -0,0 +1,4 @@ +================ +normal_order_old +================ + diff --git a/plugins/local/tc_bi_ortho/normal_ordered.irp.f b/plugins/local/normal_order_old/normal_ordered.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/normal_ordered.irp.f rename to plugins/local/normal_order_old/normal_ordered.irp.f diff --git a/plugins/local/tc_bi_ortho/normal_ordered_contractions.irp.f b/plugins/local/normal_order_old/normal_ordered_contractions.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/normal_ordered_contractions.irp.f rename to plugins/local/normal_order_old/normal_ordered_contractions.irp.f diff --git a/plugins/local/tc_bi_ortho/normal_ordered_old.irp.f b/plugins/local/normal_order_old/normal_ordered_old.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/normal_ordered_old.irp.f rename to plugins/local/normal_order_old/normal_ordered_old.irp.f diff --git a/plugins/local/tc_bi_ortho/normal_ordered_v0.irp.f b/plugins/local/normal_order_old/normal_ordered_v0.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/normal_ordered_v0.irp.f rename to plugins/local/normal_order_old/normal_ordered_v0.irp.f diff --git a/plugins/local/old_delta_tc_qmc/NEED b/plugins/local/old_delta_tc_qmc/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/old_delta_tc_qmc/NEED @@ -0,0 +1 @@ + diff --git a/plugins/local/old_delta_tc_qmc/README.rst b/plugins/local/old_delta_tc_qmc/README.rst new file mode 100644 index 00000000..1d56f96c --- /dev/null +++ b/plugins/local/old_delta_tc_qmc/README.rst @@ -0,0 +1,4 @@ +================ +old_delta_tc_qmc +================ + diff --git a/plugins/local/tc_bi_ortho/compute_deltamu_right.irp.f b/plugins/local/old_delta_tc_qmc/compute_deltamu_right.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/compute_deltamu_right.irp.f rename to plugins/local/old_delta_tc_qmc/compute_deltamu_right.irp.f diff --git a/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f b/plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f similarity index 88% rename from plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f rename to plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f index 0aff9980..135f9d17 100644 --- a/plugins/local/tc_bi_ortho/dressing_vectors_lr.irp.f +++ b/plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f @@ -27,7 +27,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta) i = 1 j = 1 - call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) + call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) call hmat_bi_ortho (psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot) delta = 0.d0 @@ -39,7 +39,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta) do j = 1, ndet ! 
< I |Htilde | J > - call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) + call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) ! < I |H | J > call hmat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot) @@ -78,7 +78,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta) i = 1 j = 1 - call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) + call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) delta = 0.d0 !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8) & @@ -88,7 +88,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta) do j = 1, ndet ! < I |Htilde | J > - call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) + call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot) delta(i) = delta(i) + psicoef(j) * htc_tot enddo diff --git a/plugins/local/tc_keywords/tc_keywords.irp.f b/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f similarity index 82% rename from plugins/local/tc_keywords/tc_keywords.irp.f rename to plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f index 3bc68550..5ff08bd6 100644 --- a/plugins/local/tc_keywords/tc_keywords.irp.f +++ b/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f @@ -1,4 +1,4 @@ -program tc_keywords +program old_delta_tc_qmc implicit none BEGIN_DOC ! TODO : Put the documentation of the program here diff --git a/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f b/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f index a3f1b6ef..cb7cdb22 100644 --- a/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f +++ b/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f @@ -183,11 +183,3 @@ BEGIN_PROVIDER [ double precision, x_W_ij_erf_rk, ( n_points_final_grid,3,mo_num END_PROVIDER -BEGIN_PROVIDER [ double precision, sqrt_weight_at_r, (n_points_final_grid)] - implicit none - integer :: ipoint - do ipoint = 1, n_points_final_grid - sqrt_weight_at_r(ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) - enddo -END_PROVIDER - diff --git a/plugins/local/slater_tc/NEED b/plugins/local/slater_tc/NEED new file mode 100644 index 00000000..a8669866 --- /dev/null +++ b/plugins/local/slater_tc/NEED @@ -0,0 +1,8 @@ +determinants +normal_order_old +bi_ort_ints +bi_ortho_mos +tc_keywords +non_hermit_dav +dav_general_mat +tc_scf diff --git a/plugins/local/tc_bi_ortho/h_mat_triple.irp.f b/plugins/local/slater_tc/h_mat_triple.irp.f similarity index 53% rename from plugins/local/tc_bi_ortho/h_mat_triple.irp.f rename to plugins/local/slater_tc/h_mat_triple.irp.f index 6f5697a2..9cb4b60a 100644 --- a/plugins/local/tc_bi_ortho/h_mat_triple.irp.f +++ b/plugins/local/slater_tc/h_mat_triple.irp.f @@ -1,196 +1,3 @@ -subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase) - use bitmasks - BEGIN_DOC -! returns the array, for each spin, of holes/particles between key_i and key_j -! -! 
with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j> - END_DOC - include 'utils/constants.include.F' - implicit none - integer, intent(in) :: Nint - integer(bit_kind), intent(in) :: key_j(Nint,2),key_i(Nint,2) - integer, intent(out) :: holes_array(100,2),particles_array(100,2),degree_array(2) - double precision, intent(out) :: phase - integer :: ispin,k,i,pos - integer(bit_kind) :: key_hole, key_particle - integer(bit_kind) :: xorvec(N_int_max,2) - holes_array = -1 - particles_array = -1 - degree_array = 0 - do i = 1, N_int - xorvec(i,1) = xor( key_i(i,1), key_j(i,1)) - xorvec(i,2) = xor( key_i(i,2), key_j(i,2)) - degree_array(1) += popcnt(xorvec(i,1)) - degree_array(2) += popcnt(xorvec(i,2)) - enddo - degree_array(1) = shiftr(degree_array(1),1) - degree_array(2) = shiftr(degree_array(2),1) - - do ispin = 1, 2 - k = 1 - !!! GETTING THE HOLES - do i = 1, N_int - key_hole = iand(xorvec(i,ispin),key_i(i,ispin)) - do while(key_hole .ne.0_bit_kind) - pos = trailz(key_hole) - holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos - key_hole = ibclr(key_hole,pos) - k += 1 - if(k .gt.100)then - print*,'WARNING in get_excitation_general' - print*,'More than a 100-th excitation for spin ',ispin - print*,'stoping ...' - stop - endif - enddo - enddo - enddo - do ispin = 1, 2 - k = 1 - !!! GETTING THE PARTICLES - do i = 1, N_int - key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin)) - do while(key_particle .ne.0_bit_kind) - pos = trailz(key_particle) - particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos - key_particle = ibclr(key_particle,pos) - k += 1 - if(k .gt.100)then - print*,'WARNING in get_excitation_general ' - print*,'More than a 100-th excitation for spin ',ispin - print*,'stoping ...' - stop - endif - enddo - enddo - enddo - integer :: h,p, i_ok - integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:) - integer :: exc(0:2,2,2) - double precision :: phase_tmp - allocate(det_i(Nint,2),det_ip(N_int,2)) - det_i = key_i - phase = 1.d0 - do ispin = 1, 2 - do i = 1, degree_array(ispin) - h = holes_array(i,ispin) - p = particles_array(i,ispin) - det_ip = det_i - call do_single_excitation(det_ip,h,p,ispin,i_ok) - if(i_ok == -1)then - print*,'excitation was not possible ' - stop - endif - call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint) - phase *= phase_tmp - det_i = det_ip - enddo - enddo - -end - -subroutine get_holes_general(key_i, key_j,Nint, holes_array) - use bitmasks - BEGIN_DOC -! returns the array, per spin, of holes between key_i and key_j -! -! with the following convention: a_{hole}|key_i> --> |key_j> - END_DOC - implicit none - integer, intent(in) :: Nint - integer(bit_kind), intent(in) :: key_j(Nint,2),key_i(Nint,2) - integer, intent(out) :: holes_array(100,2) - integer(bit_kind) :: key_hole - integer :: ispin,k,i,pos - holes_array = -1 - do ispin = 1, 2 - k = 1 - do i = 1, N_int - key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin)) - do while(key_hole .ne.0_bit_kind) - pos = trailz(key_hole) - holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos - key_hole = ibclr(key_hole,pos) - k += 1 - if(k .gt.100)then - print*,'WARNING in get_holes_general' - print*,'More than a 100-th excitation for spin ',ispin - print*,'stoping ...' - stop - endif - enddo - enddo - enddo -end - -subroutine get_particles_general(key_i, key_j,Nint,particles_array) - use bitmasks - BEGIN_DOC -! returns the array, per spin, of particles between key_i and key_j -! -! 
with the following convention: a^dagger_{particle}|key_i> --> |key_j> - END_DOC - implicit none - integer, intent(in) :: Nint - integer(bit_kind), intent(in) :: key_j(Nint,2),key_i(Nint,2) - integer, intent(out) :: particles_array(100,2) - integer(bit_kind) :: key_particle - integer :: ispin,k,i,pos - particles_array = -1 - do ispin = 1, 2 - k = 1 - do i = 1, N_int - key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin)) - do while(key_particle .ne.0_bit_kind) - pos = trailz(key_particle) - particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos - key_particle = ibclr(key_particle,pos) - k += 1 - if(k .gt.100)then - print*,'WARNING in get_holes_general' - print*,'More than a 100-th excitation for spin ',ispin - print*,'Those are the two determinants' - call debug_det(key_i, N_int) - call debug_det(key_j, N_int) - print*,'stoping ...' - stop - endif - enddo - enddo - enddo -end - -subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase) - implicit none - integer, intent(in) :: degree(2), Nint - integer(bit_kind), intent(in) :: key_i(Nint,2) - integer, intent(in) :: holes_array(100,2),particles_array(100,2) - double precision, intent(out) :: phase - integer :: i,ispin,h,p, i_ok - integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:) - integer :: exc(0:2,2,2) - double precision :: phase_tmp - allocate(det_i(Nint,2),det_ip(N_int,2)) - det_i = key_i - phase = 1.d0 - do ispin = 1, 2 - do i = 1, degree(ispin) - h = holes_array(i,ispin) - p = particles_array(i,ispin) - det_ip = det_i - call do_single_excitation(det_ip,h,p,ispin,i_ok) - if(i_ok == -1)then - print*,'excitation was not possible ' - stop - endif - call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint) - phase *= phase_tmp - det_i = det_ip - enddo - enddo - -end - subroutine H_tc_s2_u_0_with_pure_three(v_0, s_0, u_0, N_st, sze) BEGIN_DOC ! Computes $v_0 = H^TC | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS diff --git a/plugins/local/tc_bi_ortho/h_tc_s2_u0.irp.f b/plugins/local/slater_tc/h_tc_s2_u0.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/h_tc_s2_u0.irp.f rename to plugins/local/slater_tc/h_tc_s2_u0.irp.f diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f similarity index 79% rename from plugins/local/tc_bi_ortho/slater_tc_opt.irp.f rename to plugins/local/slater_tc/slater_tc_opt.irp.f index 59efc943..bf954882 100644 --- a/plugins/local/tc_bi_ortho/slater_tc_opt.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt.irp.f @@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho() implicit none double precision :: t1, t2 - PROVIDE ao_two_e_integrals_in_map - print *, ' start provide_all_three_ints_bi_ortho' call wall_time(t1) @@ -131,9 +129,9 @@ subroutine htilde_mu_mat_opt_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree, endif - if(degree==0) then - htot += nuclear_repulsion - endif +! if(degree==0) then +! htot += nuclear_repulsion +! endif end @@ -181,3 +179,48 @@ end ! --- +subroutine htilde_mu_mat_opt_bi_ortho_no_3e_both(key_j, key_i, Nint, hji,hij) + + BEGIN_DOC + ! + ! where |key_j> is developed on the LEFT basis and |key_i> is developed on the RIGHT basis + !! + ! Returns the detail of the matrix element WITHOUT ANY CONTRIBUTION FROM THE THREE ELECTRON TERMS + !! WARNING !! + ! + ! Non hermitian !! + ! 
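The hole/particle extraction above and the excitation-degree dispatch used throughout these matrix-element routines rest on the same bit manipulations: XOR the two determinants, AND with one of them to isolate holes or particles, and halve the popcount to obtain the degree. A minimal Python sketch with one integer bitmask per spin (the IRPF90 routines work on bit_kind arrays, but the logic is the same):

    # Minimal sketch of the determinant bit tricks (one Python int per spin channel).
    # Not the package routines; orbital indices are 1-based as in the Fortran.

    def excitation_degree(det_i, det_j):
        """Number of holes (= particles) per spin between two determinants."""
        return tuple(bin(det_i[s] ^ det_j[s]).count("1") // 2 for s in (0, 1))

    def holes_and_particles(det_i, det_j, spin):
        """1-based orbital indices of holes (occupied in i, empty in j) and particles."""
        diff = det_i[spin] ^ det_j[spin]
        holes = diff & det_i[spin]
        particles = diff & det_j[spin]
        to_list = lambda mask: [k + 1 for k in range(mask.bit_length()) if (mask >> k) & 1]
        return to_list(holes), to_list(particles)

    if __name__ == "__main__":
        # alpha: orbitals 1,2 occupied; beta: orbitals 1,3 occupied (bit k-1 <-> orbital k)
        ket = (0b011, 0b101)
        bra = (0b101, 0b101)                          # alpha single excitation 2 -> 3
        print(excitation_degree(ket, bra))            # (1, 0)
        print(holes_and_particles(ket, bra, spin=0))  # ([2], [3])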
+ END_DOC + + use bitmasks + + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2) + double precision, intent(out) :: hji,hij + integer :: degree + + hji = 0.d0 + hij = 0.d0 + + call get_excitation_degree(key_i, key_j, degree, Nint) + if(degree.gt.2) return + + if(degree == 0) then + call diag_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_i,hji) + hij = hji + else if (degree == 1) then + call single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint,key_j, key_i , hji,hij) + else if(degree == 2) then + call double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij) + endif + + if(degree==0) then + hji += nuclear_repulsion + hij += nuclear_repulsion + endif + +end + +! --- + diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f b/plugins/local/slater_tc/slater_tc_opt_diag.irp.f similarity index 61% rename from plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f rename to plugins/local/slater_tc/slater_tc_opt_diag.irp.f index 78f9dc66..3c5a5d12 100644 --- a/plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt_diag.irp.f @@ -19,13 +19,13 @@ PROVIDE HF_bitmask PROVIDE mo_l_coef mo_r_coef - call diag_htilde_mu_mat_bi_ortho_slow(N_int, HF_bitmask, hmono, htwoe, htot) + call diag_htc_bi_orth_2e_brute(N_int, HF_bitmask, hmono, htwoe, htot) ref_tc_energy_1e = hmono ref_tc_energy_2e = htwoe if(three_body_h_tc) then - call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree) + call diag_htc_bi_orth_3e_brute(N_int, HF_bitmask, hthree) ref_tc_energy_3e = hthree else ref_tc_energy_3e = 0.d0 @@ -524,3 +524,310 @@ end ! --- +subroutine diag_htc_bi_orth_2e_brute(Nint, key_i, hmono, htwoe, htot) + + BEGIN_DOC + ! + ! diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS + ! + END_DOC + + use bitmasks + + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_i(Nint,2) + double precision, intent(out) :: hmono,htwoe,htot + integer :: occ(Nint*bit_kind_size,2) + integer :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk + double precision :: get_mo_two_e_integral_tc_int + integer(bit_kind) :: key_i_core(Nint,2) + + PROVIDE mo_bi_ortho_tc_two_e + + hmono = 0.d0 + htwoe = 0.d0 + htot = 0.d0 + + call bitstring_to_list_ab(key_i, occ, Ne, Nint) + + do ispin = 1, 2 + do i = 1, Ne(ispin) + ii = occ(i,ispin) + hmono += mo_bi_ortho_tc_one_e(ii,ii) + enddo + enddo + + ! alpha/beta two-body + ispin = 1 + jspin = 2 + do i = 1, Ne(ispin) ! electron 1 (so it can be associated to mu(r1)) + ii = occ(i,ispin) + do j = 1, Ne(jspin) ! electron 2 + jj = occ(j,jspin) + htwoe += mo_bi_ortho_tc_two_e(jj,ii,jj,ii) + enddo + enddo + + ! alpha/alpha two-body + do i = 1, Ne(ispin) + ii = occ(i,ispin) + do j = i+1, Ne(ispin) + jj = occ(j,ispin) + htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii) + enddo + enddo + + ! beta/beta two-body + do i = 1, Ne(jspin) + ii = occ(i,jspin) + do j = i+1, Ne(jspin) + jj = occ(j,jspin) + htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii) + enddo + enddo + + htot = hmono + htwoe + +end + +! --- + +subroutine diag_htc_bi_orth_3e_brute(Nint, key_i, hthree) + + BEGIN_DOC + ! 
diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS + END_DOC + + use bitmasks + + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_i(Nint,2) + double precision, intent(out) :: hthree + integer :: occ(Nint*bit_kind_size,2) + integer :: Ne(2),i,j,ii,jj,ispin,jspin,m,mm + integer(bit_kind) :: key_i_core(Nint,2) + double precision :: direct_int, exchange_int, ref + double precision, external :: sym_3_e_int_from_6_idx_tensor + double precision, external :: three_e_diag_parrallel_spin + + PROVIDE mo_l_coef mo_r_coef + + if(core_tc_op) then + do i = 1, Nint + key_i_core(i,1) = xor(key_i(i,1), core_bitmask(i,1)) + key_i_core(i,2) = xor(key_i(i,2), core_bitmask(i,2)) + enddo + call bitstring_to_list_ab(key_i_core, occ, Ne, Nint) + else + call bitstring_to_list_ab(key_i, occ, Ne, Nint) + endif + + hthree = 0.d0 + + if((Ne(1)+Ne(2)) .ge. 3) then + + ! alpha/alpha/beta three-body + do i = 1, Ne(1) + ii = occ(i,1) + do j = i+1, Ne(1) + jj = occ(j,1) + do m = 1, Ne(2) + mm = occ(m,2) + !direct_int = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor + !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor + direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii) !uses 3-idx tensor + exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii) !uses 3-idx tensor + hthree += direct_int - exchange_int + enddo + enddo + enddo + + ! beta/beta/alpha three-body + do i = 1, Ne(2) + ii = occ(i,2) + do j = i+1, Ne(2) + jj = occ(j,2) + do m = 1, Ne(1) + mm = occ(m,1) + !direct_int = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor + !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor + direct_int = three_e_3_idx_direct_bi_ort(mm,jj,ii) + exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii) + hthree += direct_int - exchange_int + enddo + enddo + enddo + + ! alpha/alpha/alpha three-body + do i = 1, Ne(1) + ii = occ(i,1) ! 1 + do j = i+1, Ne(1) + jj = occ(j,1) ! 2 + do m = j+1, Ne(1) + mm = occ(m,1) ! 3 + !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor + hthree += three_e_diag_parrallel_spin(mm,jj,ii) !uses only 3-idx tensors + enddo + enddo + enddo + + ! beta/beta/beta three-body + do i = 1, Ne(2) + ii = occ(i,2) ! 1 + do j = i+1, Ne(2) + jj = occ(j,2) ! 2 + do m = j+1, Ne(2) + mm = occ(m,2) ! 3 + !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor + hthree += three_e_diag_parrallel_spin(mm,jj,ii) !uses only 3-idx tensors + enddo + enddo + enddo + + endif + +end + + + +BEGIN_PROVIDER [ double precision, three_e_diag_parrallel_spin_prov, (mo_num, mo_num, mo_num)] + + BEGIN_DOC + ! + ! matrix element of the -L three-body operator ON A BI ORTHONORMAL BASIS + ! + ! three_e_diag_parrallel_spin_prov(m,j,i) = All combinations of the form for same spin matrix elements + ! + ! notice the -1 sign: in this way three_e_diag_parrallel_spin_prov can be directly used to compute Slater rules with a + sign + ! + END_DOC + + implicit none + integer :: i, j, m + double precision :: integral, wall1, wall0, three_e_diag_parrallel_spin + + three_e_diag_parrallel_spin_prov = 0.d0 + print *, ' Providing the three_e_diag_parrallel_spin_prov ...' + + integral = three_e_diag_parrallel_spin(1,1,1) ! 
to provide all stuffs + call wall_time(wall0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i,j,m,integral) & + !$OMP SHARED (mo_num,three_e_diag_parrallel_spin_prov) + !$OMP DO SCHEDULE (dynamic) + do i = 1, mo_num + do j = 1, mo_num + do m = j, mo_num + three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin(m,j,i) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + do i = 1, mo_num + do j = 1, mo_num + do m = 1, j + three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin_prov(j,m,i) + enddo + enddo + enddo + + call wall_time(wall1) + print *, ' wall time for three_e_diag_parrallel_spin_prov', wall1 - wall0 + +END_PROVIDER + +BEGIN_PROVIDER [ double precision, three_e_single_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num)] + + BEGIN_DOC + ! + ! matrix element of the -L three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs + ! + ! three_e_single_parrallel_spin_prov(m,j,k,i) = All combination of for same spin matrix elements + ! + ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign + ! + END_DOC + + implicit none + integer :: i, j, k, m + double precision :: integral, wall1, wall0, three_e_single_parrallel_spin + + three_e_single_parrallel_spin_prov = 0.d0 + print *, ' Providing the three_e_single_parrallel_spin_prov ...' + + integral = three_e_single_parrallel_spin(1,1,1,1) + call wall_time(wall0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i,j,k,m,integral) & + !$OMP SHARED (mo_num,three_e_single_parrallel_spin_prov) + !$OMP DO SCHEDULE (dynamic) + do i = 1, mo_num + do k = 1, mo_num + do j = 1, mo_num + do m = 1, mo_num + three_e_single_parrallel_spin_prov(m,j,k,i) = three_e_single_parrallel_spin(m,j,k,i) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(wall1) + print *, ' wall time for three_e_single_parrallel_spin_prov', wall1 - wall0 + +END_PROVIDER + + +! --- + +BEGIN_PROVIDER [ double precision, three_e_double_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num, mo_num)] + + BEGIN_DOC + ! + ! matrix element of the -L three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs + ! + ! three_e_double_parrallel_spin_prov(m,l,j,k,i) = ::: notice that i is the RIGHT MO and k is the LEFT MO + ! + ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign + END_DOC + + implicit none + integer :: i, j, k, m, l + double precision :: integral, wall1, wall0, three_e_double_parrallel_spin + + three_e_double_parrallel_spin_prov = 0.d0 + print *, ' Providing the three_e_double_parrallel_spin_prov ...' 
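For reference, the diagonal three-body term assembled in diag_htc_bi_orth_3e_brute above can be summarized in a few lines: each opposite-spin pair contributes a direct minus exchange 3-index integral, and each same-spin triple uses the fully antisymmetrized parallel-spin element. The sketch below is illustrative only; W3_dir, W3_exch and W3_para stand in for three_e_3_idx_direct_bi_ort, three_e_3_idx_exch12_bi_ort and three_e_diag_parrallel_spin, and the occupied-orbital lists are sorted and 0-based here.

    # Sketch of the diagonal three-body accumulation (not the package code).
    from itertools import combinations

    def diag_three_body(occ_a, occ_b, W3_dir, W3_exch, W3_para):
        h3 = 0.0
        for (same, other) in ((occ_a, occ_b), (occ_b, occ_a)):   # aab then bba contributions
            for i, j in combinations(same, 2):
                for m in other:
                    h3 += W3_dir[m][j][i] - W3_exch[m][j][i]
        for occ in (occ_a, occ_b):                               # aaa and bbb triples
            for i, j, m in combinations(occ, 3):
                h3 += W3_para[m][j][i]
        return h3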
+ call wall_time(wall0) + + integral = three_e_double_parrallel_spin(1,1,1,1,1) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i,j,k,m,l,integral) & + !$OMP SHARED (mo_num,three_e_double_parrallel_spin_prov) + !$OMP DO SCHEDULE (dynamic) + do i = 1, mo_num + do k = 1, mo_num + do j = 1, mo_num + do l = 1, mo_num + do m = 1, mo_num + three_e_double_parrallel_spin_prov(m,l,j,k,i) = three_e_double_parrallel_spin(m,l,j,k,i) + enddo + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(wall1) + print *, ' wall time for three_e_double_parrallel_spin_prov', wall1 - wall0 + +END_PROVIDER + diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f b/plugins/local/slater_tc/slater_tc_opt_double.irp.f similarity index 90% rename from plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f rename to plugins/local/slater_tc/slater_tc_opt_double.irp.f index 4067473c..181ae11d 100644 --- a/plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt_double.irp.f @@ -505,3 +505,63 @@ subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_j, key_i, htot) end +subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij) + + BEGIN_DOC + ! and for double excitation ONLY FOR ONE- AND TWO-BODY TERMS + !! + !! WARNING !! + ! + ! Non hermitian !! + END_DOC + + use bitmasks + + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2) + double precision, intent(out) :: hji,hij + double precision :: hmono, htwoe_ji, htwoe_ij + integer :: occ(Nint*bit_kind_size,2) + integer :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk + integer :: degree,exc(0:2,2,2) + integer :: h1, p1, h2, p2, s1, s2 + double precision :: get_mo_two_e_integral_tc_int,phase + + + call get_excitation_degree(key_i, key_j, degree, Nint) + + hmono = 0.d0 + htwoe_ji = 0.d0 + htwoe_ij = 0.d0 + hji = 0.d0 + hij = 0.d0 + + if(degree.ne.2)then + return + endif + integer :: degree_i,degree_j + call get_excitation_degree(ref_bitmask,key_i,degree_i,N_int) + call get_excitation_degree(ref_bitmask,key_j,degree_j,N_int) + call get_double_excitation(key_i, key_j, exc, phase, Nint) + call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2) + + if(s1.ne.s2)then + ! opposite spin two-body + htwoe_ji = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) + htwoe_ij = mo_bi_ortho_tc_two_e_transp(p2,p1,h2,h1) + else + ! same spin two-body + ! direct terms + htwoe_ji = mo_bi_ortho_tc_two_e(p2,p1,h2,h1) + htwoe_ij = mo_bi_ortho_tc_two_e_transp(p2,p1,h2,h1) + ! exchange terms + htwoe_ji -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1) + htwoe_ij -= mo_bi_ortho_tc_two_e_transp(p1,p2,h2,h1) + endif + htwoe_ji *= phase + hji = htwoe_ji + htwoe_ij *= phase + hij = htwoe_ij + +end diff --git a/plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f b/plugins/local/slater_tc/slater_tc_opt_single.irp.f similarity index 81% rename from plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f rename to plugins/local/slater_tc/slater_tc_opt_single.irp.f index e57cb05c..47bcbe34 100644 --- a/plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt_single.irp.f @@ -618,3 +618,145 @@ subroutine get_single_excitation_from_fock_tc_no_3e(Nint, key_i, key_j, h, p, sp end + +subroutine single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij) + + BEGIN_DOC + ! and for single excitation ONLY FOR ONE- AND TWO-BODY TERMS + !! + !! WARNING !! + ! + ! Non hermitian !! 
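The double-excitation branch just above reduces to reading one or two bi-orthogonal two-electron integrals and multiplying by the permutation phase; because the operator is non-Hermitian, the ji and ij elements are read from the integral tensor and from its transpose, respectively. A compact sketch of that branch structure (V and Vt stand in for mo_bi_ortho_tc_two_e and mo_bi_ortho_tc_two_e_transp; holes h1,h2, particles p1,p2 and spins s1,s2 follow the Fortran 1-based convention):

    # Sketch of the two-body double-excitation element above (not the package code).
    def double_exc_element(V, Vt, h1, p1, h2, p2, s1, s2, phase):
        ix = lambda k: k - 1                      # shift to 0-based indexing
        if s1 != s2:                              # opposite spin: direct term only
            hji = V [ix(p2)][ix(p1)][ix(h2)][ix(h1)]
            hij = Vt[ix(p2)][ix(p1)][ix(h2)][ix(h1)]
        else:                                     # same spin: direct minus exchange
            hji = V [ix(p2)][ix(p1)][ix(h2)][ix(h1)] - V [ix(p1)][ix(p2)][ix(h2)][ix(h1)]
            hij = Vt[ix(p2)][ix(p1)][ix(h2)][ix(h1)] - Vt[ix(p1)][ix(p2)][ix(h2)][ix(h1)]
        return phase * hji, phase * hij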
+ END_DOC + + use bitmasks + + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2) + double precision, intent(out) :: hji,hij + + double precision :: hmono, htwoe + integer :: occ(Nint*bit_kind_size,2) + integer :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk + integer :: degree,exc(0:2,2,2) + integer :: h1, p1, h2, p2, s1, s2 + double precision :: get_mo_two_e_integral_tc_int, phase + double precision :: direct_int, exchange_int_12, exchange_int_23, exchange_int_13 + integer :: other_spin(2) + integer(bit_kind) :: key_j_core(Nint,2), key_i_core(Nint,2) + + other_spin(1) = 2 + other_spin(2) = 1 + + hmono = 0.d0 + htwoe = 0.d0 + hji = 0.d0 + hij = 0.d0 + call get_excitation_degree(key_i, key_j, degree, Nint) + if(degree.ne.1)then + return + endif + call bitstring_to_list_ab(key_i, occ, Ne, Nint) + + call get_single_excitation(key_i, key_j, exc, phase, Nint) + call decode_exc(exc,1,h1,p1,h2,p2,s1,s2) + call get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h1, p1, s1, phase, hji,hij) + +end + +! --- + +subroutine get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h, p, spin, phase, hji,hij) + + use bitmasks + + implicit none + integer, intent(in) :: Nint + integer, intent(in) :: h, p, spin + double precision, intent(in) :: phase + integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2) + double precision, intent(out) :: hji,hij + double precision :: hmono_ji,htwoe_ji + double precision :: hmono_ij,htwoe_ij + + integer(bit_kind) :: differences(Nint,2) + integer(bit_kind) :: hole(Nint,2) + integer(bit_kind) :: partcl(Nint,2) + integer :: occ_hole(Nint*bit_kind_size,2) + integer :: occ_partcl(Nint*bit_kind_size,2) + integer :: n_occ_ab_hole(2),n_occ_ab_partcl(2) + integer :: i0,i + double precision :: buffer_c_ji(mo_num), buffer_x_ji(mo_num) + double precision :: buffer_c_ij(mo_num), buffer_x_ij(mo_num) + + do i = 1, mo_num + buffer_c_ji(i) = tc_2e_3idx_coulomb_integrals(i,p,h) + buffer_x_ji(i) = tc_2e_3idx_exchange_integrals(i,p,h) + buffer_c_ij(i) = tc_2e_3idx_coulomb_integrals_transp(i,p,h) + buffer_x_ij(i) = tc_2e_3idx_exchange_integrals_transp(i,p,h) + enddo + + do i = 1, Nint + differences(i,1) = xor(key_i(i,1),ref_closed_shell_bitmask(i,1)) + differences(i,2) = xor(key_i(i,2),ref_closed_shell_bitmask(i,2)) + hole(i,1) = iand(differences(i,1),ref_closed_shell_bitmask(i,1)) + hole(i,2) = iand(differences(i,2),ref_closed_shell_bitmask(i,2)) + partcl(i,1) = iand(differences(i,1),key_i(i,1)) + partcl(i,2) = iand(differences(i,2),key_i(i,2)) + enddo + + call bitstring_to_list_ab(hole, occ_hole, n_occ_ab_hole, Nint) + call bitstring_to_list_ab(partcl, occ_partcl, n_occ_ab_partcl, Nint) + hmono_ji = mo_bi_ortho_tc_one_e(p,h) + htwoe_ji = fock_op_2_e_tc_closed_shell(p,h) + hmono_ij = mo_bi_ortho_tc_one_e(h,p) + htwoe_ij = fock_op_2_e_tc_closed_shell(h,p) + + ! holes :: direct terms + do i0 = 1, n_occ_ab_hole(1) + i = occ_hole(i0,1) + htwoe_ji -= buffer_c_ji(i) + htwoe_ij -= buffer_c_ij(i) + enddo + do i0 = 1, n_occ_ab_hole(2) + i = occ_hole(i0,2) + htwoe_ji -= buffer_c_ji(i) + htwoe_ij -= buffer_c_ij(i) + enddo + + ! holes :: exchange terms + do i0 = 1, n_occ_ab_hole(spin) + i = occ_hole(i0,spin) + htwoe_ji += buffer_x_ji(i) + htwoe_ij += buffer_x_ij(i) + enddo + + ! 
particles :: direct terms + do i0 = 1, n_occ_ab_partcl(1) + i = occ_partcl(i0,1) + htwoe_ji += buffer_c_ji(i) + htwoe_ij += buffer_c_ij(i) + enddo + do i0 = 1, n_occ_ab_partcl(2) + i = occ_partcl(i0,2) + htwoe_ji += buffer_c_ji(i) + htwoe_ij += buffer_c_ij(i) + enddo + + ! particles :: exchange terms + do i0 = 1, n_occ_ab_partcl(spin) + i = occ_partcl(i0,spin) + htwoe_ji -= buffer_x_ji(i) + htwoe_ij -= buffer_x_ij(i) + enddo + htwoe_ji = htwoe_ji * phase + hmono_ji = hmono_ji * phase + hji = htwoe_ji + hmono_ji + + htwoe_ij = htwoe_ij * phase + hmono_ij = hmono_ij * phase + hij = htwoe_ij + hmono_ij + +end + diff --git a/plugins/local/tc_bi_ortho/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f similarity index 93% rename from plugins/local/tc_bi_ortho/tc_hmat.irp.f rename to plugins/local/slater_tc/tc_hmat.irp.f index abec410d..6323d129 100644 --- a/plugins/local/tc_bi_ortho/tc_hmat.irp.f +++ b/plugins/local/slater_tc/tc_hmat.irp.f @@ -22,6 +22,7 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)] if(noL_standard) then PROVIDE noL_0e + print*, "noL_0e =", noL_0e PROVIDE noL_1e PROVIDE noL_2e endif @@ -29,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)] print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...' call wall_time(t1) - call provide_all_three_ints_bi_ortho() + if(three_body_h_tc)then + call provide_all_three_ints_bi_ortho() + endif i = 1 j = 1 diff --git a/plugins/local/slater_tc_no_opt/.gitignore b/plugins/local/slater_tc_no_opt/.gitignore new file mode 100644 index 00000000..1561915b --- /dev/null +++ b/plugins/local/slater_tc_no_opt/.gitignore @@ -0,0 +1,59 @@ +IRPF90_temp/ +IRPF90_man/ +build.ninja +irpf90.make +ezfio_interface.irp.f +irpf90_entities +tags +Makefile +ao_basis +ao_one_e_ints +ao_two_e_erf_ints +ao_two_e_ints +aux_quantities +becke_numerical_grid +bitmask +cis +cisd +cipsi +davidson +davidson_dressed +davidson_undressed +density_for_dft +determinants +dft_keywords +dft_utils_in_r +dft_utils_one_e +dft_utils_two_body +dressing +dummy +electrons +ezfio_files +fci +generators_cas +generators_full +hartree_fock +iterations +kohn_sham +kohn_sham_rs +mo_basis +mo_guess +mo_one_e_ints +mo_two_e_erf_ints +mo_two_e_ints +mpi +mrpt_utils +nuclei +perturbation +pseudo +psiref_cas +psiref_utils +scf_utils +selectors_cassd +selectors_full +selectors_utils +single_ref_method +slave +tools +utils +zmq diff --git a/plugins/local/slater_tc_no_opt/NEED b/plugins/local/slater_tc_no_opt/NEED new file mode 100644 index 00000000..a8669866 --- /dev/null +++ b/plugins/local/slater_tc_no_opt/NEED @@ -0,0 +1,8 @@ +determinants +normal_order_old +bi_ort_ints +bi_ortho_mos +tc_keywords +non_hermit_dav +dav_general_mat +tc_scf diff --git a/plugins/local/slater_tc_no_opt/README.rst b/plugins/local/slater_tc_no_opt/README.rst new file mode 100644 index 00000000..90679e4c --- /dev/null +++ b/plugins/local/slater_tc_no_opt/README.rst @@ -0,0 +1,4 @@ +================ +slater_tc_no_opt +================ + diff --git a/plugins/local/tc_bi_ortho/h_biortho.irp.f b/plugins/local/slater_tc_no_opt/h_biortho.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/h_biortho.irp.f rename to plugins/local/slater_tc_no_opt/h_biortho.irp.f diff --git a/plugins/local/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f b/plugins/local/slater_tc_no_opt/h_tc_bi_ortho_psi.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/h_tc_bi_ortho_psi.irp.f rename to plugins/local/slater_tc_no_opt/h_tc_bi_ortho_psi.irp.f diff --git 
a/plugins/local/tc_bi_ortho/slater_tc_3e_slow.irp.f b/plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f similarity index 99% rename from plugins/local/tc_bi_ortho/slater_tc_3e_slow.irp.f rename to plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f index cb33d343..f7919653 100644 --- a/plugins/local/tc_bi_ortho/slater_tc_3e_slow.irp.f +++ b/plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f @@ -1,7 +1,7 @@ ! --- -subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree) +subroutine diag_htc_bi_orth_3e_brute(Nint, key_i, hthree) BEGIN_DOC ! diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS diff --git a/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f b/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f new file mode 100644 index 00000000..0fcc587f --- /dev/null +++ b/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f @@ -0,0 +1,7 @@ +program slater_tc_no_opt + implicit none + BEGIN_DOC +! TODO : Put the documentation of the program here + END_DOC + print *, 'Hello world' +end diff --git a/plugins/local/tc_bi_ortho/slater_tc_slow.irp.f b/plugins/local/slater_tc_no_opt/slater_tc_slow.irp.f similarity index 80% rename from plugins/local/tc_bi_ortho/slater_tc_slow.irp.f rename to plugins/local/slater_tc_no_opt/slater_tc_slow.irp.f index caf7d665..b06fd12f 100644 --- a/plugins/local/tc_bi_ortho/slater_tc_slow.irp.f +++ b/plugins/local/slater_tc_no_opt/slater_tc_slow.irp.f @@ -61,7 +61,7 @@ subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, if(degree.gt.2) return if(degree == 0) then - call diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot) + call diag_htc_bi_orth_2e_brute(Nint, key_i, hmono, htwoe, htot) else if (degree == 1) then call single_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot) else if(degree == 2) then @@ -76,7 +76,7 @@ subroutine htilde_mu_mat_bi_ortho_slow(key_j, key_i, Nint, hmono, htwoe, hthree, else if((degree == 1) .and. (elec_num .gt. 2) .and. three_e_4_idx_term) then call single_htilde_three_body_ints_bi_ort_slow(Nint, key_j, key_i, hthree) else if((degree == 0) .and. (elec_num .gt. 2) .and. three_e_3_idx_term) then - call diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree) + call diag_htc_bi_orth_3e_brute(Nint, key_i, hthree) endif endif @@ -95,75 +95,6 @@ end ! --- -subroutine diag_htilde_mu_mat_bi_ortho_slow(Nint, key_i, hmono, htwoe, htot) - - BEGIN_DOC - ! - ! diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS - ! - END_DOC - - use bitmasks - - implicit none - integer, intent(in) :: Nint - integer(bit_kind), intent(in) :: key_i(Nint,2) - double precision, intent(out) :: hmono,htwoe,htot - integer :: occ(Nint*bit_kind_size,2) - integer :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk - double precision :: get_mo_two_e_integral_tc_int - integer(bit_kind) :: key_i_core(Nint,2) - - PROVIDE mo_bi_ortho_tc_two_e - - hmono = 0.d0 - htwoe = 0.d0 - htot = 0.d0 - - call bitstring_to_list_ab(key_i, occ, Ne, Nint) - - do ispin = 1, 2 - do i = 1, Ne(ispin) - ii = occ(i,ispin) - hmono += mo_bi_ortho_tc_one_e(ii,ii) - enddo - enddo - - ! alpha/beta two-body - ispin = 1 - jspin = 2 - do i = 1, Ne(ispin) ! electron 1 (so it can be associated to mu(r1)) - ii = occ(i,ispin) - do j = 1, Ne(jspin) ! electron 2 - jj = occ(j,jspin) - htwoe += mo_bi_ortho_tc_two_e(jj,ii,jj,ii) - enddo - enddo - - ! 
alpha/alpha two-body - do i = 1, Ne(ispin) - ii = occ(i,ispin) - do j = i+1, Ne(ispin) - jj = occ(j,ispin) - htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii) - enddo - enddo - - ! beta/beta two-body - do i = 1, Ne(jspin) - ii = occ(i,jspin) - do j = i+1, Ne(jspin) - jj = occ(j,jspin) - htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii) - enddo - enddo - - htot = hmono + htwoe - -end - -! --- - subroutine double_htilde_mu_mat_bi_ortho_slow(Nint, key_j, key_i, hmono, htwoe, htot) BEGIN_DOC diff --git a/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f b/plugins/local/slater_tc_no_opt/test_tc_bi_ortho.irp.f similarity index 96% rename from plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f rename to plugins/local/slater_tc_no_opt/test_tc_bi_ortho.irp.f index 369efd15..559c0200 100644 --- a/plugins/local/tc_bi_ortho/test_tc_bi_ortho.irp.f +++ b/plugins/local/slater_tc_no_opt/test_tc_bi_ortho.irp.f @@ -88,7 +88,7 @@ subroutine test_slater_tc_opt i_count = 0.d0 do i = 1, N_det do j = 1,N_det - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hnewmono, hnewtwoe, hnewthree, hnewtot) if(dabs(htot).gt.1.d-15)then i_count += 1.D0 @@ -124,7 +124,7 @@ subroutine timing_tot do j = 1, N_det ! call get_excitation_degree(psi_det(1,1,j), psi_det(1,1,i),degree,N_int) i_count += 1.d0 - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) enddo enddo call wall_time(wall1) @@ -171,7 +171,7 @@ subroutine timing_diag do i = 1, N_det do j = i,i i_count += 1.d0 - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) enddo enddo call wall_time(wall1) @@ -208,7 +208,7 @@ subroutine timing_single if(degree.ne.1)cycle i_count += 1.d0 call wall_time(wall0) - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) call wall_time(wall1) accu += wall1 - wall0 enddo @@ -250,7 +250,7 @@ subroutine timing_double if(degree.ne.2)cycle i_count += 1.d0 call wall_time(wall0) - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,i), N_int, hmono, htwoe, hthree, htot) call wall_time(wall1) accu += wall1 - wall0 enddo diff --git a/plugins/local/spher_harm/.gitignore b/plugins/local/spher_harm/.gitignore new file mode 100644 index 00000000..1561915b --- /dev/null +++ b/plugins/local/spher_harm/.gitignore @@ -0,0 +1,59 @@ +IRPF90_temp/ +IRPF90_man/ +build.ninja +irpf90.make +ezfio_interface.irp.f +irpf90_entities +tags +Makefile +ao_basis +ao_one_e_ints +ao_two_e_erf_ints +ao_two_e_ints +aux_quantities +becke_numerical_grid +bitmask +cis +cisd +cipsi +davidson +davidson_dressed +davidson_undressed +density_for_dft +determinants +dft_keywords +dft_utils_in_r +dft_utils_one_e +dft_utils_two_body +dressing +dummy +electrons +ezfio_files +fci +generators_cas +generators_full +hartree_fock +iterations +kohn_sham +kohn_sham_rs 
+mo_basis +mo_guess +mo_one_e_ints +mo_two_e_erf_ints +mo_two_e_ints +mpi +mrpt_utils +nuclei +perturbation +pseudo +psiref_cas +psiref_utils +scf_utils +selectors_cassd +selectors_full +selectors_utils +single_ref_method +slave +tools +utils +zmq diff --git a/plugins/local/spher_harm/NEED b/plugins/local/spher_harm/NEED new file mode 100644 index 00000000..92df7f12 --- /dev/null +++ b/plugins/local/spher_harm/NEED @@ -0,0 +1 @@ +dft_utils_in_r diff --git a/plugins/local/spher_harm/README.rst b/plugins/local/spher_harm/README.rst new file mode 100644 index 00000000..9c9b12a6 --- /dev/null +++ b/plugins/local/spher_harm/README.rst @@ -0,0 +1,7 @@ +========== +spher_harm +========== + +Routines for spherical Harmonics evaluation in real space. +The main routine is "spher_harm_func_r3(r,l,m,re_ylm, im_ylm)". +The test routine is "test_spher_harm" where everything is explained in details. diff --git a/plugins/local/spher_harm/assoc_gaus_pol.irp.f b/plugins/local/spher_harm/assoc_gaus_pol.irp.f new file mode 100644 index 00000000..fa790307 --- /dev/null +++ b/plugins/local/spher_harm/assoc_gaus_pol.irp.f @@ -0,0 +1,50 @@ +double precision function plgndr(l,m,x) + integer, intent(in) :: l,m + double precision, intent(in) :: x + BEGIN_DOC + ! associated Legenre polynom P_l,m(x). Used for the Y_lm(theta,phi) + ! Taken from https://iate.oac.uncor.edu/~mario/materia/nr/numrec/f6-8.pdf + END_DOC + integer :: i,ll + double precision :: fact,pll,pmm,pmmp1,somx2 + if(m.lt.0.or.m.gt.l.or.dabs(x).gt.1.d0)then + print*,'bad arguments in plgndr' + pause + endif + pmm=1.d0 + if(m.gt.0) then + somx2=dsqrt((1.d0-x)*(1.d0+x)) + fact=1.d0 + do i=1,m + pmm=-pmm*fact*somx2 + fact=fact+2.d0 + enddo + endif ! m > 0 + if(l.eq.m) then + plgndr=pmm + else + pmmp1=x*(2*m+1)*pmm ! Compute P_m+1^m + if(l.eq.m+1) then + plgndr=pmmp1 + else ! Compute P_l^m, l> m+1 + do ll=m+2,l + pll=(x*dble(2*ll-1)*pmmp1-dble(ll+m-1)*pmm)/(ll-m) + pmm=pmmp1 + pmmp1=pll + enddo + plgndr=pll + endif ! l.eq.m+1 + endif ! l.eq.m + return +end + +double precision function ortho_assoc_gaus_pol(l1,m1,l2) + implicit none + integer, intent(in) :: l1,m1,l2 + double precision :: fact + if(l1.ne.l2)then + ortho_assoc_gaus_pol= 0.d0 + else + ortho_assoc_gaus_pol = 2.d0*fact(l1+m1) / (dble(2*l1+1)*fact(l1-m1)) + endif +end diff --git a/plugins/local/spher_harm/routines_test.irp.f b/plugins/local/spher_harm/routines_test.irp.f new file mode 100644 index 00000000..fe8fc422 --- /dev/null +++ b/plugins/local/spher_harm/routines_test.irp.f @@ -0,0 +1,231 @@ +subroutine test_spher_harm + implicit none + BEGIN_DOC + ! routine to test the generic spherical harmonics routine "spher_harm_func_r3" from R^3 --> C + ! + ! We test = delta_m1,m2 delta_l1,l2 + ! + ! The test is done through the integration on a sphere with the Lebedev grid. + END_DOC + include 'constants.include.F' + integer :: l1,m1,i,l2,m2,lmax + double precision :: r(3),weight,accu_re, accu_im,accu + double precision :: re_ylm_1, im_ylm_1,re_ylm_2, im_ylm_2 + double precision :: theta,phi,r_abs + lmax = 5 ! Maximum angular momentum until which we are going to test orthogonality conditions + do l1 = 0,lmax + do m1 = -l1 ,l1 + do l2 = 0,lmax + do m2 = -l2 ,l2 + accu_re = 0.d0 ! accumulator for the REAL part of + accu_im = 0.d0 ! accumulator for the IMAGINARY part of + accu = 0.d0 ! accumulator for the weights ==> should be \int dOmega == 4 pi + ! = \int dOmega Y_l1,m1^* Y_l2,m2 + ! 
\approx \sum_i W_i Y_l1,m1^*(r_i) Y_l2,m2(r_i) WITH r_i being on the spher of radius 1 + do i = 1, n_points_integration_angular + r(1:3) = angular_quadrature_points(i,1:3) ! ith Lebedev point (x,y,z) on the sphere of radius 1 + weight = weights_angular_points(i) ! associated Lebdev weight not necessarily positive + +!!!!!!!!!!! Test of the Cartesian --> Spherical coordinates + ! theta MUST belong to [0,pi] and phi to [0,2pi] + ! gets the cartesian to spherical change of coordinates + call cartesian_to_spherical(r,theta,phi,r_abs) + if(theta.gt.pi.or.theta.lt.0.d0)then + print*,'pb with theta, it should be in [0,pi]',theta + print*,r + endif + if(phi.gt.2.d0*pi.or.phi.lt.0.d0)then + print*,'pb with phi, it should be in [0,2 pi]',phi/pi + print*,r + endif + +!!!!!!!!!!! Routines returning the Spherical harmonics on the grid point + call spher_harm_func_r3(r,l1,m1,re_ylm_1, im_ylm_1) + call spher_harm_func_r3(r,l2,m2,re_ylm_2, im_ylm_2) + +!!!!!!!!!!! Integration of Y_l1,m1^*(r) Y_l2,m2(r) + ! = \int dOmega (re_ylm_1 -i im_ylm_1) * (re_ylm_2 +i im_ylm_2) + ! = \int dOmega (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) +i (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2) + accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) + accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2) + accu += weight + enddo + ! Test that the sum of the weights is 4 pi + if(dabs(accu - dfour_pi).gt.1.d-6)then + print*,'Problem !! The sum of the Lebedev weight is not 4 pi ..' + print*,accu + stop + endif + ! Test for the delta l1,l2 and delta m1,m2 + ! + ! Test for the off-diagonal part of the Kronecker delta + if(l1.ne.l2.or.m1.ne.m2)then + if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then + print*,'pb OFF DIAG !!!!! ' + print*,'l1,m1,l2,m2',l1,m1,l2,m2 + print*,'accu_re = ',accu_re + print*,'accu_im = ',accu_im + endif + endif + ! Test for the diagonal part of the Kronecker delta + if(l1==l2.and.m1==m2)then + if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then + print*,'pb DIAG !!!!! ' + print*,'l1,m1,l2,m2',l1,m1,l2,m2 + print*,'accu_re = ',accu_re + print*,'accu_im = ',accu_im + endif + endif + enddo + enddo + enddo + enddo +end + +subroutine test_cart + implicit none + BEGIN_DOC + ! test for the cartesian --> spherical change of coordinates + ! + ! test the routine "cartesian_to_spherical" such that the polar angle theta ranges in [0,pi] + ! + ! and the asymuthal angle phi ranges in [0,2pi] + END_DOC + include 'constants.include.F' + double precision :: r(3),theta,phi,r_abs + print*,'' + r = 0.d0 + r(1) = 1.d0 + r(2) = 1.d0 + call cartesian_to_spherical(r,theta,phi,r_abs) + print*,r + print*,phi/pi + print*,'' + r = 0.d0 + r(1) =-1.d0 + r(2) = 1.d0 + call cartesian_to_spherical(r,theta,phi,r_abs) + print*,r + print*,phi/pi + print*,'' + r = 0.d0 + r(1) =-1.d0 + r(2) =-1.d0 + call cartesian_to_spherical(r,theta,phi,r_abs) + print*,r + print*,phi/pi + print*,'' + r = 0.d0 + r(1) = 1.d0 + r(2) =-1.d0 + call cartesian_to_spherical(r,theta,phi,r_abs) + print*,r + print*,phi/pi +end + + +subroutine test_brutal_spheric + implicit none + include 'constants.include.F' + BEGIN_DOC + ! Test for the = delta_m1,m2 delta_l1,l2 using the following two dimentional integration + ! + ! \int_0^2pi d Phi \int_-1^+1 d(cos(Theta)) Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi) + ! + != \int_0^2pi d Phi \int_0^pi dTheta sin(Theta) Y_l1,m1^*(Theta,Phi) Y_l2,m2(Theta,Phi) + ! + ! 
Allows to test for the general functions "spher_harm_func_m_pos" with "spher_harm_func_expl" + END_DOC + integer :: itheta, iphi,ntheta,nphi + double precision :: theta_min, theta_max, dtheta,theta + double precision :: phi_min, phi_max, dphi,phi + double precision :: accu_re, accu_im,weight + double precision :: re_ylm_1, im_ylm_1 ,re_ylm_2, im_ylm_2,accu + integer :: l1,m1,i,l2,m2,lmax + phi_min = 0.d0 + phi_max = 2.D0 * pi + theta_min = 0.d0 + theta_max = 1.D0 * pi + ntheta = 1000 + nphi = 1000 + dphi = (phi_max - phi_min)/dble(nphi) + dtheta = (theta_max - theta_min)/dble(ntheta) + + lmax = 2 + do l1 = 0,lmax + do m1 = 0 ,l1 + do l2 = 0,lmax + do m2 = 0 ,l2 + accu_re = 0.d0 + accu_im = 0.d0 + accu = 0.d0 + theta = theta_min + do itheta = 1, ntheta + phi = phi_min + do iphi = 1, nphi +! call spher_harm_func_expl(l1,m1,theta,phi,re_ylm_1, im_ylm_1) +! call spher_harm_func_expl(l2,m2,theta,phi,re_ylm_2, im_ylm_2) + call spher_harm_func_m_pos(l1,m1,theta,phi,re_ylm_1, im_ylm_1) + call spher_harm_func_m_pos(l2,m2,theta,phi,re_ylm_2, im_ylm_2) + weight = dtheta * dphi * dsin(theta) + accu_re += weight * (re_ylm_1*re_ylm_2 + im_ylm_1*im_ylm_2) + accu_im += weight * (im_ylm_2*re_ylm_1 - im_ylm_1*re_ylm_2) + accu += weight + phi += dphi + enddo + theta += dtheta + enddo + print*,'l1,m1,l2,m2',l1,m1,l2,m2 + print*,'accu_re = ',accu_re + print*,'accu_im = ',accu_im + print*,'accu = ',accu + if(l1.ne.l2.or.m1.ne.m2)then + if(dabs(accu_re).gt.1.d-6.or.dabs(accu_im).gt.1.d-6)then + print*,'pb OFF DIAG !!!!! ' + endif + endif + if(l1==l2.and.m1==m2)then + if(dabs(accu_re-1.d0).gt.1.d-5.or.dabs(accu_im).gt.1.d-6)then + print*,'pb DIAG !!!!! ' + endif + endif + enddo + enddo + enddo + enddo + + +end + +subroutine test_assoc_leg_pol + implicit none + BEGIN_DOC +! Test for the associated Legendre Polynoms. The test is done through the orthogonality condition. + END_DOC + print *, 'Hello world' + integer :: l1,m1,ngrid,i,l2,m2 + l1 = 0 + m1 = 0 + l2 = 2 + m2 = 0 + double precision :: x, dx,xmax,accu,xmin + double precision :: plgndr,func_1,func_2,ortho_assoc_gaus_pol + ngrid = 100000 + xmax = 1.d0 + xmin = -1.d0 + dx = (xmax-xmin)/dble(ngrid) + do l2 = 0,10 + x = xmin + accu = 0.d0 + do i = 1, ngrid + func_1 = plgndr(l1,m1,x) + func_2 = plgndr(l2,m2,x) + write(33,*)x, func_1,func_2 + accu += func_1 * func_2 * dx + x += dx + enddo + print*,'l2 = ',l2 + print*,'accu = ',accu + print*,ortho_assoc_gaus_pol(l1,m1,l2) + enddo +end diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f new file mode 100644 index 00000000..e8deafb9 --- /dev/null +++ b/plugins/local/spher_harm/spher_harm.irp.f @@ -0,0 +1,7 @@ +program spher_harm + implicit none + call test_spher_harm +! call test_cart +! call test_brutal_spheric +end + diff --git a/plugins/local/spher_harm/spher_harm_func.irp.f b/plugins/local/spher_harm/spher_harm_func.irp.f new file mode 100644 index 00000000..f12c8fb9 --- /dev/null +++ b/plugins/local/spher_harm/spher_harm_func.irp.f @@ -0,0 +1,164 @@ +subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm) + implicit none + integer, intent(in) :: l,m + double precision, intent(in) :: r(3) + double precision, intent(out) :: re_ylm, im_ylm + + double precision :: theta, phi,r_abs + call cartesian_to_spherical(r,theta,phi,r_abs) + call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm) +! call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) +end + + +subroutine spher_harm_func_m_pos(l,m,theta,phi,re_ylm, im_ylm) + include 'constants.include.F' + implicit none + BEGIN_DOC +! 
Y_lm(theta,phi) with m >0 +! + END_DOC + double precision, intent(in) :: theta, phi + integer, intent(in) :: l,m + double precision, intent(out):: re_ylm,im_ylm + double precision :: prefact,fact,cos_theta,plgndr,p_lm + double precision :: tmp + prefact = dble(2*l+1)*fact(l-m)/(dfour_pi * fact(l+m)) + prefact = dsqrt(prefact) + cos_theta = dcos(theta) + p_lm = plgndr(l,m,cos_theta) + tmp = prefact * p_lm + re_ylm = dcos(dble(m)*phi) * tmp + im_ylm = dsin(dble(m)*phi) * tmp +end + +subroutine spher_harm_func(l,m,theta,phi,re_ylm, im_ylm) + implicit none + BEGIN_DOC + ! Y_lm(theta,phi) with -l l in spher_harm_func !! stopping ...' + stop + endif + if(m.ge.0)then + call spher_harm_func_m_pos(l,m,theta,phi,re_ylm_pos, im_ylm_pos) + re_ylm = re_ylm_pos + im_ylm = im_ylm_pos + else + minus_m = -m !> 0 + call spher_harm_func_m_pos(l,minus_m,theta,phi,re_ylm_pos, im_ylm_pos) + tmp = (-1)**minus_m + re_ylm = tmp * re_ylm_pos + im_ylm = -tmp * im_ylm_pos ! complex conjugate + endif +end + +subroutine cartesian_to_spherical(r,theta,phi,r_abs) + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out):: theta, phi,r_abs + double precision :: r_2,x_2_y_2,tmp + include 'constants.include.F' + x_2_y_2 = r(1)*r(1) + r(2)*r(2) + r_2 = x_2_y_2 + r(3)*r(3) + r_abs = dsqrt(r_2) + + if(r_abs.gt.1.d-20)then + theta = dacos(r(3)/r_abs) + else + theta = 0.d0 + endif + + if(.true.)then + if(dabs(r(1)).gt.0.d0)then + tmp = datan(r(2)/r(1)) +! phi = datan2(r(2),r(1)) + endif + ! From Wikipedia on Spherical Harmonics + if(r(1).gt.0.d0)then + phi = tmp + else if(r(1).lt.0.d0.and.r(2).ge.0.d0)then + phi = tmp + pi + else if(r(1).lt.0.d0.and.r(2).lt.0.d0)then + phi = tmp - pi + else if(r(1)==0.d0.and.r(2).gt.0.d0)then + phi = 0.5d0*pi + else if(r(1)==0.d0.and.r(2).lt.0.d0)then + phi =-0.5d0*pi + else if(r(1)==0.d0.and.r(2)==0.d0)then + phi = 0.d0 + endif + if(r(2).lt.0.d0.and.r(1).le.0.d0)then + tmp = pi - dabs(phi) + phi = pi + tmp + else if(r(2).lt.0.d0.and.r(1).gt.0.d0)then + phi = dtwo_pi + phi + endif + endif + + if(.false.)then + x_2_y_2 = dsqrt(x_2_y_2) + if(dabs(x_2_y_2).gt.1.d-20.and.dabs(r(2)).gt.1.d-20)then + phi = dabs(r(2))/r(2) * dacos(r(1)/x_2_y_2) + else + phi = 0.d0 + endif + endif +end + + +subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) + implicit none + BEGIN_DOC + ! Y_lm(theta,phi) with -l for same spin matrix elements - ! - ! notice the -1 sign: in this way three_e_diag_parrallel_spin_prov can be directly used to compute Slater rules with a + sign - ! - END_DOC - - implicit none - integer :: i, j, m - double precision :: integral, wall1, wall0, three_e_diag_parrallel_spin - - three_e_diag_parrallel_spin_prov = 0.d0 - print *, ' Providing the three_e_diag_parrallel_spin_prov ...' - - integral = three_e_diag_parrallel_spin(1,1,1) ! 
to provide all stuffs - call wall_time(wall0) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i,j,m,integral) & - !$OMP SHARED (mo_num,three_e_diag_parrallel_spin_prov) - !$OMP DO SCHEDULE (dynamic) - do i = 1, mo_num - do j = 1, mo_num - do m = j, mo_num - three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin(m,j,i) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - do i = 1, mo_num - do j = 1, mo_num - do m = 1, j - three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin_prov(j,m,i) - enddo - enddo - enddo - - call wall_time(wall1) - print *, ' wall time for three_e_diag_parrallel_spin_prov', wall1 - wall0 - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, three_e_single_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num)] - - BEGIN_DOC - ! - ! matrix element of the -L three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs - ! - ! three_e_single_parrallel_spin_prov(m,j,k,i) = All combination of for same spin matrix elements - ! - ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign - ! - END_DOC - - implicit none - integer :: i, j, k, m - double precision :: integral, wall1, wall0, three_e_single_parrallel_spin - - three_e_single_parrallel_spin_prov = 0.d0 - print *, ' Providing the three_e_single_parrallel_spin_prov ...' - - integral = three_e_single_parrallel_spin(1,1,1,1) - call wall_time(wall0) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i,j,k,m,integral) & - !$OMP SHARED (mo_num,three_e_single_parrallel_spin_prov) - !$OMP DO SCHEDULE (dynamic) - do i = 1, mo_num - do k = 1, mo_num - do j = 1, mo_num - do m = 1, mo_num - three_e_single_parrallel_spin_prov(m,j,k,i) = three_e_single_parrallel_spin(m,j,k,i) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - call wall_time(wall1) - print *, ' wall time for three_e_single_parrallel_spin_prov', wall1 - wall0 - -END_PROVIDER - - -! --- - -BEGIN_PROVIDER [ double precision, three_e_double_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num, mo_num)] - - BEGIN_DOC - ! - ! matrix element of the -L three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs - ! - ! three_e_double_parrallel_spin_prov(m,l,j,k,i) = ::: notice that i is the RIGHT MO and k is the LEFT MO - ! - ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign - END_DOC - - implicit none - integer :: i, j, k, m, l - double precision :: integral, wall1, wall0, three_e_double_parrallel_spin - - three_e_double_parrallel_spin_prov = 0.d0 - print *, ' Providing the three_e_double_parrallel_spin_prov ...' 
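! The three_e_*_parrallel_spin_prov providers above fill tensors of three-body
! matrix elements; the diagonal one computes only the m >= j triangle and then
! mirrors it, which is valid whenever the integral is symmetric in its first two
! indices. A minimal standalone sketch of that fill-and-mirror pattern, where
! f_sym is a hypothetical symmetric function standing in for the real integral:
program fill_and_mirror_sketch
  implicit none
  integer, parameter :: n = 4
  integer            :: i, j, m
  double precision   :: t(n,n,n)
  do i = 1, n
    do j = 1, n
      do m = j, n              ! compute the m >= j triangle only
        t(m,j,i) = f_sym(m,j,i)
      enddo
    enddo
  enddo
  do i = 1, n                  ! mirror the missing triangle
    do j = 1, n
      do m = 1, j
        t(m,j,i) = t(j,m,i)
      enddo
    enddo
  enddo
  print *, t(1,2,3), t(2,1,3)  ! identical by construction
contains
  double precision function f_sym(m,j,i)
    integer, intent(in) :: m, j, i
    f_sym = dble(m*j) + 0.1d0*dble(i)  ! symmetric in (m,j)
  end function f_sym
end program fill_and_mirror_sketch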
- call wall_time(wall0) - - integral = three_e_double_parrallel_spin(1,1,1,1,1) - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i,j,k,m,l,integral) & - !$OMP SHARED (mo_num,three_e_double_parrallel_spin_prov) - !$OMP DO SCHEDULE (dynamic) - do i = 1, mo_num - do k = 1, mo_num - do j = 1, mo_num - do l = 1, mo_num - do m = 1, mo_num - three_e_double_parrallel_spin_prov(m,l,j,k,i) = three_e_double_parrallel_spin(m,l,j,k,i) - enddo - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - call wall_time(wall1) - print *, ' wall time for three_e_double_parrallel_spin_prov', wall1 - wall0 - -END_PROVIDER - diff --git a/plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f b/plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f deleted file mode 100644 index d4c8c55d..00000000 --- a/plugins/local/tc_bi_ortho/tc_cisd_sc2.irp.f +++ /dev/null @@ -1,36 +0,0 @@ - -! --- - -program tc_cisd_sc2 - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - print *, 'Hello world' - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - read_wf = .True. - touch read_wf - - call test - -end - -! --- - -subroutine test() - implicit none -! double precision, allocatable :: dressing_dets(:),e_corr_dets(:) -! allocate(dressing_dets(N_det),e_corr_dets(N_det)) -! e_corr_dets = 0.d0 -! call get_cisd_sc2_dressing(psi_det,e_corr_dets,N_det,dressing_dets) - provide eigval_tc_cisd_sc2_bi_ortho -end diff --git a/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f b/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f deleted file mode 100644 index 4c3c0788..00000000 --- a/plugins/local/tc_bi_ortho/tc_cisd_sc2_utils.irp.f +++ /dev/null @@ -1,145 +0,0 @@ - BEGIN_PROVIDER [ double precision, reigvec_tc_cisd_sc2_bi_ortho, (N_det,N_states)] -&BEGIN_PROVIDER [ double precision, leigvec_tc_cisd_sc2_bi_ortho, (N_det,N_states)] -&BEGIN_PROVIDER [ double precision, eigval_tc_cisd_sc2_bi_ortho, (N_states)] - implicit none - integer :: it,n_real,degree,i,istate - double precision :: e_before, e_current,thr, hmono,htwoe,hthree,accu - double precision, allocatable :: e_corr_dets(:),h0j(:), h_sc2(:,:), dressing_dets(:) - double precision, allocatable :: leigvec_tc_bi_orth_tmp(:,:),reigvec_tc_bi_orth_tmp(:,:),eigval_right_tmp(:) - allocate(leigvec_tc_bi_orth_tmp(N_det,N_det),reigvec_tc_bi_orth_tmp(N_det,N_det),eigval_right_tmp(N_det)) - allocate(e_corr_dets(N_det),h0j(N_det),h_sc2(N_det,N_det),dressing_dets(N_det)) - allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),eigval_tmp(N_states)) - dressing_dets = 0.d0 - do i = 1, N_det - call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i)) - call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int) - if(degree == 1 .or. 
degree == 2)then - call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,h0j(i)) - endif - enddo - reigvec_tc_bi_orth_tmp = 0.d0 - do i = 1, N_det - reigvec_tc_bi_orth_tmp(i,1) = psi_r_coef_bi_ortho(i,1) - enddo - vec_tmp = 0.d0 - do istate = 1, N_states - vec_tmp(:,istate) = reigvec_tc_bi_orth_tmp(:,istate) - enddo - do istate = N_states+1, n_states_diag - vec_tmp(istate,istate) = 1.d0 - enddo - print*,'Diagonalizing the TC CISD ' - call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow) - do i = 1, N_det - e_corr_dets(i) = reigvec_tc_bi_orth_tmp(i,1) * h0j(i)/reigvec_tc_bi_orth_tmp(1,1) - enddo - E_before = eigval_tmp(1) - print*,'Starting from ',E_before - - e_current = 10.d0 - thr = 1.d-5 - it = 0 - dressing_dets = 0.d0 - double precision, allocatable :: H_jj(:),vec_tmp(:,:),eigval_tmp(:) - external htc_bi_ortho_calc_tdav_slow - external htcdag_bi_ortho_calc_tdav_slow - logical :: converged - do while (dabs(E_before-E_current).gt.thr) - it += 1 - E_before = E_current -! h_sc2 = htilde_matrix_elmt_bi_ortho - call get_cisd_sc2_dressing(psi_det,e_corr_dets,N_det,dressing_dets) - do i = 1, N_det -! print*,'dressing_dets(i) = ',dressing_dets(i) - h_sc2(i,i) += dressing_dets(i) - enddo - print*,'********************' - print*,'iteration ',it -! call non_hrmt_real_diag(N_det,h_sc2,& -! leigvec_tc_bi_orth_tmp,reigvec_tc_bi_orth_tmp,& -! n_real,eigval_right_tmp) -! print*,'eigval_right_tmp(1)',eigval_right_tmp(1) - vec_tmp = 0.d0 - do istate = 1, N_states - vec_tmp(:,istate) = reigvec_tc_bi_orth_tmp(:,istate) - enddo - do istate = N_states+1, n_states_diag - vec_tmp(istate,istate) = 1.d0 - enddo - call davidson_general_diag_dressed_ext_rout_nonsym_b1space(vec_tmp, H_jj, dressing_dets,eigval_tmp, N_det, n_states, n_states_diag, converged, htc_bi_ortho_calc_tdav_slow) - print*,'outside Davidson' - print*,'eigval_tmp(1) = ',eigval_tmp(1) - do i = 1, N_det - reigvec_tc_bi_orth_tmp(i,1) = vec_tmp(i,1) - e_corr_dets(i) = reigvec_tc_bi_orth_tmp(i,1) * h0j(i)/reigvec_tc_bi_orth_tmp(1,1) - enddo -! 
E_current = eigval_right_tmp(1) - E_current = eigval_tmp(1) - print*,'it, E(SC)^2 = ',it,E_current - enddo - eigval_tc_cisd_sc2_bi_ortho(1:N_states) = eigval_right_tmp(1:N_states) - reigvec_tc_cisd_sc2_bi_ortho(1:N_det,1:N_states) = reigvec_tc_bi_orth_tmp(1:N_det,1:N_states) - leigvec_tc_cisd_sc2_bi_ortho(1:N_det,1:N_states) = leigvec_tc_bi_orth_tmp(1:N_det,1:N_states) - -END_PROVIDER - -subroutine get_cisd_sc2_dressing(dets,e_corr_dets,ndet,dressing_dets) - implicit none - use bitmasks - integer, intent(in) :: ndet - integer(bit_kind), intent(in) :: dets(N_int,2,ndet) - double precision, intent(in) :: e_corr_dets(ndet) - double precision, intent(out) :: dressing_dets(ndet) - integer, allocatable :: degree(:),hole(:,:),part(:,:),spin(:,:) - integer(bit_kind), allocatable :: hole_part(:,:,:) - integer :: i,j,k, exc(0:2,2,2),h1,p1,h2,p2,s1,s2 - integer(bit_kind) :: xorvec(2,N_int) - - double precision :: phase - dressing_dets = 0.d0 - allocate(degree(ndet),hole(2,ndet),part(2,ndet), spin(2,ndet),hole_part(N_int,2,ndet)) - do i = 2, ndet - call get_excitation_degree(HF_bitmask,dets(1,1,i),degree(i),N_int) - do j = 1, N_int - hole_part(j,1,i) = xor( HF_bitmask(j,1), dets(j,1,i)) - hole_part(j,2,i) = xor( HF_bitmask(j,2), dets(j,2,i)) - enddo - if(degree(i) == 1)then - call get_single_excitation(HF_bitmask,psi_det(1,1,i),exc,phase,N_int) - else if(degree(i) == 2)then - call get_double_excitation(HF_bitmask,psi_det(1,1,i),exc,phase,N_int) - endif - call decode_exc(exc,degree,h1,p1,h2,p2,s1,s2) - hole(1,i) = h1 - hole(2,i) = h2 - part(1,i) = p1 - part(2,i) = p2 - spin(1,i) = s1 - spin(2,i) = s2 - enddo - - integer :: same - if(elec_alpha_num+elec_beta_num<3)return - do i = 2, ndet - do j = i+1, ndet - same = 0 - if(degree(i) == degree(j) .and. degree(i)==1)cycle - do k = 1, N_int - xorvec(k,1) = iand(hole_part(k,1,i),hole_part(k,1,j)) - xorvec(k,2) = iand(hole_part(k,2,i),hole_part(k,2,j)) - same += popcnt(xorvec(k,1)) + popcnt(xorvec(k,2)) - enddo -! print*,'i,j',i,j -! call debug_det(dets(1,1,i),N_int) -! call debug_det(hole_part(1,1,i),N_int) -! call debug_det(dets(1,1,j),N_int) -! call debug_det(hole_part(1,1,j),N_int) -! print*,'same = ',same - if(same.eq.0)then - dressing_dets(i) += e_corr_dets(j) - dressing_dets(j) += e_corr_dets(i) - endif - enddo - enddo - -end diff --git a/plugins/local/tc_bi_ortho/tc_h_eigvectors.irp.f b/plugins/local/tc_bi_ortho/tc_h_eigvectors.irp.f index 6bf3d99e..16844221 100644 --- a/plugins/local/tc_bi_ortho/tc_h_eigvectors.irp.f +++ b/plugins/local/tc_bi_ortho/tc_h_eigvectors.irp.f @@ -326,7 +326,13 @@ end enddo double precision, allocatable :: buffer(:,:) - allocate(buffer(N_det,N_states)) + allocate(buffer(psi_det_size,N_states)) +! print*,N_det,N_states +! print*,size(psi_l_coef_bi_ortho,1),size(psi_l_coef_bi_ortho,2) +! print*,size(leigvec_tc_bi_orth,1),size(leigvec_tc_bi_orth,2) +! print*,size(reigvec_tc_bi_orth,1),size(reigvec_tc_bi_orth,2) +! 
print*,size(psi_r_coef_bi_ortho,1),size(psi_r_coef_bi_ortho,2) + buffer = 0.d0 do k = 1, N_states do i = 1, N_det psi_l_coef_bi_ortho(i,k) = leigvec_tc_bi_orth(i,k) diff --git a/plugins/local/tc_bi_ortho/tc_utils.irp.f b/plugins/local/tc_bi_ortho/tc_utils.irp.f index 53fe5884..067e8719 100644 --- a/plugins/local/tc_bi_ortho/tc_utils.irp.f +++ b/plugins/local/tc_bi_ortho/tc_utils.irp.f @@ -2,12 +2,71 @@ subroutine write_tc_energy() implicit none - integer :: i, j, k - double precision :: hmono, htwoe, hthree, htot - double precision :: E_TC, O_TC - double precision :: E_1e, E_2e, E_3e + integer :: i, j, k + double precision :: hmono, htwoe, hthree, htot + double precision :: E_TC, O_TC + double precision :: E_1e, E_2e, E_3e + double precision, allocatable :: E_TC_tmp(:), E_1e_tmp(:), E_2e_tmp(:), E_3e_tmp(:) - do k = 1, n_states + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,1), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot) + + ! GS + ! --- + + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,1), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot) + + allocate(E_TC_tmp(N_det), E_1e_tmp(N_det), E_2e_tmp(N_det), E_3e_tmp(N_det)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE(i, j, hmono, htwoe, hthree, htot) & + !$OMP SHARED(N_det, psi_det, N_int, psi_l_coef_bi_ortho, psi_r_coef_bi_ortho, & + !$OMP E_TC_tmp, E_1e_tmp, E_2e_tmp, E_3e_tmp) + !$OMP DO + do i = 1, N_det + E_TC_tmp(i) = 0.d0 + E_1e_tmp(i) = 0.d0 + E_2e_tmp(i) = 0.d0 + E_3e_tmp(i) = 0.d0 + do j = 1, N_det + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot) + E_TC_tmp(i) = E_TC_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * htot + E_1e_tmp(i) = E_1e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * hmono + E_2e_tmp(i) = E_2e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * htwoe + E_3e_tmp(i) = E_3e_tmp(i) + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(j,1) * hthree + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + E_1e = 0.d0 + E_2e = 0.d0 + E_3e = 0.d0 + E_TC = 0.d0 + O_TC = 0.d0 + do i = 1, N_det + E_1e = E_1e + E_1e_tmp(i) + E_2e = E_2e + E_2e_tmp(i) + E_3e = E_3e + E_3e_tmp(i) + E_TC = E_TC + E_TC_tmp(i) + O_TC = O_TC + psi_l_coef_bi_ortho(i,1) * psi_r_coef_bi_ortho(i,1) + enddo + + print *, ' state :', 1 + print *, " E_TC = ", E_TC / O_TC + print *, " E_1e = ", E_1e / O_TC + print *, " E_2e = ", E_2e / O_TC + print *, " E_3e = ", E_3e / O_TC + print *, " O_TC = ", O_TC + + call ezfio_set_tc_bi_ortho_tc_gs_energy(E_TC/O_TC) + + ! --- + + ! ES + ! --- + + do k = 2, n_states E_TC = 0.d0 E_1e = 0.d0 @@ -15,7 +74,7 @@ subroutine write_tc_energy() E_3e = 0.d0 do i = 1, N_det do j = 1, N_det - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot) E_TC = E_TC + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htot E_1e = E_1e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * hmono E_2e = E_2e + psi_l_coef_bi_ortho(i,k) * psi_r_coef_bi_ortho(j,k) * htwoe @@ -37,6 +96,8 @@ subroutine write_tc_energy() enddo + deallocate(E_TC_tmp, E_1e_tmp, E_2e_tmp, E_3e_tmp) + end ! 
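! write_tc_energy above accumulates, for each component X of the TC Hamiltonian,
! E_X = sum_ij c^L_i <D_i|H_X|D_j> c^R_j and divides by the bi-orthogonal overlap
! O_TC = sum_i c^L_i c^R_i. A minimal dense sketch of that contraction for a
! generic non-Hermitian matrix (h, cl, cr are made-up data, not the actual TC
! matrix elements or coefficients):
program tc_energy_sketch
  implicit none
  integer, parameter :: n = 3
  integer            :: i, j
  double precision   :: h(n,n), cl(n), cr(n), e, o
  call random_number(h)
  call random_number(cl)
  call random_number(cr)
  e = 0.d0
  o = 0.d0
  do i = 1, n
    do j = 1, n
      e = e + cl(i) * h(i,j) * cr(j)   ! <Psi_L| H |Psi_R>
    enddo
    o = o + cl(i) * cr(i)              ! <Psi_L|Psi_R>
  enddo
  print *, 'E = ', e / o
end program tc_energy_sketch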
--- @@ -52,8 +113,8 @@ subroutine write_tc_var() SIGMA_TC = 0.d0 do j = 2, N_det - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j) - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,1), psi_det(1,1,j), N_int, hmono, htwoe, hthree, htot_1j) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot_j1) SIGMA_TC = SIGMA_TC + htot_1j * htot_j1 enddo @@ -66,3 +127,25 @@ end ! --- +subroutine write_tc_gs_var_HF() + + implicit none + integer :: i, j, k + double precision :: hmono, htwoe, hthree, htot + double precision :: SIGMA_TC + + SIGMA_TC = 0.d0 + do j = 2, N_det + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,j), psi_det(1,1,1), N_int, hmono, htwoe, hthree, htot) + SIGMA_TC = SIGMA_TC + htot * htot + enddo + + print *, " SIGMA_TC = ", SIGMA_TC + + call ezfio_set_tc_bi_ortho_tc_gs_var(SIGMA_TC) + +end + +! --- + + diff --git a/plugins/local/tc_bi_ortho/test_natorb.irp.f b/plugins/local/tc_bi_ortho/test_natorb.irp.f deleted file mode 100644 index 5b8801f7..00000000 --- a/plugins/local/tc_bi_ortho/test_natorb.irp.f +++ /dev/null @@ -1,64 +0,0 @@ - -! --- - -program test_natorb - - BEGIN_DOC - ! TODO : Reads psi_det in the EZFIO folder and prints out the left- and right-eigenvectors together with the energy. Saves the left-right wave functions at the end. - END_DOC - - implicit none - - print *, 'Hello world' - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - read_wf = .True. - touch read_wf - - call routine() - ! call test() - -end - -! --- - -subroutine routine() - - implicit none - double precision, allocatable :: fock_diag(:),eigval(:),leigvec(:,:),reigvec(:,:),mat_ref(:,:) - allocate(eigval(mo_num),leigvec(mo_num,mo_num),reigvec(mo_num,mo_num),fock_diag(mo_num),mat_ref(mo_num, mo_num)) - double precision, allocatable :: eigval_ref(:),leigvec_ref(:,:),reigvec_ref(:,:) - allocate(eigval_ref(mo_num),leigvec_ref(mo_num,mo_num),reigvec_ref(mo_num,mo_num)) - - double precision :: thr_deg - integer :: i,n_real,j - print*,'fock_matrix' - do i = 1, mo_num - fock_diag(i) = Fock_matrix_mo(i,i) - print*,i,fock_diag(i) - enddo - thr_deg = 1.d-6 - mat_ref = -one_e_dm_mo - print*,'diagonalization by block' - call diag_mat_per_fock_degen(fock_diag,mat_ref,mo_num,thr_deg,leigvec,reigvec,eigval) - call non_hrmt_bieig( mo_num, mat_ref& - , leigvec_ref, reigvec_ref& - , n_real, eigval_ref) - print*,'TEST ***********************************' - double precision :: accu_l, accu_r - do i = 1, mo_num - accu_l = 0.d0 - accu_r = 0.d0 - do j = 1, mo_num - accu_r += reigvec_ref(j,i) * reigvec(j,i) - accu_l += leigvec_ref(j,i) * leigvec(j,i) - enddo - print*,i - write(*,'(I3,X,100(F16.10,X))')i,eigval(i),eigval_ref(i),accu_l,accu_r - enddo -end diff --git a/plugins/local/tc_bi_ortho/test_normal_order.irp.f b/plugins/local/tc_bi_ortho/test_normal_order.irp.f deleted file mode 100644 index 0cf27396..00000000 --- a/plugins/local/tc_bi_ortho/test_normal_order.irp.f +++ /dev/null @@ -1,173 +0,0 @@ - -! --- - -program test_normal_order - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - print *, 'Hello world' - - my_grid_becke = .True. 
- PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - read_wf = .True. - touch read_wf - - call provide_all_three_ints_bi_ortho() - call test() - -end - -! --- - -subroutine test - implicit none - use bitmasks ! you need to include the bitmasks_module.f90 features - integer :: h1,h2,p1,p2,s1,s2,i_ok,degree,Ne(2) - integer :: exc(0:2,2,2) - integer(bit_kind), allocatable :: det_i(:,:) - double precision :: hmono,htwoe,hthree,htilde_ij,accu,phase,normal,hthree_tmp - integer, allocatable :: occ(:,:) - allocate( occ(N_int*bit_kind_size,2) ) - call bitstring_to_list_ab(ref_bitmask, occ, Ne, N_int) - allocate(det_i(N_int,2)) - s1 = 1 - s2 = 2 - accu = 0.d0 - do h1 = 1, elec_beta_num - do p1 = elec_alpha_num+1, mo_num - do h2 = 1, elec_beta_num - do p2 = elec_beta_num+1, mo_num - hthree = 0.d0 - - det_i = ref_bitmask - s1 = 1 - s2 = 2 - call do_single_excitation(det_i,h1,p1,s1,i_ok) - if(i_ok.ne.1)cycle - call do_single_excitation(det_i,h2,p2,s2,i_ok) - if(i_ok.ne.1)cycle - call htilde_mu_mat_bi_ortho_slow(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij) - call get_excitation_degree(ref_bitmask,det_i,degree,N_int) - call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int) - hthree_tmp *= phase - hthree += 0.5d0 * hthree_tmp - det_i = ref_bitmask - s1 = 2 - s2 = 1 - call do_single_excitation(det_i,h1,p1,s1,i_ok) - if(i_ok.ne.1)cycle - call do_single_excitation(det_i,h2,p2,s2,i_ok) - if(i_ok.ne.1)cycle - call htilde_mu_mat_bi_ortho_slow(det_i,HF_bitmask,N_int,hmono,htwoe,hthree_tmp,htilde_ij) - call get_excitation_degree(ref_bitmask,det_i,degree,N_int) - call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int) - hthree_tmp *= phase - hthree += 0.5d0 * hthree_tmp - - -! normal = normal_two_body_bi_orth_ab(p2,h2,p1,h1) - call give_aba_contraction(N_int, h1, h2, p1, p2, Ne, occ, normal) - if(dabs(hthree).lt.1.d-10)cycle - if(dabs(hthree-normal).gt.1.d-10)then -! print*,pp2,pp1,hh2,hh1 - print*,p2,p1,h2,h1 - print*,hthree,normal,dabs(hthree-normal) - stop - endif -! call three_comp_two_e_elem(det_i,h1,h2,p1,p2,s1,s2,normal) -! normal = eff_2_e_from_3_e_ab(p2,p1,h2,h1) - accu += dabs(hthree-normal) - enddo - enddo - enddo - enddo -print*,'accu opposite spin = ',accu -stop - -! p2=6 -! p1=5 -! h2=2 -! h1=1 - -s1 = 1 -s2 = 1 -accu = 0.d0 -do h1 = 1, elec_alpha_num - do p1 = elec_alpha_num+1, mo_num - do p2 = p1+1, mo_num - do h2 = h1+1, elec_alpha_num - det_i = ref_bitmask - call do_single_excitation(det_i,h1,p1,s1,i_ok) - if(i_ok.ne.1)cycle - call do_single_excitation(det_i,h2,p2,s2,i_ok) - if(i_ok.ne.1)cycle - call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij) - call get_excitation_degree(ref_bitmask,det_i,degree,N_int) - call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int) - integer :: hh1, pp1, hh2, pp2, ss1, ss2 - call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2) - hthree *= phase - normal = normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1) -! normal = eff_2_e_from_3_e_aa(p2,p1,h2,h1) - if(dabs(hthree).lt.1.d-10)cycle - if(dabs(hthree-normal).gt.1.d-10)then - print*,pp2,pp1,hh2,hh1 - print*,p2,p1,h2,h1 - print*,hthree,normal,dabs(hthree-normal) - stop - endif -! 
print*,hthree,normal,dabs(hthree-normal) - accu += dabs(hthree-normal) - enddo - enddo - enddo -enddo -print*,'accu same spin alpha = ',accu - - -s1 = 2 -s2 = 2 -accu = 0.d0 -do h1 = 1, elec_beta_num - do p1 = elec_beta_num+1, mo_num - do p2 = p1+1, mo_num - do h2 = h1+1, elec_beta_num - det_i = ref_bitmask - call do_single_excitation(det_i,h1,p1,s1,i_ok) - if(i_ok.ne.1)cycle - call do_single_excitation(det_i,h2,p2,s2,i_ok) - if(i_ok.ne.1)cycle - call htilde_mu_mat_bi_ortho_slow(det_i,ref_bitmask,N_int,hmono,htwoe,hthree,htilde_ij) - call get_excitation_degree(ref_bitmask,det_i,degree,N_int) - call get_excitation(ref_bitmask,det_i,exc,degree,phase,N_int) - call decode_exc(exc, 2, hh1, pp1, hh2, pp2, ss1, ss2) - hthree *= phase -! normal = normal_two_body_bi_orth_aa_bb(p2,h2,p1,h1) - normal = eff_2_e_from_3_e_bb(p2,p1,h2,h1) - if(dabs(hthree).lt.1.d-10)cycle - if(dabs(hthree-normal).gt.1.d-10)then - print*,pp2,pp1,hh2,hh1 - print*,p2,p1,h2,h1 - print*,hthree,normal,dabs(hthree-normal) - stop - endif -! print*,hthree,normal,dabs(hthree-normal) - accu += dabs(hthree-normal) - enddo - enddo - enddo -enddo -print*,'accu same spin beta = ',accu - - -end - - diff --git a/plugins/local/tc_bi_ortho/test_s2_tc.irp.f b/plugins/local/tc_bi_ortho/test_s2_tc.irp.f deleted file mode 100644 index 7c70b119..00000000 --- a/plugins/local/tc_bi_ortho/test_s2_tc.irp.f +++ /dev/null @@ -1,170 +0,0 @@ - -! --- - -program test_tc - - implicit none - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - read_wf = .True. - touch read_wf - - call provide_all_three_ints_bi_ortho() - call routine_h_triple_left - call routine_h_triple_right -! call routine_test_s2_davidson - -end - -subroutine routine_h_triple_right - implicit none - logical :: do_right - integer :: sze ,i, N_st, j - double precision :: sij, accu_e, accu_s, accu_e_0, accu_s_0 - double precision, allocatable :: v_0_ref(:,:),u_0(:,:),s_0_ref(:,:) - double precision, allocatable :: v_0_new(:,:),s_0_new(:,:) - sze = N_det - N_st = 1 - allocate(v_0_ref(N_det,1),u_0(N_det,1),s_0_ref(N_det,1),s_0_new(N_det,1),v_0_new(N_det,1)) - print*,'Checking first the Right ' - do i = 1, sze - u_0(i,1) = psi_r_coef_bi_ortho(i,1) - enddo - double precision :: wall0,wall1 - call wall_time(wall0) - call H_tc_s2_u_0_with_pure_three_omp(v_0_ref,s_0_ref, u_0,N_st,sze) - call wall_time(wall1) - print*,'time for omp',wall1 - wall0 - call wall_time(wall0) - call H_tc_s2_u_0_with_pure_three(v_0_new, s_0_new, u_0, N_st, sze) - call wall_time(wall1) - print*,'time serial ',wall1 - wall0 - accu_e = 0.d0 - accu_s = 0.d0 - do i = 1, sze - accu_e += dabs(v_0_ref(i,1) - v_0_new(i,1)) - accu_s += dabs(s_0_ref(i,1) - s_0_new(i,1)) - enddo - print*,'accu_e = ',accu_e - print*,'accu_s = ',accu_s - -end - -subroutine routine_h_triple_left - implicit none - logical :: do_right - integer :: sze ,i, N_st, j - double precision :: sij, accu_e, accu_s, accu_e_0, accu_s_0 - double precision, allocatable :: v_0_ref(:,:),u_0(:,:),s_0_ref(:,:) - double precision, allocatable :: v_0_new(:,:),s_0_new(:,:) - sze = N_det - N_st = 1 - allocate(v_0_ref(N_det,1),u_0(N_det,1),s_0_ref(N_det,1),s_0_new(N_det,1),v_0_new(N_det,1)) - print*,'Checking the Left ' - do i = 1, sze - u_0(i,1) = psi_l_coef_bi_ortho(i,1) - enddo - double precision :: wall0,wall1 - call wall_time(wall0) - call H_tc_s2_dagger_u_0_with_pure_three_omp(v_0_ref,s_0_ref, u_0,N_st,sze) - call wall_time(wall1) - print*,'time for 
omp',wall1 - wall0 - call wall_time(wall0) - call H_tc_s2_dagger_u_0_with_pure_three(v_0_new, s_0_new, u_0, N_st, sze) - call wall_time(wall1) - print*,'time serial ',wall1 - wall0 - accu_e = 0.d0 - accu_s = 0.d0 - do i = 1, sze - accu_e += dabs(v_0_ref(i,1) - v_0_new(i,1)) - accu_s += dabs(s_0_ref(i,1) - s_0_new(i,1)) - enddo - print*,'accu_e = ',accu_e - print*,'accu_s = ',accu_s - -end - - -subroutine routine_test_s2_davidson - implicit none - double precision, allocatable :: H_jj(:),vec_tmp(:,:), energies(:) , s2(:) - integer :: i,istate - logical :: converged - external H_tc_s2_dagger_u_0_opt - external H_tc_s2_u_0_opt - allocate(H_jj(N_det),vec_tmp(N_det,n_states_diag),energies(n_states_diag), s2(n_states_diag)) - do i = 1, N_det - call htilde_mu_mat_bi_ortho_tot_slow(psi_det(1,1,i), psi_det(1,1,i), N_int, H_jj(i)) - enddo - ! Preparing the left-eigenvector - print*,'Computing the left-eigenvector ' - vec_tmp = 0.d0 - do istate = 1, N_states - vec_tmp(1:N_det,istate) = psi_l_coef_bi_ortho(1:N_det,istate) - enddo - do istate = N_states+1, n_states_diag - vec_tmp(istate,istate) = 1.d0 - enddo - do istate = 1, N_states - leigvec_tc_bi_orth(1:N_det,istate) = vec_tmp(1:N_det,istate) - enddo - integer :: n_it_max - n_it_max = 1 - call davidson_hs2_nonsym_b1space(vec_tmp, H_jj, s2, energies, N_det, n_states, n_states_diag, n_it_max, converged, H_tc_s2_dagger_u_0_opt) - double precision, allocatable :: v_0_new(:,:),s_0_new(:,:) - integer :: sze,N_st - logical :: do_right - sze = N_det - N_st = 1 - do_right = .False. - allocate(s_0_new(N_det,1),v_0_new(N_det,1)) - call H_tc_s2_u_0_nstates_openmp(v_0_new,s_0_new,vec_tmp,N_st,sze, do_right) - double precision :: accu_e_0, accu_s_0 - accu_e_0 = 0.d0 - accu_s_0 = 0.d0 - do i = 1, sze - accu_e_0 += v_0_new(i,1) * vec_tmp(i,1) - accu_s_0 += s_0_new(i,1) * vec_tmp(i,1) - enddo - print*,'energies = ',energies - print*,'s2 = ',s2 - print*,'accu_e_0',accu_e_0 - print*,'accu_s_0',accu_s_0 - - ! Preparing the right-eigenvector - print*,'Computing the right-eigenvector ' - vec_tmp = 0.d0 - do istate = 1, N_states - vec_tmp(1:N_det,istate) = psi_r_coef_bi_ortho(1:N_det,istate) - enddo - do istate = N_states+1, n_states_diag - vec_tmp(istate,istate) = 1.d0 - enddo - do istate = 1, N_states - leigvec_tc_bi_orth(1:N_det,istate) = vec_tmp(1:N_det,istate) - enddo - n_it_max = 1 - call davidson_hs2_nonsym_b1space(vec_tmp, H_jj, s2, energies, N_det, n_states, n_states_diag, n_it_max, converged, H_tc_s2_u_0_opt) - sze = N_det - N_st = 1 - do_right = .True. - v_0_new = 0.d0 - s_0_new = 0.d0 - call H_tc_s2_u_0_nstates_openmp(v_0_new,s_0_new,vec_tmp,N_st,sze, do_right) - accu_e_0 = 0.d0 - accu_s_0 = 0.d0 - do i = 1, sze - accu_e_0 += v_0_new(i,1) * vec_tmp(i,1) - accu_s_0 += s_0_new(i,1) * vec_tmp(i,1) - enddo - print*,'energies = ',energies - print*,'s2 = ',s2 - print*,'accu_e_0',accu_e_0 - print*,'accu_s_0',accu_s_0 - -end diff --git a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f b/plugins/local/tc_bi_ortho/test_tc_fock.irp.f deleted file mode 100644 index f1a7cc0a..00000000 --- a/plugins/local/tc_bi_ortho/test_tc_fock.irp.f +++ /dev/null @@ -1,204 +0,0 @@ - -! --- - -program test_tc_fock - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - print *, 'Hello world' - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - read_wf = .True. - touch read_wf - - !call routine_1 - !call routine_2 -! 
call routine_3() - -! call test_3e - call routine_tot - -end - -! --- - -subroutine test_3e - implicit none - double precision :: integral_aaa,integral_aab,integral_abb,integral_bbb,accu - double precision :: hmono, htwoe, hthree, htot - call htilde_mu_mat_bi_ortho_slow(ref_bitmask, ref_bitmask, N_int, hmono, htwoe, hthree, htot) - print*,'hmono = ',hmono - print*,'htwoe = ',htwoe - print*,'hthree= ',hthree - print*,'htot = ',htot - print*,'' - print*,'' - print*,'TC_one= ',tc_hf_one_e_energy - print*,'TC_two= ',TC_HF_two_e_energy - print*,'TC_3e = ',diag_three_elem_hf - print*,'TC_tot= ',TC_HF_energy - print*,'' - print*,'' - call give_aaa_contrib(integral_aaa) - print*,'integral_aaa = ',integral_aaa - call give_aab_contrib(integral_aab) - print*,'integral_aab = ',integral_aab - call give_abb_contrib(integral_abb) - print*,'integral_abb = ',integral_abb - call give_bbb_contrib(integral_bbb) - print*,'integral_bbb = ',integral_bbb - accu = integral_aaa + integral_aab + integral_abb + integral_bbb - print*,'accu = ',accu - print*,'delta = ',hthree - accu - -end - -subroutine routine_3() - - use bitmasks ! you need to include the bitmasks_module.f90 features - - implicit none - integer :: i, a, i_ok, s1 - double precision :: hmono, htwoe, hthree, htilde_ij - double precision :: err_ai, err_tot, ref, new - integer(bit_kind), allocatable :: det_i(:,:) - - allocate(det_i(N_int,2)) - - err_tot = 0.d0 - - do s1 = 1, 2 - - det_i = ref_bitmask - call debug_det(det_i, N_int) - print*, ' HF det' - call debug_det(det_i, N_int) - - do i = 1, elec_num_tab(s1) - do a = elec_num_tab(s1)+1, mo_num ! virtual - - - det_i = ref_bitmask - call do_single_excitation(det_i, i, a, s1, i_ok) - if(i_ok == -1) then - print*, 'PB !!' - print*, i, a - stop - endif - print*, ' excited det' - call debug_det(det_i, N_int) - - call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij) - if(dabs(hthree).lt.1.d-10)cycle - ref = hthree - if(s1 == 1)then - new = fock_a_tot_3e_bi_orth(a,i) - else if(s1 == 2)then - new = fock_b_tot_3e_bi_orth(a,i) - endif - err_ai = dabs(dabs(ref) - dabs(new)) - if(err_ai .gt. 1d-7) then - print*,'s1 = ',s1 - print*, ' warning on', i, a - print*, ref,new,err_ai - endif - print*, ref,new,err_ai - err_tot += err_ai - - write(22, *) htilde_ij - enddo - enddo - enddo - - print *, ' err_tot = ', err_tot - - deallocate(det_i) - -end subroutine routine_3 - -! --- -subroutine routine_tot() - - use bitmasks ! you need to include the bitmasks_module.f90 features - - implicit none - integer :: i, a, i_ok, s1,other_spin(2) - double precision :: hmono, htwoe, hthree, htilde_ij - double precision :: err_ai, err_tot, ref, new - integer(bit_kind), allocatable :: det_i(:,:) - - allocate(det_i(N_int,2)) - other_spin(1) = 2 - other_spin(2) = 1 - - err_tot = 0.d0 - -! do s1 = 1, 2 - s1 = 2 - det_i = ref_bitmask - call debug_det(det_i, N_int) - print*, ' HF det' - call debug_det(det_i, N_int) - -! do i = 1, elec_num_tab(s1) -! do a = elec_num_tab(s1)+1, mo_num ! virtual - do i = 1, elec_beta_num - do a = elec_beta_num+1, mo_num! virtual - print*,i,a - - det_i = ref_bitmask - call do_single_excitation(det_i, i, a, s1, i_ok) - if(i_ok == -1) then - print*, 'PB !!' - print*, i, a - stop - endif - - call htilde_mu_mat_bi_ortho_slow(det_i, ref_bitmask, N_int, hmono, htwoe, hthree, htilde_ij) - print*,htilde_ij -! 
if(dabs(htilde_ij).lt.1.d-10)cycle - print*, ' excited det' - call debug_det(det_i, N_int) - - if(s1 == 1)then - new = Fock_matrix_tc_mo_alpha(a,i) - else - new = Fock_matrix_tc_mo_beta(a,i) - endif - ref = htilde_ij -! if(s1 == 1)then -! new = fock_a_tot_3e_bi_orth(a,i) -! else if(s1 == 2)then -! new = fock_b_tot_3e_bi_orth(a,i) -! endif - err_ai = dabs(dabs(ref) - dabs(new)) - if(err_ai .gt. 1d-7) then - print*,'---------' - print*,'s1 = ',s1 - print*, ' warning on', i, a - print*, ref,new,err_ai - print*,hmono, htwoe, hthree - print*,'---------' - endif - print*, ref,new,err_ai - err_tot += err_ai - - write(22, *) htilde_ij - enddo - enddo -! enddo - - print *, ' err_tot = ', err_tot - - deallocate(det_i) - -end subroutine routine_3 diff --git a/plugins/local/tc_int/EZFIO.cfg b/plugins/local/tc_int/EZFIO.cfg new file mode 100644 index 00000000..5615ce4b --- /dev/null +++ b/plugins/local/tc_int/EZFIO.cfg @@ -0,0 +1,36 @@ +[nxBlocks] +type: integer +doc: nb of x blocks in the Grid +interface: ezfio,provider,ocaml +default: 10 + +[nyBlocks] +type: integer +doc: nb of y blocks in the Grid +interface: ezfio,provider,ocaml +default: 1 + +[nzBlocks] +type: integer +doc: nb of z blocks in the Grid +interface: ezfio,provider,ocaml +default: 1 + +[blockxSize] +type: integer +doc: size of x blocks +interface: ezfio,provider,ocaml +default: 32 + +[blockySize] +type: integer +doc: size of y blocks +interface: ezfio,provider,ocaml +default: 1 + +[blockzSize] +type: integer +doc: size of z blocks +interface: ezfio,provider,ocaml +default: 1 + diff --git a/plugins/local/tc_int/LIB b/plugins/local/tc_int/LIB new file mode 100644 index 00000000..9f7e2d9f --- /dev/null +++ b/plugins/local/tc_int/LIB @@ -0,0 +1 @@ +-lcutcint diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED new file mode 100644 index 00000000..362314da --- /dev/null +++ b/plugins/local/tc_int/NEED @@ -0,0 +1,7 @@ +gpu +tc_keywords +jastrow +qmckl +becke_numerical_grid +dft_utils_in_r +bi_ortho_mos diff --git a/plugins/local/tc_int/README.rst b/plugins/local/tc_int/README.rst new file mode 100644 index 00000000..bc9e8483 --- /dev/null +++ b/plugins/local/tc_int/README.rst @@ -0,0 +1,4 @@ +====== +tc_int +====== + diff --git a/plugins/local/tc_int/compute_int_2e_ao_cpu.irp.f b/plugins/local/tc_int/compute_int_2e_ao_cpu.irp.f new file mode 100644 index 00000000..9ff5431d --- /dev/null +++ b/plugins/local/tc_int/compute_int_2e_ao_cpu.irp.f @@ -0,0 +1,233 @@ + +! --- + +program compute_int_2e_ao_cpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! 
--- + +subroutine main() + + use cutc_module + + implicit none + + integer :: m + integer :: i, j, k, l + integer :: ipoint, jpoint + + double precision :: weight1, ao_i_r, ao_k_r + + double precision :: time0, time1 + double precision :: wall_time0, wall_time1 + double precision :: wall_ttime0, wall_ttime1 + double precision :: tt1, tt2 + + double precision, allocatable :: rn(:,:), aos_data1(:,:,:), aos_data2(:,:,:) + double precision, allocatable :: grad1_u12(:,:,:), int_fct_long_range(:,:,:), c_mat(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: int_2e_ao(:,:,:,:) + + + call wall_time(time0) + print*, ' start compute_int_2e_ao_cpu' + + + ! --- + + allocate(rn(3,nucl_num)) + allocate(aos_data1(n_points_final_grid,ao_num,4)) + allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) + + do k = 1, nucl_num + rn(1,k) = nucl_coord(k,1) + rn(2,k) = nucl_coord(k,2) + rn(3,k) = nucl_coord(k,3) + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + aos_data1(ipoint,k,1) = aos_in_r_array(k,ipoint) + aos_data1(ipoint,k,2) = aos_grad_in_r_array(k,ipoint,1) + aos_data1(ipoint,k,3) = aos_grad_in_r_array(k,ipoint,2) + aos_data1(ipoint,k,4) = aos_grad_in_r_array(k,ipoint,3) + enddo + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_extra_final_grid + aos_data2(ipoint,k,1) = aos_in_r_array_extra(k,ipoint) + aos_data2(ipoint,k,2) = aos_grad_in_r_array_extra(k,ipoint,1) + aos_data2(ipoint,k,3) = aos_grad_in_r_array_extra(k,ipoint,2) + aos_data2(ipoint,k,4) = aos_grad_in_r_array_extra(k,ipoint,3) + enddo + enddo + + + allocate(int_fct_long_range(n_points_extra_final_grid,ao_num,ao_num)) + allocate(grad1_u12(n_points_extra_final_grid,n_points_final_grid,4)) + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + allocate(int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(wall_time0) + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (int_fct_long_range, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + int_fct_long_range(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for int_long_range (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid, n_points_extra_final_grid, grad1_u12) + !$OMP DO + do ipoint = 1, n_points_final_grid + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, grad1_u12(1,ipoint,1) & + , grad1_u12(1,ipoint,2) & + , grad1_u12(1,ipoint,3) & + , grad1_u12(1,ipoint,4) ) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for tc_int_bh (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + do m = 1, 4 + call dgemm("T", "N", ao_num*ao_num, n_points_final_grid, n_points_extra_final_grid, 1.d0 & + , int_fct_long_range(1,1,1), n_points_extra_final_grid, grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num) + enddo + call wall_time(wall_ttime1) + 
write(*,"(A,2X,F15.7)") ' wall time for DGEMM of integ over r2 (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time of Hermitian part (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of Hermitian part (sec) = ', (wall_ttime1 - wall_ttime0) + + + tt1 = 0.d0 + tt2 = 0.d0 + do m = 1, 3 + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + tt1 += wall_ttime1 - wall_ttime0 + + call wall_time(wall_ttime0) + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -0.5d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(wall_ttime1) + tt2 += wall_ttime1 - wall_ttime0 + enddo + write(*,"(A,2X,F15.7)") ' wall time of non-Hermitian part (sec) = ', tt1 + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of non Hermitian part (sec) = ', tt2 + + + call wall_time(wall_ttime0) + call sum_A_At(int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time of A + A.T (sec) = ', wall_ttime1 - wall_ttime0 + + + call wall_time(wall_time1) + write(*,"(A,2X,F15.7)") ' wall time on cpu (sec) = ', (wall_time1 - wall_time0) + + + deallocate(int_fct_long_range, grad1_u12, c_mat) + deallocate(int_2e_ao, int2_grad1_u12_ao) + deallocate(rn, aos_data1, aos_data2) + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for compute_int_2e_ao_cpu (sec) = ', (time1 - time0) + + return +end diff --git a/plugins/local/tc_int/compute_int_2e_ao_gpu.irp.f b/plugins/local/tc_int/compute_int_2e_ao_gpu.irp.f new file mode 100644 index 00000000..4ffdc1cb --- /dev/null +++ b/plugins/local/tc_int/compute_int_2e_ao_gpu.irp.f @@ -0,0 +1,117 @@ + +! --- + +program compute_int_2e_ao_gpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. 
+ PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + use cutc_module + + implicit none + + integer :: i, j, k, l + integer :: ipoint + + double precision :: time0, time1 + + double precision, allocatable :: rn(:,:), aos_data1(:,:,:), aos_data2(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao_gpu(:,:,:,:) + double precision, allocatable :: int_2e_ao_gpu(:,:,:,:) + + + call wall_time(time0) + print*, ' start compute_int_2e_ao_gpu' + + + ! --- + + allocate(rn(3,nucl_num)) + allocate(aos_data1(n_points_final_grid,ao_num,4)) + allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) + + do k = 1, nucl_num + rn(1,k) = nucl_coord(k,1) + rn(2,k) = nucl_coord(k,2) + rn(3,k) = nucl_coord(k,3) + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + aos_data1(ipoint,k,1) = aos_in_r_array(k,ipoint) + aos_data1(ipoint,k,2) = aos_grad_in_r_array(k,ipoint,1) + aos_data1(ipoint,k,3) = aos_grad_in_r_array(k,ipoint,2) + aos_data1(ipoint,k,4) = aos_grad_in_r_array(k,ipoint,3) + enddo + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_extra_final_grid + aos_data2(ipoint,k,1) = aos_in_r_array_extra(k,ipoint) + aos_data2(ipoint,k,2) = aos_grad_in_r_array_extra(k,ipoint,1) + aos_data2(ipoint,k,3) = aos_grad_in_r_array_extra(k,ipoint,2) + aos_data2(ipoint,k,4) = aos_grad_in_r_array_extra(k,ipoint,3) + enddo + enddo + + ! --- + + integer :: nB + integer :: sB + + PROVIDE nxBlocks nyBlocks nzBlocks + PROVIDE blockxSize blockySize blockzSize + + sB = 32 + nB = (n_points_final_grid + sB - 1) / sB + + call ezfio_set_tc_int_blockxSize(sB) + call ezfio_set_tc_int_nxBlocks(nB) + + allocate(int2_grad1_u12_ao_gpu(ao_num,ao_num,n_points_final_grid,3)) + allocate(int_2e_ao_gpu(ao_num,ao_num,ao_num,ao_num)) + + call cutc_int(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & + n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & + final_grid_points, final_weight_at_r_vector, & + final_grid_points_extra, final_weight_at_r_vector_extra, & + rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & + int2_grad1_u12_ao_gpu, int_2e_ao_gpu) + + deallocate(int_2e_ao_gpu, int2_grad1_u12_ao_gpu) + deallocate(rn, aos_data1, aos_data2) + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for compute_int_2e_ao_gpu (sec) = ', (time1 - time0) + + return +end diff --git a/plugins/local/tc_int/compute_no_cpu.irp.f b/plugins/local/tc_int/compute_no_cpu.irp.f new file mode 100644 index 00000000..54ba37ce --- /dev/null +++ b/plugins/local/tc_int/compute_no_cpu.irp.f @@ -0,0 +1,143 @@ + +! --- + +program compute_no_cpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. 
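! The block count chosen for the GPU kernel above, nB = (N + sB - 1) / sB, is the
! usual integer ceiling division: the smallest number of blocks of size sB that
! covers N grid points. A tiny standalone check (the values are illustrative):
program ceiling_div_sketch
  implicit none
  integer :: n, sb, nb
  sb = 32
  do n = 63, 65
    nb = (n + sb - 1) / sb
    print *, 'N =', n, ' block size', sb, ' -> nB =', nb, ' covers', nb*sb, 'points'
  enddo
end program ceiling_div_sketch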
+ PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + use cutc_module + + implicit none + + integer :: i, j, k, l, ipoint + double precision :: time0, time1 + double precision :: tt0, tt1 + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision :: noL_0e + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + double precision, allocatable :: noL_1e (:,:) + double precision, allocatable :: noL_2e (:,:,:,:) + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + call wall_time(time0) + print*, ' start compute_no_cpu' + + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + call wall_time(tt0) + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for reading (sec) = ', (tt1 - tt0) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + call wall_time(tt0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for 3e-tensor (sec) = ', (tt1 - tt0) + + deallocate(tmp) + + allocate(noL_2e(mo_num,mo_num,mo_num,mo_num)) + allocate(noL_1e(mo_num,mo_num)) + + call provide_no_2e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_2e(1,1,1,1)) + + call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1)) + + call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, 
elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_0e) + + deallocate(int2_grad1_u12_bimo_t) + deallocate(noL_2e) + deallocate(noL_1e) + + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for compute_no_cpu (sec) = ', (time1 - time0) + + return + +end + +! --- + + diff --git a/plugins/local/tc_int/compute_no_gpu.irp.f b/plugins/local/tc_int/compute_no_gpu.irp.f new file mode 100644 index 00000000..9e4d90cc --- /dev/null +++ b/plugins/local/tc_int/compute_no_gpu.irp.f @@ -0,0 +1,132 @@ + +! --- + +program compute_no_gpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + use cutc_module + + implicit none + + integer :: i, j, k, l, ipoint + double precision :: time0, time1 + double precision :: tt0, tt1 + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision :: noL_0e_gpu(1) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + double precision, allocatable :: noL_1e_gpu(:,:) + double precision, allocatable :: noL_2e_gpu(:,:,:,:) + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + call wall_time(time0) + print*, ' start compute_no_gpu' + + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + call wall_time(tt0) + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for reading (sec) = ', (tt1 - tt0) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + call wall_time(tt0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 
1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for 3e-tensor (sec) = ', (tt1 - tt0) + + deallocate(tmp) + + allocate(noL_2e_gpu(mo_num,mo_num,mo_num,mo_num)) + allocate(noL_1e_gpu(mo_num,mo_num)) + + call cutc_no(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_2e_gpu(1,1,1,1), noL_1e_gpu(1,1), noL_0e_gpu(1)) + + deallocate(int2_grad1_u12_bimo_t) + deallocate(noL_2e_gpu) + deallocate(noL_1e_gpu) + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for compute_no_gpu (sec) = ', (time1 - time0) + + return + +end + +! --- + + diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f new file mode 100644 index 00000000..60c0d53b --- /dev/null +++ b/plugins/local/tc_int/compute_tc_int.irp.f @@ -0,0 +1,301 @@ + +! --- + +subroutine provide_int2_grad1_u12_ao() + + BEGIN_DOC + ! + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) + ! + ! + ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) + ! = where V^TC(r_12) is the total TC operator + ! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j) + ! where: + ! + ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij > + ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! + ! tc_grad_square_ao(k,i,l,j) = -1/2 + ! + ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j ) + ! + END_DOC + + implicit none + + integer :: i, j, k, l, m, ipoint, jpoint + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: mem, n_double + double precision :: weight1, ao_k_r, ao_i_r + double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq + double precision :: time0, time1, time2, tc1, tc2, tc + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp_grad1_u12(:,:,:), tmp(:,:,:) + double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:) + + double precision, external :: get_ao_two_e_integral + + + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp + + + + print*, ' start provide_int2_grad1_u12_ao ...' 
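! The loop structure below splits the n_points_final_grid r1 points into n_pass
! full blocks of n_blocks points plus one final block of n_rest points, so that
! the temporary grad1_u12 buffer fits in the available memory. A standalone
! sketch of that partitioning arithmetic (the sizes are made up):
program block_partition_sketch
  implicit none
  integer :: n_points, n_blocks, n_rest, n_pass, i_pass, ii
  n_points = 10
  n_blocks = 3
  n_rest   = mod(n_points, n_blocks)
  n_pass   = (n_points - n_rest) / n_blocks
  do i_pass = 1, n_pass
    ii = (i_pass - 1) * n_blocks + 1
    print *, 'full block: points', ii, 'to', ii + n_blocks - 1
  enddo
  if (n_rest .gt. 0) then
    ii = n_pass * n_blocks + 1
    print *, 'last block: points', ii, 'to', ii + n_rest - 1
  endif
end program block_partition_sketch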
+ call wall_time(time0) + + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + + ! --- + ! --- + ! --- + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + + allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + + tc = 0.d0 + + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + + call wall_time(tc1) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, tmp_grad1_u12) + !$OMP DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), & + tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call dgemm("T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + enddo + + if(n_rest .gt. 0) then + + ii = n_pass*n_blocks + 1 + + call wall_time(tc1) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), & + tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call dgemm("T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + enddo + + endif + deallocate(tmp_grad1_u12) + + deallocate(tmp) + + + call wall_time(time1) + print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 + print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 + + call print_memory_usage() + + ! --- + ! --- + ! 
--- + + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(time1) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + + call print_memory_usage() + + ! --- + + call wall_time(time1) + + do m = 1, 3 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -0.5d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + enddo + + call wall_time(time2) + print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + + deallocate(c_mat) + + call print_memory_usage() + + ! --- + + call wall_time(time1) + + call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + + call print_memory_usage() + + ! --- + + double precision :: tmp_omp + + call wall_time(time1) + + PROVIDE ao_integrals_map + tmp_omp = get_ao_two_e_integral(1, 1, 1, 1, ao_integrals_map) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(time2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) 
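+  ! Only the three Cartesian gradient components (m = 1:3) of int2_grad1_u12_ao
+  ! are stored in the work file: the 4th component, -(1/2) [grad1 u(r1,r2)]^2,
+  ! has already been contracted into tc_int_2e_ao above, and the routines that
+  ! later read this file allocate only the (ao_num,ao_num,n_points_final_grid,3) part.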
+ write(11) int2_grad1_u12_ao(:,:,:,1:3) + close(11) + deallocate(int2_grad1_u12_ao) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) + do i = 1, ao_num + write(11) tc_int_2e_ao(:,:,:,i) + enddo + close(11) + + ! ---- + + deallocate(tc_int_2e_ao) + + call wall_time(time2) + print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0 + + return +end + +! --- + diff --git a/plugins/local/tc_int/compute_tc_int_gpu.irp.f b/plugins/local/tc_int/compute_tc_int_gpu.irp.f new file mode 100644 index 00000000..c2653ac6 --- /dev/null +++ b/plugins/local/tc_int/compute_tc_int_gpu.irp.f @@ -0,0 +1,315 @@ + +! --- + +subroutine provide_int2_grad1_u12_ao_gpu() + + use gpu + + BEGIN_DOC + ! + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) + ! + ! + ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) + ! = where V^TC(r_12) is the total TC operator + ! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j) + ! where: + ! + ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij > + ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! + ! tc_grad_square_ao(k,i,l,j) = -1/2 + ! + ! ao_two_e_coul(k,i,l,j) = < l k | 1/r12 | j i > = ( k i | 1/r12 | l j ) + ! + END_DOC + + implicit none + + integer :: i, j, k, l, m, ipoint, jpoint + integer :: n_blocks, n_rest, n_pass + integer :: i_blocks, i_rest, i_pass, ii + double precision :: mem, n_double + double precision :: weight1, ao_k_r, ao_i_r + double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq + double precision :: time0, time1, time2, tc1, tc2, tc + type(gpu_double4) :: int2_grad1_u12_ao + type(gpu_double3) :: tmp_grad1_u12, tmp_grad1_u12p, tmp + double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:) + + double precision, external :: get_ao_two_e_integral + + + PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra + PROVIDE final_weight_at_r_vector aos_grad_in_r_array_transp_bis final_weight_at_r_vector aos_in_r_array_transp + + + + print*, ' start provide_int2_grad1_u12_ao ...' + call wall_time(time0) + + call total_memory(mem) + mem = max(1.d0, qp_max_mem - mem) + mem = 6 + n_double = mem * 1.d8 + n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) + n_rest = int(mod(n_points_final_grid, n_blocks)) + n_pass = int((n_points_final_grid - n_rest) / n_blocks) + + call write_int(6, n_pass, 'Number of passes') + call write_int(6, n_blocks, 'Size of the blocks') + call write_int(6, n_rest, 'Size of the last block') + + ! --- + ! --- + ! 
--- + + call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4) + + call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4) + call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4) + + tc = 0.d0 + + type(gpu_stream) :: stream(4) + do i=1,4 + call gpu_stream_create(stream(i)) + enddo + + do i_pass = 1, n_pass + ii = (i_pass-1)*n_blocks + 1 + + call wall_time(tc1) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_blocks, ipoint) & + !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_blocks = 1, n_blocks + ipoint = ii - 1 + i_blocks ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), & + tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + call gpu_synchronize() + call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p) + do m = 1, 4 + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) + enddo + enddo + + if(n_rest .gt. 0) then + + ii = n_pass*n_blocks + 1 + + call wall_time(tc1) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i_rest, ipoint) & + !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) + !$OMP DO + do i_rest = 1, n_rest + ipoint = ii - 1 + i_rest ! r1 + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), & + tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4)) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tc2) + tc = tc + tc2 - tc1 + + do m = 1, 4 + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) + enddo + + endif + call gpu_synchronize() + call gpu_deallocate(tmp_grad1_u12) + call gpu_deallocate(tmp_grad1_u12p) + + do i=1,4 + call gpu_stream_destroy(stream(i)) + enddo + + + call gpu_deallocate(tmp) + + + call wall_time(time1) + print*, ' wall time for int2_grad1_u12_ao (min) = ', (time1-time0) / 60.d0 + print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 + call print_memory_usage() + +!TODO +stop + ! --- + ! --- + ! 
--- + + + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + allocate(tc_int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(time1) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' wall time of Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + do m = 1, 3 + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = 0.5d0 * final_weight_at_r_vector(ipoint) + ao_i_r = aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 & + , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + enddo + + call wall_time(time2) + print*, ' wall time of non-Hermitian part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + deallocate(c_mat) + + ! --- + + call wall_time(time1) + + call sum_A_At(tc_int_2e_ao(1,1,1,1), ao_num*ao_num) + + call wall_time(time2) + print*, ' lower- and upper-triangle of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + + PROVIDE ao_integrals_map + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, tc_int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(time2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (time2 - time1) / 60.d0 + call print_memory_usage() + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) + write(11) int2_grad1_u12_ao%f(:,:,:,1:3) + close(11) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) 
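+  ! As in the CPU version, tc_int_2e_ao is written one unformatted record per
+  ! value of the last AO index (ao_num**3 doubles per record) rather than as a
+  ! single record for the full array.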
+ do i = 1, ao_num + write(11) tc_int_2e_ao(:,:,:,i) + enddo + close(11) + + ! ---- + + call gpu_deallocate(int2_grad1_u12_ao) + deallocate(tc_int_2e_ao) + + call wall_time(time2) + print*, ' wall time for tc_int_2e_ao (min) = ', (time2-time1) / 60.d0 + call print_memory_usage() + + ! --- + + call wall_time(time1) + print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0 + + return +end + +! --- + diff --git a/plugins/local/tc_int/cutc_module.F90 b/plugins/local/tc_int/cutc_module.F90 new file mode 100644 index 00000000..0f5f6c78 --- /dev/null +++ b/plugins/local/tc_int/cutc_module.F90 @@ -0,0 +1,70 @@ + +module cutc_module + + use, intrinsic :: iso_c_binding + + implicit none + + interface + + ! --- + + subroutine cutc_int(nxBlocks, nyBlocks, nzBlocks, & + blockxSize, blockySize, blockzSize, & + n_grid1, n_grid2, n_ao, n_nuc, size_bh, & + r1, wr1, r2, wr2, rn, & + aos_data1, aos_data2, & + c_bh, m_bh, n_bh, o_bh, & + int2_grad1_u12_ao, int_2e_ao) bind(C, name = "cutc_int") + + import c_int, c_double, c_ptr + integer(c_int), intent(in), value :: nxBlocks, blockxSize + integer(c_int), intent(in), value :: nyBlocks, blockySize + integer(c_int), intent(in), value :: nzBlocks, blockzSize + integer(c_int), intent(in), value :: n_grid1, n_grid2 + integer(c_int), intent(in), value :: n_ao + integer(c_int), intent(in), value :: n_nuc + integer(c_int), intent(in), value :: size_bh + real(c_double), intent(in) :: r1(3,n_grid1), wr1(n_grid1) + real(c_double), intent(in) :: r2(3,n_grid2), wr2(n_grid2) + real(c_double), intent(in) :: rn(3,n_nuc) + real(c_double), intent(in) :: aos_data1(n_grid1,n_ao,4) + real(c_double), intent(in) :: aos_data2(n_grid2,n_ao,4) + real(c_double), intent(in) :: c_bh(size_bh,n_nuc) + integer(c_int), intent(in) :: m_bh(size_bh,n_nuc) + integer(c_int), intent(in) :: n_bh(size_bh,n_nuc) + integer(c_int), intent(in) :: o_bh(size_bh,n_nuc) + real(c_double), intent(out) :: int2_grad1_u12_ao(n_ao,n_ao,n_grid1,3) + real(c_double), intent(out) :: int_2e_ao(n_ao,n_ao,n_ao,n_ao) + + end subroutine cutc_int + + ! --- + + subroutine cutc_no(n_grid1, n_mo, ne_a, ne_b, & + wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + no_2e, no_1e, no_0e) bind(C, name = "cutc_no") + + import c_int, c_double, c_ptr + + integer(c_int), intent(in), value :: n_grid1 + integer(c_int), intent(in), value :: n_mo + integer(c_int), intent(in), value :: ne_a + integer(c_int), intent(in), value :: ne_b + real(c_double), intent(in) :: wr1(n_grid1) + real(c_double), intent(in) :: mos_l_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: mos_r_in_r(n_grid1,n_mo) + real(c_double), intent(in) :: int2_grad1_u12(n_grid1,3,n_mo,n_mo) + real(c_double), intent(out) :: no_2e(n_mo,n_mo,n_mo,n_mo) + real(c_double), intent(out) :: no_1e(n_mo,n_mo) + real(c_double), intent(out) :: no_0e(1) + + end subroutine cutc_no + + ! --- + + end interface + +end module cutc_module + + diff --git a/plugins/local/tc_int/deb_int_2e_ao_gpu.irp.f b/plugins/local/tc_int/deb_int_2e_ao_gpu.irp.f new file mode 100644 index 00000000..4bb44af9 --- /dev/null +++ b/plugins/local/tc_int/deb_int_2e_ao_gpu.irp.f @@ -0,0 +1,311 @@ + +! --- + +program deb_int_2e_ao_gpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. 
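+  ! tc_grid1_* defines the r1 ("external") grid and tc_grid2_* the r2
+  ! ("internal", extra) grid used for the inner integration over r2,
+  ! following the convention of get_grad1_u12_for_tc.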
+ PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + use cutc_module + + implicit none + + integer :: m + integer :: i, j, k, l + integer :: ipoint, jpoint + + double precision :: weight1, ao_i_r, ao_k_r + + double precision :: acc_thr, err_tot, nrm_tot, err_loc + + double precision :: time0, time1 + double precision :: wall_time0, wall_time1 + double precision :: wall_ttime0, wall_ttime1 + double precision :: tt1, tt2 + + double precision, allocatable :: rn(:,:), aos_data1(:,:,:), aos_data2(:,:,:) + double precision, allocatable :: grad1_u12(:,:,:), int_fct_long_range(:,:,:), c_mat(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_ao_gpu(:,:,:,:) + double precision, allocatable :: int_2e_ao(:,:,:,:) + double precision, allocatable :: int_2e_ao_gpu(:,:,:,:) + + + call wall_time(time0) + print*, ' start deb_int_2e_ao_gpu' + + + ! --- + + allocate(rn(3,nucl_num)) + allocate(aos_data1(n_points_final_grid,ao_num,4)) + allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) + + do k = 1, nucl_num + rn(1,k) = nucl_coord(k,1) + rn(2,k) = nucl_coord(k,2) + rn(3,k) = nucl_coord(k,3) + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + aos_data1(ipoint,k,1) = aos_in_r_array(k,ipoint) + aos_data1(ipoint,k,2) = aos_grad_in_r_array(k,ipoint,1) + aos_data1(ipoint,k,3) = aos_grad_in_r_array(k,ipoint,2) + aos_data1(ipoint,k,4) = aos_grad_in_r_array(k,ipoint,3) + enddo + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_extra_final_grid + aos_data2(ipoint,k,1) = aos_in_r_array_extra(k,ipoint) + aos_data2(ipoint,k,2) = aos_grad_in_r_array_extra(k,ipoint,1) + aos_data2(ipoint,k,3) = aos_grad_in_r_array_extra(k,ipoint,2) + aos_data2(ipoint,k,4) = aos_grad_in_r_array_extra(k,ipoint,3) + enddo + enddo + + ! --- + + integer :: nB + integer :: sB + + PROVIDE nxBlocks nyBlocks nzBlocks + PROVIDE blockxSize blockySize blockzSize + + sB = 32 + nB = (n_points_final_grid + sB - 1) / sB + + call ezfio_set_tc_int_blockxSize(sB) + call ezfio_set_tc_int_nxBlocks(nB) + + allocate(int2_grad1_u12_ao_gpu(ao_num,ao_num,n_points_final_grid,3)) + allocate(int_2e_ao_gpu(ao_num,ao_num,ao_num,ao_num)) + + call cutc_int(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & + n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & + final_grid_points, final_weight_at_r_vector, & + final_grid_points_extra, final_weight_at_r_vector_extra, & + rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & + int2_grad1_u12_ao_gpu, int_2e_ao_gpu) + + ! 
--- + + allocate(int_fct_long_range(n_points_extra_final_grid,ao_num,ao_num)) + allocate(grad1_u12(n_points_extra_final_grid,n_points_final_grid,4)) + allocate(c_mat(n_points_final_grid,ao_num,ao_num)) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + allocate(int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + call wall_time(wall_time0) + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (j, i, jpoint) & + !$OMP SHARED (int_fct_long_range, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp) + !$OMP DO SCHEDULE (static) + do j = 1, ao_num + do i = 1, ao_num + do jpoint = 1, n_points_extra_final_grid + int_fct_long_range(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for int_long_range (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (n_points_final_grid, n_points_extra_final_grid, grad1_u12) + !$OMP DO + do ipoint = 1, n_points_final_grid + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, grad1_u12(1,ipoint,1) & + , grad1_u12(1,ipoint,2) & + , grad1_u12(1,ipoint,3) & + , grad1_u12(1,ipoint,4) ) + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for tc_int_bh (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + do m = 1, 4 + call dgemm("T", "N", ao_num*ao_num, n_points_final_grid, n_points_extra_final_grid, 1.d0 & + , int_fct_long_range(1,1,1), n_points_extra_final_grid, grad1_u12(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num) + enddo + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of integ over r2 (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint) & + !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time of Hermitian part (sec) = ', (wall_ttime1 - wall_ttime0) + + + call wall_time(wall_ttime0) + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & + , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 0.d0, int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of Hermitian part (sec) = ', (wall_ttime1 - wall_ttime0) + + + tt1 = 0.d0 + tt2 = 0.d0 + do m = 1, 3 + + call wall_time(wall_ttime0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) & + !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & + !$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m) + !$OMP DO SCHEDULE (static) + do i = 1, ao_num + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + + weight1 = final_weight_at_r_vector(ipoint) + ao_i_r = 
aos_in_r_array_transp(ipoint,i) + ao_k_r = aos_in_r_array_transp(ipoint,k) + + c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m)) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(wall_ttime1) + tt1 += wall_ttime1 - wall_ttime0 + + call wall_time(wall_ttime0) + call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -0.5d0 & + , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , 1.d0, int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(wall_ttime1) + tt2 += wall_ttime1 - wall_ttime0 + enddo + write(*,"(A,2X,F15.7)") ' wall time of non-Hermitian part (sec) = ', tt1 + write(*,"(A,2X,F15.7)") ' wall time for DGEMM of non Hermitian part (sec) = ', tt2 + + + call wall_time(wall_ttime0) + call sum_A_At(int_2e_ao(1,1,1,1), ao_num*ao_num) + call wall_time(wall_ttime1) + write(*,"(A,2X,F15.7)") ' wall time of A + A.T (sec) = ', wall_ttime1 - wall_ttime0 + + + call wall_time(wall_time1) + write(*,"(A,2X,F15.7)") ' wall time on cpu (sec) = ', (wall_time1 - wall_time0) + + ! --- + + acc_thr = 1d-12 + + err_tot = 0.d0 + nrm_tot = 0.d0 + do m = 1, 3 + do ipoint = 1, n_points_final_grid + do j = 1, ao_num + do i = 1, ao_num + err_loc = dabs(int2_grad1_u12_ao(i,j,ipoint,m) - int2_grad1_u12_ao_gpu(i,j,ipoint,m)) + if(err_loc > acc_thr) then + print*, " error on", i, j, ipoint, m + print*, " CPU res", int2_grad1_u12_ao (i,j,ipoint,m) + print*, " GPU res", int2_grad1_u12_ao_gpu(i,j,ipoint,m) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(int2_grad1_u12_ao(i,j,ipoint,m)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on int2_grad1_u12_ao (%) =', 100.d0 * err_tot / nrm_tot + + + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, ao_num + do j = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + err_loc = dabs(int_2e_ao(l,k,j,i) - int_2e_ao_gpu(l,k,j,i)) + if(err_loc > acc_thr) then + print*, " error on", l, k, j, i + print*, " CPU res", int_2e_ao (l,k,j,i) + print*, " GPU res", int_2e_ao_gpu(l,k,j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(int_2e_ao(l,k,j,i)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on int_2e_ao (%) =', 100.d0 * err_tot / nrm_tot + + ! --- + + deallocate(int_fct_long_range, grad1_u12, c_mat) + deallocate(int_2e_ao, int2_grad1_u12_ao) + deallocate(int_2e_ao_gpu, int2_grad1_u12_ao_gpu) + deallocate(rn, aos_data1, aos_data2) + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for deb_int_2e_ao_gpu (sec) = ', (time1 - time0) + + return +end diff --git a/plugins/local/tc_int/deb_no_gpu.irp.f b/plugins/local/tc_int/deb_no_gpu.irp.f new file mode 100644 index 00000000..de8adb97 --- /dev/null +++ b/plugins/local/tc_int/deb_no_gpu.irp.f @@ -0,0 +1,218 @@ + +! --- + +program deb_no_gpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. 
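+  ! This debug driver recomputes the normal-ordering terms noL_0e, noL_1e and
+  ! noL_2e on the CPU (provide_no_0e/1e/2e) and on the GPU (cutc_no), then
+  ! compares them element by element against the acc_thr threshold set in main() below.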
+ PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + use cutc_module + + implicit none + + integer :: i, j, k, l, ipoint + double precision :: time0, time1 + double precision :: tt0, tt1 + double precision :: acc_thr, err_tot, nrm_tot, err_loc + double precision :: noL_0e + double precision :: noL_0e_gpu(1) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: tmp(:,:,:,:) + double precision, allocatable :: int2_grad1_u12_bimo_t(:,:,:,:) + double precision, allocatable :: noL_1e (:,:) + double precision, allocatable :: noL_1e_gpu(:,:) + double precision, allocatable :: noL_2e (:,:,:,:) + double precision, allocatable :: noL_2e_gpu(:,:,:,:) + + + PROVIDE mo_l_coef mo_r_coef + PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp + + + call wall_time(time0) + print*, ' start deb_no_gpu' + + + + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + print*, ' Reading int2_grad1_u12_ao from ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + call wall_time(tt0) + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="read") + read(11) int2_grad1_u12_ao + close(11) + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for reading (sec) = ', (tt1 - tt0) + + allocate(tmp(mo_num,mo_num,n_points_final_grid,3)) + allocate(int2_grad1_u12_bimo_t(n_points_final_grid,3,mo_num,mo_num)) + + call wall_time(tt0) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint) & + !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp) + !$OMP DO SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num) + call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num) + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(int2_grad1_u12_ao) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i, j, ipoint) & + !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t) + !$OMP DO COLLAPSE(2) SCHEDULE (dynamic) + do ipoint = 1, n_points_final_grid + do i = 1, mo_num + do j = 1, mo_num + int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1) + int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2) + int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call wall_time(tt1) + write(*,"(A,2X,F15.7)") ' wall time for 3e-tensor (sec) = ', (tt1 - tt0) + + deallocate(tmp) + + ! --- + + allocate(noL_2e_gpu(mo_num,mo_num,mo_num,mo_num)) + allocate(noL_1e_gpu(mo_num,mo_num)) + + call cutc_no(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_2e_gpu(1,1,1,1), noL_1e_gpu(1,1), noL_0e_gpu(1)) + + ! 
--- + + allocate(noL_2e(mo_num,mo_num,mo_num,mo_num)) + allocate(noL_1e(mo_num,mo_num)) + + call provide_no_2e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_2e(1,1,1,1)) + + call provide_no_1e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_1e(1,1)) + + call provide_no_0e(n_points_final_grid, mo_num, elec_alpha_num, elec_beta_num, & + final_weight_at_r_vector(1), & + mos_l_in_r_array_transp(1,1), mos_r_in_r_array_transp(1,1), & + int2_grad1_u12_bimo_t(1,1,1,1), noL_0e) + + ! --- + + deallocate(int2_grad1_u12_bimo_t) + + acc_thr = 1d-12 + + ! --- + + err_tot = 0.d0 + nrm_tot = 0.d0 + do i = 1, mo_num + do j = 1, mo_num + do k = 1, mo_num + do l = 1, mo_num + err_loc = dabs(noL_2e(l,k,j,i) - noL_2e_gpu(l,k,j,i)) + if(err_loc > acc_thr) then + print*, " error on", l, k, j, i + print*, " CPU res", noL_2e (l,k,j,i) + print*, " GPU res", noL_2e_gpu(l,k,j,i) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(noL_2e(l,k,j,i)) + enddo + enddo + enddo + enddo + print *, ' absolute accuracy on noL_2e (%) =', 100.d0 * err_tot / nrm_tot + + deallocate(noL_2e) + deallocate(noL_2e_gpu) + + ! --- + + err_tot = 0.d0 + nrm_tot = 0.d0 + do k = 1, mo_num + do l = 1, mo_num + err_loc = dabs(noL_1e(l,k) - noL_1e_gpu(l,k)) + if(err_loc > acc_thr) then + print*, " error on", l, k + print*, " CPU res", noL_1e (l,k) + print*, " GPU res", noL_1e_gpu(l,k) + stop + endif + err_tot = err_tot + err_loc + nrm_tot = nrm_tot + dabs(noL_1e(l,k)) + enddo + enddo + print *, ' absolute accuracy on noL_1e (%) =', 100.d0 * err_tot / nrm_tot + + deallocate(noL_1e) + deallocate(noL_1e_gpu) + + ! --- + + print *, 'noL_0e CPU = ', noL_0e + print *, 'noL_0e GPU = ', noL_0e_gpu(1) + + err_tot = dabs(noL_0e - noL_0e_gpu(1)) + nrm_tot = dabs(noL_0e) + print *, ' absolute accuracy on noL_0e (%) =', 100.d0 * err_tot / nrm_tot + + + call wall_time(time1) + write(*,"(A,2X,F15.7)") ' wall time for deb_no_gpu (sec) = ', (time1 - time0) + + return + +end + +! --- + + diff --git a/plugins/local/tc_int/install b/plugins/local/tc_int/install new file mode 100755 index 00000000..be779c3a --- /dev/null +++ b/plugins/local/tc_int/install @@ -0,0 +1,18 @@ +#!/bin/bash + +# Check if the QP_ROOT environment variable is set. +if [[ -z ${QP_ROOT} ]] +then + print "The QP_ROOT environment variable is not set." + print "Please reload the quantum_package.rc file." + exit -1 +fi + +git clone https://github.com/AbdAmmar/CuTC +cd CuTC +source config/env.rc +make +cd .. + +ln -s ${PWD}/CuTC/build/libcutcint.so ${QP_ROOT}/lib + diff --git a/plugins/local/tc_int/jast_grad_full.irp.f b/plugins/local/tc_int/jast_grad_full.irp.f new file mode 100644 index 00000000..943d8567 --- /dev/null +++ b/plugins/local/tc_int/jast_grad_full.irp.f @@ -0,0 +1,188 @@ + +! --- + +subroutine get_grad1_u12_for_tc(ipoint, n_grid2, resx, resy, resz, res) + + BEGIN_DOC + ! + ! resx(ipoint) = [grad1 u(r1,r2)]_x1 + ! resy(ipoint) = [grad1 u(r1,r2)]_y1 + ! resz(ipoint) = [grad1 u(r1,r2)]_z1 + ! res (ipoint) = -0.5 [grad1 u(r1,r2)]^2 + ! + ! We use: + ! grid for r1 + ! extra_grid for r2 + ! 
+ END_DOC + + include 'constants.include.F' + + implicit none + integer, intent(in) :: ipoint, n_grid2 + double precision, intent(out) :: resx(n_grid2), resy(n_grid2), resz(n_grid2), res(n_grid2) + + integer :: jpoint, i_nucl, p, mpA, npA, opA, pp + integer :: powmax1, powmax, powmax2 + double precision :: r1(3), r2(3) + double precision :: tmp, tmp1, tmp2, tmp11, tmp22 + double precision :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3) + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + r1(1) = final_grid_points(1,ipoint) + r1(2) = final_grid_points(2,ipoint) + r1(3) = final_grid_points(3,ipoint) + + call grad1_j12_r1_seq(r1, n_grid2, resx, resy, resz) + + do jpoint = 1, n_grid2 ! r2 + res(jpoint) = -0.5d0 * (resx(jpoint) * resx(jpoint) + resy(jpoint) * resy(jpoint) + resz(jpoint) * resz(jpoint)) + enddo + + return +end + +! --- + +subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz) + + include 'constants.include.F' + + implicit none + integer , intent(in) :: n_grid2 + double precision, intent(in) :: r1(3) + double precision, intent(out) :: gradx(n_grid2) + double precision, intent(out) :: grady(n_grid2) + double precision, intent(out) :: gradz(n_grid2) + + integer :: jpoint, i_nucl, p, mpA, npA, opA + double precision :: r2(3) + double precision :: dx, dy, dz, r12, tmp + double precision :: rn(3), f1A, grad1_f1A(3), f2A, g12, grad1_g12(3) + double precision :: tmp1, tmp2, dist + integer :: powmax1, powmax, powmax2 + double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:) + + powmax1 = max(maxval(jBH_m), maxval(jBH_n)) + powmax2 = maxval(jBH_o) + powmax = max(powmax1, powmax2) + + allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax)) + + do p = 0, powmax + double_p(p) = dble(p) + enddo + + f1A_power(-1) = 0.d0 + f2A_power(-1) = 0.d0 + g12_power(-1) = 0.d0 + + f1A_power(0) = 1.d0 + f2A_power(0) = 1.d0 + g12_power(0) = 1.d0 + + do jpoint = 1, n_grid2 ! r2 + + r2(1) = final_grid_points_extra(1,jpoint) + r2(2) = final_grid_points_extra(2,jpoint) + r2(3) = final_grid_points_extra(3,jpoint) + + gradx(jpoint) = 0.d0 + grady(jpoint) = 0.d0 + gradz(jpoint) = 0.d0 + + dist = (r1(1) - r2(1)) * (r1(1) - r2(1)) & + + (r1(2) - r2(2)) * (r1(2) - r2(2)) & + + (r1(3) - r2(3)) * (r1(3) - r2(3)) + if(dist .ge. 
1d-15) then + dist = dsqrt(dist) + tmp1 = 1.d0 / (1.d0 + dist) + g12 = dist * tmp1 + tmp2 = tmp1 * tmp1 / dist + grad1_g12(1) = tmp2 * (r1(1) - r2(1)) + grad1_g12(2) = tmp2 * (r1(2) - r2(2)) + grad1_g12(3) = tmp2 * (r1(3) - r2(3)) + do p = 1, powmax2 + g12_power(p) = g12_power(p-1) * g12 + enddo + else + grad1_g12(1) = 0.d0 + grad1_g12(2) = 0.d0 + grad1_g12(3) = 0.d0 + g12 = 0.d0 + do p = 1, powmax2 + g12_power(p) = 0.d0 + enddo + endif + + do i_nucl = 1, nucl_num + + rn(1) = nucl_coord(i_nucl,1) + rn(2) = nucl_coord(i_nucl,2) + rn(3) = nucl_coord(i_nucl,3) + + dist = (r1(1) - rn(1)) * (r1(1) - rn(1)) & + + (r1(2) - rn(2)) * (r1(2) - rn(2)) & + + (r1(3) - rn(3)) * (r1(3) - rn(3)) + if (dist > 1.d-15) then + dist = dsqrt(dist) + tmp1 = 1.d0 / (1.d0 + dist) + f1A = dist * tmp1 + tmp2 = tmp1 * tmp1 / dist + grad1_f1A(1) = tmp2 * (r1(1) - rn(1)) + grad1_f1A(2) = tmp2 * (r1(2) - rn(2)) + grad1_f1A(3) = tmp2 * (r1(3) - rn(3)) + do p = 1, powmax1 + f1A_power(p) = f1A_power(p-1) * f1A + enddo + else + grad1_f1A(1) = 0.d0 + grad1_f1A(2) = 0.d0 + grad1_f1A(3) = 0.d0 + f1A = 0.d0 + do p = 1, powmax1 + f1A_power(p) = 0.d0 + enddo + endif + + dist = (r2(1) - rn(1)) * (r2(1) - rn(1)) & + + (r2(2) - rn(2)) * (r2(2) - rn(2)) & + + (r2(3) - rn(3)) * (r2(3) - rn(3)) + if (dist > 1.d-15) then + dist = dsqrt(dist) + f2A = dist / (1.d0 + dist) + do p = 1, powmax1 + f2A_power(p) = f2A_power(p-1) * f2A + enddo + else + f2A = 0.d0 + do p = 1, powmax1 + f2A_power(p) = 0.d0 + enddo + endif + + do p = 1, jBH_size + + tmp = jBH_c(p,i_nucl) + if (dabs(tmp) <= 1.d-10) cycle + + mpA = jBH_m(p,i_nucl) + npA = jBH_n(p,i_nucl) + opA = jBH_o(p,i_nucl) + + tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA) + tmp1 = tmp1 * g12_power(opA) * tmp + tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp + + gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1) + grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2) + gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3) + enddo ! p + enddo ! i_nucl + enddo ! jpoint + + return +end + +! --- + diff --git a/plugins/local/tc_int/no_0e.irp.f b/plugins/local/tc_int/no_0e.irp.f new file mode 100644 index 00000000..830b91a8 --- /dev/null +++ b/plugins/local/tc_int/no_0e.irp.f @@ -0,0 +1,407 @@ + +! --- + +subroutine provide_no_0e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, noL_0e) + + implicit none + + integer, intent(in) :: n_grid, n_mo + integer, intent(in) :: ne_a, ne_b + double precision, intent(in) :: wr1(n_grid) + double precision, intent(in) :: mos_l_in_r(n_grid,n_mo) + double precision, intent(in) :: mos_r_in_r(n_grid,n_mo) + double precision, intent(in) :: int2_grad1_u12(n_grid,3,n_mo,n_mo) + double precision, intent(out) :: noL_0e + + integer :: i, j, k, ipoint + double precision :: t0, t1 + double precision, allocatable :: tmp(:) + double precision, allocatable :: tmpL(:,:), tmpR(:,:) + double precision, allocatable :: tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:) + double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:) + + + call wall_time(t0) + + + if(ne_a .eq. 
ne_b) then + + allocate(tmp(ne_b)) + allocate(tmpL(n_grid,3), tmpR(n_grid,3)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, wr1, & + !$OMP int2_grad1_u12, tmp) + + !$OMP DO + do j = 1, ne_b + + tmpL = 0.d0 + tmpR = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + enddo + enddo + + tmp(j) = 0.d0 + do ipoint = 1, n_grid + tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) + enddo + enddo ! j + !$OMP END DO + !$OMP END PARALLEL + + noL_0e = -2.d0 * sum(tmp) + + deallocate(tmp) + deallocate(tmpL, tmpR) + + ! --- + + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + allocate(tmp(n_grid)) + + do ipoint = 1, n_grid + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + 
tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint) + + tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) & + + tmpJ(ipoint,2) * tmpM(ipoint,2) & + + tmpJ(ipoint,3) * tmpM(ipoint,3) ) ) + enddo + + noL_0e = noL_0e - 2.d0 * (sum(tmp)) + + deallocate(tmp) + + else + + allocate(tmp(ne_a)) + allocate(tmpL(n_grid,3), tmpR(n_grid,3)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmp, wr1) + + !$OMP DO + do j = 1, ne_b + + tmpL = 0.d0 + tmpR = 0.d0 + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpL(ipoint,1) = tmpL(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1) = tmpR(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + enddo + enddo + + tmp(j) = 0.d0 + do ipoint = 1, n_grid + tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) + enddo + + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + enddo + enddo + + do ipoint = 1, n_grid + tmp(j) = tmp(j) + wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) + enddo + enddo ! j + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(j, i, ipoint, tmpL, tmpR) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmp, wr1) + + !$OMP DO + do j = ne_b+1, ne_a + + tmpL = 0.d0 + tmpR = 0.d0 + do i = 1, ne_a + do ipoint = 1, n_grid + tmpL(ipoint,1) = tmpL(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2) = tmpL(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3) = tmpL(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1) = tmpR(ipoint,1) + int2_grad1_u12(ipoint,1,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2) = tmpR(ipoint,2) + int2_grad1_u12(ipoint,2,i,j) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3) = tmpR(ipoint,3) + int2_grad1_u12(ipoint,3,i,j) * mos_r_in_r(ipoint,i) + enddo + enddo + + tmp(j) = 0.d0 + do ipoint = 1, n_grid + tmp(j) = tmp(j) + 0.5d0 * wr1(ipoint) * (tmpL(ipoint,1)*tmpR(ipoint,1) + tmpL(ipoint,2)*tmpR(ipoint,2) + tmpL(ipoint,3)*tmpR(ipoint,3)) + enddo + enddo ! j + !$OMP END DO + !$OMP END PARALLEL + + noL_0e = -2.d0 * sum(tmp) + + deallocate(tmp) + deallocate(tmpL, tmpR) + + ! 
--- + + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! --- + + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * 
int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + allocate(tmp(n_grid)) + + do ipoint = 1, n_grid + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1)*tmpJ(ipoint,1) + tmpJ(ipoint,2)*tmpJ(ipoint,2) + tmpJ(ipoint,3)*tmpJ(ipoint,3)) - tmpS(ipoint) + + tmp(ipoint) = wr1(ipoint) * ( tmpO(ipoint) * tmpS(ipoint) - 2.d0 * ( tmpJ(ipoint,1) * tmpM(ipoint,1) & + + tmpJ(ipoint,2) * tmpM(ipoint,2) & + + tmpJ(ipoint,3) * tmpM(ipoint,3) ) ) + enddo + + noL_0e = noL_0e - 2.d0 * (sum(tmp)) + + deallocate(tmp) + + endif + + + call wall_time(t1) + write(*,"(A,2X,F15.7)") ' wall time for noL_0e (sec) = ', (t1 - t0) + + return +end + +! --- + diff --git a/plugins/local/tc_int/no_1e.irp.f b/plugins/local/tc_int/no_1e.irp.f new file mode 100644 index 00000000..5a9798f0 --- /dev/null +++ b/plugins/local/tc_int/no_1e.irp.f @@ -0,0 +1,1179 @@ + +! --- + +subroutine provide_no_1e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, noL_1e) + + implicit none + + integer, intent(in) :: n_grid, n_mo + integer, intent(in) :: ne_a, ne_b + double precision, intent(in) :: wr1(n_grid) + double precision, intent(in) :: mos_l_in_r(n_grid,n_mo) + double precision, intent(in) :: mos_r_in_r(n_grid,n_mo) + double precision, intent(in) :: int2_grad1_u12(n_grid,3,n_mo,n_mo) + double precision, intent(out) :: noL_1e(n_mo,n_mo) + + integer :: p, s, i, j, ipoint + double precision :: t0, t1 + double precision, allocatable :: tmpC(:,:,:,:), tmpD(:,:), tmpE(:,:,:), tmpF(:,:,:) + double precision, allocatable :: tmpL(:,:,:), tmpR(:,:,:), tmpM(:,:), tmpS(:), tmpO(:), tmpJ(:,:) + double precision, allocatable :: tmpL0(:,:,:), tmpR0(:,:,:) + double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:) + + + call wall_time(t0) + + + if(ne_a .eq. 
ne_b) then + + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! --- + + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + ! --- + + allocate(tmpC(n_grid,4,n_mo,n_mo)) + allocate(tmpD(n_grid,4)) + + do ipoint = 1, n_grid + + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) + enddo + + deallocate(tmpO, tmpM) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP int2_grad1_u12, tmpC) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + enddo + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + + enddo ! p + enddo ! 
s + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & + , 0.d0, noL_1e(1,1), 1) + + deallocate(tmpC, tmpD) + + ! --- + + allocate(tmpL(n_grid,3,n_mo)) + allocate(tmpR(n_grid,3,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL, tmpR) + + !$OMP DO + do p = 1, n_mo + + tmpL(:,1:3,p) = 0.d0 + tmpR(:,1:3,p) = 0.d0 + + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + allocate(tmpE(n_grid,5,n_mo)) + allocate(tmpF(n_grid,5,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpR, tmpJ, tmpS, tmpE, tmpF) + + !$OMP DO + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + + tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF(ipoint,5,p) = tmpR(ipoint,3,p) + enddo + + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpL, tmpR, tmpJ, tmpS) + + call dgemm( 'T', 'N', n_mo, n_mo, 5*n_grid, 1.d0 & + , tmpE(1,1,1), 5*n_grid, tmpF(1,1,1), 5*n_grid & + , 1.d0, noL_1e(1,1), n_mo) + + deallocate(tmpE, tmpF) + + ! 
--- + + else + + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! --- + + allocate(tmpM(n_grid,3), tmpS(n_grid)) + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * 
int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + ! --- + + allocate(tmpC(n_grid,4,n_mo,n_mo)) + allocate(tmpD(n_grid,4)) + + do ipoint = 1, n_grid + + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) + enddo + + deallocate(tmpO, tmpM) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP ne_a, int2_grad1_u12, tmpC) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + enddo + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + + enddo ! p + enddo ! s + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & + , 0.d0, noL_1e(1,1), 1) + + deallocate(tmpC, tmpD) + + ! 
--- + + allocate(tmpL(n_grid,3,n_mo), tmpL0(n_grid,3,n_mo)) + allocate(tmpR(n_grid,3,n_mo), tmpR0(n_grid,3,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL0, tmpR0, tmpL, tmpR) + + !$OMP DO + do p = 1, n_mo + + tmpL0(:,1:3,p) = 0.d0 + tmpR0(:,1:3,p) = 0.d0 + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpL0(ipoint,1,p) = tmpL0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,2,p) = tmpL0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,3,p) = tmpL0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR0(ipoint,1,p) = tmpR0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,2,p) = tmpR0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,3,p) = tmpR0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + + tmpL(:,1:3,p) = tmpL0(:,1:3,p) + tmpR(:,1:3,p) = tmpR0(:,1:3,p) + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + ! 
--- + + allocate(tmpE(n_grid,8,n_mo)) + allocate(tmpF(n_grid,8,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpL0, tmpR, tmpR0, tmpJ, tmpS, tmpE, tmpF) + + !$OMP DO + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + tmpE(ipoint,6,p) = wr1(ipoint) * tmpL0(ipoint,1,p) + tmpE(ipoint,7,p) = wr1(ipoint) * tmpL0(ipoint,2,p) + tmpE(ipoint,8,p) = wr1(ipoint) * tmpL0(ipoint,3,p) + + tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF(ipoint,5,p) = tmpR(ipoint,3,p) + tmpF(ipoint,6,p) = tmpR0(ipoint,1,p) + tmpF(ipoint,7,p) = tmpR0(ipoint,2,p) + tmpF(ipoint,8,p) = tmpR0(ipoint,3,p) + enddo + + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,p,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,p,j) * int2_grad1_u12(ipoint,3,j,i) ) + + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & + + int2_grad1_u12(ipoint,2,j,i) * int2_grad1_u12(ipoint,2,i,p) & + + int2_grad1_u12(ipoint,3,j,i) * int2_grad1_u12(ipoint,3,i,p) ) + enddo ! ipoint + enddo ! j + enddo ! 
i + + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpL0, tmpL, tmpR0, tmpR, tmpJ, tmpS) + + call dgemm( 'T', 'N', n_mo, n_mo, 8*n_grid, 1.d0 & + , tmpE(1,1,1), 8*n_grid, tmpF(1,1,1), 8*n_grid & + , 1.d0, noL_1e(1,1), n_mo) + + deallocate(tmpE, tmpF) + + endif + + + call wall_time(t1) + write(*,"(A,2X,F15.7)") ' wall time for noL_1e (sec) = ', (t1 - t0) + + return +end + +! --- + +subroutine provide_no_1e_tmp(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + tmpO, tmpJ, tmpM, tmpS, tmpC, tmpD, tmpL, tmpR, tmpE, tmpF, noL_1e) + + + implicit none + + integer, intent(in) :: n_grid, n_mo + integer, intent(in) :: ne_a, ne_b + double precision, intent(in) :: wr1(n_grid) + double precision, intent(in) :: mos_l_in_r(n_grid,n_mo) + double precision, intent(in) :: mos_r_in_r(n_grid,n_mo) + double precision, intent(in) :: int2_grad1_u12(n_grid,3,n_mo,n_mo) + double precision, intent(out) :: tmpO(n_grid), tmpJ(n_grid,3) + double precision, intent(out) :: tmpM(n_grid,3), tmpS(n_grid) + double precision, intent(out) :: tmpC(n_grid,4,n_mo,n_mo), tmpD(n_grid,4) + double precision, intent(out) :: tmpL(n_grid,3,n_mo), tmpR(n_grid,3,n_mo) + double precision, intent(out) :: tmpE(n_grid,5,n_mo), tmpF(n_grid,5,n_mo) + double precision, intent(out) :: noL_1e(n_mo,n_mo) + + integer :: p, s, i, j, ipoint + double precision :: t0, t1 + double precision, allocatable :: tmpM_priv(:,:), tmpS_priv(:), tmpO_priv(:), tmpJ_priv(:,:) + double precision, allocatable :: tmpL0(:,:,:), tmpR0(:,:,:) + double precision, allocatable :: tmpE_os(:,:,:), tmpF_os(:,:,:) + + + call wall_time(t0) + + + if(ne_a .eq. ne_b) then + + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! 
--- + + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + ! --- + + do ipoint = 1, n_grid + + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP int2_grad1_u12, tmpC) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + enddo + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + + enddo ! p + enddo ! s + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & + , 0.d0, noL_1e(1,1), 1) + + ! --- + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL, tmpR) + + !$OMP DO + do p = 1, n_mo + + tmpL(:,1:3,p) = 0.d0 + tmpR(:,1:3,p) = 0.d0 + + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + enddo ! 
p + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpR, tmpJ, tmpS, tmpE, tmpF) + + !$OMP DO + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpE(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + + tmpF(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF(ipoint,5,p) = tmpR(ipoint,3,p) + enddo + + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE(ipoint,2,p) = tmpE(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF(ipoint,1,p) = tmpF(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + call dgemm( 'T', 'N', n_mo, n_mo, 5*n_grid, 1.d0 & + , tmpE(1,1,1), 5*n_grid, tmpF(1,1,1), 5*n_grid & + , 1.d0, noL_1e(1,1), n_mo) + + ! --- + + else + + tmpO = 0.d0 + tmpJ = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, ipoint, tmpO_priv, tmpJ_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpO, tmpJ) + + allocate(tmpO_priv(n_grid), tmpJ_priv(n_grid,3)) + tmpO_priv = 0.d0 + tmpJ_priv = 0.d0 + + !$OMP DO + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpO_priv(ipoint) = tmpO_priv(ipoint) + 0.5d0 * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ_priv(ipoint,1) = tmpJ_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,i) + tmpJ_priv(ipoint,2) = tmpJ_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,i) + tmpJ_priv(ipoint,3) = tmpJ_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpO = tmpO + tmpO_priv + tmpJ = tmpJ + tmpJ_priv + !$OMP END CRITICAL + + deallocate(tmpO_priv, tmpJ_priv) + !$OMP END PARALLEL + + ! 
--- + + tmpM = 0.d0 + tmpS = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(i, j, ipoint, tmpM_priv, tmpS_priv) & + !$OMP SHARED(ne_b, ne_a, n_grid, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpM, tmpS) + + allocate(tmpM_priv(n_grid,3), tmpS_priv(n_grid)) + tmpM_priv = 0.d0 + tmpS_priv = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * mos_l_in_r(ipoint,j) * mos_r_in_r(ipoint,i) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP DO COLLAPSE(2) + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpM_priv(ipoint,1) = tmpM_priv(ipoint,1) + 0.5d0 * int2_grad1_u12(ipoint,1,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,2) = tmpM_priv(ipoint,2) + 0.5d0 * int2_grad1_u12(ipoint,2,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + tmpM_priv(ipoint,3) = tmpM_priv(ipoint,3) + 0.5d0 * int2_grad1_u12(ipoint,3,j,i) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,j) + + tmpS_priv(ipoint) = tmpS_priv(ipoint) + 0.5d0 * int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,i) & + + 0.5d0 * int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + tmpM = tmpM + tmpM_priv + tmpS = tmpS + tmpS_priv + !$OMP END CRITICAL + + deallocate(tmpM_priv, tmpS_priv) + !$OMP END PARALLEL + + ! 
--- + + do ipoint = 1, n_grid + + tmpD(ipoint,1) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,1) - tmpM(ipoint,1)) + tmpD(ipoint,2) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,2) - tmpM(ipoint,2)) + tmpD(ipoint,3) = wr1(ipoint) * (2.d0 * tmpO(ipoint) * tmpJ(ipoint,3) - tmpM(ipoint,3)) + tmpD(ipoint,4) = -wr1(ipoint) * tmpO(ipoint) + + tmpS(ipoint) = 2.d0 * (tmpJ(ipoint,1) * tmpJ(ipoint,1) + tmpJ(ipoint,2) * tmpJ(ipoint,2) + tmpJ(ipoint,3) * tmpJ(ipoint,3)) - tmpS(ipoint) + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP ne_a, int2_grad1_u12, tmpC) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + tmpC(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpC(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpC(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + enddo + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) = tmpC(ipoint,4,p,s) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo + enddo + + enddo ! p + enddo ! s + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', 4*n_grid, n_mo*n_mo, 2.d0 & + , tmpC(1,1,1,1), size(tmpC, 1) * size(tmpC, 2) & + , tmpD(1,1), 1 & + , 0.d0, noL_1e(1,1), 1) + + ! --- + + allocate(tmpL0(n_grid,3,n_mo)) + allocate(tmpR0(n_grid,3,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, tmpL0, tmpR0, tmpL, tmpR) + + !$OMP DO + do p = 1, n_mo + + tmpL0(:,1:3,p) = 0.d0 + tmpR0(:,1:3,p) = 0.d0 + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpL0(ipoint,1,p) = tmpL0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,2,p) = tmpL0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL0(ipoint,3,p) = tmpL0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR0(ipoint,1,p) = tmpR0(ipoint,1,p) + 0.5d0 * int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,2,p) = tmpR0(ipoint,2,p) + 0.5d0 * int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR0(ipoint,3,p) = tmpR0(ipoint,3,p) + 0.5d0 * int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + + tmpL(:,1:3,p) = tmpL0(:,1:3,p) + tmpR(:,1:3,p) = tmpR0(:,1:3,p) + do i = 1, ne_b + do ipoint = 1, n_grid + + tmpL(ipoint,1,p) = tmpL(ipoint,1,p) + int2_grad1_u12(ipoint,1,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,2,p) = tmpL(ipoint,2,p) + int2_grad1_u12(ipoint,2,p,i) * mos_l_in_r(ipoint,i) + tmpL(ipoint,3,p) = tmpL(ipoint,3,p) + int2_grad1_u12(ipoint,3,p,i) * mos_l_in_r(ipoint,i) + + tmpR(ipoint,1,p) = tmpR(ipoint,1,p) + int2_grad1_u12(ipoint,1,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,2,p) = tmpR(ipoint,2,p) + int2_grad1_u12(ipoint,2,i,p) * mos_r_in_r(ipoint,i) + tmpR(ipoint,3,p) = tmpR(ipoint,3,p) + int2_grad1_u12(ipoint,3,i,p) * mos_r_in_r(ipoint,i) + enddo + enddo + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + ! 
--- + + allocate(tmpE_os(n_grid,8,n_mo)) + allocate(tmpF_os(n_grid,8,n_mo)) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, j, ipoint) & + !$OMP SHARED(ne_b, ne_a, n_grid, n_mo, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, wr1, & + !$OMP tmpL, tmpL0, tmpR, tmpR0, tmpJ, tmpS, tmpE_os, tmpF_os) + + !$OMP DO + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpE_os(ipoint,1,p) = wr1(ipoint) * mos_l_in_r(ipoint,p) + tmpE_os(ipoint,2,p) = -2.d0 * (tmpL(ipoint,1,p) * tmpJ(ipoint,1) + tmpL(ipoint,2,p) * tmpJ(ipoint,2) + tmpL(ipoint,3,p) * tmpJ(ipoint,3)) + tmpE_os(ipoint,3,p) = wr1(ipoint) * tmpL(ipoint,1,p) + tmpE_os(ipoint,4,p) = wr1(ipoint) * tmpL(ipoint,2,p) + tmpE_os(ipoint,5,p) = wr1(ipoint) * tmpL(ipoint,3,p) + tmpE_os(ipoint,6,p) = wr1(ipoint) * tmpL0(ipoint,1,p) + tmpE_os(ipoint,7,p) = wr1(ipoint) * tmpL0(ipoint,2,p) + tmpE_os(ipoint,8,p) = wr1(ipoint) * tmpL0(ipoint,3,p) + + tmpF_os(ipoint,1,p) = -2.d0 * (tmpR(ipoint,1,p) * tmpJ(ipoint,1) + tmpR(ipoint,2,p) * tmpJ(ipoint,2) + tmpR(ipoint,3,p) * tmpJ(ipoint,3)) & + + mos_r_in_r(ipoint,p) * tmpS(ipoint) + tmpF_os(ipoint,2,p) = wr1(ipoint) * mos_r_in_r(ipoint,p) + tmpF_os(ipoint,3,p) = tmpR(ipoint,1,p) + tmpF_os(ipoint,4,p) = tmpR(ipoint,2,p) + tmpF_os(ipoint,5,p) = tmpR(ipoint,3,p) + tmpF_os(ipoint,6,p) = tmpR0(ipoint,1,p) + tmpF_os(ipoint,7,p) = tmpR0(ipoint,2,p) + tmpF_os(ipoint,8,p) = tmpR0(ipoint,3,p) + enddo + + do i = 1, ne_b + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + do i = ne_b+1, ne_a + do j = 1, ne_b + do ipoint = 1, n_grid + + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,p,j) * int2_grad1_u12(ipoint,1,j,i) & + + int2_grad1_u12(ipoint,2,p,j) * int2_grad1_u12(ipoint,2,j,i) & + + int2_grad1_u12(ipoint,3,p,j) * int2_grad1_u12(ipoint,3,j,i) ) + + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,j,i) * int2_grad1_u12(ipoint,1,i,p) & + + int2_grad1_u12(ipoint,2,j,i) * int2_grad1_u12(ipoint,2,i,p) & + + int2_grad1_u12(ipoint,3,j,i) * int2_grad1_u12(ipoint,3,i,p) ) + enddo ! ipoint + enddo ! j + enddo ! 
i + + do i = ne_b+1, ne_a + do j = ne_b+1, ne_a + do ipoint = 1, n_grid + + tmpE_os(ipoint,2,p) = tmpE_os(ipoint,2,p) + 0.5d0 * mos_l_in_r(ipoint,j) * ( int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,j) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,j) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,j) ) + + tmpF_os(ipoint,1,p) = tmpF_os(ipoint,1,p) + 0.5d0 * mos_r_in_r(ipoint,i) * ( int2_grad1_u12(ipoint,1,i,j) * int2_grad1_u12(ipoint,1,j,p) & + + int2_grad1_u12(ipoint,2,i,j) * int2_grad1_u12(ipoint,2,j,p) & + + int2_grad1_u12(ipoint,3,i,j) * int2_grad1_u12(ipoint,3,j,p) ) + enddo ! ipoint + enddo ! j + enddo ! i + + enddo ! p + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpL0, tmpR0) + + call dgemm( 'T', 'N', n_mo, n_mo, 8*n_grid, 1.d0 & + , tmpE_os(1,1,1), 8*n_grid, tmpF_os(1,1,1), 8*n_grid & + , 1.d0, noL_1e(1,1), n_mo) + + deallocate(tmpE_os, tmpF_os) + + endif + + + call wall_time(t1) + write(*,"(A,2X,F15.7)") ' wall time for noL_1e (sec) = ', (t1 - t0) + + return +end + +! --- + diff --git a/plugins/local/tc_int/no_2e.irp.f b/plugins/local/tc_int/no_2e.irp.f new file mode 100644 index 00000000..6d31d6a0 --- /dev/null +++ b/plugins/local/tc_int/no_2e.irp.f @@ -0,0 +1,605 @@ + +! --- + +subroutine provide_no_2e(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, noL_2e) + + implicit none + + integer, intent(in) :: n_grid, n_mo + integer, intent(in) :: ne_a, ne_b + double precision, intent(in) :: wr1(n_grid) + double precision, intent(in) :: mos_l_in_r(n_grid,n_mo) + double precision, intent(in) :: mos_r_in_r(n_grid,n_mo) + double precision, intent(in) :: int2_grad1_u12(n_grid,3,n_mo,n_mo) + double precision, intent(out) :: noL_2e(n_mo,n_mo,n_mo,n_mo) + + integer :: p, q, s, t, i, ipoint + double precision :: t0, t1 + double precision, allocatable :: tmpO(:), tmpJ(:,:) + double precision, allocatable :: tmpA(:,:,:), tmpB(:,:,:) + double precision, allocatable :: tmpC(:,:,:,:), tmpD(:,:,:,:) + double precision, allocatable :: tmpE(:,:,:,:) + + + call wall_time(t0) + + if(ne_a .eq. 
ne_b) then + + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + allocate(tmpA(n_grid,3,n_mo), tmpB(n_grid,3,n_mo)) + allocate(tmpC(n_grid,4,n_mo,n_mo), tmpD(n_grid,4,n_mo,n_mo)) + allocate(tmpE(n_mo,n_mo,n_mo,n_mo)) + + tmpO = 0.d0 + tmpJ = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO(ipoint) = tmpO(ipoint) + wr1(ipoint) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ(ipoint,1) = tmpJ(ipoint,1) + wr1(ipoint) * int2_grad1_u12(ipoint,1,i,i) + tmpJ(ipoint,2) = tmpJ(ipoint,2) + wr1(ipoint) * int2_grad1_u12(ipoint,2,i,i) + tmpJ(ipoint,3) = tmpJ(ipoint,3) + wr1(ipoint) * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB) + + !$OMP DO + do p = 1, n_mo + + tmpA(:,:,p) = 0.d0 + tmpB(:,:,p) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpA(ipoint,1,p) = tmpA(ipoint,1,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,p,i) + tmpA(ipoint,2,p) = tmpA(ipoint,2,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,p,i) + tmpA(ipoint,3,p) = tmpA(ipoint,3,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,p,i) + tmpB(ipoint,1,p) = tmpB(ipoint,1,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,i,p) + tmpB(ipoint,2,p) = tmpB(ipoint,2,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,i,p) + tmpB(ipoint,3,p) = tmpB(ipoint,3,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,i,p) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB, tmpO, tmpJ, tmpC, tmpD) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpC(ipoint,1,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,1,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,1,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,1,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,1) + tmpC(ipoint,2,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,2,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,2,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,2,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,2) + tmpC(ipoint,3,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,3,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,3,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,3,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,3) + + tmpD(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpD(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpD(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + tmpD(ipoint,4,p,s) = wr1(ipoint) * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) + + enddo ! ipoint + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) += int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo ! ipoint + enddo ! i + + enddo ! p + enddo ! 
s + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpO, tmpJ, tmpA, tmpB) + + + call dgemm( 'T', 'N', n_mo*n_mo, n_mo*n_mo, 4*n_grid, 0.5d0 & + , tmpC(1,1,1,1), 4*n_grid, tmpD(1,1,1,1), 4*n_grid & + , 0.d0, tmpE(1,1,1,1), n_mo*n_mo) + + deallocate(tmpC, tmpD) + + call sum_a_at(tmpE, n_mo*n_mo) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(t, s, q, p) & + !$OMP SHARED(n_mo, tmpE, noL_2e) + + !$OMP DO COLLAPSE(3) + do t = 1, n_mo + do s = 1, n_mo + do q = 1, n_mo + do p = 1, n_mo + noL_2e(p,q,s,t) = tmpE(p,s,q,t) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpE) + + else + + allocate(tmpO(n_grid), tmpJ(n_grid,3)) + allocate(tmpA(n_grid,3,n_mo), tmpB(n_grid,3,n_mo)) + allocate(tmpC(n_grid,4,n_mo,n_mo), tmpD(n_grid,4,n_mo,n_mo)) + allocate(tmpE(n_mo,n_mo,n_mo,n_mo)) + + tmpO = 0.d0 + tmpJ = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO(ipoint) = tmpO(ipoint) + wr1(ipoint) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ(ipoint,1) = tmpJ(ipoint,1) + wr1(ipoint) * int2_grad1_u12(ipoint,1,i,i) + tmpJ(ipoint,2) = tmpJ(ipoint,2) + wr1(ipoint) * int2_grad1_u12(ipoint,2,i,i) + tmpJ(ipoint,3) = tmpJ(ipoint,3) + wr1(ipoint) * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpO(ipoint) = tmpO(ipoint) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ(ipoint,1) = tmpJ(ipoint,1) + 0.5d0 * wr1(ipoint) * int2_grad1_u12(ipoint,1,i,i) + tmpJ(ipoint,2) = tmpJ(ipoint,2) + 0.5d0 * wr1(ipoint) * int2_grad1_u12(ipoint,2,i,i) + tmpJ(ipoint,3) = tmpJ(ipoint,3) + 0.5d0 * wr1(ipoint) * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(n_mo, ne_a, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB) + + !$OMP DO + do p = 1, n_mo + + tmpA(:,:,p) = 0.d0 + tmpB(:,:,p) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpA(ipoint,1,p) = tmpA(ipoint,1,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,p,i) + tmpA(ipoint,2,p) = tmpA(ipoint,2,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,p,i) + tmpA(ipoint,3,p) = tmpA(ipoint,3,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,p,i) + tmpB(ipoint,1,p) = tmpB(ipoint,1,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,i,p) + tmpB(ipoint,2,p) = tmpB(ipoint,2,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,i,p) + tmpB(ipoint,3,p) = tmpB(ipoint,3,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,i,p) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpA(ipoint,1,p) = tmpA(ipoint,1,p) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,p,i) + tmpA(ipoint,2,p) = tmpA(ipoint,2,p) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,p,i) + tmpA(ipoint,3,p) = tmpA(ipoint,3,p) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,p,i) + tmpB(ipoint,1,p) = tmpB(ipoint,1,p) + 0.5d0 * wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,i,p) + tmpB(ipoint,2,p) = tmpB(ipoint,2,p) + 0.5d0 * wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,i,p) + tmpB(ipoint,3,p) = tmpB(ipoint,3,p) + 0.5d0 * wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,i,p) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_a, ne_b, 
n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB, tmpO, tmpJ, tmpC, tmpD) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpC(ipoint,1,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,1,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,1,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,1,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,1) + tmpC(ipoint,2,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,2,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,2,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,2,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,2) + tmpC(ipoint,3,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,3,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,3,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,3,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,3) + + tmpD(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpD(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpD(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + tmpD(ipoint,4,p,s) = wr1(ipoint) * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) + + enddo ! ipoint + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) += int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo ! ipoint + enddo ! i + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) += 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo ! ipoint + enddo ! i + + enddo ! p + enddo ! s + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpO, tmpJ, tmpA, tmpB) + + + call dgemm( 'T', 'N', n_mo*n_mo, n_mo*n_mo, 4*n_grid, 0.5d0 & + , tmpC(1,1,1,1), 4*n_grid, tmpD(1,1,1,1), 4*n_grid & + , 0.d0, tmpE(1,1,1,1), n_mo*n_mo) + + deallocate(tmpC, tmpD) + + call sum_a_at(tmpE, n_mo*n_mo) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(t, s, q, p) & + !$OMP SHARED(n_mo, tmpE, noL_2e) + + !$OMP DO COLLAPSE(3) + do t = 1, n_mo + do s = 1, n_mo + do q = 1, n_mo + do p = 1, n_mo + noL_2e(p,q,s,t) = tmpE(p,s,q,t) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + deallocate(tmpE) + + endif + + call wall_time(t1) + write(*,"(A,2X,F15.7)") ' wall time for noL_2e (sec) = ', (t1 - t0) + + return +end + +! --- + +subroutine provide_no_2e_tmp(n_grid, n_mo, ne_a, ne_b, wr1, mos_l_in_r, mos_r_in_r, int2_grad1_u12, & + tmpO, tmpJ, tmpA, tmpB, tmpC, tmpD, tmpE, noL_2e) + + implicit none + + integer, intent(in) :: n_grid, n_mo + integer, intent(in) :: ne_a, ne_b + double precision, intent(in) :: wr1(n_grid) + double precision, intent(in) :: mos_l_in_r(n_grid,n_mo) + double precision, intent(in) :: mos_r_in_r(n_grid,n_mo) + double precision, intent(in) :: int2_grad1_u12(n_grid,3,n_mo,n_mo) + double precision, intent(out) :: tmpO(n_grid), tmpJ(n_grid,3) + double precision, intent(out) :: tmpA(n_grid,3,n_mo), tmpB(n_grid,3,n_mo) + double precision, intent(out) :: tmpC(n_grid,4,n_mo,n_mo), tmpD(n_grid,4,n_mo,n_mo) + double precision, intent(out) :: tmpE(n_mo,n_mo,n_mo,n_mo) + double precision, intent(out) :: noL_2e(n_mo,n_mo,n_mo,n_mo) + + integer :: p, q, s, t, i, ipoint + double precision :: t0, t1 + + + call wall_time(t0) + + if(ne_a .eq. 
ne_b) then + + tmpO = 0.d0 + tmpJ = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO(ipoint) = tmpO(ipoint) + wr1(ipoint) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ(ipoint,1) = tmpJ(ipoint,1) + wr1(ipoint) * int2_grad1_u12(ipoint,1,i,i) + tmpJ(ipoint,2) = tmpJ(ipoint,2) + wr1(ipoint) * int2_grad1_u12(ipoint,2,i,i) + tmpJ(ipoint,3) = tmpJ(ipoint,3) + wr1(ipoint) * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB) + + !$OMP DO + do p = 1, n_mo + + tmpA(:,:,p) = 0.d0 + tmpB(:,:,p) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpA(ipoint,1,p) = tmpA(ipoint,1,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,p,i) + tmpA(ipoint,2,p) = tmpA(ipoint,2,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,p,i) + tmpA(ipoint,3,p) = tmpA(ipoint,3,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,p,i) + tmpB(ipoint,1,p) = tmpB(ipoint,1,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,i,p) + tmpB(ipoint,2,p) = tmpB(ipoint,2,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,i,p) + tmpB(ipoint,3,p) = tmpB(ipoint,3,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,i,p) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB, tmpO, tmpJ, tmpC, tmpD) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpC(ipoint,1,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,1,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,1,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,1,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,1) + tmpC(ipoint,2,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,2,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,2,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,2,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,2) + tmpC(ipoint,3,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,3,p) & + + mos_l_in_r(ipoint,p) * tmpB(ipoint,3,s) & + - tmpO(ipoint) * int2_grad1_u12(ipoint,3,p,s) & + - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,3) + + tmpD(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s) + tmpD(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s) + tmpD(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s) + tmpD(ipoint,4,p,s) = wr1(ipoint) * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) + + enddo ! ipoint + + tmpC(:,4,p,s) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpC(ipoint,4,p,s) += int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) & + + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) & + + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s) + enddo ! ipoint + enddo ! i + + enddo ! p + enddo ! 
s + !$OMP END DO + !$OMP END PARALLEL + + + call dgemm( 'T', 'N', n_mo*n_mo, n_mo*n_mo, 4*n_grid, 0.5d0 & + , tmpC(1,1,1,1), 4*n_grid, tmpD(1,1,1,1), 4*n_grid & + , 0.d0, tmpE(1,1,1,1), n_mo*n_mo) + + call sum_a_at(tmpE, n_mo*n_mo) + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(t, s, q, p) & + !$OMP SHARED(n_mo, tmpE, noL_2e) + + !$OMP DO COLLAPSE(3) + do t = 1, n_mo + do s = 1, n_mo + do q = 1, n_mo + do p = 1, n_mo + noL_2e(p,q,s,t) = tmpE(p,s,q,t) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + else + + tmpO = 0.d0 + tmpJ = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpO(ipoint) = tmpO(ipoint) + wr1(ipoint) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ(ipoint,1) = tmpJ(ipoint,1) + wr1(ipoint) * int2_grad1_u12(ipoint,1,i,i) + tmpJ(ipoint,2) = tmpJ(ipoint,2) + wr1(ipoint) * int2_grad1_u12(ipoint,2,i,i) + tmpJ(ipoint,3) = tmpJ(ipoint,3) + wr1(ipoint) * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpO(ipoint) = tmpO(ipoint) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * mos_r_in_r(ipoint,i) + tmpJ(ipoint,1) = tmpJ(ipoint,1) + 0.5d0 * wr1(ipoint) * int2_grad1_u12(ipoint,1,i,i) + tmpJ(ipoint,2) = tmpJ(ipoint,2) + 0.5d0 * wr1(ipoint) * int2_grad1_u12(ipoint,2,i,i) + tmpJ(ipoint,3) = tmpJ(ipoint,3) + 0.5d0 * wr1(ipoint) * int2_grad1_u12(ipoint,3,i,i) + enddo + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, i, ipoint) & + !$OMP SHARED(n_mo, ne_a, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB) + + !$OMP DO + do p = 1, n_mo + + tmpA(:,:,p) = 0.d0 + tmpB(:,:,p) = 0.d0 + do i = 1, ne_b + do ipoint = 1, n_grid + tmpA(ipoint,1,p) = tmpA(ipoint,1,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,p,i) + tmpA(ipoint,2,p) = tmpA(ipoint,2,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,p,i) + tmpA(ipoint,3,p) = tmpA(ipoint,3,p) + wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,p,i) + tmpB(ipoint,1,p) = tmpB(ipoint,1,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,i,p) + tmpB(ipoint,2,p) = tmpB(ipoint,2,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,i,p) + tmpB(ipoint,3,p) = tmpB(ipoint,3,p) + wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,i,p) + enddo + enddo + do i = ne_b+1, ne_a + do ipoint = 1, n_grid + tmpA(ipoint,1,p) = tmpA(ipoint,1,p) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,p,i) + tmpA(ipoint,2,p) = tmpA(ipoint,2,p) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,p,i) + tmpA(ipoint,3,p) = tmpA(ipoint,3,p) + 0.5d0 * wr1(ipoint) * mos_l_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,p,i) + tmpB(ipoint,1,p) = tmpB(ipoint,1,p) + 0.5d0 * wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,1,i,p) + tmpB(ipoint,2,p) = tmpB(ipoint,2,p) + 0.5d0 * wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,2,i,p) + tmpB(ipoint,3,p) = tmpB(ipoint,3,p) + 0.5d0 * wr1(ipoint) * mos_r_in_r(ipoint,i) * int2_grad1_u12(ipoint,3,i,p) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + + !$OMP PARALLEL & + !$OMP DEFAULT(NONE) & + !$OMP PRIVATE(p, s, i, ipoint) & + !$OMP SHARED(n_mo, ne_a, ne_b, n_grid, & + !$OMP wr1, & + !$OMP mos_l_in_r, mos_r_in_r, & + !$OMP int2_grad1_u12, & + !$OMP tmpA, tmpB, tmpO, tmpJ, tmpC, tmpD) + + !$OMP DO COLLAPSE(2) + do s = 1, n_mo + do p = 1, n_mo + + do ipoint = 1, n_grid + + tmpC(ipoint,1,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,1,p) & + + 
mos_l_in_r(ipoint,p) * tmpB(ipoint,1,s) &
+                              - tmpO(ipoint) * int2_grad1_u12(ipoint,1,p,s) &
+                              - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,1)
+          tmpC(ipoint,2,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,2,p) &
+                              + mos_l_in_r(ipoint,p) * tmpB(ipoint,2,s) &
+                              - tmpO(ipoint) * int2_grad1_u12(ipoint,2,p,s) &
+                              - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,2)
+          tmpC(ipoint,3,p,s) = mos_r_in_r(ipoint,s) * tmpA(ipoint,3,p) &
+                              + mos_l_in_r(ipoint,p) * tmpB(ipoint,3,s) &
+                              - tmpO(ipoint) * int2_grad1_u12(ipoint,3,p,s) &
+                              - 2.d0 * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s) * tmpJ(ipoint,3)
+
+          tmpD(ipoint,1,p,s) = int2_grad1_u12(ipoint,1,p,s)
+          tmpD(ipoint,2,p,s) = int2_grad1_u12(ipoint,2,p,s)
+          tmpD(ipoint,3,p,s) = int2_grad1_u12(ipoint,3,p,s)
+          tmpD(ipoint,4,p,s) = wr1(ipoint) * mos_l_in_r(ipoint,p) * mos_r_in_r(ipoint,s)
+
+        enddo ! ipoint
+
+        tmpC(:,4,p,s) = 0.d0
+        do i = 1, ne_b
+          do ipoint = 1, n_grid
+            tmpC(ipoint,4,p,s) += int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+                                + int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
+                                + int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
+          enddo ! ipoint
+        enddo ! i
+        do i = ne_b+1, ne_a
+          do ipoint = 1, n_grid
+            tmpC(ipoint,4,p,s) += 0.5d0 * int2_grad1_u12(ipoint,1,p,i) * int2_grad1_u12(ipoint,1,i,s) &
+                                + 0.5d0 * int2_grad1_u12(ipoint,2,p,i) * int2_grad1_u12(ipoint,2,i,s) &
+                                + 0.5d0 * int2_grad1_u12(ipoint,3,p,i) * int2_grad1_u12(ipoint,3,i,s)
+          enddo ! ipoint
+        enddo ! i
+
+      enddo ! p
+    enddo ! s
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+
+    call dgemm( 'T', 'N', n_mo*n_mo, n_mo*n_mo, 4*n_grid, 0.5d0 &
+              , tmpC(1,1,1,1), 4*n_grid, tmpD(1,1,1,1), 4*n_grid &
+              , 0.d0, tmpE(1,1,1,1), n_mo*n_mo)
+
+    call sum_a_at(tmpE, n_mo*n_mo)
+
+    !$OMP PARALLEL            &
+    !$OMP DEFAULT(NONE)       &
+    !$OMP PRIVATE(t, s, q, p) &
+    !$OMP SHARED(n_mo, tmpE, noL_2e)
+
+    !$OMP DO COLLAPSE(3)
+    do t = 1, n_mo
+      do s = 1, n_mo
+        do q = 1, n_mo
+          do p = 1, n_mo
+            noL_2e(p,q,s,t) = tmpE(p,s,q,t)
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+  endif
+
+  call wall_time(t1)
+  write(*,"(A,2X,F15.7)") ' wall time for noL_2e & tmp tensors (sec) = ', (t1 - t0)
+
+  return
+end
+
+! ---
+
+
diff --git a/plugins/local/tc_int/uninstall b/plugins/local/tc_int/uninstall
new file mode 100755
index 00000000..e37a5491
--- /dev/null
+++ b/plugins/local/tc_int/uninstall
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Check if the QP_ROOT environment variable is set.
+if [[ -z ${QP_ROOT} ]]
+then
+  echo "The QP_ROOT environment variable is not set."
+  echo "Please reload the quantum_package.rc file."
+  exit 1
+fi
+
+rm -rf ${PWD}/CuTC
+rm ${QP_ROOT}/lib/libcutcint.so
+
diff --git a/plugins/local/tc_int/write_tc_int.irp.f b/plugins/local/tc_int/write_tc_int.irp.f
new file mode 100644
index 00000000..9f25a6fd
--- /dev/null
+++ b/plugins/local/tc_int/write_tc_int.irp.f
@@ -0,0 +1,56 @@
+! ---
+
+program write_tc_int
+
+  implicit none
+
+  print *, ' j2e_type = ', j2e_type
+  print *, ' j1e_type = ', j1e_type
+  print *, ' env_type = ', env_type
+
+  my_grid_becke = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  my_extra_grid_becke = .True.
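+  ! second (extra) Becke grid, set up in the same way as the main grid above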
+ PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + call provide_int2_grad1_u12_ao() + + call ezfio_set_tc_keywords_io_tc_integ('Read') + +end + +! --- + diff --git a/plugins/local/tc_int/write_tc_int_cuda.irp.f b/plugins/local/tc_int/write_tc_int_cuda.irp.f new file mode 100644 index 00000000..756630b8 --- /dev/null +++ b/plugins/local/tc_int/write_tc_int_cuda.irp.f @@ -0,0 +1,194 @@ +! --- + +program write_tc_int_cuda + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + call do_work_on_gpu() + + call ezfio_set_tc_keywords_io_tc_integ('Read') + +end + +! 
--- + +subroutine do_work_on_gpu() + + use cutc_module + + implicit none + + integer :: k, ipoint + + double precision, allocatable :: rn(:,:), aos_data1(:,:,:), aos_data2(:,:,:) + double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:) + double precision, allocatable :: int_2e_ao(:,:,:,:) + + double precision :: time0, time1 + double precision :: cuda_time0, cuda_time1 + + call wall_time(time0) + print*, ' start calculation of TC-integrals' + + allocate(rn(3,nucl_num)) + allocate(aos_data1(n_points_final_grid,ao_num,4)) + allocate(aos_data2(n_points_extra_final_grid,ao_num,4)) + allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,3)) + allocate(int_2e_ao(ao_num,ao_num,ao_num,ao_num)) + + + do k = 1, nucl_num + rn(1,k) = nucl_coord(k,1) + rn(2,k) = nucl_coord(k,2) + rn(3,k) = nucl_coord(k,3) + enddo + + do k = 1, ao_num + do ipoint = 1, n_points_final_grid + aos_data1(ipoint,k,1) = aos_in_r_array(k,ipoint) + aos_data1(ipoint,k,2) = aos_grad_in_r_array(k,ipoint,1) + aos_data1(ipoint,k,3) = aos_grad_in_r_array(k,ipoint,2) + aos_data1(ipoint,k,4) = aos_grad_in_r_array(k,ipoint,3) + enddo + + do ipoint = 1, n_points_extra_final_grid + aos_data2(ipoint,k,1) = aos_in_r_array_extra(k,ipoint) + aos_data2(ipoint,k,2) = aos_grad_in_r_array_extra(k,ipoint,1) + aos_data2(ipoint,k,3) = aos_grad_in_r_array_extra(k,ipoint,2) + aos_data2(ipoint,k,4) = aos_grad_in_r_array_extra(k,ipoint,3) + enddo + enddo + + ! --- + + integer :: nB + integer :: sB + + PROVIDE nxBlocks nyBlocks nzBlocks + PROVIDE blockxSize blockySize blockzSize + + sB = 32 + nB = (n_points_final_grid + sB - 1) / sB + + call ezfio_set_tc_int_blockxSize(sB) + call ezfio_set_tc_int_nxBlocks(nB) + + + + call wall_time(cuda_time0) + print*, ' start CUDA kernel' + + call cutc_int(nxBlocks, nyBlocks, nzBlocks, blockxSize, blockySize, blockzSize, & + n_points_final_grid, n_points_extra_final_grid, ao_num, nucl_num, jBH_size, & + final_grid_points, final_weight_at_r_vector, & + final_grid_points_extra, final_weight_at_r_vector_extra, & + rn, aos_data1, aos_data2, jBH_c, jBH_m, jBH_n, jBH_o, & + int2_grad1_u12_ao, int_2e_ao) + + call wall_time(cuda_time1) + print*, ' wall time for CUDA kernel (min) = ', (cuda_time1-cuda_time0) / 60.d0 + + deallocate(aos_data1, aos_data2) + + ! --- + + integer :: i, j, l + double precision :: t1, t2 + double precision :: tmp + double precision, external :: get_ao_two_e_integral + + call wall_time(t1) + + PROVIDE ao_integrals_map + tmp = get_ao_two_e_integral(1, 1, 1, 1, ao_integrals_map) + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP SHARED(ao_num, int_2e_ao, ao_integrals_map) & + !$OMP PRIVATE(i, j, k, l) + !$OMP DO COLLAPSE(3) + do j = 1, ao_num + do l = 1, ao_num + do i = 1, ao_num + do k = 1, ao_num + ! < 1:i, 2:j | 1:k, 2:l > + int_2e_ao(k,i,l,j) = int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call wall_time(t2) + print*, ' wall time of Coulomb part of tc_int_2e_ao (min) ', (t2 - t1) / 60.d0 + + ! --- + + print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write") + call ezfio_set_work_empty(.False.) 
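  ! Editor's annotation, not part of the patch: the two work files produced below
  ! are plain Fortran unformatted sequential files.  int2_grad1_u12_ao is written
  ! as a single record of shape (ao_num, ao_num, n_points_final_grid, 3), while
  ! ao_two_e_tc_tot is written as ao_num records, one (:,:,:,k) slab per record,
  ! after the Coulomb integrals from ao_integrals_map have been folded into the
  ! CUDA result above.  Any reader has to mirror this record structure; a minimal
  ! sketch (same unit number and path as below, assuming the array is allocated):
  !
  !   open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read")
  !   do k = 1, ao_num
  !     read(11) int_2e_ao(:,:,:,k)
  !   enddo
  !   close(11)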
+ write(11) int2_grad1_u12_ao + close(11) + deallocate(int2_grad1_u12_ao) + + print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot' + open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write") + call ezfio_set_work_empty(.False.) + do k = 1, ao_num + write(11) int_2e_ao(:,:,:,k) + enddo + close(11) + deallocate(int_2e_ao) + + ! ---- + + + call wall_time(time1) + print*, ' wall time for TC-integrals (min) = ', (time1-time0) / 60.d0 + + return +end + +! --- diff --git a/plugins/local/tc_int/write_tc_int_gpu.irp.f b/plugins/local/tc_int/write_tc_int_gpu.irp.f new file mode 100644 index 00000000..c0dd9c90 --- /dev/null +++ b/plugins/local/tc_int/write_tc_int_gpu.irp.f @@ -0,0 +1,56 @@ +! --- + +program write_tc_int_gpu + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + call provide_int2_grad1_u12_ao_gpu() + + call ezfio_set_tc_keywords_io_tc_integ('Read') + +end + +! --- + diff --git a/plugins/local/tc_keywords/EZFIO.cfg b/plugins/local/tc_keywords/EZFIO.cfg index e0776136..b858fa5b 100644 --- a/plugins/local/tc_keywords/EZFIO.cfg +++ b/plugins/local/tc_keywords/EZFIO.cfg @@ -14,7 +14,7 @@ default: False type: logical doc: If |true|, three-body terms are included interface: ezfio,provider,ocaml -default: True +default: False [three_e_3_idx_term] type: logical @@ -50,7 +50,7 @@ default: False type: logical doc: If |true|, standard normal-ordering for L (to be used with three_body_h_tc |false|) interface: ezfio,provider,ocaml -default: False +default: True [core_tc_op] type: logical @@ -100,30 +100,12 @@ doc: If |true|, the states are re-ordered to match the input states default: False interface: ezfio,provider,ocaml -[bi_ortho] -type: logical -doc: If |true|, the MO basis is assumed to be bi-orthonormal -interface: ezfio,provider,ocaml -default: True - [symmetric_fock_tc] type: logical doc: If |true|, using F+F^t as Fock TC interface: ezfio,provider,ocaml default: False -[thresh_tcscf] -type: Threshold -doc: Threshold on the convergence of the Hartree Fock energy. 
-interface: ezfio,provider,ocaml -default: 1.e-8 - -[n_it_tcscf_max] -type: Strictly_positive_int -doc: Maximum number of SCF iterations -interface: ezfio,provider,ocaml -default: 50 - [selection_tc] type: integer doc: if +1: only positive is selected, -1: only negative is selected, :0 both positive and negative @@ -160,30 +142,6 @@ doc: If |true|, maximize the overlap between orthogonalized left- and right eige interface: ezfio,provider,ocaml default: False -[max_dim_diis_tcscf] -type: integer -doc: Maximum size of the DIIS extrapolation procedure -interface: ezfio,provider,ocaml -default: 15 - -[level_shift_tcscf] -type: Positive_float -doc: Energy shift on the virtual MOs to improve TCSCF convergence -interface: ezfio,provider,ocaml -default: 0. - -[tcscf_algorithm] -type: character*(32) -doc: Type of TCSCF algorithm used. Possible choices are [Simple | DIIS] -interface: ezfio,provider,ocaml -default: DIIS - -[im_thresh_tcscf] -type: Threshold -doc: Thresholds on the Imag part of energy -interface: ezfio,provider,ocaml -default: 1.e-7 - [test_cycle_tc] type: logical doc: If |true|, the integrals of the three-body jastrow are computed with cycles @@ -226,12 +184,6 @@ doc: Read/Write normal_two_body_bi_orth from/to disk [ Write | Read | None ] interface: ezfio,provider,ocaml default: None -[debug_tc_pt2] -type: integer -doc: If :: 1 then you compute the TC-PT2 the old way, :: 2 then you check with the new version but without three-body -interface: ezfio,provider,ocaml -default: -1 - [only_spin_tc_right] type: logical doc: If |true|, only the right part of WF is used to compute spin dens @@ -278,10 +230,46 @@ default: 70 type: character*(32) doc: approach used to evaluate TC integrals [ analytic | numeric | semi-analytic ] interface: ezfio,ocaml,provider -default: semi-analytic +default: numeric [minimize_lr_angles] type: logical doc: If |true|, you minimize the angle between the left and right vectors associated to degenerate orbitals interface: ezfio,provider,ocaml default: False + +[thresh_de_tc_angles] +type: Threshold +doc: Thresholds on delta E for changing angles between orbitals +interface: ezfio,provider,ocaml +default: 1.e-6 + +[ao_to_mo_tc_n3] +type: logical +doc: If |true|, memory scale of TC ao -> mo: O(N3) +interface: ezfio,provider,ocaml +default: False + +[tc_save_mem_loops] +type: logical +doc: If |true|, use loops to save memory TC +interface: ezfio,provider,ocaml +default: False + +[tc_save_mem] +type: logical +doc: If |true|, more calc but less mem +interface: ezfio,provider,ocaml +default: False + +[im_thresh_tc] +type: Threshold +doc: Thresholds on the Imag part of TC energy +interface: ezfio,provider,ocaml +default: 1.e-7 + +[transpose_two_e_int] +type: logical +doc: If |true|, you duplicate the two-electron TC integrals with the transpose matrix. Acceleates the PT2. 
+interface: ezfio,provider,ocaml +default: False diff --git a/plugins/local/tc_progs/NEED b/plugins/local/tc_progs/NEED new file mode 100644 index 00000000..9deb3db4 --- /dev/null +++ b/plugins/local/tc_progs/NEED @@ -0,0 +1 @@ +tc_bi_ortho diff --git a/plugins/local/tc_bi_ortho/print_he_tc_energy.irp.f b/plugins/local/tc_progs/print_he_tc_energy.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/print_he_tc_energy.irp.f rename to plugins/local/tc_progs/print_he_tc_energy.irp.f diff --git a/plugins/local/tc_bi_ortho/print_tc_dump.irp.f b/plugins/local/tc_progs/print_tc_dump.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/print_tc_dump.irp.f rename to plugins/local/tc_progs/print_tc_dump.irp.f diff --git a/plugins/local/tc_progs/print_tc_energy.irp.f b/plugins/local/tc_progs/print_tc_energy.irp.f new file mode 100644 index 00000000..979d792b --- /dev/null +++ b/plugins/local/tc_progs/print_tc_energy.irp.f @@ -0,0 +1,53 @@ +program print_tc_energy + + BEGIN_DOC + ! TODO : Put the documentation of the program here + END_DOC + + implicit none + + read_wf = .True. + touch read_wf + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + if(tc_integ_type .eq. "numeric") then + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + endif + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE j2e_type + PROVIDE j1e_type + PROVIDE env_type + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + call write_tc_energy() + +end + diff --git a/plugins/local/tc_bi_ortho/print_tc_spin_dens.irp.f b/plugins/local/tc_progs/print_tc_spin_dens.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/print_tc_spin_dens.irp.f rename to plugins/local/tc_progs/print_tc_spin_dens.irp.f diff --git a/plugins/local/tc_bi_ortho/print_tc_var.irp.f b/plugins/local/tc_progs/print_tc_var.irp.f similarity index 70% rename from plugins/local/tc_bi_ortho/print_tc_var.irp.f rename to plugins/local/tc_progs/print_tc_var.irp.f index bec34f18..6743cd11 100644 --- a/plugins/local/tc_bi_ortho/print_tc_var.irp.f +++ b/plugins/local/tc_progs/print_tc_var.irp.f @@ -6,7 +6,8 @@ program print_tc_var implicit none - print *, 'Hello world' + print *, ' TC VAR is available only for HF REF WF' + print *, ' DO NOT FORGET TO RUN A CISD CALCULATION BEF' my_grid_becke = .True. PROVIDE tc_grid1_a tc_grid1_r @@ -17,7 +18,7 @@ program print_tc_var read_wf = .True. 
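  ! Editor's annotation, not part of the patch: read_wf = .True. together with the
  ! touch below makes the program load the wave function stored in the EZFIO
  ! directory (e.g. the CISD wave function requested in the banner above) instead
  ! of starting from a single determinant; touch propagates the change through the
  ! IRPF90 dependency tree before write_tc_gs_var_HF is called.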
touch read_wf - call write_tc_var() + call write_tc_gs_var_HF() end diff --git a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f b/plugins/local/tc_progs/print_tc_wf.irp.f similarity index 88% rename from plugins/local/tc_bi_ortho/print_tc_wf.irp.f rename to plugins/local/tc_progs/print_tc_wf.irp.f index 2b88bc5b..3e010e01 100644 --- a/plugins/local/tc_bi_ortho/print_tc_wf.irp.f +++ b/plugins/local/tc_progs/print_tc_wf.irp.f @@ -37,7 +37,7 @@ subroutine write_l_r_wf integer :: i print*,'Writing the left-right wf' do i = 1, N_det - write(i_unit_output,*)i, psi_coef_sorted_tc(i,1)/psi_coef_sorted_tc(i,1) & + write(i_unit_output,'(I8,X,10(F16.10,X))')i, psi_coef_sorted_tc(i,1),psi_coef_sorted_tc(i,1)/psi_coef_sorted_tc(1,1)& , psi_l_coef_sorted_bi_ortho_left(i)/psi_l_coef_sorted_bi_ortho_left(1) & , psi_r_coef_sorted_bi_ortho_right(i)/psi_r_coef_sorted_bi_ortho_right(1) enddo @@ -61,12 +61,12 @@ subroutine routine do i = 1, N_det call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int) if(degree == 1 .or. degree == 2)then - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij) - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),HF_bitmask,N_int,hmono,htwoe,hthree,htilde_ij) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i),psi_det(1,1,i),N_int,hmono,htwoe,hthree,e_i0) delta_e = e_tilde_00 - e_i0 coef_pt1 = htilde_ij / delta_e - call htilde_mu_mat_bi_ortho_slow(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij) + call htilde_mu_mat_opt_bi_ortho(HF_bitmask,psi_det(1,1,i),N_int,hmono,htwoe,hthree,htilde_ij) contrib_pt = coef_pt1 * htilde_ij e_pt2 += contrib_pt diff --git a/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f b/plugins/local/tc_progs/save_bitcpsileft_for_qmcchem.irp.f similarity index 91% rename from plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f rename to plugins/local/tc_progs/save_bitcpsileft_for_qmcchem.irp.f index efa4aa2c..ac90f737 100644 --- a/plugins/local/tc_bi_ortho/save_bitcpsileft_for_qmcchem.irp.f +++ b/plugins/local/tc_progs/save_bitcpsileft_for_qmcchem.irp.f @@ -38,9 +38,9 @@ subroutine main() call ezfio_has_cisd_energy(exists) if(.not.exists) then - call ezfio_has_tc_scf_bitc_energy(exists) + call ezfio_has_tc_scf_tcscf_energy(exists) if(exists) then - call ezfio_get_tc_scf_bitc_energy(e_ref) + call ezfio_get_tc_scf_tcscf_energy(e_ref) endif else @@ -59,7 +59,7 @@ subroutine main() close(iunit) -end subroutine main +end ! -- @@ -89,7 +89,7 @@ subroutine write_lr_spindeterminants() call ezfio_set_spindeterminants_psi_left_coef_matrix_values(buffer) deallocate(buffer) -end subroutine write_lr_spindeterminants +end ! 
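Editorial note, not part of the patch, on the hunks above and the tc_som hunk below: the slow determinant-pair routine htilde_mu_mat_bi_ortho_slow is replaced throughout by its optimized counterpart htilde_mu_mat_opt_bi_ortho, with an unchanged argument list (two determinants, N_int, and the one-, two-, three-electron and total matrix elements). Because the transcorrelated Hamiltonian H~ is not Hermitian, the left and right matrix elements <HF|H~|i> and <i|H~|HF> differ, which is why the perturbative loop in print_tc_wf.irp.f and the SOM estimator each call the routine twice with the determinants swapped; that loop accumulates, in effect, e_pt2 = sum_i <HF|H~|i> <i|H~|HF> / (E~_0 - E~_i) over the single and double excitations. The stored reference energy is also renamed from tc_scf/bitc_energy to tc_scf/tcscf_energy, consistent with the tc_scf/EZFIO.cfg hunk further down.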
--- diff --git a/plugins/local/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f b/plugins/local/tc_progs/save_tc_bi_ortho_nat.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/save_tc_bi_ortho_nat.irp.f rename to plugins/local/tc_progs/save_tc_bi_ortho_nat.irp.f diff --git a/plugins/local/tc_bi_ortho/select_dets_bi_ortho.irp.f b/plugins/local/tc_progs/select_dets_bi_ortho.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/select_dets_bi_ortho.irp.f rename to plugins/local/tc_progs/select_dets_bi_ortho.irp.f diff --git a/plugins/local/tc_bi_ortho/tc_bi_ortho_prop.irp.f b/plugins/local/tc_progs/tc_bi_ortho_prop.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/tc_bi_ortho_prop.irp.f rename to plugins/local/tc_progs/tc_bi_ortho_prop.irp.f diff --git a/plugins/local/tc_bi_ortho/tc_som.irp.f b/plugins/local/tc_progs/tc_som.irp.f similarity index 82% rename from plugins/local/tc_bi_ortho/tc_som.irp.f rename to plugins/local/tc_progs/tc_som.irp.f index 1d11c81b..6bdcc1f0 100644 --- a/plugins/local/tc_bi_ortho/tc_som.irp.f +++ b/plugins/local/tc_progs/tc_som.irp.f @@ -49,8 +49,8 @@ subroutine main() U_SOM = 0.d0 do i = 1, N_det if(i == i_HF) cycle - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1) - call htilde_mu_mat_bi_ortho_slow(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i_HF), psi_det(1,1,i), N_int, hmono_1, htwoe_1, hthree_1, htot_1) + call htilde_mu_mat_opt_bi_ortho(psi_det(1,1,i), psi_det(1,1,i_HF), N_int, hmono_2, htwoe_2, hthree_2, htot_2) U_SOM += htot_1 * htot_2 enddo U_SOM = 0.5d0 * U_SOM diff --git a/plugins/local/tc_bi_ortho/test_tc_two_rdm.irp.f b/plugins/local/tc_progs/test_tc_two_rdm.irp.f similarity index 100% rename from plugins/local/tc_bi_ortho/test_tc_two_rdm.irp.f rename to plugins/local/tc_progs/test_tc_two_rdm.irp.f diff --git a/plugins/local/tc_scf/EZFIO.cfg b/plugins/local/tc_scf/EZFIO.cfg index 3dfa9a71..e3d24338 100644 --- a/plugins/local/tc_scf/EZFIO.cfg +++ b/plugins/local/tc_scf/EZFIO.cfg @@ -1,6 +1,6 @@ -[bitc_energy] +[tcscf_energy] type: Threshold -doc: Energy bi-tc HF +doc: TC-SCF ENERGY interface: ezfio [converged_tcscf] @@ -9,3 +9,33 @@ doc: If |true|, tc-scf has converged interface: ezfio,provider,ocaml default: False +[max_dim_diis_tcscf] +type: integer +doc: Maximum size of the DIIS extrapolation procedure +interface: ezfio,provider,ocaml +default: 15 + +[level_shift_tcscf] +type: Positive_float +doc: Energy shift on the virtual MOs to improve TCSCF convergence +interface: ezfio,provider,ocaml +default: 0. + +[thresh_tcscf] +type: Threshold +doc: Threshold on the convergence of the Hartree Fock energy. +interface: ezfio,provider,ocaml +default: 1.e-8 + +[n_it_tcscf_max] +type: Strictly_positive_int +doc: Maximum number of SCF iterations +interface: ezfio,provider,ocaml +default: 50 + +[tc_Brillouin_Right] +type: logical +doc: If |true|, impose only right-Brillouin condition +interface: ezfio,provider,ocaml +default: False + diff --git a/plugins/local/tc_scf/combine_lr_tcscf.irp.f b/plugins/local/tc_scf/combine_lr_tcscf.irp.f deleted file mode 100644 index a22614ba..00000000 --- a/plugins/local/tc_scf/combine_lr_tcscf.irp.f +++ /dev/null @@ -1,75 +0,0 @@ - -! --- - -program combine_lr_tcscf - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - my_grid_becke = .True. 
- PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - bi_ortho = .True. - touch bi_ortho - - call comb_orbitals() - -end - -! --- - -subroutine comb_orbitals() - - implicit none - integer :: i, m, n, nn, mm - double precision :: accu_d, accu_nd - double precision, allocatable :: R(:,:), L(:,:), Rnew(:,:), tmp(:,:), S(:,:) - - n = ao_num - m = mo_num - nn = elec_alpha_num - mm = m - nn - - allocate(L(n,m), R(n,m), Rnew(n,m), S(m,m)) - L = mo_l_coef - R = mo_r_coef - - call check_weighted_biorthog(n, m, ao_overlap, L, R, accu_d, accu_nd, S, .true.) - - allocate(tmp(n,nn)) - do i = 1, nn - tmp(1:n,i) = R(1:n,i) - enddo - call impose_weighted_orthog_svd(n, nn, ao_overlap, tmp) - do i = 1, nn - Rnew(1:n,i) = tmp(1:n,i) - enddo - deallocate(tmp) - - allocate(tmp(n,mm)) - do i = 1, mm - tmp(1:n,i) = L(1:n,i+nn) - enddo - call impose_weighted_orthog_svd(n, mm, ao_overlap, tmp) - do i = 1, mm - Rnew(1:n,i+nn) = tmp(1:n,i) - enddo - deallocate(tmp) - - call check_weighted_biorthog(n, m, ao_overlap, Rnew, Rnew, accu_d, accu_nd, S, .true.) - - mo_r_coef = Rnew - call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) - - deallocate(L, R, Rnew, S) - -end subroutine comb_orbitals - -! --- - diff --git a/plugins/local/tc_scf/diago_vartcfock.irp.f b/plugins/local/tc_scf/diago_vartcfock.irp.f deleted file mode 100644 index 0c881dcb..00000000 --- a/plugins/local/tc_scf/diago_vartcfock.irp.f +++ /dev/null @@ -1,96 +0,0 @@ - -! --- - -BEGIN_PROVIDER [ double precision, fock_vartc_eigvec_mo, (mo_num, mo_num)] - - implicit none - - integer :: i, j - integer :: liwork, lwork, n, info - integer, allocatable :: iwork(:) - double precision, allocatable :: work(:), F(:,:), F_save(:,:) - double precision, allocatable :: diag(:) - - PROVIDE mo_r_coef - PROVIDE Fock_matrix_vartc_mo_tot - - allocate( F(mo_num,mo_num), F_save(mo_num,mo_num) ) - allocate (diag(mo_num) ) - - do j = 1, mo_num - do i = 1, mo_num - F(i,j) = Fock_matrix_vartc_mo_tot(i,j) - enddo - enddo - - ! Insert level shift here - do i = elec_beta_num+1, elec_alpha_num - F(i,i) += 0.5d0 * level_shift_tcscf - enddo - do i = elec_alpha_num+1, mo_num - F(i,i) += level_shift_tcscf - enddo - - n = mo_num - lwork = 1+6*n + 2*n*n - liwork = 3 + 5*n - - allocate(work(lwork)) - allocate(iwork(liwork) ) - - lwork = -1 - liwork = -1 - - F_save = F - call dsyevd('V', 'U', mo_num, F, size(F, 1), diag, work, lwork, iwork, liwork, info) - - if (info /= 0) then - print *, irp_here//' DSYEVD failed : ', info - stop 1 - endif - lwork = int(work(1)) - liwork = iwork(1) - deallocate(iwork) - deallocate(work) - - allocate(work(lwork)) - allocate(iwork(liwork) ) - call dsyevd('V', 'U', mo_num, F, size(F, 1), diag, work, lwork, iwork, liwork, info) - deallocate(iwork) - - if (info /= 0) then - F = F_save - call dsyev('V', 'L', mo_num, F, size(F, 1), diag, work, lwork, info) - - if (info /= 0) then - print *, irp_here//' DSYEV failed : ', info - stop 1 - endif - endif - - do i = 1, mo_num - do j = 1, mo_num - fock_vartc_eigvec_mo(j,i) = F(j,i) - enddo - enddo - - deallocate(work, F, F_save, diag) - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, fock_vartc_eigvec_ao, (ao_num, mo_num)] - - implicit none - - PROVIDE mo_r_coef - - call dgemm( 'N', 'N', ao_num, mo_num, mo_num, 1.d0 & - , mo_r_coef, size(mo_r_coef, 1), fock_vartc_eigvec_mo, size(fock_vartc_eigvec_mo, 1) & - , 0.d0, fock_vartc_eigvec_ao, size(fock_vartc_eigvec_ao, 1)) - -END_PROVIDER - -! 
--- - diff --git a/plugins/local/tc_scf/diis_tcscf.irp.f b/plugins/local/tc_scf/diis_tcscf.irp.f index 5d7d6b2e..ccc8eb15 100644 --- a/plugins/local/tc_scf/diis_tcscf.irp.f +++ b/plugins/local/tc_scf/diis_tcscf.irp.f @@ -91,28 +91,14 @@ BEGIN_PROVIDER [double precision, FQS_SQF_ao, (ao_num, ao_num)] double precision, allocatable :: tmp(:,:) double precision, allocatable :: F(:,:) - !print *, ' Providing FQS_SQF_ao ...' - !call wall_time(t0) + PROVIDE Fock_matrix_tc_ao_tot allocate(F(ao_num,ao_num)) - if(var_tc) then - - do i = 1, ao_num - do j = 1, ao_num - F(j,i) = Fock_matrix_vartc_ao_tot(j,i) - enddo + do i = 1, ao_num + do j = 1, ao_num + F(j,i) = Fock_matrix_tc_ao_tot(j,i) enddo - - else - - PROVIDE Fock_matrix_tc_ao_tot - do i = 1, ao_num - do j = 1, ao_num - F(j,i) = Fock_matrix_tc_ao_tot(j,i) - enddo - enddo - - endif + enddo allocate(tmp(ao_num,ao_num)) @@ -140,9 +126,6 @@ BEGIN_PROVIDER [double precision, FQS_SQF_ao, (ao_num, ao_num)] deallocate(tmp) deallocate(F) - !call wall_time(t1) - !print *, ' Wall time for FQS_SQF_ao =', t1-t0 - END_PROVIDER ! --- @@ -152,61 +135,13 @@ BEGIN_PROVIDER [double precision, FQS_SQF_mo, (mo_num, mo_num)] implicit none double precision :: t0, t1 - !print*, ' Providing FQS_SQF_mo ...' - !call wall_time(t0) - PROVIDE mo_r_coef mo_l_coef PROVIDE FQS_SQF_ao call ao_to_mo_bi_ortho( FQS_SQF_ao, size(FQS_SQF_ao, 1) & , FQS_SQF_mo, size(FQS_SQF_mo, 1) ) - !call wall_time(t1) - !print*, ' Wall time for FQS_SQF_mo =', t1-t0 - END_PROVIDER ! --- -! BEGIN_PROVIDER [ double precision, eigenval_Fock_tc_ao, (ao_num) ] -!&BEGIN_PROVIDER [ double precision, eigenvec_Fock_tc_ao, (ao_num,ao_num) ] -! -! BEGIN_DOC -! ! -! ! Eigenvalues and eigenvectors of the Fock matrix over the ao basis -! ! -! ! F' = X.T x F x X where X = ao_overlap^(-1/2) -! ! -! ! F' x Cr' = Cr' x E ==> F Cr = Cr x E with Cr = X x Cr' -! ! F'.T x Cl' = Cl' x E ==> F.T Cl = Cl x E with Cl = X x Cl' -! ! -! END_DOC -! -! implicit none -! double precision, allocatable :: tmp1(:,:), tmp2(:,:) -! -! ! --- -! ! Fock matrix in orthogonal basis: F' = X.T x F x X -! -! allocate(tmp1(ao_num,ao_num)) -! call dgemm( 'N', 'N', ao_num, ao_num, ao_num, 1.d0 & -! , Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1), S_half_inv, size(S_half_inv, 1) & -! , 0.d0, tmp1, size(tmp1, 1) ) -! -! allocate(tmp2(ao_num,ao_num)) -! call dgemm( 'T', 'N', ao_num, ao_num, ao_num, 1.d0 & -! , S_half_inv, size(S_half_inv, 1), tmp1, size(tmp1, 1) & -! , 0.d0, tmp2, size(tmp2, 1) ) -! -! ! --- -! -! ! Diagonalize F' to obtain eigenvectors in orthogonal basis C' and eigenvalues -! ! TODO -! -! ! Back-transform eigenvectors: C =X.C' -! -!END_PROVIDER - -! --- - -~ diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f deleted file mode 100644 index 0b883865..00000000 --- a/plugins/local/tc_scf/fock_3e_bi_ortho_cs.irp.f +++ /dev/null @@ -1,280 +0,0 @@ - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs, (mo_num, mo_num)] - - implicit none - integer :: a, b, i, j, ipoint - double precision :: ti, tf - double precision :: loc_1, loc_2, loc_3 - double precision, allocatable :: Okappa(:), Jkappa(:,:) - double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:) - double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:) - double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:) - - PROVIDE mo_l_coef mo_r_coef - - !print *, ' PROVIDING fock_3e_uhf_mo_cs ...' - !call wall_time(ti) - - ! 
--- - - allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid)) - Jkappa = 0.d0 - Okappa = 0.d0 - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2) & - !$OMP SHARED (n_points_final_grid, elec_beta_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, Okappa, Jkappa) - - allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid)) - tmp_omp_d2 = 0.d0 - tmp_omp_d1 = 0.d0 - - !$OMP DO - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i) - tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i) - tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i) - tmp_omp_d1(ipoint) += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do ipoint = 1, n_points_final_grid - Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) - Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) - Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) - Okappa(ipoint) += tmp_omp_d1(ipoint) - enddo - !$OMP END CRITICAL - - deallocate(tmp_omp_d2, tmp_omp_d1) - - !$OMP END PARALLEL - - ! --- - - allocate(tmp_1(n_points_final_grid,4)) - - do ipoint = 1, n_points_final_grid - - loc_1 = 2.d0 * Okappa(ipoint) - - tmp_1(ipoint,1) = loc_1 * Jkappa(ipoint,1) - tmp_1(ipoint,2) = loc_1 * Jkappa(ipoint,2) - tmp_1(ipoint,3) = loc_1 * Jkappa(ipoint,3) - - tmp_1(ipoint,4) = Okappa(ipoint) - enddo - - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, i, j, loc_1, tmp_omp_d2) & - !$OMP SHARED (n_points_final_grid, elec_beta_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp_1) - - allocate(tmp_omp_d2(n_points_final_grid,3)) - tmp_omp_d2 = 0.d0 - - !$OMP DO COLLAPSE(2) - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) - - tmp_omp_d2(ipoint,1) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) - tmp_omp_d2(ipoint,2) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) - tmp_omp_d2(ipoint,3) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do ipoint = 1, n_points_final_grid - tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) - tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) - tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) - enddo - !$OMP END CRITICAL - - deallocate(tmp_omp_d2) - !$OMP END PARALLEL - - ! 
--- - - allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num)) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, a, b) & - !$OMP SHARED (n_points_final_grid, mo_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & - !$OMP tmp_2) - !$OMP DO COLLAPSE(2) - do a = 1, mo_num - do b = 1, mo_num - do ipoint = 1, n_points_final_grid - tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a) - tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a) - tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, a, b, i) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP tmp_2) - !$OMP DO COLLAPSE(2) - do a = 1, mo_num - do b = 1, mo_num - tmp_2(:,4,b,a) = 0.d0 - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - tmp_2(ipoint,4,b,a) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & - + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & - + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - ! --- - - call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, -2.d0 & - , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) & - , tmp_1(1,1), 1 & - , 0.d0, fock_3e_uhf_mo_cs(1,1), 1) - - deallocate(tmp_1, tmp_2) - - ! --- - - allocate(tmp_3(n_points_final_grid,5,mo_num), tmp_4(n_points_final_grid,5,mo_num)) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, loc_1, loc_2) & - !$OMP SHARED (n_points_final_grid, mo_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP final_weight_at_r_vector, Jkappa, tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - tmp_3(:,:,b) = 0.d0 - tmp_4(:,:,b) = 0.d0 - do ipoint = 1, n_points_final_grid - tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) - - tmp_4(ipoint,1,b) = -2.d0 * mos_r_in_r_array_transp(ipoint,b) * ( Jkappa(ipoint,1) * Jkappa(ipoint,1) & - + Jkappa(ipoint,2) * Jkappa(ipoint,2) & - + Jkappa(ipoint,3) * Jkappa(ipoint,3) ) - tmp_4(ipoint,5,b) = mos_r_in_r_array_transp(ipoint,b) - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP Jkappa, tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) - loc_2 = mos_r_in_r_array_transp(ipoint,i) - - tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i) - tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i) - tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i) - tmp_3(ipoint,5,b) += 2.d0 * loc_1 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) & - + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) & - + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) ) - - tmp_4(ipoint,2,b) += loc_2 * 
int2_grad1_u12_bimo_t(ipoint,1,i,b) - tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) - tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) - tmp_4(ipoint,1,b) += 2.d0 * loc_2 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & - + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & - + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) - loc_2 = mos_r_in_r_array_transp(ipoint,b) - loc_3 = mos_r_in_r_array_transp(ipoint,i) - - tmp_3(ipoint,5,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) & - + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) & - + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) ) - - tmp_4(ipoint,1,b) += ( loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) & - - loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) ) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - ! --- - - call dgemm( 'T', 'N', mo_num, mo_num, 5*n_points_final_grid, 1.d0 & - , tmp_3(1,1,1), 5*n_points_final_grid & - , tmp_4(1,1,1), 5*n_points_final_grid & - , 1.d0, fock_3e_uhf_mo_cs(1,1), mo_num) - - deallocate(tmp_3, tmp_4) - deallocate(Jkappa, Okappa) - - ! --- - - !call wall_time(tf) - !print *, ' total Wall time for fock_3e_uhf_mo_cs =', tf - ti - -END_PROVIDER - -! --- - diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f deleted file mode 100644 index 4bbce720..00000000 --- a/plugins/local/tc_scf/fock_3e_bi_ortho_os.irp.f +++ /dev/null @@ -1,536 +0,0 @@ - -! --- - - BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a_os, (mo_num, mo_num)] -&BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_b_os, (mo_num, mo_num)] - - BEGIN_DOC - ! - ! Open Shell part of the Fock matrix from three-electron terms - ! - ! WARNING :: non hermitian if bi-ortho MOS used - ! - END_DOC - - implicit none - integer :: a, b, i, j, ipoint - double precision :: loc_1, loc_2, loc_3, loc_4 - double precision :: ti, tf - double precision, allocatable :: Okappa(:), Jkappa(:,:), Obarkappa(:), Jbarkappa(:,:) - double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:) - double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:) - double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:) - - PROVIDE mo_l_coef mo_r_coef - - !print *, ' Providing fock_3e_uhf_mo_a_os and fock_3e_uhf_mo_b_os ...' - !call wall_time(ti) - - ! 
--- - - allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid)) - allocate(Jbarkappa(n_points_final_grid,3), Obarkappa(n_points_final_grid)) - Jkappa = 0.d0 - Okappa = 0.d0 - Jbarkappa = 0.d0 - Obarkappa = 0.d0 - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2) & - !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, Okappa, Jkappa, Obarkappa, Jbarkappa) - - allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid)) - - tmp_omp_d2 = 0.d0 - tmp_omp_d1 = 0.d0 - !$OMP DO - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i) - tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i) - tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i) - tmp_omp_d1(ipoint) += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - !$OMP END DO NOWAIT - !$OMP CRITICAL - do ipoint = 1, n_points_final_grid - Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) - Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) - Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) - Okappa(ipoint) += tmp_omp_d1(ipoint) - enddo - !$OMP END CRITICAL - - tmp_omp_d2 = 0.d0 - tmp_omp_d1 = 0.d0 - !$OMP DO - do i = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i) - tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i) - tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i) - tmp_omp_d1(ipoint) += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - !$OMP END DO NOWAIT - !$OMP CRITICAL - do ipoint = 1, n_points_final_grid - Jbarkappa(ipoint,1) += tmp_omp_d2(ipoint,1) - Jbarkappa(ipoint,2) += tmp_omp_d2(ipoint,2) - Jbarkappa(ipoint,3) += tmp_omp_d2(ipoint,3) - Obarkappa(ipoint) += tmp_omp_d1(ipoint) - enddo - !$OMP END CRITICAL - - deallocate(tmp_omp_d2, tmp_omp_d1) - !$OMP END PARALLEL - - ! 
--- - - allocate(tmp_1(n_points_final_grid,4)) - - do ipoint = 1, n_points_final_grid - - loc_1 = -2.d0 * Okappa (ipoint) - loc_2 = -2.d0 * Obarkappa(ipoint) - loc_3 = Obarkappa(ipoint) - - tmp_1(ipoint,1) = (loc_1 - loc_3) * Jbarkappa(ipoint,1) + loc_2 * Jkappa(ipoint,1) - tmp_1(ipoint,2) = (loc_1 - loc_3) * Jbarkappa(ipoint,2) + loc_2 * Jkappa(ipoint,2) - tmp_1(ipoint,3) = (loc_1 - loc_3) * Jbarkappa(ipoint,3) + loc_2 * Jkappa(ipoint,3) - - tmp_1(ipoint,4) = Obarkappa(ipoint) - enddo - - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, i, j, loc_1, loc_2, tmp_omp_d2) & - !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp_1) - - allocate(tmp_omp_d2(n_points_final_grid,3)) - - tmp_omp_d2 = 0.d0 - !$OMP DO COLLAPSE(2) - do i = 1, elec_beta_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) - loc_2 = mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - - tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,1,j,i) - tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,2,j,i) - tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,3,j,i) - enddo - enddo - enddo - !$OMP END DO NOWAIT - !$OMP CRITICAL - do ipoint = 1, n_points_final_grid - tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) - tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) - tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) - enddo - !$OMP END CRITICAL - - tmp_omp_d2 = 0.d0 - !$OMP DO COLLAPSE(2) - do i = elec_beta_num+1, elec_alpha_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) - - tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) - tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) - tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) - enddo - enddo - enddo - !$OMP END DO NOWAIT - !$OMP CRITICAL - do ipoint = 1, n_points_final_grid - tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) - tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) - tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) - enddo - !$OMP END CRITICAL - - deallocate(tmp_omp_d2) - !$OMP END PARALLEL - - ! 
--- - - allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num)) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, a, b) & - !$OMP SHARED (n_points_final_grid, mo_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & - !$OMP tmp_2) - !$OMP DO COLLAPSE(2) - do a = 1, mo_num - do b = 1, mo_num - do ipoint = 1, n_points_final_grid - tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a) - tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a) - tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, a, b, i) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP tmp_2) - !$OMP DO COLLAPSE(2) - do a = 1, mo_num - do b = 1, mo_num - - tmp_2(:,4,b,a) = 0.d0 - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - tmp_2(ipoint,4,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & - + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & - + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - ! --- - - call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, 1.d0 & - , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) & - , tmp_1(1,1), 1 & - , 0.d0, fock_3e_uhf_mo_b_os(1,1), 1) - - deallocate(tmp_1, tmp_2) - - ! --- - - allocate(tmp_3(n_points_final_grid,2,mo_num), tmp_4(n_points_final_grid,2,mo_num)) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, loc_1, loc_2) & - !$OMP SHARED (n_points_final_grid, mo_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - tmp_3(:,:,b) = 0.d0 - tmp_4(:,:,b) = 0.d0 - do ipoint = 1, n_points_final_grid - - tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) - - loc_1 = -2.0d0 * mos_r_in_r_array_transp(ipoint,b) - - tmp_4(ipoint,1,b) = loc_1 * ( Jbarkappa(ipoint,1) * (Jkappa(ipoint,1) + 0.25d0 * Jbarkappa(ipoint,1)) & - + Jbarkappa(ipoint,2) * (Jkappa(ipoint,2) + 0.25d0 * Jbarkappa(ipoint,2)) & - + Jbarkappa(ipoint,3) * (Jkappa(ipoint,3) + 0.25d0 * Jbarkappa(ipoint,3)) ) - - tmp_4(ipoint,2,b) = mos_r_in_r_array_transp(ipoint,b) - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP Jkappa, Jbarkappa, tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) - loc_2 = mos_r_in_r_array_transp(ipoint,i) - - tmp_3(ipoint,2,b) += loc_1 * ( Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) & - + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) & - + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) ) - - tmp_4(ipoint,1,b) += loc_2 * ( 
Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & - + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & - + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - do i = 1, elec_beta_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_2 = mos_r_in_r_array_transp(ipoint,b) - - tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) - enddo - enddo - enddo - - do i = elec_beta_num+1, elec_alpha_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b) - - tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - ! --- - - call dgemm( 'T', 'N', mo_num, mo_num, 2*n_points_final_grid, 1.d0 & - , tmp_3(1,1,1), 2*n_points_final_grid & - , tmp_4(1,1,1), 2*n_points_final_grid & - , 1.d0, fock_3e_uhf_mo_b_os(1,1), mo_num) - - deallocate(tmp_3, tmp_4) - - - - - ! --- - - fock_3e_uhf_mo_a_os = fock_3e_uhf_mo_b_os - - allocate(tmp_1(n_points_final_grid,1)) - - do ipoint = 1, n_points_final_grid - tmp_1(ipoint,1) = Obarkappa(ipoint) + 2.d0 * Okappa(ipoint) - enddo - - allocate(tmp_2(n_points_final_grid,1,mo_num,mo_num)) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, a, b, i) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP tmp_2) - !$OMP DO COLLAPSE(2) - do a = 1, mo_num - do b = 1, mo_num - - tmp_2(:,1,b,a) = 0.d0 - do i = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - tmp_2(ipoint,1,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & - + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & - + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - call dgemv( 'T', n_points_final_grid, mo_num*mo_num, 1.d0 & - , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) & - , tmp_1(1,1), 1 & - , 1.d0, fock_3e_uhf_mo_a_os(1,1), 1) - - deallocate(tmp_1, tmp_2) - - ! 
--- - - allocate(tmp_3(n_points_final_grid,8,mo_num), tmp_4(n_points_final_grid,8,mo_num)) - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b) & - !$OMP SHARED (n_points_final_grid, mo_num, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - tmp_3(:,:,b) = 0.d0 - tmp_4(:,:,b) = 0.d0 - do ipoint = 1, n_points_final_grid - - tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) - - tmp_4(ipoint,8,b) = mos_r_in_r_array_transp(ipoint,b) - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP Jkappa, Jbarkappa, tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) - loc_2 = mos_r_in_r_array_transp(ipoint,i) - - tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i) - tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i) - tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i) - - tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) - tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) - tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) - enddo - enddo - - do i = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) - loc_3 = 2.d0 * loc_1 - loc_2 = mos_r_in_r_array_transp(ipoint,i) - loc_4 = 2.d0 * loc_2 - - tmp_3(ipoint,5,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i) - tmp_3(ipoint,6,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i) - tmp_3(ipoint,7,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i) - - tmp_3(ipoint,8,b) += loc_3 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,b,i) & - + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,b,i) & - + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,b,i) ) - - tmp_4(ipoint,1,b) += loc_4 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & - + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & - + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) - - tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) - tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) - tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) - - tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) - tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) - tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - !$OMP PARALLEL & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3) & - !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & - !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP tmp_3, tmp_4) - !$OMP DO - do b = 1, mo_num - - do i = 1, 
elec_beta_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) - loc_2 = mos_r_in_r_array_transp(ipoint,b) - loc_3 = mos_r_in_r_array_transp(ipoint,i) - - tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) & - + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) & - + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) ) - - tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) - loc_3 = mos_r_in_r_array_transp(ipoint,j) - - tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,b,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,b,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) - - tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,j,i) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & - + int2_grad1_u12_bimo_t(ipoint,2,j,i) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & - + int2_grad1_u12_bimo_t(ipoint,3,j,i) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) - enddo - enddo - enddo - - do i = elec_beta_num+1, elec_alpha_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) - loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b) - loc_3 = mos_r_in_r_array_transp(ipoint,i) - - tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) & - + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) & - + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) ) - - tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - - ! --- - - call dgemm( 'T', 'N', mo_num, mo_num, 8*n_points_final_grid, 1.d0 & - , tmp_3(1,1,1), 8*n_points_final_grid & - , tmp_4(1,1,1), 8*n_points_final_grid & - , 1.d0, fock_3e_uhf_mo_a_os(1,1), mo_num) - - deallocate(tmp_3, tmp_4) - deallocate(Jkappa, Okappa) - - !call wall_time(tf) - !print *, ' Wall time for fock_3e_uhf_mo_a_os and fock_3e_uhf_mo_b_os =', tf - ti - -END_PROVIDER - -! --- - diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f deleted file mode 100644 index 63a1e162..00000000 --- a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf.irp.f +++ /dev/null @@ -1,77 +0,0 @@ - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a, (mo_num, mo_num)] - - BEGIN_DOC - ! - ! Fock matrix alpha from three-electron terms - ! - ! WARNING :: non hermitian if bi-ortho MOS used - ! - END_DOC - - implicit none - double precision :: ti, tf - - PROVIDE mo_l_coef mo_r_coef - - !print *, ' Providing fock_3e_uhf_mo_a ...' - !call wall_time(ti) - - ! CLOSED-SHELL PART - PROVIDE fock_3e_uhf_mo_cs - fock_3e_uhf_mo_a = fock_3e_uhf_mo_cs - - if(elec_alpha_num .ne. 
elec_beta_num) then - - ! OPEN-SHELL PART - PROVIDE fock_3e_uhf_mo_a_os - - fock_3e_uhf_mo_a += fock_3e_uhf_mo_a_os - endif - - !call wall_time(tf) - !print *, ' Wall time for fock_3e_uhf_mo_a =', tf - ti - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_b, (mo_num, mo_num)] - - BEGIN_DOC - ! - ! Fock matrix beta from three-electron terms - ! - ! WARNING :: non hermitian if bi-ortho MOS used - ! - END_DOC - - implicit none - double precision :: ti, tf - - PROVIDE mo_l_coef mo_r_coef - - !print *, ' Providing and fock_3e_uhf_mo_b ...' - !call wall_time(ti) - - ! CLOSED-SHELL PART - PROVIDE fock_3e_uhf_mo_cs - fock_3e_uhf_mo_b = fock_3e_uhf_mo_cs - - if(elec_alpha_num .ne. elec_beta_num) then - - ! OPEN-SHELL PART - PROVIDE fock_3e_uhf_mo_b_os - - fock_3e_uhf_mo_b += fock_3e_uhf_mo_b_os - endif - - !call wall_time(tf) - !print *, ' Wall time for fock_3e_uhf_mo_b =', tf - ti - -END_PROVIDER - -! --- - diff --git a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f b/plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f deleted file mode 100644 index 3bf6bd85..00000000 --- a/plugins/local/tc_scf/fock_3e_bi_ortho_uhf_old.irp.f +++ /dev/null @@ -1,490 +0,0 @@ - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_cs_old, (mo_num, mo_num)] - - implicit none - integer :: a, b, i, j - double precision :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia - double precision :: ti, tf - double precision, allocatable :: tmp(:,:) - - PROVIDE mo_l_coef mo_r_coef - call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij) - - !print *, ' PROVIDING fock_3e_uhf_mo_cs_old ...' - !call wall_time(ti) - - fock_3e_uhf_mo_cs_old = 0.d0 - - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) & - !$OMP SHARED (mo_num, elec_beta_num, fock_3e_uhf_mo_cs_old) - - allocate(tmp(mo_num,mo_num)) - tmp = 0.d0 - - !$OMP DO - do a = 1, mo_num - do b = 1, mo_num - - do j = 1, elec_beta_num - do i = 1, elec_beta_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( 4.d0 * I_bij_aij & - + I_bij_ija & - + I_bij_jai & - - 2.d0 * I_bij_aji & - - 2.d0 * I_bij_iaj & - - 2.d0 * I_bij_jia ) - - enddo - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do a = 1, mo_num - do b = 1, mo_num - fock_3e_uhf_mo_cs_old(b,a) += tmp(b,a) - enddo - enddo - !$OMP END CRITICAL - - deallocate(tmp) - !$OMP END PARALLEL - - !call wall_time(tf) - !print *, ' total Wall time for fock_3e_uhf_mo_cs_old =', tf - ti - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_a_old, (mo_num, mo_num)] - - BEGIN_DOC - ! - ! ALPHA part of the Fock matrix from three-electron terms - ! - ! WARNING :: non hermitian if bi-ortho MOS used - ! - END_DOC - - implicit none - integer :: a, b, i, j, o - double precision :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia - double precision :: ti, tf - double precision, allocatable :: tmp(:,:) - - PROVIDE mo_l_coef mo_r_coef - PROVIDE fock_3e_uhf_mo_cs - - !print *, ' Providing fock_3e_uhf_mo_a_old ...' 
- !call wall_time(ti) - - o = elec_beta_num + 1 - call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij) - - PROVIDE fock_3e_uhf_mo_cs_old - fock_3e_uhf_mo_a_old = fock_3e_uhf_mo_cs_old - - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) & - !$OMP SHARED (mo_num, o, elec_alpha_num, elec_beta_num, fock_3e_uhf_mo_a_old) - - allocate(tmp(mo_num,mo_num)) - tmp = 0.d0 - - !$OMP DO - do a = 1, mo_num - do b = 1, mo_num - - ! --- - - do j = o, elec_alpha_num - do i = 1, elec_beta_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij & - + I_bij_ija & - + I_bij_jai & - - I_bij_aji & - - I_bij_iaj & - - 2.d0 * I_bij_jia ) - - enddo - enddo - - ! --- - - do j = 1, elec_beta_num - do i = o, elec_alpha_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij & - + I_bij_ija & - + I_bij_jai & - - I_bij_aji & - - 2.d0 * I_bij_iaj & - - I_bij_jia ) - - enddo - enddo - - ! --- - - do j = o, elec_alpha_num - do i = o, elec_alpha_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( I_bij_aij & - + I_bij_ija & - + I_bij_jai & - - I_bij_aji & - - I_bij_iaj & - - I_bij_jia ) - - enddo - enddo - - ! --- - - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do a = 1, mo_num - do b = 1, mo_num - fock_3e_uhf_mo_a_old(b,a) += tmp(b,a) - enddo - enddo - !$OMP END CRITICAL - - deallocate(tmp) - !$OMP END PARALLEL - - !call wall_time(tf) - !print *, ' Wall time for fock_3e_uhf_mo_a_old =', tf - ti - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_mo_b_old, (mo_num, mo_num)] - - BEGIN_DOC - ! - ! BETA part of the Fock matrix from three-electron terms - ! - ! WARNING :: non hermitian if bi-ortho MOS used - ! - END_DOC - - implicit none - integer :: a, b, i, j, o - double precision :: I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia - double precision :: ti, tf - double precision, allocatable :: tmp(:,:) - - PROVIDE mo_l_coef mo_r_coef - - !print *, ' PROVIDING fock_3e_uhf_mo_b_old ...' 
- !call wall_time(ti) - - o = elec_beta_num + 1 - call give_integrals_3_body_bi_ort(1, 1, 1, 1, 1, 1, I_bij_aij) - - PROVIDE fock_3e_uhf_mo_cs_old - fock_3e_uhf_mo_b_old = fock_3e_uhf_mo_cs_old - - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP PRIVATE (a, b, i, j, I_bij_aij, I_bij_ija, I_bij_jai, I_bij_aji, I_bij_iaj, I_bij_jia, tmp) & - !$OMP SHARED (mo_num, o, elec_alpha_num, elec_beta_num, fock_3e_uhf_mo_b_old) - - allocate(tmp(mo_num,mo_num)) - tmp = 0.d0 - - !$OMP DO - do a = 1, mo_num - do b = 1, mo_num - - ! --- - - do j = o, elec_alpha_num - do i = 1, elec_beta_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij & - - I_bij_aji & - - I_bij_iaj ) - - enddo - enddo - - ! --- - - do j = 1, elec_beta_num - do i = o, elec_alpha_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( 2.d0 * I_bij_aij & - - I_bij_aji & - - I_bij_jia ) - - enddo - enddo - - ! --- - - do j = o, elec_alpha_num - do i = o, elec_alpha_num - - call give_integrals_3_body_bi_ort(b, i, j, a, i, j, I_bij_aij) - call give_integrals_3_body_bi_ort(b, i, j, i, j, a, I_bij_ija) - call give_integrals_3_body_bi_ort(b, i, j, j, a, i, I_bij_jai) - call give_integrals_3_body_bi_ort(b, i, j, a, j, i, I_bij_aji) - call give_integrals_3_body_bi_ort(b, i, j, i, a, j, I_bij_iaj) - call give_integrals_3_body_bi_ort(b, i, j, j, i, a, I_bij_jia) - - tmp(b,a) -= 0.5d0 * ( I_bij_aij & - - I_bij_aji ) - - enddo - enddo - - ! --- - - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do a = 1, mo_num - do b = 1, mo_num - fock_3e_uhf_mo_b_old(b,a) += tmp(b,a) - enddo - enddo - !$OMP END CRITICAL - - deallocate(tmp) - !$OMP END PARALLEL - - !call wall_time(tf) - !print *, ' total Wall time for fock_3e_uhf_mo_b_old =', tf - ti - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_a, (ao_num, ao_num)] - - BEGIN_DOC - ! - ! Equations (B6) and (B7) - ! - ! g <--> gamma - ! d <--> delta - ! e <--> eta - ! k <--> kappa - ! - END_DOC - - implicit none - integer :: g, d, e, k, mu, nu - double precision :: dm_ge_a, dm_ge_b, dm_ge - double precision :: dm_dk_a, dm_dk_b, dm_dk - double precision :: i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu - double precision :: ti, tf - double precision, allocatable :: f_tmp(:,:) - - !print *, ' PROVIDING fock_3e_uhf_ao_a ...' 
- !call wall_time(ti) - - fock_3e_uhf_ao_a = 0.d0 - - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk, f_tmp, & - !$OMP i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) & - !$OMP SHARED (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_a) - - allocate(f_tmp(ao_num,ao_num)) - f_tmp = 0.d0 - - !$OMP DO - do g = 1, ao_num - do e = 1, ao_num - dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e) - dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e) - dm_ge = dm_ge_a + dm_ge_b - do d = 1, ao_num - do k = 1, ao_num - dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k) - dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k) - dm_dk = dm_dk_a + dm_dk_b - do mu = 1, ao_num - do nu = 1, ao_num - call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek) - call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu) - call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue) - call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke) - call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk) - call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu) - f_tmp(mu,nu) -= 0.5d0 * ( dm_ge * dm_dk * i_mugd_nuek & - + dm_ge_a * dm_dk_a * i_mugd_eknu & - + dm_ge_a * dm_dk_a * i_mugd_knue & - - dm_ge_a * dm_dk * i_mugd_enuk & - - dm_ge * dm_dk_a * i_mugd_kenu & - - dm_ge_a * dm_dk_a * i_mugd_nuke & - - dm_ge_b * dm_dk_b * i_mugd_nuke ) - enddo - enddo - enddo - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do mu = 1, ao_num - do nu = 1, ao_num - fock_3e_uhf_ao_a(mu,nu) += f_tmp(mu,nu) - enddo - enddo - !$OMP END CRITICAL - - deallocate(f_tmp) - !$OMP END PARALLEL - - !call wall_time(tf) - !print *, ' total Wall time for fock_3e_uhf_ao_a =', tf - ti - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [double precision, fock_3e_uhf_ao_b, (ao_num, ao_num)] - - BEGIN_DOC - ! - ! Equations (B6) and (B7) - ! - ! g <--> gamma - ! d <--> delta - ! e <--> eta - ! k <--> kappa - ! - END_DOC - - implicit none - integer :: g, d, e, k, mu, nu - double precision :: dm_ge_a, dm_ge_b, dm_ge - double precision :: dm_dk_a, dm_dk_b, dm_dk - double precision :: i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu - double precision :: ti, tf - double precision, allocatable :: f_tmp(:,:) - - !print *, ' PROVIDING fock_3e_uhf_ao_b ...' 
- !call wall_time(ti) - - fock_3e_uhf_ao_b = 0.d0 - - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP PRIVATE (g, e, d, k, mu, nu, dm_ge_a, dm_ge_b, dm_ge, dm_dk_a, dm_dk_b, dm_dk, f_tmp, & - !$OMP i_mugd_nuek, i_mugd_eknu, i_mugd_knue, i_mugd_nuke, i_mugd_enuk, i_mugd_kenu) & - !$OMP SHARED (ao_num, TCSCF_bi_ort_dm_ao_alpha, TCSCF_bi_ort_dm_ao_beta, fock_3e_uhf_ao_b) - - allocate(f_tmp(ao_num,ao_num)) - f_tmp = 0.d0 - - !$OMP DO - do g = 1, ao_num - do e = 1, ao_num - dm_ge_a = TCSCF_bi_ort_dm_ao_alpha(g,e) - dm_ge_b = TCSCF_bi_ort_dm_ao_beta (g,e) - dm_ge = dm_ge_a + dm_ge_b - do d = 1, ao_num - do k = 1, ao_num - dm_dk_a = TCSCF_bi_ort_dm_ao_alpha(d,k) - dm_dk_b = TCSCF_bi_ort_dm_ao_beta (d,k) - dm_dk = dm_dk_a + dm_dk_b - do mu = 1, ao_num - do nu = 1, ao_num - call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, e, k, i_mugd_nuek) - call give_integrals_3_body_bi_ort_ao(mu, g, d, e, k, nu, i_mugd_eknu) - call give_integrals_3_body_bi_ort_ao(mu, g, d, k, nu, e, i_mugd_knue) - call give_integrals_3_body_bi_ort_ao(mu, g, d, nu, k, e, i_mugd_nuke) - call give_integrals_3_body_bi_ort_ao(mu, g, d, e, nu, k, i_mugd_enuk) - call give_integrals_3_body_bi_ort_ao(mu, g, d, k, e, nu, i_mugd_kenu) - f_tmp(mu,nu) -= 0.5d0 * ( dm_ge * dm_dk * i_mugd_nuek & - + dm_ge_b * dm_dk_b * i_mugd_eknu & - + dm_ge_b * dm_dk_b * i_mugd_knue & - - dm_ge_b * dm_dk * i_mugd_enuk & - - dm_ge * dm_dk_b * i_mugd_kenu & - - dm_ge_b * dm_dk_b * i_mugd_nuke & - - dm_ge_a * dm_dk_a * i_mugd_nuke ) - enddo - enddo - enddo - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do mu = 1, ao_num - do nu = 1, ao_num - fock_3e_uhf_ao_b(mu,nu) += f_tmp(mu,nu) - enddo - enddo - !$OMP END CRITICAL - - deallocate(f_tmp) - !$OMP END PARALLEL - - !call wall_time(tf) - !print *, ' total Wall time for fock_3e_uhf_ao_b =', tf - ti - -END_PROVIDER - -! --- - diff --git a/plugins/local/tc_scf/fock_hermit.irp.f b/plugins/local/tc_scf/fock_hermit.irp.f deleted file mode 100644 index 3460157e..00000000 --- a/plugins/local/tc_scf/fock_hermit.irp.f +++ /dev/null @@ -1,107 +0,0 @@ - -! --- - -BEGIN_PROVIDER [ double precision, good_hermit_tc_fock_mat, (mo_num, mo_num)] - - BEGIN_DOC -! good_hermit_tc_fock_mat = Hermitian Upper triangular Fock matrix -! -! The converged eigenvectors of such matrix yield to orthonormal vectors satisfying the left Brillouin theorem - END_DOC - implicit none - integer :: i, j - - good_hermit_tc_fock_mat = Fock_matrix_tc_mo_tot - do j = 1, mo_num - do i = 1, j-1 - good_hermit_tc_fock_mat(i,j) = Fock_matrix_tc_mo_tot(j,i) - enddo - enddo - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, hermit_average_tc_fock_mat, (mo_num, mo_num)] - - BEGIN_DOC -! hermit_average_tc_fock_mat = (F + F^\dagger)/2 - END_DOC - implicit none - integer :: i, j - - hermit_average_tc_fock_mat = Fock_matrix_tc_mo_tot - do j = 1, mo_num - do i = 1, mo_num - hermit_average_tc_fock_mat(i,j) = 0.5d0 * (Fock_matrix_tc_mo_tot(j,i) + Fock_matrix_tc_mo_tot(i,j)) - enddo - enddo - -END_PROVIDER - - -! --- -BEGIN_PROVIDER [ double precision, grad_hermit] - implicit none - BEGIN_DOC - ! square of gradient of the energy - END_DOC - if(symmetric_fock_tc)then - grad_hermit = grad_hermit_average_tc_fock_mat - else - grad_hermit = grad_good_hermit_tc_fock_mat - endif - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, grad_good_hermit_tc_fock_mat] - implicit none - BEGIN_DOC - ! 
grad_good_hermit_tc_fock_mat = norm of gradients of the upper triangular TC fock - END_DOC - integer :: i, j - grad_good_hermit_tc_fock_mat = 0.d0 - do i = 1, elec_alpha_num - do j = elec_alpha_num+1, mo_num - grad_good_hermit_tc_fock_mat += dabs(good_hermit_tc_fock_mat(i,j)) - enddo - enddo -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, grad_hermit_average_tc_fock_mat] - implicit none - BEGIN_DOC - ! grad_hermit_average_tc_fock_mat = norm of gradients of the upper triangular TC fock - END_DOC - integer :: i, j - grad_hermit_average_tc_fock_mat = 0.d0 - do i = 1, elec_alpha_num - do j = elec_alpha_num+1, mo_num - grad_hermit_average_tc_fock_mat += dabs(hermit_average_tc_fock_mat(i,j)) - enddo - enddo -END_PROVIDER - - -! --- - -subroutine save_good_hermit_tc_eigvectors() - - implicit none - integer :: sign - character*(64) :: label - logical :: output - - sign = 1 - label = "Canonical" - output = .False. - - if(symmetric_fock_tc)then - call mo_as_eigvectors_of_mo_matrix(hermit_average_tc_fock_mat, mo_num, mo_num, label, sign, output) - else - call mo_as_eigvectors_of_mo_matrix(good_hermit_tc_fock_mat, mo_num, mo_num, label, sign, output) - endif -end subroutine save_good_hermit_tc_eigvectors - -! --- - diff --git a/plugins/local/tc_scf/fock_tc.irp.f b/plugins/local/tc_scf/fock_tc.irp.f index 282f9873..16bb5c87 100644 --- a/plugins/local/tc_scf/fock_tc.irp.f +++ b/plugins/local/tc_scf/fock_tc.irp.f @@ -1,78 +1,15 @@ + ! --- - BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_alpha, (ao_num, ao_num)] -&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_seq_beta , (ao_num, ao_num)] + BEGIN_PROVIDER [ double precision, two_e_tc_integral_alpha, (ao_num, ao_num)] +&BEGIN_PROVIDER [ double precision, two_e_tc_integral_beta , (ao_num, ao_num)] BEGIN_DOC ! - ! two_e_tc_non_hermit_integral_seq_alpha(k,i) = ON THE AO BASIS + ! two_e_tc_integral_alpha(k,i) = ON THE AO BASIS ! - ! where F^tc is the TWO-BODY part of the TC Fock matrix and k,i are AO basis functions - ! - ! works in SEQUENTIAL - END_DOC - - implicit none - integer :: i, j, k, l - double precision :: density, density_a, density_b - double precision :: t0, t1 - - PROVIDE ao_two_e_tc_tot - - !print*, ' providing two_e_tc_non_hermit_integral_seq ...' - !call wall_time(t0) - - two_e_tc_non_hermit_integral_seq_alpha = 0.d0 - two_e_tc_non_hermit_integral_seq_beta = 0.d0 - - do i = 1, ao_num - do k = 1, ao_num - do j = 1, ao_num - do l = 1, ao_num - - density_a = TCSCF_density_matrix_ao_alpha(l,j) - density_b = TCSCF_density_matrix_ao_beta (l,j) - density = density_a + density_b - - !! rho(l,j) * < k l| T | i j> - !two_e_tc_non_hermit_integral_seq_alpha(k,i) += density * ao_two_e_tc_tot(l,j,k,i) - !! rho(l,j) * < k l| T | i j> - !two_e_tc_non_hermit_integral_seq_beta (k,i) += density * ao_two_e_tc_tot(l,j,k,i) - !! rho_a(l,j) * < l k| T | i j> - !two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i) - !! rho_b(l,j) * < l k| T | i j> - !two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i) - - ! rho(l,j) * < k l| T | i j> - two_e_tc_non_hermit_integral_seq_alpha(k,i) += density * ao_two_e_tc_tot(k,i,l,j) - ! rho(l,j) * < k l| T | i j> - two_e_tc_non_hermit_integral_seq_beta (k,i) += density * ao_two_e_tc_tot(k,i,l,j) - ! rho_a(l,j) * < k l| T | j i> - two_e_tc_non_hermit_integral_seq_alpha(k,i) -= density_a * ao_two_e_tc_tot(k,j,l,i) - ! 
rho_b(l,j) * < k l| T | j i> - two_e_tc_non_hermit_integral_seq_beta (k,i) -= density_b * ao_two_e_tc_tot(k,j,l,i) - - enddo - enddo - enddo - enddo - - !call wall_time(t1) - !print*, ' wall time for two_e_tc_non_hermit_integral_seq after = ', t1 - t0 - -END_PROVIDER - -! --- - - BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_alpha, (ao_num, ao_num)] -&BEGIN_PROVIDER [ double precision, two_e_tc_non_hermit_integral_beta , (ao_num, ao_num)] - - BEGIN_DOC - ! - ! two_e_tc_non_hermit_integral_alpha(k,i) = ON THE AO BASIS - ! - ! where F^tc is the TWO-BODY part of the TC Fock matrix and k,i are AO basis functions + ! where F^tc_2e is the TWO-BODY part of the TC Fock matrix and k,i are AO basis functions ! END_DOC @@ -86,16 +23,13 @@ END_PROVIDER PROVIDE mo_l_coef mo_r_coef PROVIDE TCSCF_density_matrix_ao_alpha TCSCF_density_matrix_ao_beta - !print*, ' Providing two_e_tc_non_hermit_integral ...' - !call wall_time(t0) - - two_e_tc_non_hermit_integral_alpha = 0.d0 - two_e_tc_non_hermit_integral_beta = 0.d0 + two_e_tc_integral_alpha = 0.d0 + two_e_tc_integral_beta = 0.d0 !$OMP PARALLEL DEFAULT (NONE) & !$OMP PRIVATE (i, j, k, l, density_a, density_b, density, tmp_a, tmp_b, I_coul, I_kjli) & !$OMP SHARED (ao_num, TCSCF_density_matrix_ao_alpha, TCSCF_density_matrix_ao_beta, ao_two_e_tc_tot, & - !$OMP two_e_tc_non_hermit_integral_alpha, two_e_tc_non_hermit_integral_beta) + !$OMP two_e_tc_integral_alpha, two_e_tc_integral_beta) allocate(tmp_a(ao_num,ao_num), tmp_b(ao_num,ao_num)) tmp_a = 0.d0 @@ -124,8 +58,8 @@ END_PROVIDER !$OMP CRITICAL do i = 1, ao_num do j = 1, ao_num - two_e_tc_non_hermit_integral_alpha(j,i) += tmp_a(j,i) - two_e_tc_non_hermit_integral_beta (j,i) += tmp_b(j,i) + two_e_tc_integral_alpha(j,i) += tmp_a(j,i) + two_e_tc_integral_beta (j,i) += tmp_b(j,i) enddo enddo !$OMP END CRITICAL @@ -133,9 +67,6 @@ END_PROVIDER deallocate(tmp_a, tmp_b) !$OMP END PARALLEL - !call wall_time(t1) - !print*, ' Wall time for two_e_tc_non_hermit_integral = ', t1 - t0 - END_PROVIDER ! --- @@ -149,13 +80,7 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_alpha, (ao_num, ao_num)] implicit none double precision :: t0, t1 - !print*, ' Providing Fock_matrix_tc_ao_alpha ...' - !call wall_time(t0) - - Fock_matrix_tc_ao_alpha = ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_alpha - - !call wall_time(t1) - !print*, ' Wall time for Fock_matrix_tc_ao_alpha =', t1-t0 + Fock_matrix_tc_ao_alpha = ao_one_e_integrals_tc_tot + two_e_tc_integral_alpha END_PROVIDER @@ -169,13 +94,13 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_beta, (ao_num, ao_num)] implicit none - Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot + two_e_tc_non_hermit_integral_beta + Fock_matrix_tc_ao_beta = ao_one_e_integrals_tc_tot + two_e_tc_integral_beta END_PROVIDER ! --- -BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num) ] +BEGIN_PROVIDER [double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num)] BEGIN_DOC ! Total alpha TC Fock matrix : h_c + Two-e^TC terms on the MO basis @@ -185,31 +110,16 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_alpha, (mo_num, mo_num) ] double precision :: t0, t1, tt0, tt1 double precision, allocatable :: tmp(:,:) - !print*, ' Providing Fock_matrix_tc_mo_alpha ...' 
- !call wall_time(t0) + PROVIDE mo_l_coef mo_r_coef - if(bi_ortho) then - - PROVIDE mo_l_coef mo_r_coef - - call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) & - , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) ) - - if(three_body_h_tc) then - PROVIDE fock_3e_uhf_mo_a - Fock_matrix_tc_mo_alpha += fock_3e_uhf_mo_a - endif - - else - - call ao_to_mo( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) & - , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) ) + call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_alpha, size(Fock_matrix_tc_ao_alpha, 1) & + , Fock_matrix_tc_mo_alpha, size(Fock_matrix_tc_mo_alpha, 1) ) + if(three_body_h_tc) then + PROVIDE fock_3e_mo_a + Fock_matrix_tc_mo_alpha += fock_3e_mo_a endif - !call wall_time(t1) - !print*, ' Wall time for Fock_matrix_tc_mo_alpha =', t1-t0 - END_PROVIDER ! --- @@ -223,21 +133,12 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_beta, (mo_num,mo_num) ] implicit none double precision, allocatable :: tmp(:,:) - if(bi_ortho) then - - call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) & - , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) ) - - if(three_body_h_tc) then - PROVIDE fock_3e_uhf_mo_b - Fock_matrix_tc_mo_beta += fock_3e_uhf_mo_b - endif - - else - - call ao_to_mo( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) & - , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) ) + call ao_to_mo_bi_ortho( Fock_matrix_tc_ao_beta, size(Fock_matrix_tc_ao_beta, 1) & + , Fock_matrix_tc_mo_beta, size(Fock_matrix_tc_mo_beta, 1) ) + if(three_body_h_tc) then + PROVIDE fock_3e_mo_b + Fock_matrix_tc_mo_beta += fock_3e_mo_b endif END_PROVIDER @@ -286,20 +187,895 @@ BEGIN_PROVIDER [ double precision, Fock_matrix_tc_ao_tot, (ao_num, ao_num) ] implicit none double precision :: t0, t1 - !print*, ' Providing Fock_matrix_tc_ao_tot ...' - !call wall_time(t0) - PROVIDE mo_l_coef mo_r_coef PROVIDE Fock_matrix_tc_mo_tot call mo_to_ao_bi_ortho( Fock_matrix_tc_mo_tot, size(Fock_matrix_tc_mo_tot, 1) & , Fock_matrix_tc_ao_tot, size(Fock_matrix_tc_ao_tot, 1) ) - !call wall_time(t1) - !print*, ' Wall time for Fock_matrix_tc_ao_tot =', t1-t0 - END_PROVIDER ! --- + +! --- + +BEGIN_PROVIDER [double precision, fock_3e_mo_a, (mo_num, mo_num)] + + BEGIN_DOC + ! + ! Fock matrix alpha from three-electron terms + ! + ! WARNING :: non hermitian if bi-ortho MOS used + ! + END_DOC + + implicit none + double precision :: ti, tf + + PROVIDE mo_l_coef mo_r_coef + + ! CLOSED-SHELL PART + PROVIDE fock_3e_mo_cs + fock_3e_mo_a = fock_3e_mo_cs + + if(elec_alpha_num .ne. elec_beta_num) then + + ! OPEN-SHELL PART + PROVIDE fock_3e_mo_a_os + + fock_3e_mo_a += fock_3e_mo_a_os + endif + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [double precision, fock_3e_mo_b, (mo_num, mo_num)] + + BEGIN_DOC + ! + ! Fock matrix beta from three-electron terms + ! + ! WARNING :: non hermitian if bi-ortho MOS used + ! + END_DOC + + implicit none + double precision :: ti, tf + + PROVIDE mo_l_coef mo_r_coef + + ! CLOSED-SHELL PART + PROVIDE fock_3e_mo_cs + fock_3e_mo_b = fock_3e_mo_cs + + if(elec_alpha_num .ne. elec_beta_num) then + + ! OPEN-SHELL PART + PROVIDE fock_3e_mo_b_os + + fock_3e_mo_b += fock_3e_mo_b_os + endif + +END_PROVIDER + +! --- + + +! --- + + BEGIN_PROVIDER [double precision, fock_3e_mo_a_os, (mo_num, mo_num)] +&BEGIN_PROVIDER [double precision, fock_3e_mo_b_os, (mo_num, mo_num)] + + BEGIN_DOC + ! + ! Open Shell part of the Fock matrix from three-electron terms + ! + ! 
WARNING :: non hermitian if bi-ortho MOS used + ! + END_DOC + + implicit none + integer :: a, b, i, j, ipoint + double precision :: loc_1, loc_2, loc_3, loc_4 + double precision :: ti, tf + double precision, allocatable :: Okappa(:), Jkappa(:,:), Obarkappa(:), Jbarkappa(:,:) + double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:) + double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:) + double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:) + + PROVIDE mo_l_coef mo_r_coef + + ! --- + + allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid)) + allocate(Jbarkappa(n_points_final_grid,3), Obarkappa(n_points_final_grid)) + Jkappa = 0.d0 + Okappa = 0.d0 + Jbarkappa = 0.d0 + Obarkappa = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2) & + !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, Okappa, Jkappa, Obarkappa, Jbarkappa) + + allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid)) + + tmp_omp_d2 = 0.d0 + tmp_omp_d1 = 0.d0 + !$OMP DO + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i) + tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i) + tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i) + tmp_omp_d1(ipoint) += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) + enddo + enddo + !$OMP END DO NOWAIT + !$OMP CRITICAL + do ipoint = 1, n_points_final_grid + Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) + Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) + Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) + Okappa(ipoint) += tmp_omp_d1(ipoint) + enddo + !$OMP END CRITICAL + + tmp_omp_d2 = 0.d0 + tmp_omp_d1 = 0.d0 + !$OMP DO + do i = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i) + tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i) + tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i) + tmp_omp_d1(ipoint) += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) + enddo + enddo + !$OMP END DO NOWAIT + !$OMP CRITICAL + do ipoint = 1, n_points_final_grid + Jbarkappa(ipoint,1) += tmp_omp_d2(ipoint,1) + Jbarkappa(ipoint,2) += tmp_omp_d2(ipoint,2) + Jbarkappa(ipoint,3) += tmp_omp_d2(ipoint,3) + Obarkappa(ipoint) += tmp_omp_d1(ipoint) + enddo + !$OMP END CRITICAL + + deallocate(tmp_omp_d2, tmp_omp_d1) + !$OMP END PARALLEL + + ! 
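+ ! [editor note, hedged reading of the parallel region above; not part of the original commit]
+ ! With Nb = elec_alpha/beta shorthand: Nb = elec_beta_num, Na = elec_alpha_num,
+ !   Jkappa(ipoint,1:3) = sum over i = 1..Nb of int2_grad1_u12_bimo_t(ipoint,1:3,i,i)
+ !   Okappa(ipoint)     = sum over i = 1..Nb of mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i)
+ ! Jbarkappa and Obarkappa are the same sums restricted to the open-shell orbitals i = Nb+1..Na.
+ ! These grid-resolved intermediates are reused by all the contractions below.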
--- + + allocate(tmp_1(n_points_final_grid,4)) + + do ipoint = 1, n_points_final_grid + + loc_1 = -2.d0 * Okappa (ipoint) + loc_2 = -2.d0 * Obarkappa(ipoint) + loc_3 = Obarkappa(ipoint) + + tmp_1(ipoint,1) = (loc_1 - loc_3) * Jbarkappa(ipoint,1) + loc_2 * Jkappa(ipoint,1) + tmp_1(ipoint,2) = (loc_1 - loc_3) * Jbarkappa(ipoint,2) + loc_2 * Jkappa(ipoint,2) + tmp_1(ipoint,3) = (loc_1 - loc_3) * Jbarkappa(ipoint,3) + loc_2 * Jkappa(ipoint,3) + + tmp_1(ipoint,4) = Obarkappa(ipoint) + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, i, j, loc_1, loc_2, tmp_omp_d2) & + !$OMP SHARED (n_points_final_grid, elec_beta_num, elec_alpha_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, tmp_1) + + allocate(tmp_omp_d2(n_points_final_grid,3)) + + tmp_omp_d2 = 0.d0 + !$OMP DO COLLAPSE(2) + do i = 1, elec_beta_num + do j = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) + loc_2 = mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) + + tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,1,j,i) + tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,2,j,i) + tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) + loc_2 * int2_grad1_u12_bimo_t(ipoint,3,j,i) + enddo + enddo + enddo + !$OMP END DO NOWAIT + !$OMP CRITICAL + do ipoint = 1, n_points_final_grid + tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) + tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) + tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) + enddo + !$OMP END CRITICAL + + tmp_omp_d2 = 0.d0 + !$OMP DO COLLAPSE(2) + do i = elec_beta_num+1, elec_alpha_num + do j = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) + + tmp_omp_d2(ipoint,1) += loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) + tmp_omp_d2(ipoint,2) += loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) + tmp_omp_d2(ipoint,3) += loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) + enddo + enddo + enddo + !$OMP END DO NOWAIT + !$OMP CRITICAL + do ipoint = 1, n_points_final_grid + tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) + tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) + tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) + enddo + !$OMP END CRITICAL + + deallocate(tmp_omp_d2) + !$OMP END PARALLEL + + ! 
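+ ! [editor note, hedged] tmp_1 is the grid-resolved right-hand vector of the dgemv
+ ! contraction performed in the next block: components 1-3 hold
+ ! -(2*Okappa + Obarkappa)*Jbarkappa - 2*Obarkappa*Jkappa plus the occupied/open-shell
+ ! cross terms accumulated just above, and component 4 is Obarkappa.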
--- + + allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, a, b) & + !$OMP SHARED (n_points_final_grid, mo_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & + !$OMP tmp_2) + !$OMP DO COLLAPSE(2) + do a = 1, mo_num + do b = 1, mo_num + do ipoint = 1, n_points_final_grid + tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a) + tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a) + tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, a, b, i) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP tmp_2) + !$OMP DO COLLAPSE(2) + do a = 1, mo_num + do b = 1, mo_num + + tmp_2(:,4,b,a) = 0.d0 + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + tmp_2(ipoint,4,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, 1.d0 & + , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) & + , tmp_1(1,1), 1 & + , 0.d0, fock_3e_mo_b_os(1,1), 1) + + deallocate(tmp_1, tmp_2) + + ! --- + + allocate(tmp_3(n_points_final_grid,2,mo_num), tmp_4(n_points_final_grid,2,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, loc_1, loc_2) & + !$OMP SHARED (n_points_final_grid, mo_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + tmp_3(:,:,b) = 0.d0 + tmp_4(:,:,b) = 0.d0 + do ipoint = 1, n_points_final_grid + + tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) + + loc_1 = -2.0d0 * mos_r_in_r_array_transp(ipoint,b) + + tmp_4(ipoint,1,b) = loc_1 * ( Jbarkappa(ipoint,1) * (Jkappa(ipoint,1) + 0.25d0 * Jbarkappa(ipoint,1)) & + + Jbarkappa(ipoint,2) * (Jkappa(ipoint,2) + 0.25d0 * Jbarkappa(ipoint,2)) & + + Jbarkappa(ipoint,3) * (Jkappa(ipoint,3) + 0.25d0 * Jbarkappa(ipoint,3)) ) + + tmp_4(ipoint,2,b) = mos_r_in_r_array_transp(ipoint,b) + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP Jkappa, Jbarkappa, tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) + loc_2 = mos_r_in_r_array_transp(ipoint,i) + + tmp_3(ipoint,2,b) += loc_1 * ( Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) & + + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) & + + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) ) + + tmp_4(ipoint,1,b) += loc_2 * ( 
Jbarkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & + + Jbarkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & + + Jbarkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + do i = 1, elec_beta_num + do j = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_2 = mos_r_in_r_array_transp(ipoint,b) + + tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & + + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & + + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) + enddo + enddo + enddo + + do i = elec_beta_num+1, elec_alpha_num + do j = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b) + + tmp_4(ipoint,1,b) += loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & + + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & + + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + call dgemm( 'T', 'N', mo_num, mo_num, 2*n_points_final_grid, 1.d0 & + , tmp_3(1,1,1), 2*n_points_final_grid & + , tmp_4(1,1,1), 2*n_points_final_grid & + , 1.d0, fock_3e_mo_b_os(1,1), mo_num) + + deallocate(tmp_3, tmp_4) + + ! --- + + fock_3e_mo_a_os = fock_3e_mo_b_os + + allocate(tmp_1(n_points_final_grid,1)) + + do ipoint = 1, n_points_final_grid + tmp_1(ipoint,1) = Obarkappa(ipoint) + 2.d0 * Okappa(ipoint) + enddo + + allocate(tmp_2(n_points_final_grid,1,mo_num,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, a, b, i) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP tmp_2) + !$OMP DO COLLAPSE(2) + do a = 1, mo_num + do b = 1, mo_num + + tmp_2(:,1,b,a) = 0.d0 + do i = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + tmp_2(ipoint,1,b,a) += final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + call dgemv( 'T', n_points_final_grid, mo_num*mo_num, 1.d0 & + , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) & + , tmp_1(1,1), 1 & + , 1.d0, fock_3e_mo_a_os(1,1), 1) + + deallocate(tmp_1, tmp_2) + + ! 
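+ ! [editor sketch, not part of the original change] The dgemv/dgemm calls in this
+ ! provider are plain BLAS contractions over the grid/component index. For instance,
+ ! the dgemv just above is equivalent to the naive loop
+ !   do a = 1, mo_num
+ !     do b = 1, mo_num
+ !       do ipoint = 1, n_points_final_grid
+ !         fock_3e_mo_a_os(b,a) += tmp_2(ipoint,1,b,a) * tmp_1(ipoint,1)
+ !       enddo
+ !     enddo
+ !   enddo
+ ! and the dgemm calls accumulate sum over (ipoint,k) of tmp_3(ipoint,k,b)*tmp_4(ipoint,k,a)
+ ! into element (b,a).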
--- + + allocate(tmp_3(n_points_final_grid,8,mo_num), tmp_4(n_points_final_grid,8,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b) & + !$OMP SHARED (n_points_final_grid, mo_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP final_weight_at_r_vector, Jkappa, Jbarkappa, tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + tmp_3(:,:,b) = 0.d0 + tmp_4(:,:,b) = 0.d0 + do ipoint = 1, n_points_final_grid + + tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) + + tmp_4(ipoint,8,b) = mos_r_in_r_array_transp(ipoint,b) + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2, loc_3, loc_4) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP Jkappa, Jbarkappa, tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) + loc_2 = mos_r_in_r_array_transp(ipoint,i) + + tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i) + tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i) + tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i) + + tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) + tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) + tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) + enddo + enddo + + do i = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) + loc_3 = 2.d0 * loc_1 + loc_2 = mos_r_in_r_array_transp(ipoint,i) + loc_4 = 2.d0 * loc_2 + + tmp_3(ipoint,5,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i) + tmp_3(ipoint,6,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i) + tmp_3(ipoint,7,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i) + + tmp_3(ipoint,8,b) += loc_3 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,b,i) & + + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,b,i) & + + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,b,i) ) + + tmp_4(ipoint,1,b) += loc_4 * ( (Jkappa(ipoint,1) + 0.5d0 * Jbarkappa(ipoint,1)) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & + + (Jkappa(ipoint,2) + 0.5d0 * Jbarkappa(ipoint,2)) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & + + (Jkappa(ipoint,3) + 0.5d0 * Jbarkappa(ipoint,3)) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) + + tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) + tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) + tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) + + tmp_4(ipoint,5,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) + tmp_4(ipoint,6,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) + tmp_4(ipoint,7,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, elec_alpha_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + + do i = 1, 
elec_beta_num + do j = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) + loc_2 = mos_r_in_r_array_transp(ipoint,b) + loc_3 = mos_r_in_r_array_transp(ipoint,i) + + tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) ) + + tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) & + + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) & + + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) + loc_3 = mos_r_in_r_array_transp(ipoint,j) + + tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & + + int2_grad1_u12_bimo_t(ipoint,2,b,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & + + int2_grad1_u12_bimo_t(ipoint,3,b,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) + + tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,j,i) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & + + int2_grad1_u12_bimo_t(ipoint,2,j,i) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & + + int2_grad1_u12_bimo_t(ipoint,3,j,i) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) + enddo + enddo + enddo + + do i = elec_beta_num+1, elec_alpha_num + do j = elec_beta_num+1, elec_alpha_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) + loc_2 = 0.5d0 * mos_r_in_r_array_transp(ipoint,b) + loc_3 = mos_r_in_r_array_transp(ipoint,i) + + tmp_3(ipoint,8,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) ) + + tmp_4(ipoint,1,b) -= loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) & + + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,b) & + + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + call dgemm( 'T', 'N', mo_num, mo_num, 8*n_points_final_grid, 1.d0 & + , tmp_3(1,1,1), 8*n_points_final_grid & + , tmp_4(1,1,1), 8*n_points_final_grid & + , 1.d0, fock_3e_mo_a_os(1,1), mo_num) + + deallocate(tmp_3, tmp_4) + deallocate(Jkappa, Okappa) + +END_PROVIDER + +! --- + +BEGIN_PROVIDER [double precision, fock_3e_mo_cs, (mo_num, mo_num)] + + implicit none + integer :: a, b, i, j, ipoint + double precision :: ti, tf + double precision :: loc_1, loc_2, loc_3 + double precision, allocatable :: Okappa(:), Jkappa(:,:) + double precision, allocatable :: tmp_omp_d1(:), tmp_omp_d2(:,:) + double precision, allocatable :: tmp_1(:,:), tmp_2(:,:,:,:), tmp_22(:,:,:) + double precision, allocatable :: tmp_3(:,:,:), tmp_4(:,:,:) + + PROVIDE mo_l_coef mo_r_coef + + ! 
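+ ! [editor note, hedged] fock_3e_mo_cs is the closed-shell contribution of the
+ ! three-electron term to the TC Fock matrix in the bi-orthogonal MO basis. It is
+ ! the common piece reused by fock_3e_mo_a and fock_3e_mo_b above; the open-shell
+ ! corrections are added separately through fock_3e_mo_a_os and fock_3e_mo_b_os.
+ ! The construction below mirrors the open-shell provider: grid intermediates
+ ! (Jkappa, Okappa) over the doubly occupied orbitals, followed by dgemv/dgemm
+ ! contractions.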
--- + + allocate(Jkappa(n_points_final_grid,3), Okappa(n_points_final_grid)) + Jkappa = 0.d0 + Okappa = 0.d0 + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, i, tmp_omp_d1, tmp_omp_d2) & + !$OMP SHARED (n_points_final_grid, elec_beta_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, Okappa, Jkappa) + + allocate(tmp_omp_d2(n_points_final_grid,3), tmp_omp_d1(n_points_final_grid)) + tmp_omp_d2 = 0.d0 + tmp_omp_d1 = 0.d0 + + !$OMP DO + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + tmp_omp_d2(ipoint,1) += int2_grad1_u12_bimo_t(ipoint,1,i,i) + tmp_omp_d2(ipoint,2) += int2_grad1_u12_bimo_t(ipoint,2,i,i) + tmp_omp_d2(ipoint,3) += int2_grad1_u12_bimo_t(ipoint,3,i,i) + tmp_omp_d1(ipoint) += mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + do ipoint = 1, n_points_final_grid + Jkappa(ipoint,1) += tmp_omp_d2(ipoint,1) + Jkappa(ipoint,2) += tmp_omp_d2(ipoint,2) + Jkappa(ipoint,3) += tmp_omp_d2(ipoint,3) + Okappa(ipoint) += tmp_omp_d1(ipoint) + enddo + !$OMP END CRITICAL + + deallocate(tmp_omp_d2, tmp_omp_d1) + + !$OMP END PARALLEL + + ! --- + + allocate(tmp_1(n_points_final_grid,4)) + + do ipoint = 1, n_points_final_grid + loc_1 = 2.d0 * Okappa(ipoint) + tmp_1(ipoint,1) = loc_1 * Jkappa(ipoint,1) + tmp_1(ipoint,2) = loc_1 * Jkappa(ipoint,2) + tmp_1(ipoint,3) = loc_1 * Jkappa(ipoint,3) + tmp_1(ipoint,4) = Okappa(ipoint) + enddo + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, i, j, loc_1, tmp_omp_d2) & + !$OMP SHARED (n_points_final_grid, elec_beta_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, tmp_1) + + allocate(tmp_omp_d2(n_points_final_grid,3)) + tmp_omp_d2 = 0.d0 + + !$OMP DO COLLAPSE(2) + do i = 1, elec_beta_num + do j = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + + loc_1 = mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) + + tmp_omp_d2(ipoint,1) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,i,j) + tmp_omp_d2(ipoint,2) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,i,j) + tmp_omp_d2(ipoint,3) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,i,j) + enddo + enddo + enddo + !$OMP END DO NOWAIT + + !$OMP CRITICAL + do ipoint = 1, n_points_final_grid + tmp_1(ipoint,1) += tmp_omp_d2(ipoint,1) + tmp_1(ipoint,2) += tmp_omp_d2(ipoint,2) + tmp_1(ipoint,3) += tmp_omp_d2(ipoint,3) + enddo + !$OMP END CRITICAL + + deallocate(tmp_omp_d2) + !$OMP END PARALLEL + + ! 
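+ ! [editor note, hedged] At this point tmp_1(ipoint,1:3) holds 2*Okappa*Jkappa minus
+ ! the exchange-like double sum over occupied pairs accumulated just above, and
+ ! tmp_1(ipoint,4) = Okappa(ipoint). It is the vector contracted against tmp_2
+ ! (or tmp_22 in the low-memory branch) in the next block.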
--- + + if(tc_save_mem) then + + allocate(tmp_22(n_points_final_grid,4,mo_num)) + do a = 1, mo_num + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, a, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & + !$OMP tmp_22) + !$OMP DO + do b = 1, mo_num + do ipoint = 1, n_points_final_grid + tmp_22(ipoint,1,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a) + tmp_22(ipoint,2,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a) + tmp_22(ipoint,3,b) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a) + enddo + tmp_22(:,4,b) = 0.d0 + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + tmp_22(ipoint,4,b) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemv( 'T', 4*n_points_final_grid, mo_num, -2.d0 & + , tmp_22(1,1,1), size(tmp_22, 1) * size(tmp_22, 2) & + , tmp_1(1,1), 1 & + , 0.d0, fock_3e_mo_cs(1,a), 1) + enddo + deallocate(tmp_22) + + else + + allocate(tmp_2(n_points_final_grid,4,mo_num,mo_num)) + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, a, b, i) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP int2_grad1_u12_bimo_t, final_weight_at_r_vector, & + !$OMP tmp_2) + !$OMP DO COLLAPSE(2) + do a = 1, mo_num + do b = 1, mo_num + do ipoint = 1, n_points_final_grid + tmp_2(ipoint,1,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,1,b,a) + tmp_2(ipoint,2,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,2,b,a) + tmp_2(ipoint,3,b,a) = final_weight_at_r_vector(ipoint) * int2_grad1_u12_bimo_t(ipoint,3,b,a) + enddo + tmp_2(:,4,b,a) = 0.d0 + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + tmp_2(ipoint,4,b,a) -= final_weight_at_r_vector(ipoint) * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,a) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,a) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,a) ) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + call dgemv( 'T', 4*n_points_final_grid, mo_num*mo_num, -2.d0 & + , tmp_2(1,1,1,1), size(tmp_2, 1) * size(tmp_2, 2) & + , tmp_1(1,1), 1 & + , 0.d0, fock_3e_mo_cs(1,1), 1) + deallocate(tmp_2) + + endif + + deallocate(tmp_1) + + ! 
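+ ! [editor sketch, not part of the original change] Whichever branch is taken,
+ ! the result at this point is
+ !   fock_3e_mo_cs(b,a) = -2 * sum over (ipoint, k=1..4) of tmp_2(ipoint,k,b,a) * tmp_1(ipoint,k)
+ ! The tc_save_mem branch builds one column a at a time with a
+ ! (n_points_final_grid,4,mo_num) buffer instead of the full
+ ! (n_points_final_grid,4,mo_num,mo_num) array, at the cost of mo_num smaller
+ ! dgemv calls instead of a single large one. The dgemm below then adds the
+ ! remaining tmp_3/tmp_4 contraction.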
--- + + allocate(tmp_3(n_points_final_grid,5,mo_num), tmp_4(n_points_final_grid,5,mo_num)) + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, loc_1, loc_2) & + !$OMP SHARED (n_points_final_grid, mo_num, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP final_weight_at_r_vector, Jkappa, tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + tmp_3(:,:,b) = 0.d0 + tmp_4(:,:,b) = 0.d0 + do ipoint = 1, n_points_final_grid + tmp_3(ipoint,1,b) = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,b) + + tmp_4(ipoint,1,b) = -2.d0 * mos_r_in_r_array_transp(ipoint,b) * ( Jkappa(ipoint,1) * Jkappa(ipoint,1) & + + Jkappa(ipoint,2) * Jkappa(ipoint,2) & + + Jkappa(ipoint,3) * Jkappa(ipoint,3) ) + tmp_4(ipoint,5,b) = mos_r_in_r_array_transp(ipoint,b) + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i, loc_1, loc_2) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP Jkappa, tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + do i = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,i) + loc_2 = mos_r_in_r_array_transp(ipoint,i) + + tmp_3(ipoint,2,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,1,b,i) + tmp_3(ipoint,3,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,2,b,i) + tmp_3(ipoint,4,b) -= loc_1 * int2_grad1_u12_bimo_t(ipoint,3,b,i) + tmp_3(ipoint,5,b) += 2.d0 * loc_1 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,b,i) & + + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,b,i) & + + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,b,i) ) + + tmp_4(ipoint,2,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,1,i,b) + tmp_4(ipoint,3,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,2,i,b) + tmp_4(ipoint,4,b) += loc_2 * int2_grad1_u12_bimo_t(ipoint,3,i,b) + tmp_4(ipoint,1,b) += 2.d0 * loc_2 * ( Jkappa(ipoint,1) * int2_grad1_u12_bimo_t(ipoint,1,i,b) & + + Jkappa(ipoint,2) * int2_grad1_u12_bimo_t(ipoint,2,i,b) & + + Jkappa(ipoint,3) * int2_grad1_u12_bimo_t(ipoint,3,i,b) ) + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + !$OMP PARALLEL & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint, b, i, j, loc_1, loc_2, loc_3) & + !$OMP SHARED (n_points_final_grid, mo_num, elec_beta_num, & + !$OMP final_weight_at_r_vector, int2_grad1_u12_bimo_t, & + !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & + !$OMP tmp_3, tmp_4) + !$OMP DO + do b = 1, mo_num + do i = 1, elec_beta_num + do j = 1, elec_beta_num + do ipoint = 1, n_points_final_grid + + loc_1 = final_weight_at_r_vector(ipoint) * mos_l_in_r_array_transp(ipoint,j) + loc_2 = mos_r_in_r_array_transp(ipoint,b) + loc_3 = mos_r_in_r_array_transp(ipoint,i) + + tmp_3(ipoint,5,b) -= loc_1 * ( int2_grad1_u12_bimo_t(ipoint,1,b,i) * int2_grad1_u12_bimo_t(ipoint,1,i,j) & + + int2_grad1_u12_bimo_t(ipoint,2,b,i) * int2_grad1_u12_bimo_t(ipoint,2,i,j) & + + int2_grad1_u12_bimo_t(ipoint,3,b,i) * int2_grad1_u12_bimo_t(ipoint,3,i,j) ) + + tmp_4(ipoint,1,b) += ( loc_2 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & + + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & + + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) ) & + - loc_3 * ( int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,b) & + + int2_grad1_u12_bimo_t(ipoint,2,i,j) * 
int2_grad1_u12_bimo_t(ipoint,2,j,b) & + + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,b) ) ) + enddo + enddo + enddo + enddo + !$OMP END DO + !$OMP END PARALLEL + + ! --- + + call dgemm( 'T', 'N', mo_num, mo_num, 5*n_points_final_grid, 1.d0 & + , tmp_3(1,1,1), 5*n_points_final_grid & + , tmp_4(1,1,1), 5*n_points_final_grid & + , 1.d0, fock_3e_mo_cs(1,1), mo_num) + + deallocate(tmp_3, tmp_4) + deallocate(Jkappa, Okappa) + + ! --- + +END_PROVIDER + +! --- + diff --git a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f index eb8973ff..fd490af6 100644 --- a/plugins/local/tc_scf/fock_tc_mo_tot.irp.f +++ b/plugins/local/tc_scf/fock_tc_mo_tot.irp.f @@ -1,4 +1,6 @@ +! --- + BEGIN_PROVIDER [ double precision, Fock_matrix_tc_mo_tot, (mo_num,mo_num) ] &BEGIN_PROVIDER [ double precision, Fock_matrix_tc_diag_mo_tot, (mo_num)] @@ -23,9 +25,6 @@ integer :: i, j, n double precision :: t0, t1 - !print*, ' Providing Fock_matrix_tc_mo_tot ...' - !call wall_time(t0) - if(elec_alpha_num == elec_beta_num) then PROVIDE Fock_matrix_tc_mo_alpha @@ -133,7 +132,7 @@ enddo endif - if(no_oa_or_av_opt)then + if(no_oa_or_av_opt) then do i = 1, n_act_orb iorb = list_act(i) do j = 1, n_inact_orb @@ -154,12 +153,25 @@ enddo endif - if(.not.bi_ortho .and. three_body_h_tc)then - Fock_matrix_tc_mo_tot += fock_3_mat - endif + if(tc_Brillouin_Right) then - !call wall_time(t1) - !print*, ' Wall time for Fock_matrix_tc_mo_tot =', t1-t0 + double precision, allocatable :: tmp(:,:) + allocate(tmp(mo_num,mo_num)) + + tmp = Fock_matrix_tc_mo_tot + do j = 1, mo_num + do i = 1, j-1 + tmp(i,j) = Fock_matrix_tc_mo_tot(j,i) + enddo + enddo + + Fock_matrix_tc_mo_tot = tmp + deallocate(tmp) + + endif END_PROVIDER +! --- + + diff --git a/plugins/local/tc_scf/fock_three_hermit.irp.f b/plugins/local/tc_scf/fock_three_hermit.irp.f deleted file mode 100644 index 00d47fae..00000000 --- a/plugins/local/tc_scf/fock_three_hermit.irp.f +++ /dev/null @@ -1,771 +0,0 @@ - -! --- - -BEGIN_PROVIDER [ double precision, fock_3_mat, (mo_num, mo_num)] - - implicit none - integer :: i,j - double precision :: contrib - - fock_3_mat = 0.d0 - if(.not.bi_ortho .and. three_body_h_tc) then - - call give_fock_ia_three_e_total(1, 1, contrib) - !! !$OMP PARALLEL & - !! !$OMP DEFAULT (NONE) & - !! !$OMP PRIVATE (i,j,m,integral) & - !! !$OMP SHARED (mo_num,three_body_3_index) - !! !$OMP DO SCHEDULE (guided) COLLAPSE(3) - do i = 1, mo_num - do j = 1, mo_num - call give_fock_ia_three_e_total(j,i,contrib) - fock_3_mat(j,i) = -contrib - enddo - enddo - !else if(bi_ortho.and.three_body_h_tc) then - !! !$OMP END DO - !! !$OMP END PARALLEL - !! do i = 1, mo_num - !! do j = 1, i-1 - !! mat_three(j,i) = mat_three(i,j) - !! enddo - !! enddo - endif - -END_PROVIDER - - -subroutine give_fock_ia_three_e_total(i,a,contrib) - implicit none - BEGIN_DOC -! contrib is the TOTAL (same spins / opposite spins) contribution from the three body term to the Fock operator -! 
- END_DOC - integer, intent(in) :: i,a - double precision, intent(out) :: contrib - double precision :: int_1, int_2, int_3 - double precision :: mos_i, mos_a, w_ia - double precision :: mos_ia, weight - - integer :: mm, ipoint,k,l - - int_1 = 0.d0 - int_2 = 0.d0 - int_3 = 0.d0 - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - mos_i = mos_in_r_array_transp(ipoint,i) - mos_a = mos_in_r_array_transp(ipoint,a) - mos_ia = mos_a * mos_i - w_ia = x_W_ij_erf_rk(ipoint,mm,i,a) - - int_1 += weight * fock_3_w_kk_sum(ipoint,mm) * (4.d0 * fock_3_rho_beta(ipoint) * w_ia & - + 2.0d0 * mos_ia * fock_3_w_kk_sum(ipoint,mm) & - - 2.0d0 * fock_3_w_ki_mos_k(ipoint,mm,i) * mos_a & - - 2.0d0 * fock_3_w_ki_mos_k(ipoint,mm,a) * mos_i ) - int_2 += weight * (-1.d0) * ( 2.0d0 * fock_3_w_kl_mo_k_mo_l(ipoint,mm) * w_ia & - + 2.0d0 * fock_3_rho_beta(ipoint) * fock_3_w_ki_wk_a(ipoint,mm,i,a) & - + 1.0d0 * mos_ia * fock_3_trace_w_tilde(ipoint,mm) ) - - int_3 += weight * 1.d0 * (fock_3_w_kl_wla_phi_k(ipoint,mm,i) * mos_a + fock_3_w_kl_wla_phi_k(ipoint,mm,a) * mos_i & - +fock_3_w_ki_mos_k(ipoint,mm,i) * fock_3_w_ki_mos_k(ipoint,mm,a) ) - enddo - enddo - contrib = int_1 + int_2 + int_3 - -end - -! --- - -BEGIN_PROVIDER [double precision, diag_three_elem_hf] - - implicit none - integer :: i, j, k, ipoint, mm - double precision :: contrib, weight, four_third, one_third, two_third, exchange_int_231 - double precision :: integral_aaa, hthree, integral_aab, integral_abb, integral_bbb - double precision, allocatable :: tmp(:) - double precision, allocatable :: tmp_L(:,:), tmp_R(:,:) - double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:) - double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:) - - PROVIDE mo_l_coef mo_r_coef - - !print *, ' providing diag_three_elem_hf' - - if(.not. three_body_h_tc) then - - if(noL_standard) then - PROVIDE noL_0e - diag_three_elem_hf = noL_0e - else - diag_three_elem_hf = 0.d0 - endif - - else - - if(.not. bi_ortho) then - - ! --- - - one_third = 1.d0/3.d0 - two_third = 2.d0/3.d0 - four_third = 4.d0/3.d0 - diag_three_elem_hf = 0.d0 - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do k = 1, elec_beta_num - call give_integrals_3_body(k, j, i, j, i, k, exchange_int_231) - diag_three_elem_hf += two_third * exchange_int_231 - enddo - enddo - enddo - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - contrib = 3.d0 * fock_3_w_kk_sum(ipoint,mm) * fock_3_rho_beta(ipoint) * fock_3_w_kk_sum(ipoint,mm) & - - 2.d0 * fock_3_w_kl_mo_k_mo_l(ipoint,mm) * fock_3_w_kk_sum(ipoint,mm) & - - 1.d0 * fock_3_rho_beta(ipoint) * fock_3_w_kl_w_kl(ipoint,mm) - contrib *= four_third - contrib += -two_third * fock_3_rho_beta(ipoint) * fock_3_w_kl_w_kl(ipoint,mm) & - -four_third * fock_3_w_kk_sum(ipoint,mm) * fock_3_w_kl_mo_k_mo_l(ipoint,mm) - diag_three_elem_hf += weight * contrib - enddo - enddo - - diag_three_elem_hf = - diag_three_elem_hf - - ! --- - - else - - ! ------------ - ! SLOW VERSION - ! ------------ - - !call give_aaa_contrib(integral_aaa) - !call give_aab_contrib(integral_aab) - !call give_abb_contrib(integral_abb) - !call give_bbb_contrib(integral_bbb) - !diag_three_elem_hf = integral_aaa + integral_aab + integral_abb + integral_bbb - - ! ------------ - ! ------------ - - PROVIDE int2_grad1_u12_bimo_t - PROVIDE mos_l_in_r_array_transp - PROVIDE mos_r_in_r_array_transp - - if(elec_alpha_num .eq. 
elec_beta_num) then - - allocate(tmp(elec_beta_num)) - allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3)) - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) & - !$OMP SHARED(elec_beta_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector) - - !$OMP DO - do j = 1, elec_beta_num - - tmp_L = 0.d0 - tmp_R = 0.d0 - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) - - tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - - tmp(j) = 0.d0 - do ipoint = 1, n_points_final_grid - tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) - enddo - enddo ! j - !$OMP END DO - !$OMP END PARALLEL - - diag_three_elem_hf = -2.d0 * sum(tmp) - - deallocate(tmp) - deallocate(tmp_L, tmp_R) - - ! --- - - allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3)) - tmp_O = 0.d0 - tmp_J = 0.d0 - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) & - !$OMP SHARED(elec_beta_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp_O, tmp_J) - - allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3)) - tmp_O_priv = 0.d0 - tmp_J_priv = 0.d0 - - !$OMP DO - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i) - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - tmp_O = tmp_O + tmp_O_priv - tmp_J = tmp_J + tmp_J_priv - !$OMP END CRITICAL - - deallocate(tmp_O_priv, tmp_J_priv) - !$OMP END PARALLEL - - allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid)) - tmp_M = 0.d0 - tmp_S = 0.d0 - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) & - !$OMP SHARED(elec_beta_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp_M, tmp_S) - - allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid)) - tmp_M_priv = 0.d0 - tmp_S_priv = 0.d0 - - !$OMP DO COLLAPSE(2) - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * 
mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - tmp_M = tmp_M + tmp_M_priv - tmp_S = tmp_S + tmp_S_priv - !$OMP END CRITICAL - - deallocate(tmp_M_priv, tmp_S_priv) - !$OMP END PARALLEL - - allocate(tmp(n_points_final_grid)) - - do ipoint = 1, n_points_final_grid - - tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint) - - tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint) & - - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) & - + tmp_J(ipoint,2) * tmp_M(ipoint,2) & - + tmp_J(ipoint,3) * tmp_M(ipoint,3))) - enddo - - diag_three_elem_hf = diag_three_elem_hf -2.d0 * (sum(tmp)) - - deallocate(tmp) - - else - - allocate(tmp(elec_alpha_num)) - allocate(tmp_L(n_points_final_grid,3), tmp_R(n_points_final_grid,3)) - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) & - !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector) - - !$OMP DO - do j = 1, elec_beta_num - - tmp_L = 0.d0 - tmp_R = 0.d0 - do i = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - tmp_L(ipoint,1) = tmp_L(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) - - tmp_R(ipoint,1) = tmp_R(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - - tmp(j) = 0.d0 - do ipoint = 1, n_points_final_grid - tmp(j) = tmp(j) + final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) - enddo - - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) - - tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - - do ipoint = 1, n_points_final_grid - tmp(j) = tmp(j) + 
final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) - enddo - enddo ! j - !$OMP END DO - !$OMP END PARALLEL - - ! --- - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(j, i, ipoint, tmp_L, tmp_R) & - !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp, final_weight_at_r_vector) - - !$OMP DO - do j = elec_beta_num+1, elec_alpha_num - - tmp_L = 0.d0 - tmp_R = 0.d0 - do i = 1, elec_alpha_num - do ipoint = 1, n_points_final_grid - tmp_L(ipoint,1) = tmp_L(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,2) = tmp_L(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) - tmp_L(ipoint,3) = tmp_L(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) - - tmp_R(ipoint,1) = tmp_R(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,2) = tmp_R(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_R(ipoint,3) = tmp_R(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_r_in_r_array_transp(ipoint,i) - enddo - enddo - - tmp(j) = 0.d0 - do ipoint = 1, n_points_final_grid - tmp(j) = tmp(j) + 0.5d0 * final_weight_at_r_vector(ipoint) * (tmp_L(ipoint,1)*tmp_R(ipoint,1) + tmp_L(ipoint,2)*tmp_R(ipoint,2) + tmp_L(ipoint,3)*tmp_R(ipoint,3)) - enddo - enddo ! j - !$OMP END DO - !$OMP END PARALLEL - - diag_three_elem_hf = -2.d0 * sum(tmp) - - deallocate(tmp) - deallocate(tmp_L, tmp_R) - - ! --- - - allocate(tmp_O(n_points_final_grid), tmp_J(n_points_final_grid,3)) - tmp_O = 0.d0 - tmp_J = 0.d0 - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, ipoint, tmp_O_priv, tmp_J_priv) & - !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp_O, tmp_J) - - allocate(tmp_O_priv(n_points_final_grid), tmp_J_priv(n_points_final_grid,3)) - tmp_O_priv = 0.d0 - tmp_J_priv = 0.d0 - - !$OMP DO - do i = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,i,i) - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP DO - do i = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - tmp_O_priv(ipoint) = tmp_O_priv(ipoint) + 0.5d0 * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,i) - tmp_J_priv(ipoint,1) = tmp_J_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,i) - tmp_J_priv(ipoint,2) = tmp_J_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,i) - tmp_J_priv(ipoint,3) = tmp_J_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,i) - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - tmp_O = tmp_O + tmp_O_priv - tmp_J = tmp_J + tmp_J_priv - !$OMP END CRITICAL - - deallocate(tmp_O_priv, tmp_J_priv) - !$OMP END PARALLEL - - ! 
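The deleted loops above and the blocks that follow all use the same OpenMP reduction idiom: each thread accumulates into a PRIVATE copy of the work arrays, the worksharing loop ends with NOWAIT, and the per-thread copies are merged once inside a CRITICAL section. The following is a minimal, self-contained sketch of that idiom (program and variable names are made up for illustration and are not part of the patch); compiled with gfortran -fopenmp it prints 500500 for any thread count.

program omp_private_reduction
  implicit none
  integer, parameter :: n = 1000
  integer :: i
  double precision :: total(n), priv(n)

  total = 0.d0

  !$OMP PARALLEL DEFAULT(NONE) PRIVATE(i, priv) SHARED(total)
  priv = 0.d0                    ! thread-local accumulator, like tmp_O_priv / tmp_J_priv above
  !$OMP DO
  do i = 1, n
    priv(i) = priv(i) + dble(i)  ! contention-free work on the private copy
  enddo
  !$OMP END DO NOWAIT
  !$OMP CRITICAL                 ! merge the private copies, one thread at a time
  total = total + priv
  !$OMP END CRITICAL
  !$OMP END PARALLEL

  print *, 'sum = ', sum(total)  ! expect n*(n+1)/2 = 500500
end program omp_private_reduction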
--- - - allocate(tmp_M(n_points_final_grid,3), tmp_S(n_points_final_grid)) - tmp_M = 0.d0 - tmp_S = 0.d0 - - !$OMP PARALLEL & - !$OMP DEFAULT(NONE) & - !$OMP PRIVATE(i, j, ipoint, tmp_M_priv, tmp_S_priv) & - !$OMP SHARED(elec_beta_num, elec_alpha_num, n_points_final_grid, & - !$OMP mos_l_in_r_array_transp, mos_r_in_r_array_transp, & - !$OMP int2_grad1_u12_bimo_t, tmp_M, tmp_S) - - allocate(tmp_M_priv(n_points_final_grid,3), tmp_S_priv(n_points_final_grid)) - tmp_M_priv = 0.d0 - tmp_S_priv = 0.d0 - - !$OMP DO COLLAPSE(2) - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP DO COLLAPSE(2) - do i = elec_beta_num+1, elec_alpha_num - do j = 1, elec_beta_num - do ipoint = 1, n_points_final_grid - - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * mos_l_in_r_array_transp(ipoint,j) * mos_r_in_r_array_transp(ipoint,i) - - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP DO COLLAPSE(2) - do i = elec_beta_num+1, elec_alpha_num - do j = elec_beta_num+1, elec_alpha_num - do ipoint = 1, n_points_final_grid - - tmp_M_priv(ipoint,1) = tmp_M_priv(ipoint,1) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,1,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,2) = tmp_M_priv(ipoint,2) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - tmp_M_priv(ipoint,3) = tmp_M_priv(ipoint,3) + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,j,i) * mos_l_in_r_array_transp(ipoint,i) * mos_r_in_r_array_transp(ipoint,j) - - tmp_S_priv(ipoint) = tmp_S_priv(ipoint) + 0.5d0 * 
int2_grad1_u12_bimo_t(ipoint,1,i,j) * int2_grad1_u12_bimo_t(ipoint,1,j,i) & - + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,2,i,j) * int2_grad1_u12_bimo_t(ipoint,2,j,i) & - + 0.5d0 * int2_grad1_u12_bimo_t(ipoint,3,i,j) * int2_grad1_u12_bimo_t(ipoint,3,j,i) - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - tmp_M = tmp_M + tmp_M_priv - tmp_S = tmp_S + tmp_S_priv - !$OMP END CRITICAL - - deallocate(tmp_M_priv, tmp_S_priv) - !$OMP END PARALLEL - - allocate(tmp(n_points_final_grid)) - - do ipoint = 1, n_points_final_grid - - tmp_S(ipoint) = 2.d0 * (tmp_J(ipoint,1)*tmp_J(ipoint,1) + tmp_J(ipoint,2)*tmp_J(ipoint,2) + tmp_J(ipoint,3)*tmp_J(ipoint,3)) - tmp_S(ipoint) - - tmp(ipoint) = final_weight_at_r_vector(ipoint) * ( tmp_O(ipoint) * tmp_S(ipoint) & - - 2.d0 * ( tmp_J(ipoint,1) * tmp_M(ipoint,1) & - + tmp_J(ipoint,2) * tmp_M(ipoint,2) & - + tmp_J(ipoint,3) * tmp_M(ipoint,3))) - enddo - - diag_three_elem_hf = diag_three_elem_hf - 2.d0 * (sum(tmp)) - - deallocate(tmp) - - endif - - - endif - - endif - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, fock_3_mat_a_op_sh, (mo_num, mo_num)] - implicit none - integer :: h,p,i,j - double precision :: direct_int, exch_int, exchange_int_231, exchange_int_312 - double precision :: exchange_int_23, exchange_int_12, exchange_int_13 - - fock_3_mat_a_op_sh = 0.d0 - do h = 1, mo_num - do p = 1, mo_num - !F_a^{ab}(h,p) - do i = 1, elec_beta_num ! beta - do j = elec_beta_num+1, elec_alpha_num ! alpha - call give_integrals_3_body(h,j,i,p,j,i,direct_int) ! - call give_integrals_3_body(h,j,i,j,p,i,exch_int) - fock_3_mat_a_op_sh(h,p) -= direct_int - exch_int - enddo - enddo - !F_a^{aa}(h,p) - do i = 1, elec_beta_num ! alpha - do j = elec_beta_num+1, elec_alpha_num ! alpha - call give_integrals_3_body(h,j,i,p,j,i,direct_int) - call give_integrals_3_body(h,j,i,i,p,j,exchange_int_231) - call give_integrals_3_body(h,j,i,j,i,p,exchange_int_312) - call give_integrals_3_body(h,j,i,p,i,j,exchange_int_23) - call give_integrals_3_body(h,j,i,i,j,p,exchange_int_12) - call give_integrals_3_body(h,j,i,j,p,i,exchange_int_13) - fock_3_mat_a_op_sh(h,p) -= ( direct_int + exchange_int_231 + exchange_int_312 & - - exchange_int_23 & ! i <-> j - - exchange_int_12 & ! p <-> j - - exchange_int_13 )! p <-> i - enddo - enddo - enddo - enddo -! symmetrized -! do p = 1, elec_beta_num -! do h = elec_alpha_num +1, mo_num -! fock_3_mat_a_op_sh(h,p) = fock_3_mat_a_op_sh(p,h) -! enddo -! enddo - -! do h = elec_beta_num+1, elec_alpha_num -! do p = elec_alpha_num +1, mo_num -! !F_a^{bb}(h,p) -! do i = 1, elec_beta_num -! do j = i+1, elec_beta_num -! call give_integrals_3_body(h,j,i,p,j,i,direct_int) -! call give_integrals_3_body(h,j,i,p,i,j,exch_int) -! fock_3_mat_a_op_sh(h,p) -= direct_int - exch_int -! enddo -! enddo -! enddo -! 
enddo - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_mat_b_op_sh, (mo_num, mo_num)] - implicit none - integer :: h,p,i,j - double precision :: direct_int, exch_int - fock_3_mat_b_op_sh = 0.d0 - do h = 1, elec_beta_num - do p = elec_alpha_num +1, mo_num - !F_b^{aa}(h,p) - do i = 1, elec_beta_num - do j = elec_beta_num+1, elec_alpha_num - call give_integrals_3_body(h,j,i,p,j,i,direct_int) - call give_integrals_3_body(h,j,i,p,i,j,exch_int) - fock_3_mat_b_op_sh(h,p) += direct_int - exch_int - enddo - enddo - - !F_b^{ab}(h,p) - do i = elec_beta_num+1, elec_beta_num - do j = 1, elec_beta_num - call give_integrals_3_body(h,j,i,p,j,i,direct_int) - call give_integrals_3_body(h,j,i,j,p,i,exch_int) - fock_3_mat_b_op_sh(h,p) += direct_int - exch_int - enddo - enddo - - enddo - enddo - -END_PROVIDER - - -BEGIN_PROVIDER [ double precision, fock_3_w_kk_sum, (n_points_final_grid,3)] - implicit none - integer :: mm, ipoint,k - double precision :: w_kk - fock_3_w_kk_sum = 0.d0 - do k = 1, elec_beta_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - w_kk = x_W_ij_erf_rk(ipoint,mm,k,k) - fock_3_w_kk_sum(ipoint,mm) += w_kk - enddo - enddo - enddo -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_w_ki_mos_k, (n_points_final_grid,3,mo_num)] - implicit none - integer :: mm, ipoint,k,i - double precision :: w_ki, mo_k - fock_3_w_ki_mos_k = 0.d0 - do i = 1, mo_num - do k = 1, elec_beta_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - w_ki = x_W_ij_erf_rk(ipoint,mm,k,i) - mo_k = mos_in_r_array(k,ipoint) - fock_3_w_ki_mos_k(ipoint,mm,i) += w_ki * mo_k - enddo - enddo - enddo - enddo - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_w_kl_w_kl, (n_points_final_grid,3)] - implicit none - integer :: k,j,ipoint,mm - double precision :: w_kj - fock_3_w_kl_w_kl = 0.d0 - do j = 1, elec_beta_num - do k = 1, elec_beta_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - w_kj = x_W_ij_erf_rk(ipoint,mm,k,j) - fock_3_w_kl_w_kl(ipoint,mm) += w_kj * w_kj - enddo - enddo - enddo - enddo - - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_rho_beta, (n_points_final_grid)] - implicit none - integer :: ipoint,k - fock_3_rho_beta = 0.d0 - do ipoint = 1, n_points_final_grid - do k = 1, elec_beta_num - fock_3_rho_beta(ipoint) += mos_in_r_array(k,ipoint) * mos_in_r_array(k,ipoint) - enddo - enddo -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_w_kl_mo_k_mo_l, (n_points_final_grid,3)] - implicit none - integer :: ipoint,k,l,mm - double precision :: mos_k, mos_l, w_kl - fock_3_w_kl_mo_k_mo_l = 0.d0 - do k = 1, elec_beta_num - do l = 1, elec_beta_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - mos_k = mos_in_r_array_transp(ipoint,k) - mos_l = mos_in_r_array_transp(ipoint,l) - w_kl = x_W_ij_erf_rk(ipoint,mm,l,k) - fock_3_w_kl_mo_k_mo_l(ipoint,mm) += w_kl * mos_k * mos_l - enddo - enddo - enddo - enddo - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_w_ki_wk_a, (n_points_final_grid,3,mo_num, mo_num)] - implicit none - integer :: ipoint,i,a,k,mm - double precision :: w_ki,w_ka - fock_3_w_ki_wk_a = 0.d0 - do i = 1, mo_num - do a = 1, mo_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - do k = 1, elec_beta_num - w_ki = x_W_ij_erf_rk(ipoint,mm,k,i) - w_ka = x_W_ij_erf_rk(ipoint,mm,k,a) - fock_3_w_ki_wk_a(ipoint,mm,a,i) += w_ki * w_ka - enddo - enddo - enddo - enddo - enddo -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_trace_w_tilde, (n_points_final_grid,3)] - implicit none - integer :: ipoint,k,mm - fock_3_trace_w_tilde = 0.d0 - do 
k = 1, elec_beta_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - fock_3_trace_w_tilde(ipoint,mm) += fock_3_w_ki_wk_a(ipoint,mm,k,k) - enddo - enddo - enddo - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, fock_3_w_kl_wla_phi_k, (n_points_final_grid,3,mo_num)] - implicit none - integer :: ipoint,a,k,mm,l - double precision :: w_kl,w_la, mo_k - fock_3_w_kl_wla_phi_k = 0.d0 - do a = 1, mo_num - do k = 1, elec_beta_num - do l = 1, elec_beta_num - do mm = 1, 3 - do ipoint = 1, n_points_final_grid - w_kl = x_W_ij_erf_rk(ipoint,mm,l,k) - w_la = x_W_ij_erf_rk(ipoint,mm,l,a) - mo_k = mos_in_r_array_transp(ipoint,k) - fock_3_w_kl_wla_phi_k(ipoint,mm,a) += w_kl * w_la * mo_k - enddo - enddo - enddo - enddo - enddo -END_PROVIDER - - - - - diff --git a/plugins/local/tc_scf/fock_vartc.irp.f b/plugins/local/tc_scf/fock_vartc.irp.f deleted file mode 100644 index 2b4a57e5..00000000 --- a/plugins/local/tc_scf/fock_vartc.irp.f +++ /dev/null @@ -1,287 +0,0 @@ - -! --- - - BEGIN_PROVIDER [ double precision, two_e_vartc_integral_alpha, (ao_num, ao_num)] -&BEGIN_PROVIDER [ double precision, two_e_vartc_integral_beta , (ao_num, ao_num)] - - implicit none - integer :: i, j, k, l - double precision :: density, density_a, density_b, I_coul, I_kjli - double precision :: t0, t1 - double precision, allocatable :: tmp_a(:,:), tmp_b(:,:) - - two_e_vartc_integral_alpha = 0.d0 - two_e_vartc_integral_beta = 0.d0 - - !$OMP PARALLEL DEFAULT (NONE) & - !$OMP PRIVATE (i, j, k, l, density_a, density_b, density, tmp_a, tmp_b, I_coul, I_kjli) & - !$OMP SHARED (ao_num, TCSCF_density_matrix_ao_alpha, TCSCF_density_matrix_ao_beta, ao_two_e_tc_tot, & - !$OMP two_e_vartc_integral_alpha, two_e_vartc_integral_beta) - - allocate(tmp_a(ao_num,ao_num), tmp_b(ao_num,ao_num)) - tmp_a = 0.d0 - tmp_b = 0.d0 - - !$OMP DO - do j = 1, ao_num - do l = 1, ao_num - density_a = TCSCF_density_matrix_ao_alpha(l,j) - density_b = TCSCF_density_matrix_ao_beta (l,j) - density = density_a + density_b - do i = 1, ao_num - do k = 1, ao_num - - I_coul = density * ao_two_e_tc_tot(k,i,l,j) - I_kjli = ao_two_e_tc_tot(k,j,l,i) - - tmp_a(k,i) += I_coul - density_a * I_kjli - tmp_b(k,i) += I_coul - density_b * I_kjli - enddo - enddo - enddo - enddo - !$OMP END DO NOWAIT - - !$OMP CRITICAL - do i = 1, ao_num - do j = 1, ao_num - two_e_vartc_integral_alpha(j,i) += tmp_a(j,i) - two_e_vartc_integral_beta (j,i) += tmp_b(j,i) - enddo - enddo - !$OMP END CRITICAL - - deallocate(tmp_a, tmp_b) - !$OMP END PARALLEL - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_ao_alpha, (ao_num, ao_num)] - - implicit none - - Fock_matrix_vartc_ao_alpha = ao_one_e_integrals_tc_tot + two_e_vartc_integral_alpha - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_ao_beta, (ao_num, ao_num)] - - implicit none - - Fock_matrix_vartc_ao_beta = ao_one_e_integrals_tc_tot + two_e_vartc_integral_beta - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_mo_alpha, (mo_num, mo_num) ] - - implicit none - - call ao_to_mo_bi_ortho( Fock_matrix_vartc_ao_alpha, size(Fock_matrix_vartc_ao_alpha, 1) & - , Fock_matrix_vartc_mo_alpha, size(Fock_matrix_vartc_mo_alpha, 1) ) - if(three_body_h_tc) then - Fock_matrix_vartc_mo_alpha += fock_3e_uhf_mo_a - endif - -END_PROVIDER - -! 
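For readers new to the bi-orthogonal setting used throughout this plugin, the ao_to_mo_bi_ortho calls in the surrounding providers perform the usual two-sided basis change, written here in our own notation (not copied from the code):

  F^{MO} = C_L^\dagger \, F^{AO} \, C_R

where C_L and C_R are the left and right MO coefficient matrices (mo_l_coef and mo_r_coef), which satisfy the bi-orthonormality condition C_L^\dagger S_{AO} C_R = 1.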
--- - -BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_mo_beta, (mo_num,mo_num) ] - - implicit none - - call ao_to_mo_bi_ortho( Fock_matrix_vartc_ao_beta, size(Fock_matrix_vartc_ao_beta, 1) & - , Fock_matrix_vartc_mo_beta, size(Fock_matrix_vartc_mo_beta, 1) ) - if(three_body_h_tc) then - Fock_matrix_vartc_mo_beta += fock_3e_uhf_mo_b - endif - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, grad_vartc] - - implicit none - integer :: i, k - double precision :: grad_left, grad_right - - grad_left = 0.d0 - grad_right = 0.d0 - - do i = 1, elec_beta_num ! doc --> SOMO - do k = elec_beta_num+1, elec_alpha_num - grad_left = max(grad_left , dabs(Fock_matrix_vartc_mo_tot(k,i))) - grad_right = max(grad_right, dabs(Fock_matrix_vartc_mo_tot(i,k))) - enddo - enddo - - do i = 1, elec_beta_num ! doc --> virt - do k = elec_alpha_num+1, mo_num - grad_left = max(grad_left , dabs(Fock_matrix_vartc_mo_tot(k,i))) - grad_right = max(grad_right, dabs(Fock_matrix_vartc_mo_tot(i,k))) - enddo - enddo - - do i = elec_beta_num+1, elec_alpha_num ! SOMO --> virt - do k = elec_alpha_num+1, mo_num - grad_left = max(grad_left , dabs(Fock_matrix_vartc_mo_tot(k,i))) - grad_right = max(grad_right, dabs(Fock_matrix_vartc_mo_tot(i,k))) - enddo - enddo - - grad_vartc = grad_left + grad_right - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_ao_tot, (ao_num, ao_num) ] - - implicit none - - call mo_to_ao_bi_ortho( Fock_matrix_vartc_mo_tot, size(Fock_matrix_vartc_mo_tot, 1) & - , Fock_matrix_vartc_ao_tot, size(Fock_matrix_vartc_ao_tot, 1) ) - -END_PROVIDER - -! --- - - BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_mo_tot, (mo_num,mo_num) ] -&BEGIN_PROVIDER [ double precision, Fock_matrix_vartc_diag_mo_tot, (mo_num)] - - implicit none - integer :: i, j, n - - if(elec_alpha_num == elec_beta_num) then - Fock_matrix_vartc_mo_tot = Fock_matrix_vartc_mo_alpha - else - - do j = 1, elec_beta_num - ! F-K - do i = 1, elec_beta_num !CC - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))& - - (Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j)) - enddo - ! F+K/2 - do i = elec_beta_num+1, elec_alpha_num !CA - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))& - + 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j)) - enddo - ! F - do i = elec_alpha_num+1, mo_num !CV - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j)) - enddo - enddo - - do j = elec_beta_num+1, elec_alpha_num - ! F+K/2 - do i = 1, elec_beta_num !AC - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))& - + 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j)) - enddo - ! F - do i = elec_beta_num+1, elec_alpha_num !AA - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j)) - enddo - ! F-K/2 - do i = elec_alpha_num+1, mo_num !AV - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))& - - 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j)) - enddo - enddo - - do j = elec_alpha_num+1, mo_num - ! F - do i = 1, elec_beta_num !VC - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j)) - enddo - ! 
F-K/2 - do i = elec_beta_num+1, elec_alpha_num !VA - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j))& - - 0.5d0*(Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j)) - enddo - ! F+K - do i = elec_alpha_num+1, mo_num !VV - Fock_matrix_vartc_mo_tot(i,j) = 0.5d0*(Fock_matrix_vartc_mo_alpha(i,j)+Fock_matrix_vartc_mo_beta(i,j)) & - + (Fock_matrix_vartc_mo_beta(i,j) - Fock_matrix_vartc_mo_alpha(i,j)) - enddo - enddo - if(three_body_h_tc)then - ! C-O - do j = 1, elec_beta_num - do i = elec_beta_num+1, elec_alpha_num - Fock_matrix_vartc_mo_tot(i,j) += 0.5d0*(fock_a_tot_3e_bi_orth(i,j) + fock_b_tot_3e_bi_orth(i,j)) - Fock_matrix_vartc_mo_tot(j,i) += 0.5d0*(fock_a_tot_3e_bi_orth(j,i) + fock_b_tot_3e_bi_orth(j,i)) - enddo - enddo - ! C-V - do j = 1, elec_beta_num - do i = elec_alpha_num+1, mo_num - Fock_matrix_vartc_mo_tot(i,j) += 0.5d0*(fock_a_tot_3e_bi_orth(i,j) + fock_b_tot_3e_bi_orth(i,j)) - Fock_matrix_vartc_mo_tot(j,i) += 0.5d0*(fock_a_tot_3e_bi_orth(j,i) + fock_b_tot_3e_bi_orth(j,i)) - enddo - enddo - ! O-V - do j = elec_beta_num+1, elec_alpha_num - do i = elec_alpha_num+1, mo_num - Fock_matrix_vartc_mo_tot(i,j) += 0.5d0*(fock_a_tot_3e_bi_orth(i,j) + fock_b_tot_3e_bi_orth(i,j)) - Fock_matrix_vartc_mo_tot(j,i) += 0.5d0*(fock_a_tot_3e_bi_orth(j,i) + fock_b_tot_3e_bi_orth(j,i)) - enddo - enddo - endif - - endif - - do i = 1, mo_num - Fock_matrix_vartc_diag_mo_tot(i) = Fock_matrix_vartc_mo_tot(i,i) - enddo - - if(frozen_orb_scf)then - integer :: iorb, jorb - do i = 1, n_core_orb - iorb = list_core(i) - do j = 1, n_act_orb - jorb = list_act(j) - Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0 - Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0 - enddo - enddo - endif - - if(no_oa_or_av_opt)then - do i = 1, n_act_orb - iorb = list_act(i) - do j = 1, n_inact_orb - jorb = list_inact(j) - Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0 - Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0 - enddo - do j = 1, n_virt_orb - jorb = list_virt(j) - Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0 - Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0 - enddo - do j = 1, n_core_orb - jorb = list_core(j) - Fock_matrix_vartc_mo_tot(iorb,jorb) = 0.d0 - Fock_matrix_vartc_mo_tot(jorb,iorb) = 0.d0 - enddo - enddo - endif - - !call check_sym(Fock_matrix_vartc_mo_tot, mo_num) - !do i = 1, mo_num - ! write(*,'(100(F15.8, I4))') Fock_matrix_vartc_mo_tot(i,:) - !enddo - -END_PROVIDER - -! --- - diff --git a/plugins/local/tc_scf/integrals_in_r_stuff.irp.f b/plugins/local/tc_scf/integrals_in_r_stuff.irp.f deleted file mode 100644 index 3ce85a97..00000000 --- a/plugins/local/tc_scf/integrals_in_r_stuff.irp.f +++ /dev/null @@ -1,391 +0,0 @@ - -! --- - -BEGIN_PROVIDER [ double precision, tc_scf_dm_in_r, (n_points_final_grid) ] - - implicit none - integer :: i, j - - tc_scf_dm_in_r = 0.d0 - do i = 1, n_points_final_grid - do j = 1, elec_beta_num - tc_scf_dm_in_r(i) += mos_r_in_r_array(j,i) * mos_l_in_r_array(j,i) - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, w_sum_in_r, (n_points_final_grid, 3)] - - implicit none - integer :: ipoint, j, xi - - w_sum_in_r = 0.d0 - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - !w_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,j) - w_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j) - enddo - enddo - enddo - -END_PROVIDER - -! 
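The grid providers of this deleted file all feed the same numerical-quadrature pattern seen earlier in the section: an integral is replaced by a weighted sum over the DFT grid points, and the integrand is assembled from left/right bi-orthogonal MOs evaluated at those points. Schematically, in our notation,

  \int d\mathbf{r}\, f(\mathbf{r}) \approx \sum_p w_p\, f(\mathbf{r}_p),
  \qquad
  n(\mathbf{r}_p) = \sum_{j=1}^{N_\beta} \chi_j(\mathbf{r}_p)\, \phi_j(\mathbf{r}_p),

with w_p = final_weight_at_r_vector(p) and \chi_j, \phi_j the left and right MOs (mos_l_in_r_array, mos_r_in_r_array); the second sum is exactly what tc_scf_dm_in_r stores.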
--- - -BEGIN_PROVIDER [ double precision, ww_sum_in_r, (n_points_final_grid, 3)] - - implicit none - integer :: ipoint, j, xi - double precision :: tmp - - ww_sum_in_r = 0.d0 - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - tmp = x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j) - ww_sum_in_r(ipoint,xi) += tmp * tmp - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, W1_r_in_r, (n_points_final_grid, 3, mo_num)] - - implicit none - integer :: i, j, xi, ipoint - - ! TODO: call lapack - - W1_r_in_r = 0.d0 - do i = 1, mo_num - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - W1_r_in_r(ipoint,xi,i) += mos_r_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i) - enddo - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, W1_l_in_r, (n_points_final_grid, 3, mo_num)] - - implicit none - integer :: i, j, xi, ipoint - - ! TODO: call lapack - - W1_l_in_r = 0.d0 - do i = 1, mo_num - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - W1_l_in_r(ipoint,xi,i) += mos_l_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) - enddo - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, W1_in_r, (n_points_final_grid, 3)] - - implicit none - integer :: j, xi, ipoint - - ! TODO: call lapack - - W1_in_r = 0.d0 - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - W1_in_r(ipoint,xi) += W1_l_in_r(ipoint,xi,j) * mos_r_in_r_array_transp(ipoint,j) - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, W1_diag_in_r, (n_points_final_grid, 3)] - - implicit none - integer :: j, xi, ipoint - - ! TODO: call lapack - - W1_diag_in_r = 0.d0 - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - W1_diag_in_r(ipoint,xi) += mos_r_in_r_array_transp(ipoint,j) * mos_l_in_r_array_transp(ipoint,j) * x_W_ki_bi_ortho_erf_rk_diag(ipoint,xi,j) - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, v_sum_in_r, (n_points_final_grid, 3)] - - implicit none - integer :: i, j, xi, ipoint - - ! TODO: call lapack - v_sum_in_r = 0.d0 - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - v_sum_in_r(ipoint,xi) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i) - enddo - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, W1_W1_r_in_r, (n_points_final_grid, 3, mo_num)] - - implicit none - integer :: i, m, xi, ipoint - - ! TODO: call lapack - - W1_W1_r_in_r = 0.d0 - do i = 1, mo_num - do m = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - W1_W1_r_in_r(ipoint,xi,i) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,m,i) * W1_r_in_r(ipoint,xi,m) - enddo - enddo - enddo - enddo - -END_PROVIDER - -! --- - -BEGIN_PROVIDER [ double precision, W1_W1_l_in_r, (n_points_final_grid, 3, mo_num)] - - implicit none - integer :: i, j, xi, ipoint - - ! TODO: call lapack - - W1_W1_l_in_r = 0.d0 - do i = 1, mo_num - do j = 1, elec_beta_num - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - W1_W1_l_in_r(ipoint,xi,i) += x_W_ki_bi_ortho_erf_rk(ipoint,xi,i,j) * W1_l_in_r(ipoint,xi,j) - enddo - enddo - enddo - enddo - -END_PROVIDER - -! --- - -subroutine direct_term_imj_bi_ortho(a, i, integral) - - BEGIN_DOC - ! 
computes sum_(j,m = 1, elec_beta_num) < a m j | i m j > with bi ortho mos - END_DOC - - implicit none - integer, intent(in) :: i, a - double precision, intent(out) :: integral - - integer :: ipoint, xi - double precision :: weight, tmp - - integral = 0.d0 - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - !integral += ( mos_l_in_r_array(a,ipoint) * mos_r_in_r_array(i,ipoint) * w_sum_in_r(ipoint,xi) * w_sum_in_r(ipoint,xi) & - ! + 2.d0 * tc_scf_dm_in_r(ipoint) * w_sum_in_r(ipoint,xi) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) ) * weight - - tmp = w_sum_in_r(ipoint,xi) - - integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * tmp * tmp & - + 2.d0 * tc_scf_dm_in_r(ipoint) * tmp * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) & - ) * weight - enddo - enddo - -end - -! --- - -subroutine exch_term_jmi_bi_ortho(a, i, integral) - - BEGIN_DOC - ! computes sum_(j,m = 1, elec_beta_num) < a m j | j m i > with bi ortho mos - END_DOC - - implicit none - integer, intent(in) :: i, a - double precision, intent(out) :: integral - - integer :: ipoint, xi, j - double precision :: weight, tmp - - integral = 0.d0 - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - - tmp = 0.d0 - do j = 1, elec_beta_num - tmp = tmp + x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,j) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,j,i) - enddo - - integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_r_in_r(ipoint,xi,i) * w_sum_in_r(ipoint,xi) & - + tc_scf_dm_in_r(ipoint) * tmp & - + mos_r_in_r_array_transp(ipoint,i) * W1_l_in_r(ipoint,xi,a) * w_sum_in_r(ipoint,xi) & - ) * weight - - enddo - enddo - -end - -! --- - -subroutine exch_term_ijm_bi_ortho(a, i, integral) - - BEGIN_DOC - ! computes sum_(j,m = 1, elec_beta_num) < a m j | i j m > with bi ortho mos - END_DOC - - implicit none - integer, intent(in) :: i, a - double precision, intent(out) :: integral - - integer :: ipoint, xi - double precision :: weight - - integral = 0.d0 - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - - integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * v_sum_in_r(ipoint,xi) & - + 2.d0 * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) * W1_in_r(ipoint,xi) & - ) * weight - - enddo - enddo - -end - -! --- - -subroutine direct_term_ijj_bi_ortho(a, i, integral) - - BEGIN_DOC - ! computes sum_(j = 1, elec_beta_num) < a j j | i j j > with bi ortho mos - END_DOC - - implicit none - integer, intent(in) :: i, a - double precision, intent(out) :: integral - - integer :: ipoint, xi - double precision :: weight - - integral = 0.d0 - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - - integral += ( mos_l_in_r_array_transp(ipoint,a) * mos_r_in_r_array_transp(ipoint,i) * ww_sum_in_r(ipoint,xi) & - + 2.d0 * W1_diag_in_r(ipoint, xi) * x_W_ki_bi_ortho_erf_rk(ipoint,xi,a,i) & - ) * weight - enddo - enddo - -end - -! --- - -subroutine cyclic_term_jim_bi_ortho(a, i, integral) - - BEGIN_DOC - ! 
computes sum_(j,m = 1, elec_beta_num) < a m j | j i m > with bi ortho mos - END_DOC - - implicit none - integer, intent(in) :: i, a - double precision, intent(out) :: integral - - integer :: ipoint, xi - double precision :: weight - - integral = 0.d0 - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - - integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_W1_r_in_r(ipoint,xi,i) & - + W1_W1_l_in_r(ipoint,xi,a) * mos_r_in_r_array_transp(ipoint,i) & - + W1_l_in_r(ipoint,xi,a) * W1_r_in_r(ipoint,xi,i) & - ) * weight - - enddo - enddo - -end - -! --- - -subroutine cyclic_term_mji_bi_ortho(a, i, integral) - - BEGIN_DOC - ! computes sum_(j,m = 1, elec_beta_num) < a m j | m j i > with bi ortho mos - END_DOC - - implicit none - integer, intent(in) :: i, a - double precision, intent(out) :: integral - - integer :: ipoint, xi - double precision :: weight - - integral = 0.d0 - do xi = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - - integral += ( mos_l_in_r_array_transp(ipoint,a) * W1_W1_r_in_r(ipoint,xi,i) & - + W1_l_in_r(ipoint,xi,a) * W1_r_in_r(ipoint,xi,i) & - + W1_W1_l_in_r(ipoint,xi,a) * mos_r_in_r_array_transp(ipoint,i) & - ) * weight - - enddo - enddo - -end - -! --- - diff --git a/plugins/local/tc_scf/jast_schmos_90.irp.f b/plugins/local/tc_scf/jast_schmos_90.irp.f deleted file mode 100644 index 5c5e625f..00000000 --- a/plugins/local/tc_scf/jast_schmos_90.irp.f +++ /dev/null @@ -1,318 +0,0 @@ - BEGIN_PROVIDER [integer , m_max_sm_7] -&BEGIN_PROVIDER [integer , n_max_sm_7] -&BEGIN_PROVIDER [integer , o_max_sm_7] - implicit none - BEGIN_DOC -! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) -! of Schmidt,Moskowitz, JCP, 93, 4172 (1990) for the SM_7 version of Table IV - END_DOC - m_max_sm_7 = 4 - n_max_sm_7 = 0 - o_max_sm_7 = 4 -END_PROVIDER - - BEGIN_PROVIDER [integer , m_max_sm_9] -&BEGIN_PROVIDER [integer , n_max_sm_9] -&BEGIN_PROVIDER [integer , o_max_sm_9] - implicit none - BEGIN_DOC -! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) -! of Schmidt,Moskowitz, JCP, 93, 4172 (1990) for the SM_9 version of Table IV - END_DOC - m_max_sm_9 = 4 - n_max_sm_9 = 2 - o_max_sm_9 = 4 -END_PROVIDER - - - BEGIN_PROVIDER [integer , m_max_sm_17] -&BEGIN_PROVIDER [integer , n_max_sm_17] -&BEGIN_PROVIDER [integer , o_max_sm_17] - implicit none - BEGIN_DOC -! maximum value of the "m", "n" and "o" integer in the Jastrow function as in Eq. (4) -! of Schmidt,Moskowitz, JCP, 93, 4172 (1990) for the SM_17 version of Table IV - END_DOC - m_max_sm_17 = 6 - n_max_sm_17 = 2 - o_max_sm_17 = 6 -END_PROVIDER - - -BEGIN_PROVIDER [ double precision, c_mn_o_sm_7, (0:m_max_sm_7,0:n_max_sm_7,0:o_max_sm_7,2:10)] - implicit none - BEGIN_DOC - ! - !c_mn_o_7(0:4,0:4,2:10) = coefficient for the SM_7 correlation factor as given is Table IV of - ! Schmidt,Moskowitz, JCP, 93, 4172 (1990) - ! the first index (0:4) is the "m" integer for the 1e part - ! the second index(0:0) is the "n" integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_7 - ! the third index (0:4) is the "o" integer for the 2e part - ! the fourth index (2:10) is the nuclear charge of the atom - END_DOC - c_mn_o_sm_7 = 0.d0 - integer :: i - do i = 2, 10 ! loop over nuclear charge - c_mn_o_sm_7(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition - enddo - ! He atom - ! 
two electron terms - c_mn_o_sm_7(0,0,2,2) = 0.50516d0 - c_mn_o_sm_7(0,0,3,2) = -0.19313d0 - c_mn_o_sm_7(0,0,4,2) = 0.30276d0 - ! one-electron terms - c_mn_o_sm_7(2,0,0,2) = -0.16995d0 - c_mn_o_sm_7(3,0,0,2) = -0.34505d0 - c_mn_o_sm_7(4,0,0,2) = -0.54777d0 - ! Ne atom - ! two electron terms - c_mn_o_sm_7(0,0,2,10) = -0.792d0 - c_mn_o_sm_7(0,0,3,10) = 1.05232d0 - c_mn_o_sm_7(0,0,4,10) = -0.65615d0 - ! one-electron terms - c_mn_o_sm_7(2,0,0,10) = -0.13312d0 - c_mn_o_sm_7(3,0,0,10) = -0.00131d0 - c_mn_o_sm_7(4,0,0,10) = 0.09083d0 - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, c_mn_o_sm_9, (0:m_max_sm_9,0:n_max_sm_9,0:o_max_sm_9,2:10)] - implicit none - BEGIN_DOC - ! - !c_mn_o_9(0:4,0:4,2:10) = coefficient for the SM_9 correlation factor as given is Table IV of - ! Schmidt,Moskowitz, JCP, 93, 4172 (1990) - ! the first index (0:4) is the "m" integer for the 1e part - ! the second index(0:0) is the "n" integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_9 - ! the third index (0:4) is the "o" integer for the 2e part - ! the fourth index (2:10) is the nuclear charge of the atom - END_DOC - c_mn_o_sm_9 = 0.d0 - integer :: i - do i = 2, 10 ! loop over nuclear charge - c_mn_o_sm_9(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition - enddo - ! He atom - ! two electron terms - c_mn_o_sm_9(0,0,2,2) = 0.50516d0 - c_mn_o_sm_9(0,0,3,2) = -0.19313d0 - c_mn_o_sm_9(0,0,4,2) = 0.30276d0 - ! one-electron terms - c_mn_o_sm_9(2,0,0,2) = -0.16995d0 - c_mn_o_sm_9(3,0,0,2) = -0.34505d0 - c_mn_o_sm_9(4,0,0,2) = -0.54777d0 - ! Ne atom - ! two electron terms - c_mn_o_sm_9(0,0,2,10) = -0.792d0 - c_mn_o_sm_9(0,0,3,10) = 1.05232d0 - c_mn_o_sm_9(0,0,4,10) = -0.65615d0 - ! one-electron terms - c_mn_o_sm_9(2,0,0,10) = -0.13312d0 - c_mn_o_sm_9(3,0,0,10) = -0.00131d0 - c_mn_o_sm_9(4,0,0,10) = 0.09083d0 - -END_PROVIDER - -BEGIN_PROVIDER [ double precision, c_mn_o_sm_17, (0:m_max_sm_17,0:n_max_sm_17,0:o_max_sm_17,2:10)] - implicit none - BEGIN_DOC - ! - !c_mn_o_17(0:4,0:4,2:10) = coefficient for the SM_17 correlation factor as given is Table IV of - ! Schmidt,Moskowitz, JCP, 93, 4172 (1990) - ! the first index (0:4) is the "m" integer for the 1e part - ! the second index(0:0) is the "n" integer for the 1e part WHICH IS ALWAYS SET TO 0 FOR SM_17 - ! the third index (0:4) is the "o" integer for the 2e part - ! the fourth index (2:10) is the nuclear charge of the atom - END_DOC - c_mn_o_sm_17 = 0.d0 - integer :: i - do i = 2, 10 ! loop over nuclear charge - c_mn_o_sm_17(0,0,1,i) = 0.5d0 ! all the linear terms are set to 1/2 to satisfy the anti-parallel spin condition - enddo - ! He atom - ! two electron terms - c_mn_o_sm_17(0,0,2,2) = 0.09239d0 - c_mn_o_sm_17(0,0,3,2) = -0.38664d0 - c_mn_o_sm_17(0,0,4,2) = 0.95764d0 - ! one-electron terms - c_mn_o_sm_17(2,0,0,2) = 0.23208d0 - c_mn_o_sm_17(3,0,0,2) = -0.45032d0 - c_mn_o_sm_17(4,0,0,2) = 0.82777d0 - c_mn_o_sm_17(2,2,0,2) = -4.15388d0 - ! ee-n terms - c_mn_o_sm_17(2,0,2,2) = 0.80622d0 - c_mn_o_sm_17(2,2,2,2) = 10.19704d0 - c_mn_o_sm_17(4,0,2,2) = -4.96259d0 - c_mn_o_sm_17(2,0,4,2) = -1.35647d0 - c_mn_o_sm_17(4,2,2,2) = -5.90907d0 - c_mn_o_sm_17(6,0,2,2) = 0.90343d0 - c_mn_o_sm_17(4,0,4,2) = 5.50739d0 - c_mn_o_sm_17(2,2,4,2) = -0.03154d0 - c_mn_o_sm_17(2,0,6,2) = -1.1051860 - - - ! Ne atom - ! two electron terms - c_mn_o_sm_17(0,0,2,10) = -0.80909d0 - c_mn_o_sm_17(0,0,3,10) = -0.00219d0 - c_mn_o_sm_17(0,0,4,10) = 0.59188d0 - ! 
one-electron terms - c_mn_o_sm_17(2,0,0,10) = -0.00567d0 - c_mn_o_sm_17(3,0,0,10) = 0.14011d0 - c_mn_o_sm_17(4,0,0,10) = -0.05671d0 - c_mn_o_sm_17(2,2,0,10) = -3.33767d0 - ! ee-n terms - c_mn_o_sm_17(2,0,2,10) = 1.95067d0 - c_mn_o_sm_17(2,2,2,10) = 6.83340d0 - c_mn_o_sm_17(4,0,2,10) = -3.29231d0 - c_mn_o_sm_17(2,0,4,10) = -2.44998d0 - c_mn_o_sm_17(4,2,2,10) = -2.13029d0 - c_mn_o_sm_17(6,0,2,10) = 2.25768d0 - c_mn_o_sm_17(4,0,4,10) = 1.97951d0 - c_mn_o_sm_17(2,2,4,10) = -2.0924160 - c_mn_o_sm_17(2,0,6,10) = 0.35493d0 - -END_PROVIDER - - BEGIN_PROVIDER [ double precision, b_I_sm_90,(2:10)] -&BEGIN_PROVIDER [ double precision, d_I_sm_90,(2:10)] - implicit none - BEGIN_DOC -! "b_I" and "d_I" parameters of Eqs. (4) and (5) of Schmidt,Moskowitz, JCP, 93, 4172 (1990) - END_DOC - b_I_sm_90 = 1.d0 - d_I_sm_90 = 1.d0 - -END_PROVIDER - -subroutine get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) - implicit none - double precision, intent(in) :: r1(3),r2(3),rI(3) - integer, intent(in) :: sm_j, i_charge - double precision, intent(out):: j_1e,j_2e,j_een,j_tot - BEGIN_DOC - ! Jastrow function as in Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990) - ! the i_charge variable is the integer specifying the charge of the atom for the Jastrow - ! the sm_j integer variable represents the "quality" of the jastrow : sm_j = 7, 9, 17 - END_DOC - double precision :: r_inucl,r_jnucl,r_ij,b_I, d_I - b_I = b_I_sm_90(i_charge) - d_I = d_I_sm_90(i_charge) - call get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij) - call jastrow_func_sm_90(r_inucl,r_jnucl,r_ij,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) -end - -subroutine get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij) - implicit none - BEGIN_DOC - ! rescaled variables of Eq. (5) and (6) of Schmidt,Moskowitz, JCP, 93, 4172 (1990) - ! the "b_I" and "d_I" parameters are the same as in Eqs. (5) and (6) - END_DOC - double precision, intent(in) :: r1(3),r2(3),rI(3) - double precision, intent(in) :: b_I, d_I - double precision, intent(out):: r_inucl,r_jnucl,r_ij - double precision :: rin, rjn, rij - integer :: i - rin = 0.d0 - rjn = 0.d0 - rij = 0.d0 - do i = 1,3 - rin += (r1(i) - rI(i)) * (r1(i) - rI(i)) - rjn += (r2(i) - rI(i)) * (r2(i) - rI(i)) - rij += (r2(i) - r1(i)) * (r2(i) - r1(i)) - enddo - rin = dsqrt(rin) - rjn = dsqrt(rjn) - rij = dsqrt(rij) - r_inucl = b_I * rin/(1.d0 + b_I * rin) - r_jnucl = b_I * rjn/(1.d0 + b_I * rjn) - r_ij = d_I * rij/(1.d0 + b_I * rij) -end - -subroutine jastrow_func_sm_90(r_inucl,r_jnucl,r_ij,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) - implicit none - BEGIN_DOC - ! Jastrow function as in Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990) - ! Here the r_inucl, r_jnucl are the rescaled variables as defined in Eq. (5) with "b_I" - ! r_ij is the rescaled variable as defined in Eq. (6) with "d_I" - ! the i_charge variable is the integer specifying the charge of the atom for the Jastrow - ! the sm_j integer variable represents the "quality" of the jastrow : sm_j = 7, 9, 17 - ! - ! it returns the j_1e : sum of terms with "o" = "n" = 0, "m" /= 0, - ! j_2e : sum of terms with "m" = "n" = 0, "o" /= 0, - ! j_een : sum of terms with "m" /=0, "n" /= 0, "o" /= 0, - ! j_tot : the total sum - END_DOC - double precision, intent(in) :: r_inucl,r_jnucl,r_ij - integer, intent(in) :: sm_j,i_charge - double precision, intent(out):: j_1e,j_2e,j_een,j_tot - j_1e = 0.D0 - j_2e = 0.D0 - j_een = 0.D0 - double precision :: delta_mn,jastrow_sm_90_atomic - integer :: m,n,o -BEGIN_TEMPLATE - ! 
pure 2e part - n = 0 - m = 0 - if(sm_j == $X )then - do o = 1, o_max_sm_$X - if(dabs(c_mn_o_sm_$X(m,n,o,i_charge)).lt.1.d-10)cycle - j_2e += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij) - enddo -! else -! print*,'sm_j = ',sm_j -! print*,'not implemented, stop' -! stop - endif - ! pure one-e part - o = 0 - if(sm_j == $X)then - do n = 2, n_max_sm_$X - do m = 2, m_max_sm_$X - j_1e += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij) - enddo - enddo -! else -! print*,'sm_j = ',sm_j -! print*,'not implemented, stop' -! stop - endif - ! e-e-n part - if(sm_j == $X)then - do o = 1, o_max_sm_$X - do m = 2, m_max_sm_$X - do n = 2, n_max_sm_$X - j_een += c_mn_o_sm_$X(m,n,o,i_charge) * jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij) - enddo - enddo - enddo - else -! print*,'sm_j = ',sm_j -! print*,'not implemented, stop' -! stop - endif - j_tot = j_1e + j_2e + j_een -SUBST [ X] - 7 ;; - 9 ;; - 17 ;; -END_TEMPLATE -end - -double precision function jastrow_sm_90_atomic(m,n,o,i_charge,r_inucl,r_jnucl,r_ij) - implicit none - BEGIN_DOC -! contribution to the function of Eq. (4) of Schmidt,Moskowitz, JCP, 93, 4172 (1990) -! for a given m,n,o and atom - END_DOC - double precision, intent(in) :: r_inucl,r_jnucl,r_ij - integer , intent(in) :: m,n,o,i_charge - double precision :: delta_mn - if(m==n)then - delta_mn = 0.5d0 - else - delta_mn = 1.D0 - endif - jastrow_sm_90_atomic = delta_mn * (r_inucl**m * r_jnucl**n + r_jnucl**m * r_inucl**n)*r_ij**o -end diff --git a/plugins/local/tc_scf/minimize_tc_angles.irp.f b/plugins/local/tc_scf/minimize_tc_angles.irp.f index c7752930..e5f6cf87 100644 --- a/plugins/local/tc_scf/minimize_tc_angles.irp.f +++ b/plugins/local/tc_scf/minimize_tc_angles.irp.f @@ -20,7 +20,7 @@ program minimize_tc_angles ! TODO ! check if rotations of orbitals affect the TC energy ! and refuse the step - call minimize_tc_orb_angles + call minimize_tc_orb_angles() end diff --git a/plugins/local/tc_scf/plot_j_schMos.irp.f b/plugins/local/tc_scf/plot_j_schMos.irp.f deleted file mode 100644 index eda0dd25..00000000 --- a/plugins/local/tc_scf/plot_j_schMos.irp.f +++ /dev/null @@ -1,69 +0,0 @@ -program plot_j - implicit none - double precision :: r1(3),rI(3),r2(3) - double precision :: r12,dx,xmax, j_1e,j_2e,j_een,j_tot - double precision :: j_mu_F_x_j - integer :: i,nx,m,i_charge,sm_j - - character*(128) :: output - integer :: i_unit_output_He_sm_7,i_unit_output_Ne_sm_7 - integer :: i_unit_output_He_sm_17,i_unit_output_Ne_sm_17 - integer :: getUnitAndOpen - output='J_SM_7_He' - i_unit_output_He_sm_7 = getUnitAndOpen(output,'w') - output='J_SM_7_Ne' - i_unit_output_Ne_sm_7 = getUnitAndOpen(output,'w') - - output='J_SM_17_He' - i_unit_output_He_sm_17 = getUnitAndOpen(output,'w') - output='J_SM_17_Ne' - i_unit_output_Ne_sm_17 = getUnitAndOpen(output,'w') - - rI = 0.d0 - r1 = 0.d0 - r2 = 0.d0 - r1(1) = 1.5d0 - xmax = 20.d0 - r2(1) = -xmax*0.5d0 - nx = 1000 - dx = xmax/dble(nx) - do i = 1, nx - r12 = 0.d0 - do m = 1, 3 - r12 += (r1(m) - r2(m))*(r1(m) - r2(m)) - enddo - r12 = dsqrt(r12) - double precision :: jmu,env_nucl,jmu_env,jmu_scaled, jmu_scaled_env - double precision :: b_I,d_I,r_inucl,r_jnucl,r_ij - b_I = 1.D0 - d_I = 1.D0 - call get_rescaled_variables_j_sm_90(r1,r2,rI,b_I,d_I,r_inucl,r_jnucl,r_ij) - jmu=j_mu_F_x_j(r12) - jmu_scaled=j_mu_F_x_j(r_ij) - jmu_env = jmu * env_nucl(r1) * env_nucl(r2) -! 
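To make the deleted jast_schmos_90 code easier to follow, here is the functional form it implements, written out in our notation from the comments above (Eqs. (4)-(6) of Schmidt and Moskowitz, JCP 93, 4172 (1990)). With the rescaled distances

  \bar{r}_{iI} = \frac{b_I r_{iI}}{1 + b_I r_{iI}}, \qquad
  \bar{r}_{ij} = \frac{d_I r_{ij}}{1 + d_I r_{ij}},

the contribution of nucleus I to the correlation factor of the electron pair (i, j) is

  U_I(i,j) = \sum_{m,n,o} c_{mno}^{I}\, \Delta_{mn}
             \left( \bar{r}_{iI}^{\,m} \bar{r}_{jI}^{\,n} + \bar{r}_{jI}^{\,m} \bar{r}_{iI}^{\,n} \right) \bar{r}_{ij}^{\,o},
  \qquad \Delta_{mn} = 1/2 \ \text{if}\ m = n,\ 1 \ \text{otherwise},

which is what jastrow_sm_90_atomic returns for a single (m, n, o) triplet; get_full_sm_90_jastrow then splits the sum into pure one-electron, pure two-electron and electron-electron-nucleus pieces as described in its BEGIN_DOC. Note also that the rescaling of r_ij in get_rescaled_variables_j_sm_90 divides by 1 + b_I r_ij rather than 1 + d_I r_ij, which looks inconsistent with Eq. (6).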
jmu_scaled_env= jmu_scaled * (1.d0 - env_coef(1) * dexp(-env_expo(1)*r_inucl**2)) * (1.d0 - env_coef(1) * dexp(-env_expo(1)*r_jnucl**2)) - jmu_scaled_env= jmu_scaled * env_nucl(r1) * env_nucl(r2) - ! He - i_charge = 2 - ! SM 7 Jastrow - sm_j = 7 - call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) - write(i_unit_output_He_sm_7,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env - ! SM 17 Jastrow - sm_j = 17 - call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) - write(i_unit_output_He_sm_17,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env - ! Ne - i_charge = 10 - ! SM 7 Jastrow - sm_j = 7 - call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) - write(i_unit_output_Ne_sm_7,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env - ! SM 17 Jastrow - sm_j = 17 - call get_full_sm_90_jastrow(r1,r2,rI,sm_j,i_charge, j_1e,j_2e,j_een,j_tot) - write(i_unit_output_Ne_sm_17,'(100(F16.10,X))')r2(1),r12,j_mu_F_x_j(r12), j_1e,j_2e,j_een,j_tot,jmu_env,jmu_scaled,jmu_scaled_env - r2(1) += dx - enddo - -end diff --git a/plugins/local/tc_scf/print_fit_param.irp.f b/plugins/local/tc_scf/print_fit_param.irp.f deleted file mode 100644 index e62f0dde..00000000 --- a/plugins/local/tc_scf/print_fit_param.irp.f +++ /dev/null @@ -1,59 +0,0 @@ -program print_fit_param - - BEGIN_DOC -! TODO : Put the documentation of the program here - END_DOC - - implicit none - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - !call create_guess - !call orthonormalize_mos - - call main() - -end - -! --- - -subroutine main() - - implicit none - integer :: i - - mu_erf = 1.d0 - touch mu_erf - - print *, ' fit for (1 - erf(x))^2' - do i = 1, n_max_fit_slat - print*, expo_gauss_1_erf_x_2(i), coef_gauss_1_erf_x_2(i) - enddo - - print *, '' - print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)]' - do i = 1, n_max_fit_slat - print *, expo_gauss_j_mu_x(i), 2.d0 * coef_gauss_j_mu_x(i) - enddo - - print *, '' - print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)]^2' - do i = 1, n_max_fit_slat - print *, expo_gauss_j_mu_x_2(i), 4.d0 * coef_gauss_j_mu_x_2(i) - enddo - - print *, '' - print *, ' fit for [x * (1 - erf(x)) - 1/sqrt(pi) * exp(-x**2)] x [1 - erf(mu * r12)]' - do i = 1, n_max_fit_slat - print *, expo_gauss_j_mu_1_erf(i), 4.d0 * coef_gauss_j_mu_1_erf(i) - enddo - - return -end subroutine main - -! --- - diff --git a/plugins/local/tc_scf/print_tcscf_energy.irp.f b/plugins/local/tc_scf/print_tcscf_energy.irp.f deleted file mode 100644 index 6f9afd9a..00000000 --- a/plugins/local/tc_scf/print_tcscf_energy.irp.f +++ /dev/null @@ -1,55 +0,0 @@ -program print_tcscf_energy - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - print *, 'Hello world' - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - call main() - -end - -! 
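A remark on the (exponent, coefficient) pairs printed by the print_fit_param program above: judging from the provider names (n_max_fit_slat, expo_gauss_*, coef_gauss_*), these appear to be Gaussian expansions of the listed functions, i.e. fits of the form (our reading, not stated explicitly in the deleted file)

  f(x) \approx \sum_{i=1}^{n_{fit}} c_i\, e^{-\gamma_i x^2},

with one (\gamma_i, c_i) pair per printed line; the constant factors 2.d0 and 4.d0 applied in the print statements only rescale the coefficients before printing.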
--- - -subroutine main() - - implicit none - double precision :: etc_tot, etc_1e, etc_2e, etc_3e - - PROVIDE j2e_type mu_erf - PROVIDE j1e_type j1e_coef j1e_expo - PROVIDE env_type env_coef env_expo - - print*, ' j2e_type = ', j2e_type - print*, ' j1e_type = ', j1e_type - print*, ' env_type = ', env_type - - print*, ' mu_erf = ', mu_erf - - etc_tot = TC_HF_energy - etc_1e = TC_HF_one_e_energy - etc_2e = TC_HF_two_e_energy - etc_3e = 0.d0 - if(three_body_h_tc) then - !etc_3e = diag_three_elem_hf - etc_3e = tcscf_energy_3e_naive - endif - - print *, " E_TC = ", etc_tot - print *, " E_1e = ", etc_1e - print *, " E_2e = ", etc_2e - print *, " E_3e = ", etc_3e - - return -end subroutine main - -! --- - diff --git a/plugins/local/tc_scf/rh_tcscf_diis.irp.f b/plugins/local/tc_scf/rh_tcscf_diis.irp.f index 431b6e08..1cade02a 100644 --- a/plugins/local/tc_scf/rh_tcscf_diis.irp.f +++ b/plugins/local/tc_scf/rh_tcscf_diis.irp.f @@ -61,7 +61,7 @@ subroutine rh_tcscf_diis() etc_tot = TC_HF_energy etc_1e = TC_HF_one_e_energy etc_2e = TC_HF_two_e_energy - etc_3e = diag_three_elem_hf + etc_3e = TC_HF_three_e_energy !tc_grad = grad_non_hermit er_DIIS = maxval(abs(FQS_SQF_mo)) e_delta = dabs(etc_tot - e_save) @@ -189,7 +189,7 @@ subroutine rh_tcscf_diis() etc_tot = TC_HF_energy etc_1e = TC_HF_one_e_energy etc_2e = TC_HF_two_e_energy - etc_3e = diag_three_elem_hf + etc_3e = TC_HF_three_e_energy !tc_grad = grad_non_hermit er_DIIS = maxval(abs(FQS_SQF_mo)) e_delta = dabs(etc_tot - e_save) @@ -234,7 +234,7 @@ subroutine rh_tcscf_diis() call unlock_io if(er_delta .lt. 0.d0) then - call ezfio_set_tc_scf_bitc_energy(etc_tot) + call ezfio_set_tc_scf_tcscf_energy(etc_tot) call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef) call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) write(json_unit, json_true_fmt) 'saved' @@ -263,7 +263,7 @@ subroutine rh_tcscf_diis() deallocate(mo_r_coef_save, mo_l_coef_save, F_DIIS, E_DIIS) - call ezfio_set_tc_scf_bitc_energy(TC_HF_energy) + call ezfio_set_tc_scf_tcscf_energy(TC_HF_energy) call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef) call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) diff --git a/plugins/local/tc_scf/rh_tcscf_simple.irp.f b/plugins/local/tc_scf/rh_tcscf_simple.irp.f deleted file mode 100644 index 0b79e8ea..00000000 --- a/plugins/local/tc_scf/rh_tcscf_simple.irp.f +++ /dev/null @@ -1,129 +0,0 @@ -! --- - -subroutine rh_tcscf_simple() - - implicit none - integer :: i, j, it, dim_DIIS - double precision :: t0, t1 - double precision :: e_save, e_delta, rho_delta - double precision :: etc_tot, etc_1e, etc_2e, etc_3e, tc_grad - double precision :: er_DIIS - double precision, allocatable :: rho_old(:,:), rho_new(:,:) - - allocate(rho_old(ao_num,ao_num), rho_new(ao_num,ao_num)) - - it = 0 - e_save = 0.d0 - dim_DIIS = 0 - - ! --- - - if(.not. bi_ortho) then - print *, ' grad_hermit = ', grad_hermit - call save_good_hermit_tc_eigvectors - TOUCH mo_coef - call save_mos - endif - - ! 
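Both the DIIS driver patched above and the simple drivers deleted here monitor the same two quantities: the energy change between iterations and er_DIIS = maxval(abs(FQS_SQF_mo)). The latter is, presumably from its name, the standard SCF commutator error evaluated in the MO basis (our notation):

  e_{DIIS} = \max_{pq} \left| \left[ F D S - S D F \right]_{pq} \right|

with F, D and S the Fock, density and overlap matrices; it vanishes at self-consistency, which makes it a natural convergence monitor alongside the non-Hermitian gradient.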
--- - - if(bi_ortho) then - - PROVIDE level_shift_tcscf - PROVIDE mo_l_coef mo_r_coef - - write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)') & - '====', '================', '================', '================', '================', '================' & - , '================', '================', '================', '====', '========' - - write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)') & - ' it ', ' SCF TC Energy ', ' E(1e) ', ' E(2e) ', ' E(3e) ', ' energy diff ' & - , ' gradient ', ' DIIS error ', ' level shift ', 'DIIS', ' WT (m)' - - write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)') & - '====', '================', '================', '================', '================', '================' & - , '================', '================', '================', '====', '========' - - - ! first iteration (HF orbitals) - call wall_time(t0) - - etc_tot = TC_HF_energy - etc_1e = TC_HF_one_e_energy - etc_2e = TC_HF_two_e_energy - etc_3e = 0.d0 - if(three_body_h_tc) then - etc_3e = diag_three_elem_hf - endif - tc_grad = grad_non_hermit - er_DIIS = maxval(abs(FQS_SQF_mo)) - e_delta = dabs(etc_tot - e_save) - e_save = etc_tot - - call wall_time(t1) - write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)') & - it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0 - - do while(tc_grad .gt. dsqrt(thresh_tcscf)) - call wall_time(t0) - - it += 1 - if(it > n_it_tcscf_max) then - print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max - stop - endif - - mo_l_coef = fock_tc_leigvec_ao - mo_r_coef = fock_tc_reigvec_ao - call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef) - call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) - TOUCH mo_l_coef mo_r_coef - - etc_tot = TC_HF_energy - etc_1e = TC_HF_one_e_energy - etc_2e = TC_HF_two_e_energy - etc_3e = 0.d0 - if(three_body_h_tc) then - etc_3e = diag_three_elem_hf - endif - tc_grad = grad_non_hermit - er_DIIS = maxval(abs(FQS_SQF_mo)) - e_delta = dabs(etc_tot - e_save) - e_save = etc_tot - - call ezfio_set_tc_scf_bitc_energy(etc_tot) - - call wall_time(t1) - write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)') & - it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, tc_grad, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0 - enddo - - else - - do while( (grad_hermit.gt.dsqrt(thresh_tcscf)) .and. (it.lt.n_it_tcscf_max) ) - print*,'grad_hermit = ',grad_hermit - it += 1 - print *, 'iteration = ', it - print *, '***' - print *, 'TC HF total energy = ', TC_HF_energy - print *, 'TC HF 1 e energy = ', TC_HF_one_e_energy - print *, 'TC HF 2 e energy = ', TC_HF_two_e_energy - print *, 'TC HF 3 body = ', diag_three_elem_hf - print *, '***' - print *, '' - call save_good_hermit_tc_eigvectors - TOUCH mo_coef - call save_mos - enddo - - endif - - print *, ' TCSCF Simple converged !' - !call print_energy_and_mos(good_angles) - - deallocate(rho_old, rho_new) - -end - -! --- - diff --git a/plugins/local/tc_scf/rh_vartcscf_simple.irp.f b/plugins/local/tc_scf/rh_vartcscf_simple.irp.f deleted file mode 100644 index ecb0709e..00000000 --- a/plugins/local/tc_scf/rh_vartcscf_simple.irp.f +++ /dev/null @@ -1,89 +0,0 @@ -! 
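The control flow of this deleted simple driver, and of the variational one that follows, is a plain fixed-point loop: re-evaluate the energy and the gradient, stop once the gradient (or the DIIS error) drops below dsqrt(thresh_tcscf), and abort if n_it_tcscf_max is exceeded. A compilable toy version of that skeleton is sketched below; the names are invented and energy_and_gradient is a fake stand-in for the real providers (TC_HF_energy, grad_non_hermit).

program tcscf_loop_sketch
  implicit none
  integer, parameter :: it_max = 50
  double precision, parameter :: thresh = 1.d-10
  integer :: it
  double precision :: e, grad

  it = 0
  call energy_and_gradient(e, grad)            ! first pass on the starting orbitals
  do while(grad > dsqrt(thresh))
    it = it + 1
    if(it > it_max) stop ' max number of iterations reached'
    ! (the real code updates mo_l_coef / mo_r_coef here before re-evaluating)
    call energy_and_gradient(e, grad)
  enddo
  print *, 'converged after', it, 'iterations, E =', e

contains

  ! fake model whose gradient shrinks by a factor of 10 per call,
  ! just to make the skeleton runnable
  subroutine energy_and_gradient(e, grad)
    double precision, intent(out) :: e, grad
    double precision, save :: g = 1.d0
    g = 0.1d0 * g
    grad = g
    e = -1.d0 + g
  end subroutine energy_and_gradient

end program tcscf_loop_sketch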
--- - -subroutine rh_vartcscf_simple() - - implicit none - integer :: i, j, it, dim_DIIS - double precision :: t0, t1 - double precision :: e_save, e_delta, rho_delta - double precision :: etc_tot, etc_1e, etc_2e, etc_3e - double precision :: er_DIIS - - - it = 0 - e_save = 0.d0 - dim_DIIS = 0 - - ! --- - - PROVIDE level_shift_tcscf - PROVIDE mo_r_coef - - write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)') & - '====', '================', '================', '================', '================', '================' & - , '================', '================', '====', '========' - write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)') & - ' it ', ' SCF TC Energy ', ' E(1e) ', ' E(2e) ', ' E(3e) ', ' energy diff ' & - , ' DIIS error ', ' level shift ', 'DIIS', ' WT (m)' - write(6, '(A4,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A16,1X, A4, 1X, A8)') & - '====', '================', '================', '================', '================', '================' & - , '================', '================', '====', '========' - - - ! first iteration (HF orbitals) - call wall_time(t0) - - etc_tot = VARTC_HF_energy - etc_1e = VARTC_HF_one_e_energy - etc_2e = VARTC_HF_two_e_energy - etc_3e = 0.d0 - if(three_body_h_tc) then - etc_3e = diag_three_elem_hf - endif - er_DIIS = maxval(abs(FQS_SQF_mo)) - e_delta = dabs(etc_tot - e_save) - e_save = etc_tot - - call wall_time(t1) - write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)') & - it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0 - - do while(er_DIIS .gt. dsqrt(thresh_tcscf)) - call wall_time(t0) - - it += 1 - if(it > n_it_tcscf_max) then - print *, ' max of TCSCF iterations is reached ', n_it_TCSCF_max - stop - endif - - mo_r_coef = fock_vartc_eigvec_ao - mo_l_coef = mo_r_coef - call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef) - call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) - TOUCH mo_l_coef mo_r_coef - - etc_tot = VARTC_HF_energy - etc_1e = VARTC_HF_one_e_energy - etc_2e = VARTC_HF_two_e_energy - etc_3e = 0.d0 - if(three_body_h_tc) then - etc_3e = diag_three_elem_hf - endif - er_DIIS = maxval(abs(FQS_SQF_mo)) - e_delta = dabs(etc_tot - e_save) - e_save = etc_tot - - call ezfio_set_tc_scf_bitc_energy(etc_tot) - - call wall_time(t1) - write(6, '(I4,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, F16.10,1X, I4,1X, F8.2)') & - it, etc_tot, etc_1e, etc_2e, etc_3e, e_delta, er_DIIS, level_shift_tcscf, dim_DIIS, (t1-t0)/60.d0 - enddo - - print *, ' VAR-TCSCF Simple converged !' - -end - -! --- - diff --git a/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f b/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f deleted file mode 100644 index 0f2663e5..00000000 --- a/plugins/local/tc_scf/rotate_tcscf_orbitals.irp.f +++ /dev/null @@ -1,369 +0,0 @@ - -! --- - -program rotate_tcscf_orbitals - - BEGIN_DOC - ! TODO : Rotate the bi-orthonormal orbitals in order to minimize left-right angles when degenerate - END_DOC - - implicit none - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - bi_ortho = .True. - touch bi_ortho - - call minimize_tc_orb_angles() - !call maximize_overlap() - -end - -! 
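The maximize_overlap routine that follows (through rotate_degen_eigvec_to_maximize_overlap) works block by block on degenerate orbitals: it builds the overlap of the right orbitals of a block with the reference MOs, asks maxovl for a transformation T that maximizes that overlap, and applies the same T to the left and right coefficients of the block. Assuming maxovl returns an orthogonal T (not verified here), this leaves the bi-orthonormality of the block intact:

  \tilde{R} = R\,T, \quad \tilde{L} = L\,T
  \;\Rightarrow\;
  \tilde{L}^\dagger S_{AO} \tilde{R} = T^{\mathsf T} (L^\dagger S_{AO} R)\, T = T^{\mathsf T} T = 1,

using the bi-orthonormality L^\dagger S_{AO} R = 1 of the original block.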
--- - -subroutine maximize_overlap() - - implicit none - integer :: i, m, n - double precision :: accu_d, accu_nd - double precision, allocatable :: C(:,:), R(:,:), L(:,:), W(:,:), e(:) - double precision, allocatable :: S(:,:) - - n = ao_num - m = mo_num - - allocate(L(n,m), R(n,m), C(n,m), W(n,n), e(m)) - L = mo_l_coef - R = mo_r_coef - C = mo_coef - W = ao_overlap - - print*, ' fock matrix diag elements' - do i = 1, m - e(i) = Fock_matrix_tc_mo_tot(i,i) - print*, e(i) - enddo - - ! --- - - print *, ' overlap before :' - print *, ' ' - - allocate(S(m,m)) - - call LTxSxR(n, m, L, W, R, S) - !print*, " L.T x R" - !do i = 1, m - ! write(*, '(100(F16.10,X))') S(i,i) - !enddo - call LTxSxR(n, m, L, W, C, S) - print*, " L.T x C" - do i = 1, m - write(*, '(100(F16.10,X))') S(i,:) - enddo - call LTxSxR(n, m, C, W, R, S) - print*, " C.T x R" - do i = 1, m - write(*, '(100(F16.10,X))') S(i,:) - enddo - - deallocate(S) - - ! --- - - call rotate_degen_eigvec_to_maximize_overlap(n, m, e, C, W, L, R) - - ! --- - - print *, ' overlap after :' - print *, ' ' - - allocate(S(m,m)) - - call LTxSxR(n, m, L, W, R, S) - !print*, " L.T x R" - !do i = 1, m - ! write(*, '(100(F16.10,X))') S(i,i) - !enddo - call LTxSxR(n, m, L, W, C, S) - print*, " L.T x C" - do i = 1, m - write(*, '(100(F16.10,X))') S(i,:) - enddo - call LTxSxR(n, m, C, W, R, S) - print*, " C.T x R" - do i = 1, m - write(*, '(100(F16.10,X))') S(i,:) - enddo - - deallocate(S) - - ! --- - - mo_l_coef = L - mo_r_coef = R - call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef) - call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) - - ! --- - - deallocate(L, R, C, W, e) - -end subroutine maximize_overlap - -! --- - -subroutine rotate_degen_eigvec_to_maximize_overlap(n, m, e0, C0, W0, L0, R0) - - implicit none - - integer, intent(in) :: n, m - double precision, intent(in) :: e0(m), W0(n,n), C0(n,m) - double precision, intent(inout) :: L0(n,m), R0(n,m) - - - integer :: i, j, k, kk, mm, id1, tot_deg - double precision :: ei, ej, de, de_thr - integer, allocatable :: deg_num(:) - double precision, allocatable :: L(:,:), R(:,:), C(:,:), Lnew(:,:), Rnew(:,:), tmp(:,:) - !double precision, allocatable :: S(:,:), Snew(:,:), T(:,:), Ttmp(:,:), Stmp(:,:) - double precision, allocatable :: S(:,:), Snew(:,:), T(:,:), Ttmp(:,:), Stmp(:,:) - !real*8 :: S(m,m), Snew(m,m), T(m,m) - - id1 = 700 - allocate(S(id1,id1), Snew(id1,id1), T(id1,id1)) - - ! --- - - allocate( deg_num(m) ) - do i = 1, m - deg_num(i) = 1 - enddo - - de_thr = thr_degen_tc - - do i = 1, m-1 - ei = e0(i) - - ! already considered in degen vectors - if(deg_num(i).eq.0) cycle - - do j = i+1, m - ej = e0(j) - de = dabs(ei - ej) - - if(de .lt. de_thr) then - deg_num(i) = deg_num(i) + 1 - deg_num(j) = 0 - endif - - enddo - enddo - - tot_deg = 0 - do i = 1, m - if(deg_num(i).gt.1) then - print *, ' degen on', i, deg_num(i) - tot_deg = tot_deg + 1 - endif - enddo - - if(tot_deg .eq. 0) then - print *, ' no degen' - return - endif - - ! --- - - do i = 1, m - mm = deg_num(i) - - if(mm .gt. 1) then - - allocate(L(n,mm), R(n,mm), C(n,mm)) - do j = 1, mm - L(1:n,j) = L0(1:n,i+j-1) - R(1:n,j) = R0(1:n,i+j-1) - C(1:n,j) = C0(1:n,i+j-1) - enddo - - ! --- - - ! 
C.T x W0 x R - allocate(tmp(mm,n), Stmp(mm,mm)) - call dgemm( 'T', 'N', mm, n, n, 1.d0 & - , C, size(C, 1), W0, size(W0, 1) & - , 0.d0, tmp, size(tmp, 1) ) - call dgemm( 'N', 'N', mm, mm, n, 1.d0 & - , tmp, size(tmp, 1), R, size(R, 1) & - , 0.d0, Stmp, size(Stmp, 1) ) - deallocate(C, tmp) - - S = 0.d0 - do k = 1, mm - do kk = 1, mm - S(kk,k) = Stmp(kk,k) - enddo - enddo - deallocate(Stmp) - - !print*, " overlap bef" - !do k = 1, mm - ! write(*, '(100(F16.10,X))') (S(k,kk), kk=1, mm) - !enddo - - T = 0.d0 - Snew = 0.d0 - call maxovl(mm, mm, S, T, Snew) - - !print*, " overlap aft" - !do k = 1, mm - ! write(*, '(100(F16.10,X))') (Snew(k,kk), kk=1, mm) - !enddo - - allocate(Ttmp(mm,mm)) - Ttmp(1:mm,1:mm) = T(1:mm,1:mm) - - allocate(Lnew(n,mm), Rnew(n,mm)) - call dgemm( 'N', 'N', n, mm, mm, 1.d0 & - , R, size(R, 1), Ttmp(1,1), size(Ttmp, 1) & - , 0.d0, Rnew, size(Rnew, 1) ) - call dgemm( 'N', 'N', n, mm, mm, 1.d0 & - , L, size(L, 1), Ttmp(1,1), size(Ttmp, 1) & - , 0.d0, Lnew, size(Lnew, 1) ) - - deallocate(L, R) - deallocate(Ttmp) - - ! --- - - do j = 1, mm - L0(1:n,i+j-1) = Lnew(1:n,j) - R0(1:n,i+j-1) = Rnew(1:n,j) - enddo - deallocate(Lnew, Rnew) - - endif - enddo - - deallocate(S, Snew, T) - -end subroutine rotate_degen_eigvec_to_maximize_overlap - -! --- - -subroutine fix_right_to_one() - - implicit none - integer :: i, j, m, n, mm, tot_deg - double precision :: accu_d, accu_nd - double precision :: de_thr, ei, ej, de - integer, allocatable :: deg_num(:) - double precision, allocatable :: R0(:,:), L0(:,:), W(:,:), e0(:) - double precision, allocatable :: R(:,:), L(:,:), S(:,:), Stmp(:,:), tmp(:,:) - - n = ao_num - m = mo_num - - allocate(L0(n,m), R0(n,m), W(n,n), e0(m)) - L0 = mo_l_coef - R0 = mo_r_coef - W = ao_overlap - - print*, ' fock matrix diag elements' - do i = 1, m - e0(i) = Fock_matrix_tc_mo_tot(i,i) - print*, e0(i) - enddo - - ! --- - - allocate( deg_num(m) ) - do i = 1, m - deg_num(i) = 1 - enddo - - de_thr = 1d-6 - - do i = 1, m-1 - ei = e0(i) - - ! already considered in degen vectors - if(deg_num(i).eq.0) cycle - - do j = i+1, m - ej = e0(j) - de = dabs(ei - ej) - - if(de .lt. de_thr) then - deg_num(i) = deg_num(i) + 1 - deg_num(j) = 0 - endif - - enddo - enddo - - deallocate(e0) - - tot_deg = 0 - do i = 1, m - if(deg_num(i).gt.1) then - print *, ' degen on', i, deg_num(i) - tot_deg = tot_deg + 1 - endif - enddo - - if(tot_deg .eq. 0) then - print *, ' no degen' - return - endif - - ! --- - - do i = 1, m - mm = deg_num(i) - - if(mm .gt. 1) then - - allocate(L(n,mm), R(n,mm)) - do j = 1, mm - L(1:n,j) = L0(1:n,i+j-1) - R(1:n,j) = R0(1:n,i+j-1) - enddo - - ! --- - - call impose_weighted_orthog_svd(n, mm, W, R) - call impose_weighted_biorthog_qr(n, mm, thresh_biorthog_diag, thresh_biorthog_nondiag, R, W, L) - - ! --- - - do j = 1, mm - L0(1:n,i+j-1) = L(1:n,j) - R0(1:n,i+j-1) = R(1:n,j) - enddo - deallocate(L, R) - - endif - enddo - - call check_weighted_biorthog_binormalize(n, m, L0, W, R0, thresh_biorthog_diag, thresh_biorthog_nondiag, .true.) - - deallocate(W, deg_num) - - mo_l_coef = L0 - mo_r_coef = R0 - deallocate(L0, R0) - - call ezfio_set_bi_ortho_mos_mo_l_coef(mo_l_coef) - call ezfio_set_bi_ortho_mos_mo_r_coef(mo_r_coef) - print *, ' orbitals are rotated ' - - return -end subroutine fix_right_to_one - -! 
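Both rotation routines above apply the same linear-algebra step: a small rotation matrix T, obtained from maxovl for one degenerate block, multiplies the corresponding block of right (and left) coefficients from the right. A minimal, self-contained sketch of that step follows; the array shapes and names are assumptions for illustration, not the providers used above.

      ! Rotate one degenerate block of orbital coefficients: Rnew = R * T.
      ! R is (n x mm)  (AO index x block size), T is the (mm x mm) rotation
      ! returned by maxovl for this block; shapes here are assumptions.
      subroutine rotate_block(n, mm, R, T, Rnew)
        implicit none
        integer,          intent(in)  :: n, mm
        double precision, intent(in)  :: R(n,mm), T(mm,mm)
        double precision, intent(out) :: Rnew(n,mm)
        call dgemm('N', 'N', n, mm, mm, 1.d0, R, n, T, mm, 0.d0, Rnew, n)
      end subroutine rotate_block

The left block is transformed with the same T; if T is orthogonal and the block was bi-orthonormal, the pair stays bi-orthonormal, since $T^{\mathsf T}(L^{\mathsf T} S R)\,T = T^{\mathsf T} T = I$.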
--- diff --git a/plugins/local/tc_scf/routines_rotates.irp.f b/plugins/local/tc_scf/routines_rotates.irp.f index fbfc9beb..64a81e8e 100644 --- a/plugins/local/tc_scf/routines_rotates.irp.f +++ b/plugins/local/tc_scf/routines_rotates.irp.f @@ -40,9 +40,6 @@ subroutine LTxSxR(n, m, L, S, R, C) end subroutine LTxR -! --- - - ! --- subroutine minimize_tc_orb_angles() @@ -103,7 +100,10 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles) double precision, allocatable :: stmp(:,:), T(:,:), Snew(:,:), smat2(:,:) double precision, allocatable :: mo_l_coef_tmp(:,:), mo_r_coef_tmp(:,:), mo_l_coef_new(:,:) - E_thr = 1d-04 + PROVIDE TC_HF_energy + PROVIDE mo_r_coef mo_l_coef + + E_thr = 1d-07 E_old = TC_HF_energy allocate(mo_l_coef_old(ao_num,mo_num), mo_r_coef_old(ao_num,mo_num)) mo_r_coef_old = mo_r_coef @@ -111,7 +111,7 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles) good_angles = .False. - allocate(mo_l_coef_good(ao_num, mo_num), mo_r_coef_good(ao_num,mo_num)) + allocate(mo_l_coef_good(ao_num,mo_num), mo_r_coef_good(ao_num,mo_num)) print *, ' ***************************************' print *, ' ***************************************' @@ -123,7 +123,7 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles) mo_r_coef_good = mo_r_coef mo_l_coef_good = mo_l_coef - allocate(mo_r_coef_new(ao_num, mo_num)) + allocate(mo_r_coef_new(ao_num,mo_num)) mo_r_coef_new = mo_r_coef do i = 1, mo_num norm = 1.d0/dsqrt(overlap_mo_r(i,i)) @@ -141,10 +141,11 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles) call build_s_matrix(ao_num, mo_num, mo_r_coef_new, mo_r_coef_new, ao_overlap, s_mat) ! call give_degen(fock_diag,mo_num,thr_deg,list_degen,n_degen_list) if(n_core_orb.ne.0)then - call give_degen_full_listcore(fock_diag, mo_num, list_core, n_core_orb, thr_deg, list_degen, n_degen_list) + call give_degen_full_listcore(fock_diag, mo_num, list_core, n_core_orb, thr_deg, list_degen, n_degen_list) else - call give_degen_full_list(fock_diag, mo_num, thr_deg, list_degen, n_degen_list) + call give_degen_full_list(fock_diag, mo_num, thr_deg, list_degen, n_degen_list) endif + print *, ' fock_matrix_mo' do i = 1, mo_num print *, i, fock_diag(i), angle_left_right(i) @@ -156,50 +157,52 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles) ! n_degen = ilast - ifirst +1 n_degen = list_degen(i,0) - if(n_degen .ge. 1000)n_degen = 1 ! convention for core orbitals + if(n_degen .ge. 1000) n_degen = 1 ! convention for core orbitals if(n_degen .eq. 1) cycle + print*, ' working on orbital', i + print*, ' multiplicity =', n_degen allocate(stmp(n_degen,n_degen), smat2(n_degen,n_degen)) allocate(mo_r_coef_tmp(ao_num,n_degen), mo_l_coef_tmp(ao_num,n_degen), mo_l_coef_new(ao_num,n_degen)) allocate(T(n_degen,n_degen), Snew(n_degen,n_degen)) print*,'Right orbitals before' - do j = 1, n_degen - write(*,'(100(F16.10,X))') mo_r_coef_new(1:ao_num,list_degen(i,j)) - enddo + do j = 1, n_degen + write(*,'(1000(F16.10,X))') mo_r_coef_new(1:ao_num,list_degen(i,j)) + enddo print*,'Left orbitals before' - do j = 1, n_degen - write(*,'(100(F16.10,X))')mo_l_coef(1:ao_num,list_degen(i,j)) - enddo + do j = 1, n_degen + write(*,'(1000(F16.10,X))') mo_l_coef(1:ao_num,list_degen(i,j)) + enddo if(angle_left_right(list_degen(i,1)).gt.80.d0.and.n_degen==2)then - integer :: i_list, j_list - i_list = list_degen(i,1) - j_list = list_degen(i,2) - print*,'Huge angle !!! 
== ',angle_left_right(list_degen(i,1)),angle_left_right(list_degen(i,2)) - print*,'i_list = ',i_list - print*,'i_list = ',j_list - print*,'Swapping left/right orbitals' - call print_strong_overlap(i_list, j_list) - mo_r_coef_tmp(1:ao_num,1) = mo_r_coef_new(1:ao_num,i_list) - mo_r_coef_tmp(1:ao_num,2) = mo_l_coef(1:ao_num,i_list) - mo_l_coef_tmp(1:ao_num,1) = mo_l_coef(1:ao_num,j_list) - mo_l_coef_tmp(1:ao_num,2) = mo_r_coef_new(1:ao_num,j_list) + integer :: i_list, j_list + i_list = list_degen(i,1) + j_list = list_degen(i,2) + print*,'Huge angle !!! == ',angle_left_right(list_degen(i,1)),angle_left_right(list_degen(i,2)) + print*,'i_list = ',i_list + print*,'i_list = ',j_list + print*,'Swapping left/right orbitals' + call print_strong_overlap(i_list, j_list) + mo_r_coef_tmp(1:ao_num,1) = mo_r_coef_new(1:ao_num,i_list) + mo_r_coef_tmp(1:ao_num,2) = mo_l_coef(1:ao_num,i_list) + mo_l_coef_tmp(1:ao_num,1) = mo_l_coef(1:ao_num,j_list) + mo_l_coef_tmp(1:ao_num,2) = mo_r_coef_new(1:ao_num,j_list) else - do j = 1, n_degen - print*,'i_list = ',list_degen(i,j) - mo_r_coef_tmp(1:ao_num,j) = mo_r_coef_new(1:ao_num,list_degen(i,j)) - mo_l_coef_tmp(1:ao_num,j) = mo_l_coef(1:ao_num,list_degen(i,j)) - enddo + do j = 1, n_degen + print*,'i_list = ',list_degen(i,j) + mo_r_coef_tmp(1:ao_num,j) = mo_r_coef_new(1:ao_num,list_degen(i,j)) + mo_l_coef_tmp(1:ao_num,j) = mo_l_coef(1:ao_num,list_degen(i,j)) + enddo endif print*,'Right orbitals ' - do j = 1, n_degen - write(*,'(100(F16.10,X))')mo_r_coef_tmp(1:ao_num,j) - enddo + do j = 1, n_degen + write(*,'(1000(F16.10,X))') mo_r_coef_tmp(1:ao_num,j) + enddo print*,'Left orbitals ' - do j = 1, n_degen - write(*,'(100(F16.10,X))')mo_l_coef_tmp(1:ao_num,j) - enddo + do j = 1, n_degen + write(*,'(100(F16.10,X))') mo_l_coef_tmp(1:ao_num,j) + enddo ! Orthogonalization of right functions print *, ' Orthogonalization of RIGHT functions' print *, ' ------------------------------------' @@ -301,6 +304,7 @@ subroutine routine_save_rotated_mos(thr_deg, good_angles) ! check if TC energy has changed E_new = TC_HF_energy + E_thr = thresh_de_tc_angles if(dabs(E_new - E_old) .gt. E_thr) then mo_r_coef = mo_r_coef_old mo_l_coef = mo_l_coef_old diff --git a/plugins/local/tc_scf/tc_petermann_factor.irp.f b/plugins/local/tc_scf/tc_petermann_factor.irp.f deleted file mode 100644 index 14fff898..00000000 --- a/plugins/local/tc_scf/tc_petermann_factor.irp.f +++ /dev/null @@ -1,91 +0,0 @@ - -! --- - -program tc_petermann_factor - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - call main() - -end - -! --- - -subroutine main() - - implicit none - integer :: i, j - double precision :: Pf_diag_av - double precision, allocatable :: Sl(:,:), Sr(:,:), Pf(:,:) - - allocate(Sl(mo_num,mo_num), Sr(mo_num,mo_num), Pf(mo_num,mo_num)) - - - call LTxSxR(ao_num, mo_num, mo_l_coef, ao_overlap, mo_r_coef, Sl) - !call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0 & - ! , mo_l_coef, size(mo_l_coef, 1), mo_l_coef, size(mo_l_coef, 1) & - ! , 0.d0, Sl, size(Sl, 1) ) - - print *, '' - print *, ' left-right orthog matrix:' - do i = 1, mo_num - write(*,'(100(F8.4,X))') Sl(:,i) - enddo - - call LTxSxR(ao_num, mo_num, mo_l_coef, ao_overlap, mo_l_coef, Sl) - !call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0 & - ! , mo_l_coef, size(mo_l_coef, 1), mo_l_coef, size(mo_l_coef, 1) & - ! 
, 0.d0, Sl, size(Sl, 1) ) - - print *, '' - print *, ' left-orthog matrix:' - do i = 1, mo_num - write(*,'(100(F8.4,X))') Sl(:,i) - enddo - - call LTxSxR(ao_num, mo_num, mo_r_coef, ao_overlap, mo_r_coef, Sr) -! call dgemm( "T", "N", mo_num, mo_num, ao_num, 1.d0 & -! , mo_r_coef, size(mo_r_coef, 1), mo_r_coef, size(mo_r_coef, 1) & -! , 0.d0, Sr, size(Sr, 1) ) - - print *, '' - print *, ' right-orthog matrix:' - do i = 1, mo_num - write(*,'(100(F8.4,X))') Sr(:,i) - enddo - - print *, '' - print *, ' Petermann matrix:' - do i = 1, mo_num - do j = 1, mo_num - Pf(j,i) = Sl(j,i) * Sr(j,i) - enddo - write(*,'(100(F8.4,X))') Pf(:,i) - enddo - - Pf_diag_av = 0.d0 - do i = 1, mo_num - Pf_diag_av = Pf_diag_av + Pf(i,i) - enddo - Pf_diag_av = Pf_diag_av / dble(mo_num) - - print *, '' - print *, ' mean of the diagonal Petermann factor = ', Pf_diag_av - - deallocate(Sl, Sr, Pf) - - return -end subroutine - -! --- - diff --git a/plugins/local/tc_scf/tc_scf.irp.f b/plugins/local/tc_scf/tc_scf.irp.f index 768069d6..83da03ec 100644 --- a/plugins/local/tc_scf/tc_scf.irp.f +++ b/plugins/local/tc_scf/tc_scf.irp.f @@ -7,19 +7,6 @@ program tc_scf END_DOC implicit none - integer :: i - logical :: good_angles - - PROVIDE j1e_type - PROVIDE j2e_type - PROVIDE tcscf_algorithm - PROVIDE var_tc - - print *, ' TC-SCF with:' - print *, ' j1e_type = ', j1e_type - print *, ' j2e_type = ', j2e_type - - write(json_unit,json_array_open_fmt) 'tc-scf' my_grid_becke = .True. PROVIDE tc_grid1_a tc_grid1_r @@ -30,7 +17,6 @@ program tc_scf call write_int(6, my_n_pt_r_grid, 'radial external grid over') call write_int(6, my_n_pt_a_grid, 'angular external grid over') - if(tc_integ_type .eq. "numeric") then my_extra_grid_becke = .True. PROVIDE tc_grid2_a tc_grid2_r @@ -42,48 +28,38 @@ program tc_scf call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') endif - !call create_guess() - !call orthonormalize_mos() + call main() +end - if(var_tc) then +! --- - print *, ' VAR-TC' +subroutine main() - if(tcscf_algorithm == 'DIIS') then - print*, ' NOT implemented yet' - elseif(tcscf_algorithm == 'Simple') then - call rh_vartcscf_simple() - else - print *, ' not implemented yet', tcscf_algorithm - stop - endif + implicit none - else + integer :: i + logical :: good_angles - if(tcscf_algorithm == 'DIIS') then - call rh_tcscf_diis() - elseif(tcscf_algorithm == 'Simple') then - call rh_tcscf_simple() - else - print *, ' not implemented yet', tcscf_algorithm - stop - endif + print *, ' TC-SCF with:' + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type - PROVIDE Fock_matrix_tc_diag_mo_tot - print*, ' Eigenvalues:' - do i = 1, mo_num - print*, i, Fock_matrix_tc_diag_mo_tot(i) - enddo + write(json_unit,json_array_open_fmt) 'tc-scf' - ! TODO - ! rotate angles in separate code only if necessary - if(minimize_lr_angles)then - call minimize_tc_orb_angles() - endif - call print_energy_and_mos(good_angles) + call rh_tcscf_diis() + PROVIDE Fock_matrix_tc_diag_mo_tot + print*, ' Eigenvalues:' + do i = 1, mo_num + print*, i, Fock_matrix_tc_diag_mo_tot(i) + enddo + + if(minimize_lr_angles) then + call minimize_tc_orb_angles() endif + call print_energy_and_mos(good_angles) write(json_unit,json_array_close_fmtx) call json_close @@ -119,7 +95,7 @@ subroutine create_guess() SOFT_TOUCH mo_label endif -end subroutine create_guess +end ! 
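The removed tc_petermann_factor program above reduces to a one-line definition: with $S^{LL} = L^{\dagger} S_{\rm AO} L$ and $S^{RR} = R^{\dagger} S_{\rm AO} R$ the left/left and right/right MO overlaps, it forms the element-wise product $P_{ji} = S^{LL}_{ji}\, S^{RR}_{ji}$ and reports the average of its diagonal,

$\bar P \;=\; \frac{1}{N_{\rm MO}} \sum_{i=1}^{N_{\rm MO}} \big(L^{\dagger} S_{\rm AO} L\big)_{ii}\,\big(R^{\dagger} S_{\rm AO} R\big)_{ii}.$

Under the bi-orthonormal convention $(L^{\dagger} S_{\rm AO} R)_{ii} = 1$, the Cauchy-Schwarz inequality gives $P_{ii} \ge 1$, with equality when the left and right orbitals coincide, so $\bar P$ measures how far the bi-orthogonal pair has drifted from a Hermitian (L = R) calculation.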
--- diff --git a/plugins/local/tc_scf/tc_scf_dm.irp.f b/plugins/local/tc_scf/tc_scf_dm.irp.f index bf31a4a1..5d25fce2 100644 --- a/plugins/local/tc_scf/tc_scf_dm.irp.f +++ b/plugins/local/tc_scf/tc_scf_dm.irp.f @@ -10,16 +10,8 @@ BEGIN_PROVIDER [double precision, TCSCF_density_matrix_ao_beta, (ao_num, ao_num) implicit none - if(bi_ortho) then - - PROVIDE mo_l_coef mo_r_coef - TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta - - else - - TCSCF_density_matrix_ao_beta = SCF_density_matrix_ao_beta - - endif + PROVIDE mo_l_coef mo_r_coef + TCSCF_density_matrix_ao_beta = TCSCF_bi_ort_dm_ao_beta END_PROVIDER @@ -35,16 +27,8 @@ BEGIN_PROVIDER [double precision, TCSCF_density_matrix_ao_alpha, (ao_num, ao_num implicit none - if(bi_ortho) then - - PROVIDE mo_l_coef mo_r_coef - TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha - - else - - TCSCF_density_matrix_ao_alpha = SCF_density_matrix_ao_alpha - - endif + PROVIDE mo_l_coef mo_r_coef + TCSCF_density_matrix_ao_alpha = TCSCF_bi_ort_dm_ao_alpha END_PROVIDER diff --git a/plugins/local/tc_scf/tc_scf_energy.irp.f b/plugins/local/tc_scf/tc_scf_energy.irp.f index 833b48aa..80ef2afb 100644 --- a/plugins/local/tc_scf/tc_scf_energy.irp.f +++ b/plugins/local/tc_scf/tc_scf_energy.irp.f @@ -1,7 +1,8 @@ - BEGIN_PROVIDER [ double precision, TC_HF_energy ] -&BEGIN_PROVIDER [ double precision, TC_HF_one_e_energy] -&BEGIN_PROVIDER [ double precision, TC_HF_two_e_energy] + BEGIN_PROVIDER [double precision, TC_HF_energy ] +&BEGIN_PROVIDER [double precision, TC_HF_one_e_energy ] +&BEGIN_PROVIDER [double precision, TC_HF_two_e_energy ] +&BEGIN_PROVIDER [double precision, TC_HF_three_e_energy] BEGIN_DOC ! TC Hartree-Fock energy containing the nuclear repulsion, and its one- and two-body components. @@ -11,11 +12,8 @@ integer :: i, j double precision :: t0, t1 - !print*, ' Providing TC energy ...' - !call wall_time(t0) - PROVIDE mo_l_coef mo_r_coef - PROVIDE two_e_tc_non_hermit_integral_alpha two_e_tc_non_hermit_integral_beta + PROVIDE two_e_tc_integral_alpha two_e_tc_integral_beta TC_HF_energy = nuclear_repulsion TC_HF_one_e_energy = 0.d0 @@ -23,47 +21,20 @@ do j = 1, ao_num do i = 1, ao_num - TC_HF_two_e_energy += 0.5d0 * ( two_e_tc_non_hermit_integral_alpha(i,j) * TCSCF_density_matrix_ao_alpha(i,j) & - + two_e_tc_non_hermit_integral_beta (i,j) * TCSCF_density_matrix_ao_beta (i,j) ) + TC_HF_two_e_energy += 0.5d0 * ( two_e_tc_integral_alpha(i,j) * TCSCF_density_matrix_ao_alpha(i,j) & + + two_e_tc_integral_beta (i,j) * TCSCF_density_matrix_ao_beta (i,j) ) TC_HF_one_e_energy += ao_one_e_integrals_tc_tot(i,j) & * (TCSCF_density_matrix_ao_alpha(i,j) + TCSCF_density_matrix_ao_beta (i,j) ) enddo enddo - TC_HF_energy += TC_HF_one_e_energy + TC_HF_two_e_energy - TC_HF_energy += diag_three_elem_hf + if((three_body_h_tc .eqv. .False.) .and. (.not. noL_standard)) then + TC_HF_three_e_energy = 0.d0 + else + TC_HF_three_e_energy = noL_0e + endif - !call wall_time(t1) - !print*, ' Wall time for TC energy=', t1-t0 - -END_PROVIDER - -! 
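Written out, the energy assembled by the updated TC_HF_energy provider above is

$E_{\rm TC\text{-}HF} \;=\; E_{\rm nuc} \;+\; \sum_{ij} h^{\rm tc}_{ij}\,\big(P^{\alpha}_{ij}+P^{\beta}_{ij}\big) \;+\; \tfrac12 \sum_{ij} \big( V^{\alpha}_{ij} P^{\alpha}_{ij} + V^{\beta}_{ij} P^{\beta}_{ij} \big) \;+\; E_{3e},$

where $h^{\rm tc}$ is ao_one_e_integrals_tc_tot, $V^{\sigma}$ is the effective two-electron TC potential two_e_tc_integral_alpha/beta, $P^{\sigma}$ are the bi-orthogonal density matrices, and $E_{3e}$ is zero unless the three-body term or the noL correction is active, in which case it is taken from noL_0e.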
--- - - BEGIN_PROVIDER [ double precision, VARTC_HF_energy] -&BEGIN_PROVIDER [ double precision, VARTC_HF_one_e_energy] -&BEGIN_PROVIDER [ double precision, VARTC_HF_two_e_energy] - - implicit none - integer :: i, j - - PROVIDE mo_r_coef - - VARTC_HF_energy = nuclear_repulsion - VARTC_HF_one_e_energy = 0.d0 - VARTC_HF_two_e_energy = 0.d0 - - do j = 1, ao_num - do i = 1, ao_num - VARTC_HF_two_e_energy += 0.5d0 * ( two_e_vartc_integral_alpha(i,j) * TCSCF_density_matrix_ao_alpha(i,j) & - + two_e_vartc_integral_beta (i,j) * TCSCF_density_matrix_ao_beta (i,j) ) - VARTC_HF_one_e_energy += ao_one_e_integrals_tc_tot(i,j) & - * (TCSCF_density_matrix_ao_alpha(i,j) + TCSCF_density_matrix_ao_beta (i,j) ) - enddo - enddo - - VARTC_HF_energy += VARTC_HF_one_e_energy + VARTC_HF_two_e_energy - VARTC_HF_energy += diag_three_elem_hf + TC_HF_energy += TC_HF_one_e_energy + TC_HF_two_e_energy + TC_HF_three_e_energy END_PROVIDER diff --git a/plugins/local/tc_scf/tcscf_energy_naive.irp.f b/plugins/local/tc_scf/tcscf_energy_naive.irp.f deleted file mode 100644 index 82bb8799..00000000 --- a/plugins/local/tc_scf/tcscf_energy_naive.irp.f +++ /dev/null @@ -1,80 +0,0 @@ - -! --- - -BEGIN_PROVIDER [double precision, tcscf_energy_3e_naive] - - implicit none - integer :: i, j, k - integer :: neu, ned, D(elec_num) - integer :: ii, jj, kk - integer :: si, sj, sk - double precision :: I_ijk, I_jki, I_kij, I_jik, I_ikj, I_kji - double precision :: I_tot - - PROVIDE mo_l_coef mo_r_coef - - neu = elec_alpha_num - ned = elec_beta_num - if (neu > 0) D(1:neu) = [(2*i-1, i = 1, neu)] - if (ned > 0) D(neu+1:neu+ned) = [(2*i, i = 1, ned)] - - !print*, "D = " - !do i = 1, elec_num - ! ii = (D(i) - 1) / 2 + 1 - ! si = mod(D(i), 2) - ! print*, i, D(i), ii, si - !enddo - - tcscf_energy_3e_naive = 0.d0 - - do i = 1, elec_num - 2 - ii = (D(i) - 1) / 2 + 1 - si = mod(D(i), 2) - - do j = i + 1, elec_num - 1 - jj = (D(j) - 1) / 2 + 1 - sj = mod(D(j), 2) - - do k = j + 1, elec_num - kk = (D(k) - 1) / 2 + 1 - sk = mod(D(k), 2) - - call give_integrals_3_body_bi_ort(ii, jj, kk, ii, jj, kk, I_ijk) - I_tot = I_ijk - - if(sj==si .and. sk==sj) then - call give_integrals_3_body_bi_ort(ii, jj, kk, jj, kk, ii, I_jki) - I_tot += I_jki - endif - - if(sk==si .and. si==sj) then - call give_integrals_3_body_bi_ort(ii, jj, kk, kk, ii, jj, I_kij) - I_tot += I_kij - endif - - if(sj==si) then - call give_integrals_3_body_bi_ort(ii, jj, kk, jj, ii, kk, I_jik) - I_tot -= I_jik - endif - - if(sk==sj) then - call give_integrals_3_body_bi_ort(ii, jj, kk, ii, kk, jj, I_ikj) - I_tot -= I_ikj - endif - - if(sk==si) then - call give_integrals_3_body_bi_ort(ii, jj, kk, kk, jj, ii, I_kji) - I_tot -= I_kji - endif - - tcscf_energy_3e_naive += I_tot - enddo - enddo - enddo - - tcscf_energy_3e_naive = -tcscf_energy_3e_naive - -END_PROVIDER - -! --- - diff --git a/plugins/local/tc_scf/test_int.irp.f b/plugins/local/tc_scf/test_int.irp.f deleted file mode 100644 index e135fcd8..00000000 --- a/plugins/local/tc_scf/test_int.irp.f +++ /dev/null @@ -1,970 +0,0 @@ -program test_ints - - BEGIN_DOC - ! TODO : Put the documentation of the program here - END_DOC - - implicit none - - print *, ' starting test_ints ...' - - my_grid_becke = .True. - PROVIDE tc_grid1_a tc_grid1_r - my_n_pt_r_grid = tc_grid1_r - my_n_pt_a_grid = tc_grid1_a - touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid - - my_extra_grid_becke = .True. - my_n_pt_r_extra_grid = 30 - my_n_pt_a_extra_grid = 50 ! small extra_grid for quick debug - touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid - -!! 
OK -! call routine_int2_u_grad1u_env2 -! OK -! call routine_v_ij_erf_rk_cst_mu_env -! OK -! call routine_x_v_ij_erf_rk_cst_mu_env -! OK -! call routine_int2_u2_env2 -! OK -! call routine_int2_u_grad1u_x_env2 -! OK -! call routine_int2_grad1u2_grad2u2_env2 -! call routine_int2_u_grad1u_env2 -! call test_int2_grad1_u12_ao_test -! call routine_v_ij_u_cst_mu_env_test -! call test_grid_points_ao - !call test_int_gauss - - !call test_fock_3e_uhf_ao() - !call test_fock_3e_uhf_mo() - - !call test_two_e_tc_non_hermit_integral() - -!!PROVIDE TC_HF_energy VARTC_HF_energy -!!print *, ' TC_HF_energy = ', TC_HF_energy -!!print *, ' VARTC_HF_energy = ', VARTC_HF_energy - - call test_fock_3e_uhf_mo_cs() - call test_fock_3e_uhf_mo_a() - call test_fock_3e_uhf_mo_b() - -end - -! --- - -subroutine routine_test_env - implicit none - integer :: i,icount,j - icount = 0 - do i = 1, List_env1s_square_size - if(dabs(List_env1s_square_coef(i)).gt.1.d-10)then - print*,'' - print*,List_env1s_square_expo(i),List_env1s_square_coef(i) - print*,List_env1s_square_cent(1:3,i) - print*,'' - icount += 1 - endif - - enddo - print*,'List_env1s_square_coef,icount = ',List_env1s_square_size,icount - do i = 1, ao_num - do j = 1, ao_num - do icount = 1, List_comb_thr_b3_size(j,i) - print*,'',j,i - print*,List_comb_thr_b3_expo(icount,j,i),List_comb_thr_b3_coef(icount,j,i) - print*,List_comb_thr_b3_cent(1:3,icount,j,i) - print*,'' - enddo -! enddo - enddo - enddo - print*,'max_List_comb_thr_b3_size = ',max_List_comb_thr_b3_size,List_env1s_square_size - -end - -subroutine routine_int2_u_grad1u_env2 - implicit none - integer :: i,j,ipoint,k,l - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += int2_u_grad1u_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += int2_u_grad1u_env2(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'routine_int2_u_grad1u_env2' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - - - -end - -subroutine routine_v_ij_erf_rk_cst_mu_env - implicit none - integer :: i,j,ipoint,k,l - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += v_ij_erf_rk_cst_mu_env_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += v_ij_erf_rk_cst_mu_env(j,i,ipoint) * 
aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'routine_v_ij_erf_rk_cst_mu_env' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - - - -end - - -subroutine routine_x_v_ij_erf_rk_cst_mu_env - implicit none - integer :: i,j,ipoint,k,l,m - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - do m = 1, 3 - array(j,i,l,k) += x_v_ij_erf_rk_cst_mu_env_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += x_v_ij_erf_rk_cst_mu_env (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - - print*,'******' - print*,'******' - print*,'routine_x_v_ij_erf_rk_cst_mu_env' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - - - -end - - - -subroutine routine_v_ij_u_cst_mu_env_test - implicit none - integer :: i,j,ipoint,k,l - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += v_ij_u_cst_mu_env_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += v_ij_u_cst_mu_env_fit (j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'routine_v_ij_u_cst_mu_env_test' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - -end - -subroutine routine_int2_grad1u2_grad2u2_env2 - implicit none - integer :: i,j,ipoint,k,l - integer :: ii , jj - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - double 
precision, allocatable :: ints(:,:,:) - allocate(ints(ao_num, ao_num, n_points_final_grid)) -! do ipoint = 1, n_points_final_grid -! do i = 1, ao_num -! do j = 1, ao_num -! read(33,*)ints(j,i,ipoint) -! enddo -! enddo -! enddo - - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += int2_grad1u2_grad2u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight -! !array(j,i,l,k) += int2_grad1u2_grad2u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight -! array_ref(j,i,l,k) += int2_grad1u2_grad2u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight -! !array(j,i,l,k) += ints(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight -! array_ref(j,i,l,k) += int2_grad1u2_grad2u2_env2(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += ints(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight -! if(dabs(int2_grad1u2_grad2u2_env2_test(j,i,ipoint)).gt.1.d-6)then -! if(dabs(int2_grad1u2_grad2u2_env2_test(j,i,ipoint) - int2_grad1u2_grad2u2_env2_test(j,i,ipoint)).gt.1.d-6)then -! print*,j,i,ipoint -! print*,int2_grad1u2_grad2u2_env2_test(j,i,ipoint) , int2_grad1u2_grad2u2_env2_test(j,i,ipoint), dabs(int2_grad1u2_grad2u2_env2_test(j,i,ipoint) - int2_grad1u2_grad2u2_env2_test(j,i,ipoint)) -! print*,int2_grad1u2_grad2u2_env2_test(i,j,ipoint) , int2_grad1u2_grad2u2_env2_test(i,j,ipoint), dabs(int2_grad1u2_grad2u2_env2_test(i,j,ipoint) - int2_grad1u2_grad2u2_env2_test(i,j,ipoint)) -! stop -! endif -! endif - enddo - enddo - enddo - enddo - enddo - double precision :: e_ref, e_new - accu_relat = 0.d0 - accu_abs = 0.d0 - e_ref = 0.d0 - e_new = 0.d0 - do ii = 1, elec_alpha_num - do jj = ii, elec_alpha_num - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - e_ref += mo_coef(j,ii) * mo_coef(i,ii) * array_ref(j,i,l,k) * mo_coef(l,jj) * mo_coef(k,jj) - e_new += mo_coef(j,ii) * mo_coef(i,ii) * array(j,i,l,k) * mo_coef(l,jj) * mo_coef(k,jj) - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib -! if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then -! accu_relat += contrib/dabs(array_ref(j,i,l,k)) -! endif - enddo - enddo - enddo - enddo - - enddo - enddo - print*,'e_ref = ',e_ref - print*,'e_new = ',e_new -! print*,'accu_abs = ',accu_abs/dble(ao_num)**4 -! 
print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - - - -end - -subroutine routine_int2_u2_env2 - implicit none - integer :: i,j,ipoint,k,l - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += int2_u2_env2_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += int2_u2_env2(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'routine_int2_u2_env2' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - - - -end - - -subroutine routine_int2_u_grad1u_x_env2 - implicit none - integer :: i,j,ipoint,k,l,m - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - do m = 1, 3 - array(j,i,l,k) += int2_u_grad1u_x_env2_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += int2_u_grad1u_x_env2 (j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'routine_int2_u_grad1u_x_env2' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - - - -end - -subroutine routine_v_ij_u_cst_mu_env - implicit none - integer :: i,j,ipoint,k,l - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += v_ij_u_cst_mu_env_test(j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += v_ij_u_cst_mu_env_fit (j,i,ipoint) * aos_in_r_array(k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 
1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'routine_v_ij_u_cst_mu_env' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 - -end - -! --- - -subroutine test_fock_3e_uhf_ao() - - implicit none - integer :: i, j - double precision :: diff_tot, diff_ij, thr_ih, norm - double precision, allocatable :: fock_3e_uhf_ao_a_mo(:,:), fock_3e_uhf_ao_b_mo(:,:) - - thr_ih = 1d-7 - - PROVIDE fock_a_tot_3e_bi_orth fock_b_tot_3e_bi_orth - PROVIDE fock_3e_uhf_ao_a fock_3e_uhf_ao_b - - ! --- - - allocate(fock_3e_uhf_ao_a_mo(mo_num,mo_num)) - call ao_to_mo_bi_ortho( fock_3e_uhf_ao_a , size(fock_3e_uhf_ao_a , 1) & - , fock_3e_uhf_ao_a_mo, size(fock_3e_uhf_ao_a_mo, 1) ) - - norm = 0.d0 - diff_tot = 0.d0 - do i = 1, mo_num - do j = 1, mo_num - - diff_ij = dabs(fock_3e_uhf_ao_a_mo(j,i) - fock_a_tot_3e_bi_orth(j,i)) - if(diff_ij .gt. thr_ih) then - print *, ' difference on ', j, i - print *, ' MANU : ', fock_a_tot_3e_bi_orth(j,i) - print *, ' UHF : ', fock_3e_uhf_ao_a_mo (j,i) - !stop - endif - - norm += dabs(fock_a_tot_3e_bi_orth(j,i)) - diff_tot += diff_ij - enddo - enddo - print *, ' diff on F_a = ', diff_tot / norm - print *, ' ' - - deallocate(fock_3e_uhf_ao_a_mo) - - ! --- - - allocate(fock_3e_uhf_ao_b_mo(mo_num,mo_num)) - call ao_to_mo_bi_ortho( fock_3e_uhf_ao_b , size(fock_3e_uhf_ao_b , 1) & - , fock_3e_uhf_ao_b_mo, size(fock_3e_uhf_ao_b_mo, 1) ) - - norm = 0.d0 - diff_tot = 0.d0 - do i = 1, mo_num - do j = 1, mo_num - - diff_ij = dabs(fock_3e_uhf_ao_b_mo(j,i) - fock_b_tot_3e_bi_orth(j,i)) - if(diff_ij .gt. thr_ih) then - print *, ' difference on ', j, i - print *, ' MANU : ', fock_b_tot_3e_bi_orth(j,i) - print *, ' UHF : ', fock_3e_uhf_ao_b_mo (j,i) - !stop - endif - - norm += dabs(fock_b_tot_3e_bi_orth(j,i)) - diff_tot += diff_ij - enddo - enddo - print *, ' diff on F_b = ', diff_tot/norm - print *, ' ' - - deallocate(fock_3e_uhf_ao_b_mo) - - ! --- - -end subroutine test_fock_3e_uhf_ao() - -! --- - -subroutine test_fock_3e_uhf_mo() - - implicit none - integer :: i, j - double precision :: diff_tot, diff_ij, thr_ih, norm - - thr_ih = 1d-12 - - PROVIDE fock_a_tot_3e_bi_orth fock_b_tot_3e_bi_orth - PROVIDE fock_3e_uhf_mo_a fock_3e_uhf_mo_b - - ! --- - - norm = 0.d0 - diff_tot = 0.d0 - do i = 1, mo_num - do j = 1, mo_num - - diff_ij = dabs(fock_3e_uhf_mo_a(j,i) - fock_a_tot_3e_bi_orth(j,i)) - if(diff_ij .gt. thr_ih) then - print *, ' difference on ', j, i - print *, ' MANU : ', fock_a_tot_3e_bi_orth(j,i) - print *, ' UHF : ', fock_3e_uhf_mo_a (j,i) - !stop - endif - - norm += dabs(fock_a_tot_3e_bi_orth(j,i)) - diff_tot += diff_ij - enddo - enddo - print *, ' diff on F_a = ', diff_tot / norm - print *, ' norm_a = ', norm - print *, ' ' - - ! --- - - norm = 0.d0 - diff_tot = 0.d0 - do i = 1, mo_num - do j = 1, mo_num - - diff_ij = dabs(fock_3e_uhf_mo_b(j,i) - fock_b_tot_3e_bi_orth(j,i)) - if(diff_ij .gt. thr_ih) then - print *, ' difference on ', j, i - print *, ' MANU : ', fock_b_tot_3e_bi_orth(j,i) - print *, ' UHF : ', fock_3e_uhf_mo_b (j,i) - !stop - endif - - norm += dabs(fock_b_tot_3e_bi_orth(j,i)) - diff_tot += diff_ij - enddo - enddo - print *, ' diff on F_b = ', diff_tot/norm - print *, ' norm_b = ', norm - print *, ' ' - - ! --- - -end - -! 
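Every test_* routine above and below follows the same verification pattern: accumulate the absolute deviation between a reference array and a new implementation, flag elements above a threshold, and report the total normalised either by the number of elements or by the reference norm. A self-contained sketch of that pattern is given below; the names and shapes are illustrative only, not the providers used in these tests.

      ! Compare a reference matrix with a new implementation: print any element
      ! whose deviation exceeds thr, then report mean absolute and norm-relative
      ! errors.  Names and shapes are illustrative only.
      subroutine compare_matrices(n, a_ref, a_new, thr)
        implicit none
        integer,          intent(in) :: n
        double precision, intent(in) :: a_ref(n,n), a_new(n,n), thr
        integer                      :: i, j
        double precision             :: d, acc_abs, norm
        acc_abs = 0.d0
        norm    = 0.d0
        do i = 1, n
          do j = 1, n
            d = dabs(a_new(j,i) - a_ref(j,i))
            if(d .gt. thr) then
              print *, ' difference on ', j, i
              print *, ' ref : ', a_ref(j,i)
              print *, ' new : ', a_new(j,i)
            endif
            acc_abs = acc_abs + d
            norm    = norm    + dabs(a_ref(j,i))
          enddo
        enddo
        print *, ' mean abs diff = ', acc_abs / dble(n*n)
        print *, ' relative diff = ', acc_abs / norm
      end subroutine compare_matrices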
--- - -subroutine test_grid_points_ao - implicit none - integer :: i,j,ipoint,icount,icount_good, icount_bad,icount_full - double precision :: thr - thr = 1.d-10 -! print*,'max_n_pts_grid_ao_prod = ',max_n_pts_grid_ao_prod -! print*,'n_pts_grid_ao_prod' - do i = 1, ao_num - do j = i, ao_num - icount = 0 - icount_good = 0 - icount_bad = 0 - icount_full = 0 - do ipoint = 1, n_points_final_grid -! if(dabs(int2_u_grad1u_x_env2_test(j,i,ipoint,1)) & -! + dabs(int2_u_grad1u_x_env2_test(j,i,ipoint,2)) & -! + dabs(int2_u_grad1u_x_env2_test(j,i,ipoint,3)) ) -! if(dabs(int2_u2_env2_test(j,i,ipoint)).gt.thr)then -! icount += 1 -! endif - if(dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)).gt.thr*0.1d0)then - icount_full += 1 - endif - if(dabs(v_ij_u_cst_mu_env_test(j,i,ipoint)).gt.thr)then - icount += 1 - if(dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)).gt.thr*0.1d0)then - icount_good += 1 - else - print*,j,i,ipoint - print*,dabs(v_ij_u_cst_mu_env_test(j,i,ipoint)), dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)),dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint))/dabs(v_ij_u_cst_mu_env_test(j,i,ipoint)) - icount_bad += 1 - endif - endif -! if(dabs(v_ij_u_cst_mu_env_ng_1_test(j,i,ipoint)).gt.thr)then -! endif - enddo - print*,'' - print*,j,i - print*,icount,icount_full, icount_bad!,n_pts_grid_ao_prod(j,i) - print*,dble(icount)/dble(n_points_final_grid),dble(icount_full)/dble(n_points_final_grid) -! dble(n_pts_grid_ao_prod(j,i))/dble(n_points_final_grid) -! if(icount.gt.n_pts_grid_ao_prod(j,i))then -! print*,'pb !!' -! endif - enddo - enddo -end - -subroutine test_int_gauss - implicit none - integer :: i,j - print*,'center' - do i = 1, ao_num - do j = i, ao_num - print*,j,i - print*,ao_prod_sigma(j,i),ao_overlap_abs_grid(j,i) - print*,ao_prod_center(1:3,j,i) - enddo - enddo - print*,'' - double precision :: weight, r(3),integral_1,pi,center(3),f_r,alpha,distance,integral_2 - center = 0.d0 - pi = dacos(-1.d0) - integral_1 = 0.d0 - integral_2 = 0.d0 - alpha = 0.75d0 - do i = 1, n_points_final_grid - ! you get x, y and z of the ith grid point - r(1) = final_grid_points(1,i) - r(2) = final_grid_points(2,i) - r(3) = final_grid_points(3,i) - weight = final_weight_at_r_vector(i) - distance = dsqrt( (r(1) - center(1))**2 + (r(2) - center(2))**2 + (r(3) - center(3))**2 ) - f_r = dexp(-alpha * distance*distance) - ! you add the contribution of the grid point to the integral - integral_1 += f_r * weight - integral_2 += f_r * distance * weight - enddo - print*,'integral_1 =',integral_1 - print*,'(pi/alpha)**1.5 =',(pi / alpha)**1.5 - print*,'integral_2 =',integral_2 - print*,'(pi/alpha)**1.5 =',2.d0*pi / (alpha)**2 - - -end - -! --- - -subroutine test_two_e_tc_non_hermit_integral() - - implicit none - integer :: i, j - double precision :: diff_tot, diff, thr_ih, norm - - thr_ih = 1d-10 - - PROVIDE two_e_tc_non_hermit_integral_beta two_e_tc_non_hermit_integral_alpha - PROVIDE two_e_tc_non_hermit_integral_seq_beta two_e_tc_non_hermit_integral_seq_alpha - - ! --- - - norm = 0.d0 - diff_tot = 0.d0 - do i = 1, ao_num - do j = 1, ao_num - - diff = dabs(two_e_tc_non_hermit_integral_seq_alpha(j,i) - two_e_tc_non_hermit_integral_alpha(j,i)) - if(diff .gt. 
thr_ih) then - print *, ' difference on ', j, i - print *, ' seq : ', two_e_tc_non_hermit_integral_seq_alpha(j,i) - print *, ' // : ', two_e_tc_non_hermit_integral_alpha (j,i) - !stop - endif - - norm += dabs(two_e_tc_non_hermit_integral_seq_alpha(j,i)) - diff_tot += diff - enddo - enddo - - print *, ' diff tot a = ', diff_tot / norm - print *, ' norm a = ', norm - print *, ' ' - - ! --- - - norm = 0.d0 - diff_tot = 0.d0 - do i = 1, ao_num - do j = 1, ao_num - - diff = dabs(two_e_tc_non_hermit_integral_seq_beta(j,i) - two_e_tc_non_hermit_integral_beta(j,i)) - if(diff .gt. thr_ih) then - print *, ' difference on ', j, i - print *, ' seq : ', two_e_tc_non_hermit_integral_seq_beta(j,i) - print *, ' // : ', two_e_tc_non_hermit_integral_beta (j,i) - !stop - endif - - norm += dabs(two_e_tc_non_hermit_integral_seq_beta(j,i)) - diff_tot += diff - enddo - enddo - - print *, ' diff tot b = ', diff_tot / norm - print *, ' norm b = ', norm - print *, ' ' - - ! --- - - return - -end - -! --- - -subroutine test_int2_grad1_u12_ao_test - implicit none - integer :: i,j,ipoint,m,k,l - double precision :: weight,accu_relat, accu_abs, contrib - double precision, allocatable :: array(:,:,:,:), array_ref(:,:,:,:) - allocate(array(ao_num, ao_num, ao_num, ao_num)) - array = 0.d0 - allocate(array_ref(ao_num, ao_num, ao_num, ao_num)) - array_ref = 0.d0 - do m = 1, 3 - do ipoint = 1, n_points_final_grid - weight = final_weight_at_r_vector(ipoint) - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - array(j,i,l,k) += int2_grad1_u12_ao_test(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight - array_ref(j,i,l,k) += int2_grad1_u12_ao(j,i,ipoint,m) * aos_grad_in_r_array_transp(m,k,ipoint) * aos_in_r_array(l,ipoint) * weight - enddo - enddo - enddo - enddo - enddo - enddo - - accu_relat = 0.d0 - accu_abs = 0.d0 - do k = 1, ao_num - do l = 1, ao_num - do i = 1, ao_num - do j = 1, ao_num - contrib = dabs(array(j,i,l,k) - array_ref(j,i,l,k)) - accu_abs += contrib - if(dabs(array_ref(j,i,l,k)).gt.1.d-10)then - accu_relat += contrib/dabs(array_ref(j,i,l,k)) - endif - enddo - enddo - enddo - enddo - print*,'******' - print*,'******' - print*,'test_int2_grad1_u12_ao_test' - print*,'accu_abs = ',accu_abs/dble(ao_num)**4 - print*,'accu_relat = ',accu_relat/dble(ao_num)**4 -end - -! --- - -subroutine test_fock_3e_uhf_mo_cs() - - implicit none - integer :: i, j - double precision :: I_old, I_new - double precision :: diff_tot, diff, thr_ih, norm - -! double precision :: t0, t1 -! print*, ' Providing fock_a_tot_3e_bi_orth ...' -! call wall_time(t0) -! PROVIDE fock_a_tot_3e_bi_orth -! call wall_time(t1) -! print*, ' Wall time for fock_a_tot_3e_bi_orth =', t1 - t0 - - PROVIDE fock_3e_uhf_mo_cs fock_3e_uhf_mo_cs_old - - thr_ih = 1d-8 - norm = 0.d0 - diff_tot = 0.d0 - - do i = 1, mo_num - do j = 1, mo_num - - I_old = fock_3e_uhf_mo_cs_old(j,i) - I_new = fock_3e_uhf_mo_cs (j,i) - - diff = dabs(I_old - I_new) - if(diff .gt. thr_ih) then - print *, ' problem in fock_3e_uhf_mo_cs on ', j, i - print *, ' old value = ', I_old - print *, ' new value = ', I_new - !stop - endif - - norm += dabs(I_old) - diff_tot += diff - enddo - enddo - - print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm - - return -end - -! 
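For the record, the two reference values against which the removed test_int_gauss routine (a few routines above) checks its grid quadrature are the closed-form Gaussian integrals

$\int e^{-\alpha r^{2}}\, d^{3}r = \left(\frac{\pi}{\alpha}\right)^{3/2}, \qquad \int r\, e^{-\alpha r^{2}}\, d^{3}r = \frac{2\pi}{\alpha^{2}},$

which is why the code prints (pi/alpha)**1.5 next to integral_1 and 2.d0*pi/(alpha)**2 next to integral_2 (the label of the second print in the removed code reuses the first formula, but the printed value is indeed $2\pi/\alpha^{2}$).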
--- - -subroutine test_fock_3e_uhf_mo_a() - - implicit none - integer :: i, j - double precision :: I_old, I_new - double precision :: diff_tot, diff, thr_ih, norm - - PROVIDE fock_3e_uhf_mo_a fock_3e_uhf_mo_a_old - - thr_ih = 1d-8 - norm = 0.d0 - diff_tot = 0.d0 - - do i = 1, mo_num - do j = 1, mo_num - - I_old = fock_3e_uhf_mo_a_old(j,i) - I_new = fock_3e_uhf_mo_a (j,i) - - diff = dabs(I_old - I_new) - if(diff .gt. thr_ih) then - print *, ' problem in fock_3e_uhf_mo_a on ', j, i - print *, ' old value = ', I_old - print *, ' new value = ', I_new - !stop - endif - - norm += dabs(I_old) - diff_tot += diff - enddo - enddo - - print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm - - return -end - -! --- - -subroutine test_fock_3e_uhf_mo_b() - - implicit none - integer :: i, j - double precision :: I_old, I_new - double precision :: diff_tot, diff, thr_ih, norm - - PROVIDE fock_3e_uhf_mo_b fock_3e_uhf_mo_b_old - - thr_ih = 1d-8 - norm = 0.d0 - diff_tot = 0.d0 - - do i = 1, mo_num - do j = 1, mo_num - - I_old = fock_3e_uhf_mo_b_old(j,i) - I_new = fock_3e_uhf_mo_b (j,i) - - diff = dabs(I_old - I_new) - if(diff .gt. thr_ih) then - print *, ' problem in fock_3e_uhf_mo_b on ', j, i - print *, ' old value = ', I_old - print *, ' new value = ', I_new - !stop - endif - - norm += dabs(I_old) - diff_tot += diff - enddo - enddo - - print *, ' diff tot (%) = ', 100.d0 * diff_tot / norm - - return -end - -! --- - diff --git a/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f b/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f deleted file mode 100644 index 0c9ebbd7..00000000 --- a/plugins/local/tc_scf/three_e_energy_bi_ortho.irp.f +++ /dev/null @@ -1,189 +0,0 @@ - -subroutine contrib_3e_diag_sss(i, j, k, integral) - - BEGIN_DOC - ! returns the pure same spin contribution to diagonal matrix element of 3e term - END_DOC - - implicit none - integer, intent(in) :: i, j, k - double precision, intent(out) :: integral - double precision :: direct_int, exch_13_int, exch_23_int, exch_12_int, c_3_int, c_minus_3_int - - call give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int )!!! < i k j | i k j > - call give_integrals_3_body_bi_ort(i, k, j, j, i, k, c_3_int) ! < i k j | j i k > - call give_integrals_3_body_bi_ort(i, k, j, k, j, i, c_minus_3_int)! < i k j | k j i > - integral = direct_int + c_3_int + c_minus_3_int - - ! negative terms :: exchange contrib - call give_integrals_3_body_bi_ort(i, k, j, j, k, i, exch_13_int)!!! < i k j | j k i > : E_13 - call give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)!!! < i k j | i j k > : E_23 - call give_integrals_3_body_bi_ort(i, k, j, k, i, j, exch_12_int)!!! < i k j | k i j > : E_12 - - integral += - exch_13_int - exch_23_int - exch_12_int - integral = -integral - -end - -! --- - -subroutine contrib_3e_diag_soo(i,j,k,integral) - implicit none - integer, intent(in) :: i,j,k - BEGIN_DOC - ! returns the pure same spin contribution to diagonal matrix element of 3e term - END_DOC - double precision, intent(out) :: integral - double precision :: direct_int, exch_23_int - call give_integrals_3_body_bi_ort(i, k, j, i, k, j, direct_int) ! < i k j | i k j > - call give_integrals_3_body_bi_ort(i, k, j, i, j, k, exch_23_int)! 
< i k j | i j k > : E_23 - integral = direct_int - exch_23_int - integral = -integral -end - - -subroutine give_aaa_contrib_bis(integral_aaa) - implicit none - double precision, intent(out) :: integral_aaa - double precision :: integral - integer :: i,j,k - integral_aaa = 0.d0 - do i = 1, elec_alpha_num - do j = i+1, elec_alpha_num - do k = j+1, elec_alpha_num - call contrib_3e_diag_sss(i,j,k,integral) - integral_aaa += integral - enddo - enddo - enddo - -end - -! --- - -subroutine give_aaa_contrib(integral_aaa) - - implicit none - integer :: i, j, k - double precision :: integral - double precision, intent(out) :: integral_aaa - - integral_aaa = 0.d0 - do i = 1, elec_alpha_num - do j = 1, elec_alpha_num - do k = 1, elec_alpha_num - call contrib_3e_diag_sss(i, j, k, integral) - integral_aaa += integral - enddo - enddo - enddo - integral_aaa *= 1.d0/6.d0 - - return -end - -! --- - -subroutine give_aab_contrib(integral_aab) - implicit none - double precision, intent(out) :: integral_aab - double precision :: integral - integer :: i,j,k - integral_aab = 0.d0 - do i = 1, elec_beta_num - do j = 1, elec_alpha_num - do k = 1, elec_alpha_num - call contrib_3e_diag_soo(i,j,k,integral) - integral_aab += integral - enddo - enddo - enddo - integral_aab *= 0.5d0 -end - - -subroutine give_aab_contrib_bis(integral_aab) - implicit none - double precision, intent(out) :: integral_aab - double precision :: integral - integer :: i,j,k - integral_aab = 0.d0 - do i = 1, elec_beta_num - do j = 1, elec_alpha_num - do k = j+1, elec_alpha_num - call contrib_3e_diag_soo(i,j,k,integral) - integral_aab += integral - enddo - enddo - enddo -end - - -subroutine give_abb_contrib(integral_abb) - implicit none - double precision, intent(out) :: integral_abb - double precision :: integral - integer :: i,j,k - integral_abb = 0.d0 - do i = 1, elec_alpha_num - do j = 1, elec_beta_num - do k = 1, elec_beta_num - call contrib_3e_diag_soo(i,j,k,integral) - integral_abb += integral - enddo - enddo - enddo - integral_abb *= 0.5d0 -end - -subroutine give_abb_contrib_bis(integral_abb) - implicit none - double precision, intent(out) :: integral_abb - double precision :: integral - integer :: i,j,k - integral_abb = 0.d0 - do i = 1, elec_alpha_num - do j = 1, elec_beta_num - do k = j+1, elec_beta_num - call contrib_3e_diag_soo(i,j,k,integral) - integral_abb += integral - enddo - enddo - enddo -end - -subroutine give_bbb_contrib_bis(integral_bbb) - implicit none - double precision, intent(out) :: integral_bbb - double precision :: integral - integer :: i,j,k - integral_bbb = 0.d0 - do i = 1, elec_beta_num - do j = i+1, elec_beta_num - do k = j+1, elec_beta_num - call contrib_3e_diag_sss(i,j,k,integral) - integral_bbb += integral - enddo - enddo - enddo - -end - -subroutine give_bbb_contrib(integral_bbb) - implicit none - double precision, intent(out) :: integral_bbb - double precision :: integral - integer :: i,j,k - integral_bbb = 0.d0 - do i = 1, elec_beta_num - do j = 1, elec_beta_num - do k = 1, elec_beta_num - call contrib_3e_diag_sss(i,j,k,integral) - integral_bbb += integral - enddo - enddo - enddo - integral_bbb *= 1.d0/6.d0 -end - - diff --git a/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f new file mode 100644 index 00000000..ec5167d1 --- /dev/null +++ b/plugins/local/tc_scf/write_ao_2e_tc_integ.irp.f @@ -0,0 +1,56 @@ +! 
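The prefactors in the removed three-body energy routines above come from unrestricted summation: the diagonal element built by contrib_3e_diag_sss is the fully antisymmetrized same-spin combination, which is invariant under permutations of (i, j, k) and vanishes when two indices coincide, so summing it over all ordered same-spin triples counts each unique triple 3! times, hence the 1/6 in give_aaa_contrib and give_bbb_contrib; likewise the mixed-spin element of contrib_3e_diag_soo is symmetric in its same-spin pair, hence the 1/2 in give_aab_contrib and give_abb_contrib. The *_bis variants make this explicit by restricting the loops instead:

$E^{\sigma\sigma\sigma}_{3e} \;=\; \frac{1}{3!}\sum_{ijk}^{{\rm occ},\sigma} I_{ijk} \;=\; \sum_{i<j<k}^{{\rm occ},\sigma} I_{ijk}.$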
--- + +program write_ao_2e_tc_integ + + implicit none + + print *, ' j2e_type = ', j2e_type + print *, ' j1e_type = ', j1e_type + print *, ' env_type = ', env_type + + my_grid_becke = .True. + PROVIDE tc_grid1_a tc_grid1_r + my_n_pt_r_grid = tc_grid1_r + my_n_pt_a_grid = tc_grid1_a + touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid + + call write_int(6, my_n_pt_r_grid, 'radial external grid over') + call write_int(6, my_n_pt_a_grid, 'angular external grid over') + + if(tc_integ_type .eq. "numeric") then + my_extra_grid_becke = .True. + PROVIDE tc_grid2_a tc_grid2_r + my_n_pt_r_extra_grid = tc_grid2_r + my_n_pt_a_extra_grid = tc_grid2_a + touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid + + call write_int(6, my_n_pt_r_extra_grid, 'radial internal grid over') + call write_int(6, my_n_pt_a_extra_grid, 'angular internal grid over') + endif + + call main() + +end + +! --- + +subroutine main() + + implicit none + + PROVIDE io_tc_integ + + print*, 'io_tc_integ = ', io_tc_integ + + if(io_tc_integ .ne. "Write") then + print*, 'io_tc_integ != Write' + print*, io_tc_integ + stop + endif + + PROVIDE ao_two_e_tc_tot + +end + +! --- + diff --git a/plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f b/plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f new file mode 100644 index 00000000..d09d100a --- /dev/null +++ b/plugins/local/tuto_plugins/tuto_I/test_cholesky.irp.f @@ -0,0 +1,53 @@ +program my_program_to_print_stuffs + implicit none + BEGIN_DOC +! TODO : Put the documentation of the program here + END_DOC + integer :: i,j,k,l,m + double precision :: integral, accu, accu_tot, integral_cholesky + double precision :: get_ao_two_e_integral, get_two_e_integral ! declaration of the functions + print*,'AO integrals, physicist notations : ' + accu_tot = 0.D0 + do i = 1, ao_num + do j = 1, ao_num + do k = 1, ao_num + do l = 1, ao_num + integral = get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + integral_cholesky = 0.D0 + do m = 1, cholesky_ao_num + integral_cholesky += cholesky_ao_transp(m,i,k) * cholesky_ao_transp(m,j,l) + enddo + accu = dabs(integral_cholesky-integral) + accu_tot += accu + if(accu.gt.1.d-10)then + print*,i,j,k,l + print*,accu, integral, integral_cholesky + endif + enddo + enddo + enddo + enddo + print*,'accu_tot',accu_tot + + print*,'MO integrals, physicist notations : ' + do i = 1, mo_num + do j = 1, mo_num + do k = 1, mo_num + do l = 1, mo_num + integral = get_two_e_integral(i, j, k, l, mo_integrals_map) + accu = 0.D0 + integral_cholesky = 0.D0 + do m = 1, cholesky_mo_num + integral_cholesky += cholesky_mo_transp(m,i,k) * cholesky_mo_transp(m,j,l) + enddo + accu = dabs(integral_cholesky-integral) + accu_tot += accu + if(accu.gt.1.d-10)then + print*,i,j,k,l + print*,accu, integral, integral_cholesky + endif + enddo + enddo + enddo + enddo +end diff --git a/scripts/get_fci_tc_conv.sh b/scripts/get_fci_tc_conv.sh index 643f3ac0..f0c99baf 100755 --- a/scripts/get_fci_tc_conv.sh +++ b/scripts/get_fci_tc_conv.sh @@ -1,2 +1,2 @@ file=$1 -grep "Ndet,E,E+PT2,E+RPT2,|PT2|=" $file | cut -d "=" -f 2 > ${file}.conv_fci_tc +grep "Ndet,E,E+PT2,pt2_minus,pt2_plus,pt2_abs=" $file | cut -d "=" -f 2 > ${file}.conv_fci_tc diff --git a/src/ao_basis/aos_in_r.irp.f b/src/ao_basis/aos_in_r.irp.f index 1b1595a3..053c86a2 100644 --- a/src/ao_basis/aos_in_r.irp.f +++ b/src/ao_basis/aos_in_r.irp.f @@ -1,67 +1,76 @@ -double precision function ao_value(i,r) - implicit none - BEGIN_DOC -! 
Returns the value of the i-th ao at point $\textbf{r}$ - END_DOC - double precision, intent(in) :: r(3) - integer, intent(in) :: i - integer :: m,num_ao - double precision :: center_ao(3) - double precision :: beta - integer :: power_ao(3) - double precision :: accu,dx,dy,dz,r2 - num_ao = ao_nucl(i) - power_ao(1:3)= ao_power(i,1:3) - center_ao(1:3) = nucl_coord(num_ao,1:3) - dx = (r(1) - center_ao(1)) - dy = (r(2) - center_ao(2)) - dz = (r(3) - center_ao(3)) - r2 = dx*dx + dy*dy + dz*dz - dx = dx**power_ao(1) - dy = dy**power_ao(2) - dz = dz**power_ao(3) +! --- - accu = 0.d0 - do m=1,ao_prim_num(i) - beta = ao_expo_ordered_transp(m,i) - accu += ao_coef_normalized_ordered_transp(m,i) * dexp(-beta*r2) - enddo - ao_value = accu * dx * dy * dz +double precision function ao_value(i, r) + + BEGIN_DOC + ! Returns the value of the i-th ao at point $\textbf{r}$ + END_DOC + + implicit none + integer, intent(in) :: i + double precision, intent(in) :: r(3) + + integer :: m, num_ao + integer :: power_ao(3) + double precision :: center_ao(3) + double precision :: beta + double precision :: accu, dx, dy, dz, r2 + + num_ao = ao_nucl(i) + power_ao(1:3) = ao_power(i,1:3) + center_ao(1:3) = nucl_coord(num_ao,1:3) + dx = r(1) - center_ao(1) + dy = r(2) - center_ao(2) + dz = r(3) - center_ao(3) + r2 = dx*dx + dy*dy + dz*dz + dx = dx**power_ao(1) + dy = dy**power_ao(2) + dz = dz**power_ao(3) + + accu = 0.d0 + do m = 1, ao_prim_num(i) + beta = ao_expo_ordered_transp(m,i) + accu += ao_coef_normalized_ordered_transp(m,i) * dexp(-beta*r2) + enddo + ao_value = accu * dx * dy * dz end -double precision function primitive_value(i,j,r) - implicit none - BEGIN_DOC -! Returns the value of the j-th primitive of the i-th |AO| at point $\textbf{r} -! **without the coefficient** - END_DOC - double precision, intent(in) :: r(3) - integer, intent(in) :: i,j +double precision function primitive_value(i, j, r) - integer :: m,num_ao - double precision :: center_ao(3) - double precision :: beta - integer :: power_ao(3) - double precision :: accu,dx,dy,dz,r2 - num_ao = ao_nucl(i) - power_ao(1:3)= ao_power(i,1:3) - center_ao(1:3) = nucl_coord(num_ao,1:3) - dx = (r(1) - center_ao(1)) - dy = (r(2) - center_ao(2)) - dz = (r(3) - center_ao(3)) - r2 = dx*dx + dy*dy + dz*dz - dx = dx**power_ao(1) - dy = dy**power_ao(2) - dz = dz**power_ao(3) + BEGIN_DOC + ! Returns the value of the j-th primitive of the i-th |AO| at point $\textbf{r} + ! 
**without the coefficient** + END_DOC - accu = 0.d0 - m=j - beta = ao_expo_ordered_transp(m,i) - accu += dexp(-beta*r2) - primitive_value = accu * dx * dy * dz + implicit none + integer, intent(in) :: i, j + double precision, intent(in) :: r(3) + + integer :: m, num_ao + integer :: power_ao(3) + double precision :: center_ao(3) + double precision :: beta + double precision :: accu, dx, dy, dz, r2 + + num_ao = ao_nucl(i) + power_ao(1:3)= ao_power(i,1:3) + center_ao(1:3) = nucl_coord(num_ao,1:3) + dx = r(1) - center_ao(1) + dy = r(2) - center_ao(2) + dz = r(3) - center_ao(3) + r2 = dx*dx + dy*dy + dz*dz + dx = dx**power_ao(1) + dy = dy**power_ao(2) + dz = dz**power_ao(3) + + accu = 0.d0 + m = j + beta = ao_expo_ordered_transp(m,i) + accu += dexp(-beta*r2) + primitive_value = accu * dx * dy * dz end @@ -104,9 +113,9 @@ subroutine give_all_aos_at_r(r, tmp_array) dz2 = dz**p_ao(3) tmp_array(k) = 0.d0 - do l = 1,ao_prim_num(k) + do l = 1, ao_prim_num(k) beta = ao_expo_ordered_transp_per_nucl(l,j,i) - if(dabs(beta*r2).gt.40.d0) cycle + if(beta*r2.gt.50.d0) cycle tmp_array(k) += ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2) enddo @@ -120,207 +129,232 @@ end ! --- -subroutine give_all_aos_and_grad_at_r(r,aos_array,aos_grad_array) - implicit none - BEGIN_DOC -! input : r(1) ==> r(1) = x, r(2) = y, r(3) = z -! -! output : -! -! * aos_array(i) = ao(i) evaluated at ro -! * aos_grad_array(1,i) = gradient X of the ao(i) evaluated at $\textbf{r}$ -! - END_DOC - double precision, intent(in) :: r(3) - double precision, intent(out) :: aos_array(ao_num) - double precision, intent(out) :: aos_grad_array(3,ao_num) +subroutine give_all_aos_and_grad_at_r(r, aos_array, aos_grad_array) - integer :: power_ao(3) - integer :: i,j,k,l,m - double precision :: dx,dy,dz,r2 - double precision :: dx2,dy2,dz2 - double precision :: dx1,dy1,dz1 - double precision :: center_ao(3) - double precision :: beta,accu_1,accu_2,contrib - do i = 1, nucl_num - center_ao(1:3) = nucl_coord(i,1:3) - dx = (r(1) - center_ao(1)) - dy = (r(2) - center_ao(2)) - dz = (r(3) - center_ao(3)) - r2 = dx*dx + dy*dy + dz*dz - do j = 1,Nucl_N_Aos(i) - k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format - aos_array(k) = 0.d0 - aos_grad_array(1,k) = 0.d0 - aos_grad_array(2,k) = 0.d0 - aos_grad_array(3,k) = 0.d0 - power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i) - dx2 = dx**power_ao(1) - dy2 = dy**power_ao(2) - dz2 = dz**power_ao(3) - if(power_ao(1) .ne. 0)then - dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1) - else - dx1 = 0.d0 - endif - if(power_ao(2) .ne. 0)then - dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1) - else - dy1 = 0.d0 - endif - if(power_ao(3) .ne. 0)then - dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1) - else - dz1 = 0.d0 - endif - accu_1 = 0.d0 - accu_2 = 0.d0 - do l = 1,ao_prim_num(k) - beta = ao_expo_ordered_transp_per_nucl(l,j,i) - contrib = 0.d0 - if(beta*r2.gt.50.d0)cycle - contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2) - accu_1 += contrib - accu_2 += contrib * beta - enddo - aos_array(k) = accu_1 * dx2 * dy2 * dz2 - aos_grad_array(1,k) = accu_1 * dx1 * dy2 * dz2- 2.d0 * dx2 * dx * dy2 * dz2 * accu_2 - aos_grad_array(2,k) = accu_1 * dx2 * dy1 * dz2- 2.d0 * dx2 * dy2 * dy * dz2 * accu_2 - aos_grad_array(3,k) = accu_1 * dx2 * dy2 * dz1- 2.d0 * dx2 * dy2 * dz2 * dz * accu_2 + BEGIN_DOC + ! + ! input : r(1) ==> r(1) = x, r(2) = y, r(3) = z + ! + ! output : + ! + ! * aos_array(i) = ao(i) evaluated at ro + ! 
* aos_grad_array(1,i) = gradient X of the ao(i) evaluated at $\textbf{r}$ + ! + END_DOC + + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: aos_array(ao_num) + double precision, intent(out) :: aos_grad_array(3,ao_num) + + integer :: power_ao(3) + integer :: i, j, k, l, m + double precision :: dx, dy, dz, r2 + double precision :: dx1, dy1, dz1 + double precision :: dx2, dy2, dz2 + double precision :: center_ao(3) + double precision :: beta, accu_1, accu_2, contrib + + do i = 1, nucl_num + + center_ao(1:3) = nucl_coord(i,1:3) + + dx = r(1) - center_ao(1) + dy = r(2) - center_ao(2) + dz = r(3) - center_ao(3) + r2 = dx*dx + dy*dy + dz*dz + + do j = 1, Nucl_N_Aos(i) + + k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format + + aos_array(k) = 0.d0 + aos_grad_array(1,k) = 0.d0 + aos_grad_array(2,k) = 0.d0 + aos_grad_array(3,k) = 0.d0 + + power_ao(1:3) = ao_power_ordered_transp_per_nucl(1:3,j,i) + dx2 = dx**power_ao(1) + dy2 = dy**power_ao(2) + dz2 = dz**power_ao(3) + + dx1 = 0.d0 + if(power_ao(1) .ne. 0) then + dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1) + endif + + dy1 = 0.d0 + if(power_ao(2) .ne. 0) then + dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1) + endif + + dz1 = 0.d0 + if(power_ao(3) .ne. 0) then + dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1) + endif + + accu_1 = 0.d0 + accu_2 = 0.d0 + do l = 1, ao_prim_num(k) + beta = ao_expo_ordered_transp_per_nucl(l,j,i) + if(beta*r2.gt.50.d0) cycle + contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2) + accu_1 += contrib + accu_2 += contrib * beta + enddo + + aos_array(k) = accu_1 * dx2 * dy2 * dz2 + aos_grad_array(1,k) = accu_1 * dx1 * dy2 * dz2 - 2.d0 * dx2 * dx * dy2 * dz2 * accu_2 + aos_grad_array(2,k) = accu_1 * dx2 * dy1 * dz2 - 2.d0 * dx2 * dy2 * dy * dz2 * accu_2 + aos_grad_array(3,k) = accu_1 * dx2 * dy2 * dz1 - 2.d0 * dx2 * dy2 * dz2 * dz * accu_2 + enddo enddo - enddo + end +! --- -subroutine give_all_aos_and_grad_and_lapl_at_r(r,aos_array,aos_grad_array,aos_lapl_array) - implicit none - BEGIN_DOC -! input : r(1) ==> r(1) = x, r(2) = y, r(3) = z -! -! output : -! -! * aos_array(i) = ao(i) evaluated at $\textbf{r}$ -! * aos_grad_array(1,i) = $\nabla_x$ of the ao(i) evaluated at $\textbf{r}$ - END_DOC - double precision, intent(in) :: r(3) - double precision, intent(out) :: aos_array(ao_num) - double precision, intent(out) :: aos_grad_array(3,ao_num) - double precision, intent(out) :: aos_lapl_array(3,ao_num) +subroutine give_all_aos_and_grad_and_lapl_at_r(r, aos_array, aos_grad_array, aos_lapl_array) - integer :: power_ao(3) - integer :: i,j,k,l,m - double precision :: dx,dy,dz,r2 - double precision :: dx2,dy2,dz2 - double precision :: dx1,dy1,dz1 - double precision :: dx3,dy3,dz3 - double precision :: dx4,dy4,dz4 - double precision :: dx5,dy5,dz5 - double precision :: center_ao(3) - double precision :: beta,accu_1,accu_2,accu_3,contrib - do i = 1, nucl_num - center_ao(1:3) = nucl_coord(i,1:3) - dx = (r(1) - center_ao(1)) - dy = (r(2) - center_ao(2)) - dz = (r(3) - center_ao(3)) - r2 = dx*dx + dy*dy + dz*dz - do j = 1,Nucl_N_Aos(i) - k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format - aos_array(k) = 0.d0 - aos_grad_array(1,k) = 0.d0 - aos_grad_array(2,k) = 0.d0 - aos_grad_array(3,k) = 0.d0 + BEGIN_DOC + ! + ! input : r(1) ==> r(1) = x, r(2) = y, r(3) = z + ! + ! output : + ! + ! * aos_array(i) = ao(i) evaluated at $\textbf{r}$ + ! * aos_grad_array(1,i) = $\nabla_x$ of the ao(i) evaluated at $\textbf{r}$ + ! 
+ END_DOC - aos_lapl_array(1,k) = 0.d0 - aos_lapl_array(2,k) = 0.d0 - aos_lapl_array(3,k) = 0.d0 + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: aos_array(ao_num) + double precision, intent(out) :: aos_grad_array(3,ao_num) + double precision, intent(out) :: aos_lapl_array(3,ao_num) - power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i) - dx2 = dx**power_ao(1) - dy2 = dy**power_ao(2) - dz2 = dz**power_ao(3) - if(power_ao(1) .ne. 0)then - dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1) - else - dx1 = 0.d0 - endif - ! For the Laplacian - if(power_ao(1) .ge. 2)then - dx3 = dble(power_ao(1)) * dble((power_ao(1)-1)) * dx**(power_ao(1)-2) - else - dx3 = 0.d0 - endif - if(power_ao(1) .ge. 1)then - dx4 = dble((2 * power_ao(1) + 1)) * dx**(power_ao(1)) - else - dx4 = dble((power_ao(1) + 1)) * dx**(power_ao(1)) - endif + integer :: power_ao(3) + integer :: i, j, k, l, m + double precision :: dx, dy, dz, r2 + double precision :: dx1, dy1, dz1 + double precision :: dx2, dy2, dz2 + double precision :: dx3, dy3, dz3 + double precision :: dx4, dy4, dz4 + double precision :: dx5, dy5, dz5 + double precision :: center_ao(3) + double precision :: beta, accu_1, accu_2, accu_3, contrib - dx5 = dx**(power_ao(1)+2) + do i = 1, nucl_num - if(power_ao(2) .ne. 0)then - dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1) - else - dy1 = 0.d0 - endif - ! For the Laplacian - if(power_ao(2) .ge. 2)then - dy3 = dble(power_ao(2)) * dble((power_ao(2)-1)) * dy**(power_ao(2)-2) - else - dy3 = 0.d0 - endif + center_ao(1:3) = nucl_coord(i,1:3) - if(power_ao(2) .ge. 1)then - dy4 = dble((2 * power_ao(2) + 1)) * dy**(power_ao(2)) - else - dy4 = dble((power_ao(2) + 1)) * dy**(power_ao(2)) - endif + dx = r(1) - center_ao(1) + dy = r(2) - center_ao(2) + dz = r(3) - center_ao(3) + r2 = dx*dx + dy*dy + dz*dz + + do j = 1, Nucl_N_Aos(i) - dy5 = dy**(power_ao(2)+2) + k = Nucl_Aos_transposed(j,i) ! index of the ao in the ordered format + aos_array(k) = 0.d0 + aos_grad_array(1,k) = 0.d0 + aos_grad_array(2,k) = 0.d0 + aos_grad_array(3,k) = 0.d0 + aos_lapl_array(1,k) = 0.d0 + aos_lapl_array(2,k) = 0.d0 + aos_lapl_array(3,k) = 0.d0 + + power_ao(1:3)= ao_power_ordered_transp_per_nucl(1:3,j,i) + dx2 = dx**power_ao(1) + dy2 = dy**power_ao(2) + dz2 = dz**power_ao(3) - if(power_ao(3) .ne. 0)then - dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1) - else - dz1 = 0.d0 - endif - ! For the Laplacian - if(power_ao(3) .ge. 2)then - dz3 = dble(power_ao(3)) * dble((power_ao(3)-1)) * dz**(power_ao(3)-2) - else - dz3 = 0.d0 - endif + ! --- - if(power_ao(3) .ge. 1)then - dz4 = dble((2 * power_ao(3) + 1)) * dz**(power_ao(3)) - else - dz4 = dble((power_ao(3) + 1)) * dz**(power_ao(3)) - endif + dx1 = 0.d0 + if(power_ao(1) .ne. 0) then + dx1 = dble(power_ao(1)) * dx**(power_ao(1)-1) + endif - dz5 = dz**(power_ao(3)+2) + dx3 = 0.d0 + if(power_ao(1) .ge. 2) then + dx3 = dble(power_ao(1)) * dble((power_ao(1)-1)) * dx**(power_ao(1)-2) + endif + if(power_ao(1) .ge. 1) then + dx4 = dble((2 * power_ao(1) + 1)) * dx**(power_ao(1)) + else + dx4 = dble((power_ao(1) + 1)) * dx**(power_ao(1)) + endif + + dx5 = dx**(power_ao(1)+2) + + ! --- + + dy1 = 0.d0 + if(power_ao(2) .ne. 
0) then + dy1 = dble(power_ao(2)) * dy**(power_ao(2)-1) + endif - accu_1 = 0.d0 - accu_2 = 0.d0 - accu_3 = 0.d0 - do l = 1,ao_prim_num(k) - beta = ao_expo_ordered_transp_per_nucl(l,j,i) - contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2) - accu_1 += contrib - accu_2 += contrib * beta - accu_3 += contrib * beta**2 - enddo - aos_array(k) = accu_1 * dx2 * dy2 * dz2 + dy3 = 0.d0 + if(power_ao(2) .ge. 2) then + dy3 = dble(power_ao(2)) * dble((power_ao(2)-1)) * dy**(power_ao(2)-2) + endif + + if(power_ao(2) .ge. 1) then + dy4 = dble((2 * power_ao(2) + 1)) * dy**(power_ao(2)) + else + dy4 = dble((power_ao(2) + 1)) * dy**(power_ao(2)) + endif + + dy5 = dy**(power_ao(2)+2) - aos_grad_array(1,k) = accu_1 * dx1 * dy2 * dz2- 2.d0 * dx2 * dx * dy2 * dz2 * accu_2 - aos_grad_array(2,k) = accu_1 * dx2 * dy1 * dz2- 2.d0 * dx2 * dy2 * dy * dz2 * accu_2 - aos_grad_array(3,k) = accu_1 * dx2 * dy2 * dz1- 2.d0 * dx2 * dy2 * dz2 * dz * accu_2 + ! --- + + dz1 = 0.d0 + if(power_ao(3) .ne. 0) then + dz1 = dble(power_ao(3)) * dz**(power_ao(3)-1) + endif - aos_lapl_array(1,k) = accu_1 * dx3 * dy2 * dz2- 2.d0 * dx4 * dy2 * dz2* accu_2 +4.d0 * dx5 *dy2 * dz2* accu_3 - aos_lapl_array(2,k) = accu_1 * dx2 * dy3 * dz2- 2.d0 * dx2 * dy4 * dz2* accu_2 +4.d0 * dx2 *dy5 * dz2* accu_3 - aos_lapl_array(3,k) = accu_1 * dx2 * dy2 * dz3- 2.d0 * dx2 * dy2 * dz4* accu_2 +4.d0 * dx2 *dy2 * dz5* accu_3 + dz3 = 0.d0 + if(power_ao(3) .ge. 2) then + dz3 = dble(power_ao(3)) * dble((power_ao(3)-1)) * dz**(power_ao(3)-2) + endif + + if(power_ao(3) .ge. 1) then + dz4 = dble((2 * power_ao(3) + 1)) * dz**(power_ao(3)) + else + dz4 = dble((power_ao(3) + 1)) * dz**(power_ao(3)) + endif + + dz5 = dz**(power_ao(3)+2) + + ! --- + + accu_1 = 0.d0 + accu_2 = 0.d0 + accu_3 = 0.d0 + do l = 1,ao_prim_num(k) + beta = ao_expo_ordered_transp_per_nucl(l,j,i) + if(beta*r2.gt.50.d0) cycle + contrib = ao_coef_normalized_ordered_transp_per_nucl(l,j,i) * dexp(-beta*r2) + accu_1 += contrib + accu_2 += contrib * beta + accu_3 += contrib * beta**2 + enddo + aos_array(k) = accu_1 * dx2 * dy2 * dz2 + aos_grad_array(1,k) = accu_1 * dx1 * dy2 * dz2 - 2.d0 * dx2 * dx * dy2 * dz2 * accu_2 + aos_grad_array(2,k) = accu_1 * dx2 * dy1 * dz2 - 2.d0 * dx2 * dy2 * dy * dz2 * accu_2 + aos_grad_array(3,k) = accu_1 * dx2 * dy2 * dz1 - 2.d0 * dx2 * dy2 * dz2 * dz * accu_2 + aos_lapl_array(1,k) = accu_1 * dx3 * dy2 * dz2 - 2.d0 * dx4 * dy2 * dz2 * accu_2 + 4.d0 * dx5 * dy2 * dz2 * accu_3 + aos_lapl_array(2,k) = accu_1 * dx2 * dy3 * dz2 - 2.d0 * dx2 * dy4 * dz2 * accu_2 + 4.d0 * dx2 * dy5 * dz2 * accu_3 + aos_lapl_array(3,k) = accu_1 * dx2 * dy2 * dz3 - 2.d0 * dx2 * dy2 * dz4 * accu_2 + 4.d0 * dx2 * dy2 * dz5 * accu_3 + enddo enddo - enddo + end +! 
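! ---
! Editor's note (illustrative sketch, not part of the patch): the gradients and
! Laplacians assembled above follow from differentiating a Cartesian Gaussian
! primitive x**p * exp(-beta*r2) along x:
!   d/dx   : [ p*x**(p-1)                          - 2*beta*x**(p+1) ] * exp(-beta*r2)
!   d2/dx2 : [ p*(p-1)*x**(p-2) - 2*beta*(2*p+1)*x**p + 4*beta**2*x**(p+2) ] * exp(-beta*r2)
! which is what dx1/dx3/dx4/dx5 together with accu_1, accu_2 (contraction weighted
! by beta) and accu_3 (weighted by beta**2) implement. The cutoff beta*r2 > 50
! skips primitives whose value is below exp(-50) ~ 2.e-22.
! The standalone program below is hypothetical (values chosen arbitrarily); it
! checks the analytic first derivative of one primitive against a central
! finite difference.
program check_primitive_derivative
  implicit none
  integer          :: p
  double precision :: beta, x, y, z, h
  double precision :: f_plus, f_minus, g_analytic, g_numeric

  p    = 2          ! Cartesian power along x (e.g. a d_xx component)
  beta = 1.3d0      ! Gaussian exponent
  x = 0.7d0 ; y = -0.2d0 ; z = 0.4d0
  h = 1.d-5

  ! Analytic derivative: [ p*x**(p-1) - 2*beta*x**(p+1) ] * exp(-beta*r2)
  g_analytic = ( dble(p)*x**(p-1) - 2.d0*beta*x**(p+1) ) * dexp(-beta*(x*x+y*y+z*z))

  ! Central finite difference of f(x) = x**p * exp(-beta*(x**2+y**2+z**2))
  f_plus    = (x+h)**p * dexp(-beta*((x+h)**2 + y*y + z*z))
  f_minus   = (x-h)**p * dexp(-beta*((x-h)**2 + y*y + z*z))
  g_numeric = (f_plus - f_minus) / (2.d0*h)

  print *, 'analytic  d/dx :', g_analytic
  print *, 'numerical d/dx :', g_numeric
end program
! ---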
--- diff --git a/src/ao_two_e_ints/EZFIO.cfg b/src/ao_two_e_ints/EZFIO.cfg index ff932b0c..a985149e 100644 --- a/src/ao_two_e_ints/EZFIO.cfg +++ b/src/ao_two_e_ints/EZFIO.cfg @@ -6,7 +6,7 @@ default: None [io_ao_cholesky] type: Disk_access -doc: Read/Write |AO| integrals from/to disk [ Write | Read | None ] +doc: Read/Write |AO| Cholesky integrals from/to disk [ Write | Read | None ] interface: ezfio,provider,ocaml default: None @@ -25,16 +25,16 @@ default: 1.e-12 [do_direct_integrals] type: logical -doc: Compute integrals on the fly (very slow, only for debugging) +doc: Compute integrals on the fly (Useful only for Cholesky decomposition) interface: ezfio,provider,ocaml -default: False +default: True ezfio_name: direct [do_ao_cholesky] type: logical doc: Perform Cholesky decomposition of AO integrals interface: ezfio,provider,ocaml -default: False +default: True [io_ao_two_e_integrals_erf] type: Disk_access diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index 33304026..319e3c6e 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -1,3 +1,15 @@ +double precision function get_ao_integ_chol(i,j,k,l) + implicit none + BEGIN_DOC + ! CHOLESKY representation of the integral of the AO basis or (ij|kl) + ! i(r1) j(r1) 1/r12 k(r2) l(r2) + END_DOC + integer, intent(in) :: i,j,k,l + double precision, external :: ddot + get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1) + +end + BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ] implicit none BEGIN_DOC @@ -6,7 +18,7 @@ BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, integer :: i,j,k do j=1,ao_num do i=1,ao_num - do k=1,ao_num + do k=1,cholesky_ao_num cholesky_ao_transp(k,i,j) = cholesky_ao(i,j,k) enddo enddo @@ -16,27 +28,35 @@ END_PROVIDER BEGIN_PROVIDER [ integer, cholesky_ao_num ] &BEGIN_PROVIDER [ double precision, cholesky_ao, (ao_num, ao_num, 1) ] + use mmap_module implicit none BEGIN_DOC ! Cholesky vectors in AO basis: (ik|a): ! = (ik|jl) = sum_a (ik|a).(a|jl) ! ! Last dimension of cholesky_ao is cholesky_ao_num + ! + ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html + ! + ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1 + ! + ! 
https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf END_DOC - integer :: rank, ndim - double precision :: tau - double precision, pointer :: L(:,:), L_old(:,:) - + integer*8 :: ndim8 + integer :: rank + double precision :: tau, tau2 + double precision, pointer :: L(:,:) double precision :: s - double precision, parameter :: dscale = 1.d0 - double precision, allocatable :: D(:), Delta(:,:), Ltmp_p(:,:), Ltmp_q(:,:) - integer, allocatable :: Lset(:), Dset(:), addr(:,:) + double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:) + integer, allocatable :: addr1(:), addr2(:) + integer*8, allocatable :: Lset(:), Dset(:) logical, allocatable :: computed(:) - integer :: i,j,k,m,p,q, qj, dj, p2, q2 + integer :: i,j,k,m,p,q, dj, p2, q2, ii, jj + integer*8 :: i8, j8, p8, qj8, rank_max, np8 integer :: N, np, nq double precision :: Dmax, Dmin, Qmax, f @@ -44,19 +64,32 @@ END_PROVIDER logical, external :: ao_two_e_integral_zero double precision, external :: ao_two_e_integral - integer :: block_size, iblock, ierr + integer :: block_size, iblock - double precision :: mem + double precision :: mem, mem0 double precision, external :: memory_of_double, memory_of_int + double precision, external :: memory_of_double8, memory_of_int8 integer, external :: getUnitAndOpen - integer :: iunit + integer :: iunit, ierr - ndim = ao_num*ao_num + ndim8 = ao_num*ao_num*1_8+1 + double precision :: wall0,wall1 + + type(c_ptr) :: c_pointer(2) + integer :: fd(2) + + PROVIDE nproc ao_cholesky_threshold do_direct_integrals qp_max_mem + PROVIDE nucl_coord ao_two_e_integral_schwartz + call set_multiple_levels_omp(.False.) + + call wall_time(wall0) + + ! Will be reallocated at the end deallocate(cholesky_ao) if (read_ao_cholesky) then - print *, 'Reading Cholesky vectors from disk...' + print *, 'Reading Cholesky AO vectors from disk...' iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'R') read(iunit) rank allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) @@ -66,7 +99,6 @@ END_PROVIDER else - PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) if (do_direct_integrals) then @@ -79,66 +111,84 @@ END_PROVIDER endif tau = ao_cholesky_threshold + tau2 = tau*tau - mem = 6.d0 * memory_of_double(ndim) + 6.d0 * memory_of_int(ndim) - call check_mem(mem, irp_here) + rank = 0 + + allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) + allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8), computed(ndim8) ) + + call resident_memory(mem0) call print_memory_usage() - allocate(L(ndim,1)) - print *, '' print *, 'Cholesky decomposition of AO integrals' print *, '======================================' print *, '' print *, '============ =============' - print *, ' Rank Threshold' + print *, ' Rank Threshold' print *, '============ =============' - rank = 0 - - allocate( D(ndim), Lset(ndim), Dset(ndim) ) - allocate( addr(3,ndim) ) - ! 1. 
- k=0 + i8=0 do j=1,ao_num do i=1,ao_num - k = k+1 - addr(1,k) = i - addr(2,k) = j - addr(3,k) = (i-1)*ao_num + j + i8 = i8+1 + addr1(i8) = i + addr2(i8) = j enddo enddo if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) - do i=1,ndim - D(i) = ao_two_e_integral(addr(1,i), addr(2,i), & - addr(1,i), addr(2,i)) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) + do i8=ndim8-1,1,-1 + D(i8) = ao_two_e_integral(addr1(i8), addr2(i8), & + addr1(i8), addr2(i8)) enddo !$OMP END PARALLEL DO else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) SCHEDULE(guided) - do i=1,ndim - D(i) = get_ao_two_e_integral(addr(1,i), addr(1,i), & - addr(2,i), addr(2,i), & - ao_integrals_map) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) + do i8=ndim8-1,1,-1 + D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8), & + addr2(i8), addr2(i8), ao_integrals_map) enddo !$OMP END PARALLEL DO endif + ! Just to guarentee termination + D(ndim8) = 0.d0 - Dmax = maxval(D) + D_sorted(:) = -D(:) + call dsort_noidx_big(D_sorted,ndim8) + D_sorted(:) = -D_sorted(:) + Dmax = D_sorted(1) ! 2. - np=0 - do p=1,ndim - if ( dscale*dscale*Dmax*D(p) > tau*tau ) then - np = np+1 - Lset(np) = p + np8=0_8 + do p8=1,ndim8 + if ( Dmax*D(p8) >= tau2 ) then + np8 = np8+1_8 + Lset(np8) = p8 endif enddo + if (np8 > ndim8) stop 'np>ndim8' + np = int(np8,4) + if (np <= 0) stop 'np<=0' + + rank_max = np + ! Avoid too large arrays when there are many electrons + if (elec_num > 10) then + rank_max = min(np,20*elec_num*elec_num) + endif + call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) + call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) + + ! Deleting the file while it is open makes the file invisible on the filesystem, + ! and automatically deleted, even if the program crashes + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') + close(iunit,status='delete') + ! 3. N = 0 @@ -146,77 +196,68 @@ END_PROVIDER ! 4. i = 0 + mem = memory_of_double(np) & ! Delta(np,nq) + + (np+1)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) + +! call check_mem(mem) + ! 5. - do while ( (Dmax > tau).and.(rank < ndim) ) + do while ( (Dmax > tau).and.(np > 0) ) ! a. i = i+1 + - s = 0.01d0 - ! Inrease s until the arrays fit in memory + block_size = max(N,24) + + ! Determine nq so that Delta fits in memory + + s = 0.1d0 + Dmin = max(s*Dmax,tau) + do nq=2,np-1 + if (D_sorted(nq) < Dmin) exit + enddo + do while (.True.) - ! b. - Dmin = max(s*Dmax,tau) + mem = mem0 & + + np*memory_of_double(nq) & ! Delta(np,nq) + + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - ! c. - nq=0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - endif - enddo - - call total_memory(mem) - mem = mem & - + np*memory_of_double(nq) &! Delta(np,nq) - + (rank+nq)* memory_of_double(ndim) &! L(ndim,rank+nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - - if (mem > qp_max_mem) then - s = s*2.d0 + if (mem > qp_max_mem*0.5d0) then + Dmin = D_sorted(nq/2) + do ii=nq/2,np-1 + if (D_sorted(ii) < Dmin) then + nq = ii + exit + endif + enddo else exit endif - if ((s > 1.d0).or.(nq == 0)) then - call print_memory_usage() - print *, 'Not enough memory. 
Reduce cholesky threshold' - stop -1 + enddo +!call print_memory_usage +!print *, 'np, nq, Predicted memory: ', np, nq, mem + + if (nq <= 0) then + print *, nq + stop 'bug in cholesky: nq <= 0' + endif + + Dmin = D_sorted(nq) + nq=0 + do p=1,np + if ( D(Lset(p)) >= Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) endif - enddo - ! d., e. - block_size = max(N,24) - - L_old => L - allocate(L(ndim,rank+nq), stat=ierr) - if (ierr /= 0) then - call print_memory_usage() - print *, irp_here, ': allocation failed : (L(ndim,rank+nq))' - stop -1 - endif - - !$OMP PARALLEL DO PRIVATE(k,j) - do k=1,rank - do j=1,ndim - L(j,k) = L_old(j,k) - enddo - enddo - !$OMP END PARALLEL DO - - deallocate(L_old) - - allocate(Delta(np,nq), stat=ierr) - if (ierr /= 0) then - call print_memory_usage() - print *, irp_here, ': allocation failed : (Delta(np,nq))' - stop -1 - endif + allocate(Delta(np,nq)) allocate(Ltmp_p(np,block_size), stat=ierr) + if (ierr /= 0) then call print_memory_usage() print *, irp_here, ': allocation failed : (Ltmp_p(np,block_size))' @@ -224,6 +265,7 @@ END_PROVIDER endif allocate(Ltmp_q(nq,block_size), stat=ierr) + if (ierr /= 0) then call print_memory_usage() print *, irp_here, ': allocation failed : (Ltmp_q(nq,block_size))' @@ -231,36 +273,39 @@ END_PROVIDER endif - allocate(computed(nq)) + computed(1:nq) = .False. - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(m,k,p,q,j) - !$OMP DO - do q=1,nq - do j=1,np - Delta(j,q) = 0.d0 - enddo - computed(q) = .False. - enddo - !$OMP ENDDO NOWAIT - - !$OMP DO + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q) do k=1,N + !$OMP DO do p=1,np - Ltmp_p(p,k) = L(Lset(p),k) + Ltmp_p(p,k) = L(Lset(p),k) enddo + !$OMP END DO NOWAIT + + !$OMP DO do q=1,nq Ltmp_q(q,k) = L(Dset(q),k) enddo + !$OMP END DO NOWAIT enddo - !$OMP END DO NOWAIT - !$OMP BARRIER !$OMP END PARALLEL if (N>0) then - call dgemm('N','T', np, nq, N, -1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + + call dgemm('N', 'T', np, nq, N, -1.d0, & + Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np) + + else + + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP END PARALLEL DO + endif ! f. @@ -272,53 +317,84 @@ END_PROVIDER ! g. iblock = 0 + do j=1,nq - if ( (Qmax <= Dmin).or.(N+j > ndim) ) exit + if ( (Qmax < Dmin).or.(N+j*1_8 > ndim8) ) exit + ! i. rank = N+j + if (rank == rank_max) then + print *, 'cholesky: rank_max reached' + exit + endif if (iblock == block_size) then - call dgemm('N','T',np,nq,block_size,-1.d0, & - Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - iblock = 0 + + call dgemm('N','T',np,nq,block_size,-1.d0, & + Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) + + iblock = 0 + endif ! ii. do dj=1,nq - qj = Dset(dj) - if (D(qj) == Qmax) then + qj8 = Dset(dj) + if (D(qj8) == Qmax) then exit endif enddo - L(1:ndim, rank) = 0.d0 - - if (.not.computed(dj)) then - m = dj - !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(guided) - do k=np,1,-1 - if (.not.ao_two_e_integral_zero( addr(1,Lset(k)), addr(1,Dset(m)),& - addr(2,Lset(k)), addr(2,Dset(m)) ) ) then - if (do_direct_integrals) then - Delta(k,m) = Delta(k,m) + & - ao_two_e_integral(addr(1,Lset(k)), addr(2,Lset(k)),& - addr(1,Dset(m)), addr(2,Dset(m))) - else - Delta(k,m) = Delta(k,m) + & - get_ao_two_e_integral( addr(1,Lset(k)), addr(1,Dset(m)),& - addr(2,Lset(k)), addr(2,Dset(m)), ao_integrals_map) - endif - endif - enddo - !$OMP END PARALLEL DO - computed(dj) = .True. 
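! ---
! Editor's note (illustrative sketch, not part of the patch): the nq selection
! introduced above keeps the working set Delta(np,nq) + Ltmp_p(np,block_size) +
! Ltmp_q(nq,block_size) within roughly half of qp_max_mem. The hypothetical
! program below reproduces that arithmetic for chosen np and block_size, with a
! memory budget expressed in GiB purely for illustration, and returns the
! largest nq that fits.
program size_delta_block
  implicit none
  integer          :: np, block_size, nq
  double precision :: budget_gib, need_gib

  np         = 1000000    ! stand-in for the number of kept diagonal candidates (Lset)
  block_size = 320
  budget_gib = 16.d0      ! stand-in for the total memory budget

  do nq = np, 1, -1
    need_gib = 8.d0 * ( dble(np)*dble(nq)                  &   ! Delta(np,nq)
                      + dble(np+nq)*dble(block_size) )     &   ! Ltmp_p + Ltmp_q
               / 1024.d0**3
    if (need_gib <= 0.5d0*budget_gib) exit
  enddo
  print *, 'largest nq fitting in half the budget:', nq, ' (', need_gib, ' GiB)'
end program
! ---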
- endif + do i8=1,ndim8 + L(i8, rank) = 0.d0 + enddo iblock = iblock+1 + !$OMP PARALLEL DO PRIVATE(p) do p=1,np Ltmp_p(p,iblock) = Delta(p,dj) enddo + !$OMP END PARALLEL DO + + if (.not.computed(dj)) then + m = dj + if (do_direct_integrals) then + !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) + do k=1,np + Delta_col(k) = 0.d0 + if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& + addr2(Lset(k)), addr2(Dset(m)) ) ) then + Delta_col(k) = & + ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),& + addr1(Dset(m)), addr2(Dset(m))) + endif + enddo + !$OMP END PARALLEL DO + else + PROVIDE ao_integrals_map + !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) + do k=1,np + Delta_col(k) = 0.d0 + if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& + addr2(Lset(k)), addr2(Dset(m)) ) ) then + Delta_col(k) = & + get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),& + addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map) + endif + enddo + !$OMP END PARALLEL DO + endif + + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) + Delta(p,dj) = Ltmp_p(p,iblock) + enddo + !$OMP END PARALLEL DO + + computed(dj) = .True. + endif ! iv. if (iblock > 1) then @@ -329,7 +405,7 @@ END_PROVIDER ! iii. f = 1.d0/dsqrt(Qmax) - !$OMP PARALLEL PRIVATE(m,p,q,k) DEFAULT(shared) + !$OMP PARALLEL PRIVATE(p,q) DEFAULT(shared) !$OMP DO do p=1,np Ltmp_p(p,iblock) = Ltmp_p(p,iblock) * f @@ -343,7 +419,6 @@ END_PROVIDER Ltmp_q(q,iblock) = L(Dset(q), rank) enddo !$OMP END DO - !$OMP END PARALLEL Qmax = D(Dset(1)) @@ -355,49 +430,62 @@ END_PROVIDER print '(I10, 4X, ES12.3)', rank, Qmax - deallocate(computed) - deallocate(Delta) deallocate(Ltmp_p) deallocate(Ltmp_q) + deallocate(Delta) ! i. N = rank ! j. - Dmax = D(Lset(1)) - do p=1,np - Dmax = max(Dmax, D(Lset(p))) - enddo + D_sorted(:) = -D(:) + call dsort_noidx_big(D_sorted,ndim8) + D_sorted(:) = -D_sorted(:) - np=0 - do p=1,ndim - if ( dscale*dscale*Dmax*D(p) > tau*tau ) then - np = np+1 - Lset(np) = p + Dmax = D_sorted(1) + + np8=0_8 + do p8=1,ndim8 + if ( Dmax*D(p8) >= tau2 ) then + np8 = np8+1_8 + Lset(np8) = p8 endif enddo + np = int(np8,4) enddo + + print *, '============ =============' + print *, '' + + deallocate( D, Lset, Dset, D_sorted ) + deallocate( addr1, addr2, Delta_col, computed ) + + allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) + if (ierr /= 0) then call print_memory_usage() print *, irp_here, ': Allocation failed' stop -1 endif - !$OMP PARALLEL DO PRIVATE(k) + + + !$OMP PARALLEL DO PRIVATE(k,j) do k=1,rank - call dcopy(ndim, L(1,k), 1, cholesky_ao(1,1,k), 1) + do j=1,ao_num + cholesky_ao(1:ao_num,j,k) = L((j-1_8)*ao_num+1_8:1_8*j*ao_num,k) + enddo enddo !$OMP END PARALLEL DO - deallocate(L) + + call munmap( (/ ndim8, rank_max /), 8, fd(1), c_pointer(1) ) + cholesky_ao_num = rank - print *, '============ =============' - print *, '' - if (write_ao_cholesky) then - print *, 'Writing Cholesky vectors to disk...' + print *, 'Writing Cholesky AO vectors to disk...' 
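! ---
! Editor's note (illustrative sketch, not part of the patch): the provider above
! is a threshold-based pivoted Cholesky decomposition of the positive
! semidefinite super-matrix M(ij,kl) = (ij|kl): at each step the largest
! remaining diagonal element (Dmax) is taken as pivot, a new vector L(:,rank) is
! built from that column of the residual, and the diagonal D is downdated, until
! Dmax <= tau. The hypothetical standalone program below runs the same scheme on
! a tiny symmetric positive-definite matrix and checks that M is recovered as
! L*L^T, i.e. the analogue of (ij|kl) = sum_a (ij|a).(a|kl).
program tiny_pivoted_cholesky
  implicit none
  integer, parameter :: n = 4
  double precision   :: M(n,n), L(n,n), D(n)
  double precision   :: tau, Dmax, resid
  integer            :: i, j, k, p, rank

  ! Small SPD matrix standing in for the two-electron matrix (ij|kl)
  M = reshape( (/ 4.0d0, 1.0d0, 0.5d0, 0.2d0,   &
                  1.0d0, 3.0d0, 0.3d0, 0.1d0,   &
                  0.5d0, 0.3d0, 2.0d0, 0.4d0,   &
                  0.2d0, 0.1d0, 0.4d0, 1.0d0 /), (/ n, n /) )

  tau  = 1.d-8
  L    = 0.d0
  D(:) = (/ (M(i,i), i=1,n) /)

  rank = 0
  do
    p    = maxloc(D, 1)          ! pivot = largest remaining diagonal (role of Dmax/Dset)
    Dmax = D(p)
    if ( (Dmax <= tau) .or. (rank == n) ) exit
    rank = rank + 1
    do i = 1, n
      L(i,rank) = M(i,p)
      do k = 1, rank-1
        L(i,rank) = L(i,rank) - L(i,k)*L(p,k)   ! subtract the part already resolved
      enddo
      L(i,rank) = L(i,rank) / dsqrt(Dmax)
    enddo
    do i = 1, n
      D(i) = D(i) - L(i,rank)*L(i,rank)         ! downdate the residual diagonal
    enddo
  enddo

  ! Reconstruction check: M(i,j) ~= sum_a L(i,a)*L(j,a)
  resid = 0.d0
  do j = 1, n
    do i = 1, n
      resid = max(resid, dabs( M(i,j) - sum(L(i,1:rank)*L(j,1:rank)) ))
    enddo
  enddo
  print *, 'rank =', rank, '   max |M - L L^T| =', resid
end program
! ---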
iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') write(iunit) rank write(iunit) cholesky_ao @@ -409,6 +497,9 @@ END_PROVIDER print *, 'Rank : ', cholesky_ao_num, '(', 100.d0*dble(cholesky_ao_num)/dble(ao_num*ao_num), ' %)' print *, '' + call wall_time(wall1) + print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min' + END_PROVIDER diff --git a/src/ao_two_e_ints/two_e_integrals.irp.f b/src/ao_two_e_ints/two_e_integrals.irp.f index b55b5f0d..d12f3d45 100644 --- a/src/ao_two_e_ints/two_e_integrals.irp.f +++ b/src/ao_two_e_ints/two_e_integrals.irp.f @@ -460,8 +460,8 @@ BEGIN_PROVIDER [ double precision, ao_two_e_integral_schwartz, (ao_num, ao_num) !$OMP PARALLEL DO PRIVATE(i,k) & !$OMP DEFAULT(NONE) & !$OMP SHARED (ao_num,ao_two_e_integral_schwartz) & - !$OMP SCHEDULE(guided) - do i=1,ao_num + !$OMP SCHEDULE(dynamic) + do i=ao_num,1,-1 do k=1,i ao_two_e_integral_schwartz(i,k) = dsqrt(ao_two_e_integral(i,i,k,k)) ao_two_e_integral_schwartz(k,i) = ao_two_e_integral_schwartz(i,k) diff --git a/src/becke_numerical_grid/extra_grid_vector.irp.f b/src/becke_numerical_grid/extra_grid_vector.irp.f index ae167282..e054e22c 100644 --- a/src/becke_numerical_grid/extra_grid_vector.irp.f +++ b/src/becke_numerical_grid/extra_grid_vector.irp.f @@ -47,8 +47,12 @@ END_PROVIDER END_DOC implicit none - integer :: i,j,k,l,i_count - double precision :: r(3) + integer :: i, j, k, l, i_count + double precision :: r(3) + double precision :: wall0, wall1 + + call wall_time(wall0) + print *, ' Providing extra_final_grid_points ...' i_count = 0 do j = 1, nucl_num @@ -70,6 +74,10 @@ END_PROVIDER enddo enddo + call wall_time(wall1) + print *, ' wall time for extra_final_grid_points,', wall1 - wall0 + call print_memory_usage() + END_PROVIDER diff --git a/src/becke_numerical_grid/grid_becke_vector.irp.f b/src/becke_numerical_grid/grid_becke_vector.irp.f index 473096d0..9da8a099 100644 --- a/src/becke_numerical_grid/grid_becke_vector.irp.f +++ b/src/becke_numerical_grid/grid_becke_vector.irp.f @@ -67,13 +67,6 @@ END_PROVIDER index_final_points(2,i_count) = i index_final_points(3,i_count) = j index_final_points_reverse(k,i,j) = i_count - - if(final_weight_at_r_vector(i_count) .lt. 0.d0) then - print *, ' !!! WARNING !!!' - print *, ' negative weight !!!!' - print *, i_count, final_weight_at_r_vector(i_count) - stop - endif enddo enddo enddo diff --git a/src/casscf_cipsi/EZFIO.cfg b/src/casscf_cipsi/EZFIO.cfg index 18e0b6b1..5b72d906 100644 --- a/src/casscf_cipsi/EZFIO.cfg +++ b/src/casscf_cipsi/EZFIO.cfg @@ -79,3 +79,9 @@ type: logical doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change interface: ezfio,provider,ocaml default: False + +[act_mos_opt] +type: logical +doc: If |true|, the active orbitals are also optimized variationally +interface: ezfio,provider,ocaml +default: False diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED index dd91c7bd..32f5ae90 100644 --- a/src/casscf_cipsi/NEED +++ b/src/casscf_cipsi/NEED @@ -3,3 +3,4 @@ selectors_full generators_cas two_body_rdm dav_general_mat +mo_optimization_utils diff --git a/src/casscf_cipsi/bielec.irp.f b/src/casscf_cipsi/bielec.irp.f index 0a44f994..a4901985 100644 --- a/src/casscf_cipsi/bielec.irp.f +++ b/src/casscf_cipsi/bielec.irp.f @@ -1,18 +1,25 @@ -BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] +BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC - ! 
bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx + ! + ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none integer :: i,j,ii,jj,p,q,i3,j3,t3,v3 real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' + print*,'' - bielec_PQxx(:,:,:,:) = 0.d0 + bielec_PQxx_array(:,:,:,:) = 0.d0 PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, & !$OMP n_act_orb,mo_integrals_map,list_act) !$OMP DO @@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core ii=list_core_inact(i) do j=i,n_core_inact_orb jj=list_core_inact(j) - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map) - bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map) + bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j) end do do j=1,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3) end do end do !$OMP END DO @@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core do j=i,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3) end do end do !$OMP END DO @@ -52,9 +59,13 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC - ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ + ! + ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none @@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a double precision, allocatable :: integrals_array(:,:) real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
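! ---
! Editor's note (illustrative sketch, not part of the patch): the warnings above
! reflect the storage cost of bielec_PQxx_array and bielec_PxxQ_array, which both
! hold mo_num**2 * n_core_inact_act_orb**2 double precision numbers, i.e.
! 8 * mo_num**2 * n_core_inact_act_orb**2 bytes. The numbers below are arbitrary
! placeholders, only to show the scaling that motivates replacing these arrays
! by the Cholesky-based functions of the same name.
program pqxx_storage_estimate
  implicit none
  integer          :: mo_num, n_core_inact_act_orb
  double precision :: gib

  mo_num               = 1500   ! hypothetical number of MOs
  n_core_inact_act_orb = 60     ! hypothetical number of core+inactive+active MOs

  gib = 8.d0 * dble(mo_num)**2 * dble(n_core_inact_act_orb)**2 / 1024.d0**3
  print *, 'bielec_PQxx_array would take about', gib, 'GiB'   ! ~60 GiB with these values
end program
! ---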
+ print*,'' PROVIDE mo_two_e_integrals_in_map - bielec_PxxQ = 0.d0 + bielec_PxxQ_array = 0.d0 !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, & !$OMP n_act_orb,mo_integrals_map,list_act) allocate(integrals_array(mo_num,mo_num)) @@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j,q)=integrals_array(p,q) - bielec_PxxQ(p,j,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p) end do end do end do @@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p) end do end do end do @@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p) end do end do end do @@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,p,t,u,v double precision, external :: mo_two_e_integral + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI' PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DO DEFAULT(NONE) & @@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] end do end do !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide bielecCI = ',wall1 - wall0 END_PROVIDER diff --git a/src/casscf_cipsi/bielec_natorb.irp.f b/src/casscf_cipsi/bielec_natorb.irp.f index 9968530c..99734a0b 100644 --- a/src/casscf_cipsi/bielec_natorb.irp.f +++ b/src/casscf_cipsi/bielec_natorb.irp.f @@ -1,30 +1,38 @@ - BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] + BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx_no + ! ! integral (pq|xx) in the basis of natural MOs ! indices are unshifted orbital numbers + ! END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
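! ---
! Editor's note (illustrative sketch, not part of the patch): the *_no providers
! here and the cholesky_no_* providers further down rotate every active-orbital
! index into the natural-orbital (NO) basis using the natorbsCI matrix. Assuming
! a rotation matrix C(t,u) giving the weight of active MO t in NO u, the
! one-index transformation of a stack of vectors V(a,t) is simply
!     V_no(a,u) = sum_t V(a,t) * C(t,u)
! The hypothetical program below does this with plain loops on random data and
! checks that an orthogonal C preserves the Frobenius norm of the stack.
program one_index_rotation
  implicit none
  integer, parameter :: na = 5      ! stand-in for cholesky_mo_num
  integer, parameter :: nt = 2      ! stand-in for n_act_orb
  double precision   :: V(na,nt), V_no(na,nt), C(nt,nt)
  double precision   :: theta, norm_before, norm_after
  integer            :: a, t, u

  call random_number(V)

  ! A 2x2 orthogonal rotation as a toy natorbsCI
  theta  = 0.3d0
  C(1,1) =  dcos(theta) ; C(1,2) = dsin(theta)
  C(2,1) = -dsin(theta) ; C(2,2) = dcos(theta)

  V_no = 0.d0
  do u = 1, nt
    do t = 1, nt
      do a = 1, na
        V_no(a,u) = V_no(a,u) + V(a,t) * C(t,u)
      enddo
    enddo
  enddo

  norm_before = sum(V*V)
  norm_after  = sum(V_no*V_no)
  print *, 'Frobenius norm before/after rotation:', norm_before, norm_after
end program
! ---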
+ print*,'' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI) + !$OMP bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI) allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), & d(n_act_orb,mo_num,n_core_inact_act_orb)) !$OMP DO do l=1,n_core_inact_act_orb - bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l) + bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l) do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l) + f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l) end do end do end do @@ -36,13 +44,13 @@ do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k) + bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k) end do end do do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l) + f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l) end do end do end do @@ -54,7 +62,7 @@ do k=1,n_core_inact_act_orb do p=1,n_act_orb do j=1,mo_num - bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k) + bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k) end do end do end do @@ -71,7 +79,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l) + f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l) end do end do end do @@ -83,7 +91,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p) + bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p) end do end do end do @@ -97,7 +105,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -109,7 +117,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -123,8 +131,12 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ_no + ! ! integral (px|xq) in the basis of natural MOs ! indices are unshifted orbital numbers END_DOC @@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' + !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI) + !$OMP bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI) allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), & @@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac !$OMP DO do j=1,mo_num - bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j) + bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j) do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j) + f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j) end do end do end do @@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l) + bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l) end do end do end do @@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k) + f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k) end do end do end do @@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l) + bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l) end do end do end do @@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k) + f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k) end do end do end do @@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p) + bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p) end do end do end do @@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -259,10 +275,16 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! integrals (tu|vp) in the basis of natural MOs ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. 
Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI_no' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & @@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] deallocate(d,f) !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide bielecCI_no = ',wall1-wall0 END_PROVIDER diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f index d0a26d36..b64a9d8f 100644 --- a/src/casscf_cipsi/casscf.irp.f +++ b/src/casscf_cipsi/casscf.irp.f @@ -11,7 +11,7 @@ program casscf if(small_active_space)then pt2_relative_error = 0.00001 else - thresh_scf = 1.d-4 + thresh_scf = max(1.d-4,thresh_scf) pt2_relative_error = 0.04 endif touch pt2_relative_error @@ -46,94 +46,101 @@ subroutine run do while (.not.converged) print*,'pt2_max = ',pt2_max call run_stochastic_cipsi(Ev,PT2) - print*,'Ev,PT2',Ev(1),PT2(1) - E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) - energy_old = energy - energy = eone+etwo+ecore - pt2_max_before = pt2_max - - call write_time(6) - call write_int(6,iteration,'CAS-SCF iteration = ') - call write_double(6,energy,'State-average CAS-SCF energy = ') -! if(n_states == 1)then -! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) -! call ezfio_get_casscf_cipsi_energy(PT2) - double precision :: delta_E_istate, e_av - e_av = 0.d0 - do istate=1,N_states - e_av += state_average_weight(istate) * Ev(istate) - if(istate.gt.1)then - delta_E_istate = E_PT2(istate) - E_PT2(1) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate - endif - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) -! call write_double(6,E_PT2(istate),'E + PT2 energy = ') -! call write_double(6,PT2(istate),' PT2 = ') - enddo - call write_double(6,e_av,'State-average CAS-SCF energy bis = ') - call write_double(6,pt2_max,' PT2_MAX = ') +! if(act_mos_opt)then DOES NOT WORK +! call run_orb_opt_trust_v2 +! call run_stochastic_cipsi(Ev,PT2) ! 
endif - - print*,'' - call write_double(6,norm_grad_vec2,'Norm of gradients = ') - call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') - call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') - call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') - print*,'' - call write_double(6,energy_improvement, 'Predicted energy improvement = ') - - if(criterion_casscf == "energy")then - converged = dabs(energy_improvement) < thresh_scf - else if (criterion_casscf == "gradients")then - converged = norm_grad_vec2 < thresh_scf - else if (criterion_casscf == "e_pt2")then - delta_E = 0.d0 - do istate = 1, N_states - delta_E += dabs(E_PT2(istate) - ept2_before(istate)) - enddo - converged = dabs(delta_E) < thresh_casscf - endif - ept2_before = E_PT2 - if(.not.small_active_space)then - if(adaptive_pt2_max)then - pt2_max = dabs(energy_improvement / (pt2_relative_error)) - pt2_max = min(pt2_max, pt2_max_before) - if(n_act_orb.ge.n_big_act_orb)then - pt2_max = max(pt2_max,pt2_min_casscf) - endif + if(.True.)then + print*,'Ev,PT2',Ev(1),PT2(1) + E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) + energy_old = energy + energy = eone+etwo+ecore + pt2_max_before = pt2_max + + call write_time(6) + call write_int(6,iteration,'CAS-SCF iteration = ') + call write_double(6,energy,'State-average CAS-SCF energy = ') +!! if(n_states == 1)then +!! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) +!! call ezfio_get_casscf_cipsi_energy(PT2) + double precision :: delta_E_istate, e_av + e_av = 0.d0 + do istate=1,N_states + e_av += state_average_weight(istate) * Ev(istate) + if(istate.gt.1)then + delta_E_istate = E_PT2(istate) - E_PT2(1) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate + endif + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) +!! call write_double(6,E_PT2(istate),'E + PT2 energy = ') +!! call write_double(6,PT2(istate),' PT2 = ') + enddo + call write_double(6,e_av,'State-average CAS-SCF energy bis = ') + call write_double(6,pt2_max,' PT2_MAX = ') +!! endif + + print*,'' + call write_double(6,norm_grad_vec2,'Norm of gradients = ') + call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') + call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') + call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') + print*,'' + call write_double(6,energy_improvement, 'Predicted energy improvement = ') + + if(criterion_casscf == "energy")then + converged = dabs(energy_improvement) < thresh_scf + else if (criterion_casscf == "gradients")then + converged = norm_grad_vec2 < thresh_scf + else if (criterion_casscf == "e_pt2")then + delta_E = 0.d0 + do istate = 1, N_states + delta_E += dabs(E_PT2(istate) - ept2_before(istate)) + enddo + converged = dabs(delta_E) < thresh_casscf endif - endif - print*,'' - call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') - - mo_coef = NewOrbs - mo_occ = occnum - if(.not.converged)then - call save_mos - iteration += 1 - if(norm_grad_vec2.gt.0.01d0)then - N_det = N_states - else - N_det = max(N_det/8 ,N_states) - endif - psi_det = psi_det_sorted - psi_coef = psi_coef_sorted - read_wf = .True. 
- call clear_mo_map - SOFT_TOUCH mo_coef N_det psi_det psi_coef + ept2_before = E_PT2 if(.not.small_active_space)then if(adaptive_pt2_max)then - SOFT_TOUCH pt2_max + pt2_max = dabs(energy_improvement / (pt2_relative_error)) + pt2_max = min(pt2_max, pt2_max_before) + if(n_act_orb.ge.n_big_act_orb)then + pt2_max = max(pt2_max,pt2_min_casscf) + endif endif endif - if(iteration .gt. 3)then - state_following_casscf = state_following_casscf_cipsi_save - soft_touch state_following_casscf + print*,'' + call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') + + mo_coef = NewOrbs + mo_occ = occnum + if(.not.converged)then + call save_mos + iteration += 1 + if(norm_grad_vec2.gt.0.01d0)then + N_det = N_states + else + N_det = max(N_det/8 ,N_states) + endif + psi_det = psi_det_sorted + psi_coef = psi_coef_sorted + read_wf = .True. + call clear_mo_map + SOFT_TOUCH mo_coef N_det psi_det psi_coef + if(.not.small_active_space)then + if(adaptive_pt2_max)then + SOFT_TOUCH pt2_max + endif + endif + if(iteration .gt. 3)then + state_following_casscf = state_following_casscf_cipsi_save + soft_touch state_following_casscf + endif endif endif - + enddo + if(.True.)then integer :: i print*,'Converged CASSCF ' print*,'--------------------------' @@ -153,6 +160,7 @@ subroutine run ! write(*,*)mcscf_fock_alpha_mo(i,i) enddo + endif end diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f new file mode 100644 index 00000000..f69832c1 --- /dev/null +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -0,0 +1,248 @@ + +BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)] + BEGIN_DOC + ! Cholesky vectors with ONE orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,i_mo,jj_act + double precision, allocatable :: chol_tmp(:,:) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_1_idx_transp' + allocate(chol_tmp(cholesky_mo_num,n_act_orb)) + cholesky_no_1_idx_transp = 0.D0 + do i_mo = 1, mo_num + ! Get all the integrals corresponding to the "i_mo" + do i_act = 1, n_act_orb + jj_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo) + enddo + enddo + call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & + chol_tmp, size(chol_tmp,1), & + natorbsCI, size(natorbsCI,1), & + 0.d0, & + cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1)) + enddo + call wall_time(wall1) + print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0 + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] + BEGIN_DOC + ! Cholesky vectors with TWO orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,j_act,jj_act + double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb)) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_2_idx_transp' + cholesky_no_2_idx_transp = 0.D0 + do i_act = 1, n_act_orb + ! 
Get all the integrals corresponding to the "j_act" + do j_act = 1, n_act_orb + jj_act = list_act(j_act) + do i_chol = 1, cholesky_mo_num + chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act) + enddo + enddo + call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & + chol_tmp, size(chol_tmp,1), & + natorbsCI, size(natorbsCI,1), & + 0.d0, & + cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1)) + enddo + call wall_time(wall1) + print*,'Time to provide cholesky_no_2_idx_transp = ', wall1 - wall0 + +END_PROVIDER + +BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)] + implicit none + BEGIN_DOC + ! Cholesky vectors defined on all basis including the NO basis + END_DOC + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_total_transp ' + ! Block when two orbitals belong to the core/inact + do j_core_inact = 1, n_core_inact_orb + jj_core_inact = list_core_inact(j_core_inact) + do i_core_inact = 1, n_core_inact_orb + ii_core_inact = list_core_inact(i_core_inact) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact) + enddo + enddo + enddo + + ! Block when one orbitals belongs to the core/inact and one belongs to the active + do j_core_inact = 1, n_core_inact_orb + jj_core_inact = list_core_inact(j_core_inact) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_act,j_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact) + enddo + enddo + enddo + do j_core_inact = 1, n_core_inact_orb + jj_core_inact = list_core_inact(j_core_inact) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,j_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact) + enddo + enddo + enddo + + ! Block when two orbitals belong to the active + do j_act = 1, n_act_orb + jj_act = list_act(j_act) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act) + enddo + enddo + enddo + + ! Block when two orbitals belong to the virtuals + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do j_virt = 1, n_virt_orb + jj_virt = list_virt(j_virt) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt) + enddo + enddo + enddo + + ! Block when one orbital is in active and the other in the virtuals + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt) + enddo + enddo + enddo + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_act = 1, n_act_orb + ii_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt) + enddo + enddo + enddo + ! 
Block when one orbital is in the virtual and one in the core-inact + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_core_inact = 1, n_core_inact_orb + ii_core_inact = list_core_inact(i_core_inact) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt) + enddo + enddo + enddo + do i_core_inact = 1, n_core_inact_orb + ii_core_inact = list_core_inact(i_core_inact) + do i_virt = 1, n_virt_orb + ii_virt = list_virt(i_virt) + do i_chol = 1, cholesky_mo_num + cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact) + enddo + enddo + enddo + + call wall_time(wall1) + print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0 +END_PROVIDER + + +double precision function bielec_no_basis(i_1,j_1,i_2,j_2) + implicit none + integer, intent(in) :: i_1,j_1,i_2,j_2 + BEGIN_DOC + ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs + ! + END_DOC + integer :: i + bielec_no_basis = 0.d0 + do i = 1, cholesky_mo_num + bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2) + enddo +end + +double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca) + implicit none + BEGIN_DOC + ! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition on the NO basis for active orbitals + ! + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] + END_DOC + integer, intent(in) :: i_ca, j_ca, i_mo, j_mo + integer :: ii_ca, jj_ca + double precision :: bielec_no_basis + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca) +end + +double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo) + implicit none + BEGIN_DOC + ! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals + ! + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] + END_DOC + integer, intent(in) :: i_ca, j_ca, i_mo, j_mo + integer :: ii_ca, jj_ca + double precision :: bielec_no_basis + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo) + +end + + +double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + BEGIN_DOC + ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition + ! + ! indices are unshifted orbital numbers + ! + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] + END_DOC + implicit none + integer, intent(in) :: i_ca, j_ca, j_mo, i_mo + double precision :: mo_two_e_integral + integer :: ii_ca, jj_ca + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca) +end + +double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo) + BEGIN_DOC + ! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition + ! + ! where i_ca, j_ca are in [1:n_core_inact_act_orb] + END_DOC + implicit none + integer, intent(in) :: i_ca, j_ca, j_mo, i_mo + double precision :: mo_two_e_integral + integer :: ii_ca, jj_ca + ii_ca = list_core_inact_act(i_ca) + jj_ca = list_core_inact_act(j_ca) + bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo) +end + diff --git a/src/casscf_cipsi/chol_garb.irp.f b/src/casscf_cipsi/chol_garb.irp.f new file mode 100644 index 00000000..c4a8fa59 --- /dev/null +++ b/src/casscf_cipsi/chol_garb.irp.f @@ -0,0 +1,34 @@ + +!!!!! 
FUNCTIONS THAT WORK BUT WHICH ARE USELESS AS THE ARRAYS CAN ALWAYS BE STORED +!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo) +! BEGIN_DOC +! ! function that computes (i_a j_a |k_a j_mo) with Cholesky decomposition +! ! +! ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE +! END_DOC +! implicit none +! integer, intent(in) :: i_a, j_a, k_a, i_mo +! integer :: ii_a, jj_a, kk_a +! double precision :: mo_two_e_integral +! ii_a = list_act(i_a) +! jj_a = list_act(j_a) +! kk_a = list_act(k_a) +! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo) +!end + +!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo) +! BEGIN_DOC +! ! function that computes (i_ca j_ca |k_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals +! ! +! ! where i_ca, j_ca, k_ca are in [1:n_core_inact_act_orb] +! END_DOC +! implicit none +! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo +! integer :: ii_ca, jj_ca, kk_ca +! double precision :: bielec_no_basis_chol +! ii_ca = list_act(i_ca) +! jj_ca = list_act(j_ca) +! kk_ca = list_act(k_ca) +! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo) +! +!end diff --git a/src/casscf_cipsi/gradient.irp.f b/src/casscf_cipsi/gradient.irp.f index a1c5e947..961d260d 100644 --- a/src/casscf_cipsi/gradient.irp.f +++ b/src/casscf_cipsi/gradient.irp.f @@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t) integer :: ii,tt,v,vv,x,y integer :: x3,y3 + double precision :: bielec_PQxx_no ii=list_core_inact(i) tt=list_act(t) diff --git a/src/casscf_cipsi/hessian.irp.f b/src/casscf_cipsi/hessian.irp.f index 458c6aa6..9a7a9031 100644 --- a/src/casscf_cipsi/hessian.irp.f +++ b/src/casscf_cipsi/hessian.irp.f @@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u) implicit none integer :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj real*8 :: term,t2 + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) tt=list_act(t) @@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a) implicit none integer :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ! 
it/ja ii=list_core_inact(i) @@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a) implicit none integer :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) tt=list_act(t) @@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b) implicit none integer :: i,a,j,b,ii,aa,jj,bb real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b) implicit none integer :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b) integer :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y integer :: v3,x3 real*8 :: term,t1,t2,t3 + double precision :: bielec_pqxx_no,bielec_pxxq_no tt=list_act(t) aa=list_virt(a) diff --git a/src/casscf_cipsi/mcscf_fock.irp.f b/src/casscf_cipsi/mcscf_fock.irp.f index 0f4b7a99..82b710a7 100644 --- a/src/casscf_cipsi/mcscf_fock.irp.f +++ b/src/casscf_cipsi/mcscf_fock.irp.f @@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no do q=1,mo_num do p=1,mo_num @@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no Fapq = 0.d0 diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f new file mode 100644 index 00000000..bcce7cf7 --- /dev/null +++ b/src/casscf_cipsi/test_chol.irp.f @@ -0,0 +1,116 @@ +program test_chol + implicit none + read_wf= .True. + touch read_wf +! call routine_bielec_PxxQ_no +! call routine_bielecCI_no +! call test_bielec_PxxQ_chol +! call test_bielecCI + +end + +subroutine routine_bielec_PQxx_no + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PQxx_no + + accu = 0.d0 + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do i_mo = 1, mo_num + do j_mo = 1, mo_num + exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) + new = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) + error = dabs(exact-new) + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + accu += error + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine routine_bielec_PxxQ_no_array + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PxxQ_no + + accu = 0.d0 + do i_mo = 1, mo_num + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do j_mo = 1, mo_num + exact = bielec_PxxQ_no_array(j_mo, j_core_inact, i_core_inact,i_mo) +! 
new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) + new = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + implicit none + integer :: i_mo, j_mo, i_ca, j_ca + double precision :: exact, new, error, accu + double precision :: bielec_PQxx + + accu = 0.d0 + do j_ca = 1, n_core_inact_act_orb + do i_ca = 1, n_core_inact_act_orb + do j_mo = 1, mo_num + do i_mo = 1, mo_num + exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca) + new = bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo) + implicit none + integer :: i_mo, i_ca, j_ca, j_mo + double precision :: exact, new, error, accu + double precision :: bielec_PxxQ + accu = 0.d0 + do j_mo = 1, mo_num + do j_ca = 1, n_core_inact_act_orb + do i_ca =1, n_core_inact_act_orb + do i_mo = 1, mo_num + exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo) + new = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end diff --git a/src/casscf_cipsi/tot_en.irp.f b/src/casscf_cipsi/tot_en.irp.f index 1d70e087..37ceac05 100644 --- a/src/casscf_cipsi/tot_en.irp.f +++ b/src/casscf_cipsi/tot_en.irp.f @@ -8,6 +8,7 @@ implicit none integer :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3 real*8 :: e_one_all,e_two_all + double precision :: bielec_PQxx,bielec_PxxQ e_one_all=0.D0 e_two_all=0.D0 do i=1,n_core_inact_orb diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 555a2552..d8131a9c 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -9,9 +10,19 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo + + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + + type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov + type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -20,7 +31,7 @@ subroutine run_ccsd_space_orb call 
set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -51,11 +62,77 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_ov, d_cc_space_f_ov) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) + call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) + call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO) + call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) + call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) + call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO) + + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) + call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) + call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo) + call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) + call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) + call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) + call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo) + +! FREE cc_space_v_voov +! FREE cc_space_v_ovov +! FREE cc_space_v_oovo +! FREE cc_space_v_oovv +! FREE cc_space_v_vooo +! FREE cc_space_v_oooo +! FREE cc_space_v_vvoo +! FREE cc_space_v_ovvo +! FREE cc_space_v_ovoo + + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -97,14 +174,22 @@ subroutine run_ccsd_space_orb endif ! 
Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -120,43 +205,45 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2, max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! 
Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -181,8 +268,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -191,210 +278,198 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) + + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) - double precision :: e_t + double precision :: e_t, e_t_err e_t = 0.d0 if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New + e_t = uncorr_energy + energy ! For print in (T) call + e_t_err = 0.d0 + print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - e_t = uncorr_energy + energy ! 
For print in next call - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & - ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & + ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) print*,'Time: ',tb-ta, ' s' print*,'' - write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' + write(*,'(A15,F18.12,A7,F18.12)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha +/- ', e_t_err + write(*,'(A15,F18.12,A7,F18.12)') ' E(T) = ', e_t, ' Ha +/- ', e_t_err + write(*,'(A15,F18.12,A7,F18.12)') ' Correlation = ', energy + e_t, ' Ha +/- ', e_t_err print*,'' endif call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif + + call gpu_deallocate(d_cc_space_v_oovv) + call gpu_deallocate(d_cc_space_v_voov) + call gpu_deallocate(d_cc_space_v_ovov) + call gpu_deallocate(d_cc_space_v_oovo) + call gpu_deallocate(d_cc_space_v_ovvo) + call gpu_deallocate(d_cc_space_v_vooo) + call gpu_deallocate(d_cc_space_v_oooo) + call gpu_deallocate(d_cc_space_v_vvoo) + call gpu_deallocate(d_cc_space_v_ovoo) + + call gpu_deallocate(d_cc_space_f_oo) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_f_ov) + call gpu_deallocate(d_cc_space_f_vv) + + call gpu_deallocate(t1) + call gpu_deallocate(t2) end ! Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! 
internal integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) -end + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e) -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - implicit none + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + energy = energy + 2.d0*e end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do b=1,nV + call gpu_stream_create(stream(b)) + call gpu_set_stream(blas_handle,stream(b)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f(1,1), nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! 
internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..6f65ea79 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,989 +1,789 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -! Tau - -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) - - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) - - ! internal - integer :: i,j,a,b - - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -end - -! R1 - -subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) - - ! out - double precision, intent(out) :: r1(nO,nV), max_r1 - - ! 
internal - integer :: u,i,j,beta,a,b - - !$omp parallel & - !$omp shared(nO,nV,r1,cc_space_f_ov) & - !$omp private(u,beta) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - r1(u,beta) = cc_space_f_ov(u,beta) - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: X_oo(:,:) - allocate(X_oo(nO,nO)) - call dgemm('N','N', nO, nO, nV, & - -2d0, t1 , size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, X_oo , size(X_oo,1)) - - call dgemm('T','N', nO, nV, nO, & - 1d0, X_oo, size(X_oo,2), & - t1 , size(t1,1), & - 1d0, r1 , size(r1,1)) - deallocate(X_oo) - - call dgemm('N','N', nO, nV, nV, & - 1d0, t1 , size(t1,1), & - H_vv, size(H_vv,1), & - 1d0, r1 , size(r1,1)) - - call dgemm('N','N', nO, nV, nO, & - -1d0, H_oo, size(H_oo,1), & - t1 , size(t1,1), & - 1d0, r1, size(r1,1)) - - double precision, allocatable :: X_voov(:,:,:,:) - allocate(X_voov(nV, nO, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,X_voov,t2,t1) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do i = 1, nO - do a = 1, nV - X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nV*nO, nO*nV, & - 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & - H_vo , 1, & - 1d0, r1 , 1) - - deallocate(X_voov) - - double precision, allocatable :: X_ovov(:,:,:,:) - allocate(X_ovov(nO, nV, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nv - do i = 1, nO - X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nO*nV, nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - t1 , 1, & - 1d0, r1 , 1) - - deallocate(X_ovov) - - integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) - block_size = 16 - allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) - - !$omp parallel & - !$omp private(u,i,b,a) & - !$omp default(shared) - !$omp do - do u = 1, nO - do i = 1, nO - do b = 1, nV - do a = 1, nV - T_vvoo(a,b,i,u) = tau(i,u,a,b) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - do iblock = 1, nV, block_size - nVmax = min(block_size,nV-iblock+1) - - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & - cc_space_v_vo_chol , cholesky_mo_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - 0.d0, W_vvov_tmp, nV*nO) - - !$omp parallel & - !$omp private(b,i,a,beta) & - !$omp default(shared) - do beta = 1, nVmax - do i = 1, nO - !$omp do - do b = 1, nV - do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) - enddo - enddo - !$omp end do nowait - enddo - enddo - !$omp barrier - !$omp end parallel - - call dgemm('T','N',nO,nVmax,nO*nV*nV, & - 1d0, T_vvoo, nV*nV*nO, & - W_vvov, nO*nV*nV, & - 1d0, r1(1,iblock), nO) - enddo - - deallocate(W_vvov,T_vvoo) - - - double precision, allocatable :: W_oovo(:,:,:,:) - allocate(W_oovo(nO,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) & - !$omp private(u,a,i,j) & - !$omp default(none) - do u = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do i = 1, nO -! 
W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) - W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - call dgemm('T','N', nO, nV, nO*nO*nV, & - -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & - tau , size(tau,1) * size(tau,2) * size(tau,3), & - 1d0, r1 , size(r1,1)) - - deallocate(W_oovo) - - max_r1 = 0d0 - do a = 1, nV - do i = 1, nO - max_r1 = max(dabs(r1(i,a)), max_r1) - enddo - enddo - - ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r1) & - !$omp private(a,i) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - r1(i,a) = -r1(i,a) - enddo - enddo - !$omp end do - !$omp end parallel - -end - ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + + type(gpu_blas) :: blas + + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) + + !$OMP DO + do u=1,nO + call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + !$OMP END DO + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & + 1.d0, H_oo%f(1,1), nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) end ! 
H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vv%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) end ! 
H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo integer :: a,b,i,j,u,k - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + H_vo%f(1,1), nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo + call gpu_stream_create(stream(i)) enddo - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) do i=1,nO do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) enddo enddo - deallocate(tmp) + !$OMP END PARALLEL DO - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vo%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) +end + +! R1 + +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + use gpu + implicit none + + ! 
in + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo + + ! out + type(gpu_double2), intent(out) :: r1 + double precision, intent(out) :: max_r1 + + ! internal + integer :: u,i,j,beta,a,b + + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + type(gpu_double2) :: X_oo + call gpu_allocate(X_oo,nO,nO) + + call gpu_copy(d_cc_space_f_ov, r1) + + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + H_vv%f(1,1), size(H_vv%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, & + -1d0, H_oo%f(1,1), size(H_oo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_set_stream(blas_handle, stream(nV)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, & + -2d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 0d0, X_oo%f(1,1), size(X_oo%f,1)) + + call gpu_synchronize() + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, & + 1d0, X_oo%f(1,1), size(X_oo%f,2), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + + + type(gpu_double4) :: X_voov + call gpu_allocate(X_voov, nV, nO, nO, nV) + + do i=1,nO + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), & + nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO) + enddo + enddo + + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), & + nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV) + enddo + + call gpu_synchronize() + call gpu_deallocate(X_oo) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & + 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + H_vo%f(1,1) , 1, & + 1d0, r1%f(1,1) , 1) + + type(gpu_double4) :: X_ovov + call gpu_allocate(X_ovov, nO, nV, nO, nV) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do u=1,nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), & + nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + t1%f(1,1), 1, & + 1d0, r1%f(1,1), 1) + + + integer :: iblock, block_size, nVmax + type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo + + block_size = 16 + call gpu_allocate(T_vvoo, nV,nV,nO,nO) + + call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), & + nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV) + + call gpu_allocate(W_vvov,nV, nV,nO,block_size) + call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size) + + do iblock = 1, nV, block_size + nVmax = min(block_size,nV-iblock+1) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + 0.d0, 
W_vvov_tmp%f(1,1,1,1), nV*nO) + + call gpu_synchronize() + do b=1,nV + call gpu_set_stream(blas_handle, stream(b)) + do i=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax, 2.d0, W_vvov_tmp%f(1,i,b,1), & + nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO) + enddo + enddo + + call gpu_synchronize() + + do beta = 1, nVmax + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO, 1.d0, W_vvov%f(1,1,1,beta), & + nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV) + enddo + call gpu_synchronize() + + call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, & + W_vvov%f(1,1,1,1), nO*nV*nV, & + 1d0, r1%f(1,iblock), nO) + enddo + + call gpu_deallocate(X_ovov) + + type(gpu_double4) :: W_oovo + call gpu_allocate(W_oovo, nO,nO,nV,nO) + + do u = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 2.d0, d_cc_space_v_oovo%f(1,1,a,u), & + nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + + call gpu_deallocate(W_vvov) + call gpu_deallocate(T_vvoo) + + ! Change the sign for consistency with the code in spin orbitals + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, & + 1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), & + -1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_synchronize() + call gpu_deallocate(W_oovo) + + max_r1 = 0d0 + do a = 1, nV + do i = 1, nO + max_r1 = max(dabs(r1%f(i,a)), max_r1) + enddo + enddo + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo end ! R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) - +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2,max_r2) + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo + type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo + type(gpu_double4), intent(in) :: d_cc_space_v_ovov + type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + double precision, intent(out) :: max_r2 + type(gpu_double4), intent(out) :: r2 ! internal integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + type(gpu_stream) :: stream(nV) + call set_multiple_levels_omp(.False.) 
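! ------------------------------------------------------------------------
! Editor's note (illustration only, not part of the patch): all of the
! *_chol routines in this diff avoid storing the full (pq|rs) tensor and
! instead contract Cholesky vectors on the fly, using the identity
!     (pq|rs) = sum_k L(k,p,q) * L(k,r,s)
! which bielec_no_basis applies one element at a time and the blocked
! dgemm calls above apply in bulk.  A minimal, self-contained Fortran
! sketch of that identity on random symmetric data (hypothetical names,
! standalone, not tied to the gpu module):
program cholesky_rebuild_sketch
  implicit none
  integer, parameter :: n_chol = 64, n_mo = 4
  double precision   :: L(n_chol, n_mo, n_mo), integral
  integer            :: k, p, q

  call random_number(L)
  ! Make L symmetric in its two orbital indices, as the Cholesky vectors
  ! of the (symmetric) electron-repulsion tensor are.
  do q = 1, n_mo
    do p = q+1, n_mo
      L(:,p,q) = L(:,q,p)
    enddo
  enddo

  ! Rebuild the single element (1 2|3 4): the same contraction performed
  ! element-wise by bielec_no_basis and in bulk by the dgemm calls above.
  integral = 0.d0
  do k = 1, n_chol
    integral = integral + L(k,1,2) * L(k,3,4)
  enddo
  print *, '(12|34) rebuilt from Cholesky vectors:', integral
end program cholesky_rebuild_sketch
! ------------------------------------------------------------------------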
- !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_v_oovv, r2) - double precision, allocatable :: A1(:,:,:,:) - allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) + type(gpu_double4) :: A1 + call gpu_allocate(A1,nO,nO,nO,nO) + call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + 1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2)) + + call gpu_deallocate(A1) - deallocate(A1) integer :: block_size, iblock, k block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + type(gpu_double3) :: tmp_cc, B1, tmpB1 + type(gpu_double2) :: tmp_cc2 - allocate(tmp_cc(cholesky_mo_num,nV,nV)) - call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) + call gpu_synchronize() + + type(gpu_blas) :: blas + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas) + call gpu_allocate(B1,nV,nV,block_size) + call gpu_allocate(tmpB1,nV,block_size,nV) + call gpu_allocate(tmp_cc2,cholesky_mo_num,nV) + + call gpu_blas_create(blas) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), & + cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num) do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & - 0.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, & + 0.d0, tmpB1%f(1,1,1), nV*block_size) - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - 1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - tmp_cc2, cholesky_mo_num, & - 1.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + 1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + tmp_cc2%f(1,1), cholesky_mo_num, & + 1.d0, tmpB1%f(1,1,1), nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) - do b = 1, nV - do a = 1, nV - B1(a,b,beta-iblock+1) = 
tmpB1(a,beta-iblock+1,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), & + nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV) enddo - call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & - 1d0, tau, size(tau,1) * size(tau,2), & - B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), & + 1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2)) enddo enddo !$OMP ENDDO - deallocate(B1, tmpB1, tmp_cc2) + call gpu_blas_destroy(blas) + + call gpu_deallocate(B1) + call gpu_deallocate(tmpB1) + call gpu_deallocate(tmp_cc2) !$OMP END PARALLEL - deallocate(tmp_cc) + call gpu_deallocate(tmp_cc) + + type(gpu_double4) :: X_oovv + call gpu_allocate(X_oovv,nO,nO,nV,nV) + call gpu_copy(t2,X_oovv) + + type(gpu_double2) :: g_occ, g_vir + call gpu_allocate(g_vir,nV,nV) + call gpu_allocate(g_occ,nO,nO) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + + type(gpu_double4) :: Y_oovv + call gpu_allocate(Y_oovv,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), & + g_vir%f(1,1), size(g_vir%f,1), & + 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + -1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, & + -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,t2,X_oovv) & - !$omp private(u,v,gam,a) & - !$omp default(none) - !$omp do - do a = 1, nV + call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), & + nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO) + + call gpu_synchronize() + call gpu_deallocate(X_oovv) + + call gpu_deallocate(g_vir) + call gpu_deallocate(g_occ) + + type(gpu_double4) :: X_vovo, Y_oovo + call gpu_allocate(X_vovo,nV,nO,nV,nO) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) + enddo + enddo + + do i = 1, nO do gam = 1, nV - do v = 1, nO - do u = 1, nO - X_oovv(u,v,gam,a) = t2(u,v,gam,a) - enddo - enddo + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), & + nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV) enddo enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: g_vir(:,:) - allocate(g_vir(nV,nV)) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - - double precision, allocatable :: Y_oovv(:,:,:,:) - allocate(Y_oovv(nO,nO,nV,nV)) - - 
call dgemm('N','N',nO*nO*nV,nV,nV, & - 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & - g_vir, size(g_vir,1), & - 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) - deallocate(g_vir) - deallocate(X_oovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) - enddo - enddo - enddo + do a=1,nV + call gpu_stream_destroy(stream(a)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(X_oovv) - - double precision, allocatable :: X_vovv(:,:,:,:) - - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) + call gpu_allocate(Y_oovo,nO,nO,nV,nO) + !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv) + call gpu_blas_create(blas) + type(gpu_double4) :: X_vovv + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + !$OMP DO do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & + cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + call gpu_dgemm(blas, 'N','N', nO, & + nO*nV*min(block_size, nV-iblock+1),nV, & + 1.d0, t1%f(1,1) , size(t1%f,1), & + X_vovv%f(1,1,1,1), size(X_vovv%f,1), & + 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + call gpu_deallocate(X_vovv) + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + X_vovo%f(1,1,1,1), size(X_vovo%f,1), & + 0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, & + -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_synchronize() + call gpu_deallocate(X_vovo) + call gpu_deallocate(Y_oovo) + + do a=1,nV + call gpu_stream_create(stream(a)) enddo - deallocate(X_vovv) - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) - 
enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + enddo + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_mo_num,nO,nV)) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & - 0.d0, tcc2, cholesky_mo_num*nV) - call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & - 0.d0, tcc, cholesky_mo_num*nO) + call gpu_synchronize() + call gpu_deallocate(Y_oovv) - call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & - tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & - X_ovvo, nO*nV) + type(gpu_double4) :: X_ovvo + type(gpu_double3) :: tcc, tcc2 + call gpu_allocate(tcc2,cholesky_mo_num,nV,nO) + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(tcc,cholesky_mo_num,nO,nV) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, & + 0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, & + 0.d0, tcc%f(1,1,1), cholesky_mo_num*nO) + + call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, & + X_ovvo%f(1,1,1,1), nO*nV) + + call gpu_synchronize() - deallocate(tcc, tcc2) - !$omp parallel & - !$omp shared(nO,nV,r2,X_ovvo) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), & + nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) - enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) - enddo - enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize + call gpu_deallocate(tcc) + call gpu_deallocate(tcc2) + call gpu_deallocate(X_ovvo) + + + type(gpu_double4) :: J1, K1 + type(gpu_double4) :: Y_voov, Z_ovov + + + call gpu_allocate(J1,nO,nV,nV,nO) + call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, & + d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + + call gpu_allocate(K1,nO,nV,nO,nV) + call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, & + d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(Y_voov,nV,nO,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i=1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + 
nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), & + nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_ovvo) - !----- + call gpu_allocate(Z_ovov,nO,nV,nO,nV) - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_synchronize() + call gpu_deallocate(J1) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo + call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + call gpu_deallocate(Y_voov) + call gpu_deallocate(X_ovvo) + + type(gpu_double4) :: Y_ovov, X_ovov + call gpu_allocate(X_ovov,nO,nV,nO,nV) + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), & + nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), & + nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) + call gpu_synchronize() - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo + call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & + -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), & + nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), & + nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO) enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - deallocate(Y_oovo) 
- - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) - enddo - enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) + call gpu_deallocate(K1) - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) + call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(J1) + call gpu_synchronize() - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) - - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) - - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) - - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) - - !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do 
gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(Z_ovov) - - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(K1) - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - - deallocate(X_ovov,Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) + call gpu_deallocate(X_ovov) + call gpu_deallocate(Y_ovov) ! Change the sign for consistency with the code in spin orbitals + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO) + enddo + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO) + enddo + enddo + + call gpu_deallocate(Z_ovov) max_r2 = 0d0 !$omp parallel & @@ -996,8 +796,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do a = 1, nV do j = 1, nO do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) + max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo enddo @@ -1012,447 +811,458 @@ end ! A1 -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - +subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, tau + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo + type(gpu_double4), intent(out) :: A1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) + type(gpu_double4) :: Y_oooo + call gpu_allocate(Y_oooo,nO,nO,nO,nO) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! 
A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1)) - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) - enddo - enddo - enddo + type(gpu_stream) :: stream(nO) + + do i=1, nO + call gpu_stream_create(stream(i)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oooo) + call gpu_synchronize() + + do j = 1, nO + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), & + nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO) + enddo + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), & + nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + do i=1, nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) + call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2)) + call gpu_synchronize() end ! 
g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) + type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol + type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo + type(gpu_double2), intent(out) :: g_occ - g_occ = H_oo + call gpu_copy(H_oo, g_occ) - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 1d0, g_occ, size(g_occ,1)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 1d0, g_occ%f(1,1), size(g_occ%f,1)) - double precision, allocatable :: X(:) - allocate(X(cholesky_mo_num)) - call dgemv('N',cholesky_mo_num,nO*nV,2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, 1, 0.d0, X, 1) + type(gpu_double1) :: X + call gpu_allocate(X,cholesky_mo_num) - call dgemv('T',cholesky_mo_num,nO*nO,1.d0, & - cc_space_v_oo_chol, cholesky_mo_num, & - X, 1, 1.d0, g_occ, 1) - deallocate(X) + call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), 1, 0.d0, X%f(1), 1) - call dgemv('T',nO*nV,nO*nO,-1.d0, & - cc_space_v_ovoo, nO*nV, & - t1, 1, 1.d0, g_occ, 1) + call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, & + X%f(1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, & + d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, & + t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_synchronize() + call gpu_deallocate(X) end ! 
g_vir -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) + type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2 + type(gpu_double2), intent(out) :: g_vir integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - call dgemm('N','N',nV,nV,nO, & - -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & - t1 , size(t1,1), & - 0d0, g_vir, size(g_vir,1)) + type(gpu_stream) :: stream(max(nO,4)) - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & - cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - g_vir, nV*nV) - deallocate(tmp_k) - - allocate(tmp_vo(cholesky_mo_num,nV,nO)) - call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - - allocate(tmp_vo2(cholesky_mo_num,nO,nV)) - do beta=1,nV - do i=1,nO - do k=1,cholesky_mo_num - tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) - enddo - enddo - enddo - deallocate(tmp_vo) - - do beta = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) - enddo + do i=1,max(nO,4) + call gpu_stream_create(stream(i)) enddo - call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, & - tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, & + -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 0d0, g_vir%f(1,1), size(g_vir%f,1)) + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp_vo, tmp_vo2 + + call gpu_allocate(tmp_k,cholesky_mo_num) + + call gpu_set_stream(blas_handle, stream(2)) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + g_vir%f(1,1), nV*nV) + + call gpu_set_stream(blas_handle, stream(3)) + call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO) + + call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV) + + call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_k) + + do i=1,nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), & + cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,max(nO,4) + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp_vo) + + call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), & + 
nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV) + + call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_vo2) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: J1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) - allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: X_ovoo, Y_ovov - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_allocate(X_ovoo,nO,nV,nO,nO) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + type(gpu_stream) :: stream(nV) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(X_ovoo) - - double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) - - !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & - double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) - allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & - !$omp private(i,beta,a,u,b,j) & - !$omp default(none) - !$omp do - do b = 1, nV - do j = 1, nO - do beta = 1, nV - do u = 1, nO - Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 
1, nV - X_voov(a,i,j,b) = v_vvoo(a,b,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','T',nO*nV,nV*nO,nO*nV, & - -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - X_voov, size(X_voov,1) * size(X_voov,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) - deallocate(X_voov) - - double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) - do j = 1, nO - !$omp do - do b = 1, nV - do i = 1, nO - do a = 1, nV - Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) - enddo - enddo - enddo - !$omp end do nowait + call gpu_stream_create(stream(i)) enddo do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,b,j) = t2(u,j,beta,b) - enddo - enddo + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), & + nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - call dgemm('N','T',nO*nV,nV*nO,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3)) + + + call gpu_copy(v_ovvo, J1) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo) & - !$omp private(i,beta,a,u) & - !$omp default(none) do i = 1, nO - !$omp do do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - deallocate(X_ovvo,Z_ovvo,Y_ovov) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO) + call gpu_allocate(J1_tmp,nV,nO,nV,nO) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + type(gpu_double4) :: J1_tmp + type(gpu_double3) :: tmp_cc + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, & + t1%f(1,1), nO, & + 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, & + 0.d0, J1_tmp%f(1,1,1,1), nV*nO) + + + call gpu_deallocate(X_ovoo) + + call gpu_synchronize() + call gpu_deallocate(tmp_cc) + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, 
J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_voov, Z_ovvo + + call gpu_allocate(X_voov,nV,nO,nO,nV) + call gpu_allocate(Z_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO) + enddo + enddo + + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), & + nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize() + call gpu_deallocate(J1_tmp) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_ovvo, Y_vovo + call gpu_allocate(Y_vovo,nV,nO,nV,nO) + + do j = 1, nO + do i = 1, nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), & + nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV) + enddo + enddo + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), & + nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_deallocate(Y_ovov) + call gpu_deallocate(X_ovvo) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Z_ovvo) end ! 
K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & + d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) - double precision, intent(out) :: K1(nO, nV, nO, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: K1 - double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + type(gpu_double4) :: X, Y, Z integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - !$omp do - do beta = 1, nV + call gpu_copy(v_ovov, K1) + + type(gpu_stream) :: stream(nV) + do a = 1, nV + call gpu_stream_create(stream(a)) + enddo + + call gpu_allocate(X,nV,nO,nV,nO) + call gpu_allocate(Y,nO,nV,nV,nO) + + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = v_ovov(u,a,i,beta) - enddo - enddo + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), & + nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV) enddo enddo - !$omp end do nowait - do i = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do b = 1, nV - X(b,j,a,i) = - v_vvoo(b,a,i,j) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_set_stream(blas_handle, gpu_default_stream) - do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - !$omp end do - enddo - !$omp end parallel + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3)) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & - t1 , size(t1,1), & - 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + type(gpu_double4) :: K1tmp + type(gpu_double3) :: t1v - double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) + call gpu_allocate(t1v,cholesky_mo_num,nO,nO) - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & - t1v, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, & + t1v%f(1,1,1), cholesky_mo_num*nO) - call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & - t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & - K1tmp, nO*nO) + call gpu_allocate(K1tmp,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, & + 
K1tmp%f(1,1,1,1), nO*nO) + + call gpu_allocate(Z,nO,nV,nV,nO) + call gpu_synchronize() - deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) - call dgemm('N','N',nV*nO,nO*nV,nV*nO, & - 1d0, Y, size(Y,1) * size(Y,2), & - X, size(X,1) * size(X,2), & - 0d0, Z, size(Z,1) * size(Z,2)) + call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), & + X%f(1,1,1,1), size(X%f,1) * size(X%f,2), & + 0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2)) - !$omp parallel & - !$omp shared(nO,nV,K1,Z,K1tmp) & - !$omp private(i,beta,a,u) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) - enddo - enddo + call gpu_synchronize() + call gpu_deallocate(t1v) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(K1tmp,X,Y,Z) + call gpu_deallocate(X) + call gpu_deallocate(Y) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(K1tmp) + call gpu_deallocate(Z) end diff --git a/src/ccsd/ccsd_t_space_orb_stoch.irp.f b/src/ccsd/ccsd_t_space_orb_stoch.irp.f index 1093c59d..851b6a9f 100644 --- a/src/ccsd/ccsd_t_space_orb_stoch.irp.f +++ b/src/ccsd/ccsd_t_space_orb_stoch.irp.f @@ -1,5 +1,5 @@ ! Main -subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy) +subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energy,t_error) implicit none @@ -7,7 +7,7 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ double precision, intent(in) :: t1(nO,nV), f_o(nO), f_v(nV) double precision, intent(in) :: t2(nO,nO,nV,nV) double precision, intent(in) :: v_vvvo(nV,nV,nV,nO), v_vvoo(nV,nV,nO,nO), v_vooo(nV,nO,nO,nO) - double precision, intent(inout) :: energy + double precision, intent(inout) :: energy, t_error double precision, allocatable :: X_vovv(:,:,:,:), X_ooov(:,:,:,:), X_oovv(:,:,:,:) double precision, allocatable :: T_voov(:,:,:,:), T_oovv(:,:,:,:) @@ -220,8 +220,10 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ call set_multiple_levels_omp(.False.) 
call wall_time(t00) imin = 1_8 - !$OMP PARALLEL & - !$OMP PRIVATE(ieta,eta,a,b,c,kiter,isample) & + t_error = huge(1.d0) + + !$OMP PARALLEL & + !$OMP PRIVATE(ieta,eta,a,b,c,kiter,isample) & !$OMP DEFAULT(SHARED) NUM_THREADS(nthreads_pt2) do kiter=1,Nabc @@ -328,15 +330,23 @@ subroutine ccsd_par_t_space_stoch(nO,nV,t1,t2,f_o,f_v,v_vvvo,v_vvoo,v_vooo,energ if (norm > 0.d0) then energy_stoch = ET / norm variance = ET2 / norm - energy_stoch*energy_stoch + if (norm > 1.d0) then + t_error = dsqrt(variance/(norm-1.d0)) + else + t_error = dsqrt(variance) + endif endif energy = energy_det + energy_stoch - print '('' '',F20.8, '' '', ES12.4,'' '', F8.2,'' '')', eccsd+energy, dsqrt(variance/(norm-1.d0)), 100.*real(Ncomputed)/real(Nabc) + print '('' '',F20.8, '' '', ES12.4,'' '', F8.2,'' '')', eccsd+energy, t_error, 100.*real(Ncomputed)/real(Nabc) + endif !$OMP END MASTER + if (t_error < cc_par_t_stop) exit if (imin > Nabc) exit enddo + !$OMP TASKWAIT !$OMP END PARALLEL print '(A)', ' ======================= ============== ========== ' diff --git a/src/cipsi/selection.irp.f b/src/cipsi/selection.irp.f index 0281a1d4..517220a8 100644 --- a/src/cipsi/selection.irp.f +++ b/src/cipsi/selection.irp.f @@ -178,7 +178,7 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc integer(bit_kind), allocatable :: minilist(:, :, :), fullminilist(:, :, :) logical, allocatable :: banned(:,:,:), bannedOrb(:,:) double precision, allocatable :: coef_fullminilist_rev(:,:) - double precision, allocatable :: mat(:,:,:) + double precision, allocatable :: mat(:,:,:), hij_cache(:,:,:) PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique @@ -205,7 +205,7 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc ! Removed to avoid introducing determinants already presents in the wf !double precision, parameter :: norm_thr = 1.d-16 - allocate (indices(N_det), & + allocate (indices(N_det), hij_cache(mo_num,mo_num,2), & exc_degree(max(N_det_alpha_unique,N_det_beta_unique))) ! Pre-compute excitation degrees wrt alpha determinants @@ -511,11 +511,15 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc maskInd = maskInd + 1 if(mod(maskInd, csubset) == (subset-1)) then + call get_mo_two_e_integrals_ij(h2,h1,mo_num,hij_cache(1,1,1),mo_integrals_map) + if (sp /= 3) then ! 
AA or BB + call get_mo_two_e_integrals_ij(h1,h2,mo_num,hij_cache(1,1,2),mo_integrals_map) + endif call spot_isinwf(mask, fullminilist, i_generator, fullinteresting(0), banned, fullMatch, fullinteresting) if(fullMatch) cycle - call splash_pq(mask, sp, minilist, i_generator, interesting(0), bannedOrb, banned, mat, interesting) + call splash_pq(mask, sp, minilist, i_generator, interesting(0), bannedOrb, banned, mat, interesting, hij_cache) call fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_diag_tmp, E0, pt2_data, mat, buf) end if @@ -531,7 +535,7 @@ subroutine select_singles_and_doubles(i_generator, hole_mask, particle_mask, foc enddo enddo deallocate(preinteresting, prefullinteresting, interesting, fullinteresting) - deallocate(banned, bannedOrb,mat) + deallocate(banned, bannedOrb, mat, hij_cache) end subroutine BEGIN_TEMPLATE @@ -556,7 +560,7 @@ subroutine fill_buffer_$DOUBLE(i_generator, sp, h1, h2, bannedOrb, banned, fock_ double precision, external :: diag_H_mat_elem_fock double precision :: E_shift double precision :: s_weight(N_states,N_states) - PROVIDE dominant_dets_of_cfgs N_dominant_dets_of_cfgs + PROVIDE dominant_dets_of_cfgs N_dominant_dets_of_cfgs thresh_sym excitation_ref hf_bitmask elec_alpha_num do jstate=1,N_states do istate=1,N_states s_weight(istate,jstate) = dsqrt(selection_weight(istate)*selection_weight(jstate)) @@ -742,7 +746,7 @@ subroutine fill_buffer_$DOUBLE(i_generator, sp, h1, h2, bannedOrb, banned, fock_ do istate=1,N_states delta_E = E0(istate) - Hii + E_shift alpha_h_psi = mat(istate, p1, p2) - if (alpha_h_psi == 0.d0) cycle + if (dabs(alpha_h_psi) < mo_integrals_threshold) cycle val = alpha_h_psi + alpha_h_psi tmp = dsqrt(delta_E * delta_E + val * val) @@ -914,7 +918,7 @@ single ; do p1=1,mo_num ; enddo ; p2=1 ; ; .False. 
;; END_TEMPLATE -subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, interesting) +subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, interesting, hij_cache) use bitmasks implicit none BEGIN_DOC @@ -926,6 +930,7 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere integer, intent(in) :: sp, i_gen, N_sel integer, intent(in) :: interesting(0:N_sel) integer(bit_kind),intent(in) :: mask(N_int, 2), det(N_int, 2, N_sel) + double precision, intent(in) :: hij_cache(mo_num, mo_num, 2) logical, intent(inout) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num, 2) double precision, intent(inout) :: mat(N_states, mo_num, mo_num) @@ -995,18 +1000,36 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere if(nt == 4) then call get_d2(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i))) else if(nt == 3) then - call get_d1(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i))) + call get_d1(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)), hij_cache) else - call get_d0(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i))) + call get_d0(det(1,1,i), phasemask, bannedOrb, banned, mat, mask, h, p, sp, psi_selectors_coef_transp(1, interesting(i)), hij_cache) end if else if(nt == 4) then call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int) call bitstring_to_list_in_selection(mobMask(1,2), p(1,2), p(0,2), N_int) - call past_d2(banned, p, sp) + if(sp == 3) then + do j=1,p(0,2) + do ii=1,p(0,1) + banned(p(ii,1), p(j,2),1) = .true. + end do + end do + else + do ii=1,p(0, sp) + do j=1,ii-1 + banned(p(j,sp), p(ii,sp),1) = .true. + banned(p(ii,sp), p(j,sp),1) = .true. + end do + end do + end if else if(nt == 3) then call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int) call bitstring_to_list_in_selection(mobMask(1,2), p(1,2), p(0,2), N_int) - call past_d1(bannedOrb, p) + do ii = 1, p(0, 1) + bannedOrb(p(ii, 1), 1) = .true. + end do + do ii = 1, p(0, 2) + bannedOrb(p(ii, 2), 2) = .true. 
+ end do end if end do @@ -1037,6 +1060,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) integer :: bant bant = 1 + PROVIDE mo_integrals_threshold tip = p(0,1) * p(0,2) ma = sp @@ -1062,7 +1086,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p2 = p(i2, ma) hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2, p1, h1, h2) - if (hij == 0.d0) cycle + if (dabs(hij) < mo_integrals_threshold) cycle hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) @@ -1092,7 +1116,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p1 = p(turn2(i), 1) hij = mo_two_e_integral(p1, p2, h1, h2) - if (hij /= 0.d0) then + if (dabs(hij) > mo_integrals_threshold) then hij = hij * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) !DIR$ LOOP COUNT AVG(4) do k=1,N_states @@ -1120,7 +1144,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p1 = p(i1, ma) p2 = p(i2, ma) hij = mo_two_e_integral(p1, p2, h1, h2) - mo_two_e_integral(p2,p1, h1, h2) - if (hij == 0.d0) cycle + if (dabs(hij) < mo_integrals_threshold) cycle hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int) !DIR$ LOOP COUNT AVG(4) @@ -1142,7 +1166,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p2 = p(i, ma) hij = mo_two_e_integral(p1, p2, h1, h2) - if (hij == 0.d0) cycle + if (dabs(hij) < mo_integrals_threshold) cycle hij = hij * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int) if (puti < putj) then @@ -1179,7 +1203,7 @@ subroutine get_d2(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) end -subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) +subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs, hij_cache) use bitmasks implicit none @@ -1190,6 +1214,8 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) double precision, intent(in) :: coefs(N_states) double precision, intent(inout) :: mat(N_states, mo_num, mo_num) integer, intent(in) :: h(0:2,2), p(0:4,2), sp + double precision, intent(in) :: hij_cache(mo_num, mo_num, 2) + double precision, external :: get_phase_bi, mo_two_e_integral logical :: ok @@ -1201,12 +1227,12 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) integer, parameter :: turn3(2,3) = reshape((/2,3, 1,3, 1,2/), (/2,3/)) integer :: bant - double precision, allocatable :: hij_cache(:,:) + double precision, allocatable :: hij_cache1(:,:) double precision :: hij, tmp_row(N_states, mo_num), tmp_row2(N_states, mo_num) PROVIDE mo_integrals_map N_int allocate (lbanned(mo_num, 2)) - allocate (hij_cache(mo_num,2)) + allocate (hij_cache1(mo_num,2)) lbanned = bannedOrb do i=1, p(0,1) @@ -1230,13 +1256,11 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p1 = p(1,ma) p2 = p(2,ma) if(.not. 
bannedOrb(puti, mi)) then - call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map) - call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map) tmp_row = 0d0 do putj=1, hfix-1 if(lbanned(putj, ma)) cycle if(banned(putj, puti,bant)) cycle - hij = hij_cache(putj,1) - hij_cache(putj,2) + hij = hij_cache(hfix,putj,1) - hij_cache(putj,hfix,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int) !DIR$ LOOP COUNT AVG(4) @@ -1248,7 +1272,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) do putj=hfix+1, mo_num if(lbanned(putj, ma)) cycle if(banned(putj, puti,bant)) cycle - hij = hij_cache(putj,2) - hij_cache(putj,1) + hij = hij_cache(putj,hfix,1) - hij_cache(hfix,putj,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int) !DIR$ LOOP COUNT AVG(4) @@ -1274,15 +1298,15 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) pfix = p(1,mi) tmp_row = 0d0 tmp_row2 = 0d0 - call get_mo_two_e_integrals(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map) - call get_mo_two_e_integrals(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map) + call get_mo_two_e_integrals(hfix,pfix,p1,mo_num,hij_cache1(1,1),mo_integrals_map) + call get_mo_two_e_integrals(hfix,pfix,p2,mo_num,hij_cache1(1,2),mo_integrals_map) putj = p1 do puti=1,mo_num !HOT if(lbanned(puti,mi)) cycle !p1 fixed putj = p1 if(.not. banned(putj,puti,bant)) then - hij = hij_cache(puti,2) + hij = hij_cache1(puti,2) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int) !DIR$ LOOP COUNT AVG(4) @@ -1296,7 +1320,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) putj = p2 ! do puti=1,mo_num !HOT if(.not. 
banned(putj,puti,bant)) then - hij = hij_cache(puti,1) + hij = hij_cache1(puti,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int) do k=1,N_states @@ -1327,13 +1351,13 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) puti = p(i, ma) p1 = p(turn3(1,i), ma) p2 = p(turn3(2,i), ma) - call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map) - call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map) + call get_mo_two_e_integrals(hfix,p1,p2,mo_num,hij_cache1(1,1),mo_integrals_map) + call get_mo_two_e_integrals(hfix,p2,p1,mo_num,hij_cache1(1,2),mo_integrals_map) tmp_row = 0d0 do putj=1,hfix-1 if(banned(putj,puti,1)) cycle if(lbanned(putj,ma)) cycle - hij = hij_cache(putj,1) - hij_cache(putj,2) + hij = hij_cache1(putj,1) - hij_cache1(putj,2) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int) tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:) @@ -1342,7 +1366,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) do putj=hfix+1,mo_num if(banned(putj,puti,1)) cycle if(lbanned(putj,ma)) cycle - hij = hij_cache(putj,2) - hij_cache(putj,1) + hij = hij_cache1(putj,2) - hij_cache1(putj,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int) tmp_row(:,putj) = tmp_row(:,putj) + hij * coefs(:) @@ -1364,14 +1388,14 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p2 = p(2,ma) tmp_row = 0d0 tmp_row2 = 0d0 - call get_mo_two_e_integrals(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map) - call get_mo_two_e_integrals(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map) + call get_mo_two_e_integrals(hfix,p1,pfix,mo_num,hij_cache1(1,1),mo_integrals_map) + call get_mo_two_e_integrals(hfix,p2,pfix,mo_num,hij_cache1(1,2),mo_integrals_map) putj = p2 do puti=1,mo_num if(lbanned(puti,ma)) cycle putj = p2 if(.not. banned(puti,putj,1)) then - hij = hij_cache(puti,1) + hij = hij_cache1(puti,1) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int) !DIR$ LOOP COUNT AVG(4) @@ -1383,7 +1407,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) putj = p1 if(.not. banned(puti,putj,1)) then - hij = hij_cache(puti,2) + hij = hij_cache1(puti,2) if (hij /= 0.d0) then hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int) do k=1,N_states @@ -1408,7 +1432,7 @@ subroutine get_d1(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) enddo end if end if - deallocate(lbanned,hij_cache) + deallocate(lbanned,hij_cache1) !! 
MONO if(sp == 3) then @@ -1439,7 +1463,7 @@ end -subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) +subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs, hij_cache) use bitmasks implicit none @@ -1450,6 +1474,7 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) double precision, intent(in) :: coefs(N_states) double precision, intent(inout) :: mat(N_states, mo_num, mo_num) integer, intent(in) :: h(0:2,2), p(0:4,2), sp + double precision, intent(in) :: hij_cache(mo_num, mo_num, 2) integer :: i, j, k, s, h1, h2, p1, p2, puti, putj double precision :: hij, phase @@ -1457,16 +1482,14 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) logical :: ok integer, parameter :: bant=1 - double precision, allocatable :: hij_cache1(:), hij_cache2(:) - allocate (hij_cache1(mo_num),hij_cache2(mo_num)) + PROVIDE mo_integrals_threshold if(sp == 3) then ! AB h1 = p(1,1) h2 = p(1,2) do p1=1, mo_num if(bannedOrb(p1, 1)) cycle - call get_mo_two_e_integrals(p1,h2,h1,mo_num,hij_cache1,mo_integrals_map) do p2=1, mo_num if(bannedOrb(p2,2)) cycle if(banned(p1, p2, bant)) cycle ! rentable? @@ -1475,9 +1498,9 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) call i_h_j(gen, det, N_int, hij) else phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int) - hij = hij_cache1(p2) * phase + hij = hij_cache(p2,p1,1) * phase end if - if (hij == 0.d0) cycle + if (dabs(hij) < mo_integrals_threshold) cycle !DIR$ LOOP COUNT AVG(4) do k=1,N_states mat(k, p1, p2) = mat(k, p1, p2) + coefs(k) * hij ! HOTSPOT @@ -1490,18 +1513,16 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) p2 = p(2,sp) do puti=1, mo_num if (bannedOrb(puti, sp)) cycle - call get_mo_two_e_integrals(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map) - call get_mo_two_e_integrals(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map) do putj=puti+1, mo_num if(bannedOrb(putj, sp)) cycle if(banned(puti, putj, bant)) cycle ! rentable? if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int) call i_h_j(gen, det, N_int, hij) - if (hij == 0.d0) cycle + if (dabs(hij) < mo_integrals_threshold) cycle else - hij = hij_cache1(putj) - hij_cache2(putj) - if (hij == 0.d0) cycle + hij = hij_cache(putj,puti,1) - hij_cache(putj,puti,2) + if (dabs(hij) < mo_integrals_threshold) cycle hij = hij * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int) end if !DIR$ LOOP COUNT AVG(4) @@ -1512,50 +1533,9 @@ subroutine get_d0(gen, phasemask, bannedOrb, banned, mat, mask, h, p, sp, coefs) end do end if - deallocate(hij_cache1,hij_cache2) end -subroutine past_d1(bannedOrb, p) - use bitmasks - implicit none - - logical, intent(inout) :: bannedOrb(mo_num, 2) - integer, intent(in) :: p(0:4, 2) - integer :: i,s - - do s = 1, 2 - do i = 1, p(0, s) - bannedOrb(p(i, s), s) = .true. - end do - end do -end - - -subroutine past_d2(banned, p, sp) - use bitmasks - implicit none - - logical, intent(inout) :: banned(mo_num, mo_num) - integer, intent(in) :: p(0:4, 2), sp - integer :: i,j - - if(sp == 3) then - do j=1,p(0,2) - do i=1,p(0,1) - banned(p(i,1), p(j,2)) = .true. - end do - end do - else - do i=1,p(0, sp) - do j=1,i-1 - banned(p(j,sp), p(i,sp)) = .true. - banned(p(i,sp), p(j,sp)) = .true. 
- end do - end do - end if -end - subroutine spot_isinwf(mask, det, i_gen, N, banned, fullMatch, interesting) use bitmasks implicit none diff --git a/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f b/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f index 0dc939cb..f57b7f92 100644 --- a/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f +++ b/src/dav_general_mat/dav_diag_dressed_ext_rout.irp.f @@ -1,6 +1,5 @@ subroutine davidson_general_ext_rout_diag_dressed(u_in,H_jj,Dress_jj,energies,sze,N_st,N_st_diag_in,converged,hcalc) - use mmap_module implicit none BEGIN_DOC ! Generic Davidson diagonalization with ONE DIAGONAL DRESSING OPERATOR diff --git a/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f b/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f index 1a8269f4..c8848998 100644 --- a/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f +++ b/src/dav_general_mat/dav_diag_dressed_ext_rout_nonsym_B1space.irp.f @@ -3,8 +3,6 @@ subroutine davidson_general_diag_dressed_ext_rout_nonsym_b1space(u_in, H_jj, Dress_jj,energies, sze, N_st, N_st_diag_in, converged, hcalc) - use mmap_module - BEGIN_DOC ! Generic modified-Davidson diagonalization ! diff --git a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f index 24f4fa10..1ff6632c 100644 --- a/src/dav_general_mat/dav_double_dress_ext_rout.irp.f +++ b/src/dav_general_mat/dav_double_dress_ext_rout.irp.f @@ -1,5 +1,4 @@ subroutine dav_double_dressed(u_in,H_jj,Dress_jj,Dressing_vec,idx_dress,energies,sze,N_st,N_st_diag,converged,hcalc) - use mmap_module BEGIN_DOC ! Generic Davidson diagonalization with TWO DRESSING VECTORS ! diff --git a/src/dav_general_mat/dav_dressed_ext_rout.irp.f b/src/dav_general_mat/dav_dressed_ext_rout.irp.f index cedaaf0a..ca59a688 100644 --- a/src/dav_general_mat/dav_dressed_ext_rout.irp.f +++ b/src/dav_general_mat/dav_dressed_ext_rout.irp.f @@ -1,5 +1,4 @@ subroutine davidson_general_ext_rout_dressed(u_in,H_jj,energies,sze,N_st,N_st_diag,dressing_state,dressing_vec,idress,converged,hcalc) - use mmap_module implicit none BEGIN_DOC ! Davidson diagonalization. diff --git a/src/dav_general_mat/dav_ext_rout.irp.f b/src/dav_general_mat/dav_ext_rout.irp.f index deb7e3a9..ad60b2a8 100644 --- a/src/dav_general_mat/dav_ext_rout.irp.f +++ b/src/dav_general_mat/dav_ext_rout.irp.f @@ -1,6 +1,5 @@ subroutine davidson_general_ext_rout(u_in,H_jj,energies,sze,N_st,N_st_diag_in,converged,hcalc) - use mmap_module implicit none BEGIN_DOC ! Generic Davidson diagonalization diff --git a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f index d89aaadb..ca0a835e 100644 --- a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f +++ b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f @@ -3,8 +3,6 @@ subroutine davidson_general_ext_rout_nonsym_b1space(u_in, H_jj, energies, sze, N_st, N_st_diag_in, converged, hcalc) - use mmap_module - BEGIN_DOC ! Generic modified-Davidson diagonalization ! diff --git a/src/dav_general_mat/dav_general.irp.f b/src/dav_general_mat/dav_general.irp.f index 9940bf1e..a277d9ef 100644 --- a/src/dav_general_mat/dav_general.irp.f +++ b/src/dav_general_mat/dav_general.irp.f @@ -1,6 +1,6 @@ subroutine davidson_general(u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag_in,converged,h_mat) - use mmap_module +! use mmap_module implicit none BEGIN_DOC ! 
Davidson diagonalization with specific diagonal elements of the H matrix @@ -160,9 +160,9 @@ subroutine davidson_general(u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag_in,conv ! type(c_ptr) :: ptr_w, ptr_s ! integer :: fd_s, fd_w ! call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),& -! 8, fd_w, .False., ptr_w) +! 8, fd_w, .False., .True., ptr_w) ! call mmap(trim(ezfio_work_dir)//'davidson_s', (/int(sze,8),int(N_st_diag*itermax,8)/),& -! 4, fd_s, .False., ptr_s) +! 4, fd_s, .False., .True., ptr_s) ! call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/)) ! call c_f_pointer(ptr_s, s, (/sze,N_st_diag*itermax/)) ! else diff --git a/src/davidson/diagonalization_h_dressed.irp.f b/src/davidson/diagonalization_h_dressed.irp.f index b7179c18..15bf256d 100644 --- a/src/davidson/diagonalization_h_dressed.irp.f +++ b/src/davidson/diagonalization_h_dressed.irp.f @@ -228,7 +228,7 @@ subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_dia type(c_ptr) :: ptr_w, ptr_s integer :: fd_s, fd_w call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),& - 8, fd_w, .False., ptr_w) + 8, fd_w, .False., .True., ptr_w) call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/)) else allocate(W(sze,N_st_diag*itermax)) diff --git a/src/davidson/diagonalization_hcsf_dressed.irp.f b/src/davidson/diagonalization_hcsf_dressed.irp.f index fa8aff80..656dd1d9 100644 --- a/src/davidson/diagonalization_hcsf_dressed.irp.f +++ b/src/davidson/diagonalization_hcsf_dressed.irp.f @@ -229,7 +229,7 @@ subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N type(c_ptr) :: ptr_w, ptr_s integer :: fd_s, fd_w call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),& - 8, fd_w, .False., ptr_w) + 8, fd_w, .False., .True., ptr_w) call c_f_pointer(ptr_w, W_csf, (/sze_csf,N_st_diag*itermax/)) else allocate(W(sze,N_st_diag),W_csf(sze_csf,N_st_diag*itermax)) diff --git a/src/davidson/diagonalization_hs2_dressed.irp.f b/src/davidson/diagonalization_hs2_dressed.irp.f index fd967ecc..191e0021 100644 --- a/src/davidson/diagonalization_hs2_dressed.irp.f +++ b/src/davidson/diagonalization_hs2_dressed.irp.f @@ -270,9 +270,9 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ type(c_ptr) :: ptr_w, ptr_s integer :: fd_s, fd_w call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),& - 8, fd_w, .False., ptr_w) + 8, fd_w, .False., .True., ptr_w) call mmap(trim(ezfio_work_dir)//'davidson_s', (/int(sze,8),int(N_st_diag*itermax,8)/),& - 4, fd_s, .False., ptr_s) + 4, fd_s, .False., .True., ptr_s) call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/)) call c_f_pointer(ptr_s, s, (/sze,N_st_diag*itermax/)) else @@ -594,6 +594,13 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ state(l) = idx enddo + ! Check if all states are attributed. If not, exit and N_st_diag will be increased. + do l=1,N_st + if (state(l) == 0) then + return + endif + enddo + ! tmp array before setting state_ok ok = .False. do l = 1, N_st @@ -627,47 +634,6 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ ! Swapped eigenvectors prev_y = y -! if (state_following) then -! -! overlap = -1.d0 -! do k=1,shift2 -! do i=1,shift2 -! overlap(k,i) = dabs(y(k,i)) -! enddo -! enddo -! do k=1,N_st -! cmax = -1.d0 -! do i=1,N_st -! if (overlap(i,k) > cmax) then -! cmax = overlap(i,k) -! order(k) = i -! endif -! enddo -! do i=1,N_st_diag -! 
overlap(order(k),i) = -1.d0 -! enddo -! enddo -! overlap = y -! do k=1,N_st -! l = order(k) -! if (k /= l) then -! y(1:shift2,k) = overlap(1:shift2,l) -! endif -! enddo -! do k=1,N_st -! overlap(k,1) = lambda(k) -! overlap(k,2) = s2(k) -! enddo -! do k=1,N_st -! l = order(k) -! if (k /= l) then -! lambda(k) = overlap(l,1) -! s2(k) = overlap(l,2) -! endif -! enddo -! -! endif - ! Express eigenvectors of h in the determinant basis ! -------------------------------------------------- @@ -703,7 +669,7 @@ subroutine davidson_diag_hjj_sjj(dets_in,u_in,H_jj,s2_out,energies,dim_in,sze,N_ if ((itertot>1).and.(iter == 1)) then - !don't print + ! Don't print continue else write(*,'(1X,I3,1X,100(1X,F16.10,1X,F11.6,1X,ES11.3))') iter-1, to_print(1:3,1:N_st) diff --git a/src/davidson/diagonalization_nonsym_h_dressed.irp.f b/src/davidson/diagonalization_nonsym_h_dressed.irp.f index 96ca84ab..86df3a19 100644 --- a/src/davidson/diagonalization_nonsym_h_dressed.irp.f +++ b/src/davidson/diagonalization_nonsym_h_dressed.irp.f @@ -251,7 +251,7 @@ subroutine davidson_diag_nonsym_hjj(dets_in, u_in, H_jj, energies, dim_in, sze, type(c_ptr) :: ptr_w, ptr_s integer :: fd_s, fd_w call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),& - 8, fd_w, .False., ptr_w) + 8, fd_w, .False., .True., ptr_w) call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/)) else allocate(W(sze,N_st_diag*itermax)) diff --git a/src/davidson/diagonalize_ci.irp.f b/src/davidson/diagonalize_ci.irp.f index 59c8313a..6b852905 100644 --- a/src/davidson/diagonalize_ci.irp.f +++ b/src/davidson/diagonalize_ci.irp.f @@ -282,9 +282,8 @@ END_PROVIDER print*,' Within the ',N_det,'determinants selected' print*,' and the ',N_states_diag,'states requested' print*,' We did not find only states with S^2 values close to ',expected_s2 - print*,' We will then set the first N_states eigenvectors of the H matrix' - print*,' as the CI_eigenvectors' - print*,' You should consider more states and maybe ask for s2_eig to be .True. or just enlarge the CI space' + print*,' You should consider more states, or change s2_eig, or just enlarge the CI space' + print*,'!!!!!!!! WARNING !!!!!!!!!' print*,'' do j=1,min(N_states_diag,N_det) diff --git a/src/determinants/generate_cas_space.irp.f b/src/determinants/generate_cas_space.irp.f index 47a2ca30..05201c74 100644 --- a/src/determinants/generate_cas_space.irp.f +++ b/src/determinants/generate_cas_space.irp.f @@ -33,7 +33,7 @@ subroutine generate_cas_space print *, 'CAS(', n_alpha_act+n_beta_act, ', ', n_act_orb, ')' print *, '' - n_det_alpha_unique = binom_int(n_act_orb, n_alpha_act) + n_det_alpha_unique = int(binom_int(n_act_orb, n_alpha_act),4) TOUCH n_det_alpha_unique n = n_alpha_act @@ -56,7 +56,7 @@ subroutine generate_cas_space u = ior(t1,t2) enddo - n_det_beta_unique = binom_int(n_act_orb, n_beta_act) + n_det_beta_unique = int(binom_int(n_act_orb, n_beta_act),4) TOUCH n_det_beta_unique n = n_beta_act diff --git a/src/determinants/slater_rules_general.irp.f b/src/determinants/slater_rules_general.irp.f new file mode 100644 index 00000000..e987c846 --- /dev/null +++ b/src/determinants/slater_rules_general.irp.f @@ -0,0 +1,192 @@ +subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase) + use bitmasks + BEGIN_DOC +! returns the array, for each spin, of holes/particles between key_i and key_j +! +! 
with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j> + END_DOC + include 'utils/constants.include.F' + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_j(Nint,2),key_i(Nint,2) + integer, intent(out) :: holes_array(100,2),particles_array(100,2),degree_array(2) + double precision, intent(out) :: phase + integer :: ispin,k,i,pos + integer(bit_kind) :: key_hole, key_particle + integer(bit_kind) :: xorvec(N_int_max,2) + holes_array = -1 + particles_array = -1 + degree_array = 0 + do i = 1, N_int + xorvec(i,1) = xor( key_i(i,1), key_j(i,1)) + xorvec(i,2) = xor( key_i(i,2), key_j(i,2)) + degree_array(1) += popcnt(xorvec(i,1)) + degree_array(2) += popcnt(xorvec(i,2)) + enddo + degree_array(1) = shiftr(degree_array(1),1) + degree_array(2) = shiftr(degree_array(2),1) + + do ispin = 1, 2 + k = 1 + !!! GETTING THE HOLES + do i = 1, N_int + key_hole = iand(xorvec(i,ispin),key_i(i,ispin)) + do while(key_hole .ne.0_bit_kind) + pos = trailz(key_hole) + holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos + key_hole = ibclr(key_hole,pos) + k += 1 + if(k .gt.100)then + print*,'WARNING in get_excitation_general' + print*,'More than a 100-th excitation for spin ',ispin + print*,'stoping ...' + stop + endif + enddo + enddo + enddo + do ispin = 1, 2 + k = 1 + !!! GETTING THE PARTICLES + do i = 1, N_int + key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin)) + do while(key_particle .ne.0_bit_kind) + pos = trailz(key_particle) + particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos + key_particle = ibclr(key_particle,pos) + k += 1 + if(k .gt.100)then + print*,'WARNING in get_excitation_general ' + print*,'More than a 100-th excitation for spin ',ispin + print*,'stoping ...' + stop + endif + enddo + enddo + enddo + integer :: h,p, i_ok + integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:) + integer :: exc(0:2,2,2) + double precision :: phase_tmp + allocate(det_i(Nint,2),det_ip(N_int,2)) + det_i = key_i + phase = 1.d0 + do ispin = 1, 2 + do i = 1, degree_array(ispin) + h = holes_array(i,ispin) + p = particles_array(i,ispin) + det_ip = det_i + call do_single_excitation(det_ip,h,p,ispin,i_ok) + if(i_ok == -1)then + print*,'excitation was not possible ' + stop + endif + call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint) + phase *= phase_tmp + det_i = det_ip + enddo + enddo + +end + +subroutine get_holes_general(key_i, key_j,Nint, holes_array) + use bitmasks + BEGIN_DOC +! returns the array, per spin, of holes between key_i and key_j +! +! with the following convention: a_{hole}|key_i> --> |key_j> + END_DOC + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_j(Nint,2),key_i(Nint,2) + integer, intent(out) :: holes_array(100,2) + integer(bit_kind) :: key_hole + integer :: ispin,k,i,pos + holes_array = -1 + do ispin = 1, 2 + k = 1 + do i = 1, N_int + key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin)) + do while(key_hole .ne.0_bit_kind) + pos = trailz(key_hole) + holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos + key_hole = ibclr(key_hole,pos) + k += 1 + if(k .gt.100)then + print*,'WARNING in get_holes_general' + print*,'More than a 100-th excitation for spin ',ispin + print*,'stoping ...' + stop + endif + enddo + enddo + enddo +end + +subroutine get_particles_general(key_i, key_j,Nint,particles_array) + use bitmasks + BEGIN_DOC +! returns the array, per spin, of particles between key_i and key_j +! +! 
with the following convention: a^dagger_{particle}|key_i> --> |key_j> + END_DOC + implicit none + integer, intent(in) :: Nint + integer(bit_kind), intent(in) :: key_j(Nint,2),key_i(Nint,2) + integer, intent(out) :: particles_array(100,2) + integer(bit_kind) :: key_particle + integer :: ispin,k,i,pos + particles_array = -1 + do ispin = 1, 2 + k = 1 + do i = 1, N_int + key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin)) + do while(key_particle .ne.0_bit_kind) + pos = trailz(key_particle) + particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos + key_particle = ibclr(key_particle,pos) + k += 1 + if(k .gt.100)then + print*,'WARNING in get_holes_general' + print*,'More than a 100-th excitation for spin ',ispin + print*,'Those are the two determinants' + call debug_det(key_i, N_int) + call debug_det(key_j, N_int) + print*,'stoping ...' + stop + endif + enddo + enddo + enddo +end + +subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase) + implicit none + integer, intent(in) :: degree(2), Nint + integer(bit_kind), intent(in) :: key_i(Nint,2) + integer, intent(in) :: holes_array(100,2),particles_array(100,2) + double precision, intent(out) :: phase + integer :: i,ispin,h,p, i_ok + integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:) + integer :: exc(0:2,2,2) + double precision :: phase_tmp + allocate(det_i(Nint,2),det_ip(N_int,2)) + det_i = key_i + phase = 1.d0 + do ispin = 1, 2 + do i = 1, degree(ispin) + h = holes_array(i,ispin) + p = particles_array(i,ispin) + det_ip = det_i + call do_single_excitation(det_ip,h,p,ispin,i_ok) + if(i_ok == -1)then + print*,'excitation was not possible ' + stop + endif + call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint) + phase *= phase_tmp + det_i = det_ip + enddo + enddo + +end diff --git a/src/dft_utils_func/on_top_from_ueg.irp.f b/src/dft_utils_func/on_top_from_ueg.irp.f index 4e28ad89..711ffc39 100644 --- a/src/dft_utils_func/on_top_from_ueg.irp.f +++ b/src/dft_utils_func/on_top_from_ueg.irp.f @@ -32,7 +32,6 @@ double precision function g0_UEG_mu_inf(rho_a,rho_b) C = 0.08193d0 D = -0.01277d0 E = 0.001859d0 - x = -d2*rs if (dabs(rho) > 1.d-20) then rs = (3d0 / (4d0*pi*rho))**(1d0/3d0) ! JT: serious bug fixed 20/03/19 x = -d2*rs diff --git a/src/dft_utils_in_r/ao_in_r.irp.f b/src/dft_utils_in_r/ao_in_r.irp.f index 16414f39..e9c003d4 100644 --- a/src/dft_utils_in_r/ao_in_r.irp.f +++ b/src/dft_utils_in_r/ao_in_r.irp.f @@ -52,35 +52,39 @@ END_PROVIDER BEGIN_PROVIDER[double precision, aos_grad_in_r_array, (ao_num,n_points_final_grid,3)] - BEGIN_DOC - ! aos_grad_in_r_array(i,j,k) = value of the kth component of the gradient of ith ao on the jth grid point - ! - ! k = 1 : x, k= 2, y, k 3, z - END_DOC + BEGIN_DOC + ! + ! aos_grad_in_r_array(i,j,k) = value of the kth component of the gradient of ith ao on the jth grid point + ! + ! k = 1 : x, k= 2, y, k 3, z + ! 
+ END_DOC - implicit none - integer :: i,j,m - double precision :: aos_array(ao_num), r(3) - double precision :: aos_grad_array(3,ao_num) - !$OMP PARALLEL DO & - !$OMP DEFAULT (NONE) & - !$OMP PRIVATE (i,r,aos_array,aos_grad_array,j,m) & - !$OMP SHARED(aos_grad_in_r_array,n_points_final_grid,ao_num,final_grid_points) - do i = 1, n_points_final_grid - r(1) = final_grid_points(1,i) - r(2) = final_grid_points(2,i) - r(3) = final_grid_points(3,i) - call give_all_aos_and_grad_at_r(r,aos_array,aos_grad_array) - do m = 1, 3 - do j = 1, ao_num - aos_grad_in_r_array(j,i,m) = aos_grad_array(m,j) - enddo + implicit none + integer :: i, j, m + double precision :: aos_array(ao_num), r(3) + double precision :: aos_grad_array(3,ao_num) + + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i,j,m,r,aos_array,aos_grad_array) & + !$OMP SHARED(aos_grad_in_r_array,n_points_final_grid,ao_num,final_grid_points) + do i = 1, n_points_final_grid + r(1) = final_grid_points(1,i) + r(2) = final_grid_points(2,i) + r(3) = final_grid_points(3,i) + call give_all_aos_and_grad_at_r(r,aos_array,aos_grad_array) + do m = 1, 3 + do j = 1, ao_num + aos_grad_in_r_array(j,i,m) = aos_grad_array(m,j) + enddo + enddo enddo - enddo - !$OMP END PARALLEL DO + !$OMP END PARALLEL DO +END_PROVIDER - END_PROVIDER +! --- BEGIN_PROVIDER[double precision, aos_grad_in_r_array_transp, (3,ao_num,n_points_final_grid)] @@ -205,18 +209,53 @@ BEGIN_PROVIDER[double precision, aos_grad_in_r_array, (ao_num,n_points_final_gri END_PROVIDER - BEGIN_PROVIDER[double precision, aos_in_r_array_extra_transp, (n_points_extra_final_grid,ao_num)] - implicit none - BEGIN_DOC - ! aos_in_r_array_extra_transp(i,j) = value of the jth ao on the ith grid point - END_DOC - integer :: i,j - double precision :: aos_array(ao_num), r(3) - do i = 1, n_points_extra_final_grid - do j = 1, ao_num - aos_in_r_array_extra_transp(i,j) = aos_in_r_array_extra(j,i) +! --- + +BEGIN_PROVIDER[double precision, aos_in_r_array_extra_transp, (n_points_extra_final_grid,ao_num)] + + BEGIN_DOC + ! aos_in_r_array_extra_transp(i,j) = value of the jth ao on the ith grid point + END_DOC + + implicit none + integer :: i, j + double precision :: aos_array(ao_num), r(3) + + do i = 1, n_points_extra_final_grid + do j = 1, ao_num + aos_in_r_array_extra_transp(i,j) = aos_in_r_array_extra(j,i) + enddo enddo - enddo - END_PROVIDER +END_PROVIDER + +! --- + +BEGIN_PROVIDER[double precision, aos_grad_in_r_array_extra, (ao_num,n_points_extra_final_grid,3)] + + implicit none + integer :: i, j, m + double precision :: aos_array(ao_num), r(3) + double precision :: aos_grad_array(3,ao_num) + + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (i,j,m,r,aos_array,aos_grad_array) & + !$OMP SHARED(aos_grad_in_r_array_extra,n_points_extra_final_grid,ao_num,final_grid_points_extra) + do i = 1, n_points_extra_final_grid + r(1) = final_grid_points_extra(1,i) + r(2) = final_grid_points_extra(2,i) + r(3) = final_grid_points_extra(3,i) + call give_all_aos_and_grad_at_r(r, aos_array, aos_grad_array) + do m = 1, 3 + do j = 1, ao_num + aos_grad_in_r_array_extra(j,i,m) = aos_grad_array(m,j) + enddo + enddo + enddo + !$OMP END PARALLEL DO + +END_PROVIDER + +! 
--- diff --git a/src/dft_utils_in_r/mo_in_r.irp.f b/src/dft_utils_in_r/mo_in_r.irp.f index 192cb25a..ad931402 100644 --- a/src/dft_utils_in_r/mo_in_r.irp.f +++ b/src/dft_utils_in_r/mo_in_r.irp.f @@ -48,7 +48,7 @@ integer :: i,j do i = 1, n_points_final_grid do j = 1, mo_num - mos_in_r_array_transp(i,j) = mos_in_r_array(j,i) + mos_in_r_array_transp(i,j) = mos_in_r_array_omp(j,i) enddo enddo END_PROVIDER diff --git a/src/ezfio_files/01.convert.bats b/src/ezfio_files/convert_bats_old similarity index 100% rename from src/ezfio_files/01.convert.bats rename to src/ezfio_files/convert_bats_old diff --git a/src/ezfio_files/get_unit_and_open.irp.f b/src/ezfio_files/get_unit_and_open.irp.f index 6440579f..d6a7efac 100644 --- a/src/ezfio_files/get_unit_and_open.irp.f +++ b/src/ezfio_files/get_unit_and_open.irp.f @@ -47,11 +47,13 @@ integer function getUnitAndOpen(f,mode) endif open(unit=getUnitAndOpen,file=f,status='OLD',action='READ',form='UNFORMATTED') else if (mode.eq.'W') then - open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='WRITE',form='UNFORMATTED') + open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',form='UNFORMATTED') + else if (mode.eq.'A') then + open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',position='APPEND',form='UNFORMATTED') else if (mode.eq.'w') then - open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='WRITE',form='FORMATTED') + open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',form='FORMATTED') else if (mode.eq.'a') then - open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='WRITE',position='APPEND',form='FORMATTED') + open(unit=getUnitAndOpen,file=new_f,status='UNKNOWN',action='READWRITE',position='APPEND',form='FORMATTED') else if (mode.eq.'x') then open(unit=getUnitAndOpen,file=new_f,form='FORMATTED') endif diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. 
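Reviewer sketch (illustrative only, not part of the patch): one way the GPU bindings introduced below might be called from QP code. The names gpu_double2, gpu_allocate, gpu_upload, gpu_dgemm, gpu_synchronize, gpu_download, gpu_deallocate and the blas_handle provider are taken from src/gpu/gpu_module.F90 and src/gpu/gpu.irp.f added in this patch; the subroutine example_gpu_dgemm itself is hypothetical and assumes it sits in an .irp.f file so that the blas_handle provider is visible to IRPF90.
```
! Illustrative only -- not part of the patch.
subroutine example_gpu_dgemm(n, a, b, c)
  use gpu
  implicit none
  integer, intent(in)           :: n
  double precision, intent(in)  :: a(n,n), b(n,n)
  double precision, intent(out) :: c(n,n)
  type(gpu_double2)             :: a_d, b_d, c_d

  ! Allocate device buffers and copy the input matrices to the GPU
  call gpu_allocate(a_d, n, n)
  call gpu_allocate(b_d, n, n)
  call gpu_allocate(c_d, n, n)
  call gpu_upload(a, a_d)
  call gpu_upload(b, b_d)

  ! c = a.b computed through the cuBLAS/rocBLAS handle provider
  call gpu_dgemm(blas_handle, 'N', 'N', n, n, n, 1.d0, &
                 a_d%f(1,1), n, b_d%f(1,1), n, 0.d0, c_d%f(1,1), n)
  call gpu_synchronize()

  ! Retrieve the result and release the device memory
  call gpu_download(c_d, c)
  call gpu_deallocate(a_d)
  call gpu_deallocate(b_d)
  call gpu_deallocate(c_d)
end subroutine
```
With the gpu_x86 backend selected by `./configure -g none`, the same calls are expected to run on the CPU, so code written against this API should stay portable across the nvidia, intel and x86 back-ends.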
diff --git a/src/gpu/gpu.h b/src/gpu/gpu.h new file mode 100644 index 00000000..ac70e21a --- /dev/null +++ b/src/gpu/gpu.h @@ -0,0 +1,41 @@ +#include + +int gpu_ndevices(); +void gpu_set_device(int32_t i); + +void gpu_allocate(void** ptr, const int64_t n); +void gpu_free(void** ptr); + +void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n); +void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n); +void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n); + +void gpu_stream_create(void** ptr); +void gpu_stream_destroy(void** ptr); +void gpu_set_stream(void* handle, void* stream); +void gpu_synchronize(); + +void gpu_blas_create(void** handle); +void gpu_blas_destroy(void** handle); + +void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result); + +void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result); + +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy); + +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy); + +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc); + +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc); + +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); + +void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f new file mode 100644 index 00000000..3b2feeb6 --- /dev/null +++ b/src/gpu/gpu.irp.f @@ -0,0 +1,26 @@ +use gpu + +BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] + implicit none + BEGIN_DOC + ! Handle for cuBLAS or RocBLAS + END_DOC + call gpu_blas_create(blas_handle) +END_PROVIDER + +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER + +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER + diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 new file mode 100644 index 00000000..6050075f --- /dev/null +++ b/src/gpu/gpu_module.F90 @@ -0,0 +1,707 @@ +module gpu + use, intrinsic :: iso_c_binding + implicit none + +! Data types +! 
---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! ------------ + + interface + logical(c_bool) function no_gpu() bind(C) + import + end function + + integer function gpu_ndevices() bind(C) + import + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') + import + type(c_ptr), value :: handle, stream + end subroutine + + subroutine gpu_synchronize() bind(C) + import + end subroutine + + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') + import + type(c_ptr), value, intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), value :: dx, dy + real(c_double), intent(out) :: res + end subroutine + + subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot') + import + type(c_ptr), value, intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), intent(in), value :: dx, dy + real(c_float), intent(out) :: res + end subroutine + + subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) bind(C, name='gpu_dgeam') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_double), intent(in) :: alpha, beta + type(c_ptr), value :: a, b, c + end subroutine + + subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, 
beta, & + b, ldb, c, ldc) bind(C, name='gpu_sgeam') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc + real(c_float), intent(in) :: alpha, beta + real(c_float) :: a, b, c + end subroutine + + subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_dgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_double), intent(in) :: alpha, beta + real(c_double) :: a, x, y + end subroutine + + subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) bind(C, name='gpu_sgemv') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa + integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy + real(c_float), intent(in) :: alpha, beta + real(c_float) :: a, x, y + end subroutine + + + subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in) :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_double), intent(in) :: alpha, beta + real(c_double) :: a, b, c + end subroutine + + subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm') + import + type(c_ptr), value, intent(in) :: handle + character(c_char), intent(in), value :: transa, transb + integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc + real(c_float), intent(in) :: alpha, beta + real(c_float) :: a, b, c + end subroutine + + end interface + + +! Polymorphic interfaces +! ---------------------- + + interface gpu_allocate + procedure gpu_allocate_double1 & + ,gpu_allocate_double2 & + ,gpu_allocate_double3 & + ,gpu_allocate_double4 & + ,gpu_allocate_double5 & + ,gpu_allocate_double6 & + ,gpu_allocate_double1_64 & + ,gpu_allocate_double2_64 & + ,gpu_allocate_double3_64 & + ,gpu_allocate_double4_64 & + ,gpu_allocate_double5_64 & + ,gpu_allocate_double6_64 + end interface gpu_allocate + + interface gpu_deallocate + procedure gpu_deallocate_double1 & + ,gpu_deallocate_double2 & + ,gpu_deallocate_double3 & + ,gpu_deallocate_double4 & + ,gpu_deallocate_double5 & + ,gpu_deallocate_double6 + end interface gpu_deallocate + + interface gpu_upload + procedure gpu_upload_double1 & + ,gpu_upload_double2 & + ,gpu_upload_double3 & + ,gpu_upload_double4 & + ,gpu_upload_double5 & + ,gpu_upload_double6 + end interface gpu_upload + + interface gpu_download + procedure gpu_download_double1 & + ,gpu_download_double2 & + ,gpu_download_double3 & + ,gpu_download_double4 & + ,gpu_download_double5 & + ,gpu_download_double6 + end interface gpu_download + + interface gpu_copy + procedure gpu_copy_double1 & + ,gpu_copy_double2 & + ,gpu_copy_double3 & + ,gpu_copy_double4 & + ,gpu_copy_double5 & + ,gpu_copy_double6 + end interface gpu_copy + + + contains + + +! gpu_allocate +! 
------------ + + subroutine gpu_allocate_double1(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer, intent(in) :: s + + call gpu_allocate_c(ptr%c, s*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) + end subroutine + + subroutine gpu_allocate_double2(ptr, s1, s2) + implicit none + type(gpu_double2), intent(inout) :: ptr + integer, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + + subroutine gpu_allocate_double1_64(ptr, s) + implicit none + type(gpu_double1), intent(inout) :: ptr + integer*8, intent(in) :: s + + call gpu_allocate_c(ptr%c, s) + call c_f_pointer(ptr%c, ptr%f, (/ s /)) + end subroutine + + subroutine gpu_allocate_double2_64(ptr, s1, s2) + implicit none + type(gpu_double2), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2 + + call gpu_allocate_c(ptr%c, s1*s2*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /)) + end subroutine + + subroutine gpu_allocate_double3_64(ptr, s1, s2, s3) + implicit none + type(gpu_double3), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3 + + call gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + +! gpu_deallocate +! 
-------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! ---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(*) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! 
------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! -------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! 
---------- + + subroutine gpu_stream_create(stream) + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) + end subroutine + + + + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! ---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemv +! ---- + +subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*4, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + +subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*8, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! 
use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +end module diff --git a/src/hartree_fock/10.hf.bats b/src/hartree_fock/10.hf.bats index b496a089..214dfa86 100644 --- a/src/hartree_fock/10.hf.bats +++ b/src/hartree_fock/10.hf.bats @@ -115,9 +115,6 @@ rm -rf $EZFIO run hco.ezfio -113.1841002944744 } -@test "HBO" { # 0.805600 1.4543s - run hbo.ezfio -100.018582259096 -} @test "H2S" { # 1.655600 4.21402s run h2s.ezfio -398.6944130421982 @@ -127,9 +124,6 @@ rm -rf $EZFIO run h3coh.ezfio -114.9865030596373 } -@test "H2O" { # 1.811100 1.84387s - run h2o.ezfio -0.760270218692179E+02 -} @test "H2O2" { # 2.217000 8.50267s run h2o2.ezfio -150.7806608469964 @@ -187,13 +181,6 @@ rm -rf $EZFIO run oh.ezfio -75.42025413469165 } -@test "[Cu(NH3)4]2+" { # 59.610100 4.18766m - [[ -n $TRAVIS ]] && skip - qp set_file cu_nh3_4_2plus.ezfio - qp set scf_utils thresh_scf 1.e-10 - run cu_nh3_4_2plus.ezfio -1862.97590358903 -} - @test "SO2" { # 71.894900 3.22567m [[ -n $TRAVIS ]] && skip run so2.ezfio -41.55800401346361 diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f index 65b3d63c..6d917322 100644 --- a/src/hartree_fock/fock_matrix_hf.irp.f +++ b/src/hartree_fock/fock_matrix_hf.irp.f @@ -194,17 +194,28 @@ END_PROVIDER endif - double precision :: rss + double precision :: rss, mem0, mem double precision :: memory_of_double integer :: iblock - integer, parameter :: block_size = 32 + integer :: block_size + + call resident_memory(mem0) + + block_size = 1024 + + rss = memory_of_double(2.d0*ao_num*ao_num) + do + mem = mem0 + block_size*rss + if ( (block_size < 2).or.(mem < qp_max_mem) ) exit + block_size = block_size/2 + enddo + + call check_mem(block_size*rss, irp_here) - rss = memory_of_double(ao_num*ao_num) - call check_mem(2.d0*block_size*rss, irp_here) allocate(X2(ao_num,ao_num,block_size,2)) allocate(X3(ao_num,block_size,ao_num,2)) - + ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) do iblock=1,cholesky_ao_num,block_size diff --git a/src/mo_optimization/NEED b/src/mo_optimization/NEED index 91f41ee3..33f770c3 100644 --- a/src/mo_optimization/NEED +++ b/src/mo_optimization/NEED @@ -1,7 +1,3 @@ -two_body_rdm -hartree_fock -cipsi -davidson_undressed +mo_optimization_utils selectors_full generators_full -utils_trust_region diff --git a/src/mo_optimization/cipsi_orb_opt.irp.f b/src/mo_optimization/cipsi_orb_opt.irp.f index 7e3a79eb..19b3e9db 100644 --- a/src/mo_optimization/cipsi_orb_opt.irp.f +++ b/src/mo_optimization/cipsi_orb_opt.irp.f @@ -2,87 +2,7 @@ program optimization read_wf = .true. ! 
must be True for the orbital optimization !!! TOUCH read_wf - call run_optimization + call run_optimization_mos_CIPSI end -subroutine run_optimization - - implicit none - - double precision :: e_cipsi, e_opt, delta_e - double precision, allocatable :: Ev(:),PT2(:) - integer :: nb_iter,i - logical :: not_converged - character (len=100) :: filename - - PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals - allocate(Ev(N_states),PT2(N_states)) - - not_converged = .True. - nb_iter = 0 - - ! To start from the wf - N_det_max = max(n_det,5) - TOUCH N_det_max - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt') - write(10,*) " Ndet E_cipsi E_opt Delta_e" - call state_average_energy(e_cipsi) - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0 - close(10) - - do while (not_converged) - print*,'' - print*,'======================' - print*,' Cipsi step:', nb_iter - print*,'======================' - print*,'' - print*,'********** cipsi step **********' - ! cispi calculation - call run_stochastic_cipsi(Ev,PT2) - - ! State average energy after the cipsi step - call state_average_energy(e_cipsi) - - print*,'' - print*,'********** optimization step **********' - ! orbital optimization - call run_orb_opt_trust_v2 - - ! State average energy after the orbital optimization - call state_average_energy(e_opt) - - print*,'' - print*,'********** diff step **********' - ! Gain in energy - delta_e = e_opt - e_cipsi - print*, 'Gain in energy during the orbital optimization:', delta_e - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append') - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e - close(10) - - ! Exit - if (delta_e > 1d-12) then - print*, 'WARNING, something wrong happened' - print*, 'The gain (delta_e) in energy during the optimization process' - print*, 'is > 0, but it must be < 0' - print*, 'The program will exit' - exit - endif - - if (n_det > n_det_max_opt) then - print*, 'The number of determinants in the wf > n_det_max_opt' - print*, 'The program will exit' - exit - endif - - ! 
To double the number of determinants in the wf
-  N_det_max = int(dble(n_det * 2)*0.9)
-  TOUCH N_det_max
-
-  nb_iter = nb_iter + 1
- enddo
-
-end
diff --git a/src/mo_optimization/EZFIO.cfg b/src/mo_optimization_utils/EZFIO.cfg
similarity index 100%
rename from src/mo_optimization/EZFIO.cfg
rename to src/mo_optimization_utils/EZFIO.cfg
diff --git a/src/mo_optimization_utils/NEED b/src/mo_optimization_utils/NEED
new file mode 100644
index 00000000..1a78a17f
--- /dev/null
+++ b/src/mo_optimization_utils/NEED
@@ -0,0 +1,5 @@
+two_body_rdm
+hartree_fock
+cipsi
+davidson_undressed
+utils_trust_region
diff --git a/src/mo_optimization_utils/README.md b/src/mo_optimization_utils/README.md
new file mode 100644
index 00000000..94f29aee
--- /dev/null
+++ b/src/mo_optimization_utils/README.md
@@ -0,0 +1,74 @@
+# Orbital optimization
+
+## Methods
+Different methods are available:
+- full hessian
+```
+qp set orbital_optimization optimization_method full
+```
+- diagonal hessian
+```
+qp set orbital_optimization optimization_method diag
+```
+- identity matrix
+```
+qp set orbital_optimization optimization_method none
+```
+
+After the optimization, the EZFIO contains the optimized orbitals.
+
+## For a fixed number of determinants
+To optimize the MOs for the actual determinants:
+```
+qp run orb_opt
+```
+
+## For a complete optimization, i.e., with a larger and larger wave function
+To optimize the MOs with a larger and larger wave function:
+```
+qp run optimization
+```
+
+The results are stored in the EZFIO in "mo_optimization/result_opt",
+with the following format:
+(1) (2) (3) (4)
+1: Number of determinants in the wf,
+2: Cipsi energy before the optimization,
+3: Cipsi energy after the optimization,
+4: Energy difference between (2) and (3).
+
+The optimization process is the following:
+- we do a first cipsi step to obtain a small number of determinants in the wf
+- we run an orbital optimization for this wf
+- we do a new cipsi step to double the number of determinants in the wf
+- we run an orbital optimization for this wf
+- ...
+- we do that until the energy difference between (2) and (3) is
+  smaller than the targeted accuracy for the cipsi (targeted_accuracy_cipsi in qp edit)
+  or the wf is larger than a given size (n_det_max_opt in qp_edit)
+- after that you can reset your determinants (qp reset -d) and run a clean Cipsi calculation
+
+### End of the optimization
+You can choose the number of determinants after which the
+optimization will stop:
+```
+qp set orbital_optimization n_det_max_opt 1e5 # or any number
+```
+## Weight of the states
+You can change the weights of the different states directly in qp edit.
+It will affect the weights used in the orbital optimization.
+
+# Tests
+To run the tests:
+```
+qp test
+```
+
+# Org files
+The org files are stored in the directory org in order to avoid overwriting user changes.
+The org files can be modified; to export the changes to the source code, run
+```
+./TANGLE_org_mode.sh
+mv *.irp.f ../.
+``` + diff --git a/src/mo_optimization/constants.h b/src/mo_optimization_utils/constants.h similarity index 100% rename from src/mo_optimization/constants.h rename to src/mo_optimization_utils/constants.h diff --git a/src/mo_optimization/diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/diagonalization_hessian.irp.f b/src/mo_optimization_utils/diagonalization_hessian.irp.f similarity index 100% rename from src/mo_optimization/diagonalization_hessian.irp.f rename to src/mo_optimization_utils/diagonalization_hessian.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/first_gradient_list_opt.irp.f b/src/mo_optimization_utils/first_gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_list_opt.irp.f rename to src/mo_optimization_utils/first_gradient_list_opt.irp.f diff --git a/src/mo_optimization/first_gradient_opt.irp.f b/src/mo_optimization_utils/first_gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_opt.irp.f rename to src/mo_optimization_utils/first_gradient_opt.irp.f diff --git a/src/mo_optimization/first_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_hessian_opt.irp.f b/src/mo_optimization_utils/first_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_opt.irp.f rename to src/mo_optimization_utils/first_hessian_opt.irp.f diff --git a/src/mo_optimization/gradient_list_opt.irp.f b/src/mo_optimization_utils/gradient_list_opt.irp.f similarity index 99% rename from src/mo_optimization/gradient_list_opt.irp.f rename to src/mo_optimization_utils/gradient_list_opt.irp.f index 9b7228c7..9331c80f 100644 --- a/src/mo_optimization/gradient_list_opt.irp.f +++ b/src/mo_optimization_utils/gradient_list_opt.irp.f @@ -319,7 +319,7 @@ call omp_set_max_active_levels(4) ! \end{equation} ! We need a vector to use the gradient. Here the gradient is a -! antisymetric matrix so we can transform it in a vector of length +! antisymmetric matrix so we can transform it in a vector of length ! mo_num*(mo_num-1)/2. ! Here we do these two things at the same time. 
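Side note on the comment just above: the gradient is an antisymmetric mo_num x mo_num matrix, so only the mo_num*(mo_num-1)/2 elements of its strict lower triangle are independent, and those are what get packed into the gradient vector. A minimal sketch of one possible packing follows; the routine name pack_antisym and the loop ordering are chosen for illustration and are not necessarily the convention used by gradient_opt.irp.f or gradient_list_opt.irp.f.
```
! Illustration only: pack the strict lower triangle of an antisymmetric
! matrix g_mat (g_mat(q,p) = -g_mat(p,q)) into a vector of length
! mo_num*(mo_num-1)/2.
subroutine pack_antisym(mo_num, g_mat, g_vec)
  implicit none
  integer, intent(in)           :: mo_num
  double precision, intent(in)  :: g_mat(mo_num,mo_num)
  double precision, intent(out) :: g_vec(mo_num*(mo_num-1)/2)
  integer :: p, q, k

  k = 0
  do q = 1, mo_num - 1
    do p = q + 1, mo_num
      k = k + 1
      g_vec(k) = g_mat(p,q)   ! element (p,q) with p > q
    enddo
  enddo
end subroutine
```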
diff --git a/src/mo_optimization/gradient_opt.irp.f b/src/mo_optimization_utils/gradient_opt.irp.f similarity index 99% rename from src/mo_optimization/gradient_opt.irp.f rename to src/mo_optimization_utils/gradient_opt.irp.f index 25be6b5a..10d42b35 100644 --- a/src/mo_optimization/gradient_opt.irp.f +++ b/src/mo_optimization_utils/gradient_opt.irp.f @@ -284,7 +284,7 @@ call omp_set_max_active_levels(4) ! \end{equation} ! We need a vector to use the gradient. Here the gradient is a -! antisymetric matrix so we can transform it in a vector of length +! antisymmetric matrix so we can transform it in a vector of length ! mo_num*(mo_num-1)/2. ! Here we do these two things at the same time. diff --git a/src/mo_optimization/hessian_list_opt.irp.f b/src/mo_optimization_utils/hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_list_opt.irp.f rename to src/mo_optimization_utils/hessian_list_opt.irp.f diff --git a/src/mo_optimization/hessian_opt.irp.f b/src/mo_optimization_utils/hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_opt.irp.f rename to src/mo_optimization_utils/hessian_opt.irp.f diff --git a/src/mo_optimization/org/TODO.org b/src/mo_optimization_utils/org/TODO.org similarity index 100% rename from src/mo_optimization/org/TODO.org rename to src/mo_optimization_utils/org/TODO.org diff --git a/src/mo_optimization/org/debug_gradient_list_opt.org b/src/mo_optimization_utils/org/debug_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_list_opt.org rename to src/mo_optimization_utils/org/debug_gradient_list_opt.org diff --git a/src/mo_optimization/org/debug_gradient_opt.org b/src/mo_optimization_utils/org/debug_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_opt.org rename to src/mo_optimization_utils/org/debug_gradient_opt.org diff --git a/src/mo_optimization/org/debug_hessian_list_opt.org b/src/mo_optimization_utils/org/debug_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_list_opt.org rename to src/mo_optimization_utils/org/debug_hessian_list_opt.org diff --git a/src/mo_optimization/org/debug_hessian_opt.org b/src/mo_optimization_utils/org/debug_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_opt.org rename to src/mo_optimization_utils/org/debug_hessian_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/diagonalization_hessian.org b/src/mo_optimization_utils/org/diagonalization_hessian.org similarity index 100% rename from src/mo_optimization/org/diagonalization_hessian.org rename to src/mo_optimization_utils/org/diagonalization_hessian.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_list_opt.org rename to 
src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/first_gradient_list_opt.org b/src/mo_optimization_utils/org/first_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_list_opt.org rename to src/mo_optimization_utils/org/first_gradient_list_opt.org diff --git a/src/mo_optimization/org/first_gradient_opt.org b/src/mo_optimization_utils/org/first_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_opt.org rename to src/mo_optimization_utils/org/first_gradient_opt.org diff --git a/src/mo_optimization/org/first_hessian_list_opt.org b/src/mo_optimization_utils/org/first_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_list_opt.org rename to src/mo_optimization_utils/org/first_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_hessian_opt.org b/src/mo_optimization_utils/org/first_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_opt.org rename to src/mo_optimization_utils/org/first_hessian_opt.org diff --git a/src/mo_optimization/org/gradient_list_opt.org b/src/mo_optimization_utils/org/gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_list_opt.org rename to src/mo_optimization_utils/org/gradient_list_opt.org diff --git a/src/mo_optimization/org/gradient_opt.org b/src/mo_optimization_utils/org/gradient_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_opt.org rename to src/mo_optimization_utils/org/gradient_opt.org diff --git a/src/mo_optimization/org/hessian_list_opt.org b/src/mo_optimization_utils/org/hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_list_opt.org rename to src/mo_optimization_utils/org/hessian_list_opt.org diff --git a/src/mo_optimization/org/hessian_opt.org b/src/mo_optimization_utils/org/hessian_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_opt.org rename to src/mo_optimization_utils/org/hessian_opt.org diff --git a/src/mo_optimization/org/my_providers.org b/src/mo_optimization_utils/org/my_providers.org similarity index 100% rename from src/mo_optimization/org/my_providers.org rename to src/mo_optimization_utils/org/my_providers.org diff --git a/src/mo_optimization/org/optimization.org b/src/mo_optimization_utils/org/optimization.org similarity index 100% rename from src/mo_optimization/org/optimization.org rename to src/mo_optimization_utils/org/optimization.org diff --git a/src/mo_optimization/org/orb_opt_trust_v2.org b/src/mo_optimization_utils/org/orb_opt_trust_v2.org similarity index 100% rename from src/mo_optimization/org/orb_opt_trust_v2.org rename to src/mo_optimization_utils/org/orb_opt_trust_v2.org diff --git a/src/mo_optimization/org/state_average_energy.org b/src/mo_optimization_utils/org/state_average_energy.org similarity index 100% rename from src/mo_optimization/org/state_average_energy.org rename to src/mo_optimization_utils/org/state_average_energy.org diff --git a/src/mo_optimization/org/state_weight_normalization.org b/src/mo_optimization_utils/org/state_weight_normalization.org similarity index 100% rename from 
src/mo_optimization/org/state_weight_normalization.org rename to src/mo_optimization_utils/org/state_weight_normalization.org diff --git a/src/mo_optimization/org/update_parameters.org b/src/mo_optimization_utils/org/update_parameters.org similarity index 100% rename from src/mo_optimization/org/update_parameters.org rename to src/mo_optimization_utils/org/update_parameters.org diff --git a/src/mo_optimization/org/update_st_av_ci_energy.org b/src/mo_optimization_utils/org/update_st_av_ci_energy.org similarity index 100% rename from src/mo_optimization/org/update_st_av_ci_energy.org rename to src/mo_optimization_utils/org/update_st_av_ci_energy.org diff --git a/src/mo_optimization_utils/routine_opt_mos.irp.f b/src/mo_optimization_utils/routine_opt_mos.irp.f new file mode 100644 index 00000000..fceba2c5 --- /dev/null +++ b/src/mo_optimization_utils/routine_opt_mos.irp.f @@ -0,0 +1,81 @@ + +subroutine run_optimization_mos_CIPSI + + implicit none + + double precision :: e_cipsi, e_opt, delta_e + double precision, allocatable :: Ev(:),PT2(:) + integer :: nb_iter,i + logical :: not_converged + character (len=100) :: filename + + PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals + allocate(Ev(N_states),PT2(N_states)) + + not_converged = .True. + nb_iter = 0 + + ! To start from the wf + N_det_max = max(n_det,5) + TOUCH N_det_max + + open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt') + write(10,*) " Ndet E_cipsi E_opt Delta_e" + call state_average_energy(e_cipsi) + write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0 + close(10) + + do while (not_converged) + print*,'' + print*,'======================' + print*,' Cipsi step:', nb_iter + print*,'======================' + print*,'' + print*,'********** cipsi step **********' + ! cispi calculation + call run_stochastic_cipsi(Ev,PT2) + + ! State average energy after the cipsi step + call state_average_energy(e_cipsi) + + print*,'' + print*,'********** optimization step **********' + ! orbital optimization + call run_orb_opt_trust_v2 + + ! State average energy after the orbital optimization + call state_average_energy(e_opt) + + print*,'' + print*,'********** diff step **********' + ! Gain in energy + delta_e = e_opt - e_cipsi + print*, 'Gain in energy during the orbital optimization:', delta_e + + open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append') + write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e + close(10) + + ! Exit + if (delta_e > 1d-12) then + print*, 'WARNING, something wrong happened' + print*, 'The gain (delta_e) in energy during the optimization process' + print*, 'is > 0, but it must be < 0' + print*, 'The program will exit' + exit + endif + + if (n_det > n_det_max_opt) then + print*, 'The number of determinants in the wf > n_det_max_opt' + print*, 'The program will exit' + exit + endif + + ! 
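+    ! (Note: the update below actually sets N_det_max = int(dble(n_det*2)*0.9),
+    ! i.e. a growth factor of 1.8 -- slightly less than a strict doubling of the
+    ! current wave function.)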
To double the number of determinants in the wf + N_det_max = int(dble(n_det * 2)*0.9) + TOUCH N_det_max + + nb_iter = nb_iter + 1 + enddo + +end diff --git a/src/mo_optimization/run_orb_opt_trust_v2.irp.f b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f similarity index 100% rename from src/mo_optimization/run_orb_opt_trust_v2.irp.f rename to src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f diff --git a/src/mo_optimization/save_energy.irp.f b/src/mo_optimization_utils/save_energy.irp.f similarity index 100% rename from src/mo_optimization/save_energy.irp.f rename to src/mo_optimization_utils/save_energy.irp.f diff --git a/src/mo_optimization/state_average_energy.irp.f b/src/mo_optimization_utils/state_average_energy.irp.f similarity index 100% rename from src/mo_optimization/state_average_energy.irp.f rename to src/mo_optimization_utils/state_average_energy.irp.f diff --git a/src/mo_optimization/state_weight_normalization.irp.f b/src/mo_optimization_utils/state_weight_normalization.irp.f similarity index 100% rename from src/mo_optimization/state_weight_normalization.irp.f rename to src/mo_optimization_utils/state_weight_normalization.irp.f diff --git a/src/mo_optimization/update_parameters.irp.f b/src/mo_optimization_utils/update_parameters.irp.f similarity index 100% rename from src/mo_optimization/update_parameters.irp.f rename to src/mo_optimization_utils/update_parameters.irp.f diff --git a/src/mo_optimization/update_st_av_ci_energy.irp.f b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f similarity index 100% rename from src/mo_optimization/update_st_av_ci_energy.irp.f rename to src/mo_optimization_utils/update_st_av_ci_energy.irp.f diff --git a/src/mo_two_e_ints/EZFIO.cfg b/src/mo_two_e_ints/EZFIO.cfg index 088a2416..da9d8fc9 100644 --- a/src/mo_two_e_ints/EZFIO.cfg +++ b/src/mo_two_e_ints/EZFIO.cfg @@ -1,9 +1,21 @@ +[io_mo_cholesky] +type: Disk_access +doc: Read/Write |MO| Cholesky integrals from/to disk [ Write | Read | None ] +interface: ezfio,provider,ocaml +default: None + [io_mo_two_e_integrals] type: Disk_access doc: Read/Write |MO| integrals from/to disk [ Write | Read | None ] interface: ezfio,provider,ocaml default: None +[mo_integrals_cache_shift] +type: integer +doc: Adjusts the size of the MO integrals cache. 2: 2KB, 3: 32KB, 4: 512KB, 5: 8MB, 6: 128MB, 7: 2GB, 8: 32GB, 9: 512GB +interface: ezfio, provider, ocaml +default: 7 + [mo_integrals_threshold] type: Threshold doc: If | | < `mo_integrals_threshold` then is zero @@ -11,12 +23,6 @@ interface: ezfio,provider,ocaml default: 1.e-15 ezfio_name: threshold_mo -[no_vvvv_integrals] -type: logical -doc: If `True`, computes all integrals except for the integrals having 3 or 4 virtual indices -interface: ezfio,provider,ocaml -default: false - [io_mo_two_e_integrals_erf] type: Disk_access doc: Read/Write MO integrals with the long range interaction from/to disk [ Write | Read | None ] diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f index 349f13b9..1fed949d 100644 --- a/src/mo_two_e_ints/cholesky.irp.f +++ b/src/mo_two_e_ints/cholesky.irp.f @@ -1,9 +1,26 @@ +BEGIN_PROVIDER [ logical, do_mo_cholesky ] + implicit none + BEGIN_DOC + ! If True, use Cholesky vectors for MO integrals + END_DOC + do_mo_cholesky = do_ao_cholesky +! do_mo_cholesky = .False. +END_PROVIDER + BEGIN_PROVIDER [ integer, cholesky_mo_num ] implicit none BEGIN_DOC ! 
Number of Cholesky vectors in MO basis END_DOC - cholesky_mo_num = cholesky_ao_num + integer, external :: getUnitAndOpen + integer :: iunit + if (read_mo_cholesky) then + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'R') + read(iunit) cholesky_mo_num + close(iunit) + else + cholesky_mo_num = cholesky_ao_num + endif END_PROVIDER BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num) ] @@ -28,24 +45,90 @@ BEGIN_PROVIDER [ double precision, cholesky_mo, (mo_num, mo_num, cholesky_mo_num END_PROVIDER BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num, mo_num) ] + implicit none + BEGIN_DOC + ! Cholesky vectors in MO basis. Warning: it is transposed wrt cholesky_ao: + ! + ! - cholesky_ao is (ao_num^2 x cholesky_ao_num) + ! + ! - cholesky_mo_transp is (cholesky_mo_num x mo_num^2) + END_DOC + + double precision, allocatable :: X(:,:,:) + double precision :: wall0, wall1 + integer, external :: getUnitAndOpen + integer :: iunit, ierr, rank + + if (read_mo_cholesky) then + print *, 'Reading Cholesky MO vectors from disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'R') + read(iunit) rank + if (cholesky_mo_num /= rank) then + stop 'inconsistent rank' + endif + read(iunit) cholesky_mo_transp + close(iunit) + else + print *, '' + print *, 'AO->MO Transformation of Cholesky vectors' + print *, '-----------------------------------------' + print *, '' + + call wall_time(wall0) + + allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr) + if (ierr /= 0) then + print *, irp_here, ': Allocation failed' + endif + call dgemm('T','N', ao_num*cholesky_mo_num, mo_num, ao_num, 1.d0, & + cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_mo_num) + call dgemm('T','N', cholesky_mo_num*mo_num, mo_num, ao_num, 1.d0, & + X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_mo_num*mo_num) + deallocate(X) + call wall_time(wall1) + print*,'Time to provide MO cholesky vectors = ',(wall1-wall0)/60.d0, ' min' + + + if (write_mo_cholesky) then + print *, 'Writing Cholesky MO vectors to disk...' + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W') + write(iunit) cholesky_mo_num + write(iunit) cholesky_mo_transp + close(iunit) + call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read') + endif + endif + +END_PROVIDER + + +BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ] implicit none BEGIN_DOC ! 
Cholesky vectors in MO basis END_DOC double precision, allocatable :: X(:,:,:) + double precision :: wall0, wall1 integer :: ierr - print *, 'AO->MO Transformation of Cholesky vectors' + print *, 'Semi AO->MO Transformation of Cholesky vectors' + call wall_time(wall0) allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr) if (ierr /= 0) then print *, irp_here, ': Allocation failed' endif - call dgemm('T','N', ao_num*cholesky_mo_num, mo_num, ao_num, 1.d0, & - cholesky_ao, ao_num, mo_coef, ao_num, 0.d0, X, ao_num*cholesky_mo_num) - call dgemm('T','N', cholesky_mo_num*mo_num, mo_num, ao_num, 1.d0, & - X, ao_num, mo_coef, ao_num, 0.d0, cholesky_mo_transp, cholesky_mo_num*mo_num) - deallocate(X) + integer :: i_chol, i_mo, j_mo, i_ao + cholesky_semi_mo_transp_simple = 0.d0 + do i_mo = 1, mo_num + do i_ao = 1, ao_num + do j_mo = 1, mo_num + do i_chol = 1, cholesky_mo_num + cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao) + enddo + enddo + enddo + enddo END_PROVIDER diff --git a/src/mo_two_e_ints/four_idx_novvvv.irp.f b/src/mo_two_e_ints/four_idx_novvvv.irp.f deleted file mode 100644 index 2be09689..00000000 --- a/src/mo_two_e_ints/four_idx_novvvv.irp.f +++ /dev/null @@ -1,189 +0,0 @@ -!BEGIN_PROVIDER [ logical, no_vvvv_integrals ] -! implicit none -! BEGIN_DOC -! If `True`, computes all integrals except for the integrals having 3 or 4 virtual indices -! END_DOC -! -! no_vvvv_integrals = .False. -!END_PROVIDER - -BEGIN_PROVIDER [ double precision, mo_coef_novirt, (ao_num,n_core_inact_act_orb) ] - implicit none - BEGIN_DOC - ! MO coefficients without virtual MOs - END_DOC - integer :: j,jj - - do j=1,n_core_inact_act_orb - jj = list_core_inact_act(j) - mo_coef_novirt(:,j) = mo_coef(:,jj) - enddo - -END_PROVIDER - -subroutine ao_to_mo_novirt(A_ao,LDA_ao,A_mo,LDA_mo) - implicit none - BEGIN_DOC - ! Transform A from the |AO| basis to the |MO| basis excluding virtuals - ! - ! $C^\dagger.A_{ao}.C$ - END_DOC - integer, intent(in) :: LDA_ao,LDA_mo - double precision, intent(in) :: A_ao(LDA_ao,ao_num) - double precision, intent(out) :: A_mo(LDA_mo,n_core_inact_act_orb) - double precision, allocatable :: T(:,:) - - allocate ( T(ao_num,n_core_inact_act_orb) ) - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: T - - call dgemm('N','N', ao_num, n_core_inact_act_orb, ao_num, & - 1.d0, A_ao,LDA_ao, & - mo_coef_novirt, size(mo_coef_novirt,1), & - 0.d0, T, size(T,1)) - - call dgemm('T','N', n_core_inact_act_orb, n_core_inact_act_orb, ao_num,& - 1.d0, mo_coef_novirt,size(mo_coef_novirt,1), & - T, ao_num, & - 0.d0, A_mo, size(A_mo,1)) - - deallocate(T) -end - - -subroutine four_idx_novvvv - print*,'********' - print*,'********' - print*,'********' - print*,'WARNING :: Using four_idx_novvvv, and we are not sure that this routine is not bugged ...' - print*,'********' - print*,'********' - print*,'********' - use map_module - implicit none - BEGIN_DOC - ! 
Retransform MO integrals for next CAS-SCF step - END_DOC - print*,'Using partial transformation' - print*,'It will not transform all integrals with at least 3 indices within the virtuals' - integer :: i,j,k,l,n_integrals - double precision, allocatable :: f(:,:,:), f2(:,:,:), d(:,:), T(:,:,:,:), T2(:,:,:,:) - double precision, external :: get_ao_two_e_integral - integer(key_kind), allocatable :: idx(:) - real(integral_kind), allocatable :: values(:) - - integer :: p,q,r,s - double precision :: c - allocate( T(n_core_inact_act_orb,n_core_inact_act_orb,ao_num,ao_num) , & - T2(n_core_inact_act_orb,n_core_inact_act_orb,ao_num,ao_num) ) - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP SHARED(mo_num,ao_num,T,n_core_inact_act_orb, mo_coef_transp, & - !$OMP mo_integrals_threshold,mo_coef,mo_integrals_map, & - !$OMP list_core_inact_act,T2,ao_integrals_map) & - !$OMP PRIVATE(i,j,k,l,p,q,r,s,idx,values,n_integrals, & - !$OMP f,f2,d,c) - allocate(f(ao_num,ao_num,ao_num), f2(ao_num,ao_num,ao_num), d(mo_num,mo_num), & - idx(mo_num*mo_num), values(mo_num*mo_num) ) - - ! - !$OMP DO - do s=1,ao_num - do r=1,ao_num - do q=1,ao_num - do p=1,r - f (p,q,r) = get_ao_two_e_integral(p,q,r,s,ao_integrals_map) - f (r,q,p) = f(p,q,r) - enddo - enddo - enddo - do r=1,ao_num - do q=1,ao_num - do p=1,ao_num - f2(p,q,r) = f(p,r,q) - enddo - enddo - enddo - ! f (p,q,r) = - ! f2(p,q,r) = - - do r=1,ao_num - call ao_to_mo_novirt(f (1,1,r),size(f ,1),T (1,1,r,s),size(T,1)) - call ao_to_mo_novirt(f2(1,1,r),size(f2,1),T2(1,1,r,s),size(T,1)) - enddo - ! T (i,j,p,q) = - ! T2(i,j,p,q) = - - enddo - !$OMP END DO - - !$OMP DO - do j=1,n_core_inact_act_orb - do i=1,n_core_inact_act_orb - do s=1,ao_num - do r=1,ao_num - f (r,s,1) = T (i,j,r,s) - f2(r,s,1) = T2(i,j,r,s) - enddo - enddo - call ao_to_mo(f ,size(f ,1),d,size(d,1)) - n_integrals = 0 - do l=1,mo_num - do k=1,mo_num - n_integrals+=1 - call two_e_integrals_index(list_core_inact_act(i),list_core_inact_act(j),k,l,idx(n_integrals)) - values(n_integrals) = d(k,l) - enddo - enddo - call map_append(mo_integrals_map, idx, values, n_integrals) - - call ao_to_mo(f2,size(f2,1),d,size(d,1)) - n_integrals = 0 - do l=1,mo_num - do k=1,mo_num - n_integrals+=1 - call two_e_integrals_index(list_core_inact_act(i),k,list_core_inact_act(j),l,idx(n_integrals)) - values(n_integrals) = d(k,l) - enddo - enddo - call map_append(mo_integrals_map, idx, values, n_integrals) - enddo - enddo - !$OMP END DO - deallocate(f,f2,d,idx,values) - - !$OMP END PARALLEL - - deallocate(T,T2) - - - call map_sort(mo_integrals_map) - call map_unique(mo_integrals_map) - call map_shrink(mo_integrals_map,real(mo_integrals_threshold,integral_kind)) - -end - -subroutine four_idx_novvvv2 - use bitmasks - implicit none - integer :: i - integer(bit_kind) :: mask_ijkl(N_int,4) - - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = full_ijkl_bitmask_4(i,1) - mask_ijkl(i,3) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,4) = full_ijkl_bitmask_4(i,1) - enddo - call add_integrals_to_map(mask_ijkl) - - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,3) = virt_bitmask(i,1) - mask_ijkl(i,4) = virt_bitmask(i,1) - enddo - call add_integrals_to_map(mask_ijkl) - -end diff --git a/src/mo_two_e_ints/integrals_3_index.irp.f b/src/mo_two_e_ints/integrals_3_index.irp.f index eb05da84..c0dab506 100644 --- a/src/mo_two_e_ints/integrals_3_index.irp.f +++ b/src/mo_two_e_ints/integrals_3_index.irp.f @@ -10,7 +10,7 @@ 
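In this and the following hunks, the MO-side flag do_mo_cholesky (introduced above in cholesky.irp.f as do_mo_cholesky = do_ao_cholesky) replaces do_ao_cholesky wherever MO integrals are consumed. With it, an MO two-electron integral is reconstructed on the fly as a dot product over the Cholesky index, sum_a cholesky_mo_transp(a,i,k) * cholesky_mo_transp(a,j,l). The sketch below shows that reconstruction explicitly; it is equivalent to the ddot call used further down in get_two_e_integral, and the function name is illustrative only, not part of the patch:

      double precision function mo_integral_from_cholesky(i, j, k, l)
        ! Illustrative sketch only: same contraction as the ddot call in
        ! get_two_e_integral when do_mo_cholesky is true. Relies on the
        ! cholesky_mo_num and cholesky_mo_transp providers from cholesky.irp.f
        ! (resolved automatically in an IRPF90 source file).
        implicit none
        integer, intent(in) :: i, j, k, l
        integer             :: a
        mo_integral_from_cholesky = 0.d0
        do a = 1, cholesky_mo_num
          mo_integral_from_cholesky = mo_integral_from_cholesky &
            + cholesky_mo_transp(a,i,k) * cholesky_mo_transp(a,j,l)
        enddo
      end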
double precision :: get_two_e_integral double precision :: integral - if (do_ao_cholesky) then + if (do_mo_cholesky) then double precision, allocatable :: buffer_jj(:,:), buffer(:,:,:) allocate(buffer_jj(cholesky_mo_num,mo_num), buffer(mo_num,mo_num,mo_num)) diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f index ada256a2..eeb4279f 100644 --- a/src/mo_two_e_ints/map_integrals.irp.f +++ b/src/mo_two_e_ints/map_integrals.irp.f @@ -32,60 +32,96 @@ subroutine insert_into_mo_integrals_map(n_integrals, & call map_update(mo_integrals_map, buffer_i, buffer_values, n_integrals, thr) end - BEGIN_PROVIDER [ integer*4, mo_integrals_cache_min ] -&BEGIN_PROVIDER [ integer*4, mo_integrals_cache_max ] -&BEGIN_PROVIDER [ integer*8, mo_integrals_cache_min_8 ] -&BEGIN_PROVIDER [ integer*8, mo_integrals_cache_max_8 ] + BEGIN_PROVIDER [ integer, mo_integrals_cache_min ] +&BEGIN_PROVIDER [ integer, mo_integrals_cache_max ] +&BEGIN_PROVIDER [ integer, mo_integrals_cache_size ] implicit none BEGIN_DOC ! Min and max values of the MOs for which the integrals are in the cache END_DOC - mo_integrals_cache_min_8 = max(1_8,elec_alpha_num - 63_8) - mo_integrals_cache_max_8 = min(int(mo_num,8),mo_integrals_cache_min_8+127_8) - mo_integrals_cache_min = max(1,elec_alpha_num - 63) - mo_integrals_cache_max = min(mo_num,mo_integrals_cache_min+127) + + mo_integrals_cache_size = 2**mo_integrals_cache_shift + + mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) ) + mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1) + print *, 'MO integrals cache: (', mo_integrals_cache_min, ', ', mo_integrals_cache_max, ')' END_PROVIDER -BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:128_8*128_8*128_8*128_8) ] +BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:(1_8*mo_integrals_cache_size)**4) ] implicit none BEGIN_DOC ! Cache of MO integrals for fast access END_DOC PROVIDE mo_two_e_integrals_in_map - integer*8 :: i,j,k,l - integer*4 :: i4,j4,k4,l4 + integer :: i,j,k,l integer*8 :: ii integer(key_kind) :: idx real(integral_kind) :: integral FREE ao_integrals_cache - !$OMP PARALLEL DO PRIVATE (i,j,k,l,i4,j4,k4,l4,idx,ii,integral) - do l=mo_integrals_cache_min_8,mo_integrals_cache_max_8 - l4 = int(l,4) - do k=mo_integrals_cache_min_8,mo_integrals_cache_max_8 - k4 = int(k,4) - do j=mo_integrals_cache_min_8,mo_integrals_cache_max_8 - j4 = int(j,4) - do i=mo_integrals_cache_min_8,mo_integrals_cache_max_8 - i4 = int(i,4) - !DIR$ FORCEINLINE - call two_e_integrals_index(i4,j4,k4,l4,idx) - !DIR$ FORCEINLINE - call map_get(mo_integrals_map,idx,integral) - ii = l-mo_integrals_cache_min_8 - ii = ior( shiftl(ii,7), k-mo_integrals_cache_min_8) - ii = ior( shiftl(ii,7), j-mo_integrals_cache_min_8) - ii = ior( shiftl(ii,7), i-mo_integrals_cache_min_8) - mo_integrals_cache(ii) = integral + if (do_mo_cholesky) then + + call set_multiple_levels_omp(.False.) 
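+    ! The cache holds every MO two-electron integral whose four indices lie in
+    ! [mo_integrals_cache_min, mo_integrals_cache_max]: (2**mo_integrals_cache_shift)**4
+    ! double-precision values, i.e. 2**(4*mo_integrals_cache_shift+3) bytes
+    ! (shift 7 -> 2 GiB, shift 9 -> 512 GiB, matching the table in EZFIO.cfg).
+    ! With Cholesky vectors, each (k,l) block of the cache is filled below by a single
+    ! DGEMM over the Cholesky index instead of one map lookup per integral.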
+ !$OMP PARALLEL DO PRIVATE (k,l,ii) + do l=mo_integrals_cache_min,mo_integrals_cache_max + do k=mo_integrals_cache_min,mo_integrals_cache_max + ii = int(l-mo_integrals_cache_min,8) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8)) + ii = shiftl(ii,mo_integrals_cache_shift) + ii = shiftl(ii,mo_integrals_cache_shift) + call dgemm('T','N', mo_integrals_cache_max-mo_integrals_cache_min+1, & + mo_integrals_cache_max-mo_integrals_cache_min+1, & + cholesky_mo_num, 1.d0, & + cholesky_mo_transp(1,mo_integrals_cache_min,k), cholesky_mo_num, & + cholesky_mo_transp(1,mo_integrals_cache_min,l), cholesky_mo_num, 0.d0, & + mo_integrals_cache(ii), mo_integrals_cache_size) + enddo + enddo + !$OMP END PARALLEL DO + + else + !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx,ii,integral) + do l=mo_integrals_cache_min,mo_integrals_cache_max + do k=mo_integrals_cache_min,mo_integrals_cache_max + do j=mo_integrals_cache_min,mo_integrals_cache_max + do i=mo_integrals_cache_min,mo_integrals_cache_max + !DIR$ FORCEINLINE + call two_e_integrals_index(i,j,k,l,idx) + !DIR$ FORCEINLINE + call map_get(mo_integrals_map,idx,integral) + ii = int(l-mo_integrals_cache_min,8) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8)) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(j-mo_integrals_cache_min,8)) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(i-mo_integrals_cache_min,8)) + mo_integrals_cache(ii) = integral + enddo enddo enddo enddo - enddo - !$OMP END PARALLEL DO + !$OMP END PARALLEL DO + endif END_PROVIDER +double precision function get_two_e_integral_cache(i,j,k,l) + use map_module + implicit none + BEGIN_DOC + ! Returns one integral in the MO basis taken from the cache + END_DOC + integer, intent(in) :: i,j,k,l + integer*8 :: ii + + ii = int(l-mo_integrals_cache_min,8) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8)) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(j-mo_integrals_cache_min,8)) + ii = ior( shiftl(ii,mo_integrals_cache_shift), int(i-mo_integrals_cache_min,8)) + get_two_e_integral_cache = mo_integrals_cache(ii) + +end + + double precision function get_two_e_integral(i,j,k,l,map) use map_module implicit none @@ -95,10 +131,11 @@ double precision function get_two_e_integral(i,j,k,l,map) integer, intent(in) :: i,j,k,l integer(key_kind) :: idx integer :: ii - integer*8 :: ii_8 type(map_type), intent(inout) :: map real(integral_kind) :: tmp - PROVIDE mo_two_e_integrals_in_map mo_integrals_cache + + PROVIDE mo_two_e_integrals_in_map mo_integrals_cache do_mo_cholesky + if (use_banned_excitation) then if (banned_excitation(i,k)) then get_two_e_integral = 0.d0 @@ -109,26 +146,181 @@ double precision function get_two_e_integral(i,j,k,l,map) return endif endif + + ii = l-mo_integrals_cache_min ii = ior(ii, k-mo_integrals_cache_min) ii = ior(ii, j-mo_integrals_cache_min) ii = ior(ii, i-mo_integrals_cache_min) - if (iand(ii, -128) /= 0) then - !DIR$ FORCEINLINE - call two_e_integrals_index(i,j,k,l,idx) - !DIR$ FORCEINLINE - call map_get(map,idx,tmp) - get_two_e_integral = dble(tmp) + + if (iand(ii, -mo_integrals_cache_size) == 0) then + + double precision, external :: get_two_e_integral_cache + get_two_e_integral = get_two_e_integral_cache(i,j,k,l) + else - ii_8 = int(l,8)-mo_integrals_cache_min_8 - ii_8 = ior( shiftl(ii_8,7), int(k,8)-mo_integrals_cache_min_8) - ii_8 = ior( shiftl(ii_8,7), int(j,8)-mo_integrals_cache_min_8) - ii_8 = ior( shiftl(ii_8,7), int(i,8)-mo_integrals_cache_min_8) - get_two_e_integral = 
mo_integrals_cache(ii_8) + + ! Integral is not in the cache + + if (do_mo_cholesky) then + + double precision, external :: ddot + get_two_e_integral = ddot(cholesky_mo_num, cholesky_mo_transp(1,i,k), 1, cholesky_mo_transp(1,j,l), 1) +! double precision, external :: get_from_mo_cholesky_cache +! get_two_e_integral = get_from_mo_cholesky_cache(i,j,k,l,.False.) + + else + + !DIR$ FORCEINLINE + call two_e_integrals_index(i,j,k,l,idx) + !DIR$ FORCEINLINE + call map_get(map,idx,tmp) + get_two_e_integral = dble(tmp) + endif + endif end +subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map) + use map_module + implicit none + BEGIN_DOC + ! Returns multiple integrals in the MO basis, all + ! i for j,k,l fixed. + END_DOC + integer, intent(in) :: j,k,l, sze + double precision, intent(out) :: out_val(sze) + type(map_type), intent(inout) :: map + integer :: i + + integer :: ii + real(integral_kind) :: tmp + integer(key_kind) :: i1, idx + integer(key_kind) :: p,q,r,s,i2 + PROVIDE mo_two_e_integrals_in_map mo_integrals_cache + + if (banned_excitation(j,l)) then + out_val(1:sze) = 0.d0 + return + endif +! + ii = l-mo_integrals_cache_min + ii = ior(ii, k-mo_integrals_cache_min) + ii = ior(ii, j-mo_integrals_cache_min) + + if (iand(ii, -mo_integrals_cache_size) == 0) then + ! Some integrals are in the cache + + if (mo_integrals_cache_min > 1) then + + if (do_mo_cholesky) then + + !TODO: here + call dgemv('T', cholesky_mo_num, mo_integrals_cache_min-1, 1.d0, & + cholesky_mo_transp(1,1,k), cholesky_mo_num, & + cholesky_mo_transp(1,j,l), 1, 0.d0, & + out_val, 1) + + else + + q = min(j,l) + s = max(j,l) + q = q+shiftr(s*s-s,1) + + do i=1,mo_integrals_cache_min-1 + if (banned_excitation(i,k)) then + out_val(i) = 0.d0 + cycle + endif + p = min(i,k) + r = max(i,k) + p = p+shiftr(r*r-r,1) + i1 = min(p,q) + i2 = max(p,q) + idx = i1+shiftr(i2*i2-i2,1) + !DIR$ FORCEINLINE + call map_get(map,idx,tmp) + out_val(i) = dble(tmp) + enddo + + endif + + endif + + call get_mo_two_e_integrals_cache(j,k,l,sze,out_val) + + if (mo_integrals_cache_max < mo_num) then + + if (do_mo_cholesky) then + + !TODO: here + call dgemv('T', cholesky_mo_num, mo_num-mo_integrals_cache_max, 1.d0, & + cholesky_mo_transp(1,mo_integrals_cache_max+1,k), cholesky_mo_num, & + cholesky_mo_transp(1,j,l), 1, 0.d0, & + out_val(mo_integrals_cache_max+1), 1) + + else + + q = min(j,l) + s = max(j,l) + q = q+shiftr(s*s-s,1) + + do i=mo_integrals_cache_max+1,mo_num + if (banned_excitation(i,k)) then + out_val(i) = 0.d0 + cycle + endif + p = min(i,k) + r = max(i,k) + p = p+shiftr(r*r-r,1) + i1 = min(p,q) + i2 = max(p,q) + idx = i1+shiftr(i2*i2-i2,1) + !DIR$ FORCEINLINE + call map_get(map,idx,tmp) + out_val(i) = dble(tmp) + enddo + + endif + + endif + + else + + if (do_mo_cholesky) then + + !TODO: here + call dgemv('T', cholesky_mo_num, mo_num, 1.d0, & + cholesky_mo_transp(1,1,k), cholesky_mo_num, & + cholesky_mo_transp(1,j,l), 1, 0.d0, & + out_val, 1) + + else + + q = min(j,l) + s = max(j,l) + q = q+shiftr(s*s-s,1) + + do i=1,sze + if (banned_excitation(i,k)) cycle + p = min(i,k) + r = max(i,k) + p = p+shiftr(r*r-r,1) + i1 = min(p,q) + i2 = max(p,q) + idx = i1+shiftr(i2*i2-i2,1) + !DIR$ FORCEINLINE + call map_get(map,idx,tmp) + out_val(i) = dble(tmp) + enddo + + endif + + endif + +end + double precision function mo_two_e_integral(i,j,k,l) implicit none BEGIN_DOC @@ -142,68 +334,25 @@ double precision function mo_two_e_integral(i,j,k,l) return end -subroutine get_mo_two_e_integrals(j,k,l,sze,out_val,map) + +subroutine 
get_mo_two_e_integrals_cache(j,k,l,sze,out_val) use map_module implicit none BEGIN_DOC ! Returns multiple integrals in the MO basis, all - ! i for j,k,l fixed. + ! i for j,k,l fixed, all integrals from the cache END_DOC integer, intent(in) :: j,k,l, sze double precision, intent(out) :: out_val(sze) - type(map_type), intent(inout) :: map - integer :: i - double precision, external :: get_two_e_integral + integer*8 :: ii - integer :: ii, ii0 - integer*8 :: ii_8, ii0_8 - real(integral_kind) :: tmp - integer(key_kind) :: i1, idx - integer(key_kind) :: p,q,r,s,i2 - PROVIDE mo_two_e_integrals_in_map mo_integrals_cache + ii = int(l-mo_integrals_cache_min,8) + ii = ior( shiftl(ii, mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8)) + ii = ior( shiftl(ii, mo_integrals_cache_shift), int(j-mo_integrals_cache_min,8)) + ii = shiftl(ii, mo_integrals_cache_shift) + out_val(mo_integrals_cache_min:mo_integrals_cache_max) = & + mo_integrals_cache(ii:ii+int(mo_integrals_cache_max-mo_integrals_cache_min,8)) -!DEBUG -! do i=1,sze -! out_val(i) = get_two_e_integral(i,j,k,l,map) -! enddo -! return -!DEBUG - - out_val(1:sze) = 0.d0 - if (banned_excitation(j,l)) then - return - endif - - ii0 = l-mo_integrals_cache_min - ii0 = ior(ii0, k-mo_integrals_cache_min) - ii0 = ior(ii0, j-mo_integrals_cache_min) - - ii0_8 = int(l,8)-mo_integrals_cache_min_8 - ii0_8 = ior( shiftl(ii0_8,7), int(k,8)-mo_integrals_cache_min_8) - ii0_8 = ior( shiftl(ii0_8,7), int(j,8)-mo_integrals_cache_min_8) - - q = min(j,l) - s = max(j,l) - q = q+shiftr(s*s-s,1) - - do i=1,sze - if (banned_excitation(i,k)) cycle - ii = ior(ii0, i-mo_integrals_cache_min) - if (iand(ii, -128) == 0) then - ii_8 = ior( shiftl(ii0_8,7), int(i,8)-mo_integrals_cache_min_8) - out_val(i) = mo_integrals_cache(ii_8) - else - p = min(i,k) - r = max(i,k) - p = p+shiftr(r*r-r,1) - i1 = min(p,q) - i2 = max(p,q) - idx = i1+shiftr(i2*i2-i2,1) - !DIR$ FORCEINLINE - call map_get(map,idx,tmp) - out_val(i) = dble(tmp) - endif - enddo end subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map) @@ -220,9 +369,32 @@ subroutine get_mo_two_e_integrals_ij(k,l,sze,out_array,map) integer :: j real(integral_kind), allocatable :: tmp_val(:) - do j=1,sze - call get_mo_two_e_integrals(j,k,l,sze,out_array(1,j),map) - enddo + if ( (mo_integrals_cache_min>1).or.(mo_integrals_cache_max1).or.(mo_integrals_cache_max1).or.(mo_integrals_cache_max1).or.(mo_integrals_cache_max MO integrals transformation' - print *, '---------------------------------' - print *, '' - call wall_time(wall_1) call cpu_time(cpu_1) - if(no_vvvv_integrals)then - call four_idx_novvvv_old + if (do_mo_cholesky) then + PROVIDE cholesky_mo_transp else if (do_ao_cholesky) then call add_integrals_to_map_cholesky + else if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then + call four_idx_dgemm else - if (dble(ao_num)**4 * 32.d-9 < dble(qp_max_mem)) then - call four_idx_dgemm - else - call add_integrals_to_map(full_ijkl_bitmask_4) - endif + call add_integrals_to_map(full_ijkl_bitmask_4) endif endif @@ -92,8 +99,15 @@ subroutine four_idx_dgemm double precision, allocatable :: a1(:,:,:,:) double precision, allocatable :: a2(:,:,:,:) + PROVIDE ao_two_e_integrals_in_map mo_coef + + print *, '' + print *, 'DGEMM-based AO->MO Transformation' + print *, '---------------------------------' + print *, '' + if (ao_num > 1289) then - print *, irp_here, ': Integer overflow in ao_num**3' + print *, irp_here, ': Integer overflow in ao_num**3. Set do_ao_cholesky=.True.' 
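+    ! ao_num**3 no longer fits in a default 32-bit integer once ao_num exceeds
+    ! roughly 1290 (1291**3 > 2**31-1 = huge(1)), hence the conservative 1289 bound
+    ! above. The caller only selects this path when dble(ao_num)**4 * 32.d-9
+    ! (an estimate in GB, ~32 bytes per AO index quadruplet) is below qp_max_mem,
+    ! leaving room for the ao_num**4 double-precision scratch arrays a1 and a2.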
endif allocate (a1(ao_num,ao_num,ao_num,ao_num)) @@ -213,6 +227,12 @@ subroutine add_integrals_to_map(mask_ijkl) PROVIDE ao_two_e_integrals_in_map mo_coef + + print *, '' + print *, 'Sparse AO->MO Transformation' + print *, '----------------------------' + print *, '' + !Get list of MOs for i,j,k and l !------------------------------- @@ -453,6 +473,9 @@ subroutine add_integrals_to_map(mask_ijkl) end + + + subroutine add_integrals_to_map_cholesky use bitmasks implicit none @@ -469,6 +492,7 @@ subroutine add_integrals_to_map_cholesky integer(key_kind) , allocatable :: buffer_i(:) real(integral_kind), allocatable :: buffer_value(:) + PROVIDE cholesky_mo_transp call set_multiple_levels_omp(.False.) !$OMP PARALLEL DEFAULT(SHARED) & @@ -515,837 +539,7 @@ subroutine add_integrals_to_map_cholesky end -subroutine add_integrals_to_map_three_indices(mask_ijk) - use bitmasks - implicit none - BEGIN_DOC - ! Adds integrals to the MO map according to some bitmask - END_DOC - - integer(bit_kind), intent(in) :: mask_ijk(N_int,3) - - integer :: i,j,k,l - integer :: i0,j0,k0,l0 - double precision :: c, cpu_1, cpu_2, wall_1, wall_2, wall_0 - - integer, allocatable :: list_ijkl(:,:) - integer :: n_i, n_j, n_k - integer :: m - integer, allocatable :: two_e_tmp_0_idx(:) - real(integral_kind), allocatable :: two_e_tmp_0(:,:) - double precision, allocatable :: two_e_tmp_1(:) - double precision, allocatable :: two_e_tmp_2(:,:) - double precision, allocatable :: two_e_tmp_3(:,:,:) - !DIR$ ATTRIBUTES ALIGN : 64 :: two_e_tmp_1, two_e_tmp_2, two_e_tmp_3 - - integer :: n_integrals - integer :: size_buffer - integer(key_kind),allocatable :: buffer_i(:) - real(integral_kind),allocatable :: buffer_value(:) - double precision :: map_mb - - integer :: i1,j1,k1,l1, ii1, kmax, thread_num - integer :: i2,i3,i4 - double precision,parameter :: thr_coef = 1.d-10 - - PROVIDE ao_two_e_integrals_in_map mo_coef - - !Get list of MOs for i,j,k and l - !------------------------------- - - allocate(list_ijkl(mo_num,4)) - call bitstring_to_list( mask_ijk(1,1), list_ijkl(1,1), n_i, N_int ) - call bitstring_to_list( mask_ijk(1,2), list_ijkl(1,2), n_j, N_int ) - call bitstring_to_list( mask_ijk(1,3), list_ijkl(1,3), n_k, N_int ) - j = 0 - do i = 1, N_int - j += popcnt(mask_ijk(i,1)) - enddo - if(j==0)then - return - endif - - j = 0 - do i = 1, N_int - j += popcnt(mask_ijk(i,2)) - enddo - if(j==0)then - return - endif - - j = 0 - do i = 1, N_int - j += popcnt(mask_ijk(i,3)) - enddo - if(j==0)then - return - endif - - if (ao_num > 1289) then - print *, irp_here, ': Integer overflow in ao_num**3' - endif - size_buffer = min(ao_num*ao_num*ao_num,16000000) - print*, 'Providing the molecular integrals ' - print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& - ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core' - - call wall_time(wall_1) - call cpu_time(cpu_1) - !$OMP PARALLEL PRIVATE(m,l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, & - !$OMP two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,& - !$OMP buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0, & - !$OMP wall_0,thread_num) & - !$OMP DEFAULT(NONE) & - !$OMP SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k, & - !$OMP mo_coef_transp, & - !$OMP mo_coef_transp_is_built, list_ijkl, & - !$OMP mo_coef_is_built, wall_1, & - !$OMP mo_coef,mo_integrals_threshold,mo_integrals_map) - n_integrals = 0 - wall_0 = wall_1 - allocate(two_e_tmp_3(mo_num, n_j, n_k), & - two_e_tmp_1(mo_num), & - two_e_tmp_0(ao_num,ao_num), & - two_e_tmp_0_idx(ao_num), & - two_e_tmp_2(mo_num, n_j), & - 
buffer_i(size_buffer), & - buffer_value(size_buffer) ) - - thread_num = 0 - !$ thread_num = omp_get_thread_num() - !$OMP DO SCHEDULE(guided) - do l1 = 1,ao_num - two_e_tmp_3 = 0.d0 - do k1 = 1,ao_num - two_e_tmp_2 = 0.d0 - do j1 = 1,ao_num - call get_ao_two_e_integrals(j1,k1,l1,ao_num,two_e_tmp_0(1,j1)) - enddo - do j1 = 1,ao_num - kmax = 0 - do i1 = 1,ao_num - c = two_e_tmp_0(i1,j1) - if (c == 0.d0) then - cycle - endif - kmax += 1 - two_e_tmp_0(kmax,j1) = c - two_e_tmp_0_idx(kmax) = i1 - enddo - - if (kmax==0) then - cycle - endif - - two_e_tmp_1 = 0.d0 - ii1=1 - do ii1 = 1,kmax-4,4 - i1 = two_e_tmp_0_idx(ii1) - i2 = two_e_tmp_0_idx(ii1+1) - i3 = two_e_tmp_0_idx(ii1+2) - i4 = two_e_tmp_0_idx(ii1+3) - do i = list_ijkl(1,1), list_ijkl(n_i,1) - two_e_tmp_1(i) = two_e_tmp_1(i) + & - mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1) + & - mo_coef_transp(i,i2) * two_e_tmp_0(ii1+1,j1) + & - mo_coef_transp(i,i3) * two_e_tmp_0(ii1+2,j1) + & - mo_coef_transp(i,i4) * two_e_tmp_0(ii1+3,j1) - enddo ! i - enddo ! ii1 - - i2 = ii1 - do ii1 = i2,kmax - i1 = two_e_tmp_0_idx(ii1) - do i = list_ijkl(1,1), list_ijkl(n_i,1) - two_e_tmp_1(i) = two_e_tmp_1(i) + mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1) - enddo ! i - enddo ! ii1 - c = 0.d0 - - do i = list_ijkl(1,1), list_ijkl(n_i,1) - c = max(c,abs(two_e_tmp_1(i))) - if (c>mo_integrals_threshold) exit - enddo - if ( c < mo_integrals_threshold ) then - cycle - endif - - do j0 = 1, n_j - j = list_ijkl(j0,2) - c = mo_coef_transp(j,j1) - if (abs(c) < thr_coef) then - cycle - endif - do i = list_ijkl(1,1), list_ijkl(n_i,1) - two_e_tmp_2(i,j0) = two_e_tmp_2(i,j0) + c * two_e_tmp_1(i) - enddo ! i - enddo ! j - enddo !j1 - if ( maxval(abs(two_e_tmp_2)) < mo_integrals_threshold ) then - cycle - endif - - - do k0 = 1, n_k - k = list_ijkl(k0,3) - c = mo_coef_transp(k,k1) - if (abs(c) < thr_coef) then - cycle - endif - - do j0 = 1, n_j - j = list_ijkl(j0,2) - do i = list_ijkl(1,1), k - two_e_tmp_3(i,j0,k0) = two_e_tmp_3(i,j0,k0) + c* two_e_tmp_2(i,j0) - enddo!i - enddo !j - - enddo !k - enddo !k1 - - - - do l0 = 1,n_j - l = list_ijkl(l0,2) - c = mo_coef_transp(l,l1) - if (abs(c) < thr_coef) then - cycle - endif - do k0 = 1, n_k - k = list_ijkl(k0,3) - i1 = shiftr((k*k-k),1) - two_e_tmp_1 = 0.d0 - j0 = l0 - j = list_ijkl(j0,2) - do i0 = 1, n_i - i = list_ijkl(i0,1) - if (i>k) then - exit - endif - two_e_tmp_1(i) = c*two_e_tmp_3(i,j0,k0) - enddo - - do i0 = 1, n_i - i = list_ijkl(i0,1) - if (i>k) then !min(k,j1-i1) - exit - endif - if (abs(two_e_tmp_1(i)) < mo_integrals_threshold) then - cycle - endif - n_integrals += 1 - buffer_value(n_integrals) = two_e_tmp_1(i) - if(i==k .and. j==l .and. 
i.ne.j)then - buffer_value(n_integrals) = buffer_value(n_integrals) *0.5d0 - endif - !DIR$ FORCEINLINE - call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals)) - if (n_integrals == size_buffer) then - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& - real(mo_integrals_threshold,integral_kind)) - n_integrals = 0 - endif - enddo - enddo - enddo - - do l0 = 1,n_j - l = list_ijkl(l0,2) - c = mo_coef_transp(l,l1) - if (abs(c) < thr_coef) then - cycle - endif - do k0 = 1, n_k - k = list_ijkl(k0,3) - i1 = shiftr((k*k-k),1) - two_e_tmp_1 = 0.d0 - j0 = k0 - j = list_ijkl(k0,2) - i0 = l0 - i = list_ijkl(i0,2) - if (k==l) then - cycle - endif - two_e_tmp_1(i) = c*two_e_tmp_3(i,j0,k0) - - n_integrals += 1 - buffer_value(n_integrals) = two_e_tmp_1(i) - !DIR$ FORCEINLINE - call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals)) - if (n_integrals == size_buffer) then - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& - real(mo_integrals_threshold,integral_kind)) - n_integrals = 0 - endif - enddo - enddo - - call wall_time(wall_2) - if (thread_num == 0) then - if (wall_2 - wall_0 > 1.d0) then - wall_0 = wall_2 - print*, 100.*float(l1)/float(ao_num), '% in ', & - wall_2-wall_1, 's', map_mb(mo_integrals_map) ,'MB' - endif - endif - enddo - !$OMP END DO NOWAIT - deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3) - - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& - real(mo_integrals_threshold,integral_kind)) - deallocate(buffer_i, buffer_value) - !$OMP END PARALLEL - call map_merge(mo_integrals_map) - - call wall_time(wall_2) - call cpu_time(cpu_2) - integer*8 :: get_mo_map_size, mo_map_size - mo_map_size = get_mo_map_size() - - deallocate(list_ijkl) - - - print*,'Molecular integrals provided:' - print*,' Size of MO map ', map_mb(mo_integrals_map) ,'MB' - print*,' Number of MO integrals: ', mo_map_size - print*,' cpu time :',cpu_2 - cpu_1, 's' - print*,' wall time :',wall_2 - wall_1, 's ( x ', (cpu_2-cpu_1)/(wall_2-wall_1), ')' - -end - - -subroutine add_integrals_to_map_no_exit_34(mask_ijkl) - use bitmasks - implicit none - - BEGIN_DOC - ! 
Adds integrals to tha MO map according to some bitmask - END_DOC - - integer(bit_kind), intent(in) :: mask_ijkl(N_int,4) - - integer :: i,j,k,l - integer :: i0,j0,k0,l0 - double precision :: c, cpu_1, cpu_2, wall_1, wall_2, wall_0 - - integer, allocatable :: list_ijkl(:,:) - integer :: n_i, n_j, n_k, n_l - integer, allocatable :: two_e_tmp_0_idx(:) - real(integral_kind), allocatable :: two_e_tmp_0(:,:) - double precision, allocatable :: two_e_tmp_1(:) - double precision, allocatable :: two_e_tmp_2(:,:) - double precision, allocatable :: two_e_tmp_3(:,:,:) - !DIR$ ATTRIBUTES ALIGN : 64 :: two_e_tmp_1, two_e_tmp_2, two_e_tmp_3 - - integer :: n_integrals - integer :: size_buffer - integer(key_kind),allocatable :: buffer_i(:) - real(integral_kind),allocatable :: buffer_value(:) - double precision :: map_mb - - integer :: i1,j1,k1,l1, ii1, kmax, thread_num - integer :: i2,i3,i4 - double precision,parameter :: thr_coef = 1.d-10 - - PROVIDE ao_two_e_integrals_in_map mo_coef - - !Get list of MOs for i,j,k and l - !------------------------------- - - allocate(list_ijkl(mo_num,4)) - call bitstring_to_list( mask_ijkl(1,1), list_ijkl(1,1), n_i, N_int ) - call bitstring_to_list( mask_ijkl(1,2), list_ijkl(1,2), n_j, N_int ) - call bitstring_to_list( mask_ijkl(1,3), list_ijkl(1,3), n_k, N_int ) - call bitstring_to_list( mask_ijkl(1,4), list_ijkl(1,4), n_l, N_int ) - - if (ao_num > 1289) then - print *, irp_here, ': Integer overflow in ao_num**3' - endif - size_buffer = min(ao_num*ao_num*ao_num,16000000) - print*, 'Providing the molecular integrals ' - print*, 'Buffers : ', 8.*(mo_num*(n_j)*(n_k+1) + mo_num+& - ao_num+ao_num*ao_num+ size_buffer*3)/(1024*1024), 'MB / core' - - call wall_time(wall_1) - call cpu_time(cpu_1) - - !$OMP PARALLEL PRIVATE(l1,k1,j1,i1,i2,i3,i4,i,j,k,l,c, ii1,kmax, & - !$OMP two_e_tmp_0_idx, two_e_tmp_0, two_e_tmp_1,two_e_tmp_2,two_e_tmp_3,& - !$OMP buffer_i,buffer_value,n_integrals,wall_2,i0,j0,k0,l0, & - !$OMP wall_0,thread_num) & - !$OMP DEFAULT(NONE) & - !$OMP SHARED(size_buffer,ao_num,mo_num,n_i,n_j,n_k,n_l, & - !$OMP mo_coef_transp, & - !$OMP mo_coef_transp_is_built, list_ijkl, & - !$OMP mo_coef_is_built, wall_1, & - !$OMP mo_coef,mo_integrals_threshold,mo_integrals_map) - n_integrals = 0 - wall_0 = wall_1 - allocate(two_e_tmp_3(mo_num, n_j, n_k), & - two_e_tmp_1(mo_num), & - two_e_tmp_0(ao_num,ao_num), & - two_e_tmp_0_idx(ao_num), & - two_e_tmp_2(mo_num, n_j), & - buffer_i(size_buffer), & - buffer_value(size_buffer) ) - - thread_num = 0 - !$ thread_num = omp_get_thread_num() - !$OMP DO SCHEDULE(guided) - do l1 = 1,ao_num - !IRP_IF COARRAY - ! if (mod(l1-this_image(),num_images()) /= 0 ) then - ! cycle - ! endif - !IRP_ENDIF - two_e_tmp_3 = 0.d0 - do k1 = 1,ao_num - two_e_tmp_2 = 0.d0 - do j1 = 1,ao_num - call get_ao_two_e_integrals(j1,k1,l1,ao_num,two_e_tmp_0(1,j1)) - enddo - do j1 = 1,ao_num - kmax = 0 - do i1 = 1,ao_num - c = two_e_tmp_0(i1,j1) - if (c == 0.d0) then - cycle - endif - kmax += 1 - two_e_tmp_0(kmax,j1) = c - two_e_tmp_0_idx(kmax) = i1 - enddo - - if (kmax==0) then - cycle - endif - - two_e_tmp_1 = 0.d0 - ii1=1 - do ii1 = 1,kmax-4,4 - i1 = two_e_tmp_0_idx(ii1) - i2 = two_e_tmp_0_idx(ii1+1) - i3 = two_e_tmp_0_idx(ii1+2) - i4 = two_e_tmp_0_idx(ii1+3) - do i = list_ijkl(1,1), list_ijkl(n_i,1) - two_e_tmp_1(i) = two_e_tmp_1(i) + & - mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1) + & - mo_coef_transp(i,i2) * two_e_tmp_0(ii1+1,j1) + & - mo_coef_transp(i,i3) * two_e_tmp_0(ii1+2,j1) + & - mo_coef_transp(i,i4) * two_e_tmp_0(ii1+3,j1) - enddo ! i - enddo ! 
ii1 - - i2 = ii1 - do ii1 = i2,kmax - i1 = two_e_tmp_0_idx(ii1) - do i = list_ijkl(1,1), list_ijkl(n_i,1) - two_e_tmp_1(i) = two_e_tmp_1(i) + mo_coef_transp(i,i1) * two_e_tmp_0(ii1,j1) - enddo ! i - enddo ! ii1 - c = 0.d0 - - do i = list_ijkl(1,1), list_ijkl(n_i,1) - c = max(c,abs(two_e_tmp_1(i))) - if (c>mo_integrals_threshold) exit - enddo - if ( c < mo_integrals_threshold ) then - cycle - endif - - do j0 = 1, n_j - j = list_ijkl(j0,2) - c = mo_coef_transp(j,j1) - if (abs(c) < thr_coef) then - cycle - endif - do i = list_ijkl(1,1), list_ijkl(n_i,1) - two_e_tmp_2(i,j0) = two_e_tmp_2(i,j0) + c * two_e_tmp_1(i) - enddo ! i - enddo ! j - enddo !j1 - if ( maxval(abs(two_e_tmp_2)) < mo_integrals_threshold ) then - cycle - endif - - - do k0 = 1, n_k - k = list_ijkl(k0,3) - c = mo_coef_transp(k,k1) - if (abs(c) < thr_coef) then - cycle - endif - - do j0 = 1, n_j - j = list_ijkl(j0,2) - do i = list_ijkl(1,1), k - two_e_tmp_3(i,j0,k0) = two_e_tmp_3(i,j0,k0) + c* two_e_tmp_2(i,j0) - enddo!i - enddo !j - - enddo !k - enddo !k1 - - - - do l0 = 1,n_l - l = list_ijkl(l0,4) - c = mo_coef_transp(l,l1) - if (abs(c) < thr_coef) then - cycle - endif - j1 = shiftr((l*l-l),1) - do j0 = 1, n_j - j = list_ijkl(j0,2) - if (j > l) then - exit - endif - j1 += 1 - do k0 = 1, n_k - k = list_ijkl(k0,3) - i1 = shiftr((k*k-k),1) - two_e_tmp_1 = 0.d0 - do i0 = 1, n_i - i = list_ijkl(i0,1) - if (i>k) then - exit - endif - two_e_tmp_1(i) = c*two_e_tmp_3(i,j0,k0) - enddo - - do i0 = 1, n_i - i = list_ijkl(i0,1) - if(i> k)then - exit - endif - - if (abs(two_e_tmp_1(i)) < mo_integrals_threshold) then - cycle - endif - n_integrals += 1 - buffer_value(n_integrals) = two_e_tmp_1(i) - !DIR$ FORCEINLINE - call mo_two_e_integrals_index(i,j,k,l,buffer_i(n_integrals)) - if (n_integrals == size_buffer) then - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& - real(mo_integrals_threshold,integral_kind)) - n_integrals = 0 - endif - enddo - enddo - enddo - enddo - - call wall_time(wall_2) - if (thread_num == 0) then - if (wall_2 - wall_0 > 1.d0) then - wall_0 = wall_2 - print*, 100.*float(l1)/float(ao_num), '% in ', & - wall_2-wall_1, 's', map_mb(mo_integrals_map) ,'MB' - endif - endif - enddo - !$OMP END DO NOWAIT - deallocate (two_e_tmp_1,two_e_tmp_2,two_e_tmp_3) - - call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& - real(mo_integrals_threshold,integral_kind)) - deallocate(buffer_i, buffer_value) - !$OMP END PARALLEL - !IRP_IF COARRAY - ! print*, 'Communicating the map' - ! call communicate_mo_integrals() - !IRP_ENDIF - call map_merge(mo_integrals_map) - - call wall_time(wall_2) - call cpu_time(cpu_2) - integer*8 :: get_mo_map_size, mo_map_size - mo_map_size = get_mo_map_size() - - deallocate(list_ijkl) - - - print*,'Molecular integrals provided:' - print*,' Size of MO map ', map_mb(mo_integrals_map) ,'MB' - print*,' Number of MO integrals: ', mo_map_size - print*,' cpu time :',cpu_2 - cpu_1, 's' - print*,' wall time :',wall_2 - wall_1, 's ( x ', (cpu_2-cpu_1)/(wall_2-wall_1), ')' - - -end - - - - BEGIN_PROVIDER [ double precision, mo_two_e_integral_jj_from_ao, (mo_num,mo_num) ] -&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_jj_exchange_from_ao, (mo_num,mo_num) ] -&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_jj_anti_from_ao, (mo_num,mo_num) ] - implicit none - BEGIN_DOC - ! mo_two_e_integral_jj_from_ao(i,j) = J_ij - ! mo_two_e_integrals_jj_exchange_from_ao(i,j) = J_ij - ! 
mo_two_e_integrals_jj_anti_from_ao(i,j) = J_ij - K_ij - END_DOC - - integer :: i,j,p,q,r,s - double precision :: c - real(integral_kind) :: integral - integer :: n, pp - real(integral_kind), allocatable :: int_value(:) - integer, allocatable :: int_idx(:) - - double precision, allocatable :: iqrs(:,:), iqsr(:,:), iqis(:), iqri(:) - - if (.not.do_direct_integrals) then - PROVIDE ao_two_e_integrals_in_map mo_coef - endif - - mo_two_e_integral_jj_from_ao = 0.d0 - mo_two_e_integrals_jj_exchange_from_ao = 0.d0 - - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: iqrs, iqsr - - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE (i,j,p,q,r,s,integral,c,n,pp,int_value,int_idx, & - !$OMP iqrs, iqsr,iqri,iqis) & - !$OMP SHARED(mo_num,mo_coef_transp,ao_num, & - !$OMP ao_integrals_threshold,do_direct_integrals) & - !$OMP REDUCTION(+:mo_two_e_integral_jj_from_ao,mo_two_e_integrals_jj_exchange_from_ao) - - allocate( int_value(ao_num), int_idx(ao_num), & - iqrs(mo_num,ao_num), iqis(mo_num), iqri(mo_num), & - iqsr(mo_num,ao_num) ) - - !$OMP DO SCHEDULE (guided) - do s=1,ao_num - do q=1,ao_num - - do j=1,ao_num - do i=1,mo_num - iqrs(i,j) = 0.d0 - iqsr(i,j) = 0.d0 - enddo - enddo - - if (do_direct_integrals) then - double precision :: ao_two_e_integral - do r=1,ao_num - call compute_ao_two_e_integrals(q,r,s,ao_num,int_value) - do p=1,ao_num - integral = int_value(p) - if (abs(integral) > ao_integrals_threshold) then - do i=1,mo_num - iqrs(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - call compute_ao_two_e_integrals(q,s,r,ao_num,int_value) - do p=1,ao_num - integral = int_value(p) - if (abs(integral) > ao_integrals_threshold) then - do i=1,mo_num - iqsr(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - enddo - - else - - do r=1,ao_num - call get_ao_two_e_integrals_non_zero(q,r,s,ao_num,int_value,int_idx,n) - do pp=1,n - p = int_idx(pp) - integral = int_value(pp) - if (abs(integral) > ao_integrals_threshold) then - do i=1,mo_num - iqrs(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - call get_ao_two_e_integrals_non_zero(q,s,r,ao_num,int_value,int_idx,n) - do pp=1,n - p = int_idx(pp) - integral = int_value(pp) - if (abs(integral) > ao_integrals_threshold) then - do i=1,mo_num - iqsr(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - enddo - endif - iqis = 0.d0 - iqri = 0.d0 - do r=1,ao_num - do i=1,mo_num - iqis(i) += mo_coef_transp(i,r) * iqrs(i,r) - iqri(i) += mo_coef_transp(i,r) * iqsr(i,r) - enddo - enddo - do i=1,mo_num - do j=1,mo_num - c = mo_coef_transp(j,q)*mo_coef_transp(j,s) - mo_two_e_integral_jj_from_ao(j,i) += c * iqis(i) - mo_two_e_integrals_jj_exchange_from_ao(j,i) += c * iqri(i) - enddo - enddo - - enddo - enddo - !$OMP END DO NOWAIT - deallocate(iqrs,iqsr,int_value,int_idx) - !$OMP END PARALLEL - - mo_two_e_integrals_jj_anti_from_ao = mo_two_e_integral_jj_from_ao - mo_two_e_integrals_jj_exchange_from_ao - - -END_PROVIDER - - BEGIN_PROVIDER [ double precision, mo_two_e_integrals_vv_from_ao, (mo_num,mo_num) ] -&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_vv_exchange_from_ao, (mo_num,mo_num) ] -&BEGIN_PROVIDER [ double precision, mo_two_e_integrals_vv_anti_from_ao, (mo_num,mo_num) ] - implicit none - BEGIN_DOC - ! mo_two_e_integrals_vv_from_ao(i,j) = J_ij - ! mo_two_e_integrals_vv_exchange_from_ao(i,j) = J_ij - ! mo_two_e_integrals_vv_anti_from_ao(i,j) = J_ij - K_ij - ! 
but only for the virtual orbitals - END_DOC - - integer :: i,j,p,q,r,s - integer :: i0,j0 - double precision :: c - real(integral_kind) :: integral - integer :: n, pp - real(integral_kind), allocatable :: int_value(:) - integer, allocatable :: int_idx(:) - - double precision, allocatable :: iqrs(:,:), iqsr(:,:), iqis(:), iqri(:) - - if (.not.do_direct_integrals) then - PROVIDE ao_two_e_integrals_in_map mo_coef - endif - - mo_two_e_integrals_vv_from_ao = 0.d0 - mo_two_e_integrals_vv_exchange_from_ao = 0.d0 - - !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: iqrs, iqsr - - - !$OMP PARALLEL DEFAULT(NONE) & - !$OMP PRIVATE (i0,j0,i,j,p,q,r,s,integral,c,n,pp,int_value,int_idx, & - !$OMP iqrs, iqsr,iqri,iqis) & - !$OMP SHARED(n_virt_orb,mo_num,list_virt,mo_coef_transp,ao_num, & - !$OMP ao_integrals_threshold,do_direct_integrals) & - !$OMP REDUCTION(+:mo_two_e_integrals_vv_from_ao,mo_two_e_integrals_vv_exchange_from_ao) - - allocate( int_value(ao_num), int_idx(ao_num), & - iqrs(mo_num,ao_num), iqis(mo_num), iqri(mo_num),& - iqsr(mo_num,ao_num) ) - - !$OMP DO SCHEDULE (guided) - do s=1,ao_num - do q=1,ao_num - - do j=1,ao_num - do i0=1,n_virt_orb - i = list_virt(i0) - iqrs(i,j) = 0.d0 - iqsr(i,j) = 0.d0 - enddo - enddo - - if (do_direct_integrals) then - double precision :: ao_two_e_integral - do r=1,ao_num - call compute_ao_two_e_integrals(q,r,s,ao_num,int_value) - do p=1,ao_num - integral = int_value(p) - if (abs(integral) > ao_integrals_threshold) then - do i0=1,n_virt_orb - i = list_virt(i0) - iqrs(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - call compute_ao_two_e_integrals(q,s,r,ao_num,int_value) - do p=1,ao_num - integral = int_value(p) - if (abs(integral) > ao_integrals_threshold) then - do i0=1,n_virt_orb - i =list_virt(i0) - iqsr(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - enddo - - else - - do r=1,ao_num - call get_ao_two_e_integrals_non_zero(q,r,s,ao_num,int_value,int_idx,n) - do pp=1,n - p = int_idx(pp) - integral = int_value(pp) - if (abs(integral) > ao_integrals_threshold) then - do i0=1,n_virt_orb - i =list_virt(i0) - iqrs(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - call get_ao_two_e_integrals_non_zero(q,s,r,ao_num,int_value,int_idx,n) - do pp=1,n - p = int_idx(pp) - integral = int_value(pp) - if (abs(integral) > ao_integrals_threshold) then - do i0=1,n_virt_orb - i = list_virt(i0) - iqsr(i,r) += mo_coef_transp(i,p) * integral - enddo - endif - enddo - enddo - endif - iqis = 0.d0 - iqri = 0.d0 - do r=1,ao_num - do i0=1,n_virt_orb - i = list_virt(i0) - iqis(i) += mo_coef_transp(i,r) * iqrs(i,r) - iqri(i) += mo_coef_transp(i,r) * iqsr(i,r) - enddo - enddo - do i0=1,n_virt_orb - i= list_virt(i0) - do j0=1,n_virt_orb - j = list_virt(j0) - c = mo_coef_transp(j,q)*mo_coef_transp(j,s) - mo_two_e_integrals_vv_from_ao(j,i) += c * iqis(i) - mo_two_e_integrals_vv_exchange_from_ao(j,i) += c * iqri(i) - enddo - enddo - - enddo - enddo - !$OMP END DO NOWAIT - deallocate(iqrs,iqsr,int_value,int_idx) - !$OMP END PARALLEL - - mo_two_e_integrals_vv_anti_from_ao = mo_two_e_integrals_vv_from_ao - mo_two_e_integrals_vv_exchange_from_ao - ! print*, '**********' - ! do i0 =1, n_virt_orb - ! i = list_virt(i0) - ! print*, mo_two_e_integrals_vv_from_ao(i,i) - ! enddo - ! 
print*, '**********' - - -END_PROVIDER BEGIN_PROVIDER [ double precision, mo_two_e_integrals_jj, (mo_num,mo_num) ] @@ -1362,7 +556,7 @@ END_PROVIDER double precision :: get_two_e_integral - if (do_ao_cholesky) then + if (do_mo_cholesky) then double precision, allocatable :: buffer(:,:) allocate (buffer(cholesky_mo_num,mo_num)) do k=1,cholesky_mo_num diff --git a/src/mo_two_e_ints/no_vvvv.irp.f b/src/mo_two_e_ints/no_vvvv.irp.f deleted file mode 100644 index 48a7f5e2..00000000 --- a/src/mo_two_e_ints/no_vvvv.irp.f +++ /dev/null @@ -1,88 +0,0 @@ - -subroutine four_idx_novvvv_old - use map_module - use bitmasks - implicit none - BEGIN_DOC - ! Retransform MO integrals for next CAS-SCF step - END_DOC - integer(bit_kind) :: mask_ijkl(N_int,4) - integer(bit_kind) :: mask_ijk(N_int,3) - - print*,'Using partial transformation' - print*,'It will not transform all integrals with at least 3 indices within the virtuals' - integer :: i,j,k,l - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! I I I I !!!!!!!!!!!!!!!!!!!! - ! (core+inact+act) ^ 4 - ! - print*, '' - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,3) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,4) = core_inact_act_bitmask_4(i,1) - enddo - call add_integrals_to_map(mask_ijkl) - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! I I V V !!!!!!!!!!!!!!!!!!!! - ! (core+inact+act) ^ 2 (virt) ^2 - ! = J_iv - print*, '' - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = virt_bitmask(i,1) - mask_ijkl(i,3) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,4) = virt_bitmask(i,1) - enddo - call add_integrals_to_map(mask_ijkl) - - ! (core+inact+act) ^ 2 (virt) ^2 - ! = (iv|iv) - print*, '' - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,3) = virt_bitmask(i,1) - mask_ijkl(i,4) = virt_bitmask(i,1) - enddo - call add_integrals_to_map(mask_ijkl) - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! V V V !!!!!!!!!!!!!!!!!!!!!!! -! if(.not.no_vvv_integrals)then - print*, '' - print*, ' and ' - do i = 1,N_int - mask_ijk(i,1) = virt_bitmask(i,1) - mask_ijk(i,2) = virt_bitmask(i,1) - mask_ijk(i,3) = virt_bitmask(i,1) - enddo - call add_integrals_to_map_three_indices(mask_ijk) -! endif - - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! I I I V !!!!!!!!!!!!!!!!!!!! - ! (core+inact+act) ^ 3 (virt) ^1 - ! - print*, '' - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,3) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,4) = virt_bitmask(i,1) - enddo - call add_integrals_to_map(mask_ijkl) - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! I V V V !!!!!!!!!!!!!!!!!!!! - ! (core+inact+act) ^ 1 (virt) ^3 - ! -! 
if(.not.no_ivvv_integrals)then - print*, '' - print*, '' - do i = 1,N_int - mask_ijkl(i,1) = core_inact_act_bitmask_4(i,1) - mask_ijkl(i,2) = virt_bitmask(i,1) - mask_ijkl(i,3) = virt_bitmask(i,1) - mask_ijkl(i,4) = virt_bitmask(i,1) - enddo - call add_integrals_to_map_no_exit_34(mask_ijkl) -end diff --git a/src/mol_properties/multi_s_dipole_moment.irp.f b/src/mol_properties/multi_s_dipole_moment.irp.f index 913ae2f3..8aae3bf4 100644 --- a/src/mol_properties/multi_s_dipole_moment.irp.f +++ b/src/mol_properties/multi_s_dipole_moment.irp.f @@ -18,7 +18,7 @@ -BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] + BEGIN_PROVIDER [double precision, multi_s_dipole_moment , (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment, (N_states, N_states)] &BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment, (N_states, N_states)] @@ -40,27 +40,153 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] ! gamma^{nm}: density matrix \bra{\Psi^n} a^{\dagger}_a a_i \ket{\Psi^m} END_DOC - integer :: istate,jstate ! States - integer :: i,j ! general spatial MOs + integer :: istate, jstate ! States + integer :: i, j ! general spatial MOs double precision :: nuclei_part_x, nuclei_part_y, nuclei_part_z multi_s_x_dipole_moment = 0.d0 multi_s_y_dipole_moment = 0.d0 multi_s_z_dipole_moment = 0.d0 + + if(8.d0*mo_num*mo_num*n_states*n_states*1d-9 .lt. 200.d0) then - do jstate = 1, N_states - do istate = 1, N_states - - do i = 1, mo_num - do j = 1, mo_num - multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i) - multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) - multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) - enddo + do jstate = 1, N_states + do istate = 1, N_states + do i = 1, mo_num + do j = 1, mo_num + multi_s_x_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_x(j,i) + multi_s_y_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_y(j,i) + multi_s_z_dipole_moment(istate,jstate) -= one_e_tr_dm_mo(j,i,istate,jstate) * mo_dipole_z(j,i) + enddo + enddo enddo - enddo - enddo + + else + + ! no enouph memory + ! 
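+    ! The dense branch above needs the full one_e_tr_dm_mo array, roughly
+    ! 8*mo_num**2*N_states**2 bytes; past the ~200 GB threshold tested above, the
+    ! transition density matrix is never built and the dipole contributions are
+    ! accumulated determinant pair by determinant pair, i.e. an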
on the fly scheme + + PROVIDE psi_det_alpha_unique psi_det_beta_unique + + integer :: l, k_a, k_b + integer :: occ(N_int*bit_kind_size,2) + integer :: h1, h2, p1, p2, degree + integer :: exc(0:2,2), n_occ(2) + integer :: krow, kcol, lrow, lcol + integer(bit_kind) :: tmp_det(N_int,2), tmp_det2(N_int) + double precision :: ck, ckl, phase + + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE(j, l, k_a, k_b, istate, jstate, occ, ck, ckl, h1, h2, p1, p2, exc, & + !$OMP phase, degree, n_occ, krow, kcol, lrow, lcol, tmp_det, tmp_det2) & + !$OMP SHARED(N_int, N_states, elec_alpha_num, elec_beta_num, N_det, & + !$OMP psi_bilinear_matrix_rows, psi_bilinear_matrix_columns, & + !$OMP psi_bilinear_matrix_transp_rows, psi_bilinear_matrix_transp_columns, & + !$OMP psi_det_alpha_unique, psi_det_beta_unique, & + !$OMP psi_bilinear_matrix_values, psi_bilinear_matrix_transp_values, & + !$OMP mo_dipole_x, mo_dipole_y, mo_dipole_z, & + !$OMP multi_s_x_dipole_moment, multi_s_y_dipole_moment, multi_s_z_dipole_moment) + !$OMP DO COLLAPSE(2) + do istate = 1, N_states + do jstate = 1, N_states + + do k_a = 1, N_det + krow = psi_bilinear_matrix_rows (k_a) + kcol = psi_bilinear_matrix_columns(k_a) + + tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow) + tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol) + + ! Diagonal part + call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int) + ck = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(k_a,jstate) + do l = 1, elec_alpha_num + j = occ(l,1) + multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) + multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) + multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) + enddo + + if (k_a == N_det) cycle + l = k_a + 1 + lrow = psi_bilinear_matrix_rows (l) + lcol = psi_bilinear_matrix_columns(l) + ! Fix beta determinant, loop over alphas + do while (lcol == kcol) + tmp_det2(:) = psi_det_alpha_unique(:,lrow) + call get_excitation_degree_spin(tmp_det(1,1), tmp_det2, degree, N_int) + if (degree == 1) then + exc = 0 + call get_single_excitation_spin(tmp_det(1,1), tmp_det2, exc, phase, N_int) + call decode_exc_spin(exc, h1, p1, h2, p2) + ckl = psi_bilinear_matrix_values(k_a,istate)*psi_bilinear_matrix_values(l,jstate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) + ckl = psi_bilinear_matrix_values(k_a,jstate)*psi_bilinear_matrix_values(l,istate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) + endif + l = l+1 + if (l > N_det) exit + lrow = psi_bilinear_matrix_rows (l) + lcol = psi_bilinear_matrix_columns(l) + enddo + enddo ! k_a + + do k_b = 1, N_det + krow = psi_bilinear_matrix_transp_rows (k_b) + kcol = psi_bilinear_matrix_transp_columns(k_b) + + tmp_det(1:N_int,1) = psi_det_alpha_unique(1:N_int,krow) + tmp_det(1:N_int,2) = psi_det_beta_unique (1:N_int,kcol) + + ! 
Diagonal part + call bitstring_to_list_ab(tmp_det, occ, n_occ, N_int) + ck = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(k_b,jstate) + do l = 1, elec_beta_num + j = occ(l,2) + multi_s_x_dipole_moment(istate,jstate) -= ck * mo_dipole_x(j,j) + multi_s_y_dipole_moment(istate,jstate) -= ck * mo_dipole_y(j,j) + multi_s_z_dipole_moment(istate,jstate) -= ck * mo_dipole_z(j,j) + enddo + + if (k_b == N_det) cycle + l = k_b+1 + lrow = psi_bilinear_matrix_transp_rows (l) + lcol = psi_bilinear_matrix_transp_columns(l) + ! Fix beta determinant, loop over alphas + do while (lrow == krow) + tmp_det2(:) = psi_det_beta_unique(:,lcol) + call get_excitation_degree_spin(tmp_det(1,2), tmp_det2, degree, N_int) + if (degree == 1) then + exc = 0 + call get_single_excitation_spin(tmp_det(1,2), tmp_det2, exc, phase, N_int) + call decode_exc_spin(exc, h1, p1, h2, p2) + ckl = psi_bilinear_matrix_transp_values(k_b,istate)*psi_bilinear_matrix_transp_values(l,jstate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(h1,p1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(h1,p1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(h1,p1) + ckl = psi_bilinear_matrix_transp_values(k_b,jstate)*psi_bilinear_matrix_transp_values(l,istate) * phase + multi_s_x_dipole_moment(istate,jstate) -= ckl * mo_dipole_x(p1,h1) + multi_s_y_dipole_moment(istate,jstate) -= ckl * mo_dipole_y(p1,h1) + multi_s_z_dipole_moment(istate,jstate) -= ckl * mo_dipole_z(p1,h1) + endif + l = l+1 + if (l > N_det) exit + lrow = psi_bilinear_matrix_transp_rows (l) + lcol = psi_bilinear_matrix_transp_columns(l) + enddo + enddo ! k_b + + enddo ! istate + enddo ! jstate + !$OMP END DO + !$OMP END PARALLEL + + endif ! memory condition ! Nuclei part nuclei_part_x = 0.d0 @@ -91,3 +217,42 @@ BEGIN_PROVIDER [double precision, multi_s_dipole_moment, (N_states, N_states)] enddo END_PROVIDER + +! --- + + BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment_eigenvec, (N_states, N_states)] +&BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment_eigenvec, (N_states, N_states)] +&BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment_eigenvec, (N_states, N_states)] +&BEGIN_PROVIDER [double precision, multi_s_x_dipole_moment_eigenval, (N_states)] +&BEGIN_PROVIDER [double precision, multi_s_y_dipole_moment_eigenval, (N_states)] +&BEGIN_PROVIDER [double precision, multi_s_z_dipole_moment_eigenval, (N_states)] + + implicit none + double precision, allocatable :: eigval(:), eigvec(:,:), A(:,:) + + PROVIDE multi_s_x_dipole_moment multi_s_y_dipole_moment multi_s_z_dipole_moment + + allocate(A(N_states,N_states), eigvec(N_states,N_states), eigval(N_states)) + + A = multi_s_x_dipole_moment + call lapack_diag(eigval(1), eigvec(1,1), A(1,1), N_states, N_states) + multi_s_x_dipole_moment_eigenval = eigval + multi_s_x_dipole_moment_eigenvec = eigvec + + A = multi_s_y_dipole_moment + call lapack_diag(eigval(1), eigvec(1,1), A(1,1), N_states, N_states) + multi_s_y_dipole_moment_eigenval = eigval + multi_s_y_dipole_moment_eigenvec = eigvec + + A = multi_s_z_dipole_moment + call lapack_diag(eigval(1), eigvec(1,1), A(1,1), N_states, N_states) + multi_s_z_dipole_moment_eigenval = eigval + multi_s_z_dipole_moment_eigenvec = eigvec + + deallocate(A, eigvec, eigval) + +END_PROVIDER + +! 
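The dense branch of this provider is only taken when the full transition density matrix one_e_tr_dm_mo, an array of mo_num x mo_num x N_states x N_states double precision numbers, is estimated to fit in memory; the test compares its size in GB against a 200 GB limit, otherwise the determinant-driven path above is used. A minimal standalone sketch of that estimate (the values of mo_num and n_states below are hypothetical):

program check_dm_size
  implicit none
  integer          :: mo_num, n_states
  double precision :: size_gb
  mo_num   = 3000   ! hypothetical number of molecular orbitals
  n_states = 5      ! hypothetical number of states
  ! 8 bytes per element of one_e_tr_dm_mo(mo_num,mo_num,n_states,n_states)
  size_gb = 8.d0*mo_num*mo_num*n_states*n_states*1.d-9
  print *, 'one_e_tr_dm_mo would need (GB):', size_gb    ! ~1.8 GB here
  if (size_gb < 200.d0) then
    print *, 'dense path: contract one_e_tr_dm_mo with mo_dipole_{x,y,z}'
  else
    print *, 'on-the-fly path: loop over pairs of determinants'
  end if
end program check_dm_size

With these numbers the dense contraction is chosen; the determinant-driven loops are the fallback when the density matrix would not fit.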
--- + + diff --git a/src/mol_properties/print_properties.irp.f b/src/mol_properties/print_properties.irp.f index af413a88..c392c148 100644 --- a/src/mol_properties/print_properties.irp.f +++ b/src/mol_properties/print_properties.irp.f @@ -100,7 +100,7 @@ subroutine print_transition_dipole_moment dip_str = d_x**2 + d_y**2 + d_z**2 d = multi_s_dipole_moment(istate,jstate) f = 2d0/3d0 * d * d * dabs(ci_energy_no_diag(istate) - ci_energy_no_diag(jstate)) - write(*,'(I4,I4,A4,I3,6(F12.6))') (istate-1), (jstate-1), ' ->', (istate-1), d_x, d_y, d_z, d, dip_str, f + write(*,'(I4,I4,A4,I3,6(F12.6))') (jstate -1) * (2*N_states-jstate)/2 + istate - jstate, (jstate-1), ' ->', (istate-1), d_x, d_y, d_z, d, dip_str, f enddo enddo @@ -117,7 +117,7 @@ subroutine print_transition_dipole_moment dip_str = d_x**2 + d_y**2 + d_z**2 f = 2d0/3d0 * d * d * dabs(ci_energy_no_diag(istate) - ci_energy_no_diag(jstate)) d = multi_s_dipole_moment(istate,jstate) * au_to_D - write(*,'(I4,I4,A4,I3,6(F12.6))') (istate-1), (jstate-1), ' ->', (istate-1), d_x, d_y, d_z, d, dip_str, f + write(*,'(I4,I4,A4,I3,6(F12.6))') (jstate -1) * (2*N_states-jstate)/2 + istate - jstate, (jstate-1), ' ->', (istate-1), d_x, d_y, d_z, d, dip_str, f enddo enddo print*,'==============================================' @@ -181,10 +181,9 @@ subroutine print_oscillator_strength ! Mixed gauge f_m = 2d0/3d0 * d * v - write(*,'(A19,I3,A9,F10.6,A5,F7.1,A10,F9.6,A6,F9.6,A6,F9.6,A8,F7.3)') ' # Transition n.', (istate-1), ': Excit.=', dabs((ci_energy_no_diag(istate) - ci_energy_no_diag(jstate)))*ha_to_ev, & + write(*,'(A19,I3,A9,F10.6,A5,F7.1,A10,F9.6,A6,F9.6,A6,F9.6,A8,F7.3)') ' # Transition n.', (jstate -1) * (2*N_states-jstate)/2 + istate - jstate, ': Excit.=', dabs((ci_energy_no_diag(istate) - ci_energy_no_diag(jstate)))*ha_to_ev, & ' eV ( ',dabs((ci_energy_no_diag(istate) - ci_energy_no_diag(jstate)))*Ha_to_nm,' nm), f_l=',f_l, ', f_v=', f_v, ', f_m=', f_m, ', =', s2_values(istate) !write(*,'(I4,I4,A4,I3,A6,F6.1,A6,F6.1)') (istate-1), (jstate-1), ' ->', (istate-1), ', %T1=', percent_exc(2,istate), ', %T2=',percent_exc(3,istate) - enddo enddo diff --git a/src/mu_of_r/EZFIO.cfg b/src/mu_of_r/EZFIO.cfg index a66b00ef..7a39b012 100644 --- a/src/mu_of_r/EZFIO.cfg +++ b/src/mu_of_r/EZFIO.cfg @@ -6,9 +6,9 @@ size: (becke_numerical_grid.n_points_final_grid,determinants.n_states) [mu_of_r_potential] type: character*(32) -doc: type of potential for the mu(r) interaction: can be [ hf| cas_full | cas_truncated | pure_act] +doc: type of potential for the mu(r) interaction: can be [ hf| hf_sparse | cas_full | cas_truncated | pure_act] interface: ezfio, provider, ocaml -default: hf +default: hf_sparse [io_mu_of_r] type: Disk_access diff --git a/src/mu_of_r/basis_def.irp.f b/src/mu_of_r/basis_def.irp.f index fff9f581..e433f4d8 100644 --- a/src/mu_of_r/basis_def.irp.f +++ b/src/mu_of_r/basis_def.irp.f @@ -114,3 +114,48 @@ BEGIN_PROVIDER [double precision, basis_mos_in_r_array, (n_basis_orb,n_points_fi enddo enddo END_PROVIDER + +! BEGIN_PROVIDER [integer, n_docc_val_orb_for_cas] +!&BEGIN_PROVIDER [integer, n_max_docc_val_orb_for_cas] +! implicit none +! BEGIN_DOC +! ! Number of DOUBLY OCCUPIED VALENCE ORBITALS for the CAS wave function +! ! +! ! This determines the size of the space \mathcal{A} of Eqs. (15-16) of Phys.Chem.Lett.2019, 10, 2931 2937 +! END_DOC +! integer :: i +! n_docc_val_orb_for_cas = 0 +! ! You browse the BETA ELECTRONS and check if its not a CORE ORBITAL +! do i = 1, elec_beta_num +! if( trim(mo_class(i))=="Inactive" & +! .or. 
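In the print_properties hunk above, the first printed column changes from the state index to a running transition number computed as (jstate-1)*(2*N_states-jstate)/2 + istate - jstate, which enumerates the state pairs jstate < istate consecutively. A small sketch checking that numbering (the loop bounds mirror the surrounding code; the value of n_states is hypothetical):

program transition_index
  implicit none
  integer :: n_states, istate, jstate, idx
  n_states = 4    ! hypothetical number of states
  do jstate = 1, n_states
    do istate = jstate+1, n_states
      ! same expression as in print_transition_dipole_moment
      idx = (jstate-1)*(2*n_states-jstate)/2 + istate - jstate
      print '(A,I3,A,I2,A,I2)', ' transition n.', idx, ' : ', jstate-1, ' ->', istate-1
    end do
  end do
end program transition_index

For n_states = 4 this prints transitions 1 through 6, i.e. n_states*(n_states-1)/2 pairs.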
trim(mo_class(i))=="Active" & +! .or. trim(mo_class(i))=="Virtual" )then +! n_docc_val_orb_for_cas +=1 +! endif +! enddo +! n_max_docc_val_orb_for_cas = maxval(n_docc_val_orb_for_cas) +! +!END_PROVIDER +! +!BEGIN_PROVIDER [integer, list_doc_valence_orb_for_cas, (n_max_docc_val_orb_for_cas)] +! implicit none +! BEGIN_DOC +! ! List of OCCUPIED valence orbitals for each spin to build the f_{HF}(r_1,r_2) function +! ! +! ! This corresponds to ALL OCCUPIED orbitals in the HF wave function, except those defined as "core" +! ! +! ! This determines the space \mathcal{A} of Eqs. (15-16) of Phys.Chem.Lett.2019, 10, 2931 2937 +! END_DOC +! j = 0 +! ! You browse the BETA ELECTRONS and check if its not a CORE ORBITAL +! do i = 1, elec_beta_num +! if( trim(mo_class(i))=="Inactive" & +! .or. trim(mo_class(i))=="Active" & +! .or. trim(mo_class(i))=="Virtual" )then +! j +=1 +! list_doc_valence_orb_for_cas(j) = i +! endif +! enddo +! +!END_PROVIDER + diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f new file mode 100644 index 00000000..179b80dd --- /dev/null +++ b/src/mu_of_r/f_hf_cholesky.irp.f @@ -0,0 +1,420 @@ +BEGIN_PROVIDER [integer, list_couple_hf_orb_r1, (2,n_couple_orb_r1)] + implicit none + integer :: ii,i,mm,m,itmp + itmp = 0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + itmp += 1 + list_couple_hf_orb_r1(1,itmp) = i + list_couple_hf_orb_r1(2,itmp) = m + enddo + enddo +END_PROVIDER + + +BEGIN_PROVIDER [integer, list_couple_hf_orb_r2, (2,n_couple_orb_r2)] + implicit none + integer :: ii,i,mm,m,itmp + itmp = 0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + itmp += 1 + list_couple_hf_orb_r2(1,itmp) = i + list_couple_hf_orb_r2(2,itmp) = m + enddo + enddo +END_PROVIDER + + +BEGIN_PROVIDER [integer, n_couple_orb_r1] + implicit none + BEGIN_DOC + ! number of couples of alpha occupied times any basis orbital + END_DOC + n_couple_orb_r1 = n_occ_val_orb_for_hf(1) * n_basis_orb +END_PROVIDER + +BEGIN_PROVIDER [integer, n_couple_orb_r2] + implicit none + BEGIN_DOC + ! number of couples of beta occupied times any basis orbital + END_DOC + n_couple_orb_r2 = n_occ_val_orb_for_hf(2) * n_basis_orb +END_PROVIDER + +BEGIN_PROVIDER [ double precision, mos_times_cholesky_r1, (cholesky_mo_num,n_points_final_grid)] + implicit none + BEGIN_DOC + ! V1_AR = \sum_{I}V_AI Phi_IR where "R" specifies the index of the grid point and A the number of cholesky point + ! + ! here Phi_IR is phi_i(R)xphi_b(R) for r1 and V_AI = (ib|A) chollesky vector + END_DOC + double precision, allocatable :: mos_ib_r1(:,:),mo_chol_r1(:,:) + double precision, allocatable :: test(:,:) + double precision :: mo_i_r1,mo_b_r1 + integer :: ii,i,mm,m,itmp,ipoint,ll + allocate(mos_ib_r1(n_couple_orb_r1,n_points_final_grid)) + allocate(mo_chol_r1(cholesky_mo_num,n_couple_orb_r1)) + + do ipoint = 1, n_points_final_grid + itmp = 0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + itmp += 1 + mos_ib_r1(itmp,ipoint) = mo_i_r1 * mo_b_r1 + enddo + enddo + enddo + + itmp = 0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + do mm = 1, n_basis_orb ! 
electron 1 + m = list_basis(mm) + itmp += 1 + do ll = 1, cholesky_mo_num + mo_chol_r1(ll,itmp) = cholesky_mo_transp(ll,m,i) + enddo + enddo + enddo + + call get_AB_prod(mo_chol_r1,cholesky_mo_num,n_couple_orb_r1,mos_ib_r1,n_points_final_grid,mos_times_cholesky_r1) + + +END_PROVIDER + +BEGIN_PROVIDER [ double precision, mos_times_cholesky_r2, (cholesky_mo_num,n_points_final_grid)] + implicit none + BEGIN_DOC + ! V1_AR = \sum_{I}V_AI Phi_IR where "R" specifies the index of the grid point and A the number of cholesky point + ! + ! here Phi_IR is phi_i(R)xphi_b(R) for r2 and V_AI = (ib|A) chollesky vector + END_DOC + double precision, allocatable :: mos_ib_r2(:,:),mo_chol_r2(:,:) + double precision, allocatable :: test(:,:) + double precision :: mo_i_r2,mo_b_r2 + integer :: ii,i,mm,m,itmp,ipoint,ll + allocate(mos_ib_r2(n_couple_orb_r2,n_points_final_grid)) + allocate(mo_chol_r2(cholesky_mo_num,n_couple_orb_r2)) + + do ipoint = 1, n_points_final_grid + itmp = 0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + mo_i_r2 = mos_in_r_array_omp(i,ipoint) + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r2 = mos_in_r_array_omp(m,ipoint) + itmp += 1 + mos_ib_r2(itmp,ipoint) = mo_i_r2 * mo_b_r2 + enddo + enddo + enddo + + itmp = 0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + itmp += 1 + do ll = 1, cholesky_mo_num + mo_chol_r2(ll,itmp) = cholesky_mo_transp(ll,m,i) + enddo + enddo + enddo + + call get_AB_prod(mo_chol_r2,cholesky_mo_num,n_couple_orb_r2,mos_ib_r2,n_points_final_grid,mos_times_cholesky_r2) + +END_PROVIDER + + +BEGIN_PROVIDER [ double precision, f_hf_cholesky, (n_points_final_grid)] + implicit none + integer :: ipoint,m,k + !!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ + !! = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ + !! = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R) + !! = \sum_A V_AR G_AR + !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI + double precision :: u_dot_v,wall0,wall1 + if(elec_alpha_num == elec_beta_num)then + print*,'providing f_hf_cholesky ...' + call wall_time(wall0) + provide mos_times_cholesky_r1 + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,m) & + !$OMP ShARED (mos_times_cholesky_r1,cholesky_mo_num,f_hf_cholesky,n_points_final_grid) + do ipoint = 1, n_points_final_grid + f_hf_cholesky(ipoint) = 0.d0 + do m = 1, cholesky_mo_num + f_hf_cholesky(ipoint) = f_hf_cholesky(ipoint) + & + mos_times_cholesky_r1(m,ipoint) * mos_times_cholesky_r1(m,ipoint) + enddo + f_hf_cholesky(ipoint) *= 2.D0 + enddo + !$OMP END PARALLEL DO + + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky = ',wall1-wall0 + free mos_times_cholesky_r1 + else + print*,'providing f_hf_cholesky ...' 
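mos_times_cholesky_r1 and mos_times_cholesky_r2 contract the Cholesky vectors V(A,im) with the orbital products Phi(im,R) = phi_i(r_R)*phi_m(r_R) evaluated on the grid; get_AB_prod is used here as an ordinary matrix-matrix product, so the same contraction can be expressed with a standard BLAS dgemm. A sketch with hypothetical dimensions (it assumes only a BLAS library at link time, not the actual QP arrays):

program chol_times_mos
  implicit none
  integer, parameter :: n_chol = 10, n_pairs = 6, n_grid = 4
  double precision :: V(n_chol,n_pairs)    ! Cholesky vectors (A, im)
  double precision :: Phi(n_pairs,n_grid)  ! orbital-pair values on the grid (im, R)
  double precision :: W(n_chol,n_grid)     ! W(A,R) = sum_im V(A,im) * Phi(im,R)
  call random_number(V)
  call random_number(Phi)
  ! plays the role of get_AB_prod(V, n_chol, n_pairs, Phi, n_grid, W)
  call dgemm('N', 'N', n_chol, n_grid, n_pairs, 1.d0, V, n_chol, Phi, n_pairs, 0.d0, W, n_chol)
  print *, 'W(1,1) =', W(1,1)
end program chol_times_mos

The resulting W is what f_hf_cholesky then squares and sums over the Cholesky index at each grid point.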
+ call wall_time(wall0) + provide mos_times_cholesky_r2 mos_times_cholesky_r1 + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,m) & + !$OMP ShARED (mos_times_cholesky_r2,mos_times_cholesky_r1,cholesky_mo_num,f_hf_cholesky,n_points_final_grid) + do ipoint = 1, n_points_final_grid + f_hf_cholesky(ipoint) = 0.D0 + do m = 1, cholesky_mo_num + f_hf_cholesky(ipoint) = f_hf_cholesky(ipoint) + & + mos_times_cholesky_r2(m,ipoint)*mos_times_cholesky_r1(m,ipoint) + enddo + f_hf_cholesky(ipoint) *= 2.D0 + enddo + !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky = ',wall1-wall0 + free mos_times_cholesky_r2 mos_times_cholesky_r1 + endif +END_PROVIDER + +BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)] + implicit none + integer :: ipoint,m,mm,i,ii,p + !!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ + !! = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ + !! = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R) + !! = \sum_A V_AR G_AR + !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI + double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1 + double precision :: thresh_1,thresh_2 + double precision, allocatable :: accu_vec(:),delta_vec(:) + thresh_2 = ao_cholesky_threshold * 100.d0 + thresh_1 = dsqrt(thresh_2) + provide cholesky_mo_transp + if(elec_alpha_num == elec_beta_num)then + call wall_time(wall0) + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & + !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & + !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse,n_points_final_grid,cholesky_mo_transp,n_basis_orb) + allocate(accu_vec(cholesky_mo_num)) + !$OMP DO + do ipoint = 1, n_points_final_grid + f_hf_cholesky_sparse(ipoint) = 0.d0 + accu_vec = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint)*mo_i_r1 + if(dabs(mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + do p = 1, cholesky_mo_num + f_hf_cholesky_sparse(ipoint) = f_hf_cholesky_sparse(ipoint) + accu_vec(p) * accu_vec(p) + enddo + f_hf_cholesky_sparse(ipoint) *= 2.D0 + enddo + !$OMP END DO + deallocate(accu_vec) + !$OMP END PARALLEL + + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky_sparse = ',wall1-wall0 + else + call wall_time(wall0) + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & + !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & + !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse,n_points_final_grid,cholesky_mo_transp,n_basis_orb) + allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num)) + !$OMP DO + do ipoint = 1, n_points_final_grid + f_hf_cholesky_sparse(ipoint) = 0.d0 + accu_vec = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! 
electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + delta_vec = 0.d0 + do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + do p = 1, cholesky_mo_num + f_hf_cholesky_sparse(ipoint) = f_hf_cholesky_sparse(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p) + enddo + f_hf_cholesky_sparse(ipoint) *= 2.D0 + enddo + !$OMP END DO + deallocate(accu_vec) + !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky_sparse = ',wall1-wall0 + endif +END_PROVIDER + +BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)] + implicit none + integer :: ipoint,m,mm,i,ii,p + !!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ + !! = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ + !! = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R) + !! = \sum_A V_AR G_AR + !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI + double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1 + double precision :: thresh_1,thresh_2 + double precision, allocatable :: accu_vec(:),delta_vec(:) + thresh_2 = ao_cholesky_threshold * 100.d0 + thresh_1 = dsqrt(thresh_2) + provide cholesky_mo_transp + if(elec_alpha_num == elec_beta_num)then + call wall_time(wall0) + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & + !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) & + !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num) + allocate(accu_vec(cholesky_mo_num)) + !$OMP DO + do ipoint = 1, n_points_final_grid + f_hf_cholesky_sparse_bis(ipoint) = 0.d0 + accu_vec = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, ao_num ! 
electron 1 + mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1 + if(dabs(mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i) + enddo + enddo + enddo + do p = 1, cholesky_mo_num + f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + enddo + f_hf_cholesky_sparse_bis(ipoint) *= 2.D0 + enddo + !$OMP END DO + deallocate(accu_vec) + !$OMP END PARALLEL + + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0 + else + call wall_time(wall0) + !$OMP PARALLEL DEFAULT(NONE) & + !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) & + !$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) & + !$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb) + allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num)) + !$OMP DO + do ipoint = 1, n_points_final_grid + f_hf_cholesky_sparse_bis(ipoint) = 0.d0 + accu_vec = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + delta_vec = 0.d0 + do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + mo_i_r1 = mos_in_r_array_omp(i,ipoint) + if(dabs(mo_i_r1).lt.thresh_1)cycle + do mm = 1, n_basis_orb ! electron 1 + m = list_basis(mm) + mo_b_r1 = mos_in_r_array_omp(m,ipoint) + if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle + do p = 1, cholesky_mo_num + delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i) + enddo + enddo + enddo + do p = 1, cholesky_mo_num + f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p) + enddo + f_hf_cholesky_sparse_bis(ipoint) *= 2.D0 + enddo + !$OMP END DO + deallocate(accu_vec) + !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0 + endif +END_PROVIDER + + +BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)] + implicit none + integer :: ipoint,i,ii + double precision :: dm_a, dm_b,wall0,wall1 + print*,'providing on_top_hf_grid ...' 
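The sparse variants skip negligible contributions with two cutoffs derived from ao_cholesky_threshold: thresh_2 applies to the product |phi_i(r)*phi_m(r)| and thresh_1 = sqrt(thresh_2) to |phi_i(r)| alone, so a whole inner loop can be skipped when the outer orbital is already tiny at that grid point. A small sketch of how the cutoffs are built and used (the threshold value and orbital amplitudes are hypothetical):

program screening_demo
  implicit none
  double precision :: ao_cholesky_threshold, thresh_1, thresh_2
  double precision :: mo_i_r1, mo_b_r1
  ao_cholesky_threshold = 1.d-12              ! hypothetical input threshold
  thresh_2 = ao_cholesky_threshold * 100.d0   ! cutoff on |phi_i(r)*phi_m(r)|
  thresh_1 = dsqrt(thresh_2)                  ! cutoff on |phi_i(r)| alone
  print *, 'thresh_1 =', thresh_1, ' thresh_2 =', thresh_2   ! 1.d-5 and 1.d-10
  mo_i_r1 = 2.d-6    ! hypothetical |phi_i| at the grid point
  mo_b_r1 = 3.d-3    ! hypothetical |phi_m| at the grid point
  if (dabs(mo_i_r1) < thresh_1) then
    print *, 'orbital i negligible here: the whole m loop is skipped'
  else if (dabs(mo_i_r1*mo_b_r1) < thresh_2) then
    print *, 'pair (i,m) negligible: only this pair is skipped'
  else
    print *, 'pair (i,m) kept in the Cholesky accumulation'
  end if
end program screening_demo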
+ provide mos_in_r_array_omp + call wall_time(wall0) + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,dm_a,dm_b,ii,i) & + !$OMP ShARED (n_points_final_grid,n_occ_val_orb_for_hf,mos_in_r_array_omp,list_valence_orb_for_hf,on_top_hf_grid) + do ipoint = 1, n_points_final_grid + dm_a = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(1) + i = list_valence_orb_for_hf(ii,1) + dm_a += mos_in_r_array_omp(i,ipoint)*mos_in_r_array_omp(i,ipoint) + enddo + dm_b = 0.d0 + do ii = 1, n_occ_val_orb_for_hf(2) + i = list_valence_orb_for_hf(ii,2) + dm_b += mos_in_r_array_omp(i,ipoint)*mos_in_r_array_omp(i,ipoint) + enddo + on_top_hf_grid(ipoint) = 2.D0 * dm_a*dm_b + enddo + !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide on_top_hf_grid = ',wall1-wall0 +END_PROVIDER + diff --git a/src/mu_of_r/mu_of_r_conditions.irp.f b/src/mu_of_r/mu_of_r_conditions.irp.f index 6b49b9df..f2bb7145 100644 --- a/src/mu_of_r/mu_of_r_conditions.irp.f +++ b/src/mu_of_r/mu_of_r_conditions.irp.f @@ -13,7 +13,6 @@ integer :: ipoint,istate double precision :: wall0,wall1 print*,'providing mu_of_r ...' -! PROVIDE mo_two_e_integrals_in_map mo_integrals_map big_array_exchange_integrals call wall_time(wall0) if (read_mu_of_r) then @@ -26,6 +25,10 @@ do ipoint = 1, n_points_final_grid if(mu_of_r_potential.EQ."hf")then mu_of_r_prov(ipoint,istate) = mu_of_r_hf(ipoint) + else if(mu_of_r_potential.EQ."hf_old")then + mu_of_r_prov(ipoint,istate) = mu_of_r_hf_old(ipoint) + else if(mu_of_r_potential.EQ."hf_sparse")then + mu_of_r_prov(ipoint,istate) = mu_of_r_hf_sparse(ipoint) else if(mu_of_r_potential.EQ."cas_full".or.mu_of_r_potential.EQ."cas_truncated".or.mu_of_r_potential.EQ."pure_act")then mu_of_r_prov(ipoint,istate) = mu_of_r_psi_cas(ipoint,istate) else @@ -61,18 +64,17 @@ END_DOC integer :: ipoint double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi - PROVIDE mo_two_e_integrals_in_map mo_integrals_map big_array_exchange_integrals print*,'providing mu_of_r_hf ...' call wall_time(wall0) + PROVIDE f_hf_cholesky on_top_hf_grid sqpi = dsqrt(dacos(-1.d0)) - provide f_psi_hf_ab !$OMP PARALLEL DO & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & - !$OMP ShARED (n_points_final_grid,mu_of_r_hf,f_psi_hf_ab,on_top_hf_mu_r,sqpi) + !$OMP ShARED (n_points_final_grid,mu_of_r_hf,f_hf_cholesky,on_top_hf_grid,sqpi) do ipoint = 1, n_points_final_grid - f_hf = f_psi_hf_ab(ipoint) - on_top = on_top_hf_mu_r(ipoint) + f_hf = f_hf_cholesky(ipoint) + on_top = on_top_hf_grid(ipoint) if(on_top.le.1.d-12.or.f_hf.le.0.d0.or.f_hf * on_top.lt.0.d0)then w_hf = 1.d+10 else @@ -85,6 +87,80 @@ print*,'Time to provide mu_of_r_hf = ',wall1-wall0 END_PROVIDER + BEGIN_PROVIDER [double precision, mu_of_r_hf_sparse, (n_points_final_grid) ] + implicit none + BEGIN_DOC + ! mu(r) computed with a HF wave function (assumes that HF MOs are stored in the EZFIO) + ! + ! corresponds to Eq. (37) of J. Chem. Phys. 149, 194301 (2018) but for \Psi^B = HF^B + ! + ! !!!!!! WARNING !!!!!! if no_core_density == .True. then all contributions from the core orbitals + ! + ! in the two-body density matrix are excluded + END_DOC + integer :: ipoint + double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi + print*,'providing mu_of_r_hf_sparse ...' 
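With f(r) (f_hf_cholesky or its sparse variant) and the HF on-top pair density on_top_hf_grid(r) = 2*n_alpha(r)*n_beta(r) in hand, the local range-separation parameter is obtained pointwise as mu(r) = (sqrt(pi)/2) * f(r)/n2(r), with a very large value substituted when the ratio is ill-defined. A minimal sketch of that post-processing step for a single grid point (the two input numbers are hypothetical):

program mu_from_f_and_ontop
  implicit none
  double precision :: f_hf, on_top, w_hf, mu_r, sqpi
  sqpi   = dsqrt(dacos(-1.d0))
  f_hf   = 0.8d0     ! hypothetical f(r) at one grid point
  on_top = 0.25d0    ! hypothetical on-top pair density 2*n_a(r)*n_b(r)
  if (on_top <= 1.d-12 .or. f_hf <= 0.d0 .or. f_hf*on_top < 0.d0) then
    w_hf = 1.d+10    ! ill-defined ratio: push mu(r) to a huge value
  else
    w_hf = f_hf / on_top
  end if
  mu_r = w_hf * sqpi * 0.5d0
  print *, 'mu(r) =', mu_r    ! about 2.84 for these numbers
end program mu_from_f_and_ontop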
+ call wall_time(wall0) + sqpi = dsqrt(dacos(-1.d0)) + PROVIDE f_hf_cholesky_sparse on_top_hf_grid + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & + !$OMP ShARED (n_points_final_grid,mu_of_r_hf_sparse,f_hf_cholesky_sparse,on_top_hf_grid,sqpi) + do ipoint = 1, n_points_final_grid + f_hf = f_hf_cholesky_sparse(ipoint) + on_top = on_top_hf_grid(ipoint) + if(on_top.le.1.d-12.or.f_hf.le.0.d0.or.f_hf * on_top.lt.0.d0)then + w_hf = 1.d+10 + else + w_hf = f_hf / on_top + endif + mu_of_r_hf_sparse(ipoint) = w_hf * sqpi * 0.5d0 + enddo + !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide mu_of_r_hf_sparse = ',wall1-wall0 + END_PROVIDER + + BEGIN_PROVIDER [double precision, mu_of_r_hf_old, (n_points_final_grid) ] + implicit none + BEGIN_DOC + ! mu(r) computed with a HF wave function (assumes that HF MOs are stored in the EZFIO) + ! + ! corresponds to Eq. (37) of J. Chem. Phys. 149, 194301 (2018) but for \Psi^B = HF^B + ! + ! !!!!!! WARNING !!!!!! if no_core_density == .True. then all contributions from the core orbitals + ! + ! in the two-body density matrix are excluded + END_DOC + integer :: ipoint + double precision :: wall0,wall1,f_hf,on_top,w_hf,sqpi + PROVIDE mo_two_e_integrals_in_map mo_integrals_map big_array_exchange_integrals + print*,'providing mu_of_r_hf_old ...' + call wall_time(wall0) + sqpi = dsqrt(dacos(-1.d0)) + provide f_psi_hf_ab + !$OMP PARALLEL DO & + !$OMP DEFAULT (NONE) & + !$OMP PRIVATE (ipoint,f_hf,on_top,w_hf) & + !$OMP ShARED (n_points_final_grid,mu_of_r_hf_old,f_psi_hf_ab,on_top_hf_mu_r,sqpi) + do ipoint = 1, n_points_final_grid + f_hf = f_psi_hf_ab(ipoint) + on_top = on_top_hf_mu_r(ipoint) + if(on_top.le.1.d-12.or.f_hf.le.0.d0.or.f_hf * on_top.lt.0.d0)then + w_hf = 1.d+10 + else + w_hf = f_hf / on_top + endif + mu_of_r_hf_old(ipoint) = w_hf * sqpi * 0.5d0 + enddo + !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide mu_of_r_hf_old = ',wall1-wall0 + END_PROVIDER + + BEGIN_PROVIDER [double precision, mu_of_r_psi_cas, (n_points_final_grid,N_states) ] implicit none BEGIN_DOC diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f index f9aba094..94052b18 100644 --- a/src/mu_of_r/test_proj_op.irp.f +++ b/src/mu_of_r/test_proj_op.irp.f @@ -12,10 +12,27 @@ program projected_operators mu_of_r_potential = "cas_full" touch mu_of_r_potential print*,'Using Valence Only functions' -! call test_f_HF_valence_ab -! call routine_full_mos -! 
call test_f_ii_valence_ab - call test_f_ia_valence_ab + call test_f_HF_valence_ab + call routine_full_mos + call test_f_ii_valence_ab + call test_f_ia_valence_ab call test_f_ii_ia_aa_valence_ab + call test +end + + +subroutine test + implicit none + integer :: i_point + double precision :: ref, new, accu, weight + accu = 0.d0 + do i_point = 1, n_points_final_grid + ref = f_hf_cholesky_sparse(i_point) + new = f_hf_cholesky_sparse_bis(i_point) + weight = final_weight_at_r_vector(i_point) + accu += dabs(ref - new) * weight + enddo + print*,'accu = ',accu + end diff --git a/src/scf_utils/roothaan_hall_scf.irp.f b/src/scf_utils/roothaan_hall_scf.irp.f index 3f5c8549..e0fe5319 100644 --- a/src/scf_utils/roothaan_hall_scf.irp.f +++ b/src/scf_utils/roothaan_hall_scf.irp.f @@ -217,7 +217,7 @@ END_DOC do while (i -void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) +void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only, int single_node) { int fd; int result; @@ -21,7 +21,7 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) perror("Error opening mmap file for reading"); exit(EXIT_FAILURE); } - map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); + map = mmap(NULL, bytes, PROT_READ, MAP_PRIVATE, fd, 0); } else { @@ -39,8 +39,8 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) perror("Error calling lseek() to stretch the file"); exit(EXIT_FAILURE); } - - result = write(fd, "", 1); + + result = write(fd, " ", 1); if (result != 1) { close(fd); printf("%s:\n", filename); @@ -48,7 +48,17 @@ void* mmap_fortran(char* filename, size_t bytes, int* file_descr, int read_only) exit(EXIT_FAILURE); } - map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (single_node == 1) { + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); +/* + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE | MAP_NONBLOCK | MAP_NORESERVE, fd, 0); + if (map == MAP_FAILED) { + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + } +*/ + } else { + map = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + } } if (map == MAP_FAILED) { diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 20386b30..4e7ca87d 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U) ! ! matrix A is destroyed inside this subroutine ! Cholesky vectors are stored in U -! dimension of U: U(1:rank, 1:n) +! dimension of U: U(1:n, 1:rank) ! U is allocated inside this subroutine ! rank is the number of Cholesky vectors depending on tol ! 
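The C helper mmap_fortran gains a single_node flag, and the Fortran wrapper in src/utils/mmap.f90 (see the hunk further below) adds a matching logical argument, which is why every call site in map_functions.irp.f now passes one extra .False.. A hedged usage sketch of the updated wrapper (the file name and size are hypothetical, it assumes the code is compiled together with mmap.f90 and mmap.c, and unmapping is omitted):

program mmap_usage
  use mmap_module
  use iso_c_binding
  implicit none
  integer                   :: fd
  type(c_ptr)               :: p
  double precision, pointer :: buf(:)
  integer*8                 :: shp(1)
  shp(1) = 1000_8
  ! filename, shape, bytes per element, fd (out), read_only, single_node, C pointer (out)
  call mmap('/tmp/qp_mmap_example', shp, 8, fd, .False., .True., p)
  call c_f_pointer(p, buf, (/ shp(1) /))
  buf(1) = 1.d0
  print *, 'first mapped element:', buf(1)
end program mmap_usage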
diff --git a/src/utils/map_functions.irp.f b/src/utils/map_functions.irp.f
index 97d0e8bf..e3a62b07 100644
--- a/src/utils/map_functions.irp.f
+++ b/src/utils/map_functions.irp.f
@@ -21,13 +21,13 @@ subroutine map_save_to_disk(filename,map)
     stop 'map already consolidated'
   endif
 
-  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .False., c_pointer(1))
+  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .False., .False., c_pointer(1))
   call c_f_pointer(c_pointer(1),map % consolidated_idx, (/ map % map_size +2_8/))
 
-  call mmap(trim(filename)//'_consolidated_key', (/ n_elements /), cache_key_kind, fd(2), .False., c_pointer(2))
+  call mmap(trim(filename)//'_consolidated_key', (/ n_elements /), cache_key_kind, fd(2), .False., .False., c_pointer(2))
   call c_f_pointer(c_pointer(2),map % consolidated_key, (/ n_elements /))
 
-  call mmap(trim(filename)//'_consolidated_value', (/ n_elements /), integral_kind, fd(3), .False., c_pointer(3))
+  call mmap(trim(filename)//'_consolidated_value', (/ n_elements /), integral_kind, fd(3), .False., .False., c_pointer(3))
   call c_f_pointer(c_pointer(3),map % consolidated_value, (/ n_elements /))
 
   if (.not.associated(map%consolidated_key)) then
@@ -85,15 +85,15 @@ subroutine map_load_from_disk(filename,map)
     stop 'map already consolidated'
   endif
 
-  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .True., c_pointer(1))
+  call mmap(trim(filename)//'_consolidated_idx', (/ map % map_size + 2_8 /), 8, fd(1), .True., .False., c_pointer(1))
   call c_f_pointer(c_pointer(1),map % consolidated_idx, (/ map % map_size + 2_8/))
 
   map% n_elements = map % consolidated_idx (map % map_size+2_8)-1_8
 
-  call mmap(trim(filename)//'_consolidated_key', (/ map % n_elements /), cache_key_kind, fd(2), .True., c_pointer(2))
+  call mmap(trim(filename)//'_consolidated_key', (/ map % n_elements /), cache_key_kind, fd(2), .True., .False., c_pointer(2))
   call c_f_pointer(c_pointer(2),map % consolidated_key, (/ map % n_elements /))
 
-  call mmap(trim(filename)//'_consolidated_value', (/ map % n_elements /), integral_kind, fd(3), .True., c_pointer(3))
+  call mmap(trim(filename)//'_consolidated_value', (/ map % n_elements /), integral_kind, fd(3), .True., .False., c_pointer(3))
   call c_f_pointer(c_pointer(3),map % consolidated_value, (/ map % n_elements /))
 
   l = 0_8
diff --git a/src/utils/memory.irp.f b/src/utils/memory.irp.f
index e69bf71e..e2e8dd76 100644
--- a/src/utils/memory.irp.f
+++ b/src/utils/memory.irp.f
@@ -6,7 +6,7 @@ BEGIN_PROVIDER [ integer, qp_max_mem ]
   character*(128)    :: env
   integer, external  :: get_total_available_memory
 
-  qp_max_mem = get_total_available_memory()
+  qp_max_mem = max(get_total_available_memory() - 1,3)
   call write_int(6,qp_max_mem,'Total available memory (GB)')
   call getenv('QP_MAXMEM',env)
   if (trim(env) /= '') then
@@ -79,6 +79,26 @@ IRP_ENDIF
   call unlock_io()
 end function
 
+double precision function memory_of_double8(n)
+  implicit none
+  BEGIN_DOC
+! Computes the memory required for n double precision elements in gigabytes.
+  END_DOC
+  integer*8, intent(in) :: n
+  double precision, parameter :: f = 8.d0 / (1024.d0*1024.d0*1024.d0)
+  memory_of_double8 = dble(n) * f
+end function
+
+double precision function memory_of_int8(n)
+  implicit none
+  BEGIN_DOC
+! Computes the memory required for n integer (4-byte) elements in gigabytes.
+ END_DOC + integer*8, intent(in) :: n + double precision, parameter :: f = 4.d0 / (1024.d0*1024.d0*1024.d0) + memory_of_int8 = dble(n) * f +end function + double precision function memory_of_double(n) implicit none BEGIN_DOC diff --git a/src/utils/mmap.f90 b/src/utils/mmap.f90 index 41e60224..e342b422 100644 --- a/src/utils/mmap.f90 +++ b/src/utils/mmap.f90 @@ -7,12 +7,13 @@ module mmap_module ! File descriptors ! ---------------- - type(c_ptr) function c_mmap_fortran(filename, length, fd, read_only) bind(c,name='mmap_fortran') + type(c_ptr) function c_mmap_fortran(filename, length, fd, read_only, single_node) bind(c,name='mmap_fortran') use iso_c_binding character(c_char), intent(in) :: filename(*) integer(c_size_t), intent(in), value :: length integer(c_int), intent(out) :: fd integer(c_int), intent(in), value :: read_only + integer(c_int), intent(in), value :: single_node end function subroutine c_munmap_fortran(length, fd, map) bind(c,name='munmap_fortran') @@ -33,31 +34,33 @@ module mmap_module contains - subroutine mmap(filename, shape, bytes, fd, read_only, map) + subroutine mmap(filename, shape, bytes, fd, read_only, single_node, map) use iso_c_binding implicit none character*(*), intent(in) :: filename ! Name of the mapped file integer*8, intent(in) :: shape(:) ! Shape of the array to map integer, intent(in) :: bytes ! Number of bytes per element logical, intent(in) :: read_only ! If true, mmap is read-only + logical, intent(in) :: single_node! If true, mmap is on a single node integer, intent(out) :: fd ! File descriptor type(c_ptr), intent(out) :: map ! C Pointer integer(c_size_t) :: length - integer(c_int) :: fd_ + integer(c_int) :: fd_, read_only_, single_node_ integer :: i + read_only_ = 0 + single_node_ = 0 + if (read_only) read_only_ = 1 + if (single_node) single_node_ = 1 + length = int(bytes,8) do i=1,size(shape) length = length * shape(i) enddo - if (read_only) then - map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 1) - else - map = c_mmap_fortran( trim(filename)//char(0), length, fd_, 0) - endif + map = c_mmap_fortran( trim(filename)//char(0), length, fd_, read_only_, single_node_) fd = fd_ end subroutine diff --git a/src/utils/util.irp.f b/src/utils/util.irp.f index de01656b..29ec8ed4 100644 --- a/src/utils/util.irp.f +++ b/src/utils/util.irp.f @@ -576,7 +576,7 @@ logical function is_same_spin(sigma_1, sigma_2) is_same_spin = .false. endif -end function is_same_spin +end ! --- @@ -596,7 +596,7 @@ function Kronecker_delta(i, j) result(delta) delta = 0.d0 endif -end function Kronecker_delta +end ! --- @@ -634,7 +634,81 @@ subroutine diagonalize_sym_matrix(N, A, e) print*,'Problem in diagonalize_sym_matrix (dsyev)!!' endif -end subroutine diagonalize_sym_matrix +end + +! --- + + +subroutine give_degen(A, n, shift, list_degen, n_degen_list) + + BEGIN_DOC + ! returns n_degen_list :: the number of degenerated SET of elements (i.e. with |A(i)-A(i+1)| below shift) + ! + ! for each of these sets, list_degen(1,i) = first degenerate element of the set i, + ! + ! list_degen(2,i) = last degenerate element of the set i. + END_DOC + + implicit none + + double precision, intent(in) :: A(n) + double precision, intent(in) :: shift + integer, intent(in) :: n + integer, intent(out) :: list_degen(2,n), n_degen_list + + integer :: i, j, n_degen, k + logical :: keep_on + double precision, allocatable :: Aw(:) + + list_degen = -1 + allocate(Aw(n)) + Aw = A + i=1 + k = 0 + do while(i.lt.n) + if(dabs(Aw(i)-Aw(i+1)).lt.shift)then + k+=1 + j=1 + list_degen(1,k) = i + keep_on = .True. 
+ do while(keep_on) + if(i+j.gt.n)then + keep_on = .False. + exit + endif + if(dabs(Aw(i)-Aw(i+j)).lt.shift)then + j+=1 + else + keep_on=.False. + exit + endif + enddo + n_degen = j + list_degen(2,k) = list_degen(1,k)-1 + n_degen + j=0 + keep_on = .True. + do while(keep_on) + if(i+j+1.gt.n)then + keep_on = .False. + exit + endif + if(dabs(Aw(i+j)-Aw(i+j+1)).lt.shift)then + Aw(i+j) += (j-n_degen/2) * shift + j+=1 + else + keep_on = .False. + exit + endif + enddo + Aw(i+n_degen-1) += (n_degen-1-n_degen/2) * shift + i+=n_degen + else + i+=1 + endif + enddo + n_degen_list = k + +end ! --- diff --git a/src/utils_cc/EZFIO.cfg b/src/utils_cc/EZFIO.cfg index fb6d9034..7d50d66a 100644 --- a/src/utils_cc/EZFIO.cfg +++ b/src/utils_cc/EZFIO.cfg @@ -58,6 +58,12 @@ doc: If true, the CCSD(T) will be computed. interface: ezfio,ocaml,provider default: False +[cc_par_t_stop] +type: double precision +doc: Stops the calculation when the statistical error bar is below the given value. +interface: ezfio,ocaml,provider +default: 1.e-5 + [cc_dev] type: logical doc: Only for dev purposes.
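give_degen groups consecutive entries of a (sorted) array whose spread stays below shift; isolated values produce no entry in list_degen. A small driver illustrating the output (the input array is hypothetical; compile it together with the give_degen subroutine defined above):

program degen_demo
  implicit none
  integer, parameter :: n = 6
  double precision   :: A(n)
  integer            :: list_degen(2,n), n_degen_list, k
  A = (/ 1.d0, 1.d0+1.d-7, 2.d0, 3.d0, 3.d0+2.d-7, 3.d0+4.d-7 /)
  call give_degen(A, n, 1.d-5, list_degen, n_degen_list)
  ! Expected: 2 sets, elements 1-2 and 4-6; element 3 is not degenerate
  do k = 1, n_degen_list
    print '(A,I2,A,I3,A,I3)', ' set ', k, ' : from ', list_degen(1,k), ' to ', list_degen(2,k)
  end do
end program degen_demo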