Merge pull request #346 from QuantumPackage/dev-stable

Dev stable
2024-12-22 04:13:55 +01:00 · 2024-09-09 12:51:00 +02:00 · 2024-09-09 12:51:00 +02:00 · 151b1c5e68
commit 151b1c5e68
parent 38aa8ef547 9a284e7fcf
323 changed files with 17904 additions and 13672 deletions
--- a/bin/zcat
+++ b/bin/zcat
@ -1,23 +0,0 @@
-#!/bin/bash
-
-# On Darwin: try gzcat if available, otherwise use Python
-
-if [[ $(uname -s) = Darwin ]] ; then
-   which gzcat &> /dev/null
-   if [[ $? -eq 0 ]] ; then 
-      exec gzcat $@
-   else
-
-      exec python3 << EOF
-import sys
-import gzip
-with gzip.open("$1", "rt") as f:
-    print(f.read())
-EOF
-   fi
-else
-   SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-   command=$(which -a zcat | grep -v "$SCRIPTPATH/" | head -1)
-   exec $command $@
-fi
-
--- a/config/gfortran_debug_mkl.cfg
+++ b/config/gfortran_debug_mkl.cfg
@ -0,0 +1,63 @@
+# Common flags
+##############
+#
+# -ffree-line-length-none : Needed for IRPF90 which produces long lines
+# -lblas -llapack         : Link with libblas and liblapack libraries provided by the system
+# -I .                    : Include the curent directory (Mandatory)
+#
+# --ninja                 : Allow the utilisation of ninja. (Mandatory)
+# --align=32              : Align all provided arrays on a 32-byte boundary
+#
+#
+[COMMON]
+FC           : gfortran -g -ffree-line-length-none -I . -fPIC -std=legacy
+LAPACK_LIB   : -I${MKLROOT}/include -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_gf_lp64 -lmkl_core -lpthread -lm -ldl -lmkl_gnu_thread -lgomp -fopenmp
+IRPF90       : irpf90
+IRPF90_FLAGS : --ninja --align=32 --assert -DSET_NESTED
+
+# Global options
+################
+#
+# 1 : Activate
+# 0 : Deactivate
+#
+[OPTION]
+MODE    : DEBUG      ; [ OPT | PROFILE | DEBUG ] : Chooses the section below
+CACHE   : 0          ; Enable cache_compile.py
+OPENMP  : 1          ; Append OpenMP flags
+
+# Optimization flags
+####################
+#
+# -Ofast                  : Disregard strict standards compliance. Enables all -O3 optimizations.
+#                           It also enables optimizations that are not valid
+#                           for all standard-compliant programs.  It turns on
+#                           -ffast-math and the Fortran-specific
+#                           -fno-protect-parens and -fstack-arrays.
+[OPT]
+FCFLAGS : -Ofast
+
+# Profiling flags
+#################
+#
+[PROFILE]
+FC      : -p -g
+FCFLAGS : -Ofast
+
+# Debugging flags
+#################
+#
+# -fcheck=all  : Checks uninitialized variables,  array subscripts, etc...
+# -g           : Extra debugging information
+#
+[DEBUG]
+#FCFLAGS : -g -msse4.2  -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant -Wuninitialized  -fbacktrace -ffpe-trap=zero,overflow,underflow -finit-real=nan
+FCFLAGS : -g -mavx  -fcheck=all -Waliasing -Wampersand -Wconversion -Wsurprising -Wintrinsics-std -Wno-tabs -Wintrinsic-shadow -Wline-truncation -Wreal-q-constant -Wuninitialized  -fbacktrace -ffpe-trap=zero,overflow -finit-real=nan
+
+# OpenMP flags
+#################
+#
+[OPENMP]
+FC           : -fopenmp
+IRPF90_FLAGS : --openmp
+
--- a/40
+++ b/40
@ -40,14 +40,16 @@ Usage:
  $(basename $0) -c <file>    
  $(basename $0) -h           
  $(basename $0) -i <package> 
+  $(basename $0) -g [nvidia|intel|none]

 Options:
-  -c  <file>    Define a COMPILATION configuration file,
-                in "${QP_ROOT}/config/".
-  -h            Print the HELP message
-  -i <package>  INSTALL <package>. Use at your OWN RISK:
-                no support will be provided for the installation of
-                dependencies.
+  -c  <file>        Define a COMPILATION configuration file,
+                    in "${QP_ROOT}/config/".
+  -h                Print the HELP message
+  -i <package>      INSTALL <package>. Use at your OWN RISK:
+                    no support will be provided for the installation of
+                    dependencies.
+  -g [nvidia|intel|none]  Choose GPU acceleration

 Example:
  ./$(basename $0) -c config/gfortran.cfg
@ -83,7 +85,7 @@ function execute () {
 PACKAGES=""


-while getopts "d:c:i:h" c ; do
+while getopts "d:c:i:g:h" c ; do
    case "$c" in
        c)
            case "$OPTARG" in
@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
                "") help ; break;;
                *) PACKAGES="${PACKAGE} $OPTARG"
            esac;;
+        g)
+            GPU=$OPTARG;
+            break;;
        h)
            help
            exit 0;;
@ -109,6 +114,27 @@ while getopts "d:c:i:h" c ; do
    esac
 done

+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu_arch
+case "$GPU" in
+  amd) # AMD
+     echo "Activating AMD GPU acceleration"
+     ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+     ;;
+  intel) # Intel
+     echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+     ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
+     ;;
+  nvidia) # Nvidia
+     echo "Activating Nvidia GPU acceleration"
+     ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+     ;;
+  *) # No Acceleration
+     echo "Disabling GPU acceleration"
+     ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
+     ;;
+esac
+
 # Trim leading and trailing spaces
 PACKAGES=$(echo $PACKAGES | xargs)

--- a/etc/paths.rc
+++ b/etc/paths.rc
@ -28,6 +28,15 @@ function qp_prepend_export () {
  fi
 }

+function qp_append_export () {
+  eval "value_1="\${$1}""
+  if [[ -z $value_1 ]] ; then
+      echo "${2}:"
+  else
+      echo "${value_1}:${2}"
+  fi
+}
+
 export PYTHONPATH=$(qp_prepend_export "PYTHONPATH" "${QP_EZFIO}/Python":"${QP_PYTHON}")

 export PATH=$(qp_prepend_export "PATH" "${QP_PYTHON}":"${QP_ROOT}"/bin:"${QP_ROOT}"/ocaml)
--- a/external/irpf90
+++ b/external/irpf90
@ -1 +1 @@
-Subproject commit beac615343f421bd6c0571a408ba389a6d5a32ac
+Subproject commit 4ab1b175fc7ed0d96c1912f13dc53579b24157a6
--- a/plugins/local/basis_correction/51.basis_c.bats
+++ b/plugins/local/basis_correction/51.basis_c.bats
@ -37,14 +37,6 @@ function run_sd() {
  eq $energy1 $1 $thresh
 }

-@test "O2 CAS" {  
-  qp set_file o2_cas.gms.ezfio
-  qp set_mo_class -c "[1-2]" -a "[3-10]" -d "[11-46]"
-  run -149.72435425 3.e-4 10000
-  qp set_mo_class -c "[1-2]" -a "[3-10]" -v "[11-46]"
-  run_md -0.1160222327 1.e-6 
-}
-

@test "LiF RHF" {  
  qp set_file lif.ezfio 
--- a/plugins/local/basis_correction/basis_correction.irp.f
+++ b/plugins/local/basis_correction/basis_correction.irp.f
@ -7,10 +7,6 @@ program basis_correction
  touch read_wf
  no_core_density = .True.
  touch no_core_density
-  if(io_mo_two_e_integrals .ne. "Read")then
-   provide ao_two_e_integrals_in_map
-  endif
-  provide mo_two_e_integrals_in_map
  call print_basis_correction
 end

--- a/plugins/local/basis_correction/print_routine.irp.f
+++ b/plugins/local/basis_correction/print_routine.irp.f
@ -22,7 +22,7 @@ subroutine print_basis_correction
 print*, '****************************************'
 print*, '****************************************'
 print*, 'mu_of_r_potential = ',mu_of_r_potential
- if(mu_of_r_potential.EQ."hf")then
+ if(mu_of_r_potential.EQ."hf".or.mu_of_r_potential.EQ."hf_old".or.mu_of_r_potential.EQ."hf_sparse")then
   print*, ''
   print*,'Using a HF-like two-body density to define mu(r)'
   print*,'This assumes that HF is a qualitative representation of the wave function '
--- a/plugins/local/basis_correction/test_chol_bas.irp.f
+++ b/plugins/local/basis_correction/test_chol_bas.irp.f
@ -0,0 +1,18 @@
+program pouet
+ implicit none
+ call test
+end
+subroutine test
+ implicit none
+! provide mos_times_cholesky_r1
+! provide mos_times_cholesky_r2
+ integer :: ipoint
+ double precision :: accu,weight
+ accu = 0.d0
+ do ipoint = 1, n_points_final_grid
+  weight = final_weight_at_r_vector(ipoint)
+!  accu += dabs(mu_of_r_hf(ipoint) - mu_of_r_hf_old(ipoint)) * weight
+  accu += dabs(f_hf_cholesky_sparse(ipoint) - f_hf_cholesky(ipoint)) * weight
+ enddo
+ print*,'accu = ',accu
+end
--- a/plugins/local/bi_ort_ints/bi_ort_ints.irp.f
+++ b/plugins/local/bi_ort_ints/bi_ort_ints.irp.f
@ -17,12 +17,15 @@ program bi_ort_ints
 ! call test_3e
 ! call test_5idx
 ! call test_5idx2
-  call test_4idx()
+!  call test_4idx()
  !call test_4idx_n4()
  !call test_4idx2()
  !call test_5idx2
  !call test_5idx

+  call test_mos_in_r()
+  call test_int2_grad1_u12_bimo_t()
+
 end

 subroutine test_5idx2
@ -472,4 +475,94 @@ subroutine test_4idx()
  return
 end

+! ---
+
+subroutine test_mos_in_r()
+
+  implicit none
+
+  integer          :: i, j
+  double precision :: err_tot, nrm_tot, err_loc, acc_thr
+
+  PROVIDE mos_l_in_r_array_transp_old mos_r_in_r_array_transp_old
+  PROVIDE mos_l_in_r_array_transp mos_r_in_r_array_transp
+
+  acc_thr = 1d-13
+
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do j = 1, n_points_final_grid
+      err_loc = dabs(mos_l_in_r_array_transp_old(j,i) - mos_l_in_r_array_transp(j,i))
+      if(err_loc > acc_thr) then
+        print*, " error on", j, i
+        print*, " old res", mos_l_in_r_array_transp_old(j,i)
+        print*, " new res", mos_l_in_r_array_transp    (j,i)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(mos_l_in_r_array_transp_old(j,i))
+    enddo
+  enddo
+  print *, ' absolute accuracy on mos_l_in_r_array_transp (%) =', 100.d0 * err_tot / nrm_tot
+
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do j = 1, n_points_final_grid
+      err_loc = dabs(mos_r_in_r_array_transp_old(j,i) - mos_r_in_r_array_transp(j,i))
+      if(err_loc > acc_thr) then
+        print*, " error on", j, i
+        print*, " old res", mos_r_in_r_array_transp_old(j,i)
+        print*, " new res", mos_r_in_r_array_transp    (j,i)
+        stop
+      endif
+      err_tot = err_tot + err_loc
+      nrm_tot = nrm_tot + dabs(mos_r_in_r_array_transp_old(j,i))
+    enddo
+  enddo
+  print *, ' absolute accuracy on mos_r_in_r_array_transp (%) =', 100.d0 * err_tot / nrm_tot
+
+  return
+end
+
+! ---
+
+subroutine test_int2_grad1_u12_bimo_t()
+
+  implicit none
+  integer          :: i, j, ipoint, m
+  double precision :: err_tot, nrm_tot, err_loc, acc_thr
+
+  PROVIDE int2_grad1_u12_bimo_t_old
+  PROVIDE int2_grad1_u12_bimo_t
+
+  acc_thr = 1d-13
+
+  err_tot = 0.d0
+  nrm_tot = 0.d0
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do m = 1, 3
+        do ipoint = 1, n_points_final_grid
+          err_loc = dabs(int2_grad1_u12_bimo_t_old(ipoint,m,j,i) - int2_grad1_u12_bimo_t(ipoint,m,j,i))
+          if(err_loc > acc_thr) then
+            print*, " error on", ipoint, m, j, i
+            print*, " old res", int2_grad1_u12_bimo_t_old(ipoint,m,j,i)
+            print*, " new res", int2_grad1_u12_bimo_t    (ipoint,m,j,i)
+            stop
+          endif
+          err_tot = err_tot + err_loc
+          nrm_tot = nrm_tot + dabs(int2_grad1_u12_bimo_t_old(ipoint,m,j,i))
+        enddo
+      enddo
+    enddo
+  enddo
+  print *, ' absolute accuracy on int2_grad1_u12_bimo_t (%) =', 100.d0 * err_tot / nrm_tot
+
+  return
+end
+
+! ---
+

--- a/plugins/local/bi_ort_ints/no_dressing.irp.f
+++ b/plugins/local/bi_ort_ints/no_dressing.irp.f
@ -322,6 +322,12 @@ END_PROVIDER

 BEGIN_PROVIDER [double precision, noL_0e]

+  BEGIN_DOC
+  ! 
+  ! < Phi_left | L | Phi_right >
+  !
+  END_DOC 
+
  implicit none
  integer                       :: i, j, k, ipoint
  double precision              :: t0, t1
@ -330,10 +336,6 @@ BEGIN_PROVIDER [double precision, noL_0e]
  double precision, allocatable :: tmp_M(:,:), tmp_S(:), tmp_O(:), tmp_J(:,:)
  double precision, allocatable :: tmp_M_priv(:,:), tmp_S_priv(:), tmp_O_priv(:), tmp_J_priv(:,:)

-
-  call wall_time(t0)
-  print*, " Providing noL_0e ..."
-
  if(elec_alpha_num .eq. elec_beta_num) then

    allocate(tmp(elec_beta_num))
@ -708,10 +710,7 @@ BEGIN_PROVIDER [double precision, noL_0e]

  endif

-  call wall_time(t1)
-  print*, " Wall time for noL_0e (min) = ", (t1 - t0)/60.d0
-
-  print*, " noL_0e = ", noL_0e
+  print*, " noL_0e =", noL_0e

 END_PROVIDER

--- a/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f
+++ b/plugins/local/bi_ort_ints/semi_num_ints_mo.irp.f
@ -1,350 +1,54 @@

 ! ---

-! TODO :: optimization : transform into a DGEMM
-
-BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! mo_v_ki_bi_ortho_erf_rk_cst_mu(k,i,ip) = int dr chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1 )/(2|r - R_ip|) on the BI-ORTHO MO basis 
-  ! 
-  ! where phi_k(r) is a LEFT MOs and phi_i(r) is a RIGHT MO
-  !
-  ! R_ip = the "ip"-th point of the DFT Grid
-  !
-  END_DOC
+BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)]

  implicit none
-  integer :: ipoint
- !$OMP PARALLEL         &
- !$OMP DEFAULT (NONE)   &
- !$OMP PRIVATE (ipoint) & 
- !$OMP SHARED (n_points_final_grid,v_ij_erf_rk_cst_mu,mo_v_ki_bi_ortho_erf_rk_cst_mu)
- !$OMP DO SCHEDULE (dynamic)
-  do ipoint = 1, n_points_final_grid
-    call ao_to_mo_bi_ortho( v_ij_erf_rk_cst_mu            (1,1,ipoint), size(v_ij_erf_rk_cst_mu,             1) &
-                          , mo_v_ki_bi_ortho_erf_rk_cst_mu(1,1,ipoint), size(mo_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  mo_v_ki_bi_ortho_erf_rk_cst_mu = mo_v_ki_bi_ortho_erf_rk_cst_mu * 0.5d0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! int dr phi_i(r) phi_j(r) (erf(mu(R) |r - R|) - 1)/(2|r - R|) on the BI-ORTHO MO basis
-  !
-  END_DOC
-
-  implicit none
-  integer :: ipoint, i, j
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,j,i) = mo_v_ki_bi_ortho_erf_rk_cst_mu(j,i,ipoint)
-      enddo
-    enddo
-  enddo
-
-  !FREE mo_v_ki_bi_ortho_erf_rk_cst_mu
-
-END_PROVIDER 
-
-! ---
-
-! TODO :: optimization : transform into a DGEMM
-
-BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, 3, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! mo_x_v_ki_bi_ortho_erf_rk_cst_mu(k,i,m,ip) = int dr x(m) * chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1)/2|r - R_ip| on the BI-ORTHO MO basis 
-  !
-  ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => x(m) = x, m=2 => x(m) = y, m=3 => x(m) = z,
-  !
-  ! R_ip = the "ip"-th point of the DFT Grid
-  !
-  END_DOC
-
-  implicit none
-  integer :: ipoint
-
- !$OMP PARALLEL         &
- !$OMP DEFAULT (NONE)   &
- !$OMP PRIVATE (ipoint) & 
- !$OMP SHARED (n_points_final_grid,x_v_ij_erf_rk_cst_mu_transp,mo_x_v_ki_bi_ortho_erf_rk_cst_mu)
- !$OMP DO SCHEDULE (dynamic)
-  do ipoint = 1, n_points_final_grid
-
-    call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp     (1,1,1,ipoint), size(x_v_ij_erf_rk_cst_mu_transp,      1) &
-                          , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,1,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
-    call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp     (1,1,2,ipoint), size(x_v_ij_erf_rk_cst_mu_transp,      1) &
-                          , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,2,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
-    call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp     (1,1,3,ipoint), size(x_v_ij_erf_rk_cst_mu_transp,      1) &
-                          , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,3,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
-
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  mo_x_v_ki_bi_ortho_erf_rk_cst_mu = 0.5d0 * mo_x_v_ki_bi_ortho_erf_rk_cst_mu
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3, n_points_final_grid)]
-
-  implicit none
-  integer          :: i, j, ipoint
-  double precision :: wall0, wall1
-
-  print *, ' providing int2_grad1_u12_ao_transp ...'
-  call wall_time(wall0)
-
-  if(test_cycle_tc) then
-
-    PROVIDE int2_grad1_u12_ao_test
-
-    do ipoint = 1, n_points_final_grid
-      do i = 1, ao_num
-        do j = 1, ao_num
-          int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,1)
-          int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,2)
-          int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,3)
-        enddo
-      enddo
-    enddo
-
-    FREE int2_grad1_u12_ao_test
-
-  else
-
-    PROVIDE int2_grad1_u12_ao
-
-    do ipoint = 1, n_points_final_grid
-      do i = 1, ao_num
-        do j = 1, ao_num
-          int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(j,i,ipoint,1)
-          int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(j,i,ipoint,2)
-          int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(j,i,ipoint,3)
-        enddo
-      enddo
-    enddo
-
-  endif
-
-  call wall_time(wall1)
-  print *, ' wall time for int2_grad1_u12_ao_transp ', wall1 - wall0
-  call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)]
-
-  implicit none
-  integer          :: ipoint
-  double precision :: wall0, wall1
+  integer                       :: i, j, ipoint
+  double precision              :: tt1, tt2
+  double precision, allocatable :: tmp(:,:,:,:)

  PROVIDE mo_l_coef mo_r_coef
-  PROVIDE int2_grad1_u12_ao_transp
+  PROVIDE int2_grad1_u12_ao

-  !print *, ' providing int2_grad1_u12_bimo_transp'
-  !call wall_time(wall0)
+  call wall_time(tt1)
+
+  allocate(tmp(mo_num,mo_num,n_points_final_grid,3))

  !$OMP PARALLEL         &
  !$OMP DEFAULT (NONE)   &
  !$OMP PRIVATE (ipoint) & 
-  !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao_transp,int2_grad1_u12_bimo_transp)
+  !$OMP SHARED (ao_num, mo_num, n_points_final_grid, int2_grad1_u12_ao, tmp)
  !$OMP DO SCHEDULE (dynamic)
-   do ipoint = 1, n_points_final_grid
-     call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,1,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
-                           , int2_grad1_u12_bimo_transp(1,1,1,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
-     call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,2,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
-                           , int2_grad1_u12_bimo_transp(1,1,2,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
-     call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,3,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
-                           , int2_grad1_u12_bimo_transp(1,1,3,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
-   enddo
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,1), ao_num, tmp(1,1,ipoint,1), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,2), ao_num, tmp(1,1,ipoint,2), mo_num)
+    call ao_to_mo_bi_ortho(int2_grad1_u12_ao(1,1,ipoint,3), ao_num, tmp(1,1,ipoint,3), mo_num)
+  enddo
  !$OMP END DO
  !$OMP END PARALLEL

-  !call wall_time(wall1)
-  !print *, ' Wall time for providing int2_grad1_u12_bimo_transp',wall1 - wall0
-  !call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_bimo_t, (n_points_final_grid, 3, mo_num, mo_num)]
-
-  implicit none
-  integer          :: i, j, ipoint
-  double precision :: wall0, wall1
-
-  !call wall_time(wall0)
-  !print *, ' Providing int2_grad1_u12_bimo_t ...'
-
-  PROVIDE mo_l_coef mo_r_coef
-  PROVIDE int2_grad1_u12_bimo_transp
-
+  !$OMP PARALLEL               &
+  !$OMP DEFAULT (NONE)         &
+  !$OMP PRIVATE (i, j, ipoint) & 
+  !$OMP SHARED (mo_num, n_points_final_grid, tmp, int2_grad1_u12_bimo_t)
+  !$OMP DO COLLAPSE(2) SCHEDULE (dynamic)
  do ipoint = 1, n_points_final_grid
    do i = 1, mo_num
      do j = 1, mo_num
-        int2_grad1_u12_bimo_t(ipoint,1,j,i) = int2_grad1_u12_bimo_transp(j,i,1,ipoint)
-        int2_grad1_u12_bimo_t(ipoint,2,j,i) = int2_grad1_u12_bimo_transp(j,i,2,ipoint)
-        int2_grad1_u12_bimo_t(ipoint,3,j,i) = int2_grad1_u12_bimo_transp(j,i,3,ipoint)
+        int2_grad1_u12_bimo_t(ipoint,1,j,i) = tmp(j,i,ipoint,1)
+        int2_grad1_u12_bimo_t(ipoint,2,j,i) = tmp(j,i,ipoint,2)
+        int2_grad1_u12_bimo_t(ipoint,3,j,i) = tmp(j,i,ipoint,3)
      enddo                                  
    enddo
  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL

-  FREE int2_grad1_u12_bimo_transp
+  deallocate(tmp)

-  !call wall_time(wall1)
-  !print *, ' wall time for int2_grad1_u12_bimo_t,', wall1 - wall0
-  !call print_memory_usage()
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3, ao_num, ao_num)]
-
-  implicit none
-  integer :: i, j, ipoint
-
-  PROVIDE int2_grad1_u12_ao
-
-  do ipoint = 1, n_points_final_grid
-    do i = 1, ao_num
-      do j = 1, ao_num
-        int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(j,i,ipoint,1)
-        int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(j,i,ipoint,2)
-        int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(j,i,ipoint,3)
-      enddo                                  
-    enddo
-  enddo
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, 3, mo_num, mo_num)]
-
-  implicit none
-  integer :: i, j, ipoint
-
-  do i = 1, mo_num
-    do j = 1, mo_num
-      do ipoint = 1, n_points_final_grid
-        mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,1,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,1,ipoint)
-        mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,2,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,2,ipoint)
-        mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,3,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,3,ipoint)
-      enddo
-    enddo
-  enddo
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk, (n_points_final_grid, 3, mo_num, mo_num)]
-
-  BEGIN_DOC
-  !
-  ! x_W_ki_bi_ortho_erf_rk(ip,m,k,i) = \int dr chi_k(r) \frac{(1 - erf(mu |r-R_ip|))}{2|r-R_ip|} (x(m)-R_ip(m)) phi_i(r) ON THE BI-ORTHO MO BASIS 
-  !
-  ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z,
-  !
-  ! R_ip = the "ip"-th point of the DFT Grid
-  END_DOC
- 
-  implicit none
-  include 'constants.include.F'
- 
-  integer          :: ipoint, m, i, k
-  double precision :: xyz
-  double precision :: wall0, wall1
- 
-  print*, ' providing x_W_ki_bi_ortho_erf_rk ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                   &
- !$OMP DEFAULT (NONE)             &
- !$OMP PRIVATE (ipoint,m,i,k,xyz) & 
- !$OMP SHARED (x_W_ki_bi_ortho_erf_rk,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) 
- !$OMP DO SCHEDULE (dynamic)
-  do i = 1, mo_num
-    do k = 1, mo_num
-      do m = 1, 3
-        do ipoint = 1, n_points_final_grid
-          xyz = final_grid_points(m,ipoint)
-          x_W_ki_bi_ortho_erf_rk(ipoint,m,k,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,k,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,k,i)
-        enddo
-      enddo
-    enddo
-  enddo
-
- !$OMP END DO
- !$OMP END PARALLEL
-
- ! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu_transp 
- ! FREE mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp
-
-  call wall_time(wall1)
-  print *, ' time to provide x_W_ki_bi_ortho_erf_rk = ', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk_diag, (n_points_final_grid, 3, mo_num)]
-  BEGIN_DOC
-  ! x_W_ki_bi_ortho_erf_rk_diag(ip,m,i) = \int dr chi_i(r) (1 - erf(mu |r-R_ip|)) (x(m)-X(m)_ip) phi_i(r) ON THE BI-ORTHO MO BASIS 
-!
-! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z,
-!
-! R_ip = the "ip"-th point of the DFT Grid
-  END_DOC
-
-  implicit none
-  include 'constants.include.F'
- 
-  integer          :: ipoint, m, i
-  double precision :: xyz
-  double precision :: wall0, wall1
- 
-  print*,'providing x_W_ki_bi_ortho_erf_rk_diag ...'
-  call wall_time(wall0)
-
- !$OMP PARALLEL                 &
- !$OMP DEFAULT (NONE)           &
- !$OMP PRIVATE (ipoint,m,i,xyz) & 
- !$OMP SHARED (x_W_ki_bi_ortho_erf_rk_diag,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) 
- !$OMP DO SCHEDULE (dynamic)
-  do i = 1, mo_num
-    do m = 1, 3
-      do ipoint = 1, n_points_final_grid
-        xyz = final_grid_points(m,ipoint)
-        x_W_ki_bi_ortho_erf_rk_diag(ipoint,m,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,i,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,i,i)
-      enddo
-    enddo
-  enddo
-
- !$OMP END DO
- !$OMP END PARALLEL
-
-  call wall_time(wall1)
-  print*,'time to provide x_W_ki_bi_ortho_erf_rk_diag = ',wall1 - wall0
+  call wall_time(tt2)
+  write(*,"(A,2X,F15.7)") ' wall time for int2_grad1_u12_bimo_t (sec) = ', (tt2 - tt1)

 END_PROVIDER 

--- a/plugins/local/bi_ort_ints/semi_num_ints_mo_old.irp.f
+++ b/plugins/local/bi_ort_ints/semi_num_ints_mo_old.irp.f
@ -0,0 +1,362 @@
+
+! ---
+
+! TODO :: optimization : transform into a DGEMM
+
+BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, n_points_final_grid)]
+
+  BEGIN_DOC
+  !
+  ! mo_v_ki_bi_ortho_erf_rk_cst_mu(k,i,ip) = int dr chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1 )/(2|r - R_ip|) on the BI-ORTHO MO basis 
+  ! 
+  ! where phi_k(r) is a LEFT MOs and phi_i(r) is a RIGHT MO
+  !
+  ! R_ip = the "ip"-th point of the DFT Grid
+  !
+  END_DOC
+
+  implicit none
+  integer :: ipoint
+ !$OMP PARALLEL         &
+ !$OMP DEFAULT (NONE)   &
+ !$OMP PRIVATE (ipoint) & 
+ !$OMP SHARED (n_points_final_grid,v_ij_erf_rk_cst_mu,mo_v_ki_bi_ortho_erf_rk_cst_mu)
+ !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho( v_ij_erf_rk_cst_mu            (1,1,ipoint), size(v_ij_erf_rk_cst_mu,             1) &
+                          , mo_v_ki_bi_ortho_erf_rk_cst_mu(1,1,ipoint), size(mo_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  mo_v_ki_bi_ortho_erf_rk_cst_mu = mo_v_ki_bi_ortho_erf_rk_cst_mu * 0.5d0
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, mo_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! int dr phi_i(r) phi_j(r) (erf(mu(R) |r - R|) - 1)/(2|r - R|) on the BI-ORTHO MO basis
+  !
+  END_DOC
+
+  implicit none
+  integer :: ipoint, i, j
+
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,j,i) = mo_v_ki_bi_ortho_erf_rk_cst_mu(j,i,ipoint)
+      enddo
+    enddo
+  enddo
+
+  !FREE mo_v_ki_bi_ortho_erf_rk_cst_mu
+
+END_PROVIDER 
+
+! ---
+
+! TODO :: optimization : transform into a DGEMM
+
+BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu, (mo_num, mo_num, 3, n_points_final_grid)]
+
+  BEGIN_DOC
+  !
+  ! mo_x_v_ki_bi_ortho_erf_rk_cst_mu(k,i,m,ip) = int dr x(m) * chi_k(r) phi_i(r) (erf(mu |r - R_ip|) - 1)/2|r - R_ip| on the BI-ORTHO MO basis 
+  !
+  ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => x(m) = x, m=2 => x(m) = y, m=3 => x(m) = z,
+  !
+  ! R_ip = the "ip"-th point of the DFT Grid
+  !
+  END_DOC
+
+  implicit none
+  integer :: ipoint
+
+ !$OMP PARALLEL         &
+ !$OMP DEFAULT (NONE)   &
+ !$OMP PRIVATE (ipoint) & 
+ !$OMP SHARED (n_points_final_grid,x_v_ij_erf_rk_cst_mu_transp,mo_x_v_ki_bi_ortho_erf_rk_cst_mu)
+ !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+
+    call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp     (1,1,1,ipoint), size(x_v_ij_erf_rk_cst_mu_transp,      1) &
+                          , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,1,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
+    call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp     (1,1,2,ipoint), size(x_v_ij_erf_rk_cst_mu_transp,      1) &
+                          , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,2,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
+    call ao_to_mo_bi_ortho( x_v_ij_erf_rk_cst_mu_transp     (1,1,3,ipoint), size(x_v_ij_erf_rk_cst_mu_transp,      1) &
+                          , mo_x_v_ki_bi_ortho_erf_rk_cst_mu(1,1,3,ipoint), size(mo_x_v_ki_bi_ortho_erf_rk_cst_mu, 1) )
+
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  mo_x_v_ki_bi_ortho_erf_rk_cst_mu = 0.5d0 * mo_x_v_ki_bi_ortho_erf_rk_cst_mu
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, int2_grad1_u12_ao_transp, (ao_num, ao_num, 3, n_points_final_grid)]
+
+  implicit none
+  integer          :: i, j, ipoint
+  double precision :: wall0, wall1
+
+  !print *, ' providing int2_grad1_u12_ao_transp ...'
+  !call wall_time(wall0)
+
+  if(test_cycle_tc) then
+
+    PROVIDE int2_grad1_u12_ao_test
+
+    do ipoint = 1, n_points_final_grid
+      do i = 1, ao_num
+        do j = 1, ao_num
+          int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,1)
+          int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,2)
+          int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao_test(j,i,ipoint,3)
+        enddo
+      enddo
+    enddo
+
+    FREE int2_grad1_u12_ao_test
+
+  else
+
+    PROVIDE int2_grad1_u12_ao
+
+    do ipoint = 1, n_points_final_grid
+      do i = 1, ao_num
+        do j = 1, ao_num
+          int2_grad1_u12_ao_transp(j,i,1,ipoint) = int2_grad1_u12_ao(j,i,ipoint,1)
+          int2_grad1_u12_ao_transp(j,i,2,ipoint) = int2_grad1_u12_ao(j,i,ipoint,2)
+          int2_grad1_u12_ao_transp(j,i,3,ipoint) = int2_grad1_u12_ao(j,i,ipoint,3)
+        enddo
+      enddo
+    enddo
+
+  endif
+
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_ao_transp (min) = ', (wall1 - wall0) / 60.d0
+  !call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_transp, (mo_num, mo_num, 3, n_points_final_grid)]
+
+  implicit none
+  integer          :: ipoint
+  double precision :: wall0, wall1
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE int2_grad1_u12_ao_transp
+
+  !print *, ' providing int2_grad1_u12_bimo_transp ...'
+  !call wall_time(wall0)
+
+  !$OMP PARALLEL         &
+  !$OMP DEFAULT (NONE)   &
+  !$OMP PRIVATE (ipoint) & 
+  !$OMP SHARED (n_points_final_grid,int2_grad1_u12_ao_transp,int2_grad1_u12_bimo_transp)
+  !$OMP DO SCHEDULE (dynamic)
+  do ipoint = 1, n_points_final_grid
+    call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,1,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
+                          , int2_grad1_u12_bimo_transp(1,1,1,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
+    call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,2,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
+                          , int2_grad1_u12_bimo_transp(1,1,2,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
+    call ao_to_mo_bi_ortho( int2_grad1_u12_ao_transp  (1,1,3,ipoint), size(int2_grad1_u12_ao_transp  , 1) &
+                          , int2_grad1_u12_bimo_transp(1,1,3,ipoint), size(int2_grad1_u12_bimo_transp, 1) )
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  !FREE int2_grad1_u12_ao_transp
+
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_bimo_transp (min) =', (wall1 - wall0) / 60.d0
+  !call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, int2_grad1_u12_bimo_t_old, (n_points_final_grid, 3, mo_num, mo_num)]
+
+  implicit none
+  integer          :: i, j, ipoint
+  double precision :: wall0, wall1
+
+  !call wall_time(wall0)
+  !print *, ' providing int2_grad1_u12_bimo_t_old ...'
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE int2_grad1_u12_bimo_transp
+
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      do j = 1, mo_num
+        int2_grad1_u12_bimo_t_old(ipoint,1,j,i) = int2_grad1_u12_bimo_transp(j,i,1,ipoint)
+        int2_grad1_u12_bimo_t_old(ipoint,2,j,i) = int2_grad1_u12_bimo_transp(j,i,2,ipoint)
+        int2_grad1_u12_bimo_t_old(ipoint,3,j,i) = int2_grad1_u12_bimo_transp(j,i,3,ipoint)
+      enddo                                  
+    enddo
+  enddo
+
+  FREE int2_grad1_u12_bimo_transp
+
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_bimo_t_old (min) =', (wall1 - wall0) / 60.d0
+  !call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [double precision, int2_grad1_u12_ao_t, (n_points_final_grid, 3, ao_num, ao_num)]
+
+  implicit none
+  integer          :: i, j, ipoint
+  double precision :: wall0, wall1
+
+  !call wall_time(wall0)
+  !print *, ' providing int2_grad1_u12_ao_t ...'
+
+  PROVIDE int2_grad1_u12_ao
+
+  do ipoint = 1, n_points_final_grid
+    do i = 1, ao_num
+      do j = 1, ao_num
+        int2_grad1_u12_ao_t(ipoint,1,j,i) = int2_grad1_u12_ao(j,i,ipoint,1)
+        int2_grad1_u12_ao_t(ipoint,2,j,i) = int2_grad1_u12_ao(j,i,ipoint,2)
+        int2_grad1_u12_ao_t(ipoint,3,j,i) = int2_grad1_u12_ao(j,i,ipoint,3)
+      enddo                                  
+    enddo
+  enddo
+
+  !call wall_time(wall1)
+  !print *, ' wall time for int2_grad1_u12_ao_t (min) =', (wall1 - wall0) / 60.d0
+  !call print_memory_usage()
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp, (n_points_final_grid, 3, mo_num, mo_num)]
+
+  implicit none
+  integer :: i, j, ipoint
+
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do ipoint = 1, n_points_final_grid
+        mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,1,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,1,ipoint)
+        mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,2,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,2,ipoint)
+        mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,3,j,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu(j,i,3,ipoint)
+      enddo
+    enddo
+  enddo
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk, (n_points_final_grid, 3, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! x_W_ki_bi_ortho_erf_rk(ip,m,k,i) = \int dr chi_k(r) \frac{(1 - erf(mu |r-R_ip|))}{2|r-R_ip|} (x(m)-R_ip(m)) phi_i(r) ON THE BI-ORTHO MO BASIS 
+  !
+  ! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z,
+  !
+  ! R_ip = the "ip"-th point of the DFT Grid
+  END_DOC
+ 
+  implicit none
+  include 'constants.include.F'
+ 
+  integer          :: ipoint, m, i, k
+  double precision :: xyz
+  double precision :: wall0, wall1
+ 
+  !print*, ' providing x_W_ki_bi_ortho_erf_rk ...'
+  !call wall_time(wall0)
+
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (ipoint,m,i,k,xyz) & 
+ !$OMP SHARED (x_W_ki_bi_ortho_erf_rk,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) 
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do m = 1, 3
+        do ipoint = 1, n_points_final_grid
+          xyz = final_grid_points(m,ipoint)
+          x_W_ki_bi_ortho_erf_rk(ipoint,m,k,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,k,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+ ! FREE mo_v_ki_bi_ortho_erf_rk_cst_mu_transp 
+ ! FREE mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp
+
+  !call wall_time(wall1)
+  !print *, ' time to provide x_W_ki_bi_ortho_erf_rk = ', wall1 - wall0
+
+END_PROVIDER 
+
+! ---
+
+BEGIN_PROVIDER [ double precision, x_W_ki_bi_ortho_erf_rk_diag, (n_points_final_grid, 3, mo_num)]
+  BEGIN_DOC
+  ! x_W_ki_bi_ortho_erf_rk_diag(ip,m,i) = \int dr chi_i(r) (1 - erf(mu |r-R_ip|)) (x(m)-X(m)_ip) phi_i(r) ON THE BI-ORTHO MO BASIS 
+!
+! where chi_k(r)/phi_i(r) are left/right MOs, m=1 => X(m) = x, m=2 => X(m) = y, m=3 => X(m) = z,
+!
+! R_ip = the "ip"-th point of the DFT Grid
+  END_DOC
+
+  implicit none
+  include 'constants.include.F'
+ 
+  integer          :: ipoint, m, i
+  double precision :: xyz
+  double precision :: wall0, wall1
+ 
+  !print*,'providing x_W_ki_bi_ortho_erf_rk_diag ...'
+  !call wall_time(wall0)
+
+ !$OMP PARALLEL                 &
+ !$OMP DEFAULT (NONE)           &
+ !$OMP PRIVATE (ipoint,m,i,xyz) & 
+ !$OMP SHARED (x_W_ki_bi_ortho_erf_rk_diag,n_points_final_grid,mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_v_ki_bi_ortho_erf_rk_cst_mu_transp,mo_num,final_grid_points) 
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do m = 1, 3
+      do ipoint = 1, n_points_final_grid
+        xyz = final_grid_points(m,ipoint)
+        x_W_ki_bi_ortho_erf_rk_diag(ipoint,m,i) = mo_x_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,m,i,i) - xyz * mo_v_ki_bi_ortho_erf_rk_cst_mu_transp(ipoint,i,i)
+      enddo
+    enddo
+  enddo
+
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  !call wall_time(wall1)
+  !print*,'time to provide x_W_ki_bi_ortho_erf_rk_diag = ',wall1 - wall0
+
+END_PROVIDER 
+
+! ---
+
--- a/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
+++ b/plugins/local/bi_ort_ints/three_body_ints_bi_ort.irp.f
@ -123,7 +123,7 @@ subroutine give_integrals_3_body_bi_ort_spin( n, sigma_n, l, sigma_l, k, sigma_k
  endif

  return
-end subroutine give_integrals_3_body_bi_ort_spin
+end

 ! ---

@ -168,7 +168,7 @@ subroutine give_integrals_3_body_bi_ort(n, l, k, m, j, i, integral)
    integral = integral + tmp * final_weight_at_r_vector(ipoint)
  enddo

-end subroutine give_integrals_3_body_bi_ort
+end

 ! ---

--- a/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
+++ b/plugins/local/bi_ort_ints/total_twoe_pot.irp.f
@ -16,10 +16,10 @@ double precision function bi_ortho_mo_ints(l, k, j, i)
  integer             :: m, n, p, q

  bi_ortho_mo_ints = 0.d0
-  do m = 1, ao_num
-    do p = 1, ao_num
-      do n = 1, ao_num
-        do q = 1, ao_num
+  do p = 1, ao_num
+    do m = 1, ao_num
+      do q = 1, ao_num
+        do n = 1, ao_num
          !                                   p1h1p2h2   l1                  l2              r1               r2
          bi_ortho_mo_ints += ao_two_e_tc_tot(n,q,m,p) * mo_l_coef(m,l) * mo_l_coef(n,k) * mo_r_coef(p,j) * mo_r_coef(q,i)
        enddo
@ -27,7 +27,7 @@ double precision function bi_ortho_mo_ints(l, k, j, i)
    enddo
  enddo

-end function bi_ortho_mo_ints
+end

 ! ---

@ -40,38 +40,106 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
  END_DOC

  implicit none
-  integer                       :: i, j, k, l, m, n, p, q
+  integer                       :: i, j, k, l, m, n, p, q, s, r
+  double precision              :: t1, t2, tt1, tt2
  double precision, allocatable :: a1(:,:,:,:), a2(:,:,:,:)
+  double precision, allocatable :: a_jkp(:,:,:), a_kpq(:,:,:), ao_two_e_tc_tot_tmp(:,:,:)
+
+  print *, ' PROVIDING mo_bi_ortho_tc_two_e_chemist ...'
+  call wall_time(t1)
+  call print_memory_usage()

  PROVIDE mo_r_coef mo_l_coef

-  allocate(a2(ao_num,ao_num,ao_num,mo_num))
+  if(ao_to_mo_tc_n3) then

-  call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0     &
-            , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
-            , 0.d0 , a2(1,1,1,1), ao_num*ao_num*ao_num)
+    print*, ' memory scale of TC ao -> mo: O(N3) '

-  allocate(a1(ao_num,ao_num,mo_num,mo_num))
+    if(.not.read_tc_integ) then
+       stop 'read_tc_integ needs to be set to true'
+    endif

-  call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
-            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
-            , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num)
+    allocate(a_jkp(ao_num,ao_num,mo_num))
+    allocate(a_kpq(ao_num,mo_num,mo_num))
+    allocate(ao_two_e_tc_tot_tmp(ao_num,ao_num,ao_num))

-  deallocate(a2)
-  allocate(a2(ao_num,mo_num,mo_num,mo_num))
+    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read")

-  call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
-            , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num          &
-            , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num)
+    call wall_time(tt1)

-  deallocate(a1)
+    mo_bi_ortho_tc_two_e_chemist(:,:,:,:) = 0.d0
+    do l = 1, ao_num
+      read(11) ao_two_e_tc_tot_tmp(:,:,:)

-  call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
-            , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
-            , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num)
+      do s = 1, mo_num

-  deallocate(a2)
+        call dgemm( 'T', 'N', ao_num*ao_num, mo_num, ao_num, 1.d0              &
+                  , ao_two_e_tc_tot_tmp(1,1,1), ao_num, mo_l_coef(1,1), ao_num &
+                  , 0.d0, a_jkp(1,1,1), ao_num*ao_num)

+        call dgemm( 'T', 'N', ao_num*mo_num, mo_num, ao_num, 1.d0 &
+                  , a_jkp(1,1,1), ao_num, mo_r_coef(1,1), ao_num  &
+                  , 0.d0, a_kpq(1,1,1), ao_num*mo_num)
+
+        call dgemm( 'T', 'N', mo_num*mo_num, mo_num, ao_num, mo_r_coef(l,s) &
+                  , a_kpq(1,1,1), ao_num, mo_l_coef(1,1), ao_num            &
+                  , 1.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,s), mo_num*mo_num)
+
+      enddo ! s
+
+      if(l == 2) then
+        call wall_time(tt2)
+        print*, ' 1 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-1)*(tt2-tt1)/60.d0
+      elseif(l == 11) then
+        call wall_time(tt2)
+        print*, ' 10 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-10)*(tt2-tt1)/(60.d0*10.d0)
+      elseif(l == 101) then
+        call wall_time(tt2)
+        print*, ' 100 / mo_num done in (min)', (tt2-tt1)/60.d0
+        print*, ' estimated time required (min)', dble(mo_num-100)*(tt2-tt1)/(60.d0*100.d0)
+      endif
+    enddo ! l
+
+    close(11)
+
+    deallocate(a_jkp, a_kpq, ao_two_e_tc_tot_tmp)
+
+  else
+
+    print*, ' memory scale of TC ao -> mo: O(N4) '
+
+    allocate(a2(ao_num,ao_num,ao_num,mo_num))
+  
+    call dgemm( 'T', 'N', ao_num*ao_num*ao_num, mo_num, ao_num, 1.d0     &
+              , ao_two_e_tc_tot(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num &
+              , 0.d0, a2(1,1,1,1), ao_num*ao_num*ao_num)
+  
+    FREE ao_two_e_tc_tot
+
+    allocate(a1(ao_num,ao_num,mo_num,mo_num))
+  
+    call dgemm( 'T', 'N', ao_num*ao_num*mo_num, mo_num, ao_num, 1.d0 &
+              , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+              , 0.d0, a1(1,1,1,1), ao_num*ao_num*mo_num)
+  
+    deallocate(a2)
+    allocate(a2(ao_num,mo_num,mo_num,mo_num))
+  
+    call dgemm( 'T', 'N', ao_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+              , a1(1,1,1,1), ao_num, mo_l_coef(1,1), ao_num          &
+              , 0.d0, a2(1,1,1,1), ao_num*mo_num*mo_num)
+  
+    deallocate(a1)
+  
+    call dgemm( 'T', 'N', mo_num*mo_num*mo_num, mo_num, ao_num, 1.d0 &
+              , a2(1,1,1,1), ao_num, mo_r_coef(1,1), ao_num          &
+              , 0.d0, mo_bi_ortho_tc_two_e_chemist(1,1,1,1), mo_num*mo_num*mo_num)
+  
+    deallocate(a2)
+  
+  endif

  !allocate(a1(mo_num,ao_num,ao_num,ao_num))
  !a1 = 0.d0
@ -135,6 +203,10 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e_chemist, (mo_num, mo_num,
  !enddo
  !deallocate(a1)

+  call wall_time(t2)
+  print *, ' WALL TIME for PROVIDING mo_bi_ortho_tc_two_e_chemist (min)', (t2-t1)/60.d0
+  call print_memory_usage()
+
 END_PROVIDER 

 ! ---
@ -176,6 +248,34 @@ BEGIN_PROVIDER [double precision, mo_bi_ortho_tc_two_e, (mo_num, mo_num, mo_num,

 END_PROVIDER 

+BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_transp, (mo_num, mo_num, mo_num, mo_num)]
+ implicit none
+ BEGIN_DOC
+  !
+  ! mo_bi_ortho_tc_two_e_transp(i,j,k,l) = <k l| V(r_12) |i j> = transpose of mo_bi_ortho_tc_two_e
+  !
+  ! the potential V(r_12) contains ALL TWO-E CONTRIBUTION OF THE TC-HAMILTONIAN
+  !
+ END_DOC
+
+ integer :: i,j,k,l
+ print*,'Providing mo_bi_ortho_tc_two_e_transp'
+ double precision :: t0,t1
+ call wall_time(t0)
+ do i = 1, mo_num
+  do j = 1, mo_num
+   do k = 1, mo_num
+    do l = 1, mo_num
+     mo_bi_ortho_tc_two_e_transp(i,j,k,l) = mo_bi_ortho_tc_two_e(k,l,i,j)
+    enddo
+   enddo
+  enddo
+ enddo
+ call wall_time(t1)
+ 
+ print *, ' WALL TIME for PROVIDING mo_bi_ortho_tc_two_e_transp (min)', (t1-t0)/60.d0
+
+END_PROVIDER 
 ! ---

 BEGIN_PROVIDER [ double precision, mo_bi_ortho_tc_two_e_jj,          (mo_num,mo_num)]
@ -232,3 +332,23 @@ END_PROVIDER

 ! ---

+ BEGIN_PROVIDER [double precision, tc_2e_3idx_coulomb_integrals_transp , (mo_num,mo_num,mo_num)]
+&BEGIN_PROVIDER [double precision, tc_2e_3idx_exchange_integrals_transp, (mo_num,mo_num,mo_num)]
+
+  BEGIN_DOC
+  ! tc_2e_3idx_coulomb_integrals_transp (j,k,i) = <jk|ji> 
+  ! tc_2e_3idx_exchange_integrals_transp(j,k,i) = <kj|ji> 
+  END_DOC
+ implicit none
+ integer :: i, j, k
+
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        tc_2e_3idx_coulomb_integrals_transp(j, k,i) = mo_bi_ortho_tc_two_e_transp(j ,k ,j ,i ) 
+        tc_2e_3idx_exchange_integrals_transp(j,k,i) = mo_bi_ortho_tc_two_e_transp(k ,j ,j ,i ) 
+      enddo
+    enddo
+  enddo
+
+END_PROVIDER 
--- a/plugins/local/bi_ortho_mos/bi_ort_mos_in_r.irp.f
+++ b/plugins/local/bi_ortho_mos/bi_ort_mos_in_r.irp.f
@ -1,135 +1,70 @@

-! TODO: left & right MO without duplicate AO calculation
-
-! ---
-
-BEGIN_PROVIDER[double precision, mos_r_in_r_array, (mo_num, n_points_final_grid)]
+ BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid, mo_num)]
+&BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid, mo_num)]

  BEGIN_DOC
-  ! mos_in_r_array(i,j) = value of the ith RIGHT mo on the jth grid point
+  !
+  ! mos_l_in_r_array_transp(i,j) = value of the jth left-mo  on the ith grid point
+  ! mos_r_in_r_array_transp(i,j) = value of the jth right-mo on the ith grid point
+  !
  END_DOC

  implicit none
-  integer          :: i, j
-  double precision :: mos_array(mo_num), r(3)

- !$OMP PARALLEL DO &
- !$OMP DEFAULT (NONE)  &
- !$OMP PRIVATE (i, j, r, mos_array) & 
- !$OMP SHARED (mos_r_in_r_array, n_points_final_grid, mo_num, final_grid_points)
+  integer                       :: i
+  double precision              :: tt0, tt1, tt2, tt3
+  double precision              :: r(3)
+  double precision, allocatable :: aos_r(:,:)
+
+  call wall_time(tt0)
+
+  allocate(aos_r(ao_num,n_points_final_grid))
+
+  ! provide everything required before OpenMP
+  r(1) = final_grid_points(1,1)
+  r(2) = final_grid_points(2,1)
+  r(3) = final_grid_points(3,1)
+  call give_all_aos_at_r(r, aos_r(1,1))
+
+
+  call wall_time(tt2)
+
+  !$OMP PARALLEL       &
+  !$OMP DEFAULT (NONE) &
+  !$OMP PRIVATE (i, r) & 
+  !$OMP SHARED(n_points_final_grid, final_grid_points, aos_r)
+  !$OMP DO
  do i = 1, n_points_final_grid
    r(1) = final_grid_points(1,i)
    r(2) = final_grid_points(2,i)
    r(3) = final_grid_points(3,i)
-    call give_all_mos_r_at_r(r, mos_array)
-    do j = 1, mo_num
-      mos_r_in_r_array(j,i) = mos_array(j)
-    enddo
+    call give_all_aos_at_r(r, aos_r(1,i))
  enddo
- !$OMP END PARALLEL DO
- 
-END_PROVIDER
+  !$OMP END DO
+  !$OMP END PARALLEL

-! ---
+  call wall_time(tt3)
+  write(*,"(A,2X,F15.7)") ' wall time for AOs on r (sec) = ', (tt3 - tt2)

-BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp, (n_points_final_grid, mo_num)]

-  BEGIN_DOC
-  ! mos_r_in_r_array_transp(i,j) = value of the jth mo on the ith grid point
-  END_DOC
+  call dgemm("T", "N", n_points_final_grid, mo_num, ao_num, &
+             1.d0,                                          &
+             aos_r(1,1), ao_num,                            &
+             mo_l_coef(1,1), ao_num,                        &
+             0.d0,                                          &
+             mos_l_in_r_array_transp(1,1), n_points_final_grid)

-  implicit none
-  integer :: i,j
+  call dgemm("T", "N", n_points_final_grid, mo_num, ao_num, &
+             1.d0,                                          &
+             aos_r(1,1), ao_num,                            &
+             mo_r_coef(1,1), ao_num,                        &
+             0.d0,                                          &
+             mos_r_in_r_array_transp(1,1), n_points_final_grid)

-  do i = 1, n_points_final_grid
-    do j = 1, mo_num
-      mos_r_in_r_array_transp(i,j) = mos_r_in_r_array(j,i) 
-    enddo
-  enddo
-
-END_PROVIDER
-
-! ---
-
-subroutine give_all_mos_r_at_r(r, mos_r_array)
-
-  BEGIN_DOC
-  ! mos_r_array(i) = ith RIGHT MO function evaluated at "r"
-  END_DOC
-
-  implicit none
-  double precision, intent(in)  :: r(3)
-  double precision, intent(out) :: mos_r_array(mo_num)
-  double precision              :: aos_array(ao_num)
-
-  call give_all_aos_at_r(r, aos_array)
-  call dgemv('N', mo_num, ao_num, 1.d0, mo_r_coef_transp, mo_num, aos_array, 1, 0.d0, mos_r_array, 1)
-
-end subroutine give_all_mos_r_at_r
-
-! ---
-
-BEGIN_PROVIDER[double precision, mos_l_in_r_array, (mo_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  ! mos_in_r_array(i,j) = value of the ith LEFT mo on the jth grid point
-  END_DOC
-
-  implicit none
-  integer          :: i, j
-  double precision :: mos_array(mo_num), r(3)
-
- !$OMP PARALLEL DO &
- !$OMP DEFAULT (NONE)  &
- !$OMP PRIVATE (i,r,mos_array,j) & 
- !$OMP SHARED(mos_l_in_r_array,n_points_final_grid,mo_num,final_grid_points)
-  do i = 1, n_points_final_grid
-    r(1) = final_grid_points(1,i)
-    r(2) = final_grid_points(2,i)
-    r(3) = final_grid_points(3,i)
-    call give_all_mos_l_at_r(r, mos_array)
-    do j = 1, mo_num
-      mos_l_in_r_array(j,i) = mos_array(j)
-    enddo
-  enddo
- !$OMP END PARALLEL DO
- 
-END_PROVIDER
-
-! ---
-
-subroutine give_all_mos_l_at_r(r, mos_l_array)
-
-  BEGIN_DOC
-  ! mos_l_array(i) = ith LEFT MO function evaluated at "r"
-  END_DOC
-
-  implicit none
-  double precision, intent(in)  :: r(3)
-  double precision, intent(out) :: mos_l_array(mo_num)
-  double precision              :: aos_array(ao_num)
-
-  call give_all_aos_at_r(r, aos_array)
-  call dgemv('N', mo_num, ao_num, 1.d0, mo_l_coef_transp, mo_num, aos_array, 1, 0.d0, mos_l_array, 1)
-
-end subroutine give_all_mos_l_at_r
-
-! ---
-
-BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp, (n_points_final_grid,mo_num)]
-
-  BEGIN_DOC
-  ! mos_l_in_r_array_transp(i,j) = value of the jth mo on the ith grid point
-  END_DOC
-
-  implicit none
-  integer :: i, j
-
-  do i = 1, n_points_final_grid
-    do j = 1, mo_num
-      mos_l_in_r_array_transp(i,j) = mos_l_in_r_array(j,i) 
-    enddo
-  enddo
+  deallocate(aos_r)
+
+  call wall_time(tt1)
+  write(*,"(A,2X,F15.7)") ' wall time for mos_l_in_r_array_transp & mos_r_in_r_array_transp (sec) = ', (tt1 - tt0)

 END_PROVIDER

--- a/plugins/local/bi_ortho_mos/bi_ort_mos_in_r_old.irp.f
+++ b/plugins/local/bi_ortho_mos/bi_ort_mos_in_r_old.irp.f
@ -0,0 +1,137 @@
+
+! TODO: left & right MO without duplicate AO calculation
+
+! ---
+
+BEGIN_PROVIDER[double precision, mos_r_in_r_array, (mo_num, n_points_final_grid)]
+
+  BEGIN_DOC
+  ! mos_in_r_array(i,j) = value of the ith RIGHT mo on the jth grid point
+  END_DOC
+
+  implicit none
+  integer          :: i, j
+  double precision :: mos_array(mo_num), r(3)
+
+ !$OMP PARALLEL DO &
+ !$OMP DEFAULT (NONE)  &
+ !$OMP PRIVATE (i, j, r, mos_array) & 
+ !$OMP SHARED (mos_r_in_r_array, n_points_final_grid, mo_num, final_grid_points)
+  do i = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    call give_all_mos_r_at_r(r, mos_array)
+    do j = 1, mo_num
+      mos_r_in_r_array(j,i) = mos_array(j)
+    enddo
+  enddo
+ !$OMP END PARALLEL DO
+ 
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER[double precision, mos_r_in_r_array_transp_old, (n_points_final_grid, mo_num)]
+
+  BEGIN_DOC
+  ! mos_r_in_r_array_transp_old(i,j) = value of the jth mo on the ith grid point
+  END_DOC
+
+  implicit none
+  integer :: i,j
+
+  do i = 1, n_points_final_grid
+    do j = 1, mo_num
+      mos_r_in_r_array_transp_old(i,j) = mos_r_in_r_array(j,i) 
+    enddo
+  enddo
+
+END_PROVIDER
+
+! ---
+
+subroutine give_all_mos_r_at_r(r, mos_r_array)
+
+  BEGIN_DOC
+  ! mos_r_array(i) = ith RIGHT MO function evaluated at "r"
+  END_DOC
+
+  implicit none
+  double precision, intent(in)  :: r(3)
+  double precision, intent(out) :: mos_r_array(mo_num)
+  double precision              :: aos_array(ao_num)
+
+  call give_all_aos_at_r(r, aos_array)
+  call dgemv('N', mo_num, ao_num, 1.d0, mo_r_coef_transp, mo_num, aos_array, 1, 0.d0, mos_r_array, 1)
+
+end subroutine give_all_mos_r_at_r
+
+! ---
+
+BEGIN_PROVIDER[double precision, mos_l_in_r_array, (mo_num, n_points_final_grid)]
+
+  BEGIN_DOC
+  ! mos_in_r_array(i,j) = value of the ith LEFT mo on the jth grid point
+  END_DOC
+
+  implicit none
+  integer          :: i, j
+  double precision :: mos_array(mo_num), r(3)
+
+ !$OMP PARALLEL DO &
+ !$OMP DEFAULT (NONE)  &
+ !$OMP PRIVATE (i,r,mos_array,j) & 
+ !$OMP SHARED(mos_l_in_r_array,n_points_final_grid,mo_num,final_grid_points)
+  do i = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    call give_all_mos_l_at_r(r, mos_array)
+    do j = 1, mo_num
+      mos_l_in_r_array(j,i) = mos_array(j)
+    enddo
+  enddo
+ !$OMP END PARALLEL DO
+ 
+END_PROVIDER
+
+! ---
+
+subroutine give_all_mos_l_at_r(r, mos_l_array)
+
+  BEGIN_DOC
+  ! mos_l_array(i) = ith LEFT MO function evaluated at "r"
+  END_DOC
+
+  implicit none
+  double precision, intent(in)  :: r(3)
+  double precision, intent(out) :: mos_l_array(mo_num)
+  double precision              :: aos_array(ao_num)
+
+  call give_all_aos_at_r(r, aos_array)
+  call dgemv('N', mo_num, ao_num, 1.d0, mo_l_coef_transp, mo_num, aos_array, 1, 0.d0, mos_l_array, 1)
+
+end subroutine give_all_mos_l_at_r
+
+! ---
+
+BEGIN_PROVIDER[double precision, mos_l_in_r_array_transp_old, (n_points_final_grid,mo_num)]
+
+  BEGIN_DOC
+  ! mos_l_in_r_array_transp_old(i,j) = value of the jth mo on the ith grid point
+  END_DOC
+
+  implicit none
+  integer :: i, j
+
+  do i = 1, n_points_final_grid
+    do j = 1, mo_num
+      mos_l_in_r_array_transp_old(i,j) = mos_l_in_r_array(j,i) 
+    enddo
+  enddo
+
+END_PROVIDER
+
+! ---
+
--- a/plugins/local/bi_ortho_mos/overlap.irp.f
+++ b/plugins/local/bi_ortho_mos/overlap.irp.f
@ -56,10 +56,10 @@
    print*,'Average trace of overlap_bi_ortho is different from 1 by ', dabs(accu_d-1.d0)
    print*,'And bi orthogonality is off by an average of ',accu_nd
    print*,'****************'
-    print*,'Overlap matrix betwee mo_l_coef and mo_r_coef  '
-    do i = 1, mo_num
-      write(*,'(100(F16.10,X))')overlap_bi_ortho(i,:)
-    enddo
+    !print*,'Overlap matrix betwee mo_l_coef and mo_r_coef  '
+    !do i = 1, mo_num
+    !  write(*,'(100(F16.10,X))')overlap_bi_ortho(i,:)
+    !enddo
  endif
  print*,'Average trace of overlap_bi_ortho (should be 1.)'
  print*,'accu_d  = ',accu_d
--- a/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d0_transp.irp.f
@ -0,0 +1,108 @@
+subroutine get_d0_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices/conjg should be okay for complex
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind) :: det(N_int, 2)
+  double precision, intent(in) :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+
+  integer :: i, j, k, s, h1, h2, p1, p2, puti, putj, mm
+  double precision :: phase
+  double precision :: hij,hji
+  double precision, external :: get_phase_bi
+  logical :: ok
+
+  integer, parameter :: bant=1
+  double precision, allocatable :: hij_cache1(:), hij_cache2(:)
+  allocate (hij_cache1(mo_num),hij_cache2(mo_num))
+  double precision, allocatable :: hji_cache1(:), hji_cache2(:)
+  allocate (hji_cache1(mo_num),hji_cache2(mo_num))
+!  print*,'in get_d0_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  if(sp == 3) then ! AB
+    h1 = p(1,1)
+    h2 = p(1,2)
+    do p1=1, mo_num
+      if(bannedOrb(p1, 1)) cycle
+!      call get_mo_two_e_integrals_complex(p1,h2,h1,mo_num,hij_cache1,mo_integrals_map)
+      do mm = 1, mo_num
+       hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,p1,h2,h1)
+       hji_cache1(mm) = mo_bi_ortho_tc_two_e_transp(mm,p1,h2,h1)
+      enddo
+      !!!!!!!!!! <alpha|H|psi>
+      do p2=1, mo_num
+        if(bannedOrb(p2,2)) cycle
+        if(banned(p1, p2, bant)) cycle ! rentable?
+        if(p1 == h1 .or. p2 == h2) then
+          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
+          ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e_both(det,gen,N_int, hij,hji)
+        else
+          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+          hij = hij_cache1(p2) * phase
+          hji = hji_cache1(p2) * phase
+        end if
+        if (hij == 0.d0.or.hji == 0.d0) cycle
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij  ! HOTSPOT
+          mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji  ! HOTSPOT
+        enddo
+      end do
+    end do
+
+  else ! AA BB
+    p1 = p(1,sp)
+    p2 = p(2,sp)
+    do puti=1, mo_num
+      if(bannedOrb(puti, sp)) cycle
+!      call get_mo_two_e_integrals_complex(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,puti,p2,p1)
+       hij_cache2(mm) = mo_bi_ortho_tc_two_e(mm,puti,p1,p2)
+       hji_cache1(mm) = mo_bi_ortho_tc_two_e_transp(mm,puti,p2,p1)
+       hji_cache2(mm) = mo_bi_ortho_tc_two_e_transp(mm,puti,p1,p2)
+      enddo
+      !!!!!!!!!! <alpha|H|psi>
+      do putj=puti+1, mo_num
+        if(bannedOrb(putj, sp)) cycle
+        if(banned(puti, putj, bant)) cycle ! rentable?
+        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
+          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
+          !call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e_both(det,gen,N_int, hij,hji)
+         if (hij == 0.d0.or.hji == 0.d0) cycle
+        else
+!          hij = (mo_two_e_integral_complex(p1, p2, puti, putj) -  mo_two_e_integral_complex(p2, p1, puti, putj))
+!          hij = (mo_bi_ortho_tc_two_e(p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e(p2, p1, puti, putj))
+          hij = (mo_bi_ortho_tc_two_e(puti, putj, p1, p2) -  mo_bi_ortho_tc_two_e(puti, putj, p2, p1))
+          hji = (mo_bi_ortho_tc_two_e_transp(puti, putj, p1, p2) -  mo_bi_ortho_tc_two_e_transp(puti, putj, p2, p1))
+          if (hij == 0.d0.or.hji == 0.d0) cycle
+          phase = get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+          hij = (hij) * phase
+          hji = (hji) * phase
+        end if
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+          mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+        enddo
+      end do
+
+    end do
+  end if
+
+  deallocate(hij_cache1,hij_cache2)
+end
+
--- a/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d1_transp.irp.f
@ -0,0 +1,358 @@
+subroutine get_d1_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices should be okay for complex?
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in)  :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in)  :: phasemask(N_int,2)
+  logical, intent(in)            :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind)              :: det(N_int, 2)
+  double precision, intent(in)   :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
+  double precision, external     :: get_phase_bi
+  double precision, external     :: mo_two_e_integral_complex
+  logical                        :: ok
+
+  logical, allocatable           :: lbanned(:,:)
+  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j, istate
+  integer                        :: hfix, pfix, h1, h2, p1, p2, ib, k, l, mm
+
+  integer, parameter             :: turn2(2) = (/2,1/)
+  integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer                        :: bant
+  double precision, allocatable :: hij_cache(:,:)
+  double precision               :: hij, tmp_rowij(N_states, mo_num), tmp_rowij2(N_states, mo_num),phase
+  double precision, allocatable :: hji_cache(:,:)
+  double precision               :: hji, tmp_rowji(N_states, mo_num), tmp_rowji2(N_states, mo_num)
+!  PROVIDE mo_integrals_map N_int
+!  print*,'in get_d1_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  allocate (lbanned(mo_num, 2))
+  allocate (hij_cache(mo_num,2))
+  allocate (hji_cache(mo_num,2))
+  lbanned = bannedOrb
+
+  do i=1, p(0,1)
+    lbanned(p(i,1), 1) = .true.
+  end do
+  do i=1, p(0,2)
+    lbanned(p(i,2), 2) = .true.
+  end do
+
+  ma = 1
+  if(p(0,2) >= 2) ma = 2
+  mi = turn2(ma)
+
+  bant = 1
+
+  if(sp == 3) then
+    !move MA
+    if(ma == 2) bant = 2
+    puti = p(1,mi)
+    hfix = h(1,ma)
+    p1 = p(1,ma)
+    p2 = p(2,ma)
+    if(.not. bannedOrb(puti, mi)) then
+!      call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2)
+       hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
+       hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2)
+       hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1)
+       do istate = 1,N_states
+        tmp_rowij(istate,mm)  = 0.d0
+        tmp_rowji(istate,mm)  = 0.d0
+       enddo
+      enddo
+      !! <alpha|H|psi>
+      do putj=1, hfix-1
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hij = hij_cache(putj,1) - hij_cache(putj,2)
+        hji = hji_cache(putj,1) - hji_cache(putj,2)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,2)
+            tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,1)
+          enddo
+        endif
+      end do
+      do putj=hfix+1, mo_num
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hij = hij_cache(putj,2) - hij_cache(putj,1)
+        hji = hji_cache(putj,2) - hji_cache(putj,1)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,2)
+            tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,1)
+          enddo
+        endif
+      end do
+
+      if(ma == 1) then
+        mat_r(1:N_states,1:mo_num,puti) = mat_r(1:N_states,1:mo_num,puti) + tmp_rowij(1:N_states,1:mo_num)
+        mat_l(1:N_states,1:mo_num,puti) = mat_l(1:N_states,1:mo_num,puti) + tmp_rowji(1:N_states,1:mo_num)
+      else
+        do l=1,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k,puti,l) = mat_r(k,puti,l) + tmp_rowij(k,l)
+            mat_l(k,puti,l) = mat_l(k,puti,l) + tmp_rowji(k,l)
+          enddo
+        enddo
+      end if
+
+    end if
+
+    !MOVE MI
+    pfix = p(1,mi)
+!    call get_mo_two_e_integrals_complex(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!    call get_mo_two_e_integrals_complex(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+    do mm = 1, mo_num
+     do istate = 1,N_states
+      tmp_rowij(istate,mm)  = 0.d0
+      tmp_rowij2(istate,mm) = 0.d0
+      tmp_rowji(istate,mm)  = 0.d0
+      tmp_rowji2(istate,mm) = 0.d0
+     enddo
+     hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p1)
+     hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p2)
+     hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p1)
+     hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,pfix,p2)
+    enddo
+    putj = p1
+    !! <alpha|H|psi>
+    do puti=1,mo_num !HOT
+      if(lbanned(puti,mi)) cycle
+      !p1 fixed
+      putj = p1
+      if(.not. banned(putj,puti,bant)) then
+        hij = hij_cache(puti,2)
+        hji = hji_cache(puti,2)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,2)
+            tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,1)
+          enddo
+        endif
+      end if
+!      
+      putj = p2
+      if(.not. banned(putj,puti,bant)) then
+        hij = hij_cache(puti,1)
+        hji = hji_cache(puti,1)
+        if (hij /= 0.d0.and.hji/=0.d0) then
+          phase = get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          do k=1,N_states
+            tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,2)
+            tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,1)
+          enddo
+        endif
+      end if
+    end do
+
+    if(mi == 1) then
+      mat_r(:,:,p1) = mat_r(:,:,p1) + tmp_rowij(:,:)
+      mat_r(:,:,p2) = mat_r(:,:,p2) + tmp_rowij2(:,:)
+      mat_l(:,:,p1) = mat_l(:,:,p1) + tmp_rowji(:,:)
+      mat_l(:,:,p2) = mat_l(:,:,p2) + tmp_rowji2(:,:)
+    else
+      do l=1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij(k,l)
+          mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij2(k,l)
+          mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji(k,l)
+          mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji2(k,l)
+        enddo
+      enddo
+    end if
+
+  else  ! sp /= 3
+
+    if(p(0,ma) == 3) then
+      do i=1,3
+        hfix = h(1,ma)
+        puti = p(i, ma)
+        p1 = p(turn3(1,i), ma)
+        p2 = p(turn3(2,i), ma)
+!        call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!        call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+        do mm = 1, mo_num
+         hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2)
+         hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
+         hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,p2)
+         hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,p1)
+         do istate = 1, N_states
+          tmp_rowij(istate,mm) = 0.d0
+          tmp_rowji(istate,mm) = 0.d0
+         enddo
+        enddo
+    !! <alpha|H|psi>
+        do putj=1,hfix-1
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hij = hij_cache(putj,1) - hij_cache(putj,2)
+          hji = hji_cache(putj,1) - hji_cache(putj,2)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,2)
+            tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,1)
+          endif
+        end do
+        do putj=hfix+1,mo_num
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hij = hij_cache(putj,2) - hij_cache(putj,1)
+          hji = hji_cache(putj,2) - hji_cache(putj,1)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,2)
+            tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,1)
+          endif
+        end do
+
+        mat_r(:, :puti-1, puti) = mat_r(:, :puti-1, puti) + tmp_rowij(:,:puti-1)
+        mat_l(:, :puti-1, puti) = mat_l(:, :puti-1, puti) + tmp_rowji(:,:puti-1)
+        do l=puti,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, l) = mat_r(k, puti,l) + tmp_rowij(k,l)
+            mat_l(k, puti, l) = mat_l(k, puti,l) + tmp_rowji(k,l)
+          enddo
+        enddo
+      end do
+    else
+      hfix = h(1,mi)
+      pfix = p(1,mi)
+      p1 = p(1,ma)
+      p2 = p(2,ma)
+!      call get_mo_two_e_integrals_complex(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,pfix)
+       hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,pfix)
+       hji_cache(mm,1) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p1,pfix)
+       hji_cache(mm,2) = mo_bi_ortho_tc_two_e_transp(mm,hfix,p2,pfix)
+       do istate = 1,N_states
+        tmp_rowij (istate,mm) =  0.d0
+        tmp_rowij2(istate,mm) = 0.d0
+        tmp_rowji (istate,mm) =  0.d0
+        tmp_rowji2(istate,mm) = 0.d0
+       enddo
+      enddo
+      putj = p2
+    !! <alpha|H|psi>
+      do puti=1,mo_num
+        if(lbanned(puti,ma)) cycle
+        putj = p2
+        if(.not. banned(puti,putj,1)) then
+          hij = hij_cache(puti,1)
+          hji = hji_cache(puti,1)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,2)
+              tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,1)
+            enddo
+          endif
+        end if
+
+        putj = p1
+        if(.not. banned(puti,putj,1)) then
+          hij = hij_cache(puti,2)
+          hji = hji_cache(puti,2)
+          if (hij /= 0.d0.and.hji/=0.d0) then
+            phase = get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            do k=1,N_states
+              tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,2)
+              tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,1)
+            enddo
+          endif
+        end if
+      end do
+      mat_r(:,:p2-1,p2) = mat_r(:,:p2-1,p2) + tmp_rowij(:,:p2-1)
+      mat_l(:,:p2-1,p2) = mat_l(:,:p2-1,p2) + tmp_rowji(:,:p2-1)
+      do l=p2,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij(k,l)
+          mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji(k,l)
+        enddo
+      enddo
+      mat_r(:,:p1-1,p1) = mat_r(:,:p1-1,p1) + tmp_rowij2(:,:p1-1)
+      mat_l(:,:p1-1,p1) = mat_l(:,:p1-1,p1) + tmp_rowji2(:,:p1-1)
+      do l=p1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij2(k,l)
+          mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji2(k,l)
+        enddo
+      enddo
+    end if
+  end if
+  deallocate(lbanned,hij_cache, hji_cache)
+
+ !! MONO
+    if(sp == 3) then
+      s1 = 1
+      s2 = 2
+    else
+      s1 = sp
+      s2 = sp
+    end if
+
+    do i1=1,p(0,s1)
+      ib = 1
+      if(s1 == s2) ib = i1+1
+      do i2=ib,p(0,s2)
+        p1 = p(i1,s1)
+        p2 = p(i2,s2)
+        if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle
+        call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
+        ! gen is a selector; mask is ionized generator; det is alpha
+        ! hij is contribution to <psi|H|alpha>
+!        call i_h_j_complex(gen, det, N_int, hij)
+        call htilde_mu_mat_opt_bi_ortho_no_3e_both(det, gen, N_int, hij,hji)
+!        call htilde_mu_mat_opt_bi_ortho_no_3e(gen, det, N_int, hji)
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,1) * dconjg(hij)
+          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,2) * hij
+          mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,1) * hji
+        enddo
+      end do
+    end do
+end
+
--- a/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d2_good.irp.f
@ -25,9 +25,6 @@ subroutine get_d2_new(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h,

  integer :: bant
  bant = 1
-!  print*, 'in get_d2_new'
-!  call debug_det(gen,N_int)
-!  print*,'coefs',coefs(1,:)

  tip = p(0,1) * p(0,2) ! number of alpha particles times number of beta particles

--- a/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/get_d2_transp.irp.f
@ -0,0 +1,235 @@
+
+subroutine get_d2_new_transp(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices/conjg should be correct for complex
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  double precision, intent(in) :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+
+  double precision, external :: get_phase_bi
+
+  integer :: i, j, k, tip, ma, mi, puti, putj
+  integer :: h1, h2, p1, p2, i1, i2
+  double precision :: phase
+  double precision :: hij,hji
+
+  integer, parameter:: turn2d(2,3,4) = reshape((/0,0, 0,0, 0,0,  3,4, 0,0, 0,0,  2,4, 1,4, 0,0,  2,3, 1,3, 1,2 /), (/2,3,4/))
+  integer, parameter :: turn2(2) = (/2, 1/)
+  integer, parameter :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer :: bant
+  bant = 1
+
+  tip = p(0,1) * p(0,2) ! number of alpha particles times number of beta particles
+
+  ma = sp !1:(alpha,alpha); 2:(b,b); 3:(a,b)
+  if(p(0,1) > p(0,2)) ma = 1 ! more alpha particles than beta particles
+  if(p(0,1) < p(0,2)) ma = 2 ! fewer alpha particles than beta particles
+  mi = mod(ma, 2) + 1
+
+  if(sp == 3) then ! if one alpha and one beta xhole 
+    !(where xholes refer to the ionizations from the generator, not the holes occupied in the ionized generator)
+    if(ma == 2) bant = 2 ! if more beta particles than alpha particles
+
+    if(tip == 3) then ! if 3 of one particle spin and 1 of the other particle spin
+      puti = p(1, mi)
+      if(bannedOrb(puti, mi)) return
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+
+      !! <alpha|H|psi>
+      do i = 1, 3    ! loop over all 3 combinations of 2 particles with spin ma
+        putj = p(i, ma)
+        if(banned(putj,puti,bant)) cycle
+        i1 = turn3(1,i)
+        i2 = turn3(2,i)
+        p1 = p(i1, ma)
+        p2 = p(i2, ma)
+        
+     ! |G> = |psi_{gen,i}>
+     ! |G'> = a_{x1} a_{x2} |G>
+     ! |alpha> = a_{puti}^{\dagger} a_{putj}^{\dagger} |G'>
+     ! |alpha> = t_{x1,x2}^{puti,putj} |G>
+     ! hij = <psi_{selectors,i}|H|alpha>
+     ! |alpha> = t_{p1,p2}^{h1,h2}|psi_{selectors,i}>
+        !todo: <i|H|j>  =  (<h1,h2|p1,p2> - <h1,h2|p2,p1>) * phase
+        !    <psi|H|j> +=  dconjg(c_i) * <i|H|j>
+        !      <j|H|i>  =  (<p1,p2|h1,h2> - <p2,p1|h1,h2>) * phase
+        !    <j|H|psi> +=  <j|H|i> * c_i
+
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+
+!        hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e( h1, h2, p2, p1)
+!        hji = mo_bi_ortho_tc_two_e_transp(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e_transp( h1, h2, p2, p1)
+        hij = mo_bi_ortho_tc_two_e_transp(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e_transp( p1, p2, h2, h1)
+        hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)       - mo_bi_ortho_tc_two_e( p1, p2, h2, h1)
+        if (hij == 0.d0.or.hji==0.d0) cycle
+
+        ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!        hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        phase = get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        hij = hij * phase
+        hji = hji * phase
+
+        if(ma == 1) then ! if particle spins are (alpha,alpha,alpha,beta), then puti is beta and putj is alpha
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,2) * hij
+            mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,1) * hji
+          enddo
+        else            ! if particle spins are (beta,beta,beta,alpha), then puti is alpha and putj is beta
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+          enddo
+        end if
+      end do
+    else ! if 2 alpha and 2 beta particles
+      h1 = h(1,1)
+      h2 = h(1,2)
+      !! <alpha|H|psi>
+      do j = 1,2 ! loop over all 4 combinations of one alpha and one beta particle
+        putj = p(j, 2)
+        if(bannedOrb(putj, 2)) cycle
+        p2 = p(turn2(j), 2)
+        do i = 1,2
+          puti = p(i, 1)
+          if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle
+          p1 = p(turn2(i), 1)
+    ! hij = <psi_{selectors,i}|H|alpha> 
+!          hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+!          hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2 )
+!          hji = mo_bi_ortho_tc_two_e_transp(h1, h2, p1, p2 )
+          hij = mo_bi_ortho_tc_two_e_transp(p1, p2 ,h1, h2 )
+          hji = mo_bi_ortho_tc_two_e( p1, p2, h1, h2)
+          if (hij /= 0.d0.or.hji==0.d0) then
+            ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!            hij = dconjg(hij) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            hij = hij * phase
+            hji = hji * phase
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+              mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+            enddo
+          endif
+        end do
+      end do
+    end if
+
+  else ! if holes are (a,a) or (b,b)
+    if(tip == 0) then ! if particles are (a,a,a,a) or (b,b,b,b)
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+      !! <alpha|H|psi>
+      do i=1,3
+        puti = p(i, ma)
+        if(bannedOrb(puti,ma)) cycle
+        do j=i+1,4
+          putj = p(j, ma)
+          if(bannedOrb(putj,ma)) cycle
+          if(banned(puti,putj,1)) cycle
+
+          i1 = turn2d(1, i, j)
+          i2 = turn2d(2, i, j)
+          p1 = p(i1, ma)
+          p2 = p(i2, ma)
+!          hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+          hij = mo_bi_ortho_tc_two_e_transp(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e_transp(p1, p2, h2,h1 )
+          hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p1, p2, h2,h1 )
+          if (hij == 0.d0.or.hji == 0.d0) cycle
+
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          phase = get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          hij = hij * phase
+          hji = hji * phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) +coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) +coefs(k,1) * hji
+          enddo
+        end do
+      end do
+    else if(tip == 3) then ! if particles are (a,a,a,b) (ma=1,mi=2) or (a,b,b,b) (ma=2,mi=1)
+      h1 = h(1, mi)
+      h2 = h(1, ma)
+      p1 = p(1, mi)
+      !! <alpha|H|psi>
+      do i=1,3
+        puti = p(turn3(1,i), ma)
+        if(bannedOrb(puti,ma)) cycle
+        putj = p(turn3(2,i), ma)
+        if(bannedOrb(putj,ma)) cycle
+        if(banned(puti,putj,1)) cycle
+        p2 = p(i, ma)
+
+!        hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = mo_bi_ortho_tc_two_e_transp(p1, p2 ,h1, h2)
+        hji = mo_bi_ortho_tc_two_e(p1, p2,h1, h2 )
+        if (hij == 0.d0) cycle
+
+        ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!        hij = dconjg(hij) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        phase = get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        hij = hij * phase
+        hji = hji * phase
+        if (puti < putj) then
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+          enddo
+        else
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,2) * hij
+            mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,1) * hji
+          enddo
+        endif
+      end do
+    else ! tip == 4  (a,a,b,b)
+      puti = p(1, sp)
+      putj = p(2, sp)
+      if(.not. banned(puti,putj,1)) then
+        p1 = p(1, mi)
+        p2 = p(2, mi)
+        h1 = h(1, mi)
+        h2 = h(2, mi)
+      !! <alpha|H|psi>
+!        hij = (mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2))
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = (mo_bi_ortho_tc_two_e_transp(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e_transp(p2,p1,h1, h2))
+        hji = (mo_bi_ortho_tc_two_e(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e(p2,p1,h1, h2))
+        if (hij /= 0.d0.or.hji==0.d0) then
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          hij = dconjg(hij) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          phase = get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          hij = hij * phase
+          hji = hji* phase
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,2) * hij
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,1) * hji
+          enddo
+        end if
+      end if
+    end if
+  end if
+end
--- a/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/pt2.irp.f
@ -65,8 +65,12 @@ subroutine tc_pt2
  call pt2_dealloc(pt2_data_err)
  call pt2_alloc(pt2_data, N_states)
  call pt2_alloc(pt2_data_err, N_states)
+  if(transpose_two_e_int)then
+   provide mo_bi_ortho_tc_two_e_transp tc_2e_3idx_coulomb_integrals_transp
+  endif
  call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+  call print_summary_tc(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2)

 end

--- a/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/selection.irp.f
@ -636,10 +636,7 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere
    negMask(i,2) = not(mask(i,2))
  end do

-!  print*,'in selection '
  do i = 1, N_sel
-!    call debug_det(det(1,1,i),N_int)
-!    print*,i,dabs(psi_selectors_coef_transp_tc(1,2,i) * psi_selectors_coef_transp_tc(1,1,i))
    if(interesting(i) < 0) then
      stop 'prefetch interesting(i) and det(i)'
    endif
@ -691,11 +688,23 @@ subroutine splash_pq(mask, sp, det, i_gen, N_sel, bannedOrb, banned, mat, intere

        call get_mask_phase(psi_det_sorted_tc(1,1,interesting(i)), phasemask,N_int)
        if(nt == 4) then
-          call get_d2_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          if(transpose_two_e_int)then
+           call get_d2_new_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          else
+           call get_d2_new       (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          endif
        elseif(nt == 3) then
-          call get_d1_new(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          if(transpose_two_e_int)then
+           call get_d1_transp(det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          else
+           call get_d1_new   (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+          endif
        else
-          call get_d0_new (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+         if(transpose_two_e_int)then
+          call get_d0_transp (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+         else
+          call get_d0_new    (det(1,1,i), phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, psi_selectors_coef_transp_tc(1, 1, interesting(i)))
+         endif
        endif
    elseif(nt == 4) then
        call bitstring_to_list_in_selection(mobMask(1,1), p(1,1), p(0,1), N_int)
@ -887,79 +896,11 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
        call diag_htilde_mu_mat_fock_bi_ortho(N_int, det, hmono, htwoe, hthree, hii)
        do istate = 1,N_states
          delta_E = E0(istate) - Hii + E_shift
-          double precision               :: alpha_h_psi_tmp, psi_h_alpha_tmp, error
-          if(debug_tc_pt2 == 1)then !! Using the old version
-            psi_h_alpha = 0.d0
-            alpha_h_psi = 0.d0
-            do iii = 1, N_det_selectors
-              call htilde_mu_mat_bi_ortho_tot_slow(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_bi_ortho_tot_slow(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-              call get_excitation_degree(psi_selectors(1,1,iii), det,degree,N_int)
-              if(degree == 0)then
-               print*,'problem !!!'
-               print*,'a determinant is already in the wave function !!'
-               print*,'it corresponds to the selector number ',iii
-               call debug_det(det,N_int)
-               stop
-              endif
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-              psi_h_alpha += i_h_alpha * psi_selectors_coef_tc(iii,2,1) ! left function
-              alpha_h_psi += alpha_h_i * psi_selectors_coef_tc(iii,1,1) ! right function
-            enddo
-          else if(debug_tc_pt2 == 2)then !! debugging the new version
-!            psi_h_alpha_tmp = 0.d0
-!            alpha_h_psi_tmp = 0.d0
-!            do iii = 1, N_det_selectors ! old version
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-!              call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-!              psi_h_alpha_tmp += i_h_alpha * psi_selectors_coef_tc(iii,1,1) ! left function
-!              alpha_h_psi_tmp += alpha_h_i * psi_selectors_coef_tc(iii,2,1) ! right function
-!            enddo
-            psi_h_alpha_tmp = mat_l(istate, p1, p2) ! new version
-            alpha_h_psi_tmp = mat_r(istate, p1, p2) ! new version
-            psi_h_alpha = 0.d0
-            alpha_h_psi = 0.d0
-            do iii = 1, N_det ! old version
-              call htilde_mu_mat_opt_bi_ortho_no_3e(psi_det(1,1,iii), det, N_int, i_h_alpha)
-              call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_det(1,1,iii), N_int, alpha_h_i)
-              psi_h_alpha += i_h_alpha * psi_l_coef_bi_ortho(iii,1) ! left function
-              alpha_h_psi += alpha_h_i * psi_r_coef_bi_ortho(iii,1) ! right function
-            enddo
-            if(dabs(psi_h_alpha*alpha_h_psi/delta_E).gt.1.d-10)then
-              error = dabs(psi_h_alpha * alpha_h_psi - psi_h_alpha_tmp * alpha_h_psi_tmp)/dabs(psi_h_alpha * alpha_h_psi)
-              if(error.gt.1.d-2)then
-                call debug_det(det, N_int)
-                print*,'error =',error,psi_h_alpha * alpha_h_psi/delta_E,psi_h_alpha_tmp * alpha_h_psi_tmp/delta_E
-                print*,psi_h_alpha , alpha_h_psi
-                print*,psi_h_alpha_tmp , alpha_h_psi_tmp
-                print*,'selectors '
-                do iii = 1, N_det_selectors ! old version
-                 print*,'iii',iii,psi_selectors_coef_tc(iii,1,1),psi_selectors_coef_tc(iii,2,1)
-                 call htilde_mu_mat_opt_bi_ortho_no_3e(psi_selectors(1,1,iii), det, N_int, i_h_alpha)
-                 call htilde_mu_mat_opt_bi_ortho_no_3e(det, psi_selectors(1,1,iii), N_int, alpha_h_i)
-                 print*,i_h_alpha,alpha_h_i
-                 call debug_det(psi_selectors(1,1,iii),N_int)
-                enddo
-!                print*,'psi_det '
-!                do iii = 1, N_det! old version
-!                 print*,'iii',iii,psi_l_coef_bi_ortho(iii,1),psi_r_coef_bi_ortho(iii,1)
-!                 call debug_det(psi_det(1,1,iii),N_int)
-!                enddo
-                stop
-              endif
-            endif
-          else
-           psi_h_alpha = mat_l(istate, p1, p2)
-           alpha_h_psi = mat_r(istate, p1, p2)
-          endif
+          psi_h_alpha = mat_l(istate, p1, p2)
+          alpha_h_psi = mat_r(istate, p1, p2)
          val = 4.d0 * psi_h_alpha * alpha_h_psi
          tmp = dsqrt(delta_E * delta_E + val)
-!          if (delta_E < 0.d0) then
-!              tmp = -tmp
-!          endif
          e_pert(istate) = 0.25 * val / delta_E
-!          e_pert(istate) = 0.5d0 * (tmp - delta_E)
          if(dsqrt(tmp).gt.1.d-4.and.dabs(psi_h_alpha).gt.1.d-4)then
           coef(istate)   = e_pert(istate) / psi_h_alpha
          else
@ -976,15 +917,6 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
           if(e_pert(istate).gt.0.d0)e_pert(istate)=0.d0
          endif

-!         if(selection_tc     ==  1 )then
-!          if(e_pert(istate).lt.0.d0)then
-!           e_pert(istate) = 0.d0
-!          endif
-!         else if(selection_tc == -1)then
-!          if(e_pert(istate).gt.0.d0)then
-!           e_pert(istate) = 0.d0
-!          endif
-!         endif
        enddo


--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@ -1,4 +1,36 @@

+
+! ---
+
+subroutine run_pouet
+
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+
+  use selection_types
+  implicit none
+  integer                       :: i, j, k, ndet
+  integer                       :: to_select
+  logical                       :: has
+  type(pt2_type)                :: pt2_data, pt2_data_err
+  double precision              :: rss
+  double precision              :: correlation_energy_ratio
+  double precision              :: hf_energy_ref
+  double precision              :: relative_error
+  double precision, allocatable :: zeros(:),E_tc(:), norm(:)
+
+  logical,          external    :: qp_stop
+  double precision, external    :: memory_of_double
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE H_apply_buffer_allocated distributed_davidson 
+
+  print*, ' Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
+  enddo
+end
 ! ---

 subroutine run_stochastic_cipsi
@ -88,6 +120,9 @@ subroutine run_stochastic_cipsi
    call pt2_dealloc(pt2_data_err)
    call pt2_alloc(pt2_data, N_states)
    call pt2_alloc(pt2_data_err, N_states)
+    if(transpose_two_e_int)then
+     provide mo_bi_ortho_tc_two_e_transp tc_2e_3idx_coulomb_integrals_transp
+    endif
    call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
 !    stop

@ -120,6 +155,7 @@ subroutine run_stochastic_cipsi
  call pt2_alloc(pt2_data_err, N_states)
  call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm)
+  call print_summary_tc(psi_energy_with_nucl_rep, pt2_data, pt2_data_err, N_det, N_configuration, N_states, psi_s2)
  call pt2_dealloc(pt2_data)
  call pt2_dealloc(pt2_data_err)

--- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
@ -65,7 +65,15 @@ subroutine run_cipsi_tc()

  if (.not. is_zmq_slave) then

-    PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
+     PROVIDE Fock_matrix_tc_mo_tot 
+     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
+     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
+    endif
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !! 
+     PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e 
+    endif

    if((elec_alpha_num+elec_beta_num) .ge. 3) then
      if(three_body_h_tc) then
@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
    call json_close

  else
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     ! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
+     PROVIDE Fock_matrix_tc_mo_tot 
+     ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot 
+     ! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided 
+    endif

-    PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !! 
+     PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    endif

    if((elec_alpha_num+elec_beta_num) .ge. 3) then
      if(three_body_h_tc) then
--- a/plugins/local/fci_tc_bi/pt2_tc.irp.f
+++ b/plugins/local/fci_tc_bi/pt2_tc.irp.f
@ -13,6 +13,8 @@ program tc_pt2_prog

  pruning = -1.d0
  touch pruning
+  read_wf = .True.
+  touch read_wf 

 !  pt2_relative_error = 0.01d0
 !  touch pt2_relative_error
--- a/plugins/local/gpu_intel/LIB
+++ b/plugins/local/gpu_intel/LIB
@ -0,0 +1,2 @@
+-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++ 
+
--- a/plugins/local/gpu_intel/NEED
+++ b/plugins/local/gpu_intel/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_intel/README.rst
+++ b/plugins/local/gpu_intel/README.rst
@ -0,0 +1,8 @@
+=========
+gpu_intel
+=========
+
+Intel implementation of GPU routines. Uses MKL and SYCL.
+```bash
+icpx -fsycl gpu.cxx -c -qmkl=sequential
+```
--- a/plugins/local/gpu_intel/gpu.sycl
+++ b/plugins/local/gpu_intel/gpu.sycl
@ -0,0 +1,177 @@
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <limits>
+#include <oneapi/mkl/blas.hpp>
+
+extern "C" {
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t igpu) {
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, int64_t size) {
+    auto queue = sycl::queue(sycl::default_selector_v);
+
+    try {
+        *ptr = sycl::malloc_shared(size, queue);
+        assert(*ptr != nullptr);
+    } catch (const sycl::exception& e) {
+        std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+        *ptr = nullptr; // If allocation fails, set pointer to nullptr
+    }
+}
+
+void gpu_deallocate(void** ptr) {
+    assert(*ptr != nullptr);
+    sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
+    *ptr = nullptr;
+}
+
+/* Upload data from host to device */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
+}
+
+/* Download data from device to host */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
+}
+
+/* Copy data from one GPU memory location to another */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
+}
+
+/* Queues */
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+    *ptr = new sycl::queue(sycl::default_selector_v);
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+    assert(*ptr != nullptr);
+    delete *ptr;
+    *ptr = nullptr;
+}
+
+void gpu_synchronize() {
+    sycl::queue queue(sycl::default_selector_v);
+    queue.wait_and_throw();
+}
+
+/* BLAS functions */
+
+typedef struct {
+  sycl::queue* queue;
+} blasHandle_t;
+
+void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
+  handle->queue = ptr;
+}
+
+void gpu_blas_create(blasHandle_t** ptr) {
+    *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
+    assert(*ptr != nullptr);
+    (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
+    assert((*ptr)->queue != nullptr);
+}
+
+void gpu_blas_destroy(blasHandle_t** ptr) {
+    assert(*ptr != nullptr);
+    delete (*ptr)->queue;
+    free(*ptr);
+    *ptr = nullptr;
+}
+
+
+void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx, 
+              const double* y, const int64_t incy, double* result) {
+    // Ensure input parameters are valid
+    assert(handle != nullptr);
+    assert(handle->queue != nullptr);
+    assert(n > 0);
+    assert(incx > 0);
+    assert(incy > 0);
+    assert(x != nullptr);
+    assert(y != nullptr);
+    assert(result != nullptr);
+
+    oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
+
+}
+
+void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+    assert(handle != nullptr);
+    assert(handle->queue != nullptr);
+
+    // Validate matrix dimensions and increments to be positive
+    assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
+    assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Determine the operation type
+    oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
+    if (*transa == 'T' || *transa == 't') {
+        transa_ = oneapi::mkl::transpose::trans;
+    }
+
+    // Perform DGEMV operation using oneMKL
+    oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
+
+}
+
+void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+    assert(handle != nullptr && handle->queue != nullptr);
+    assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
+    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Transpose operations
+    auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+    auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+
+    oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
+                                    *alpha, a, lda, b, ldb, *beta, c, ldc);
+
+}
+
+
+void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+    assert(handle != nullptr && handle->queue != nullptr);
+    assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
+    assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+    // Determine transpose operations
+    bool transA = (*transa == 'T' || *transa == 't');
+    bool transB = (*transb == 'T' || *transb == 't');
+
+    handle->queue->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
+            const int i = idx[0];
+            const int j = idx[1];
+            const int ai = transA ? j * lda + i : i * lda + j;
+            const int bi = transB ? j * ldb + i : i * ldb + j;
+            const int ci = i * ldc + j;
+
+            c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
+        });
+    });
+
+}
+
+}  // extern C
--- a/plugins/local/gpu_nvidia/LIB
+++ b/plugins/local/gpu_nvidia/LIB
@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
--- a/plugins/local/gpu_nvidia/NEED
+++ b/plugins/local/gpu_nvidia/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_nvidia/README.rst
+++ b/plugins/local/gpu_nvidia/README.rst
@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@ -0,0 +1,326 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice((int) igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+    size_t free, total;
+    cudaError_t rc = cudaMemGetInfo( &free, &total );
+    if (rc != cudaSuccess) {
+      free = INT64_MAX;
+    }
+
+    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    /* Use managed memory if it does not fit on the GPU */
+//    if (size < free && size < total/2) {
+//      rc= cudaMalloc(ptr, size);
+//    } else {
+//      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    }
+    assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
+  assert (rc == cudaSuccess);
+}
+
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  ptr = NULL;
+}
+
+
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+
+void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  float result_ = 0.;
+  cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *result = result_;
+}
+
+
+
+void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
+
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
+}
+
+
+
+void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, incx_, incy_;
+
+  m_    = (int) m;
+  n_    = (int) n;
+  lda_  = (int) lda;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
+}
+
+
+void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
+}
+
+
+
+void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  k_   = (int) k;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
+}
+
+
+void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
+
+}
+
+
+void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  /* Convert to int */
+  int m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int) m;
+  n_   = (int) n;
+  lda_ = (int) lda;
+  ldb_ = (int) ldb;
+  ldc_ = (int) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
+  if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
+
+}
--- a/plugins/local/gpu_x86/NEED
+++ b/plugins/local/gpu_x86/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/gpu_x86/README.rst
+++ b/plugins/local/gpu_x86/README.rst
@ -0,0 +1,5 @@
+=======
+gpu_x86
+=======
+
+x86 implementation of GPU routines. For use when GPUs are not available.
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@ -0,0 +1,502 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 0;
+}
+
+void gpu_set_device(int32_t i) {
+  return;
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t n) {
+  *ptr = malloc((size_t) n);
+  if (*ptr == NULL) {
+    perror("Allocation failed");
+  }
+}
+
+void gpu_deallocate(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  memcpy(gpu_ptr, cpu_ptr, n);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  memcpy(cpu_ptr, gpu_ptr, n);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  memcpy(gpu_ptr_dest, gpu_ptr_src, n);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  *ptr = (void*) malloc(sizeof(char));
+}
+
+void gpu_stream_destroy(void** ptr) {
+  free(*ptr);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void* handle, void* stream) {
+  return;
+}
+
+void gpu_synchronize() {
+  return;
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  *handle = (void*) malloc(sizeof(char));
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  free(*handle);
+  *handle = NULL;
+}
+
+
+double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
+
+void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = ddot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
+
+void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  *result = sdot_(&n_, x, &incx_, y, &incy_);
+}
+
+
+void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
+            const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
+
+void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
+}
+
+
+void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
+               const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
+
+void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
+}
+
+
+void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
+            const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
+
+void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
+}
+
+
+
+void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
+            const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
+
+void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
+
+  assert (handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
+}
+
+
+void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  if ( (*transa == 'N' && *transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (*transa == 'N' && *transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
+              (*transa == 'N' && *transb == 't') ||
+              (*transa == 'n' && *transb == 't') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (*transa == 'T' && *transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
+              (*transa == 'T' && *transb == 'n') ||
+              (*transa == 't' && *transb == 'n') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (*transa == 'T' && *transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
+              (*transa == 'T' && *transb == 't') ||
+              (*transa == 't' && *transb == 't') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
+
+
+void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
+               const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (handle != NULL);
+
+  if ( (*transa == 'N' && *transb == 'N') ||
+       (*transa == 'n' && *transb == 'N') ||
+       (*transa == 'N' && *transb == 'n') ||
+       (*transa == 'n' && *transb == 'n') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (*transa == 'N' && *transb == 'T') ||
+              (*transa == 'n' && *transb == 'T') ||
+              (*transa == 'N' && *transb == 't') ||
+              (*transa == 'n' && *transb == 't') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  } else if ( (*transa == 'T' && *transb == 'N') ||
+              (*transa == 't' && *transb == 'N') ||
+              (*transa == 'T' && *transb == 'n') ||
+              (*transa == 't' && *transb == 'n') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[j*ldb+i];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
+         }
+       }
+
+     }
+
+  } else if ( (*transa == 'T' && *transb == 'T') ||
+              (*transa == 't' && *transb == 'T') ||
+              (*transa == 'T' && *transb == 't') ||
+              (*transa == 't' && *transb == 't') ) {
+
+     if (*alpha == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *beta * b[i*ldb+j];
+         }
+       }
+
+     } else if (*beta == 0.) {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j];
+         }
+       }
+
+     } else {
+
+       for (int64_t j=0 ; j<n ; ++j) {
+         for (int64_t i=0 ; i<m ; ++i) {
+           c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
+         }
+       }
+
+     }
+
+  }
+}
--- a/plugins/local/jastrow/EZFIO.cfg
+++ b/plugins/local/jastrow/EZFIO.cfg
@ -1,7 +1,25 @@
+[log_jpsi]
+type: logical
+doc: If |true|, the Jpsi is taken as log(1+psi_cor)
+interface: ezfio,provider,ocaml
+default: False
+
+
+[mu_of_r_tc]
+type: character*(32)
+doc: type of the mu(r): [ Standard | Erfmu | Erfmugauss ]
+interface: ezfio,provider,ocaml
+default: Standard
+
+[mu_of_r_av]
+type: logical
+doc: If |true|, take the second formula for mu(r)
+interface: ezfio,provider,ocaml
+default: False

 [j2e_type]
 type: character*(32)
-doc: type of the 2e-Jastrow: [ None | Mu | Mu_Nu | Mur | Boys | Boys_Handy | Qmckl ]
+doc: type of the 2e-Jastrow: [ None | Mu | Mugauss | Mu_Nu | Mur | Murgauss | Bump | Boys | Boys_Handy | Qmckl ]
 interface: ezfio,provider,ocaml
 default: Mu

--- a/plugins/local/jastrow/bh_param.irp.f
+++ b/plugins/local/jastrow/bh_param.irp.f
@ -232,6 +232,14 @@

  ! ---

+  do i_nucl = 1, nucl_num
+    do p = 1, jBH_size
+      if(jBH_m(p,i_nucl) .eq. jBH_n(p,i_nucl)) then
+        jBH_c(p,i_nucl) = 0.5d0 * jBH_c(p,i_nucl)
+      endif
+    enddo
+  enddo
+
  print *, ' parameters for Boys-Handy Jastrow'
  print *, ' nb of terms per nucleus = ', jBH_size

--- a/plugins/local/mo_localization/README.md
+++ b/plugins/local/mo_localization/README.md
@ -3,7 +3,7 @@ To localize the MOs:
 ```
 qp run localization  
 ```
-By default, the different otbital classes are automatically set by splitting  
+By default, the different orbital classes are automatically set by splitting  
 the orbitales in the following classes:  
 - Core -> Core  
 - Active, doubly occupied -> Inactive  
--- a/plugins/local/non_h_ints_mu/NEED
+++ b/plugins/local/non_h_ints_mu/NEED
@ -3,3 +3,5 @@ hamiltonian
 jastrow
 ao_tc_eff_map
 bi_ortho_mos
+trexio
+mu_of_r
--- a/plugins/local/non_h_ints_mu/deb_aos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_aos.irp.f
@ -31,24 +31,63 @@ subroutine print_aos()
  integer          :: i, ipoint
  double precision :: r(3)
  double precision :: ao_val, ao_der(3), ao_lap
+  double precision :: accu_vgl(5)
+  double precision :: accu_vgl_nrm(5)
+
+  double precision :: mo_val, mo_der(3), mo_lap

  PROVIDE final_grid_points aos_in_r_array aos_grad_in_r_array aos_lapl_in_r_array

  do ipoint = 1, n_points_final_grid
    r(:) = final_grid_points(:,ipoint)
-    print*, r
+    write(1000, '(3(f15.7, 3X))') r
  enddo
-
+  
  do ipoint = 1, n_points_final_grid
-    r(:) = final_grid_points(:,ipoint)
    do i = 1, ao_num
      ao_val    = aos_in_r_array     (i,ipoint)
      ao_der(:) = aos_grad_in_r_array(i,ipoint,:)
      ao_lap    = aos_lapl_in_r_array(1,i,ipoint) + aos_lapl_in_r_array(2,i,ipoint) + aos_lapl_in_r_array(3,i,ipoint)
-      write(*, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
+      write(111, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
    enddo
  enddo
 
+  do ipoint = 1, n_points_final_grid
+    do i = 1, ao_num
+      ao_val    = aos_in_r_array_qmckl     (i,ipoint)
+      ao_der(:) = aos_grad_in_r_array_qmckl(i,ipoint,:)
+      ao_lap    = aos_lapl_in_r_array_qmckl(i,ipoint)
+      write(222, '(5(f15.7, 3X))') ao_val, ao_der, ao_lap
+    enddo
+  enddo
+ 
+  accu_vgl = 0.d0
+  accu_vgl_nrm = 0.d0
+  do ipoint = 1, n_points_final_grid
+    do i = 1, ao_num
+      ao_val    = aos_in_r_array     (i,ipoint)
+      ao_der(:) = aos_grad_in_r_array(i,ipoint,:)
+      ao_lap    = aos_lapl_in_r_array(1,i,ipoint) + aos_lapl_in_r_array(2,i,ipoint) + aos_lapl_in_r_array(3,i,ipoint)
+      accu_vgl_nrm(1) += dabs(ao_val)
+      accu_vgl_nrm(2) += dabs(ao_der(1))
+      accu_vgl_nrm(3) += dabs(ao_der(2))
+      accu_vgl_nrm(4) += dabs(ao_der(3))
+      accu_vgl_nrm(5) += dabs(ao_lap)
+
+      ao_val    -= aos_in_r_array_qmckl     (i,ipoint)
+      ao_der(:) -= aos_grad_in_r_array_qmckl(i,ipoint,:)
+      ao_lap    -= aos_lapl_in_r_array_qmckl(i,ipoint)
+      accu_vgl(1) += dabs(ao_val)
+      accu_vgl(2) += dabs(ao_der(1))
+      accu_vgl(3) += dabs(ao_der(2))
+      accu_vgl(4) += dabs(ao_der(3))
+      accu_vgl(5) += dabs(ao_lap)
+    enddo
+
+  enddo
+  accu_vgl(:) *= 1.d0 / accu_vgl_nrm(:)
+  print *, accu_vgl
+
  return
 end

--- a/plugins/local/non_h_ints_mu/deb_deriv_mu.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_deriv_mu.irp.f
@ -0,0 +1,28 @@
+program test_j_mu_of_r
+ implicit none
+ double precision :: x,mu_min,dmu,mu_max, mu, mu_p, mu_m
+ double precision :: j_simple,j_p, j_m,numeric_d_mu,d_dx_mu
+ double precision :: accu
+ integer :: npt,i
+ npt = 1000
+ mu_min = 0.3d0
+ mu_max = 10.d0
+ dmu = (mu_max - mu_min)/dble(npt)
+ x = 0.7d0
+ mu = mu_min
+ do i = 1, npt
+  call get_deriv_mu_j12(x,mu,d_dx_mu)
+  mu_p = mu + dmu
+  mu_m = mu - dmu
+  j_p = j_simple(x,mu_p)
+  j_m = j_simple(x,mu_m)
+  numeric_d_mu = 0.5d0 * (j_p - j_m)/dmu
+  print*,mu
+  print*,numeric_d_mu,d_dx_mu,dabs(d_dx_mu-numeric_d_mu)
+  accu += dabs(d_dx_mu-numeric_d_mu)
+  mu += dmu
+ enddo
+ accu *= dmu
+ print*,'accu = ',accu
+end
+
--- a/plugins/local/non_h_ints_mu/deb_j_bump.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_j_bump.irp.f
@ -0,0 +1,98 @@
+program test_j_mu_of_r
+ implicit none
+! call routine_test_mu_of_r
+ call routine_test_mu_of_r_tot
+end
+
+subroutine routine_test_mu_of_r_tot
+ implicit none
+ integer :: ipoint,k 
+ double precision :: r2(3), weight, dr, r1(3), r1bis(3)
+ double precision :: accu_grad(3)
+ double precision :: jast,grad_jast_mu_r1(3),j_bump
+ double precision :: jast_p,jast_m,num_grad_jast_mu_r1(3)
+
+ dr = 0.00001d0
+ r2 = 0.d0
+ r2(1) =  0.5d0
+ r2(2) = -0.1d0
+ r2(3) =  1.0d0
+ accu_grad = 0.d0
+  do ipoint = 1, n_points_final_grid
+   r1(1:3) = final_grid_points(1:3,ipoint)
+   weight = final_weight_at_r_vector(ipoint)
+!   call grad_j_sum_mu_of_r(r1,r2,jast,grad_jast_mu_r1)
+   call get_grad_j_bump_mu_of_r(r1,r2,grad_jast_mu_r1)
+   double precision :: norm,error
+   norm = 0.D0
+   do k = 1, 3
+    r1bis= r1
+    r1bis(k) += dr
+    jast_p = j_bump(r1bis,r2,a_boys)
+
+    r1bis= r1
+    r1bis(k) -= dr
+    jast_m = j_bump(r1bis,r2,a_boys)
+
+    num_grad_jast_mu_r1(k) = (jast_p - jast_m)/(2.d0* dr)
+    norm += num_grad_jast_mu_r1(k)*num_grad_jast_mu_r1(k)
+   enddo
+   error = 0.d0
+   do k = 1, 3
+    error += dabs(grad_jast_mu_r1(k) - num_grad_jast_mu_r1(k))
+   enddo
+   error *= 0.33333333d0
+   norm = dsqrt(norm)
+   if(norm.gt.1.d-05)then
+    if(dabs(error/norm).gt.dr)then
+     print*,'/////'
+     print*,error,norm
+     print*,grad_jast_mu_r1
+     print*,num_grad_jast_mu_r1
+    endif
+   endif
+   do k = 1,3
+    accu_grad(k) += weight * dabs(grad_jast_mu_r1(k) - num_grad_jast_mu_r1(k))
+   enddo
+  enddo
+ print*,'accu_grad = '
+ print*, accu_grad
+
+end
+
+subroutine routine_test_mu_of_r
+ implicit none
+ integer :: ipoint,k 
+ double precision :: weight, dr, r1(3), r1bis(3),accu_grad(3),num_grad_mu_r1(3)
+ double precision :: mu_r1,dm_r1, mu_der_r1(3), grad_dm_r1(3)
+ double precision :: mu_der_rp(3), grad_dm_rp(3),mu_rp
+ double precision :: mu_der_rm(3), grad_dm_rm(3),mu_rm
+
+ dr = 0.0001d0
+ accu_grad = 0.d0
+  do ipoint = 1, n_points_final_grid
+   r1(1:3) = final_grid_points(1:3,ipoint)
+   weight = final_weight_at_r_vector(ipoint)
+   call grad_mu_of_r_mean_field(r1,mu_r1,dm_r1, mu_der_r1, grad_dm_r1) 
+   do k = 1, 3
+    r1bis= r1
+    r1bis(k) += dr
+    call grad_mu_of_r_mean_field(r1bis,mu_rp, dm_r1, mu_der_rp, grad_dm_r1) 
+
+    r1bis= r1
+    r1bis(k) -= dr
+    call grad_mu_of_r_mean_field(r1bis,mu_rm, dm_r1, mu_der_rm, grad_dm_r1) 
+    num_grad_mu_r1(k) = (mu_rp - mu_rm)/(2.d0* dr)
+!    print*,jast_mu_r1_p,jast_mu_r1_m
+   enddo
+   print*,'/////'
+   print*,mu_der_r1
+   print*,num_grad_mu_r1
+   do k = 1,3
+    accu_grad(k) += weight * dabs(mu_der_r1(k) - num_grad_mu_r1(k))
+   enddo
+  enddo
+ print*,'accu_grad = '
+ print*, accu_grad
+
+end
--- a/plugins/local/non_h_ints_mu/deb_j_gauss.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_j_gauss.irp.f
@ -0,0 +1,62 @@
+program test_j_mu_of_r
+ implicit none
+! call routine_test_mu_of_r
+ call routine_test_mu_of_r_tot
+end
+
+subroutine routine_test_mu_of_r_tot
+ implicit none
+ integer :: ipoint,k 
+ double precision :: r2(3), weight, dr, r1(3), r1bis(3)
+ double precision :: accu_grad(3)
+ double precision :: jast,grad_jast(3),j_bump,j12_mu
+ double precision :: jast_p,jast_m,num_grad_jast(3)
+
+ dr = 0.00001d0
+ r2 = 0.d0
+ r2(1) =  0.5d0
+ r2(2) = -0.1d0
+ r2(3) =  1.0d0
+ accu_grad = 0.d0
+  do ipoint = 1, n_points_final_grid
+   r1(1:3) = final_grid_points(1:3,ipoint)
+   weight = final_weight_at_r_vector(ipoint)
+   call grad1_j12_mu(r1, r2, grad_jast)
+   grad_jast = - grad_jast
+   double precision :: norm,error
+   norm = 0.D0
+   do k = 1, 3
+    r1bis= r1
+    r1bis(k) += dr
+    jast_p = j12_mu(r1bis, r2)
+
+    r1bis= r1
+    r1bis(k) -= dr
+    jast_m = j12_mu(r1bis, r2)
+
+    num_grad_jast(k) = (jast_p - jast_m)/(2.d0* dr)
+    norm += num_grad_jast(k)*num_grad_jast(k)
+   enddo
+   error = 0.d0
+   do k = 1, 3
+    error += dabs(grad_jast(k) - num_grad_jast(k))
+   enddo
+   error *= 0.33333333d0
+   norm = dsqrt(norm)
+   if(norm.gt.1.d-05)then
+    if(dabs(error/norm).gt.dr)then
+     print*,'/////'
+     print*,error,norm
+     print*,grad_jast
+     print*,num_grad_jast
+    endif
+   endif
+   do k = 1,3
+    accu_grad(k) += weight * dabs(grad_jast(k) - num_grad_jast(k))
+   enddo
+  enddo
+ print*,'accu_grad = '
+ print*, accu_grad
+
+end
+
--- a/plugins/local/non_h_ints_mu/deb_j_mu_of_r.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_j_mu_of_r.irp.f
@ -0,0 +1,97 @@
+program test_j_mu_of_r
+ implicit none
+! call routine_test_mu_of_r
+ call routine_test_mu_of_r_tot
+end
+
+subroutine routine_test_mu_of_r_tot
+ implicit none
+ integer :: ipoint,k 
+ double precision :: r2(3), weight, dr, r1(3), r1bis(3)
+ double precision :: accu_grad(3)
+ double precision :: jast,grad_jast_mu_r1(3)
+ double precision :: jast_p,jast_m,num_grad_jast_mu_r1(3)
+
+ dr = 0.000001d0
+ r2 = 0.d0
+ r2(1) =  0.5d0
+ r2(2) = -0.1d0
+ r2(3) =  1.0d0
+ accu_grad = 0.d0
+  do ipoint = 1, n_points_final_grid
+   r1(1:3) = final_grid_points(1:3,ipoint)
+   weight = final_weight_at_r_vector(ipoint)
+   call grad_j_sum_mu_of_r(r1,r2,jast,grad_jast_mu_r1)
+   double precision :: norm,error
+   norm = 0.D0
+   do k = 1, 3
+    r1bis= r1
+    r1bis(k) += dr
+    call get_j_sum_mu_of_r(r1bis,r2,jast_p)
+
+    r1bis= r1
+    r1bis(k) -= dr
+    call get_j_sum_mu_of_r(r1bis,r2,jast_m)
+
+    num_grad_jast_mu_r1(k) = (jast_p - jast_m)/(2.d0* dr)
+    norm += num_grad_jast_mu_r1(k)*num_grad_jast_mu_r1(k)
+   enddo
+   error = 0.d0
+   do k = 1, 3
+    error += dabs(grad_jast_mu_r1(k) - num_grad_jast_mu_r1(k))
+   enddo
+   error *= 0.33333333d0
+   norm = dsqrt(norm)
+   if(norm.gt.1.d-05)then
+    if(dabs(error/norm).gt.10.d0*dr)then
+     print*,'/////'
+     print*,error,norm,dabs(error/norm)
+     print*,grad_jast_mu_r1
+     print*,num_grad_jast_mu_r1
+    endif
+   endif
+   do k = 1,3
+    accu_grad(k) += weight * dabs(grad_jast_mu_r1(k) - num_grad_jast_mu_r1(k))
+   enddo
+  enddo
+ print*,'accu_grad = '
+ print*, accu_grad
+
+end
+
+subroutine routine_test_mu_of_r
+ implicit none
+ integer :: ipoint,k 
+ double precision :: weight, dr, r1(3), r1bis(3),accu_grad(3),num_grad_mu_r1(3)
+ double precision :: mu_r1,dm_r1, mu_der_r1(3), grad_dm_r1(3)
+ double precision :: mu_der_rp(3), grad_dm_rp(3),mu_rp
+ double precision :: mu_der_rm(3), grad_dm_rm(3),mu_rm
+
+ dr = 0.0001d0
+ accu_grad = 0.d0
+  do ipoint = 1, n_points_final_grid
+   r1(1:3) = final_grid_points(1:3,ipoint)
+   weight = final_weight_at_r_vector(ipoint)
+   call grad_mu_of_r_mean_field(r1,mu_r1,dm_r1, mu_der_r1, grad_dm_r1) 
+   do k = 1, 3
+    r1bis= r1
+    r1bis(k) += dr
+    call grad_mu_of_r_mean_field(r1bis,mu_rp, dm_r1, mu_der_rp, grad_dm_r1) 
+
+    r1bis= r1
+    r1bis(k) -= dr
+    call grad_mu_of_r_mean_field(r1bis,mu_rm, dm_r1, mu_der_rm, grad_dm_r1) 
+    num_grad_mu_r1(k) = (mu_rp - mu_rm)/(2.d0* dr)
+!    print*,jast_mu_r1_p,jast_mu_r1_m
+   enddo
+   print*,'/////'
+   print*,mu_der_r1
+   print*,num_grad_mu_r1
+   do k = 1,3
+    accu_grad(k) += weight * dabs(mu_der_r1(k) - num_grad_mu_r1(k))
+   enddo
+  enddo
+ print*,'accu_grad = '
+ print*, accu_grad
+
+end
--- a/plugins/local/non_h_ints_mu/deb_j_psi.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_j_psi.irp.f
@ -0,0 +1,131 @@
+program test_j_mu_of_r
+ implicit none
+ call routine_deb_j_psi
+! call routine_deb_denom
+end
+
+subroutine routine_deb_j_psi
+ implicit none
+ integer :: ipoint,k 
+ double precision :: r2(3), weight, dr, r1(3), r1bis(3)
+ double precision :: accu_grad(3)
+ double precision :: jast,grad_jast(3),j_bump,jastrow_psi,grad_jast_bis(3)
+ double precision :: jast_p,jast_m,num_grad_jast(3)
+
+ dr = 0.00001d0
+ r2 = 0.d0
+ r2(1) =  0.5d0
+ r2(2) = -0.1d0
+ r2(3) =  1.0d0
+ accu_grad = 0.d0
+  do ipoint = 1, n_points_final_grid
+   r1(1:3) = final_grid_points(1:3,ipoint)
+   weight = final_weight_at_r_vector(ipoint)
+   call get_grad_r1_jastrow_psi(r1,r2,grad_jast,jast)
+!   grad_jast = - grad_jast
+   double precision :: norm,error
+   norm = 0.D0
+   do k = 1, 3
+    r1bis= r1
+    r1bis(k) += dr
+    call get_grad_r1_jastrow_psi(r1bis,r2,grad_jast_bis,jast_p)
+
+    r1bis= r1
+    r1bis(k) -= dr
+    call get_grad_r1_jastrow_psi(r1bis,r2,grad_jast_bis,jast_m)
+
+    num_grad_jast(k) = (jast_p - jast_m)/(2.d0* dr)
+    norm += num_grad_jast(k)*num_grad_jast(k)
+   enddo
+   error = 0.d0
+   do k = 1, 3
+    error += dabs(grad_jast(k) - num_grad_jast(k))
+   enddo
+   error *= 0.33333333d0
+   norm = dsqrt(norm)
+   if(norm.gt.1.d-05)then
+    if(dabs(error/norm).gt.dr)then
+     print*,'/////'
+     print*,error,norm
+     print*,grad_jast
+     print*,num_grad_jast
+    endif
+   endif
+   do k = 1,3
+    accu_grad(k) += weight * dabs(grad_jast(k) - num_grad_jast(k))
+   enddo
+  enddo
+ print*,'accu_grad = '
+ print*, accu_grad
+
+end
+
+
+subroutine routine_deb_denom
+ implicit none
+ integer :: ipoint,k,i,j
+ double precision :: r2(3), weight, dr, r1(3), r1bis(3)
+ double precision :: accu_grad(3)
+ double precision :: jast,grad_jast(3),j_bump,jastrow_psi,grad_jast_bis(3)
+ double precision :: jast_p,jast_m,num_grad_jast(3)
+
+ dr = 0.00001d0
+ r2 = 0.d0
+ r2(1) =  0.5d0
+ r2(2) = -0.1d0
+ r2(3) =  1.0d0
+ double precision, allocatable :: mos_array_r1(:), mos_array_r2(:)
+ double precision, allocatable :: mos_grad_array_r1(:,:),mos_grad_array_r2(:,:)
+ allocate(mos_array_r1(mo_num), mos_array_r2(mo_num))
+ allocate(mos_grad_array_r1(3,mo_num), mos_grad_array_r2(3,mo_num))
+ do i = 1, 1
+  do j = 1, 1
+   accu_grad = 0.d0
+   call give_all_mos_and_grad_at_r(r2,mos_array_r2,mos_grad_array_r2)
+    do ipoint = 1, n_points_final_grid
+     r1(1:3) = final_grid_points(1:3,ipoint)
+     weight = final_weight_at_r_vector(ipoint)
+     call give_all_mos_and_grad_at_r(r1,mos_array_r1,mos_grad_array_r1)
+     call denom_jpsi(i,j,a_boys, mos_array_r1,mos_grad_array_r1,mos_array_r2,jast, grad_jast)
+     double precision :: norm,error
+     norm = 0.D0
+     do k = 1, 3
+      r1bis= r1
+      r1bis(k) += dr
+      call give_all_mos_and_grad_at_r(r1bis,mos_array_r1,mos_grad_array_r1)
+      call denom_jpsi(i,j,a_boys, mos_array_r1,mos_grad_array_r1,mos_array_r2,jast_p, grad_jast_bis)
+  
+      r1bis= r1
+      r1bis(k) -= dr
+      call give_all_mos_and_grad_at_r(r1bis,mos_array_r1,mos_grad_array_r1)
+      call denom_jpsi(i,j,a_boys, mos_array_r1,mos_grad_array_r1,mos_array_r2,jast_m, grad_jast_bis)
+  
+      num_grad_jast(k) = (jast_p - jast_m)/(2.d0* dr)
+      norm += num_grad_jast(k)*num_grad_jast(k)
+     enddo
+     error = 0.d0
+     do k = 1, 3
+      error += dabs(grad_jast(k) - num_grad_jast(k))
+     enddo
+     error *= 0.33333333d0
+     norm = dsqrt(norm)
+     if(norm.gt.1.d-05)then
+      if(dabs(error/norm).gt.dr)then
+       print*,'/////'
+       print*,error,norm
+       print*,grad_jast
+       print*,num_grad_jast
+      endif
+     endif
+     do k = 1,3
+      accu_grad(k) += weight * dabs(grad_jast(k) - num_grad_jast(k))
+     enddo
+    enddo
+   print*,'i,j = ',i,j
+   print*,'accu_grad = '
+   print*, accu_grad
+  enddo
+ enddo
+
+end
+
--- a/plugins/local/non_h_ints_mu/deb_mos.irp.f
+++ b/plugins/local/non_h_ints_mu/deb_mos.irp.f
@ -0,0 +1,101 @@
+
+! ---
+
+program deb_mos
+
+  implicit none
+
+  my_grid_becke  = .True.
+  PROVIDE tc_grid1_a tc_grid1_r
+  my_n_pt_r_grid = tc_grid1_r
+  my_n_pt_a_grid = tc_grid1_a
+  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
+
+  if(tc_integ_type .eq. "numeric") then
+    my_extra_grid_becke  = .True.
+    PROVIDE tc_grid2_a tc_grid2_r
+    my_n_pt_r_extra_grid = tc_grid2_r
+    my_n_pt_a_extra_grid = tc_grid2_a
+    touch my_extra_grid_becke my_n_pt_r_extra_grid my_n_pt_a_extra_grid
+  endif
+
+  call print_mos()
+  
+end
+
+! ---
+
+subroutine print_mos()
+
+  implicit none
+  integer          :: i, ipoint
+  double precision :: r(3)
+  double precision :: mo_val, mo_der(3), mo_lap
+
+  PROVIDE final_grid_points mos_in_r_array mos_grad_in_r_array mos_lapl_in_r_array
+
+!  do ipoint = 1, n_points_final_grid
+!    r(:) = final_grid_points(:,ipoint)
+!    print*, r
+!  enddo
+double precision :: accu_vgl(5)
+double precision :: accu_vgl_nrm(5)
+
+  do ipoint = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    write(1111, '(5(f15.7, 3X))') r
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array(i,ipoint,1) + mos_lapl_in_r_array(i,ipoint,2) + mos_lapl_in_r_array(i,ipoint,3)
+      write(1111, '(5(f15.7, 3X))') mo_val, mo_der, mo_lap
+    enddo
+  enddo
+ 
+  do ipoint = 1, n_points_final_grid
+    r(1) = final_grid_points(1,i)
+    r(2) = final_grid_points(2,i)
+    r(3) = final_grid_points(3,i)
+    write(2222, '(5(f15.7, 3X))') r
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array_qmckl     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array_qmckl(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array_qmckl(i,ipoint)
+      write(2222, '(5(f15.7, 3X))') mo_val, mo_der, mo_lap
+    enddo
+  enddo
+ 
+  accu_vgl = 0.d0
+  accu_vgl_nrm = 0.d0
+  do ipoint = 1, n_points_final_grid
+    do i = 1, mo_num
+      mo_val    = mos_in_r_array     (i,ipoint)
+      mo_der(:) = mos_grad_in_r_array(i,ipoint,:)
+      mo_lap    = mos_lapl_in_r_array(i,ipoint,1) + mos_lapl_in_r_array(i,ipoint,2) + mos_lapl_in_r_array(i,ipoint,3)
+      accu_vgl_nrm(1) += dabs(mo_val)
+      accu_vgl_nrm(2) += dabs(mo_der(1))
+      accu_vgl_nrm(3) += dabs(mo_der(2))
+      accu_vgl_nrm(4) += dabs(mo_der(3))
+      accu_vgl_nrm(5) += dabs(mo_lap)
+
+      mo_val    -= mos_in_r_array_qmckl     (i,ipoint)
+      mo_der(:) -= mos_grad_in_r_array_qmckl(i,ipoint,:)
+      mo_lap    -= mos_lapl_in_r_array_qmckl(i,ipoint)
+      accu_vgl(1) += dabs(mo_val)
+      accu_vgl(2) += dabs(mo_der(1))
+      accu_vgl(3) += dabs(mo_der(2))
+      accu_vgl(4) += dabs(mo_der(3))
+      accu_vgl(5) += dabs(mo_lap)
+    enddo
+
+  enddo
+  accu_vgl(:) *= 1.d0 / accu_vgl_nrm(:)
+  print *, accu_vgl
+
+  return
+end
+
+! ---
+
--- a/plugins/local/non_h_ints_mu/j_bump.irp.f
+++ b/plugins/local/non_h_ints_mu/j_bump.irp.f
@ -0,0 +1,90 @@
+double precision function wigner_radius(rho)
+ implicit none
+ include 'constants.include.F'
+ double precision, intent(in) :: rho
+ wigner_radius = 4.d0 * pi * rho * 0.333333333333d0
+ wigner_radius = wigner_radius**(-0.3333333d0)
+end
+
+double precision function j_bump(r1,r2,a)
+ implicit none
+ include 'constants.include.F'
+ double precision, intent(in) :: r1(3),r2(3),a
+ double precision :: inv_a,factor,x_scaled,scalar
+ double precision :: r12
+ r12   = (r1(1) - r2(1))*(r1(1) - r2(1))
+ r12  += (r1(2) - r2(2))*(r1(2) - r2(2))
+ r12  += (r1(3) - r2(3))*(r1(3) - r2(3))
+ r12 = dsqrt(r12)
+ inv_a = 1.d0/a
+ x_scaled = r12*inv_a*inv_sq_pi
+ x_scaled*= x_scaled
+ j_bump = 0.5d0 * (r12-a) * dexp(-x_scaled)
+end
+
+subroutine get_grad_j_bump(x,a,grad)
+ implicit none
+ BEGIN_DOC
+ ! gradient of the Jastrow with a bump
+ !
+ ! j(x,a) = 1/2 * (x-a)* exp[-(x/(a*sqrt(pi)))^2]
+ !
+ ! d/dx j(x,a) = 1/(2 pi a^2) * exp[-(x/(a*sqrt(pi)))^2] * (pi a^2 + 2 a x - 2x^2)
+ END_DOC
+  include 'constants.include.F'
+ double precision, intent(in)  :: x,a
+ double precision, intent(out) :: grad
+ double precision :: inv_a,factor,x_scaled,scalar
+ inv_a = 1.d0/a
+ factor = 0.5d0*inv_pi*inv_a*inv_a 
+ x_scaled = x*inv_a*inv_sq_pi
+ x_scaled*= x_scaled
+ grad = factor * dexp(-x_scaled) * (pi*a*a + 2.d0 * a*x - 2.d0*x*x)
+end
+
+subroutine get_d_da_j_bump(x,a,d_da)
+ implicit none
+ BEGIN_DOC
+ ! Derivative with respect by to the parameter "a" of the Jastrow with a bump
+ !
+ ! j(x,a) = 1/2 * (x-a)* exp[-(x/(a*sqrt(pi)))^2]
+ !
+ ! d/da j(x,a) = - 1/(pi*a^3) * exp[-(x/(a*sqrt(pi)))^2] * (-2 x^3 + 2 a x^2 + pi a^x3)
+ END_DOC
+ include 'constants.include.F'
+ double precision, intent(in)  :: x,a
+ double precision, intent(out) :: d_da
+ double precision :: factor, inv_a,x_scaled,scalar
+ inv_a = 1.d0/a
+ factor = inv_a*inv_a*inv_a*inv_pi
+ x_scaled = x*inv_a*inv_sq_pi
+ x_scaled*= x_scaled
+ d_da = factor * dexp(-x_scaled) * (-2.d0 * x*x*x + 2.d0*x*x*a+pi*a*a*a)
+end
+
+subroutine get_grad_j_bump_mu_of_r(r1,r2,grad_j_bump)
+ implicit none
+ BEGIN_DOC
+ ! d/dx1 j(x,a(r1,r2)) where j(x,a) is the Jastrow with a bump
+ !
+ ! j(x,a) = 1/2 * (x-a)* exp[-(x/(a*sqrt(pi)))^2]
+ !
+ ! a(r1,r2) = [rho(r1) a(r1) + rho(r2) a(r2)]/[rho(r1) + rho(r2)]
+ !
+ ! d/dx1 j(x,a) = d/dx1 j(x,a(r1,r2))
+ END_DOC
+ double precision, intent(in) :: r1(3),r2(3)
+ double precision, intent(out):: grad_j_bump(3)
+ double precision :: r12,r12_vec(3),grad_scal,inv_r12
+ r12_vec = r1 - r2
+ r12   = (r1(1) - r2(1))*(r1(1) - r2(1))
+ r12  += (r1(2) - r2(2))*(r1(2) - r2(2))
+ r12  += (r1(3) - r2(3))*(r1(3) - r2(3))
+ r12 = dsqrt(r12)
+ call get_grad_j_bump(r12,a_boys,grad_scal)
+ if(r12.lt.1.d-10)then
+  grad_j_bump = 0.d0
+ else 
+  grad_j_bump = grad_scal * r12_vec/r12
+ endif
+end
--- a/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_1e_utils.irp.f
@ -132,6 +132,7 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
  double precision, allocatable :: A(:,:,:,:), b(:), A_tmp(:,:,:,:)
  double precision, allocatable :: Pa(:,:), Pb(:,:), Pt(:,:)
  double precision, allocatable :: u1e_tmp(:), tmp(:,:,:)
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)
  double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:)


@ -176,26 +177,27 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
  ! --- --- ---
  ! get A

-  allocate(tmp(n_points_final_grid,ao_num,ao_num))
+  allocate(tmp1(n_points_final_grid,ao_num,ao_num), tmp2(n_points_final_grid,ao_num,ao_num))
  allocate(A(ao_num,ao_num,ao_num,ao_num))

  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (i, j, ipoint) &
-  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp)
+  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2)
  !$OMP DO COLLAPSE(2)
  do j = 1, ao_num
    do i = 1, ao_num
      do ipoint = 1, n_points_final_grid
-        tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp1(ipoint,i,j) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp2(ipoint,i,j) =                                    aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
      enddo
    enddo
  enddo
  !$OMP END DO
  !$OMP END PARALLEL

-  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-            , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid  &
+  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0  &
+            , tmp1(1,1,1), n_points_final_grid, tmp2(1,1,1), n_points_final_grid &
            , 0.d0, A(1,1,1,1), ao_num*ao_num)

  allocate(A_tmp(ao_num,ao_num,ao_num,ao_num))
@ -207,13 +209,13 @@ subroutine get_j1e_coef_fit_ao2(dim_fit, coef_fit)
  allocate(b(ao_num*ao_num))

  do ipoint = 1, n_points_final_grid
-    u1e_tmp(ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * u1e_tmp(ipoint)
+    u1e_tmp(ipoint) = u1e_tmp(ipoint)
  enddo

-  call dgemv("T", n_points_final_grid, ao_num*ao_num, 1.d0, tmp(1,1,1), n_points_final_grid, u1e_tmp(1), 1, 0.d0, b(1), 1)
+  call dgemv("T", n_points_final_grid, ao_num*ao_num, 1.d0, tmp1(1,1,1), n_points_final_grid, u1e_tmp(1), 1, 0.d0, b(1), 1)

  deallocate(u1e_tmp)
-  deallocate(tmp)
+  deallocate(tmp1, tmp2)

  ! --- --- ---
  ! solve Ax = b
--- a/plugins/local/non_h_ints_mu/jast_deriv.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv.irp.f
@ -31,7 +31,7 @@
  grad1_u12_squared_num = 0.d0

  if( ((j2e_type .eq. "Mu") .and. (env_type .eq. "None")) .or. &
-       (j2e_type .eq. "Mur") ) then
+       (j2e_type .eq. "Mur").or.(j2e_type .eq. "Mugauss") .or. (j2e_type .eq. "Murgauss")) then

    !$OMP PARALLEL                                                                                    &
    !$OMP DEFAULT (NONE)                                                                              &
--- a/plugins/local/non_h_ints_mu/jast_deriv_mu_of_r.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_mu_of_r.irp.f
@ -0,0 +1,306 @@
+subroutine get_j_sum_mu_of_r(r1,r2,jast)
+ implicit none
+ double precision, intent(in) :: r1(3),r2(3)
+ double precision, intent(out):: jast
+ double precision :: mu_r1, dm_r1, grad_mu_r1(3), grad_dm_r1(3), j_mu_r1
+ double precision :: mu_r2, dm_r2, grad_mu_r2(3), grad_dm_r2(3), j_mu_r2
+ double precision :: j12_mu_input,mu_tot,r12,j_simple
+ jast = 0.d0
+ if(murho_type==0)then
+! J(r1,r2) = [rho(r1) * j(mu(r1),r12) + rho(r2) * j(mu(r2),r12)] / [rho(r1) + rho(r2)]
+  call grad_mu_of_r_mean_field(r1,mu_r1, dm_r1, grad_mu_r1, grad_dm_r1) 
+  call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2) 
+  j_mu_r1 = j12_mu_input(r1, r2, mu_r1)
+  j_mu_r2 = j12_mu_input(r1, r2, mu_r2)
+  if(dm_r1 + dm_r2.lt.1.d-7)return
+  jast = (dm_r1 * j_mu_r1 + dm_r2 * j_mu_r2) / (dm_r1 + dm_r2)
+ else if(murho_type==1)then
+! J(r1,r2) = j(0.5 * (mu(r1)+mu(r2)),r12), MU(r1,r2) = 0.5 *(mu(r1)+mu(r2))
+  call grad_mu_of_r_mean_field(r1,mu_r1, dm_r1, grad_mu_r1, grad_dm_r1) 
+  call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2) 
+  mu_tot = 0.5d0 * (mu_r1 + mu_r2)
+  jast = j12_mu_input(r1, r2, mu_tot)
+ else if(murho_type==2)then
+! MU(r1,r2) = (rho(1) * mu(r1)+ rho(2) * mu(r2))/(rho(1)+rho(2))
+! J(r1,r2) = j(MU(r1,r2),r12)
+  call grad_mu_of_r_mean_field(r1,mu_r1, dm_r1, grad_mu_r1, grad_dm_r1) 
+  call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2) 
+  double precision :: mu_tmp, dm_tot, dm_tot_inv
+  dm_tot = dm_r1**a_boys + dm_r2**a_boys  ! rho(1)**alpha+rho(2)**alpha
+  if(dm_tot.lt.1.d-12)then
+   dm_tot_inv = 1.d+12
+  else
+   dm_tot_inv = 1.d0/dm_tot
+  endif
+  mu_tmp = dm_r1**a_boys * mu_r1 + dm_r2**a_boys * mu_r2 !rho(1)**alpha * mu(r1)+ rho(2)**alpha * mu(r2)
+  mu_tot = nu_erf * mu_tmp*dm_tot_inv ! 
+  r12  = (r1(1) - r2(1)) * (r1(1) - r2(1))
+  r12 += (r1(2) - r2(2)) * (r1(2) - r2(2))
+  r12 += (r1(3) - r2(3)) * (r1(3) - r2(3))
+  r12 = dsqrt(r12)
+  jast = j_simple(r12,mu_tot)
+ endif
+
+end
+
+subroutine grad_j_sum_mu_of_r(r1,r2,jast,grad_jast)
+ implicit none
+  include 'constants.include.F'
+ BEGIN_DOC
+ END_DOC
+ double precision, intent(in) :: r1(3),r2(3)
+ double precision, intent(out):: jast, grad_jast(3)
+ jast = 0.d0
+ grad_jast = 0.d0
+ double precision :: num, denom, grad_num(3), grad_denom(3)
+ double precision :: j_r1, grad_j_r1(3),j_r2, grad_j_r2(3)
+ double precision :: dm_r1, grad_dm_r1(3), grad_jmu_r2(3)
+ double precision :: dm_r2, grad_dm_r2(3),mu_r2, grad_mu_r2(3),mu_r1
+ double precision :: j12_mu_input,r12,grad_mu_r1(3),grad_jmu_r1(3)
+ double precision :: mu_tot,dx,dy,dz,r12_vec(3),d_dmu_j,d_dr12_j
+
+ dx = r1(1) - r2(1)
+ dy = r1(2) - r2(2)
+ dz = r1(3) - r2(3)
+
+ r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+ if(r12.gt.1.d-10)then
+  r12_vec(1) = dx
+  r12_vec(2) = dy
+  r12_vec(3) = dz
+  r12_vec *= 1.d0/r12
+  ! r12_vec = grad_r1 (r12)
+ else
+  r12 = 1.d-10
+  r12_vec = 0.d0
+ endif
+
+ if(murho_type==0)then
+! J(r1,r2) = [rho(r1) * j(mu(r1),r12) + rho(r2) * j(mu(r2),r12)] / [rho(r1) + rho(r2)]
+! 
+!          = num(r1,r2) / denom(r1,r2)
+!
+! d/dx1 J(r1,r2) = [denom(r1,r2) X d/dx1 num(r1,r2) - num(r1,r2) X d/dx1 denom(r1,r2) ] / denom(r1,r2)^2
+!
+! d/dx1 num(r1,r2) =  j(mu(r1),r12)*d/dx1 rho(r1) + rho(r1) * d/dx1 j(mu(r1),r12) 
+!                   + rho(r2) d/dx1 j(mu(r2),r12)
+! d/dx1 denom(r1,r2) = d/dx1 rho(r1)
+  call grad_j_mu_of_r_1(r1,r2,j_r1, grad_j_r1,dm_r1, grad_dm_r1)
+  call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2) 
+  j_r2 = j12_mu_input(r1, r2, mu_r2) ! j(mu(r2),r1,r2)
+  num = dm_r1 * j_r1 + dm_r2 * j_r2
+  denom = dm_r1 + dm_r2
+  if(denom.lt.1.d-7)return
+  jast = num / denom
+ 
+  grad_denom = grad_dm_r1
+  call grad_j12_mu_input(r1, r2, mu_r2, grad_jmu_r2,r12)
+  grad_num =  j_r1 * grad_dm_r1 + dm_r1 * grad_j_r1 + dm_r2 * grad_jmu_r2
+  grad_jast = (grad_num * denom - num * grad_denom)/(denom*denom)
+ else if(murho_type==1)then
+! J(r1,r2) = j(0.5 * (mu(r1)+mu(r2)),r12), MU(r1,r2) = 0.5 *(mu(r1)+mu(r2))
+!
+! d/dx1 J(r1,r2) = d/dx1 j(MU(r1,r2),r12)|MU=cst 
+!                + d/dMU [j(MU,r12)]
+!                x d/d(mu(r1)) MU(r1,r2)
+!                x d/dx1 mu(r1)
+!                = 0.5 * (1 - erf(MU(r1,r2) *r12))/r12 * (x1 - x2) == grad_jmu_r1
+!                + e^{-(r12*MU(r1,r2))^2}/(2 sqrt(pi) * MU(r1,r2)^2) 
+!                x 0.5 
+!                x d/dx1 mu(r1)
+ call grad_mu_of_r_mean_field(r1,mu_r1, dm_r1, grad_mu_r1, grad_dm_r1)
+ call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2)
+ mu_tot = 0.5d0 * (mu_r1 + mu_r2)
+ call grad_j12_mu_input(r1, r2, mu_tot, grad_jmu_r1,r12)
+ grad_jast = grad_jmu_r1
+ grad_jast+= dexp(-r12*mu_tot*r12*mu_tot) * inv_sq_pi_2 /(mu_tot* mu_tot) * 0.5d0 * grad_mu_r1
+ else if(murho_type==2)then
+! MU(r1,r2) = beta * (rho(1)**alpha * mu(r1)+ rho(2)**alpha * mu(r2))/(rho(1)**alpha+rho(2)**alpha)
+! J(r1,r2) = j(MU(r1,r2),r12)
+!
+! d/dx1 J(r1,r2) = d/dx1 j(MU(r1,r2),r12)|MU=cst 
+!                + d/dMU [j(MU,r12)] 
+!                x d/d(mu(r1)) MU(r1,r2)
+!                x d/dx1 mu(r1)
+!                = 0.5 * (1 - erf(MU(r1,r2) *r12))/r12 * (x1 - x2) == grad_jmu_r1
+!                + 0.5 e^{-(r12*MU(r1,r2))^2}/(2 sqrt(pi) * MU(r1,r2)^2) 
+!                x d/dx1 MU(r1,r2)
+! with d/dx1 MU(r1,r2) = beta * {[mu(1) d/dx1 [rho(1)**alpha] + rho(1)**alpha * d/dx1 mu(1)](rho(1)**alpha+rho(2)**alpha) 
+!                       - MU(1,2) d/dx1 [rho(1)]**alpha}/(rho(1)**alpha+rho(2)**alpha)^2
+! d/dx1 [rho(1)]**alpha = alpha [rho(1)]**(alpha-1) d/dx1 rho(1)
+!                        
+ call grad_mu_of_r_mean_field(r1,mu_r1, dm_r1, grad_mu_r1, grad_dm_r1)
+ call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2)
+ double precision :: dm_tot,dm_tot_inv,grad_mu_tot(3),mu_tmp,grad_dm_r1_alpha(3),d_dx_j
+ dm_tot = dm_r1**a_boys + dm_r2**a_boys  ! rho(1)**alpha+rho(2)**alpha
+ grad_dm_r1_alpha = a_boys * dm_r1**(a_boys-1) * grad_dm_r1
+ if(dm_tot.lt.1.d-12)then
+  dm_tot_inv = 1.d+12
+ else
+  dm_tot_inv = 1.d0/dm_tot
+ endif
+ mu_tmp = dm_r1**a_boys * mu_r1 + dm_r2**a_boys * mu_r2 !rho(1)**alpha * mu(r1)+ rho(2)**alpha * mu(r2)
+ mu_tot = nu_erf * mu_tmp*dm_tot_inv ! 
+ grad_mu_tot = ( mu_r1 * grad_dm_r1_alpha + dm_r1**a_boys * grad_mu_r1 ) * dm_tot & 
+              -  mu_tmp * grad_dm_r1_alpha
+ grad_mu_tot *= dm_tot_inv * dm_tot_inv * nu_erf
+ call get_deriv_r12_j12(r12,mu_tot,d_dr12_j) ! d/dr12 j(MU(r1,r2,r12)
+ ! d/dx1 j(MU(r1,r2),r12) | MU(r1,r2) = cst
+ ! d/dr12 j(MU(r1,r2,r12) x d/dx1 r12
+ grad_jmu_r1 = d_dr12_j * r12_vec 
+! call grad_j12_mu_input(r1, r2, mu_tot, grad_jmu_r1,r12)
+ grad_jast = grad_jmu_r1
+ ! d/dMU j(MU(r1,r2),r12)
+ call get_deriv_mu_j12(r12,mu_tot,d_dmu_j)
+ grad_jast+= d_dmu_j * grad_mu_tot
+ else if(murho_type==-1)then
+! J(r1,r2) = 0.5 * [j(mu(r1),r12) + j(mu(r2),r12)] 
+!
+! d/dx1 J(r1,r2)  = 0.5 * (d/dx1 j(mu(r1),r12) + d/dx1 j(mu(r2),r12))
+  call grad_j_mu_of_r_1(r1,r2,j_r1, grad_j_r1,dm_r1, grad_dm_r1)
+  call grad_mu_of_r_mean_field(r2,mu_r2, dm_r2, grad_mu_r2, grad_dm_r2) 
+  j_r2 = j12_mu_input(r1, r2, mu_r2) ! j(mu(r2),r1,r2)
+  call grad_j12_mu_input(r1, r2, mu_r2, grad_jmu_r2,r12)
+  jast = 0.5d0 * (j_r1 + j_r2)
+  grad_jast = 0.5d0 * (grad_j_r1 + grad_jmu_r2)
+  
+ endif
+
+end
+
+subroutine grad_j_mu_of_r_1(r1,r2,jast, grad_jast, dm_r1, grad_dm_r1)
+ implicit none
+  include 'constants.include.F'
+ BEGIN_DOC
+! grad_r1 of j(mu(r1),r12)
+  !
+  !
+  ! d/dx1 j(mu(r1),r12) = exp(-(mu(r1)*r12)**2) /(2 *sqrt(pi) * mu(r1)**2 ) d/dx1 mu(r1) 
+  !                     + d/dx1 j(mu(r1),r12) 
+  !
+  ! with 
+  !
+  !           j(mu,r12) = 1/2 r12 (1 - erf(mu r12)) - 1/2 (sqrt(pi) * mu) e^{-(mu*r12)^2}
+  !
+  ! and d/dx1 j(mu,r12) = 0.5 * (1 - erf(mu *r12))/r12 * (x1 - x2)
+  !
+  !     d/d mu j(mu,r12) = e^{-(r12*mu)^2}/(2 sqrt(pi) * mu^2)
+  !
+  ! here mu(r1) is obtained by MU MEAN FIELD 
+ END_DOC
+ double precision, intent(in) :: r1(3),r2(3)
+ double precision, intent(out):: jast, grad_jast(3),dm_r1, grad_dm_r1(3)
+ double precision :: dx, dy, dz,  r12, mu_der(3)
+ double precision :: mu_tmp, tmp, grad(3), mu_val
+ jast = 0.d0
+ grad = 0.d0
+
+ dx  = r1(1) - r2(1)
+ dy  = r1(2) - r2(2)
+ dz  = r1(3) - r2(3)
+ r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+ ! get mu(r1) == mu_val and its gradient d/dx1 mu(r1) == mu_der 
+ call grad_mu_of_r_mean_field(r1,mu_val, dm_r1, mu_der, grad_dm_r1) 
+ mu_tmp  = mu_val * r12
+ ! evalulation of the jastrow j(mu(r1),r12)
+ jast = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_val
+
+ ! tmp = exp(-(mu(r1)*r12)**2) /(2 *sqrt(pi) * mu(r1)**2 )
+ tmp     = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val)
+ ! grad = 
+ grad(1) = tmp * mu_der(1)
+ grad(2) = tmp * mu_der(2)
+ grad(3) = tmp * mu_der(3)
+
+ if(r12 .lt. 1d-10) return
+ tmp     = 0.5d0 * (1.d0 - derf(mu_tmp)) / r12 ! d/dx1 j(mu(r1),r12) 
+ grad(1) = grad(1) + tmp * dx
+ grad(2) = grad(2) + tmp * dy
+ grad(3) = grad(3) + tmp * dz
+
+ grad_jast = grad 
+end
+
+! ---
+
+double precision function j12_mu_input(r1, r2, mu)
+
+  BEGIN_DOC
+  ! j(mu,r12) = 1/2 r12 (1 - erf(mu r12)) - 1/2 (sqrt(pi) * mu) e^{-(mu*r12)^2}
+  END_DOC
+  include 'constants.include.F'
+
+  implicit none
+  double precision, intent(in) :: r1(3), r2(3), mu
+  double precision             :: mu_tmp, r12
+
+    r12 = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+               + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+               + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+    mu_tmp = mu * r12
+
+    j12_mu_input = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu
+
+end
+
+subroutine grad_j12_mu_input(r1, r2, mu, grad_jmu,r12)
+ implicit none
+ BEGIN_DOC
+ ! grad_jmu = d/dx1 j(mu,r12) assuming mu=cst(r1)
+ !
+ !          = 0.5/r_12 * (x_1 - x_2) * [1 - erf(mu*r12)]
+ END_DOC
+ double precision, intent(in) :: r1(3), r2(3), mu
+ double precision, intent(out):: grad_jmu(3),r12
+ double precision             :: mu_tmp, dx, dy, dz, grad(3), tmp
+ grad_jmu = 0.d0
+ dx  = r1(1) - r2(1)
+ dy  = r1(2) - r2(2)
+ dz  = r1(3) - r2(3)
+ r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+ if(r12 .lt. 1d-10) return
+ mu_tmp  = mu * r12
+ tmp     = 0.5d0 * (1.d0 - derf(mu_tmp)) / r12 ! d/dx1 j(mu(r1),r12) 
+ grad(1) = tmp * dx
+ grad(2) = tmp * dy
+ grad(3) = tmp * dz
+
+ grad_jmu = grad 
+end
+
+subroutine j12_and_grad_j12_mu_input(r1, r2, mu, jmu, grad_jmu)
+ implicit none
+ include 'constants.include.F'
+ BEGIN_DOC
+ ! jmu = j(mu,r12) 
+ ! grad_jmu = d/dx1 j(mu,r12) assuming mu=cst(r1)
+ !
+ !          = 0.5/r_12 * (x_1 - x_2) * [1 - erf(mu*r12)]
+ END_DOC
+ double precision, intent(in) :: r1(3), r2(3), mu
+ double precision, intent(out):: grad_jmu(3), jmu
+ double precision             :: mu_tmp, r12, dx, dy, dz, grad(3), tmp
+ double precision :: erfc_mur12,inv_mu
+ inv_mu = 1.d0/mu
+
+ grad_jmu = 0.d0 ! initialization when r12 --> 0
+ jmu = - inv_sq_pi_2 * inv_mu ! initialization when r12 --> 0
+
+ dx  = r1(1) - r2(1)
+ dy  = r1(2) - r2(2)
+ dz  = r1(3) - r2(3)
+ r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+ if(r12 .lt. 1d-10) return
+ erfc_mur12 = (1.d0 - derf(mu_tmp))
+ mu_tmp  = mu * r12
+ tmp     = 0.5d0 * erfc_mur12  / r12 ! d/dx1 j(mu(r1),r12) 
+ grad(1) = tmp * dx
+ grad(2) = tmp * dy
+ grad(3) = tmp * dz
+
+ grad_jmu = grad 
+
+ jmu= 0.5d0 * r12 * erfc_mur12 - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) * inv_mu
+
+
+end
--- a/plugins/local/non_h_ints_mu/jast_deriv_utils.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_utils.irp.f
@ -1,8 +1,73 @@
+subroutine get_deriv_r12_j12(x,mu,d_dx_j)
+ implicit none
+  include 'constants.include.F'
+ BEGIN_DOC
+ ! d/dr12 j(mu,r12)
+ END_DOC
+ double precision, intent(in)  :: x,mu
+ double precision, intent(out) :: d_dx_j
+ 
+ d_dx_j = 0.d0
+ if(x .lt. 1d-10) return
+ if(j2e_type .eq. "Mu" .or. j2e_type .eq. "Mur") then
+  d_dx_j = 0.5d0 * (1.d0 - derf(mu * x))
+ else if(j2e_type .eq. "Mugauss" .or. j2e_type .eq. "Murgauss" ) then
+  double precision :: x_tmp
+  x_tmp = mu * x
+  ! gradient of j(mu,x)
+  d_dx_j = 0.5d0 * (1.d0 - derf(x_tmp))
+
+  ! gradient of gaussian additional term
+  x_tmp *= alpha_mu_gauss
+  x_tmp *= x_tmp 
+  d_dx_j += -0.5d0 * mu * c_mu_gauss * x * dexp(-x_tmp)
+ else
+   print *, ' Error in get_deriv_r12_j12: Unknown j2e_type = ', j2e_type
+   stop
+ endif
+end
+
+
+subroutine get_deriv_mu_j12(x,mu,d_d_mu)
+ implicit none
+ BEGIN_DOC
+ ! d/dmu j(mu,r12)
+ END_DOC
+  include 'constants.include.F'
+ double precision, intent(in)  :: x,mu
+ double precision, intent(out) :: d_d_mu
+ double precision :: x_tmp,inv_mu_2,inv_alpha_2
+ 
+ d_d_mu = 0.d0
+ if(x .lt. 1d-10) return
+ x_tmp = x*mu
+ if(mu.lt.1.d-10) return
+ inv_mu_2 = mu*mu
+ inv_mu_2 = 1.d0/inv_mu_2
+ if(j2e_type .eq. "Mu" .or. j2e_type .eq. "Mur") then
+  ! e^{-(r12*mu)^2}/(2 sqrt(pi) * mu^2)
+  d_d_mu = dexp(-x_tmp*x_tmp) * inv_sq_pi_2 * inv_mu_2
+ else if(j2e_type .eq. "Mugauss"  .or. j2e_type .eq. "Murgauss" ) then
+  d_d_mu  = dexp(-x_tmp*x_tmp) * inv_sq_pi_2 * inv_mu_2
+  inv_alpha_2 = 1.d0/alpha_mu_gauss
+  inv_alpha_2 *= inv_alpha_2
+  x_tmp *= alpha_mu_gauss
+  x_tmp *= x_tmp
+  d_d_mu += -0.25d0 * c_mu_gauss*inv_alpha_2*dexp(-x_tmp) * (1.d0 + 2.d0 * x_tmp) * inv_mu_2
+ else
+   print *, ' Error in get_deriv_r12_j12: Unknown j2e_type = ', j2e_type
+   stop
+ endif
+end
+

 ! ---

 double precision function j12_mu(r1, r2)

+  BEGIN_DOC
+  ! j(mu,r12) = 1/2 r12 (1 - erf(mu r12)) - 1/2 (sqrt(pi) * mu) e^{-(mu*r12)^2}
+  END_DOC
  include 'constants.include.F'

  implicit none
@ -18,6 +83,18 @@ double precision function j12_mu(r1, r2)

    j12_mu = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_erf

+  else if(j2e_type .eq. "Mugauss") then
+
+    r12 = dsqrt( (r1(1) - r2(1)) * (r1(1) - r2(1)) &
+               + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
+               + (r1(3) - r2(3)) * (r1(3) - r2(3)) )
+    double precision :: r12_tmp
+    r12_tmp = mu_erf * r12
+
+    j12_mu = 0.5d0 * r12 * (1.d0 - derf(r12_tmp)) - inv_sq_pi_2 * dexp(-r12_tmp*r12_tmp) / mu_erf
+    r12_tmp *= alpha_mu_gauss
+    j12_mu += 0.25d0 * c_mu_gauss / (alpha_mu_gauss*alpha_mu_gauss*mu_erf) * dexp(-r12_tmp*r12_tmp)
+
  else

    print *, ' Error in j12_mu: Unknown j2e_type = ', j2e_type
@ -57,7 +134,7 @@ subroutine grad1_j12_mu(r1, r2, grad)

  grad = 0.d0

-  if(j2e_type .eq. "Mu") then
+  if(j2e_type .eq. "Mu".or.j2e_type .eq. "Mugauss") then

    dx = r1(1) - r2(1)
    dy = r1(2) - r2(2)
@ -66,31 +143,42 @@ subroutine grad1_j12_mu(r1, r2, grad)
    r12 = dsqrt(dx * dx + dy * dy + dz * dz)
    if(r12 .lt. 1d-10) return

-    tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12
+    call get_deriv_r12_j12(r12,mu_erf,tmp)
+!    tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12

    grad(1) = tmp * dx
    grad(2) = tmp * dy
    grad(3) = tmp * dz
+    grad *= 1.d0/r12

-  elseif(j2e_type .eq. "Mur") then
+  elseif(j2e_type .eq. "Mur" .or. j2e_type .eq. "Murgauss") then
+   double precision :: jast
+   call grad_j_sum_mu_of_r(r1,r2,jast,grad)
+ 
+  elseif(j2e_type .eq. "Bump") then
+   double precision ::grad_jast(3)
+   call get_grad_j_bump_mu_of_r(r1,r2,grad_jast)
+     dx = r1(1) - r2(1)
+     dy = r1(2) - r2(2)
+     dz = r1(3) - r2(3)

-    dx  = r1(1) - r2(1)
-    dy  = r1(2) - r2(2)
-    dz  = r1(3) - r2(3)
-    r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+     r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+     if(r12 .lt. 1d-10) then
+       grad(1) = 0.d0
+       grad(2) = 0.d0
+       grad(3) = 0.d0
+       return
+     endif                                                                                                                                                                            

-    call mu_r_val_and_grad(r1, r2, mu_val, mu_der)
-    mu_tmp  = mu_val * r12
-    tmp     = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val)
-    grad(1) = tmp * mu_der(1)
-    grad(2) = tmp * mu_der(2)
-    grad(3) = tmp * mu_der(3)
+     tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12
+
+     grad(1) = 0.5d0 * tmp * dx
+     grad(2) = 0.5d0 * tmp * dy
+     grad(3) = 0.5d0 * tmp * dz
+     grad(1) += 0.5d0 * grad_jast(1)
+     grad(2) += 0.5d0 * grad_jast(2)
+     grad(3) += 0.5d0 * grad_jast(3)

-    if(r12 .lt. 1d-10) return
-    tmp     = 0.5d0 * (1.d0 - derf(mu_tmp)) / r12
-    grad(1) = grad(1) + tmp * dx
-    grad(2) = grad(2) + tmp * dy
-    grad(3) = grad(3) + tmp * dz

  else

@ -369,7 +457,18 @@ end
 ! ---

 subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der)
-
+ BEGIN_DOC
+! various flavours of mu(r1,r2) 
+! depends on essentially the density and other related quantities 
+!
+! change the variable "murho_type" to change type
+!
+!  murho_type == -1 :: mu(r1,r2) = (rho(r1) mu_mf(r1) + rho(r2) mu_mf(r2))/[rho(r1)+rho(r2)]
+!
+!             ==  0 :: mu(r1,r2) = (sqrt(rho(r1)) mu_mf(r1) + sqrt(rho(r2)) mu_mf(r2))/[sqrt(rho(r1))+sqrt(rho(r2))]
+!             
+!             == -2 :: mu(r1,r2) = 0.5(mu_mf(r1) + mu_mf(r2))
+ END_DOC
  implicit none
  double precision, intent(in)  :: r1(3), r2(3)
  double precision, intent(out) :: mu_val, mu_der(3)
@ -379,11 +478,50 @@ subroutine mu_r_val_and_grad(r1, r2, mu_val, mu_der)
  double precision :: rho1, grad_rho1(3),rho2,rho_tot,inv_rho_tot
  double precision :: f_rho1, f_rho2, d_drho_f_rho1
  double precision :: d_dx1_f_rho1(3),d_dx_rho_f_rho(3),nume
+  double precision :: mu_mf_r1, dm_r1, grad_mu_mf_r1(3), grad_dm_r1(3)
+  double precision :: mu_mf_r2, dm_r2, grad_mu_mf_r2(3), grad_dm_r2(3)
+
+  double precision :: num, denom, grad_denom(3), grad_num(3)
+  double precision :: dsqrt_dm_r1

  PROVIDE murho_type
  PROVIDE mu_r_ct mu_erf

-  if(murho_type .eq. 1) then
+  if(murho_type .eq. 0) then 
+   call grad_mu_of_r_mean_field(r1,mu_mf_r1, dm_r1, grad_mu_mf_r1, grad_dm_r1)
+   call grad_mu_of_r_mean_field(r2,mu_mf_r2, dm_r2, grad_mu_mf_r2, grad_dm_r2)
+   dsqrt_dm_r1 = dsqrt(dm_r1)
+   denom = (dsqrt_dm_r1 + dsqrt(dm_r2) )
+   if(denom.lt.1.d-7)then
+    mu_val = 1.d+10
+    mu_der = 0.d0
+    return
+   endif
+   num = (dsqrt(dm_r1) * mu_mf_r1 + dsqrt(dm_r2) * mu_mf_r2) 
+   mu_val = num / denom
+   grad_denom = grad_dm_r1/dsqrt_dm_r1
+   grad_num = dsqrt(dm_r1) * grad_mu_mf_r1 + mu_mf_r1 * grad_dm_r1
+   mu_der = (grad_num * denom - num * grad_denom)/(denom*denom)
+  else if(murho_type .eq. -1) then
+   call grad_mu_of_r_mean_field(r1,mu_mf_r1, dm_r1, grad_mu_mf_r1, grad_dm_r1)
+   call grad_mu_of_r_mean_field(r2,mu_mf_r2, dm_r2, grad_mu_mf_r2, grad_dm_r2)
+   denom = (dm_r1 + dm_r2 )
+   if(denom.lt.1.d-7)then
+    mu_val = 1.d+10
+    mu_der = 0.d0
+    return
+   endif
+   num = (dm_r1 * mu_mf_r1 + dm_r2 * mu_mf_r2) 
+   mu_val = num / denom
+   grad_denom = grad_dm_r1
+   grad_num = dm_r1 * grad_mu_mf_r1 + mu_mf_r1 * grad_dm_r1
+   mu_der = (grad_num * denom - num * grad_denom)/(denom*denom)
+  else if(murho_type .eq. -2) then
+   call grad_mu_of_r_mean_field(r1,mu_mf_r1, dm_r1, grad_mu_mf_r1, grad_dm_r1)
+   call grad_mu_of_r_mean_field(r2,mu_mf_r2, dm_r2, grad_mu_mf_r2, grad_dm_r2)
+   mu_val = 0.5d0 * (mu_mf_r1 + mu_mf_r2) 
+   mu_der = 0.5d0 * grad_mu_mf_r1
+  else if(murho_type .eq. 1) then

    !
    ! r = 0.5 (r1 + r2)
--- a/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
+++ b/plugins/local/non_h_ints_mu/jast_deriv_utils_vect.irp.f
@ -4,7 +4,7 @@
 subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res)

  BEGIN_DOC
-  !
+  ! 
  ! grad_1 u(r1,r2)
  !
  ! we use grid for r1 and extra_grid for r2
@ -33,8 +33,12 @@ subroutine get_grad1_u12_withsq_r1_seq(ipoint, n_grid2, resx, resy, resz, res)
  r1(2) = final_grid_points(2,ipoint)
  r1(3) = final_grid_points(3,ipoint)

-  if( (j2e_type .eq. "Mu")  .or. &
-      (j2e_type .eq. "Mur") .or. &
+  if( (j2e_type .eq. "Mu")        .or. &
+      (j2e_type .eq. "Mur")       .or. &
+      (j2e_type .eq. "Jpsi")   .or. &
+      (j2e_type .eq. "Mugauss")   .or. &
+      (j2e_type .eq. "Murgauss")  .or. &
+      (j2e_type .eq. "Bump")      .or. &
      (j2e_type .eq. "Boys") ) then

    if(env_type .eq. "None") then
@ -167,7 +171,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
  integer                       :: jpoint
  integer                       :: i_nucl, p, mpA, npA, opA
  double precision              :: r2(3)
-  double precision              :: dx, dy, dz, r12, tmp, r12_inv
+  double precision              :: dx, dy, dz, r12, tmp
  double precision              :: mu_val, mu_tmp, mu_der(3)
  double precision              :: rn(3), f1A, grad1_f1A(3), f2A, grad2_f2A(3), g12, grad1_g12(3)
  double precision              :: tmp1, tmp2
@ -181,7 +185,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
    !  d/dy1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (y1 - y2)
    !  d/dz1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12] * (z1 - z2)

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 

      r2(1) = final_grid_points_extra(1,jpoint)
      r2(2) = final_grid_points_extra(2,jpoint)
@ -191,66 +195,107 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
      dy = r1(2) - r2(2)
      dz = r1(3) - r2(3)

-      r12 = dx * dx + dy * dy + dz * dz
-
-      if(r12 .lt. 1d-20) then
-        gradx(jpoint) = 0.d0
-        grady(jpoint) = 0.d0
-        gradz(jpoint) = 0.d0
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+      if(r12 .lt. 1d-10) then
+        gradx(jpoint) = 0.d0 
+        grady(jpoint) = 0.d0 
+        gradz(jpoint) = 0.d0 
        cycle
      endif

-      r12_inv = 1.d0/dsqrt(r12)
-      r12 = r12*r12_inv
-
-      tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) * r12_inv
+      tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12

      gradx(jpoint) = tmp * dx
      grady(jpoint) = tmp * dy
      gradz(jpoint) = tmp * dz
    enddo

-  elseif(j2e_type .eq. "Mur") then
+  else if(j2e_type .eq. "Mugauss") then

-    ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 ) d/dx1 mu(r1,r2)
-    !                        + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2)
+    !  d/dx1 j(mu,r12) = 0.5 * [(1 - erf(mu * r12)) / r12 - mu*c*r12*exp(-(mu*alpha*r12)^2] * (x1 - x2)

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 

      r2(1) = final_grid_points_extra(1,jpoint)
      r2(2) = final_grid_points_extra(2,jpoint)
      r2(3) = final_grid_points_extra(3,jpoint)

-      dx  = r1(1) - r2(1)
-      dy  = r1(2) - r2(2)
-      dz  = r1(3) - r2(3)
+      dx = r1(1) - r2(1)
+      dy = r1(2) - r2(2)
+      dz = r1(3) - r2(3)

-      r12 = dx * dx + dy * dy + dz * dz
-
-      if(r12 .lt. 1d-20) then
-        gradx(jpoint) = 0.d0
-        grady(jpoint) = 0.d0
-        gradz(jpoint) = 0.d0
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+      if(r12 .lt. 1d-10) then
+        gradx(jpoint) = 0.d0 
+        grady(jpoint) = 0.d0 
+        gradz(jpoint) = 0.d0 
        cycle
      endif

-      r12_inv = 1.d0/dsqrt(r12)
-      r12 = r12*r12_inv
+      double precision :: r12_tmp
+      r12_tmp = mu_erf * r12
+      ! gradient of j(mu,r12)
+      tmp = 0.5d0 * (1.d0 - derf(r12_tmp)) / r12
+      ! gradient of gaussian additional term
+      r12_tmp *= alpha_mu_gauss
+      r12_tmp *= r12_tmp 
+      tmp += -0.5d0 * mu_erf * c_mu_gauss * r12 * dexp(-r12_tmp)/r12

-      call mu_r_val_and_grad(r1, r2, mu_val, mu_der)
+      gradx(jpoint) = tmp * dx
+      grady(jpoint) = tmp * dy
+      gradz(jpoint) = tmp * dz
+    enddo

-      mu_tmp  = mu_val * r12
-      tmp     = inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / (mu_val * mu_val)
+  elseif(j2e_type .eq. "Mur".or.j2e_type .eq. "Murgauss") then

-      gradx(jpoint) = tmp * mu_der(1)
-      grady(jpoint) = tmp * mu_der(2)
-      gradz(jpoint) = tmp * mu_der(3)
+    ! d/dx1 j(mu(r1,r2),r12) = exp(-(mu(r1,r2)*r12)**2) /(2 *sqrt(pi) * mu(r1,r2)**2 ) d/dx1 mu(r1,r2) 
+    !                        + 0.5 * (1 - erf(mu(r1,r2) *r12))/r12 * (x1 - x2)

-      tmp = 0.5d0 * (1.d0 - derf(mu_tmp)) * r12_inv
+    do jpoint = 1, n_points_extra_final_grid ! r2 

-      gradx(jpoint) = gradx(jpoint) + tmp * dx
-      grady(jpoint) = grady(jpoint) + tmp * dy
-      gradz(jpoint) = gradz(jpoint) + tmp * dz
+      r2(1) = final_grid_points_extra(1,jpoint)
+      r2(2) = final_grid_points_extra(2,jpoint)
+      r2(3) = final_grid_points_extra(3,jpoint)
+      double precision :: jast, grad_jast(3)
+      call grad_j_sum_mu_of_r(r1,r2,jast,grad_jast)
+      gradx(jpoint) = grad_jast(1)
+      grady(jpoint) = grad_jast(2)
+      gradz(jpoint) = grad_jast(3)
+    enddo
+  elseif(j2e_type .eq. "Bump") then
+
+    ! d/dx1 jbump(r1,r2)
+
+    do jpoint = 1, n_points_extra_final_grid ! r2 
+
+      r2(1) = final_grid_points_extra(1,jpoint)
+      r2(2) = final_grid_points_extra(2,jpoint)
+      r2(3) = final_grid_points_extra(3,jpoint)
+      call get_grad_j_bump_mu_of_r(r1,r2,grad_jast)
+
+      dx = r1(1) - r2(1)
+      dy = r1(2) - r2(2)
+      dz = r1(3) - r2(3)
+
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+      if(r12 .lt. 1d-10) then
+        gradx(jpoint) = 0.d0 
+        grady(jpoint) = 0.d0 
+        gradz(jpoint) = 0.d0 
+        cycle
+      endif
+
+      tmp = 0.5d0 * (1.d0 - derf(mu_erf * r12)) / r12
+
+      gradx(jpoint) = 0.5d0 * tmp * dx
+      grady(jpoint) = 0.5d0 * tmp * dy
+      gradz(jpoint) = 0.5d0 * tmp * dz
+      gradx(jpoint) += 0.5d0 * grad_jast(1)
+      grady(jpoint) += 0.5d0 * grad_jast(2)
+      gradz(jpoint) += 0.5d0 * grad_jast(3)
+!      gradx(jpoint) =  grad_jast(1)
+!      grady(jpoint) =  grad_jast(2)
+!      gradz(jpoint) =  grad_jast(3)
    enddo

  elseif(j2e_type .eq. "Boys") then
@ -264,7 +309,7 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)

    PROVIDE a_boys

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 

      r2(1) = final_grid_points_extra(1,jpoint)
      r2(2) = final_grid_points_extra(2,jpoint)
@ -273,17 +318,14 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
      dx  = r1(1) - r2(1)
      dy  = r1(2) - r2(2)
      dz  = r1(3) - r2(3)
-      r12 = dx * dx + dy * dy + dz * dz
-
+      r12 = dsqrt(dx * dx + dy * dy + dz * dz)
      if(r12 .lt. 1d-10) then
-        gradx(jpoint) = 0.d0
-        grady(jpoint) = 0.d0
-        gradz(jpoint) = 0.d0
+        gradx(jpoint) = 0.d0 
+        grady(jpoint) = 0.d0 
+        gradz(jpoint) = 0.d0 
        cycle
      endif

-      r12 = dsqrt(r12)
-
      tmp = 1.d0 + a_boys * r12
      tmp = 0.5d0 / (r12 * tmp * tmp)

@ -294,13 +336,16 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)

  elseif(j2e_type .eq. "Boys_Handy") then

-    integer :: powmax
-    powmax = max(maxval(jBH_m),maxval(jBH_n))
-
+    integer                       :: powmax1, powmax, powmax2
    double precision, allocatable :: f1A_power(:), f2A_power(:), double_p(:), g12_power(:)
-    allocate (f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))

-    do p=0,powmax
+    powmax1 = max(maxval(jBH_m), maxval(jBH_n))
+    powmax2 = maxval(jBH_o)
+    powmax  = max(powmax1, powmax2)
+
+    allocate(f1A_power(-1:powmax), f2A_power(-1:powmax), g12_power(-1:powmax), double_p(0:powmax))
+
+    do p = 0, powmax
      double_p(p) = dble(p)
    enddo

@ -318,11 +363,10 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
      r2(2) = final_grid_points_extra(2,jpoint)
      r2(3) = final_grid_points_extra(3,jpoint)

-      gradx(jpoint) = 0.d0
-      grady(jpoint) = 0.d0
-      gradz(jpoint) = 0.d0
-
-      do i_nucl = 1, nucl_num
+      gradx(jpoint) = 0.d0 
+      grady(jpoint) = 0.d0 
+      gradz(jpoint) = 0.d0 
+      do i_nucl = 1, nucl_num 

        rn(1) = nucl_coord(i_nucl,1)
        rn(2) = nucl_coord(i_nucl,2)
@ -332,61 +376,59 @@ subroutine grad1_j12_r1_seq(r1, n_grid2, gradx, grady, gradz)
        call jBH_elem_fct_grad(jBH_en(i_nucl), r2, rn, f2A, grad2_f2A)
        call jBH_elem_fct_grad(jBH_ee(i_nucl), r1, r2, g12, grad1_g12)

-
        ! Compute powers of f1A and f2A
-
-        do p = 1, maxval(jBH_m(:,i_nucl))
+        do p = 1, powmax1
          f1A_power(p) = f1A_power(p-1) * f1A
-        enddo
-
-        do p = 1, maxval(jBH_n(:,i_nucl))
          f2A_power(p) = f2A_power(p-1) * f2A
        enddo
-
-        do p = 1, maxval(jBH_o(:,i_nucl))
+        do p = 1, powmax2
          g12_power(p) = g12_power(p-1) * g12
        enddo

-
-
        do p = 1, jBH_size
          mpA = jBH_m(p,i_nucl)
          npA = jBH_n(p,i_nucl)
          opA = jBH_o(p,i_nucl)
          tmp = jBH_c(p,i_nucl)
-          if(mpA .eq. npA) then
-            tmp = tmp * 0.5d0
-          endif
-
-!TODO : Powers to optimize here
-
-!          tmp1 = 0.d0
-!          if(mpA .gt. 0) then
-!            tmp1 = tmp1 + dble(mpA) * f1A**(mpA-1) * f2A**npA
-!          endif
-!          if(npA .gt. 0) then
-!            tmp1 = tmp1 + dble(npA) * f1A**(npA-1) * f2A**mpA
-!          endif
-!          tmp1 = tmp1 * g12**(opA)
-!
-!          tmp2 = 0.d0
-!          if(opA .gt. 0) then
-!            tmp2 = tmp2 + dble(opA) * g12**(opA-1) * (f1A**(mpA) * f2A**(npA) + f1A**(npA) * f2A**(mpA))
-!          endif

          tmp1 = double_p(mpA) * f1A_power(mpA-1) * f2A_power(npA) + double_p(npA) * f1A_power(npA-1) * f2A_power(mpA)
-          tmp1 = tmp1 * g12_power(opA)
+          tmp1 = tmp1 * g12_power(opA) * tmp
+          tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA)) * tmp

-          tmp2 = double_p(opA) * g12_power(opA-1) * (f1A_power(mpA) * f2A_power(npA) + f1A_power(npA) * f2A_power(mpA))
+          !tmp1 = 0.d0
+          !if(mpA .gt. 0) then
+          !  tmp1 = tmp1 + dble(mpA) * f1A**dble(mpA-1) * f2A**dble(npA)
+          !endif
+          !if(npA .gt. 0) then
+          !  tmp1 = tmp1 + dble(npA) * f1A**dble(npA-1) * f2A**dble(mpA)
+          !endif
+          !tmp1 = tmp1 * g12**dble(opA)
+          !tmp2 = 0.d0
+          !if(opA .gt. 0) then
+          !  tmp2 = tmp2 + dble(opA) * g12**dble(opA-1) * (f1A**dble(mpA) * f2A**dble(npA) + f1A**dble(npA) * f2A**dble(mpA))
+          !endif

-
-          gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1))
-          grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2))
-          gradz(jpoint) = gradz(jpoint) + tmp * (tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3))
+!          gradx(jpoint) = gradx(jpoint) + tmp * (tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1))
+!          grady(jpoint) = grady(jpoint) + tmp * (tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2))
+!          gradz(jpoint) = gradz(jpoint) + tmp * (tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3))
+          gradx(jpoint) = gradx(jpoint) + tmp1 * grad1_f1A(1) + tmp2 * grad1_g12(1)
+          grady(jpoint) = grady(jpoint) + tmp1 * grad1_f1A(2) + tmp2 * grad1_g12(2)
+          gradz(jpoint) = gradz(jpoint) + tmp1 * grad1_f1A(3) + tmp2 * grad1_g12(3)
        enddo ! p
      enddo ! i_nucl
    enddo ! jpoint

+  elseif(j2e_type .eq. "Jpsi") then
+   double precision :: grad_j_psi_r1(3),jast_psi
+   do jpoint = 1, n_points_extra_final_grid ! r2 
+     r2(1) = final_grid_points_extra(1,jpoint)
+     r2(2) = final_grid_points_extra(2,jpoint)
+     r2(3) = final_grid_points_extra(3,jpoint)
+     call get_grad_r1_jastrow_psi(r1,r2,grad_j_psi_r1,jast_psi)
+     gradx(jpoint) = grad_j_psi_r1(1)
+     grady(jpoint) = grad_j_psi_r1(2)
+     gradz(jpoint) = grad_j_psi_r1(3)
+   enddo
  else

    print *, ' Error in grad1_j12_r1_seq: Unknown j2e_type = ', j2e_type
@ -418,10 +460,10 @@ subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz)

  integer                       :: jpoint
  double precision              :: r2(3)
-  double precision              :: dx, dy, dz, r12, r12_inv, tmp
+  double precision              :: dx, dy, dz, r12, tmp


-  do jpoint = 1, n_points_extra_final_grid ! r2
+  do jpoint = 1, n_points_extra_final_grid ! r2 

    r2(1) = final_grid_points_extra(1,jpoint)
    r2(2) = final_grid_points_extra(2,jpoint)
@ -431,19 +473,15 @@ subroutine grad1_jmu_r1_seq(mu, r1, n_grid2, gradx, grady, gradz)
    dy = r1(2) - r2(2)
    dz = r1(3) - r2(3)

-    r12 = dx * dx + dy * dy + dz * dz
-
-    if(r12 .lt. 1d-20) then
-      gradx(jpoint) = 0.d0
-      grady(jpoint) = 0.d0
-      gradz(jpoint) = 0.d0
+    r12 = dsqrt(dx * dx + dy * dy + dz * dz)
+    if(r12 .lt. 1d-10) then
+      gradx(jpoint) = 0.d0 
+      grady(jpoint) = 0.d0 
+      gradz(jpoint) = 0.d0 
      cycle
    endif

-    r12_inv = 1.d0 / dsqrt(r12)
-    r12 = r12 * r12_inv
-
-    tmp = 0.5d0 * (1.d0 - derf(mu * r12)) * r12_inv
+    tmp = 0.5d0 * (1.d0 - derf(mu * r12)) / r12

    gradx(jpoint) = tmp * dx
    grady(jpoint) = tmp * dy
@ -467,7 +505,7 @@ subroutine j12_r1_seq(r1, n_grid2, res)
  integer                       :: jpoint
  double precision              :: r2(3)
  double precision              :: dx, dy, dz
-  double precision              :: mu_tmp, r12, mu_erf_inv
+  double precision              :: mu_tmp, r12

  PROVIDE final_grid_points_extra

@ -475,21 +513,20 @@ subroutine j12_r1_seq(r1, n_grid2, res)

    PROVIDE mu_erf

-    mu_erf_inv = 1.d0 / mu_erf
-    do jpoint = 1, n_points_extra_final_grid ! r2
-
+    do jpoint = 1, n_points_extra_final_grid ! r2 
+  
      r2(1) = final_grid_points_extra(1,jpoint)
      r2(2) = final_grid_points_extra(2,jpoint)
      r2(3) = final_grid_points_extra(3,jpoint)
-
+  
      dx  = r1(1) - r2(1)
      dy  = r1(2) - r2(2)
      dz  = r1(3) - r2(3)
      r12 = dsqrt(dx * dx + dy * dy + dz * dz)

      mu_tmp = mu_erf * r12
-
-      res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) * mu_erf_inv
+  
+      res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(mu_tmp)) - inv_sq_pi_2 * dexp(-mu_tmp*mu_tmp) / mu_erf
    enddo

  elseif(j2e_type .eq. "Boys") then
@ -498,7 +535,7 @@ subroutine j12_r1_seq(r1, n_grid2, res)

    PROVIDE a_boys

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 

      r2(1) = final_grid_points_extra(1,jpoint)
      r2(2) = final_grid_points_extra(2,jpoint)
@ -540,19 +577,19 @@ subroutine jmu_r1_seq(mu, r1, n_grid2, res)

  tmp1 = inv_sq_pi_2 / mu

-  do jpoint = 1, n_points_extra_final_grid ! r2
-
+  do jpoint = 1, n_points_extra_final_grid ! r2 
+  
    r2(1) = final_grid_points_extra(1,jpoint)
    r2(2) = final_grid_points_extra(2,jpoint)
    r2(3) = final_grid_points_extra(3,jpoint)
-
+  
    dx  = r1(1) - r2(1)
    dy  = r1(2) - r2(2)
    dz  = r1(3) - r2(3)
    r12 = dsqrt(dx * dx + dy * dy + dz * dz)

    tmp2 = mu * r12
-
+  
    res(jpoint) = 0.5d0 * r12 * (1.d0 - derf(tmp2)) - tmp1 * dexp(-tmp2*tmp2)
  enddo

@ -579,7 +616,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)

    res = 1.d0

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
      r(1) = final_grid_points_extra(1,jpoint)
      r(2) = final_grid_points_extra(2,jpoint)
      r(3) = final_grid_points_extra(3,jpoint)
@ -598,7 +635,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)

    res = 1.d0

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
      r(1) = final_grid_points_extra(1,jpoint)
      r(2) = final_grid_points_extra(2,jpoint)
      r(3) = final_grid_points_extra(3,jpoint)
@ -618,7 +655,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)

    res = 1.d0

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
      r(1) = final_grid_points_extra(1,jpoint)
      r(2) = final_grid_points_extra(2,jpoint)
      r(3) = final_grid_points_extra(3,jpoint)
@ -636,7 +673,7 @@ subroutine env_nucl_r1_seq(n_grid2, res)

    res = 1.d0

-    do jpoint = 1, n_points_extra_final_grid ! r2
+    do jpoint = 1, n_points_extra_final_grid ! r2 
      r(1) = final_grid_points_extra(1,jpoint)
      r(2) = final_grid_points_extra(2,jpoint)
      r(3) = final_grid_points_extra(3,jpoint)
@ -666,7 +703,7 @@ end
 subroutine get_grad1_u12_2e_r1_seq(ipoint, n_grid2, resx, resy, resz)

  BEGIN_DOC
-  !
+  ! 
  ! grad_1 u_2e(r1,r2)
  !
  ! we use grid for r1 and extra_grid for r2
@ -695,8 +732,12 @@ subroutine get_grad1_u12_2e_r1_seq(ipoint, n_grid2, resx, resy, resz)
  r1(2) = final_grid_points(2,ipoint)
  r1(3) = final_grid_points(3,ipoint)

-  if( (j2e_type .eq. "Mu")  .or. &
-      (j2e_type .eq. "Mur") .or. &
+  if( (j2e_type .eq. "Mu")        .or. &
+      (j2e_type .eq. "Mugauss")   .or. &
+      (j2e_type .eq. "Mur")       .or. &
+      (j2e_type .eq. "Jpsi")       .or. &
+      (j2e_type .eq. "Murgauss")  .or. &
+      (j2e_type .eq. "Bump")      .or. &
      (j2e_type .eq. "Boys") ) then

    if(env_type .eq. "None") then
@ -786,7 +827,7 @@ end
 subroutine get_u12_2e_r1_seq(ipoint, n_grid2, res)

  BEGIN_DOC
-  !
+  ! 
  ! u_2e(r1,r2)
  !
  ! we use grid for r1 and extra_grid for r2
@ -813,8 +854,11 @@ subroutine get_u12_2e_r1_seq(ipoint, n_grid2, res)
  r1(2) = final_grid_points(2,ipoint)
  r1(3) = final_grid_points(3,ipoint)

-  if( (j2e_type .eq. "Mu")  .or. &
-      (j2e_type .eq. "Mur") .or. &
+  if( (j2e_type .eq. "Mu")        .or. &
+      (j2e_type .eq. "Mur")       .or. &
+      (j2e_type .eq. "Mugauss")  .or. &
+      (j2e_type .eq. "Murgauss")  .or. &
+      (j2e_type .eq. "Mugauss")   .or. &
      (j2e_type .eq. "Boys") ) then

    if(env_type .eq. "None") then
@ -893,23 +937,24 @@ subroutine jBH_elem_fct_grad(alpha, r1, r2, fct, grad1_fct)
              + (r1(2) - r2(2)) * (r1(2) - r2(2)) &
              + (r1(3) - r2(3)) * (r1(3) - r2(3)) )

-  tmp1 = 1.d0 / (1.d0 + alpha * dist)

-  fct = alpha * dist * tmp1
-
-  if(dist .lt. 1d-10) then
-    grad1_fct(1) = 0.d0
-    grad1_fct(2) = 0.d0
-    grad1_fct(3) = 0.d0
-  else
+  if(dist .ge. 1d-10) then
+    tmp1 = 1.d0 / (1.d0 + alpha * dist)
+    
+    fct = alpha * dist * tmp1
    tmp2 = alpha * tmp1 * tmp1 / dist
    grad1_fct(1) = tmp2 * (r1(1) - r2(1))
    grad1_fct(2) = tmp2 * (r1(2) - r2(2))
    grad1_fct(3) = tmp2 * (r1(3) - r2(3))
+  else
+    grad1_fct(1) = 0.d0
+    grad1_fct(2) = 0.d0
+    grad1_fct(3) = 0.d0
+    fct = 0.d0
  endif

  return
-end
+end 

 ! ---

--- a/plugins/local/non_h_ints_mu/jastrow_psi.irp.f
+++ b/plugins/local/non_h_ints_mu/jastrow_psi.irp.f
@ -0,0 +1,124 @@
+BEGIN_PROVIDER [ double precision, c_ij_ab_jastrow, (mo_num, mo_num, elec_alpha_num, elec_beta_num)]
+ implicit none
+ integer :: iunit, getUnitAndOpen
+ c_ij_ab_jastrow = 0.d0
+ iunit = getUnitAndOpen(trim(ezfio_work_dir)//'c_ij_ab', 'R')                                                     
+ read(iunit) c_ij_ab_jastrow
+ close(iunit)
+ print*,'c_ij_ab_jastrow = '
+ integer :: i,j,a,b
+ do i = 1, elec_beta_num ! r2
+  do j = 1, elec_alpha_num ! r1
+   do a = elec_beta_num+1, mo_num ! r2
+    do b = elec_alpha_num+1, mo_num ! r1 
+!     print*,b,a,j,i
+     print*,c_ij_ab_jastrow(b,a,j,i),b,a,j,i
+     if(dabs(c_ij_ab_jastrow(b,a,j,i)).lt.1.d-12)then
+      c_ij_ab_jastrow(b,a,j,i) = 0.d0
+     endif
+    enddo
+   enddo
+  enddo
+ enddo
+END_PROVIDER 
+
+double precision function jastrow_psi(r1,r2)
+ implicit none
+ double precision, intent(in) :: r1(3), r2(3)
+ integer :: i,j,a,b
+ double precision, allocatable :: mos_array_r1(:), mos_array_r2(:)
+ allocate(mos_array_r1(mo_num), mos_array_r2(mo_num))
+ call give_all_mos_at_r(r1,mos_array_r1)
+ call give_all_mos_at_r(r2,mos_array_r2)
+ double precision :: eps,coef, numerator,denominator
+ double precision :: phi_i_phi_j
+ eps = a_boys
+ jastrow_psi= 0.d0
+ do i = 1, elec_beta_num ! r1
+  do j = 1, elec_alpha_num ! r2
+   phi_i_phi_j = mos_array_r1(i) * mos_array_r2(j) + eps
+   denominator = 1.d0/phi_i_phi_j
+   do a = elec_beta_num+1, mo_num ! r1
+    do b = elec_alpha_num+1, mo_num ! r2 
+     coef = c_ij_ab_jastrow(b,a,j,i)
+     numerator = mos_array_r2(b) * mos_array_r1(a)
+     jastrow_psi += coef * numerator*denominator
+    enddo
+   enddo
+  enddo
+ enddo
+end
+
+subroutine get_grad_r1_jastrow_psi(r1,r2,grad_j_psi_r1,jast)
+ implicit none
+ double precision, intent(in) :: r1(3), r2(3)
+ double precision, intent(out):: grad_j_psi_r1(3),jast
+ integer :: i,j,a,b
+ double precision, allocatable :: mos_array_r1(:), mos_array_r2(:)
+ double precision, allocatable :: mos_grad_array_r1(:,:),mos_grad_array_r2(:,:)
+ double precision :: num_j, denom_j, num_j_grad(3), denom_j_grad(3),delta,coef
+ double precision :: inv_denom_j
+ allocate(mos_array_r1(mo_num), mos_array_r2(mo_num))
+ allocate(mos_grad_array_r1(3,mo_num), mos_grad_array_r2(3,mo_num))
+ delta = a_boys
+ call give_all_mos_and_grad_at_r(r1,mos_array_r1,mos_grad_array_r1)
+ call give_all_mos_and_grad_at_r(r2,mos_array_r2,mos_grad_array_r2)
+ grad_j_psi_r1 = 0.d0
+ jast = 0.d0
+ do i = 1, elec_beta_num ! r1
+  do j = 1, elec_alpha_num ! r2
+   call denom_jpsi(i,j,delta,mos_array_r1,mos_grad_array_r1,mos_array_r2,denom_j, denom_j_grad)
+   inv_denom_j = 1.d0/denom_j
+   do a = elec_beta_num+1, mo_num ! r1
+    do b = elec_alpha_num+1, mo_num ! r2 
+     call numerator_psi(a,b,mos_array_r1,mos_grad_array_r1,mos_array_r2,num_j, num_j_grad)
+     coef = c_ij_ab_jastrow(b,a,j,i)
+     jast += coef * num_j * inv_denom_j
+     grad_j_psi_r1 += coef * (num_j_grad * denom_j - num_j * denom_j_grad) * inv_denom_j * inv_denom_j
+    enddo
+   enddo
+  enddo
+ enddo
+ if(jast.lt.-1.d0.or.dabs(jast).gt.1.d0)then
+  print*,'pb ! '
+  print*,jast
+  print*,dsqrt(r1(1)**2+r1(2)**2+r1(3)**2),dsqrt(r2(1)**2+r2(2)**2+r2(3)**2)
+  print*,r1
+!  print*,mos_array_r1(1:2)
+  print*,r2
+!  print*,mos_array_r2(1:2)
+  stop
+ endif
+ if(log_jpsi)then
+  grad_j_psi_r1 = grad_j_psi_r1/(1.d0 + jast)
+ endif
+
+end
+
+
+subroutine denom_jpsi(i,j,delta,mos_array_r1,mos_grad_array_r1,mos_array_r2,denom, grad_denom)
+ implicit none
+ integer, intent(in)           :: i,j
+ double precision, intent(in)  :: mos_array_r1(mo_num),mos_grad_array_r1(3,mo_num),mos_array_r2(mo_num),delta
+ double precision, intent(out) :: denom, grad_denom(3)
+ double precision :: coef,phi_i_phi_j,inv_phi_i_phi_j,inv_phi_i_phi_j_2
+ phi_i_phi_j = mos_array_r1(i) * mos_array_r2(j)
+ if(phi_i_phi_j /= 0.d0)then
+  inv_phi_i_phi_j   = 1.d0/phi_i_phi_j
+  inv_phi_i_phi_j_2 = 1.d0/(phi_i_phi_j * phi_i_phi_j)
+ else
+  inv_phi_i_phi_j   = huge(1.0)
+  inv_phi_i_phi_j_2 = huge(1.d0)
+ endif
+ denom         = phi_i_phi_j + delta * inv_phi_i_phi_j
+ grad_denom(:) = (1.d0 - delta*inv_phi_i_phi_j_2) * mos_array_r2(j) * mos_grad_array_r1(:,i)
+end
+
+subroutine numerator_psi(a,b,mos_array_r1,mos_grad_array_r1,mos_array_r2,num, grad_num)
+ implicit none
+ integer, intent(in)           :: a,b
+ double precision, intent(in)  :: mos_array_r1(mo_num),mos_grad_array_r1(3,mo_num),mos_array_r2(mo_num)
+ double precision, intent(out) :: num, grad_num(3)
+ num = mos_array_r1(a) * mos_array_r2(b)
+ grad_num(:) = mos_array_r2(b) * mos_grad_array_r1(:,a)
+end
--- a/plugins/local/non_h_ints_mu/mu_of_r.irp.f
+++ b/plugins/local/non_h_ints_mu/mu_of_r.irp.f
@ -0,0 +1,43 @@
+
+subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+ implicit none
+ BEGIN_DOC
+ ! returns the value and gradients of the mu(r) mean field, together with the HF density and its gradients. 
+ END_DOC
+ include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: grad_mu_mf(3), grad_dm(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3)
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ 
+ double precision :: dist
+ call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+ 
+ dm = dm_a + dm_b
+ grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3)
+
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+  grad_mu_mf = 0.d0
+ else
+  if(mu_of_r_tc=="Erfmu")then
+   mu_mf           = 0.3333333333d0 * sqpi * (f_mf_ab/two_bod_dens + 0.25d0)
+   grad_mu_mf(1:3) = 0.3333333333d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& 
+                                    /(two_bod_dens*two_bod_dens)
+  else if(mu_of_r_tc=="Standard")then
+   mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
+   grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& 
+                                    /(two_bod_dens*two_bod_dens)
+  else if(mu_of_r_tc=="Erfmugauss")then
+   mu_mf = (f_mf_ab/two_bod_dens + 0.25d0)/c_mu_gauss_tot 
+   grad_mu_mf(1:3) = 1.d0/c_mu_gauss_tot* (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& 
+                                    /(two_bod_dens*two_bod_dens)
+  else 
+   print*,'Wrong value for mu_of_r_tc !'
+   stop
+  endif
+ endif 
+
+end
+
--- a/plugins/local/non_h_ints_mu/mu_of_r_mean_field.irp.f
+++ b/plugins/local/non_h_ints_mu/mu_of_r_mean_field.irp.f
@ -0,0 +1,166 @@
+BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
+ implicit none
+ integer :: i,j,k,l 
+ double precision :: get_two_e_integral
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map) 
+    enddo
+   enddo
+  enddo
+ enddo
+END_PROVIDER 
+
+subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ implicit none
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b
+ double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_alpha_num))
+ call give_all_mos_at_r(r,mos_array_r) 
+ do i = 1, elec_alpha_num
+  mos_array_a(i) = mos_array_r(i)
+ enddo
+ do i = 1, elec_beta_num
+  mos_array_b(i) = mos_array_r(i)
+ enddo
+
+ dm_a = 0.d0
+ do i = 1, elec_alpha_num
+  dm_a += mos_array_a(i) * mos_array_a(i) 
+ enddo
+
+ dm_b = 0.d0
+ do i = 1, elec_beta_num
+  dm_b += mos_array_b(i) * mos_array_b(i) 
+ enddo
+ two_bod_dens = dm_a * dm_b
+ 
+ f_mf_ab = 0.d0
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l)
+    enddo
+   enddo
+  enddo
+ enddo
+ ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm
+ f_mf_ab *= 2.d0 
+ two_bod_dens *= 2.d0
+
+end
+
+subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+ implicit none
+ BEGIN_DOC
+ ! gradient of mu(r) mean field, together with the gradient of the one- and two-body HF density. 
+ END_DOC
+ double precision, intent(in)  :: r(3)
+ double precision, intent(out) :: f_mf_ab, two_bod_dens
+ double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3)
+ double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3)
+
+ double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:)
+ double precision, allocatable :: mos_array_a(:), mos_array_b(:)
+ double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:)
+ double precision :: mo_i, mo_j, mo_k, mo_l
+ double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3)
+ 
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num))
+ allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
+ allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num))
+ call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r)
+ do i = 1, elec_alpha_num
+  mos_array_a(i) = mos_array_r(i)
+  mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i)
+ enddo
+ do i = 1, elec_beta_num
+  mos_array_b(i) = mos_array_r(i)
+  mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i)
+ enddo
+
+ ! ALPHA DENSITY AND GRADIENT 
+ dm_a = 0.d0
+ grad_dm_a = 0.d0
+ do i = 1, elec_alpha_num
+  dm_a += mos_array_a(i) * mos_array_a(i) 
+  grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i)  
+ enddo
+
+ ! BETA DENSITY AND GRADIENT 
+ dm_b = 0.d0
+ grad_dm_b = 0.d0
+ do i = 1, elec_beta_num
+  dm_b += mos_array_b(i) * mos_array_b(i) 
+  grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i)  
+ enddo
+ ! TWO-BODY DENSITY AND GRADIENT 
+ two_bod_dens = dm_a * dm_b
+ grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3)
+
+ ! F_MF and GRADIENT 
+ grad_f_mf_ab = 0.d0
+ f_mf_ab  = 0.d0
+ do i = 1, elec_alpha_num
+  mo_i = mos_array_a(i)
+  grad_mo_i(1:3) = mos_grad_array_a(1:3,i)
+  do j = 1, elec_beta_num
+   mo_j = mos_array_b(j)
+   grad_mo_j(1:3) = mos_grad_array_b(1:3,j)
+   do k = 1, elec_alpha_num
+    mo_k = mos_array_a(k)
+    grad_mo_k(1:3) = mos_grad_array_a(1:3,k)
+    do l = 1, elec_beta_num
+     mo_l = mos_array_b(l)
+     grad_mo_l(1:3) = mos_grad_array_b(1:3,l)
+     f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l
+     grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & 
+     (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & 
+     +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l)
+    enddo
+   enddo
+  enddo
+ enddo
+
+ f_mf_ab *= 2.d0 
+ two_bod_dens *= 2.d0
+ grad_f_mf_ab *= 2.D0
+ grad_two_bod_dens *= 2.d0
+end
+
+subroutine mu_of_r_mean_field(r,mu_mf, dm)
+ implicit none
+  include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ dm = dm_a + dm_b
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+ else
+  mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
+ endif
+end
+
+subroutine mu_of_r_mean_field_tc(r,mu_mf, dm)
+ implicit none
+  include 'constants.include.F'
+ double precision, intent(in) :: r(3)
+ double precision, intent(out):: mu_mf, dm
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ dm = dm_a + dm_b
+ if(dabs(two_bod_dens).lt.1.d-10)then
+  mu_mf = 1.d+10
+ else
+  mu_mf = 0.3333333333d0 * sqpi * (f_mf_ab/two_bod_dens + 0.25d0)
+ endif
+end
+
--- a/plugins/local/non_h_ints_mu/numerical_integ.irp.f
+++ b/plugins/local/non_h_ints_mu/numerical_integ.irp.f
@ -179,7 +179,7 @@ double precision function num_v_ij_erf_rk_cst_mu_env(i, j, ipoint)
    dx    = r1(1) - r2(1)
    dy    = r1(2) - r2(2)
    dz    = r1(3) - r2(3)
-    r12   = dsqrt( dx * dx + dy * dy + dz * dz )
+    r12   = dsqrt(dx*dx + dy*dy + dz*dz)
    if(r12 .lt. 1d-10) cycle

    tmp1  = (derf(mu_erf * r12) - 1.d0) / r12
@ -228,7 +228,7 @@ subroutine num_x_v_ij_erf_rk_cst_mu_env(i, j, ipoint, integ)
    dx    = r1(1) - r2(1)
    dy    = r1(2) - r2(2)
    dz    = r1(3) - r2(3)
-    r12   = dsqrt( dx * dx + dy * dy + dz * dz )
+    r12   = dsqrt(dx*dx + dy*dy + dz*dz)
    if(r12 .lt. 1d-10) cycle

    tmp1  = (derf(mu_erf * r12) - 1.d0) / r12
@ -530,7 +530,7 @@ subroutine num_int2_u_grad1u_total_env2(i, j, ipoint, integ)
    dx    = r1(1) - r2(1)
    dy    = r1(2) - r2(2)
    dz    = r1(3) - r2(3)
-    r12   = dsqrt( dx * dx + dy * dy + dz * dz )
+    r12   = dsqrt(dx*dx + dy*dy + dz*dz)
    if(r12 .lt. 1d-10) cycle

    tmp0 = env_nucl(r2)
--- a/plugins/local/non_h_ints_mu/plot_j_gauss.irp.f
+++ b/plugins/local/non_h_ints_mu/plot_j_gauss.irp.f
@ -0,0 +1,59 @@
+program plot_j_gauss
+ implicit none
+ double precision :: xmin, xmax, x, dx
+ double precision :: mu_min, mu_max, mu, d_mu
+ double precision :: pot_j_gauss,j_mu_simple,j_gauss_simple,pot_j_mu
+ double precision, allocatable :: mu_tab(:),j_mu(:),j_mu_gauss(:)
+ double precision, allocatable :: w_mu(:), w_mu_gauss(:)
+
+ character*(128) :: output
+ integer :: getUnitAndOpen
+ integer :: i_unit_output_wee_gauss,i_unit_output_wee_mu
+ integer :: i_unit_output_j_gauss,i_unit_output_j_mu
+ output=trim(ezfio_filename)//'.w_ee_mu_gauss'
+ i_unit_output_wee_gauss = getUnitAndOpen(output,'w')
+ output=trim(ezfio_filename)//'.w_ee_mu'
+ i_unit_output_wee_mu = getUnitAndOpen(output,'w')
+ output=trim(ezfio_filename)//'.j_mu_gauss'
+ i_unit_output_j_gauss = getUnitAndOpen(output,'w')
+ output=trim(ezfio_filename)//'.j_mu'
+ i_unit_output_j_mu = getUnitAndOpen(output,'w')
+
+ integer :: npt, i, j, n_mu
+ n_mu = 3
+ allocate(mu_tab(n_mu),j_mu(n_mu),j_mu_gauss(n_mu),w_mu(n_mu), w_mu_gauss(n_mu))
+ mu_min = 0.5d0
+ mu_max = 2.d0
+ d_mu = (mu_max - mu_min)/dble(n_mu)
+ mu = mu_min
+ do i = 1, n_mu
+  mu_tab(i) = mu
+  print*,'mu = ',mu
+  mu += d_mu
+ enddo
+ mu_tab(1) = 0.9d0
+ mu_tab(2) = 0.95d0
+ mu_tab(3) = 1.d0
+
+ xmin = 0.01d0
+ xmax = 10.d0
+ npt = 1000
+ dx = (xmax - xmin)/dble(npt)
+ x = xmin
+ do i = 1, npt
+  do j = 1, n_mu
+   mu = mu_tab(j)
+   w_mu_gauss(j) = pot_j_gauss(x,mu)
+   w_mu(j) = pot_j_mu(x,mu)
+   j_mu(j) = j_mu_simple(x,mu)
+   j_mu_gauss(j) = j_gauss_simple(x,mu) + j_mu(j)
+  enddo
+  write(i_unit_output_wee_gauss,'(100(F16.10,X))')x,w_mu_gauss(:)
+  write(i_unit_output_wee_mu,'(100(F16.10,X))')x,w_mu(:)
+  write(i_unit_output_j_gauss,'(100(F16.10,X))')x,j_mu_gauss(:)
+  write(i_unit_output_j_mu,'(100(F16.10,X))')x,j_mu(:)
+  x += dx
+ enddo
+
+
+end
--- a/plugins/local/non_h_ints_mu/plot_mo.irp.f
+++ b/plugins/local/non_h_ints_mu/plot_mo.irp.f
@ -0,0 +1,19 @@
+program plot_mo
+ implicit none
+ integer :: i,npt
+ double precision :: xmin,xmax,dx,r(3)
+ double precision,allocatable :: mos_array(:)
+ allocate(mos_array(mo_num))
+ npt = 10000
+ xmin =0.d0
+ xmax =10.d0
+ dx=(xmax-xmin)/dble(npt)
+ r=0.d0
+ r(1) = xmin
+ do i = 1, npt
+  call give_all_mos_at_r(r,mos_array)
+  write(33,'(100(F16.10,X))')r(1),mos_array(1),mos_array(2),mos_array(3)
+  r(1) += dx
+ enddo
+
+end
--- a/plugins/local/non_h_ints_mu/plot_mu_of_r.irp.f
+++ b/plugins/local/non_h_ints_mu/plot_mu_of_r.irp.f
@ -16,15 +16,16 @@ subroutine routine_print
 integer :: ipoint,nx,i
 double precision :: xmax,xmin,r(3),dx,sigma
 double precision :: mu_val, mu_der(3),dm_a,dm_b,grad,grad_dm_a(3), grad_dm_b(3)
- xmax =  5.D0
- xmin = -5.D0
+ xmax =  3.9D0
+ xmin = -3.9D0
 nx = 10000
 dx = (xmax - xmin)/dble(nx)
 r = 0.d0
 r(1) = xmin
 do ipoint = 1, nx
-  call mu_r_val_and_grad(r, r, mu_val, mu_der)
-  call density_and_grad_alpha_beta(r,dm_a,dm_b, grad_dm_a, grad_dm_b)
+!  call mu_r_val_and_grad(r, r, mu_val, mu_der)
+  call grad_mu_of_r_mean_field(r,mu_val, dm_a, mu_der, grad_dm_a)
+!  call density_and_grad_alpha_beta(r,dm_a,dm_b, grad_dm_a, grad_dm_b)
  sigma = 0.d0
  do i = 1,3
   sigma += grad_dm_a(i)**2
@ -32,7 +33,8 @@ subroutine routine_print
  sigma=dsqrt(sigma)
  grad = mu_der(1)**2 + mu_der(2)**2 + mu_der(3)**2 
  grad = dsqrt(grad)
-  write(i_unit_output,'(100(F16.7,X))')r(1),mu_val,dm_a+dm_b,grad,sigma/dm_a
+  print*,r(1),mu_val
+  write(i_unit_output,'(100(F16.7,X))')r(1),mu_val,dm_a,grad,sigma/dm_a
  r(1) += dx
 enddo
 end
--- a/plugins/local/non_h_ints_mu/pot_j_gauss.irp.f
+++ b/plugins/local/non_h_ints_mu/pot_j_gauss.irp.f
@ -0,0 +1,146 @@
+double precision function j_simple(x,mu)
+ implicit none
+ double precision, intent(in) :: x,mu
+ double precision :: j_mu_simple,j_gauss_simple
+ if(j2e_type .eq. "Mu".or.j2e_type .eq. "Mur") then
+  j_simple = j_mu_simple(x,mu)
+ else if(j2e_type .eq. "Mugauss".or.j2e_type .eq. "Murgauss") then
+  j_simple = j_gauss_simple(x,mu) + j_mu_simple(x,mu)
+ endif
+end
+
+
+double precision function j_mu_simple(x,mu)
+ implicit none
+ double precision, intent(in):: x,mu
+ include 'constants.include.F'
+ BEGIN_DOC
+! j_mu(mu,x) = 0.5 x (1 - erf(mu x)) - 1/[2 sqrt(pi)mu] exp(-(x*mu)^2)
+ END_DOC
+ j_mu_simple = 0.5d0 * x * (1.D0 - derf(mu*x)) - 0.5d0 * inv_sq_pi/mu *  dexp(-x*mu*x*mu)
+
+end
+
+double precision function j_gauss_simple(x,mu)
+ implicit none
+ double precision, intent(in):: x,mu
+ include 'constants.include.F'
+ BEGIN_DOC
+! j_mu(mu,x) = c/[4 alpha^2 mu] exp(-(alpha * mu * x)^2)
+!      with c = 27/(8 sqrt(pi)), alpha=3/2
+ END_DOC
+ double precision :: x_tmp
+ x_tmp = alpha_mu_gauss * mu * x
+ j_gauss_simple = 0.25d0 * c_mu_gauss / (alpha_mu_gauss*alpha_mu_gauss*mu) * dexp(-x_tmp*x_tmp)
+
+end
+
+double precision function j_mu_deriv(x,mu)
+ implicit none
+ BEGIN_DOC
+! d/dx j_mu(mu,x) = d/dx 0.5 x (1 - erf(mu x)) - 1/[2 sqrt(pi)mu] exp(-(x*mu)^2)
+!              = 0.5*(1 - erf(mu x))
+ END_DOC
+ include 'constants.include.F'
+ double precision, intent(in) :: x,mu
+ j_mu_deriv = 0.5d0 * (1.d0 - derf(mu*x))
+end
+
+double precision function j_mu_deriv_2(x,mu)
+ implicit none
+ BEGIN_DOC
+! d^2/dx^2 j_mu(mu,x) = d^2/dx^2 0.5 x (1 - erf(mu x)) - 1/[2 sqrt(pi)mu] exp(-(x*mu)^2)
+!                  = -mu/sqrt(pi) * exp(-(mu x)^2)
+ END_DOC
+ include 'constants.include.F'
+ double precision, intent(in) :: x,mu
+ j_mu_deriv_2 = - mu * inv_sq_pi * dexp(-x*mu*x*mu)
+end
+
+double precision function j_gauss_deriv(x,mu)
+ implicit none
+ include 'constants.include.F'
+ double precision, intent(in) :: x,mu
+ BEGIN_DOC
+! d/dx j_gauss(mu,x) = d/dx c/[4 alpha^2 mu] exp(-(alpha * mu * x)^2)
+!      with c = 27/(8 sqrt(pi)), alpha=3/2
+!                    = -0.5 * mu *  c * x * exp(-(alpha * mu * x)^2)
+ END_DOC
+ double precision :: x_tmp
+ x_tmp = alpha_mu_gauss * mu * x
+ j_gauss_deriv = -0.5d0 * mu * c_mu_gauss * x * exp(-x_tmp*x_tmp)
+end
+
+double precision function j_gauss_deriv_2(x,mu)
+ implicit none
+ include 'constants.include.F'
+ double precision, intent(in) :: x,mu
+ BEGIN_DOC
+! d/dx j_gauss(mu,x) = d/dx c/[4 alpha^2 mu] exp(-(alpha * mu * x)^2)
+!      with c = 27/(8 sqrt(pi)), alpha=3/2
+!                    = 0.5 * mu *  c * exp(-(alpha * mu * x)^2) * (2 (alpha*mu*x)^2 - 1)
+ END_DOC
+ double precision :: x_tmp
+ x_tmp = alpha_mu_gauss * mu * x
+ x_tmp = x_tmp * x_tmp
+ j_gauss_deriv_2 = 0.5d0 * mu * c_mu_gauss * exp(-x_tmp) * (2.d0*x_tmp - 1.d0)
+end
+
+double precision function j_erf_gauss_deriv(x,mu)
+ implicit none
+ double precision, intent(in) :: x,mu
+ BEGIN_DOC
+! d/dx (j_gauss(mu,x)+j_mu(mu,x)) 
+ END_DOC
+ double precision :: j_gauss_deriv,j_mu_deriv
+ j_erf_gauss_deriv = j_gauss_deriv(x,mu)+j_mu_deriv(x,mu)
+end
+
+double precision function j_erf_gauss_deriv_2(x,mu)
+ implicit none
+ double precision, intent(in) :: x,mu
+ BEGIN_DOC
+! d^2/dx^2 (j_gauss(mu,x)+j_mu(mu,x)) 
+ END_DOC
+ double precision :: j_gauss_deriv_2,j_mu_deriv_2
+ j_erf_gauss_deriv_2 = j_gauss_deriv_2(x,mu)+j_mu_deriv_2(x,mu)
+end
+
+
+double precision function pot_j_gauss(x,mu)
+ implicit none
+ double precision, intent(in) :: x,mu
+ BEGIN_DOC
+ ! effective scalar potential associated with the erf_gauss correlation factor
+ ! 
+ ! 1/x( 1 - 2 * d/dx j_erf_gauss(x,mu)) - d^2/dx^2 j_erf_gauss(x,mu)) - d/dx d/dx (j_erf_gauss(x,mu))^2
+ END_DOC
+ double precision :: j_erf_gauss_deriv_2,j_erf_gauss_deriv
+ double precision :: deriv_1, deriv_2
+ pot_j_gauss = 0.d0
+ if(x.ne.0.d0)then
+  deriv_1 = j_erf_gauss_deriv(x,mu)
+  deriv_2 = j_erf_gauss_deriv_2(x,mu)
+  pot_j_gauss = 1.d0/x * (1.d0 - 2.d0 * deriv_1) - deriv_1 * deriv_1 - deriv_2
+ endif
+
+end
+
+double precision function pot_j_mu(x,mu)
+ implicit none
+ double precision, intent(in) :: x,mu
+ BEGIN_DOC
+ ! effective scalar potential associated with the correlation factor
+ ! 
+ ! 1/x( 1 - 2 * d/dx j_erf(x,mu)) - d^2/dx^2 j_erf(x,mu)) - d/dx d/dx (j_erf(x,mu))^2
+ END_DOC
+ double precision :: j_mu_deriv_2,j_mu_deriv
+ double precision :: deriv_1, deriv_2
+ pot_j_mu = 0.d0
+ if(x.ne.0.d0)then
+  deriv_1 = j_mu_deriv(x,mu)
+  deriv_2 = j_mu_deriv_2(x,mu)
+  pot_j_mu= 1.d0/x * (1.d0 - 2.d0 * deriv_1) - deriv_1 * deriv_1 - deriv_2
+ endif
+
+end
--- a/plugins/local/non_h_ints_mu/print_jastrow_psi.irp.f
+++ b/plugins/local/non_h_ints_mu/print_jastrow_psi.irp.f
@ -0,0 +1,15 @@
+program print_j_psi
+ implicit none
+ integer :: i,j,a,b
+ do i = 1, elec_beta_num ! r2
+  do j = 1, elec_alpha_num ! r1
+   do a = elec_beta_num+1, mo_num ! r2
+    do b = elec_alpha_num+1, mo_num ! r1 
+     print*,b,a,j,i
+     print*,c_ij_ab_jastrow(b,a,j,i)
+    enddo
+   enddo
+  enddo
+ enddo
+
+end
--- a/plugins/local/non_h_ints_mu/qmckl.irp.f
+++ b/plugins/local/non_h_ints_mu/qmckl.irp.f
@ -75,3 +75,107 @@ BEGIN_PROVIDER [ integer*8, qmckl_ctx_jastrow ]
  endif

 END_PROVIDER
+
+
+ BEGIN_PROVIDER [ double precision, aos_in_r_array_qmckl, (ao_num,n_points_final_grid)]
+&BEGIN_PROVIDER [ double precision, aos_grad_in_r_array_qmckl, (ao_num,n_points_final_grid,3)]
+&BEGIN_PROVIDER [ double precision, aos_lapl_in_r_array_qmckl, (ao_num, n_points_final_grid)]
+ implicit none
+ BEGIN_DOC
+ ! AOS computed with qmckl
+ END_DOC
+ use qmckl
+
+ integer*8 :: qmckl_ctx
+ integer(qmckl_exit_code) :: rc
+
+ qmckl_ctx = qmckl_context_create()
+
+ rc = qmckl_trexio_read(qmckl_ctx, trexio_file, 1_8*len(trim(trexio_filename)))
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in read_trexio'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ rc = qmckl_set_point(qmckl_ctx, 'N', n_points_final_grid*1_8, final_grid_points, n_points_final_grid*3_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in set_electron_point'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ double precision, allocatable :: vgl(:,:,:)
+ allocate( vgl(ao_num,5,n_points_final_grid))
+ rc = qmckl_get_ao_basis_ao_vgl_inplace(qmckl_ctx, vgl, n_points_final_grid*ao_num*5_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in get_ao_vgl'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ integer :: i,k
+ do k=1,n_points_final_grid
+   do i=1,ao_num
+     aos_in_r_array_qmckl(i,k) = vgl(i,1,k)
+     aos_grad_in_r_array_qmckl(i,k,1) = vgl(i,2,k)
+     aos_grad_in_r_array_qmckl(i,k,2) = vgl(i,3,k)
+     aos_grad_in_r_array_qmckl(i,k,3) = vgl(i,4,k)
+     aos_lapl_in_r_array_qmckl(i,k) = vgl(i,5,k)
+   enddo
+ enddo
+
+END_PROVIDER
+
+
+ BEGIN_PROVIDER [ double precision, mos_in_r_array_qmckl, (mo_num,n_points_final_grid)]
+&BEGIN_PROVIDER [ double precision, mos_grad_in_r_array_qmckl, (mo_num,n_points_final_grid,3)]
+&BEGIN_PROVIDER [ double precision, mos_lapl_in_r_array_qmckl, (mo_num, n_points_final_grid)]
+ implicit none
+ BEGIN_DOC
+ ! moS computed with qmckl
+ END_DOC
+ use qmckl
+
+ integer*8 :: qmckl_ctx
+ integer(qmckl_exit_code) :: rc
+
+ qmckl_ctx = qmckl_context_create()
+
+ rc = qmckl_trexio_read(qmckl_ctx, trexio_file, 1_8*len(trim(trexio_filename)))
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in read_trexio'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ rc = qmckl_set_point(qmckl_ctx, 'N', n_points_final_grid*1_8, final_grid_points, n_points_final_grid*3_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in set_electron_point'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ double precision, allocatable :: vgl(:,:,:)
+ allocate( vgl(mo_num,5,n_points_final_grid))
+ rc = qmckl_get_mo_basis_mo_vgl(qmckl_ctx, vgl, n_points_final_grid*mo_num*5_8)
+ if (rc /= QMCKL_SUCCESS) then
+   print *, irp_here, 'qmckl error in get_mo_vgl'
+   rc = qmckl_check(qmckl_ctx, rc)
+   stop -1
+ endif
+
+ integer :: i,k
+ do k=1,n_points_final_grid
+   do i=1,mo_num
+     mos_in_r_array_qmckl(i,k) = vgl(i,1,k)
+     mos_grad_in_r_array_qmckl(i,k,1) = vgl(i,2,k)
+     mos_grad_in_r_array_qmckl(i,k,2) = vgl(i,3,k)
+     mos_grad_in_r_array_qmckl(i,k,3) = vgl(i,4,k)
+     mos_lapl_in_r_array_qmckl(i,k) = vgl(i,5,k)
+   enddo
+ enddo
+
+END_PROVIDER
+
+
--- a/plugins/local/non_h_ints_mu/tc_integ.irp.f
+++ b/plugins/local/non_h_ints_mu/tc_integ.irp.f
@ -44,14 +44,92 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
    elseif(tc_integ_type .eq. "numeric") then

      print *, ' Numerical integration over r1 and r2 will be performed'
-  
-      ! TODO combine 1shot & int2_grad1_u12_ao_num

-      PROVIDE int2_grad1_u12_ao_num
-      int2_grad1_u12_ao = int2_grad1_u12_ao_num
+      if(tc_save_mem) then

-      !PROVIDE int2_grad1_u12_ao_num_1shot
-      !int2_grad1_u12_ao = int2_grad1_u12_ao_num_1shot
+        integer                       :: n_blocks, n_rest, n_pass
+        integer                       :: i_blocks, i_rest, i_pass, ii
+        double precision              :: mem, n_double
+        double precision, allocatable :: tmp(:,:,:), xx(:)
+        double precision, allocatable :: tmp_grad1_u12(:,:,:)
+
+        PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+
+        allocate(tmp(n_points_extra_final_grid,ao_num,ao_num), xx(n_points_extra_final_grid))
+        !$OMP PARALLEL               &
+        !$OMP DEFAULT (NONE)         &
+        !$OMP PRIVATE (j, i, jpoint) &
+        !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+        !$OMP DO COLLAPSE(2)
+        do j = 1, ao_num
+          do i = 1, ao_num
+            do jpoint = 1, n_points_extra_final_grid
+              tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+            enddo
+          enddo
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        call total_memory(mem)
+        mem      = max(1.d0, qp_max_mem - mem)
+        n_double = mem * 1.d8
+        n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+        n_rest   = int(mod(n_points_final_grid, n_blocks))
+        n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+        call write_int(6, n_pass, 'Number of passes')
+        call write_int(6, n_blocks, 'Size of the blocks')
+        call write_int(6, n_rest, 'Size of the last block')
+        allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,3))
+        do i_pass = 1, n_pass
+          ii = (i_pass-1)*n_blocks + 1
+          !$OMP PARALLEL                   &
+          !$OMP DEFAULT (NONE)             &
+          !$OMP PRIVATE (i_blocks, ipoint) &
+          !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, xx, tmp_grad1_u12)
+          !$OMP DO 
+          do i_blocks = 1, n_blocks
+            ipoint = ii - 1 + i_blocks ! r1
+            call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), xx(1))
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+          do m = 1, 3
+            call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0                     &
+                      , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                      , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+          enddo
+        enddo
+        deallocate(tmp_grad1_u12)
+        if(n_rest .gt. 0) then
+          allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,3))
+          ii = n_pass*n_blocks + 1
+          !$OMP PARALLEL                 &
+          !$OMP DEFAULT (NONE)           &
+          !$OMP PRIVATE (i_rest, ipoint) &
+          !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, xx, tmp_grad1_u12)
+          !$OMP DO 
+          do i_rest = 1, n_rest
+            ipoint = ii - 1 + i_rest ! r1
+            call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), xx(1))
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+          do m = 1, 3
+            call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0                       &
+                      , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
+                      , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
+          enddo
+          deallocate(tmp_grad1_u12)
+        endif
+        deallocate(tmp,xx)
+
+      else
+        ! TODO combine 1shot & int2_grad1_u12_ao_num
+        PROVIDE int2_grad1_u12_ao_num
+        int2_grad1_u12_ao = int2_grad1_u12_ao_num
+!         PROVIDE int2_grad1_u12_ao_num_1shot
+!         int2_grad1_u12_ao = int2_grad1_u12_ao_num_1shot
+      endif

    elseif(tc_integ_type .eq. "semi-analytic") then

@ -126,7 +204,7 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_ao, (ao_num, ao_num, n_points_f
    print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'

    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
-    call ezfio_set_work_empty(.False.)
+      call ezfio_set_work_empty(.False.)
      write(11) int2_grad1_u12_ao
    close(11)
    call ezfio_set_tc_keywords_io_tc_integ('Read')
@ -177,13 +255,88 @@ BEGIN_PROVIDER [double precision, int2_grad1_u12_square_ao, (ao_num, ao_num, n_p

    print *, ' Numerical integration over r1 and r2 will be performed'
  
-    ! TODO combine 1shot & int2_grad1_u12_square_ao_num
+    if(tc_save_mem) then

-    PROVIDE int2_grad1_u12_square_ao_num
-    int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num
+      integer                       :: n_blocks, n_rest, n_pass
+      integer                       :: i_blocks, i_rest, i_pass, ii
+      double precision              :: mem, n_double
+      double precision, allocatable :: tmp(:,:,:), xx(:,:,:)
+      double precision, allocatable :: tmp_grad1_u12_squared(:,:)

-    !PROVIDE int2_grad1_u12_square_ao_num_1shot
-    !int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num_1shot
+      PROVIDE final_weight_at_r_vector_extra aos_in_r_array_extra
+
+      allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
+      !$OMP PARALLEL               &
+      !$OMP DEFAULT (NONE)         &
+      !$OMP PRIVATE (j, i, jpoint) &
+      !$OMP SHARED (tmp, ao_num, n_points_extra_final_grid, final_weight_at_r_vector_extra, aos_in_r_array_extra_transp)
+      !$OMP DO COLLAPSE(2)
+      do j = 1, ao_num
+        do i = 1, ao_num
+          do jpoint = 1, n_points_extra_final_grid
+            tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      call total_memory(mem)
+      mem      = max(1.d0, qp_max_mem - mem)
+      n_double = mem * 1.d8
+      n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
+      n_rest   = int(mod(n_points_final_grid, n_blocks))
+      n_pass   = int((n_points_final_grid - n_rest) / n_blocks)
+      call write_int(6, n_pass, 'Number of passes')
+      call write_int(6, n_blocks, 'Size of the blocks')
+      call write_int(6, n_rest, 'Size of the last block')
+      allocate(tmp_grad1_u12_squared(n_points_extra_final_grid,n_blocks), xx(n_points_extra_final_grid,n_blocks,3))
+      do i_pass = 1, n_pass
+        ii = (i_pass-1)*n_blocks + 1
+        !$OMP PARALLEL                   &
+        !$OMP DEFAULT (NONE)             &
+        !$OMP PRIVATE (i_blocks, ipoint) &
+        !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, xx, final_grid_points, tmp_grad1_u12_squared)
+        !$OMP DO 
+        do i_blocks = 1, n_blocks
+          ipoint = ii - 1 + i_blocks ! r1
+          call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, xx(1,i_blocks,1), xx(1,i_blocks,2), xx(1,i_blocks,3), tmp_grad1_u12_squared(1,i_blocks))
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, -0.5d0                         &
+                  , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12_squared(1,1), n_points_extra_final_grid &
+                  , 0.d0, int2_grad1_u12_square_ao(1,1,ii), ao_num*ao_num)
+      enddo
+      deallocate(tmp_grad1_u12_squared, xx)
+      if(n_rest .gt. 0) then
+        ii = n_pass*n_blocks + 1
+        allocate(tmp_grad1_u12_squared(n_points_extra_final_grid,n_rest), xx(n_points_extra_final_grid,n_rest,3))
+        !$OMP PARALLEL                 &
+        !$OMP DEFAULT (NONE)           &
+        !$OMP PRIVATE (i_rest, ipoint) &
+        !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, xx, final_grid_points, tmp_grad1_u12_squared)
+        !$OMP DO 
+        do i_rest = 1, n_rest
+          ipoint = ii - 1 + i_rest ! r1
+          call get_grad1_u12_withsq_r1_seq(ipoint, n_points_extra_final_grid, xx(1,i_rest,1), xx(1,i_rest,2), xx(1,i_rest,3), tmp_grad1_u12_squared(1,i_rest))
+        enddo
+        !$OMP END DO
+        !$OMP END PARALLEL
+        call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, -0.5d0                           &
+                  , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12_squared(1,1), n_points_extra_final_grid &
+                  , 0.d0, int2_grad1_u12_square_ao(1,1,ii), ao_num*ao_num)
+        deallocate(tmp_grad1_u12_squared, xx)
+      endif
+      deallocate(tmp)
+
+    else
+
+      ! TODO combine 1shot & int2_grad1_u12_square_ao_num
+      PROVIDE int2_grad1_u12_square_ao_num
+      int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num
+      !PROVIDE int2_grad1_u12_square_ao_num_1shot
+      !int2_grad1_u12_square_ao = int2_grad1_u12_square_ao_num_1shot
+    endif

  elseif(tc_integ_type .eq. "semi-analytic") then

--- a/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
+++ b/plugins/local/non_h_ints_mu/tc_integ_num.irp.f
@ -63,12 +63,10 @@
  do i_pass = 1, n_pass
    ii = (i_pass-1)*n_blocks + 1
  
-    !$OMP PARALLEL                                         &
-    !$OMP DEFAULT (NONE)                                   &
-    !$OMP PRIVATE (i_blocks, ipoint)                       &
-    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, &
-    !$OMP         final_grid_points, tmp_grad1_u12,        &
-    !$OMP         tmp_grad1_u12_squared)
+    !$OMP PARALLEL                   &
+    !$OMP DEFAULT (NONE)             &
+    !$OMP PRIVATE (i_blocks, ipoint) &
+    !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12, tmp_grad1_u12_squared)
    !$OMP DO 
    do i_blocks = 1, n_blocks
      ipoint = ii - 1 + i_blocks ! r1
@ -99,12 +97,10 @@
  
    ii = n_pass*n_blocks + 1

-    !$OMP PARALLEL                                       &
-    !$OMP DEFAULT (NONE)                                 &
-    !$OMP PRIVATE (i_rest, ipoint)                       &
-    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, &
-    !$OMP         final_grid_points, tmp_grad1_u12,      &
-    !$OMP         tmp_grad1_u12_squared)
+    !$OMP PARALLEL                 &
+    !$OMP DEFAULT (NONE)           &
+    !$OMP PRIVATE (i_rest, ipoint) &
+    !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12, tmp_grad1_u12_squared)
    !$OMP DO 
    do i_rest = 1, n_rest
      ipoint = ii - 1 + i_rest ! r1
@ -131,7 +127,7 @@
  deallocate(tmp)

  call wall_time(time1)
-  print*, ' wall time for int2_grad1_u12_ao_num & int2_grad1_u12_square_ao_num =', time1-time0 
+  print*, ' wall time for int2_grad1_u12_ao_num & int2_grad1_u12_square_ao_num = (min)', (time1-time0) / 60.d0
  call print_memory_usage()

 END_PROVIDER
--- a/plugins/local/non_h_ints_mu/test_mu_of_r_tc.irp.f
+++ b/plugins/local/non_h_ints_mu/test_mu_of_r_tc.irp.f
@ -0,0 +1,157 @@
+program test_mu_of_r_tc
+  implicit none
+  BEGIN_DOC
+! TODO
+  END_DOC
+  ! You specify that you want to avoid any contribution from 
+  ! orbitals coming from core 
+ call test_grad_f_mean_field
+ call test_grad_mu_mf
+ call plot_mu_of_r_mf
+end
+
+
+subroutine test_grad_f_mean_field
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3)
+ double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3)
+ double precision :: grad_dm_a(3), grad_dm_b(3)
+ double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
+
+ double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3)
+ double precision :: num_grad_dm_a(3), num_grad_dm_b(3)
+ double precision :: f_mf_ab_p,f_mf_ab_m
+ double precision :: two_bod_dens_p, two_bod_dens_m
+ double precision :: dm_a_p, dm_a_m
+ double precision :: dm_b_p, dm_b_m
+ double precision :: rbis(3), dr
+ double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3)
+ double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3)
+ double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b
+ dr = 0.00001d0
+ accu_f_mf_ab = 0.d0 
+ accu_two_bod_dens = 0.d0 
+ accu_dm_a = 0.d0 
+ accu_dm_b = 0.d0
+
+ accu_grad_f_mf_ab = 0.d0
+ accu_grad_two_bod_dens = 0.d0
+ accu_grad_dm_a = 0.d0
+ accu_grad_dm_b = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
+  call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
+  accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p)
+  accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p)
+  accu_dm_a += weight*dabs(dm_a - dm_a_p)
+  accu_dm_b += weight*dabs(dm_b - dm_b_p)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
+   rbis = r
+   rbis(k) -= dr
+   call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m)
+   num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr)
+   num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr)
+   num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr)
+   num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k))
+   accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k))
+   accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k))
+   accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k))
+  enddo
+ enddo
+ print*,'accu_f_mf_ab = ',accu_f_mf_ab
+ print*,'accu_two_bod_dens = ',accu_two_bod_dens
+ print*,'accu_dm_a = ',accu_dm_a
+ print*,'accu_dm_b = ',accu_dm_b
+ print*,'accu_grad_f_mf_ab = '
+ print*,accu_grad_f_mf_ab
+ print*,'accu_grad_two_bod_dens = '
+ print*,accu_grad_two_bod_dens
+ print*,'accu_dm_a = '
+ print*,accu_grad_dm_a
+ print*,'accu_dm_b = '
+ print*,accu_grad_dm_b
+
+end
+
+subroutine test_grad_mu_mf
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3),rbis(3)
+ double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3)
+ double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3)
+ double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3)
+ dr = 0.00001d0
+ accu_grad_mu_mf = 0.d0
+ accu_mu = 0.d0
+ accu_grad_dm = 0.d0
+ accu_dm = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3)   = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+  call mu_of_r_mean_field(r,mu_mf_p, dm_p)
+  accu_mu += weight*dabs(mu_mf_p - mu_mf)
+  accu_dm += weight*dabs(dm_p - dm)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call mu_of_r_mean_field(rbis,mu_mf_p, dm_p)
+   rbis = r
+   rbis(k) -= dr
+   call mu_of_r_mean_field(rbis,mu_mf_m, dm_m)
+
+   num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr)
+   num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k))
+   accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k))
+  enddo
+ enddo
+ print*,'accu_mu = ',accu_mu
+ print*,'accu_dm = ',accu_dm
+ print*,'accu_grad_dm = '
+ print*, accu_grad_dm
+ print*,'accu_grad_mu_mf = '
+ print*, accu_grad_mu_mf
+
+end
+
+subroutine plot_mu_of_r_mf
+ implicit none
+  include 'constants.include.F'
+ integer :: ipoint,npoint
+ double precision :: dx,r(3),xmax,xmin
+ double precision :: accu_mu,accu_nelec,mu_mf, dm,mu_mf_tc
+ character*(128) :: output
+ integer :: i_unit_output,getUnitAndOpen
+ output=trim(ezfio_filename)//'.mu_mf'
+ i_unit_output = getUnitAndOpen(output,'w')
+ xmax = 5.D0
+ xmin = 0.d0
+ npoint = 10000
+ dx = (xmax - xmin)/dble(npoint)
+ r = 0.d0
+ r(1) = xmin
+ accu_mu = 0.d0
+ accu_nelec = 0.d0
+ do ipoint = 1, npoint
+  call mu_of_r_mean_field(r,mu_mf, dm)
+  call mu_of_r_mean_field_tc(r,mu_mf_tc, dm)
+  write(i_unit_output,'(100(F16.10,X))')r(1),mu_mf,mu_mf_tc,dm
+  accu_mu    += mu_mf * dm * r(1)**2*dx*4.D0*pi
+  accu_nelec +=         dm * r(1)**2*dx*4.D0*pi
+  r(1) += dx
+ enddo
+ print*,'nelec      = ',accu_nelec
+ print*,'mu average = ',accu_mu/accu_nelec
+end
--- a/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
+++ b/plugins/local/non_h_ints_mu/test_non_h_ints.irp.f
@ -1125,6 +1125,7 @@ subroutine test_fit_coef_A1()
  double precision              :: accu, norm, diff
  double precision, allocatable :: A1(:,:)
  double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:)
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)

  ! ---

@ -1165,16 +1166,17 @@ subroutine test_fit_coef_A1()

  call wall_time(t1)

-  allocate(tmp(ao_num,ao_num,n_points_final_grid))
+  allocate(tmp1(ao_num,ao_num,n_points_final_grid), tmp2(ao_num,ao_num,n_points_final_grid))
  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (i, j, ipoint) &
-  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp)
+  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2)
  !$OMP DO COLLAPSE(2)
  do j = 1, ao_num
    do i = 1, ao_num
      do ipoint = 1, n_points_final_grid
-        tmp(i,j,ipoint) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp1(i,j,ipoint) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp2(i,j,ipoint) =                                    aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
      enddo
    enddo
  enddo
@ -1184,9 +1186,9 @@ subroutine test_fit_coef_A1()
  allocate(A2(ao_num,ao_num,ao_num,ao_num))

  call dgemm( "N", "T", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-            , tmp(1,1,1), ao_num*ao_num, tmp(1,1,1), ao_num*ao_num              &
+            , tmp1(1,1,1), ao_num*ao_num, tmp2(1,1,1), ao_num*ao_num            &
            , 0.d0, A2(1,1,1,1), ao_num*ao_num)
-  deallocate(tmp)
+  deallocate(tmp1, tmp2)

  call wall_time(t2)
  print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0
@ -1238,6 +1240,7 @@ subroutine test_fit_coef_inv()
  double precision, allocatable :: A1(:,:), A1_inv(:,:), A1_tmp(:,:)
  double precision, allocatable :: A2(:,:,:,:), tmp(:,:,:), A2_inv(:,:,:,:)
  double precision, allocatable :: U(:,:), D(:), Vt(:,:), work(:), A2_tmp(:,:,:,:)
+  double precision, allocatable :: tmp1(:,:,:), tmp2(:,:,:)

  cutoff_svd = 5d-8

@ -1286,16 +1289,17 @@ subroutine test_fit_coef_inv()

  call wall_time(t1)

-  allocate(tmp(n_points_final_grid,ao_num,ao_num))
+  allocate(tmp1(n_points_final_grid,ao_num,ao_num), tmp2(n_points_final_grid,ao_num,ao_num))
  !$OMP PARALLEL               &
  !$OMP DEFAULT (NONE)         &
  !$OMP PRIVATE (i, j, ipoint) &
-  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp)
+  !$OMP SHARED (n_points_final_grid, ao_num, final_weight_at_r_vector, aos_in_r_array_transp, tmp1, tmp2)
  !$OMP DO COLLAPSE(2)
  do j = 1, ao_num
    do i = 1, ao_num
      do ipoint = 1, n_points_final_grid
-        tmp(ipoint,i,j) = dsqrt(final_weight_at_r_vector(ipoint)) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp1(ipoint,i,j) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
+        tmp2(ipoint,i,j) =                                    aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,j)
      enddo
    enddo
  enddo
@ -1304,11 +1308,11 @@ subroutine test_fit_coef_inv()

  allocate(A2(ao_num,ao_num,ao_num,ao_num))

-  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
-            , tmp(1,1,1), n_points_final_grid, tmp(1,1,1), n_points_final_grid  &
+  call dgemm( "T", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0  &
+            , tmp1(1,1,1), n_points_final_grid, tmp2(1,1,1), n_points_final_grid &
            , 0.d0, A2(1,1,1,1), ao_num*ao_num)

-  deallocate(tmp)
+  deallocate(tmp1, tmp2)

  call wall_time(t2)
  print*, ' WALL TIME FOR A2 (min) =', (t2-t1)/60.d0
--- a/plugins/local/non_h_ints_mu/total_tc_int.irp.f
+++ b/plugins/local/non_h_ints_mu/total_tc_int.irp.f
@ -33,8 +33,10 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
  double precision              :: weight1, ao_k_r, ao_i_r
  double precision              :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
  double precision              :: time0, time1
-  double precision, allocatable :: b_mat(:,:,:,:), c_mat(:,:,:)
+  double precision, allocatable :: c_mat(:,:,:)
+  logical,          external    :: ao_two_e_integral_zero
  double precision, external    :: get_ao_two_e_integral
+  double precision, external    :: ao_two_e_integral

  PROVIDe tc_integ_type
  PROVIDE env_type
@ -53,7 +55,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
    print*, ' Reading ao_two_e_tc_tot from ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'

    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="read")
-      read(11) ao_two_e_tc_tot
+    do i = 1, ao_num
+      read(11) ao_two_e_tc_tot(:,:,:,i)
+    enddo
    close(11)

  else
@ -65,27 +69,59 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n

    PROVIDE int2_grad1_u12_square_ao

-    allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+    if(tc_save_mem_loops) then

-    !$OMP PARALLEL               &
-    !$OMP DEFAULT (NONE)         &
-    !$OMP PRIVATE (i, k, ipoint) &
-    !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
-    !$OMP DO SCHEDULE (static)
-    do i = 1, ao_num
-      do k = 1, ao_num
-        do ipoint = 1, n_points_final_grid
-          c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+      print*, ' LOOPS are used to evaluate Hermitian part of ao_two_e_tc_tot ...'
+
+      !$OMP PARALLEL                                              &
+      !$OMP DEFAULT (NONE)                                        &
+      !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1) &
+      !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot, &
+      !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, int2_grad1_u12_square_ao)
+      !$OMP DO COLLAPSE(3)
+      do i = 1, ao_num
+        do k = 1, ao_num
+          do l = 1, ao_num
+            do j = 1, ao_num
+              ao_two_e_tc_tot(j,l,k,i) = 0.d0
+              do ipoint = 1, n_points_final_grid
+                weight1 = final_weight_at_r_vector(ipoint)
+                ao_i_r = aos_in_r_array_transp(ipoint,i)
+                ao_k_r = aos_in_r_array_transp(ipoint,k)
+                ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) + int2_grad1_u12_square_ao(j,l,ipoint) * weight1 * ao_i_r * ao_k_r
+              enddo
+            enddo
+          enddo
        enddo
      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
+      !$OMP END DO
+      !$OMP END PARALLEL

-    call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0                 &
-              , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
-              , 0.d0, ao_two_e_tc_tot, ao_num*ao_num)
+    else

+      print*, ' DGEMM are used to evaluate Hermitian part of ao_two_e_tc_tot ...'
+
+      allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+      !$OMP PARALLEL               &
+      !$OMP DEFAULT (NONE)         &
+      !$OMP PRIVATE (i, k, ipoint) &
+      !$OMP SHARED (aos_in_r_array_transp, c_mat, ao_num, n_points_final_grid, final_weight_at_r_vector)
+      !$OMP DO SCHEDULE (static)
+      do i = 1, ao_num
+        do k = 1, ao_num
+          do ipoint = 1, n_points_final_grid
+            c_mat(ipoint,k,i) = final_weight_at_r_vector(ipoint) * aos_in_r_array_transp(ipoint,i) * aos_in_r_array_transp(ipoint,k)
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+      call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0                 &
+                , int2_grad1_u12_square_ao(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+                , 0.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
+      deallocate(c_mat)
+    endif
+  
    FREE int2_grad1_u12_square_ao

    if( (tc_integ_type .eq. "semi-analytic")                            .and. &
@ -96,6 +132,7 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
      ! an additional term is added here directly instead of 
      ! being added in int2_grad1_u12_square_ao for performance

+      allocate(c_mat(n_points_final_grid,ao_num,ao_num))
      PROVIDE int2_u2_env2

      !$OMP PARALLEL                                                                                     &
@ -127,10 +164,13 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
                , int2_u2_env2(1,1,1), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
                , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)

+      deallocate(c_mat)
      FREE int2_u2_env2
    endif ! use_ipp

-    deallocate(c_mat)
+    call wall_time(time1)
+    print*, ' done with Hermitian part after (min) ', (time1 - time0) / 60.d0
+    call print_memory_usage()

    ! ---

@ -138,39 +178,71 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n

      PROVIDE int2_grad1_u12_ao

-      allocate(b_mat(n_points_final_grid,ao_num,ao_num,3))
+      if(tc_save_mem_loops) then

-      !$OMP PARALLEL                                                              &
-      !$OMP DEFAULT (NONE)                                                        &
-      !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
-      !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, b_mat, & 
-      !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector)
-      !$OMP DO SCHEDULE (static)
-      do i = 1, ao_num
-        do k = 1, ao_num
-          do ipoint = 1, n_points_final_grid
+        print*, ' LOOPS are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...'

-            weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
-            ao_i_r  = aos_in_r_array_transp(ipoint,i)
-            ao_k_r  = aos_in_r_array_transp(ipoint,k)
-
-            b_mat(ipoint,k,i,1) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1))
-            b_mat(ipoint,k,i,2) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2))
-            b_mat(ipoint,k,i,3) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+        !$OMP PARALLEL                                                 &
+        !$OMP DEFAULT (NONE)                                           &
+        !$OMP PRIVATE (i, j, k, l, ipoint, ao_i_r, ao_k_r, weight1)    &
+        !$OMP SHARED (ao_num, n_points_final_grid, ao_two_e_tc_tot,    &
+        !$OMP         aos_in_r_array_transp, final_weight_at_r_vector, &
+        !$OMP         int2_grad1_u12_ao, aos_grad_in_r_array_transp_bis)
+        !$OMP DO COLLAPSE(3)
+        do i = 1, ao_num
+          do k = 1, ao_num
+            do l = 1, ao_num
+              do j = 1, ao_num
+                do ipoint = 1, n_points_final_grid
+                  weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+                  ao_i_r  = aos_in_r_array_transp(ipoint,i)
+                  ao_k_r  = aos_in_r_array_transp(ipoint,k)
+                  ao_two_e_tc_tot(j,l,k,i) = ao_two_e_tc_tot(j,l,k,i) &
+                                           - weight1 * int2_grad1_u12_ao(j,l,ipoint,1) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,1) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,1)) &
+                                           - weight1 * int2_grad1_u12_ao(j,l,ipoint,2) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,2) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,2)) &
+                                           - weight1 * int2_grad1_u12_ao(j,l,ipoint,3) * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,3) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,3))
+                enddo
+              enddo
+            enddo
          enddo
        enddo
-      enddo
-      !$OMP END DO
-      !$OMP END PARALLEL
+        !$OMP END DO
+        !$OMP END PARALLEL

-      do m = 1, 3
-        call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0             &
-                  , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, b_mat(1,1,1,m), n_points_final_grid &
-                  , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
-      enddo
-      deallocate(b_mat)
+      else

-      FREE int2_grad1_u12_ao
+        print*, ' DGEMM are used to evaluate non-Hermitian part of ao_two_e_tc_tot ...'
+
+        allocate(c_mat(n_points_final_grid,ao_num,ao_num))
+        do m = 1, 3
+          !$OMP PARALLEL                                                              &
+          !$OMP DEFAULT (NONE)                                                        &
+          !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r)                       & 
+          !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, & 
+          !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
+          !$OMP DO SCHEDULE (static)
+          do i = 1, ao_num
+            do k = 1, ao_num
+              do ipoint = 1, n_points_final_grid
+  
+                weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
+                ao_i_r  = aos_in_r_array_transp(ipoint,i)
+                ao_k_r  = aos_in_r_array_transp(ipoint,k)
+  
+                c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
+              enddo
+            enddo
+          enddo
+          !$OMP END DO
+          !$OMP END PARALLEL
+
+          call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0           &
+                    , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+                    , 1.d0, ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)
+        enddo
+        deallocate(c_mat)
+
+      end if

      if(tc_integ_type .eq. "semi-analytic") then 
        FREE int2_grad1_u2e_ao
@ -178,30 +250,72 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n

    endif ! var_tc

+    call wall_time(time1)
+    print*, ' done with non-Hermitian part after (min) ', (time1 - time0) / 60.d0
+    call print_memory_usage()
+
    ! ---

    call sum_A_At(ao_two_e_tc_tot(1,1,1,1), ao_num*ao_num)

-    PROVIDE ao_integrals_map
+    ! ---
+  
+    logical          :: integ_zero
+    double precision :: integ_val

-    !$OMP PARALLEL DEFAULT(NONE)                            &
-    !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
-    !$OMP PRIVATE(i, j, k, l)
-    !$OMP DO
-    do j = 1, ao_num
-      do l = 1, ao_num
-        do i = 1, ao_num
-          do k = 1, ao_num
-            !                                                     < 1:i, 2:j | 1:k, 2:l > 
-            ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+    print*, ' adding ERI to ao_two_e_tc_tot ...'
+
+    if(tc_save_mem) then
+      print*, ' ao_integrals_map will not be used'
+      !$OMP PARALLEL DEFAULT(NONE)                     &
+      !$OMP PRIVATE(i, j, k, l, integ_zero, integ_val) & 
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot)
+      !$OMP DO COLLAPSE(3)
+      do j = 1, ao_num
+        do l = 1, ao_num
+          do i = 1, ao_num
+            do k = 1, ao_num
+              integ_zero = ao_two_e_integral_zero(i,j,k,l)
+              if(.not. integ_zero) then
+                            ! i,k : r1    j,l : r2
+                integ_val = ao_two_e_integral(i,k,j,l)
+                ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + integ_val
+              endif
+            enddo
          enddo
        enddo
      enddo
-    enddo
-    !$OMP END DO
-    !$OMP END PARALLEL
+      !$OMP END DO
+      !$OMP END PARALLEL
+    else
+!      print*, ' ao_integrals_map will be used'
+!      PROVIDE ao_integrals_map
+      print*,'Cholesky vectors will be used '
+      double precision :: get_ao_integ_chol,eri
+      eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP 
+      !$OMP PARALLEL DEFAULT(NONE)                            &
+!!!    !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
+      !$OMP SHARED(ao_num, ao_two_e_tc_tot) &
+      !$OMP PRIVATE(i, j, k, l,eri)
+      !$OMP DO COLLAPSE(3)
+      do j = 1, ao_num
+        do l = 1, ao_num
+          do i = 1, ao_num
+            do k = 1, ao_num
+              !                                                     < 1:i, 2:j | 1:k, 2:l > 
+!              eri =  get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
+               eri = get_ao_integ_chol(i,k,j,l)
+              ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri
+            enddo
+          enddo
+        enddo
+      enddo
+      !$OMP END DO
+      !$OMP END PARALLEL
+!      FREE ao_integrals_map
+    endif

-    if(tc_integ_type .eq. "numeric") then
+    if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then
      FREE int2_grad1_u12_ao_num int2_grad1_u12_square_ao_num
    endif

@ -211,7 +325,9 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
    print*, ' Saving ao_two_e_tc_tot in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
    open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/ao_two_e_tc_tot', action="write")
    call ezfio_set_work_empty(.False.)
-      write(11) ao_two_e_tc_tot
+    do i = 1, ao_num
+      write(11) ao_two_e_tc_tot(:,:,:,i)
+    enddo
    close(11)
    call ezfio_set_tc_keywords_io_tc_integ('Read')
  endif
--- a/plugins/local/non_hermit_dav/biorthog.irp.f
+++ b/plugins/local/non_hermit_dav/biorthog.irp.f
--- a/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
+++ b/plugins/local/non_hermit_dav/lapack_diag_non_hermit.irp.f
@ -273,60 +273,6 @@ end

 ! ---

-subroutine lapack_diag_non_sym_right(n, A, WR, WI, VR)
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  double precision, intent(out) :: WR(n), WI(n), VR(n,n)
-
-  integer                       :: i, lda, ldvl, ldvr, LWORK, INFO
-  double precision, allocatable :: Atmp(:,:), WORK(:), VL(:,:)
-
-  lda  = n
-  ldvl = 1
-  ldvr = n
-
-  allocate( Atmp(n,n), VL(1,1) )
-  Atmp(1:n,1:n) = A(1:n,1:n)
-
-  allocate(WORK(1))
-  LWORK = -1
-  call dgeev('N', 'V', n, Atmp, lda, WR, WI, VL, ldvl, VR, ldvr, WORK, LWORK, INFO)
-  if(INFO.gt.0)then
-    print*,'dgeev failed !!',INFO
-    stop
-  endif
-
-  LWORK = max(int(WORK(1)), 1) ! this is the optimal size of WORK 
-  deallocate(WORK)
-
-  allocate(WORK(LWORK))
-
-  ! Actual diagonalization 
-  call dgeev('N', 'V', n, Atmp, lda, WR, WI, VL, ldvl, VR, ldvr, WORK, LWORK, INFO)
-  if(INFO.ne.0) then
-    print*,'dgeev failed !!', INFO
-    stop
-  endif
-
-  deallocate(Atmp, WORK, VL)
-
-! print *, ' JOBL = F'
-! print *, ' eigenvalues'
-! do i = 1, n
-!   write(*, '(1000(F16.10,X))') WR(i), WI(i)
-! enddo
-! print *, ' right eigenvect' 
-! do i = 1, n
-!   write(*, '(1000(F16.10,X))') VR(:,i)
-! enddo
-
-end
-
-! ---
-
 subroutine non_hrmt_real_diag(n, A, leigvec, reigvec, n_real_eigv, eigval)

  BEGIN_DOC
@ -1780,70 +1726,6 @@ end

 ! ---

-subroutine check_weighted_biorthog(n, m, W, Vl, Vr, thr_d, thr_nd, accu_d, accu_nd, S, stop_ifnot)
-
-  implicit none
-  
-  integer,          intent(in)  :: n, m
-  double precision, intent(in)  :: Vl(n,m), Vr(n,m), W(n,n)
-  double precision, intent(in)  :: thr_d, thr_nd
-  logical,          intent(in)  :: stop_ifnot
-  double precision, intent(out) :: accu_d, accu_nd, S(m,m)
-
-  integer                       :: i, j
-  double precision, allocatable :: SS(:,:), tmp(:,:)
-
-  print *, ' check weighted bi-orthogonality'
-
-  ! ---
-
-  allocate(tmp(m,n))
-  call dgemm( 'T', 'N', m, n, n, 1.d0        &
-            , Vl, size(Vl, 1), W, size(W, 1) &
-            , 0.d0, tmp, size(tmp, 1) )
-  call dgemm( 'N', 'N', m, m, n, 1.d0            &
-            , tmp, size(tmp, 1), Vr, size(Vr, 1) &
-            , 0.d0, S, size(S, 1) )
-  deallocate(tmp)
-
-  !print *, ' overlap matrix:'
-  !do i = 1, m
-  !  write(*,'(1000(F16.10,X))') S(i,:)
-  !enddo
-
-  accu_d  = 0.d0
-  accu_nd = 0.d0
-  do i = 1, m
-    do j = 1, m
-      if(i==j) then
-        accu_d = accu_d + dabs(S(i,i))
-      else
-        accu_nd = accu_nd + S(j,i) * S(j,i)
-      endif
-    enddo
-  enddo
-  accu_nd = dsqrt(accu_nd)
-
-  print *, ' accu_nd = ', accu_nd
-  print *, ' accu_d  = ', dabs(accu_d-dble(m))/dble(m)
-
-  ! ---
-
-  if( stop_ifnot .and. ((accu_nd .gt. thr_nd) .or. dabs(accu_d-dble(m))/dble(m) .gt. thr_d) ) then
-    print *, ' non bi-orthogonal vectors !'
-    print *, ' accu_nd = ', accu_nd
-    print *, ' accu_d  = ', dabs(accu_d-dble(m))/dble(m)
-    !print *, ' overlap matrix:'
-    !do i = 1, m
-    !  write(*,'(1000(F16.10,X))') S(i,:)
-    !enddo
-    stop
-  endif
-
-end
-
-! ---
-
 subroutine check_biorthog(n, m, Vl, Vr, accu_d, accu_nd, S, thr_d, thr_nd, stop_ifnot)

  implicit none
@ -2144,6 +2026,7 @@ subroutine impose_biorthog_degen_eigvec(n, deg_num, e0, L0, R0)
      enddo
      !print*,' accu_nd after = ', accu_nd
      if(accu_nd .gt. 1d-12) then
+        print*, ' accu_nd =', accu_nd
        print*, ' your strategy for degenerates orbitals failed !'
        print*, m, 'deg on', i
        stop
--- a/plugins/local/non_hermit_dav/new_routines.irp.f
+++ b/plugins/local/non_hermit_dav/new_routines.irp.f
@ -1,670 +0,0 @@
-subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  !
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  double precision              :: shift_current
-  double precision              :: r,thr,accu_d, accu_nd
-  integer,          allocatable :: iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-  double precision :: accu,thr_cut, thr_norm=1d0
-
-
-  thr_cut = 1.d-15
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
- ! initialization
-  shift_current = 1.d-15
-  iteration = 0 
-  print*,'***** iteration = ',iteration
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-!  print*,'Aw'
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-!   write(*,'(100(F16.10,X))')A(:,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-  allocate(im_part(n),iorder(n))
-  allocate( S(n,n) )
-
-
-  Aw = A_save
-  call cancel_small_elmts(aw,n,thr_cut)
-  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-  do i = 1, n
-   im_part(i) = -dabs(WI(i))
-   iorder(i) = i
-  enddo
-  call dsort(im_part, iorder, n)
-  n_real_eigv = 0
-  do i = 1, n
-    if(dabs(WI(i)).lt.1.d-20)then
-      n_real_eigv += 1
-    else
-!      print*,'Found an imaginary component to eigenvalue'
-!      print*,'Re(i) + Im(i)',WR(i),WI(i)
-    endif
-  enddo
-  if(n_real_eigv.ne.n)then
-   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-  else
-   print*,'All eigenvalues are real !'
-  endif
-
-
-  do while(n_real_eigv.ne.n)
-   iteration += 1
-   print*,'***** iteration = ',iteration
-   if(shift_current.gt.1.d-3)then
-    print*,'shift_current > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   Aw = A_save
-   call cancel_small_elmts(Aw,n,thr_cut)
-   call split_matrix_degen(Aw,n,shift_current)
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   n_real_eigv = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_real_eigv+= 1
-     else
-!       print*,'Found an imaginary component to eigenvalue'
-!       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   if(n_real_eigv.ne.n)then
-    do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-    enddo
-    call dsort(im_part, iorder, n)
-    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-    print*,'Splitting the degeneracies by ',shift_current
-   else
-    print*,'All eigenvalues are real !'
-   endif
-  enddo
-  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
-  do i = 1, n
-   eigval(i) = WR(i)
-   iorder(i) = i
-  enddo
-  call dsort(eigval,iorder,n)
-  do i = 1, n
-!   print*,'eigval(i) = ',eigval(i)
-   reigvec_tmp(:,i) = VR(:,iorder(i))
-   leigvec_tmp(:,i) = Vl(:,iorder(i))
-  enddo
-
-!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
-  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else
-    print *, ' '
-    print *, ' bi-orthogonality: not imposed yet'
-    print *, ' '
-    print *, ' '
-    print *, ' orthog between degen eigenvect' 
-    print *, ' '
-    double precision, allocatable :: S_nh_inv_half(:,:)
-    allocate(S_nh_inv_half(n,n))
-    logical :: complex_root
-    deallocate(S_nh_inv_half)
-    call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp)
-    call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S)
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-     if(accu_nd .lt. thresh_biorthog_nondiag) then
-       print *, ' bi-orthogonality: ok'
-     else 
-      print*,'New vectors not bi-orthonormals at ',accu_nd
-      print*,'Must be a deep problem ...'
-      stop
-     endif
-    endif
-  endif
- 
-  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
-  do i = 1, n
-   do j = 1, n
-    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
-    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-  !! RECOMPUTING THE EIGENVALUES 
-  eigval = 0.d0
-  do i = 1, n
-   iorder(i) = i
-   accu = 0.d0
-   do j = 1, n
-    accu += VL(j,i) * VR(j,i) 
-    do k = 1, n
-     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
-    enddo
-   enddo
-   eigval(i) *= 1.d0/accu
-!   print*,'eigval(i) = ',eigval(i)
-  enddo
-  !! RESORT JUST TO BE SURE
-  call dsort(eigval, iorder, n)
-  do i = 1, n
-   do j = 1, n
-    reigvec(j,i) = VR(j,iorder(i))
-    leigvec(j,i) = VL(j,iorder(i))
-   enddo
-  enddo
-  print*,'Checking for final reigvec/leigvec'
-  shift_current = max(1.d-10,shift_current)
-  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else 
-   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
-   print*,'Eigenvectors are not bi orthonormal ..'
-   print*,'accu_nd = ',accu_nd
-   stop
-  endif
-
-end 
-
-
-
-subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
-  !
-  ! of a non hermitian matrix A(n,n)
-  !
-  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  double precision              :: shift_current
-  double precision              :: r,thr,accu_d, accu_nd
-  integer,          allocatable :: iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-  double precision :: accu,thr_cut, thr_norm=1.d0
-  double precision, allocatable :: S_nh_inv_half(:,:)
-  logical :: complex_root
-
-
-  thr_cut = 1.d-15
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
- ! initialization
-  shift_current = 1.d-15
-  iteration = 0 
-  print*,'***** iteration = ',iteration
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-!  print*,'Aw'
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-!   write(*,'(100(F16.10,X))')A(:,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-  allocate(im_part(n),iorder(n))
-  allocate( S(n,n) )
-  allocate(S_nh_inv_half(n,n))
-
-
-  Aw = A_save
-  call cancel_small_elmts(aw,n,thr_cut)
-  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-  do i = 1, n
-   im_part(i) = -dabs(WI(i))
-   iorder(i) = i
-  enddo
-  call dsort(im_part, iorder, n)
-  n_real_eigv = 0
-  do i = 1, n
-    if(dabs(WI(i)).lt.1.d-20)then
-      n_real_eigv += 1
-    else
-!      print*,'Found an imaginary component to eigenvalue'
-!      print*,'Re(i) + Im(i)',WR(i),WI(i)
-    endif
-  enddo
-  if(n_real_eigv.ne.n)then
-   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-  else
-   print*,'All eigenvalues are real !'
-  endif
-
-
-  do while(n_real_eigv.ne.n)
-   iteration += 1
-   print*,'***** iteration = ',iteration
-   if(shift_current.gt.1.d-3)then
-    print*,'shift_current > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   Aw = A_save
-!   thr_cut = shift_current
-   call cancel_small_elmts(Aw,n,thr_cut)
-   call split_matrix_degen(Aw,n,shift_current)
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   n_real_eigv = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_real_eigv+= 1
-     else
-!       print*,'Found an imaginary component to eigenvalue'
-!       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   if(n_real_eigv.ne.n)then
-    do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-    enddo
-    call dsort(im_part, iorder, n)
-    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-    print*,'Splitting the degeneracies by ',shift_current
-   else
-    print*,'All eigenvalues are real !'
-   endif
-  enddo
-  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
-  do i = 1, n
-   eigval(i) = WR(i)
-   iorder(i) = i
-  enddo
-  call dsort(eigval,iorder,n)
-  do i = 1, n
-!   print*,'eigval(i) = ',eigval(i)
-   reigvec_tmp(:,i) = VR(:,iorder(i))
-   leigvec_tmp(:,i) = Vl(:,iorder(i))
-  enddo
-
-!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
-  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else
-    print *, ' '
-    print *, ' bi-orthogonality: not imposed yet'
-    if(complex_root) then 
-     print *, ' '
-     print *, ' '
-     print *, ' orthog between degen eigenvect' 
-     print *, ' '
-     ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
-     call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) ! orthogonalization of reigvec
-     call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) ! orthogonalization of leigvec
-     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S,  thresh_biorthog_diag, thresh_biorthog_nondiag, .false.) 
-
-     if(accu_nd .lt. thresh_biorthog_nondiag) then
-       print *, ' bi-orthogonality: ok'
-     else 
-      print*,'New vectors not bi-orthonormals at ', accu_nd
-      call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half, complex_root)
-      if(complex_root)then 
-       call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR
-      else
-       print*,'S^{-1/2} exists !!'
-       call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
-      endif
-     endif
-    else ! the matrix S^{-1/2} exists
-     print*,'S^{-1/2} exists !!'
-     call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
-    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     print*,'Must be a deep problem ...'
-     stop
-    endif
-  endif
- 
-  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
-  do i = 1, n
-   do j = 1, n
-    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
-    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-  !! RECOMPUTING THE EIGENVALUES 
-  eigval = 0.d0
-  do i = 1, n
-   iorder(i) = i
-   accu = 0.d0
-   do j = 1, n
-    accu += VL(j,i) * VR(j,i) 
-    do k = 1, n
-     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
-    enddo
-   enddo
-   eigval(i) *= 1.d0/accu
-!   print*,'eigval(i) = ',eigval(i)
-  enddo
-  !! RESORT JUST TO BE SURE
-  call dsort(eigval, iorder, n)
-  do i = 1, n
-   do j = 1, n
-    reigvec(j,i) = VR(j,iorder(i))
-    leigvec(j,i) = VL(j,iorder(i))
-   enddo
-  enddo
-  print*,'Checking for final reigvec/leigvec'
-  shift_current = max(1.d-10,shift_current)
-  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else 
-   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
-   print*,'Eigenvectors are not bi orthonormal ..'
-   print*,'accu_nd = ',accu_nd
-   stop
-  endif
-
-end 
-
-
-subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
-
-  BEGIN_DOC
-  !
-  ! routine returning the eigenvalues and left/right eigenvectors of the TC fock matrix 
-  !
-  END_DOC
-
-  implicit none
-
-  integer,          intent(in)  :: n
-  double precision, intent(in)  :: A(n,n)
-  integer,          intent(out) :: n_real_eigv
-  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
-  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
-
-  integer                       :: i, j, n_degen,k , iteration
-  double precision              :: shift_current
-  double precision              :: r,thr,accu_d, accu_nd
-  integer,          allocatable :: iorder_origin(:),iorder(:)
-  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
-  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
-  double precision, allocatable :: im_part(:),re_part(:)
-  double precision :: accu,thr_cut
-  double precision, allocatable :: S_nh_inv_half(:,:)
-  logical :: complex_root
-  double precision :: thr_norm=1d0
-
-
-  thr_cut = 1.d-15
-  print*,'Computing the left/right eigenvectors ...'
-  print*,'Using the degeneracy splitting algorithm'
- ! initialization
-  shift_current = 1.d-15
-  iteration = 0 
-  print*,'***** iteration = ',iteration
-
-
-  ! pre-processing the matrix :: sorting by diagonal elements
-  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
-  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
-!  print*,'Aw'
-  do i = 1, n
-   iorder_origin(i) = i
-   diag_elem(i) = A(i,i)
-!   write(*,'(100(F16.10,X))')A(:,i)
-  enddo
-  call dsort(diag_elem, iorder_origin, n)
-  do i = 1, n
-   do j = 1, n
-    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
-   enddo
-  enddo
-
-  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
-  allocate(im_part(n),iorder(n))
-  allocate( S(n,n) )
-  allocate(S_nh_inv_half(n,n))
-
-
-  Aw = A_save
-  call cancel_small_elmts(aw,n,thr_cut)
-  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-  do i = 1, n
-   im_part(i) = -dabs(WI(i))
-   iorder(i) = i
-  enddo
-  call dsort(im_part, iorder, n)
-  n_real_eigv = 0
-  do i = 1, n
-    if(dabs(WI(i)).lt.1.d-20)then
-      n_real_eigv += 1
-    else
-!      print*,'Found an imaginary component to eigenvalue'
-!      print*,'Re(i) + Im(i)',WR(i),WI(i)
-    endif
-  enddo
-  if(n_real_eigv.ne.n)then
-   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-   print*,'Splitting the degeneracies by ',shift_current
-  else
-   print*,'All eigenvalues are real !'
-  endif
-
-
-  do while(n_real_eigv.ne.n)
-   iteration += 1
-   print*,'***** iteration = ',iteration
-   if(shift_current.gt.1.d-3)then
-    print*,'shift_current > 1.d-3 !!'
-    print*,'Your matrix intrinsically contains complex eigenvalues'
-    stop
-   endif
-   Aw = A_save
-!   thr_cut = shift_current
-   call cancel_small_elmts(Aw,n,thr_cut)
-   call split_matrix_degen(Aw,n,shift_current)
-   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
-   n_real_eigv = 0
-   do i = 1, n
-     if(dabs(WI(i)).lt.1.d-20)then
-       n_real_eigv+= 1
-     else
-!       print*,'Found an imaginary component to eigenvalue'
-!       print*,'Re(i) + Im(i)',WR(i),WI(i)
-     endif
-   enddo
-   if(n_real_eigv.ne.n)then
-    do i = 1, n
-     im_part(i) = -dabs(WI(i))
-     iorder(i) = i
-    enddo
-    call dsort(im_part, iorder, n)
-    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
-    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
-    print*,'Splitting the degeneracies by ',shift_current
-   else
-    print*,'All eigenvalues are real !'
-   endif
-  enddo
-  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
-  do i = 1, n
-   eigval(i) = WR(i)
-   iorder(i) = i
-  enddo
-  call dsort(eigval,iorder,n)
-  do i = 1, n
-!   print*,'eigval(i) = ',eigval(i)
-   reigvec_tmp(:,i) = VR(:,iorder(i))
-   leigvec_tmp(:,i) = Vl(:,iorder(i))
-  enddo
-
-!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
-  !                               check bi-orthogonality
-  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else
-    print *, ' '
-    print *, ' bi-orthogonality: not imposed yet'
-    print *, ' '
-    print *, ' '
-    print *, ' Using impose_unique_biorthog_degen_eigvec' 
-    print *, ' '
-    ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
-    call impose_unique_biorthog_degen_eigvec(n, eigval, mo_coef, leigvec_tmp, reigvec_tmp)
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    print*,'accu_nd = ',accu_nd
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half,complex_root)
-     if(complex_root)then 
-      print*,'S^{-1/2} does not exits, using QR bi-orthogonalization'
-      call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR
-     else
-      print*,'S^{-1/2} exists !!'
-      call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
-     endif
-    endif
-    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-    if(accu_nd .lt. thresh_biorthog_nondiag) then
-      print *, ' bi-orthogonality: ok'
-    else 
-     print*,'New vectors not bi-orthonormals at ',accu_nd
-     print*,'Must be a deep problem ...'
-     stop
-    endif
-  endif
- 
-  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
-  do i = 1, n
-   do j = 1, n
-    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
-    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
-   enddo
-  enddo
-
-  !! RECOMPUTING THE EIGENVALUES 
-  eigval = 0.d0
-  do i = 1, n
-   iorder(i) = i
-   accu = 0.d0
-   do j = 1, n
-    accu += VL(j,i) * VR(j,i) 
-    do k = 1, n
-     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
-    enddo
-   enddo
-   eigval(i) *= 1.d0/accu
-!   print*,'eigval(i) = ',eigval(i)
-  enddo
-  !! RESORT JUST TO BE SURE
-  call dsort(eigval, iorder, n)
-  do i = 1, n
-   do j = 1, n
-    reigvec(j,i) = VR(j,iorder(i))
-    leigvec(j,i) = VL(j,iorder(i))
-   enddo
-  enddo
-  print*,'Checking for final reigvec/leigvec'
-  shift_current = max(1.d-10,shift_current)
-  print*,'Thr for eigenvectors = ',shift_current
-  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
-  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, thresh_biorthog_diag, thresh_biorthog_nondiag, .false.)
-  print *, ' accu_nd bi-orthog = ', accu_nd
-  
-  if(accu_nd .lt. thresh_biorthog_nondiag) then
-    print *, ' bi-orthogonality: ok'
-  else 
-   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
-   print*,'Eigenvectors are not bi orthonormal ..'
-   print*,'accu_nd = ',accu_nd
-   stop
-  endif
-
-end 
-
-
--- a/plugins/local/normal_order_old/NEED
+++ b/plugins/local/normal_order_old/NEED
@ -0,0 +1 @@
+tc_scf
--- a/plugins/local/normal_order_old/README.rst
+++ b/plugins/local/normal_order_old/README.rst
@ -0,0 +1,4 @@
+================
+normal_order_old
+================
+
--- a/plugins/local/normal_order_old/normal_ordered.irp.f
+++ b/plugins/local/normal_order_old/normal_ordered.irp.f
--- a/plugins/local/normal_order_old/normal_ordered_contractions.irp.f
+++ b/plugins/local/normal_order_old/normal_ordered_contractions.irp.f
--- a/plugins/local/normal_order_old/normal_ordered_old.irp.f
+++ b/plugins/local/normal_order_old/normal_ordered_old.irp.f
--- a/plugins/local/normal_order_old/normal_ordered_v0.irp.f
+++ b/plugins/local/normal_order_old/normal_ordered_v0.irp.f
--- a/plugins/local/old_delta_tc_qmc/NEED
+++ b/plugins/local/old_delta_tc_qmc/NEED
@ -0,0 +1 @@
+
--- a/plugins/local/old_delta_tc_qmc/README.rst
+++ b/plugins/local/old_delta_tc_qmc/README.rst
@ -0,0 +1,4 @@
+================
+old_delta_tc_qmc
+================
+
--- a/plugins/local/old_delta_tc_qmc/compute_deltamu_right.irp.f
+++ b/plugins/local/old_delta_tc_qmc/compute_deltamu_right.irp.f
--- a/plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f
+++ b/plugins/local/old_delta_tc_qmc/dressing_vectors_lr.irp.f
@ -27,7 +27,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)

  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
  call hmat_bi_ortho         (psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)

  delta = 0.d0
@ -39,7 +39,7 @@ subroutine get_delta_bitc_right(psidet, psicoef, ndet, Nint, delta)
    do j = 1, ndet

      ! < I |Htilde | J >
-      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
      ! < I |H | J >
      call hmat_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, h_mono, h_twoe, h_tot)

@ -78,7 +78,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)

  i = 1
  j = 1
-  call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+  call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)

  delta = 0.d0
 !$OMP PARALLEL DO DEFAULT(NONE) SCHEDULE(dynamic,8)   &
@ -88,7 +88,7 @@ subroutine get_htc_bitc_right(psidet, psicoef, ndet, Nint, delta)
    do j = 1, ndet

      ! < I |Htilde | J >
-      call htilde_mu_mat_bi_ortho_slow(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)
+      call htilde_mu_mat_opt_bi_ortho(psidet(1,1,i), psidet(1,1,j), Nint, htc_mono, htc_twoe, htc_three, htc_tot)

      delta(i) = delta(i) + psicoef(j) * htc_tot
    enddo
--- a/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f
+++ b/plugins/local/old_delta_tc_qmc/old_delta_tc_qmc.irp.f
@ -1,4 +1,4 @@
-program tc_keywords
+program old_delta_tc_qmc
  implicit none
  BEGIN_DOC
 ! TODO : Put the documentation of the program here
--- a/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f
+++ b/plugins/local/ortho_three_e_ints/mu_j_ints_usual_mos.irp.f
@ -183,11 +183,3 @@ BEGIN_PROVIDER [ double precision, x_W_ij_erf_rk, ( n_points_final_grid,3,mo_num

 END_PROVIDER 

-BEGIN_PROVIDER [ double precision, sqrt_weight_at_r, (n_points_final_grid)]
- implicit none
- integer :: ipoint
- do ipoint = 1, n_points_final_grid
-  sqrt_weight_at_r(ipoint) = dsqrt(final_weight_at_r_vector(ipoint))
- enddo
-END_PROVIDER 
-
--- a/plugins/local/slater_tc/NEED
+++ b/plugins/local/slater_tc/NEED
@ -0,0 +1,8 @@
+determinants
+normal_order_old
+bi_ort_ints
+bi_ortho_mos
+tc_keywords
+non_hermit_dav
+dav_general_mat
+tc_scf
--- a/plugins/local/tc_bi_ortho/h_mat_triple.irp.f
+++ b/plugins/local/tc_bi_ortho/h_mat_triple.irp.f
@ -1,196 +1,3 @@
-subroutine get_excitation_general(key_i,key_j, Nint,degree_array,holes_array, particles_array,phase)
- use bitmasks
- BEGIN_DOC
-! returns the array, for each spin, of holes/particles between key_i and key_j 
-!
-! with the following convention: a^+_{particle} a_{hole}|key_i> = |key_j>
- END_DOC
-  include 'utils/constants.include.F'
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: holes_array(100,2),particles_array(100,2),degree_array(2)
- double precision, intent(out)  :: phase
- integer :: ispin,k,i,pos
- integer(bit_kind) :: key_hole, key_particle
- integer(bit_kind) :: xorvec(N_int_max,2)
- holes_array = -1
- particles_array = -1
- degree_array = 0
-  do i = 1, N_int
-   xorvec(i,1) = xor( key_i(i,1), key_j(i,1))
-   xorvec(i,2) = xor( key_i(i,2), key_j(i,2))
-   degree_array(1) += popcnt(xorvec(i,1))
-   degree_array(2) += popcnt(xorvec(i,2))
-  enddo
-  degree_array(1) = shiftr(degree_array(1),1)
-  degree_array(2) = shiftr(degree_array(2),1)
-  
- do ispin = 1, 2
-  k = 1
-  !!! GETTING THE HOLES 
-  do i = 1, N_int
-   key_hole = iand(xorvec(i,ispin),key_i(i,ispin))
-   do while(key_hole .ne.0_bit_kind)
-    pos = trailz(key_hole)
-    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_hole = ibclr(key_hole,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_excitation_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
- do ispin = 1, 2
-  k = 1
-  !!! GETTING THE PARTICLES
-  do i = 1, N_int
-   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
-   do while(key_particle .ne.0_bit_kind)
-    pos = trailz(key_particle)
-    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_particle = ibclr(key_particle,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_excitation_general '
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo
-  enddo 
- enddo
- integer :: h,p, i_ok
- integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
- integer                        :: exc(0:2,2,2)
- double precision :: phase_tmp
- allocate(det_i(Nint,2),det_ip(N_int,2))
- det_i = key_i
- phase = 1.d0
- do ispin = 1, 2
-  do i = 1, degree_array(ispin)
-   h = holes_array(i,ispin)
-   p = particles_array(i,ispin)
-   det_ip = det_i
-   call do_single_excitation(det_ip,h,p,ispin,i_ok)
-   if(i_ok == -1)then
-     print*,'excitation was not possible '
-     stop
-   endif
-   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
-   phase *= phase_tmp
-   det_i = det_ip
-  enddo
- enddo
-
-end
-
-subroutine get_holes_general(key_i, key_j,Nint, holes_array)
- use bitmasks
- BEGIN_DOC
-! returns the array, per spin, of holes between key_i and key_j 
-!
-! with the following convention: a_{hole}|key_i> --> |key_j>
- END_DOC
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: holes_array(100,2)
- integer(bit_kind) :: key_hole
- integer :: ispin,k,i,pos
- holes_array = -1
- do ispin = 1, 2
-  k = 1
-  do i = 1, N_int
-   key_hole = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_i(i,ispin))
-   do while(key_hole .ne.0_bit_kind)
-    pos = trailz(key_hole)
-    holes_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_hole = ibclr(key_hole,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_holes_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
-end
-
-subroutine get_particles_general(key_i, key_j,Nint,particles_array)
- use bitmasks
- BEGIN_DOC
-! returns the array, per spin, of particles between key_i and key_j 
-!
-! with the following convention: a^dagger_{particle}|key_i> --> |key_j>
- END_DOC
- implicit none
- integer, intent(in)            :: Nint
- integer(bit_kind), intent(in)  :: key_j(Nint,2),key_i(Nint,2)
- integer, intent(out)           :: particles_array(100,2)
- integer(bit_kind) :: key_particle
- integer :: ispin,k,i,pos
- particles_array = -1
- do ispin = 1, 2
-  k = 1
-  do i = 1, N_int
-   key_particle = iand(xor(key_i(i,ispin),key_j(i,ispin)),key_j(i,ispin))
-   do while(key_particle .ne.0_bit_kind)
-    pos = trailz(key_particle)
-    particles_array(k,ispin) = 1+ bit_kind_size * (i-1) + pos
-    key_particle = ibclr(key_particle,pos)
-    k += 1
-    if(k .gt.100)then
-     print*,'WARNING in get_holes_general'
-     print*,'More than a 100-th excitation for spin ',ispin
-     print*,'Those are the two determinants'
-     call debug_det(key_i, N_int)
-     call debug_det(key_j, N_int)
-     print*,'stoping ...'
-     stop
-    endif
-   enddo 
-  enddo
- enddo
-end
-
-subroutine get_phase_general(key_i,Nint,degree, holes_array, particles_array,phase)
- implicit none
- integer, intent(in)            :: degree(2), Nint
- integer(bit_kind), intent(in)  :: key_i(Nint,2)
- integer, intent(in)            :: holes_array(100,2),particles_array(100,2)
- double precision, intent(out)  :: phase
- integer :: i,ispin,h,p, i_ok
- integer(bit_kind), allocatable :: det_i(:,:),det_ip(:,:)
- integer                        :: exc(0:2,2,2)
- double precision :: phase_tmp
- allocate(det_i(Nint,2),det_ip(N_int,2))
- det_i = key_i
- phase = 1.d0
- do ispin = 1, 2
-  do i = 1, degree(ispin)
-   h = holes_array(i,ispin)
-   p = particles_array(i,ispin)
-   det_ip = det_i
-   call do_single_excitation(det_ip,h,p,ispin,i_ok)
-   if(i_ok == -1)then
-     print*,'excitation was not possible '
-     stop
-   endif
-   call get_single_excitation(det_i,det_ip,exc,phase_tmp,Nint)
-   phase *= phase_tmp
-   det_i = det_ip
-  enddo
- enddo
-
-end
-
 subroutine H_tc_s2_u_0_with_pure_three(v_0, s_0, u_0, N_st, sze)
  BEGIN_DOC
  ! Computes $v_0 = H^TC | u_0\rangle$ WITH PURE TRIPLE EXCITATION TERMS 
--- a/plugins/local/tc_bi_ortho/h_tc_s2_u0.irp.f
+++ b/plugins/local/tc_bi_ortho/h_tc_s2_u0.irp.f
--- a/plugins/local/tc_bi_ortho/slater_tc_opt.irp.f
+++ b/plugins/local/tc_bi_ortho/slater_tc_opt.irp.f
@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho()
  implicit none
  double precision :: t1, t2

-  PROVIDE ao_two_e_integrals_in_map
-
  print *, ' start provide_all_three_ints_bi_ortho'
  call wall_time(t1)

@ -131,9 +129,9 @@ subroutine htilde_mu_mat_opt_bi_ortho(key_j, key_i, Nint, hmono, htwoe, hthree,

  endif

-  if(degree==0) then
-    htot += nuclear_repulsion
-  endif
+! if(degree==0) then
+!   htot += nuclear_repulsion
+! endif

 end

@ -181,3 +179,48 @@ end

 ! ---

+subroutine htilde_mu_mat_opt_bi_ortho_no_3e_both(key_j, key_i, Nint, hji,hij)
+
+  BEGIN_DOC
+  !
+  ! <key_j |H_tilde | key_i> where |key_j> is developed on the LEFT basis and |key_i> is developed on the RIGHT basis
+  !!
+  ! Returns the detail of the matrix element WITHOUT ANY CONTRIBUTION FROM THE THREE ELECTRON TERMS
+  !! WARNING !!
+  !
+  ! Non hermitian !!
+  !
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
+  double precision, intent(out) :: hji,hij
+  integer                       :: degree
+
+  hji = 0.d0
+  hij = 0.d0
+
+  call get_excitation_degree(key_i, key_j, degree, Nint)
+  if(degree.gt.2) return
+
+  if(degree == 0) then
+    call diag_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_i,hji)
+    hij = hji
+  else if (degree == 1) then
+    call single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint,key_j, key_i , hji,hij)
+  else if(degree == 2) then
+    call double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij)
+  endif
+
+  if(degree==0) then
+    hji += nuclear_repulsion
+    hij += nuclear_repulsion
+  endif
+
+end
+
+! ---
+
--- a/plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f
+++ b/plugins/local/tc_bi_ortho/slater_tc_opt_diag.irp.f
@ -19,13 +19,13 @@
  PROVIDE HF_bitmask
  PROVIDE mo_l_coef mo_r_coef

-  call diag_htilde_mu_mat_bi_ortho_slow(N_int, HF_bitmask, hmono, htwoe, htot)
+  call diag_htc_bi_orth_2e_brute(N_int, HF_bitmask, hmono, htwoe, htot)

  ref_tc_energy_1e = hmono
  ref_tc_energy_2e = htwoe 

  if(three_body_h_tc) then
-    call diag_htilde_three_body_ints_bi_ort_slow(N_int, HF_bitmask, hthree)
+    call diag_htc_bi_orth_3e_brute(N_int, HF_bitmask, hthree)
    ref_tc_energy_3e = hthree
  else
    ref_tc_energy_3e = 0.d0
@ -524,3 +524,310 @@ end

 ! ---

+subroutine diag_htc_bi_orth_2e_brute(Nint, key_i, hmono, htwoe, htot)
+
+  BEGIN_DOC
+  !
+  ! diagonal element of htilde ONLY FOR ONE- AND TWO-BODY TERMS 
+  !
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in)  :: Nint
+  integer(bit_kind), intent(in)  :: key_i(Nint,2)
+  double precision, intent(out)  :: hmono,htwoe,htot
+  integer                        :: occ(Nint*bit_kind_size,2)
+  integer                        :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
+  double precision               :: get_mo_two_e_integral_tc_int
+  integer(bit_kind)              :: key_i_core(Nint,2)
+
+  PROVIDE mo_bi_ortho_tc_two_e
+
+  hmono = 0.d0
+  htwoe = 0.d0
+  htot  = 0.d0
+
+  call bitstring_to_list_ab(key_i, occ, Ne, Nint)
+
+  do ispin = 1, 2
+    do i = 1, Ne(ispin)
+      ii = occ(i,ispin)
+      hmono += mo_bi_ortho_tc_one_e(ii,ii)
+    enddo
+  enddo
+
+  ! alpha/beta two-body
+  ispin = 1
+  jspin = 2
+  do i = 1, Ne(ispin) ! electron 1 (so it can be associated to mu(r1))
+    ii = occ(i,ispin) 
+    do j = 1, Ne(jspin) ! electron 2 
+      jj = occ(j,jspin) 
+      htwoe += mo_bi_ortho_tc_two_e(jj,ii,jj,ii) 
+    enddo
+  enddo
+ 
+  ! alpha/alpha two-body
+  do i = 1, Ne(ispin)
+    ii = occ(i,ispin) 
+    do j = i+1, Ne(ispin)
+      jj = occ(j,ispin) 
+      htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii)
+    enddo
+  enddo
+ 
+  ! beta/beta two-body
+  do i = 1, Ne(jspin)
+    ii = occ(i,jspin) 
+    do j = i+1, Ne(jspin)
+      jj = occ(j,jspin) 
+      htwoe += mo_bi_ortho_tc_two_e(ii,jj,ii,jj) - mo_bi_ortho_tc_two_e(ii,jj,jj,ii)
+    enddo
+  enddo
+
+  htot = hmono + htwoe 
+
+end
+
+! ---                                                                                           
+
+subroutine diag_htc_bi_orth_3e_brute(Nint, key_i, hthree)
+
+  BEGIN_DOC
+  !  diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_i(Nint,2)
+  double precision, intent(out) :: hthree
+  integer                       :: occ(Nint*bit_kind_size,2)
+  integer                       :: Ne(2),i,j,ii,jj,ispin,jspin,m,mm
+  integer(bit_kind)             :: key_i_core(Nint,2)
+  double precision              :: direct_int, exchange_int, ref
+  double precision, external    :: sym_3_e_int_from_6_idx_tensor
+  double precision, external    :: three_e_diag_parrallel_spin
+
+  PROVIDE mo_l_coef mo_r_coef
+
+  if(core_tc_op) then
+    do i = 1, Nint
+      key_i_core(i,1) = xor(key_i(i,1), core_bitmask(i,1))
+      key_i_core(i,2) = xor(key_i(i,2), core_bitmask(i,2))
+    enddo
+    call bitstring_to_list_ab(key_i_core, occ, Ne, Nint)
+  else
+    call bitstring_to_list_ab(key_i, occ, Ne, Nint)
+  endif
+
+  hthree = 0.d0
+
+  if((Ne(1)+Ne(2)) .ge. 3) then
+
+    ! alpha/alpha/beta three-body
+    do i = 1, Ne(1)
+      ii = occ(i,1)
+      do j = i+1, Ne(1)
+        jj = occ(j,1)
+        do m = 1, Ne(2)
+          mm = occ(m,2)
+          !direct_int   = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor
+          !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor
+          direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,ii)      !uses 3-idx tensor
+          exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)      !uses 3-idx tensor
+          hthree      += direct_int - exchange_int
+        enddo
+      enddo
+    enddo
+
+    ! beta/beta/alpha three-body
+    do i = 1, Ne(2)
+      ii = occ(i,2)
+      do j = i+1, Ne(2)
+        jj = occ(j,2)
+        do m = 1, Ne(1)
+          mm = occ(m,1)
+          !direct_int   = three_body_ints_bi_ort(mm,jj,ii,mm,jj,ii) !uses the 6-idx tensor
+          !exchange_int = three_body_ints_bi_ort(mm,jj,ii,mm,ii,jj) !uses the 6-idx tensor
+          direct_int   = three_e_3_idx_direct_bi_ort(mm,jj,ii)
+          exchange_int = three_e_3_idx_exch12_bi_ort(mm,jj,ii)
+          hthree      += direct_int - exchange_int
+        enddo
+      enddo
+    enddo
+
+    ! alpha/alpha/alpha three-body
+    do i = 1, Ne(1)
+      ii = occ(i,1) ! 1
+      do j = i+1, Ne(1)
+        jj = occ(j,1) ! 2
+        do m = j+1, Ne(1)
+          mm = occ(m,1) ! 3
+          !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor
+          hthree += three_e_diag_parrallel_spin(mm,jj,ii)             !uses only 3-idx tensors
+        enddo
+      enddo
+    enddo
+
+    ! beta/beta/beta three-body
+    do i = 1, Ne(2)
+      ii = occ(i,2) ! 1
+      do j = i+1, Ne(2)
+        jj = occ(j,2) ! 2
+        do m = j+1, Ne(2)
+          mm = occ(m,2) ! 3
+          !hthree += sym_3_e_int_from_6_idx_tensor(mm,jj,ii,mm,jj,ii) !uses the 6 idx tensor
+          hthree += three_e_diag_parrallel_spin(mm,jj,ii)             !uses only 3-idx tensors
+        enddo
+      enddo
+    enddo
+
+  endif
+
+end
+
+
+
+BEGIN_PROVIDER [ double precision, three_e_diag_parrallel_spin_prov, (mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator ON A BI ORTHONORMAL BASIS 
+  !
+  ! three_e_diag_parrallel_spin_prov(m,j,i) = All combinations of the form <mji|-L|mji> for same spin matrix elements  
+  ! 
+  ! notice the -1 sign: in this way three_e_diag_parrallel_spin_prov can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+  implicit none
+  integer          :: i, j, m
+  double precision :: integral, wall1, wall0, three_e_diag_parrallel_spin
+
+  three_e_diag_parrallel_spin_prov = 0.d0
+  print *, ' Providing the three_e_diag_parrallel_spin_prov ...'
+
+ integral = three_e_diag_parrallel_spin(1,1,1) ! to provide all stuffs
+  call wall_time(wall0)
+ !$OMP PARALLEL                 &
+ !$OMP DEFAULT (NONE)           &
+ !$OMP PRIVATE (i,j,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_diag_parrallel_spin_prov)
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do m = j, mo_num
+        three_e_diag_parrallel_spin_prov(m,j,i) =  three_e_diag_parrallel_spin(m,j,i)
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  do i = 1, mo_num
+    do j = 1, mo_num
+      do m = 1, j
+        three_e_diag_parrallel_spin_prov(m,j,i) = three_e_diag_parrallel_spin_prov(j,m,i)
+      enddo
+    enddo
+  enddo
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_diag_parrallel_spin_prov', wall1 - wall0
+
+END_PROVIDER 
+
+BEGIN_PROVIDER [ double precision, three_e_single_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF SINGLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_single_parrallel_spin_prov(m,j,k,i) = All combination of <mjk|-L|mji> for same spin matrix elements 
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  !
+  END_DOC
+
+ implicit none
+ integer          :: i, j, k, m
+ double precision :: integral, wall1, wall0, three_e_single_parrallel_spin
+
+  three_e_single_parrallel_spin_prov = 0.d0
+  print *, ' Providing the three_e_single_parrallel_spin_prov ...'
+
+  integral = three_e_single_parrallel_spin(1,1,1,1)
+  call wall_time(wall0)
+ !$OMP PARALLEL                   &
+ !$OMP DEFAULT (NONE)             &
+ !$OMP PRIVATE (i,j,k,m,integral) & 
+ !$OMP SHARED (mo_num,three_e_single_parrallel_spin_prov)
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do m = 1, mo_num
+          three_e_single_parrallel_spin_prov(m,j,k,i) = three_e_single_parrallel_spin(m,j,k,i)
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_single_parrallel_spin_prov', wall1 - wall0
+
+END_PROVIDER 
+
+
+! ---
+
+BEGIN_PROVIDER [ double precision, three_e_double_parrallel_spin_prov, (mo_num, mo_num, mo_num, mo_num, mo_num)]
+
+  BEGIN_DOC
+  !
+  ! matrix element of the -L  three-body operator FOR THE DIRECT TERMS OF DOUBLE EXCITATIONS AND BI ORTHO MOs
+  !
+  ! three_e_double_parrallel_spin_prov(m,l,j,k,i) = <mlk|-L|mji> ::: notice that i is the RIGHT MO and k is the LEFT MO
+  !
+  ! notice the -1 sign: in this way three_e_3_idx_direct_bi_ort can be directly used to compute Slater rules with a + sign
+  END_DOC
+
+  implicit none
+  integer          :: i, j, k, m, l
+  double precision :: integral, wall1, wall0, three_e_double_parrallel_spin
+
+  three_e_double_parrallel_spin_prov = 0.d0
+  print *, ' Providing the three_e_double_parrallel_spin_prov ...'
+  call wall_time(wall0)
+
+ integral = three_e_double_parrallel_spin(1,1,1,1,1)
+ !$OMP PARALLEL                     &
+ !$OMP DEFAULT (NONE)               &
+ !$OMP PRIVATE (i,j,k,m,l,integral) & 
+ !$OMP SHARED (mo_num,three_e_double_parrallel_spin_prov)
+ !$OMP DO SCHEDULE (dynamic)
+  do i = 1, mo_num
+    do k = 1, mo_num
+      do j = 1, mo_num
+        do l = 1, mo_num
+          do m = 1, mo_num
+            three_e_double_parrallel_spin_prov(m,l,j,k,i) = three_e_double_parrallel_spin(m,l,j,k,i)
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print *, ' wall time for three_e_double_parrallel_spin_prov', wall1 - wall0
+
+END_PROVIDER 
+
--- a/plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f
+++ b/plugins/local/tc_bi_ortho/slater_tc_opt_double.irp.f
@ -505,3 +505,63 @@ subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e(Nint, key_j, key_i, htot)

 end

+subroutine double_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij)
+
+  BEGIN_DOC
+  ! <key_j |H_tilde | key_i> and <key_i |H_tilde | key_j> for double excitation  ONLY FOR ONE- AND TWO-BODY TERMS
+  !!
+  !! WARNING !!
+  !
+  ! Non hermitian !!
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2)
+  double precision, intent(out) :: hji,hij
+  double precision :: hmono, htwoe_ji, htwoe_ij
+  integer                       :: occ(Nint*bit_kind_size,2)
+  integer                       :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
+  integer                       :: degree,exc(0:2,2,2)
+  integer                       :: h1, p1, h2, p2, s1, s2
+  double precision              :: get_mo_two_e_integral_tc_int,phase
+
+
+  call get_excitation_degree(key_i, key_j, degree, Nint)
+
+  hmono  = 0.d0
+  htwoe_ji  = 0.d0
+  htwoe_ij  = 0.d0
+  hji   = 0.d0
+  hij   = 0.d0
+
+  if(degree.ne.2)then
+   return
+  endif
+  integer :: degree_i,degree_j
+  call get_excitation_degree(ref_bitmask,key_i,degree_i,N_int)
+  call get_excitation_degree(ref_bitmask,key_j,degree_j,N_int)
+  call get_double_excitation(key_i, key_j, exc, phase, Nint)
+  call decode_exc(exc, 2, h1, p1, h2, p2, s1, s2)
+
+  if(s1.ne.s2)then
+   ! opposite spin two-body
+    htwoe_ji  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
+    htwoe_ij  = mo_bi_ortho_tc_two_e_transp(p2,p1,h2,h1)
+  else
+   ! same spin two-body
+   ! direct terms
+   htwoe_ji  = mo_bi_ortho_tc_two_e(p2,p1,h2,h1)
+   htwoe_ij  = mo_bi_ortho_tc_two_e_transp(p2,p1,h2,h1)
+   ! exchange terms
+   htwoe_ji -= mo_bi_ortho_tc_two_e(p1,p2,h2,h1)
+   htwoe_ij -= mo_bi_ortho_tc_two_e_transp(p1,p2,h2,h1)
+  endif
+  htwoe_ji  *= phase
+  hji    =  htwoe_ji
+  htwoe_ij  *= phase
+  hij    =  htwoe_ij
+
+end
--- a/plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f
+++ b/plugins/local/tc_bi_ortho/slater_tc_opt_single.irp.f
@ -618,3 +618,145 @@ subroutine get_single_excitation_from_fock_tc_no_3e(Nint, key_i, key_j, h, p, sp

 end

+
+subroutine  single_htilde_mu_mat_fock_bi_ortho_no_3e_both(Nint, key_j, key_i, hji,hij)
+
+  BEGIN_DOC
+  ! <key_j |H_tilde | key_i> and <key_i |H_tilde | key_j> for single excitation ONLY FOR ONE- AND TWO-BODY TERMS 
+  !!
+  !! WARNING !!
+  ! 
+  ! Non hermitian !!
+  END_DOC
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer(bit_kind), intent(in) :: key_j(Nint,2), key_i(Nint,2)
+  double precision, intent(out) :: hji,hij
+
+  double precision              :: hmono, htwoe
+  integer                       :: occ(Nint*bit_kind_size,2)
+  integer                       :: Ne(2), i, j, ii, jj, ispin, jspin, k, kk
+  integer                       :: degree,exc(0:2,2,2)
+  integer                       :: h1, p1, h2, p2, s1, s2
+  double precision              :: get_mo_two_e_integral_tc_int, phase
+  double precision              :: direct_int, exchange_int_12, exchange_int_23, exchange_int_13
+  integer                       :: other_spin(2)
+  integer(bit_kind)             :: key_j_core(Nint,2), key_i_core(Nint,2)
+
+  other_spin(1) = 2
+  other_spin(2) = 1
+
+  hmono  = 0.d0
+  htwoe  = 0.d0
+  hji   = 0.d0
+  hij   = 0.d0
+  call get_excitation_degree(key_i, key_j, degree, Nint)
+  if(degree.ne.1)then
+   return
+  endif
+  call bitstring_to_list_ab(key_i, occ, Ne, Nint)
+
+  call get_single_excitation(key_i, key_j, exc, phase, Nint)
+  call decode_exc(exc,1,h1,p1,h2,p2,s1,s2)
+  call get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h1, p1, s1, phase, hji,hij)
+
+end
+
+! ---
+
+subroutine get_single_excitation_from_fock_tc_no_3e_both(Nint, key_i, key_j, h, p, spin, phase, hji,hij)
+
+  use bitmasks
+
+  implicit none
+  integer,           intent(in) :: Nint
+  integer,           intent(in) :: h, p, spin
+  double precision,  intent(in) :: phase
+  integer(bit_kind), intent(in) :: key_i(Nint,2), key_j(Nint,2)
+  double precision, intent(out) :: hji,hij
+  double precision :: hmono_ji,htwoe_ji
+  double precision :: hmono_ij,htwoe_ij
+
+  integer(bit_kind)             :: differences(Nint,2)
+  integer(bit_kind)             :: hole(Nint,2)
+  integer(bit_kind)             :: partcl(Nint,2)
+  integer                       :: occ_hole(Nint*bit_kind_size,2)
+  integer                       :: occ_partcl(Nint*bit_kind_size,2)
+  integer                       :: n_occ_ab_hole(2),n_occ_ab_partcl(2)
+  integer                       :: i0,i
+  double precision              :: buffer_c_ji(mo_num), buffer_x_ji(mo_num)
+  double precision              :: buffer_c_ij(mo_num), buffer_x_ij(mo_num)
+
+  do i = 1, mo_num
+    buffer_c_ji(i) = tc_2e_3idx_coulomb_integrals(i,p,h)
+    buffer_x_ji(i) = tc_2e_3idx_exchange_integrals(i,p,h)
+    buffer_c_ij(i) = tc_2e_3idx_coulomb_integrals_transp(i,p,h)
+    buffer_x_ij(i) = tc_2e_3idx_exchange_integrals_transp(i,p,h)
+  enddo
+
+  do i = 1, Nint
+   differences(i,1) = xor(key_i(i,1),ref_closed_shell_bitmask(i,1))
+   differences(i,2) = xor(key_i(i,2),ref_closed_shell_bitmask(i,2))
+   hole(i,1) = iand(differences(i,1),ref_closed_shell_bitmask(i,1))
+   hole(i,2) = iand(differences(i,2),ref_closed_shell_bitmask(i,2))
+   partcl(i,1) = iand(differences(i,1),key_i(i,1))
+   partcl(i,2) = iand(differences(i,2),key_i(i,2))
+  enddo
+
+  call bitstring_to_list_ab(hole, occ_hole, n_occ_ab_hole, Nint)
+  call bitstring_to_list_ab(partcl, occ_partcl, n_occ_ab_partcl, Nint)
+  hmono_ji = mo_bi_ortho_tc_one_e(p,h)
+  htwoe_ji = fock_op_2_e_tc_closed_shell(p,h)
+  hmono_ij = mo_bi_ortho_tc_one_e(h,p)
+  htwoe_ij = fock_op_2_e_tc_closed_shell(h,p)
+
+  ! holes :: direct terms
+  do i0 = 1, n_occ_ab_hole(1)
+    i = occ_hole(i0,1)
+    htwoe_ji -= buffer_c_ji(i)
+    htwoe_ij -= buffer_c_ij(i)
+  enddo
+  do i0 = 1, n_occ_ab_hole(2)
+    i = occ_hole(i0,2)
+    htwoe_ji -= buffer_c_ji(i)
+    htwoe_ij -= buffer_c_ij(i)
+  enddo
+ 
+  ! holes :: exchange terms
+  do i0 = 1, n_occ_ab_hole(spin)
+    i = occ_hole(i0,spin)
+    htwoe_ji += buffer_x_ji(i)
+    htwoe_ij += buffer_x_ij(i)
+  enddo
+ 
+  ! particles :: direct terms
+  do i0 = 1, n_occ_ab_partcl(1)
+    i = occ_partcl(i0,1)
+    htwoe_ji += buffer_c_ji(i)
+    htwoe_ij += buffer_c_ij(i)
+  enddo
+  do i0 = 1, n_occ_ab_partcl(2)
+    i = occ_partcl(i0,2)
+    htwoe_ji += buffer_c_ji(i)
+    htwoe_ij += buffer_c_ij(i)
+  enddo
+ 
+  ! particles :: exchange terms
+  do i0 = 1, n_occ_ab_partcl(spin)
+    i = occ_partcl(i0,spin)
+    htwoe_ji -= buffer_x_ji(i)
+    htwoe_ij -= buffer_x_ij(i)
+  enddo
+  htwoe_ji = htwoe_ji * phase
+  hmono_ji = hmono_ji * phase
+  hji  = htwoe_ji + hmono_ji 
+
+  htwoe_ij = htwoe_ij * phase
+  hmono_ij = hmono_ij * phase
+  hij  = htwoe_ij + hmono_ij 
+
+end
+
--- a/plugins/local/tc_bi_ortho/tc_hmat.irp.f
+++ b/plugins/local/tc_bi_ortho/tc_hmat.irp.f
@ -22,6 +22,7 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]

  if(noL_standard) then
    PROVIDE noL_0e
+    print*, "noL_0e =", noL_0e
    PROVIDE noL_1e
    PROVIDE noL_2e
  endif
@ -29,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]
  print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...'
  call wall_time(t1)

-  call provide_all_three_ints_bi_ortho()
+  if(three_body_h_tc)then
+   call provide_all_three_ints_bi_ortho()
+  endif

  i = 1
  j = 1
--- a/plugins/local/slater_tc_no_opt/.gitignore
+++ b/plugins/local/slater_tc_no_opt/.gitignore
@ -0,0 +1,59 @@
+IRPF90_temp/
+IRPF90_man/
+build.ninja
+irpf90.make
+ezfio_interface.irp.f
+irpf90_entities
+tags
+Makefile
+ao_basis
+ao_one_e_ints
+ao_two_e_erf_ints
+ao_two_e_ints
+aux_quantities
+becke_numerical_grid
+bitmask
+cis
+cisd
+cipsi
+davidson
+davidson_dressed
+davidson_undressed
+density_for_dft
+determinants
+dft_keywords
+dft_utils_in_r
+dft_utils_one_e
+dft_utils_two_body
+dressing
+dummy
+electrons
+ezfio_files
+fci
+generators_cas
+generators_full
+hartree_fock
+iterations
+kohn_sham
+kohn_sham_rs
+mo_basis
+mo_guess
+mo_one_e_ints
+mo_two_e_erf_ints
+mo_two_e_ints
+mpi
+mrpt_utils
+nuclei
+perturbation
+pseudo
+psiref_cas
+psiref_utils
+scf_utils
+selectors_cassd
+selectors_full
+selectors_utils
+single_ref_method
+slave
+tools
+utils
+zmq
--- a/plugins/local/slater_tc_no_opt/NEED
+++ b/plugins/local/slater_tc_no_opt/NEED
@ -0,0 +1,8 @@
+determinants
+normal_order_old
+bi_ort_ints
+bi_ortho_mos
+tc_keywords
+non_hermit_dav
+dav_general_mat
+tc_scf
--- a/plugins/local/slater_tc_no_opt/README.rst
+++ b/plugins/local/slater_tc_no_opt/README.rst
@ -0,0 +1,4 @@
+================
+slater_tc_no_opt
+================
+
--- a/plugins/local/slater_tc_no_opt/h_biortho.irp.f
+++ b/plugins/local/slater_tc_no_opt/h_biortho.irp.f
--- a/plugins/local/slater_tc_no_opt/h_tc_bi_ortho_psi.irp.f
+++ b/plugins/local/slater_tc_no_opt/h_tc_bi_ortho_psi.irp.f
--- a/plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f
+++ b/plugins/local/slater_tc_no_opt/slater_tc_3e_slow.irp.f
@ -1,7 +1,7 @@

 ! ---

-subroutine diag_htilde_three_body_ints_bi_ort_slow(Nint, key_i, hthree)
+subroutine diag_htc_bi_orth_3e_brute(Nint, key_i, hthree)

  BEGIN_DOC
  !  diagonal element of htilde ONLY FOR THREE-BODY TERMS WITH BI ORTHONORMAL ORBITALS
--- a/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f
+++ b/plugins/local/slater_tc_no_opt/slater_tc_no_opt.irp.f
@ -0,0 +1,7 @@
+program slater_tc_no_opt
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+end
--- a/Show More
+++ b/Show More
				`@ -0,0 +1,2 @@`
				`-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++`