Merge pull request #2 from QuantumPackage/dev-tc

Dev tc
2025-01-03 00:55:38 +01:00 · 2022-10-24 11:29:25 +02:00 · 2022-10-24 11:29:25 +02:00 · 7e5a4c21d7
commit 7e5a4c21d7
parent fccf2fe438 7f653b9eee
66 changed files with 14194 additions and 1049 deletions
--- a/src/ao_many_one_e_ints/ao_erf_gauss.irp.f
+++ b/src/ao_many_one_e_ints/ao_erf_gauss.irp.f
@ -105,7 +105,7 @@ end function phi_j_erf_mu_r_phi
 subroutine erfc_mu_gauss_xyz_ij_ao(i, j, mu, C_center, delta, gauss_ints)
 implicit none
 BEGIN_DOC
-  ! gauss_ints(m) =   \int dr exp(-delta (r - C)^2 ) x/y/z * ( 1 - erf(mu |r-r'|))/ |r-r'| * AO_i(r') * AO_j(r')
+  ! gauss_ints(m) =   \int dr exp(-delta (r - C)^2 ) x/y/z * ( 1 - erf(mu |r-C|))/ |r-C| * AO_i(r) * AO_j(r)
  !
  ! with m = 1 ==> x, m = 2, m = 3 ==> z
  !
--- a/src/ao_many_one_e_ints/ao_gaus_gauss.irp.f
+++ b/src/ao_many_one_e_ints/ao_gaus_gauss.irp.f
@ -170,7 +170,7 @@ double precision function overlap_gauss_r12_ao_with1s(B_center, beta, D_center,

  double precision, external   :: overlap_gauss_r12, overlap_gauss_r12_ao

-  ASSERT(beta .gt. 0.d0)
+  ASSERT(beta .ge. 0.d0)
  if(beta .lt. 1d-10) then
    overlap_gauss_r12_ao_with1s = overlap_gauss_r12_ao(D_center, delta, i, j)
    return
--- a/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad2_jmu_modif.irp.f
@ -1,363 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_grad1u2_grad2u2_j1b2, (ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! -\frac{1}{4} x int dr2 phi_i(r2) phi_j(r2) 1s_j1b(r2)^2 [1 - erf(mu r12)]^2
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: i, j, ipoint, i_1s, i_fit
-  double precision              :: r(3), int_fit, expo_fit, coef_fit
-  double precision              :: coef, beta, B_center(3)
-  double precision              :: tmp
-  double precision              :: wall0, wall1
-
-  double precision, external    :: overlap_gauss_r12_ao_with1s
-
-  provide mu_erf final_grid_points j1b_pen
-  call wall_time(wall0)
-
-  int2_grad1u2_grad2u2_j1b2 = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                      &
- !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, tmp)                   & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
- !$OMP          final_grid_points, n_max_fit_slat,                  &
- !$OMP          expo_gauss_1_erf_x_2, coef_gauss_1_erf_x_2,         &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
- !$OMP          List_all_comb_b3_cent, int2_grad1u2_grad2u2_j1b2)
- !$OMP DO
-  !do ipoint = 1, 10
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
-
-    do i = 1, ao_num
-      do j = i, ao_num
-
-        tmp = 0.d0
-        do i_1s = 1, List_all_comb_b3_size
-
-          coef        = List_all_comb_b3_coef  (i_1s)
-          beta        = List_all_comb_b3_expo  (i_1s)
-          B_center(1) = List_all_comb_b3_cent(1,i_1s)
-          B_center(2) = List_all_comb_b3_cent(2,i_1s)
-          B_center(3) = List_all_comb_b3_cent(3,i_1s)
-
-          do i_fit = 1, n_max_fit_slat
-
-            expo_fit = expo_gauss_1_erf_x_2(i_fit)
-            coef_fit = coef_gauss_1_erf_x_2(i_fit)
-            int_fit  = overlap_gauss_r12_ao_with1s(B_center, beta, r, expo_fit, i, j)
-
-            tmp += -0.25d0 * coef * coef_fit * int_fit
-          enddo
-        enddo
-
-        int2_grad1u2_grad2u2_j1b2(j,i,ipoint) = tmp
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        int2_grad1u2_grad2u2_j1b2(j,i,ipoint) = int2_grad1u2_grad2u2_j1b2(i,j,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for int2_grad1u2_grad2u2_j1b2', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_u2_j1b2, (ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! int dr2 phi_i(r2) phi_j(r2) 1s_j1b(r2)^2 [u_12^mu]^2
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: i, j, ipoint, i_1s, i_fit
-  double precision              :: r(3), int_fit, expo_fit, coef_fit
-  double precision              :: coef, beta, B_center(3), tmp
-  double precision              :: wall0, wall1
-
-  double precision, external    :: overlap_gauss_r12_ao_with1s
-
-  provide mu_erf final_grid_points j1b_pen
-  call wall_time(wall0)
-
-  int2_u2_j1b2 = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                      &
- !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, tmp)                   & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
- !$OMP          final_grid_points, n_max_fit_slat,                  &
- !$OMP          expo_gauss_j_mu_x_2, coef_gauss_j_mu_x_2,           &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
- !$OMP          List_all_comb_b3_cent, int2_u2_j1b2)
- !$OMP DO
-  !do ipoint = 1, 10
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
-
-    do i = 1, ao_num
-      do j = i, ao_num
-
-        tmp = 0.d0
-        do i_1s = 1, List_all_comb_b3_size
-
-          coef        = List_all_comb_b3_coef  (i_1s)
-          beta        = List_all_comb_b3_expo  (i_1s)
-          B_center(1) = List_all_comb_b3_cent(1,i_1s)
-          B_center(2) = List_all_comb_b3_cent(2,i_1s)
-          B_center(3) = List_all_comb_b3_cent(3,i_1s)
-
-          do i_fit = 1, n_max_fit_slat
-
-            expo_fit = expo_gauss_j_mu_x_2(i_fit)
-            coef_fit = coef_gauss_j_mu_x_2(i_fit)
-            int_fit  = overlap_gauss_r12_ao_with1s(B_center, beta, r, expo_fit, i, j)
-
-            tmp += coef * coef_fit * int_fit
-          enddo
-        enddo
-
-        int2_u2_j1b2(j,i,ipoint) = tmp
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        int2_u2_j1b2(j,i,ipoint) = int2_u2_j1b2(i,j,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for int2_u2_j1b2', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_u_grad1u_x_j1b2, (3, ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! int dr2 phi_i(r2) phi_j(r2) 1s_j1b(r2)^2 u_12^mu [\grad_1 u_12^mu] r2
-  !
-  END_DOC
-
-  implicit none
-  integer          :: i, j, ipoint, i_1s, i_fit
-  double precision :: r(3), int_fit(3), expo_fit, coef_fit
-  double precision :: coef, beta, B_center(3), dist
-  double precision :: alpha_1s, alpha_1s_inv, centr_1s(3), expo_coef_1s, coef_tmp
-  double precision :: tmp_x, tmp_y, tmp_z
-  double precision :: wall0, wall1
-
-  provide mu_erf final_grid_points j1b_pen
-  call wall_time(wall0)
-
-  int2_u_grad1u_x_j1b2 = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                      &
- !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, alpha_1s, dist,        &
- !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp,     & 
- !$OMP          tmp_x, tmp_y, tmp_z)                                & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
- !$OMP          final_grid_points, n_max_fit_slat,                  &
- !$OMP          expo_gauss_j_mu_1_erf, coef_gauss_j_mu_1_erf,       &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
- !$OMP          List_all_comb_b3_cent, int2_u_grad1u_x_j1b2)
- !$OMP DO
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
-
-    do i = 1, ao_num
-      do j = i, ao_num
-
-        tmp_x = 0.d0
-        tmp_y = 0.d0
-        tmp_z = 0.d0
-        do i_1s = 1, List_all_comb_b3_size
-
-          coef        = List_all_comb_b3_coef  (i_1s)
-          beta        = List_all_comb_b3_expo  (i_1s)
-          B_center(1) = List_all_comb_b3_cent(1,i_1s)
-          B_center(2) = List_all_comb_b3_cent(2,i_1s)
-          B_center(3) = List_all_comb_b3_cent(3,i_1s)
-          dist        = (B_center(1) - r(1)) * (B_center(1) - r(1)) &
-                      + (B_center(2) - r(2)) * (B_center(2) - r(2)) &
-                      + (B_center(3) - r(3)) * (B_center(3) - r(3)) 
-
-          do i_fit = 1, n_max_fit_slat
-
-            expo_fit = expo_gauss_j_mu_1_erf(i_fit)
-            coef_fit = coef_gauss_j_mu_1_erf(i_fit)
-
-            alpha_1s     = beta + expo_fit
-            alpha_1s_inv = 1.d0 / alpha_1s 
-
-            centr_1s(1)  = alpha_1s_inv * (beta * B_center(1) + expo_fit * r(1))
-            centr_1s(2)  = alpha_1s_inv * (beta * B_center(2) + expo_fit * r(2))
-            centr_1s(3)  = alpha_1s_inv * (beta * B_center(3) + expo_fit * r(3))
-
-            expo_coef_1s = beta * expo_fit * alpha_1s_inv * dist 
-            !if(expo_coef_1s .gt. 80.d0) cycle
-            coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
-            !if(dabs(coef_tmp) .lt. 1d-10) cycle
-            
-            call NAI_pol_x_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s, 1.d+9, r, int_fit)
-
-            tmp_x += coef_tmp * int_fit(1)
-            tmp_y += coef_tmp * int_fit(2)
-            tmp_z += coef_tmp * int_fit(3)
-          enddo
-        enddo
-
-        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = tmp_x
-        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = tmp_y
-        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = tmp_z
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        int2_u_grad1u_x_j1b2(1,j,i,ipoint) = int2_u_grad1u_x_j1b2(1,i,j,ipoint)
-        int2_u_grad1u_x_j1b2(2,j,i,ipoint) = int2_u_grad1u_x_j1b2(2,i,j,ipoint)
-        int2_u_grad1u_x_j1b2(3,j,i,ipoint) = int2_u_grad1u_x_j1b2(3,i,j,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for int2_u_grad1u_x_j1b2', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, int2_u_grad1u_j1b2, (ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! int dr2 phi_i(r2) phi_j(r2) 1s_j1b(r2)^2 u_12^mu [\grad_1 u_12^mu]
-  !
-  END_DOC
-
-  implicit none
-  integer                       :: i, j, ipoint, i_1s, i_fit
-  double precision              :: r(3), int_fit, expo_fit, coef_fit, coef_tmp
-  double precision              :: coef, beta, B_center(3), dist
-  double precision              :: alpha_1s, alpha_1s_inv, centr_1s(3), expo_coef_1s, tmp
-  double precision              :: wall0, wall1
-  double precision, external    :: NAI_pol_mult_erf_ao_with1s
-
-  provide mu_erf final_grid_points j1b_pen
-  call wall_time(wall0)
-
-  int2_u_grad1u_j1b2 = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                      &
- !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, tmp, alpha_1s, dist,   &
- !$OMP          alpha_1s_inv, centr_1s, expo_coef_1s, coef_tmp)     & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b3_size, & 
- !$OMP          final_grid_points, n_max_fit_slat,                  &
- !$OMP          expo_gauss_j_mu_1_erf, coef_gauss_j_mu_1_erf,       &
- !$OMP          List_all_comb_b3_coef, List_all_comb_b3_expo,       & 
- !$OMP          List_all_comb_b3_cent, int2_u_grad1u_j1b2)
- !$OMP DO
-  do ipoint = 1, n_points_final_grid
-    do i = 1, ao_num
-      do j = i, ao_num
-        r(1) = final_grid_points(1,ipoint)
-        r(2) = final_grid_points(2,ipoint)
-        r(3) = final_grid_points(3,ipoint)
-
-        tmp = 0.d0
-        do i_1s = 1, List_all_comb_b3_size
-
-          coef        = List_all_comb_b3_coef  (i_1s)
-          beta        = List_all_comb_b3_expo  (i_1s)
-          B_center(1) = List_all_comb_b3_cent(1,i_1s)
-          B_center(2) = List_all_comb_b3_cent(2,i_1s)
-          B_center(3) = List_all_comb_b3_cent(3,i_1s)
-          dist        = (B_center(1) - r(1)) * (B_center(1) - r(1)) &
-                      + (B_center(2) - r(2)) * (B_center(2) - r(2)) &
-                      + (B_center(3) - r(3)) * (B_center(3) - r(3))
-
-          do i_fit = 1, n_max_fit_slat
-
-            expo_fit = expo_gauss_j_mu_1_erf(i_fit)
-            coef_fit = coef_gauss_j_mu_1_erf(i_fit)
-
-            alpha_1s     = beta + expo_fit
-            alpha_1s_inv = 1.d0 / alpha_1s 
-            centr_1s(1)  = alpha_1s_inv * (beta * B_center(1) + expo_fit * r(1))
-            centr_1s(2)  = alpha_1s_inv * (beta * B_center(2) + expo_fit * r(2))
-            centr_1s(3)  = alpha_1s_inv * (beta * B_center(3) + expo_fit * r(3))
-
-            expo_coef_1s = beta * expo_fit * alpha_1s_inv * dist
-            !if(expo_coef_1s .gt. 80.d0) cycle
-            coef_tmp = coef * coef_fit * dexp(-expo_coef_1s)
-            !if(dabs(coef_tmp) .lt. 1d-10) cycle
-            
-            int_fit = NAI_pol_mult_erf_ao_with1s(i, j, alpha_1s, centr_1s,  1.d+9, r)
-
-            tmp += coef_tmp * int_fit
-          enddo
-        enddo
-
-        int2_u_grad1u_j1b2(j,i,ipoint) = tmp
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        int2_u_grad1u_j1b2(j,i,ipoint) = int2_u_grad1u_j1b2(i,j,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for int2_u_grad1u_j1b2', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
--- a/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
+++ b/src/ao_many_one_e_ints/grad_lapl_jmu_modif.irp.f
@ -1,264 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ double precision, v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! int dr phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R| - 1) / |r - R|
-  !
-  END_DOC
-
-  implicit none
-  integer                    :: i, j, ipoint, i_1s
-  double precision           :: r(3), int_mu, int_coulomb
-  double precision           :: coef, beta, B_center(3)
-  double precision           :: tmp
-  double precision           :: wall0, wall1
-  double precision, external :: NAI_pol_mult_erf_ao_with1s
-
-  provide mu_erf final_grid_points j1b_pen
-  call wall_time(wall0)
-
-  v_ij_erf_rk_cst_mu_j1b = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                                         &
- !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center, int_mu, int_coulomb, tmp) & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b2_size, final_grid_points, &
- !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo, List_all_comb_b2_cent,   &
- !$OMP          v_ij_erf_rk_cst_mu_j1b, mu_erf)
- !$OMP DO
-  !do ipoint = 1, 10
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
-
-    do i = 1, ao_num
-      do j = i, ao_num
-
-        tmp = 0.d0
-        do i_1s = 1, List_all_comb_b2_size
-
-          coef        = List_all_comb_b2_coef  (i_1s)
-          beta        = List_all_comb_b2_expo  (i_1s)
-          B_center(1) = List_all_comb_b2_cent(1,i_1s)
-          B_center(2) = List_all_comb_b2_cent(2,i_1s)
-          B_center(3) = List_all_comb_b2_cent(3,i_1s)
-
-          int_mu      = NAI_pol_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r)
-          int_coulomb = NAI_pol_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r)
-
-          tmp += coef * (int_mu - int_coulomb)
-        enddo
-
-        v_ij_erf_rk_cst_mu_j1b(j,i,ipoint) = tmp
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        v_ij_erf_rk_cst_mu_j1b(j,i,ipoint) = v_ij_erf_rk_cst_mu_j1b(i,j,ipoint)
-      enddo
-    enddo
-  enddo
- 
-  call wall_time(wall1)
-  print*, ' wall time for v_ij_erf_rk_cst_mu_j1b', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_j1b, (ao_num, ao_num, n_points_final_grid, 3)]
-
-  BEGIN_DOC
-  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
-  END_DOC
-
-  implicit none
-  integer          :: i, j, ipoint
-  double precision :: wall0, wall1
-
-  call wall_time(wall0)
-
-  do ipoint = 1, n_points_final_grid
-    do i = 1, ao_num
-      do j = 1, ao_num
-        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,1) = x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint)
-        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,2) = x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint)
-        x_v_ij_erf_rk_cst_mu_j1b(j,i,ipoint,3) = x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_j1b', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-BEGIN_PROVIDER [ double precision, x_v_ij_erf_rk_cst_mu_tmp_j1b, (3, ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  ! int dr x phi_i(r) phi_j(r) 1s_j1b(r) (erf(mu(R) |r - R|) - 1)/|r - R|
-  END_DOC
-
-  implicit none
-  integer          :: i, j, ipoint, i_1s
-  double precision :: coef, beta, B_center(3), r(3), ints(3), ints_coulomb(3)
-  double precision :: tmp_x, tmp_y, tmp_z
-  double precision :: wall0, wall1
-
-  call wall_time(wall0)
-
-  x_v_ij_erf_rk_cst_mu_tmp_j1b = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                                        &
- !$OMP PRIVATE (ipoint, i, j, i_1s, r, coef, beta, B_center, ints, ints_coulomb,      & 
- !$OMP          tmp_x, tmp_y, tmp_z)                                                  & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b2_size, final_grid_points,&
- !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo, List_all_comb_b2_cent,  &
- !$OMP          x_v_ij_erf_rk_cst_mu_tmp_j1b, mu_erf)
- !$OMP DO
-  !do ipoint = 1, 10
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
-
-    do i = 1, ao_num
-      do j = i, ao_num
-
-        tmp_x = 0.d0
-        tmp_y = 0.d0
-        tmp_z = 0.d0
-        do i_1s = 1, List_all_comb_b2_size
-
-          coef        = List_all_comb_b2_coef  (i_1s)
-          beta        = List_all_comb_b2_expo  (i_1s)
-          B_center(1) = List_all_comb_b2_cent(1,i_1s)
-          B_center(2) = List_all_comb_b2_cent(2,i_1s)
-          B_center(3) = List_all_comb_b2_cent(3,i_1s)
-
-          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center, mu_erf, r, ints        )
-          call NAI_pol_x_mult_erf_ao_with1s(i, j, beta, B_center,  1.d+9, r, ints_coulomb)
-
-          tmp_x += coef * (ints(1) - ints_coulomb(1))
-          tmp_y += coef * (ints(2) - ints_coulomb(2))
-          tmp_z += coef * (ints(3) - ints_coulomb(3))
-        enddo
-
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint) = tmp_x
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint) = tmp_y
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint) = tmp_z
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(1,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(1,i,j,ipoint)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(2,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(2,i,j,ipoint)
-        x_v_ij_erf_rk_cst_mu_tmp_j1b(3,j,i,ipoint) = x_v_ij_erf_rk_cst_mu_tmp_j1b(3,i,j,ipoint)
-      enddo
-    enddo
-  enddo
-
-  call wall_time(wall1)
-  print*, ' wall time for x_v_ij_erf_rk_cst_mu_tmp_j1b', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
-! TODO analytically
-BEGIN_PROVIDER [ double precision, v_ij_u_cst_mu_j1b, (ao_num, ao_num, n_points_final_grid)]
-
-  BEGIN_DOC
-  !
-  ! int dr2 phi_i(r2) phi_j(r2) 1s_j1b(r2) u(mu, r12)
-  !
-  END_DOC
-
-  implicit none
-  integer                    :: i, j, ipoint, i_1s, i_fit
-  double precision           :: r(3), int_fit, expo_fit, coef_fit
-  double precision           :: coef, beta, B_center(3)
-  double precision           :: tmp
-  double precision           :: wall0, wall1
-
-  double precision, external :: overlap_gauss_r12_ao_with1s
-
-  provide mu_erf final_grid_points j1b_pen
-  call wall_time(wall0)
-
-  v_ij_u_cst_mu_j1b = 0.d0
-
- !$OMP PARALLEL DEFAULT (NONE)                                      &
- !$OMP PRIVATE (ipoint, i, j, i_1s, i_fit, r, coef, beta, B_center, &
- !$OMP          coef_fit, expo_fit, int_fit, tmp)                   & 
- !$OMP SHARED  (n_points_final_grid, ao_num, List_all_comb_b2_size, & 
- !$OMP          final_grid_points, n_max_fit_slat,                  &
- !$OMP          expo_gauss_j_mu_x, coef_gauss_j_mu_x,               &
- !$OMP          List_all_comb_b2_coef, List_all_comb_b2_expo,       & 
- !$OMP          List_all_comb_b2_cent, v_ij_u_cst_mu_j1b)
- !$OMP DO
-  !do ipoint = 1, 10
-  do ipoint = 1, n_points_final_grid
-    r(1) = final_grid_points(1,ipoint)
-    r(2) = final_grid_points(2,ipoint)
-    r(3) = final_grid_points(3,ipoint)
-
-    do i = 1, ao_num
-      do j = i, ao_num
-
-        tmp = 0.d0
-        do i_1s = 1, List_all_comb_b2_size
-
-          coef        = List_all_comb_b2_coef  (i_1s)
-          beta        = List_all_comb_b2_expo  (i_1s)
-          B_center(1) = List_all_comb_b2_cent(1,i_1s)
-          B_center(2) = List_all_comb_b2_cent(2,i_1s)
-          B_center(3) = List_all_comb_b2_cent(3,i_1s)
-
-          do i_fit = 1, n_max_fit_slat
-
-            expo_fit = expo_gauss_j_mu_x(i_fit)
-            coef_fit = coef_gauss_j_mu_x(i_fit)
-            int_fit  = overlap_gauss_r12_ao_with1s(B_center, beta, r, expo_fit, i, j)
-
-            tmp += coef * coef_fit * int_fit
-          enddo
-        enddo
-
-        v_ij_u_cst_mu_j1b(j,i,ipoint) = tmp
-      enddo
-    enddo
-  enddo
- !$OMP END DO
- !$OMP END PARALLEL
-
-  do ipoint = 1, n_points_final_grid
-    do i = 2, ao_num
-      do j = 1, i-1
-        v_ij_u_cst_mu_j1b(j,i,ipoint) = v_ij_u_cst_mu_j1b(i,j,ipoint)
-      enddo
-    enddo
-  enddo
- 
-  call wall_time(wall1)
-  print*, ' wall time for v_ij_u_cst_mu_j1b', wall1 - wall0
-
-END_PROVIDER 
-
-! ---
-
--- a/src/ao_many_one_e_ints/listj1b.irp.f
+++ b/src/ao_many_one_e_ints/listj1b.irp.f
@ -1,227 +0,0 @@
-
-! ---
-
-BEGIN_PROVIDER [ integer, List_all_comb_b2_size]
-
-  implicit none
-
-  List_all_comb_b2_size = 2**nucl_num
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ integer, List_all_comb_b2, (nucl_num, List_all_comb_b2_size)]
-
-  implicit none
-  integer :: i, j
-
-  if(nucl_num .gt. 32) then
-    print *, ' nucl_num = ', nucl_num, '> 32'
-    stop
-  endif
-
-  List_all_comb_b2 = 0
-
-  do i = 0, List_all_comb_b2_size-1
-    do j = 0, nucl_num-1
-      if (btest(i,j)) then
-        List_all_comb_b2(j+1,i+1) = 1
-      endif
-    enddo
-  enddo
-
-END_PROVIDER
-
-! ---
-
- BEGIN_PROVIDER [ double precision, List_all_comb_b2_coef, (   List_all_comb_b2_size)]
-&BEGIN_PROVIDER [ double precision, List_all_comb_b2_expo, (   List_all_comb_b2_size)]
-&BEGIN_PROVIDER [ double precision, List_all_comb_b2_cent, (3, List_all_comb_b2_size)]
-
-  implicit none
-  integer          :: i, j, k, phase
-  double precision :: tmp_alphaj, tmp_alphak
-  double precision :: tmp_cent_x, tmp_cent_y, tmp_cent_z
-
-  provide j1b_pen
-
-  List_all_comb_b2_coef = 0.d0
-  List_all_comb_b2_expo = 0.d0
-  List_all_comb_b2_cent = 0.d0
-
-  do i = 1, List_all_comb_b2_size
-
-    tmp_cent_x = 0.d0
-    tmp_cent_y = 0.d0
-    tmp_cent_z = 0.d0
-    do j = 1, nucl_num
-      tmp_alphaj = dble(List_all_comb_b2(j,i)) * j1b_pen(j)
-      List_all_comb_b2_expo(i) += tmp_alphaj
-      tmp_cent_x               += tmp_alphaj * nucl_coord(j,1)
-      tmp_cent_y               += tmp_alphaj * nucl_coord(j,2)
-      tmp_cent_z               += tmp_alphaj * nucl_coord(j,3)
-    enddo
-
-    ASSERT(List_all_comb_b2_expo(i) .gt. 0d0)
-    if(List_all_comb_b2_expo(i) .lt. 1d-10) cycle
-
-    List_all_comb_b2_cent(1,i) = tmp_cent_x / List_all_comb_b2_expo(i) 
-    List_all_comb_b2_cent(2,i) = tmp_cent_y / List_all_comb_b2_expo(i)
-    List_all_comb_b2_cent(3,i) = tmp_cent_z / List_all_comb_b2_expo(i)
-  enddo
-
-  ! ---
-
-  do i = 1, List_all_comb_b2_size
-
-    do j = 2, nucl_num, 1
-      tmp_alphaj = dble(List_all_comb_b2(j,i)) * j1b_pen(j)
-      do k = 1, j-1, 1
-        tmp_alphak = dble(List_all_comb_b2(k,i)) * j1b_pen(k)
-
-        List_all_comb_b2_coef(i) += tmp_alphaj * tmp_alphak * ( (nucl_coord(j,1) - nucl_coord(k,1)) * (nucl_coord(j,1) - nucl_coord(k,1)) &
-                                                              + (nucl_coord(j,2) - nucl_coord(k,2)) * (nucl_coord(j,2) - nucl_coord(k,2)) &
-                                                              + (nucl_coord(j,3) - nucl_coord(k,3)) * (nucl_coord(j,3) - nucl_coord(k,3)) )
-      enddo
-    enddo
-
-    if(List_all_comb_b2_expo(i) .lt. 1d-10) cycle
-
-    List_all_comb_b2_coef(i) = List_all_comb_b2_coef(i) / List_all_comb_b2_expo(i)
-  enddo
-
-  ! ---
-
-  do i = 1, List_all_comb_b2_size
-
-    phase = 0
-    do j = 1, nucl_num
-      phase += List_all_comb_b2(j,i)
-    enddo
-
-    List_all_comb_b2_coef(i) = (-1.d0)**dble(phase) * dexp(-List_all_comb_b2_coef(i))
-  enddo
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ integer, List_all_comb_b3_size]
-
-  implicit none
-
-  List_all_comb_b3_size = 3**nucl_num
-
-END_PROVIDER
-
-! ---
-
-BEGIN_PROVIDER [ integer, List_all_comb_b3, (nucl_num, List_all_comb_b3_size)]
-
-  implicit none
-  integer              :: i, j, ii, jj
-  integer, allocatable :: M(:,:), p(:)
-
-  if(nucl_num .gt. 32) then
-    print *, ' nucl_num = ', nucl_num, '> 32'
-    stop
-  endif
-
-  List_all_comb_b3(:,:)                     = 0
-  List_all_comb_b3(:,List_all_comb_b3_size) = 2
-
-  allocate(p(nucl_num))
-  p = 0
-
-  do i = 2, List_all_comb_b3_size-1
-    do j = 1, nucl_num
-
-      ii = 0
-      do jj = 1, j-1, 1
-        ii = ii + p(jj) * 3**(jj-1)
-      enddo
-      p(j) = modulo(i-1-ii, 3**j) / 3**(j-1)
-
-      List_all_comb_b3(j,i) = p(j)
-    enddo
-  enddo
-
-END_PROVIDER
-
-! ---
-
- BEGIN_PROVIDER [ double precision, List_all_comb_b3_coef, (   List_all_comb_b3_size)]
-&BEGIN_PROVIDER [ double precision, List_all_comb_b3_expo, (   List_all_comb_b3_size)]
-&BEGIN_PROVIDER [ double precision, List_all_comb_b3_cent, (3, List_all_comb_b3_size)]
-
-  implicit none
-  integer          :: i, j, k, phase
-  double precision :: tmp_alphaj, tmp_alphak, facto
-
-  provide j1b_pen
-
-  List_all_comb_b3_coef = 0.d0
-  List_all_comb_b3_expo = 0.d0
-  List_all_comb_b3_cent = 0.d0
-
-  do i = 1, List_all_comb_b3_size
-
-    do j = 1, nucl_num
-      tmp_alphaj = dble(List_all_comb_b3(j,i)) * j1b_pen(j)
-
-      List_all_comb_b3_expo(i)   += tmp_alphaj
-      List_all_comb_b3_cent(1,i) += tmp_alphaj * nucl_coord(j,1)
-      List_all_comb_b3_cent(2,i) += tmp_alphaj * nucl_coord(j,2)
-      List_all_comb_b3_cent(3,i) += tmp_alphaj * nucl_coord(j,3)
-
-    enddo
-
-    ASSERT(List_all_comb_b3_expo(i) .gt. 0d0)
-    if(List_all_comb_b3_expo(i) .lt. 1d-10) cycle
-
-    List_all_comb_b3_cent(1,i) = List_all_comb_b3_cent(1,i) / List_all_comb_b3_expo(i) 
-    List_all_comb_b3_cent(2,i) = List_all_comb_b3_cent(2,i) / List_all_comb_b3_expo(i)
-    List_all_comb_b3_cent(3,i) = List_all_comb_b3_cent(3,i) / List_all_comb_b3_expo(i)
-  enddo
-
-  ! ---
-
-  do i = 1, List_all_comb_b3_size
-
-    do j = 2, nucl_num, 1
-      tmp_alphaj = dble(List_all_comb_b3(j,i)) * j1b_pen(j)
-      do k = 1, j-1, 1
-        tmp_alphak = dble(List_all_comb_b3(k,i)) * j1b_pen(k)
-
-        List_all_comb_b3_coef(i) += tmp_alphaj * tmp_alphak * ( (nucl_coord(j,1) - nucl_coord(k,1)) * (nucl_coord(j,1) - nucl_coord(k,1)) &
-                                                              + (nucl_coord(j,2) - nucl_coord(k,2)) * (nucl_coord(j,2) - nucl_coord(k,2)) &
-                                                              + (nucl_coord(j,3) - nucl_coord(k,3)) * (nucl_coord(j,3) - nucl_coord(k,3)) )
-      enddo
-    enddo
-
-    if(List_all_comb_b3_expo(i) .lt. 1d-10) cycle
-
-    List_all_comb_b3_coef(i) = List_all_comb_b3_coef(i) / List_all_comb_b3_expo(i)
-  enddo
-
-  ! ---
-
-  do i = 1, List_all_comb_b3_size
-
-    facto = 1.d0
-    phase = 0
-    do j = 1, nucl_num
-      tmp_alphaj = dble(List_all_comb_b3(j,i)) 
-
-      facto *= 2.d0 / (gamma(tmp_alphaj+1.d0) * gamma(3.d0-tmp_alphaj))
-      phase += List_all_comb_b3(j,i)
-    enddo
-
-    List_all_comb_b3_coef(i) = (-1.d0)**dble(phase) * facto * dexp(-List_all_comb_b3_coef(i))
-  enddo
-
-END_PROVIDER
-
-! ---
-
--- a/src/ao_many_one_e_ints/prim_int_erf_gauss.irp.f
+++ b/src/ao_many_one_e_ints/prim_int_erf_gauss.irp.f
@ -142,7 +142,7 @@ double precision function erf_mu_gauss(D_center,delta,mu,A_center,B_center,power
  !
  ! .. math::
  ! 
-  !   \int dr exp(-delta (r - D)^2 ) erf(mu*|r-r'|)/ |r-r'| * (x-A_x)^a (x-B_x)^b \exp(-\alpha (x-A_x)^2 - \beta (x-B_x)^2 )
+  !   \int dr exp(-delta (r - D)^2 ) erf(mu*|r-D|)/ |r-D| * (x-A_x)^a (x-B_x)^b \exp(-\alpha (x-A_x)^2 - \beta (x-B_x)^2 )
  !
  END_DOC

--- a/src/ao_tc_eff_map/EZFIO.cfg
+++ b/src/ao_tc_eff_map/EZFIO.cfg
@ -1,18 +0,0 @@
-
-[j1b_pen]
-type: double precision
-doc: exponents of the 1-body Jastrow
-interface: ezfio
-size: (nuclei.nucl_num)
-
-[j1b_coeff]
-type: double precision
-doc: coeff of the 1-body Jastrow
-interface: ezfio
-size: (nuclei.nucl_num)
-
-[j1b_type]
-type: integer
-doc: type of 1-body Jastrow
-interface: ezfio, provider, ocaml
-default: 0
--- a/src/ao_tc_eff_map/NEED
+++ b/src/ao_tc_eff_map/NEED
@ -2,3 +2,4 @@ ao_two_e_erf_ints
 mo_one_e_ints
 ao_many_one_e_ints
 dft_utils_in_r
+tc_keywords
--- a/src/cipsi_tc_bi_ortho/EZFIO.cfg
+++ b/src/cipsi_tc_bi_ortho/EZFIO.cfg
@ -0,0 +1,36 @@
+[save_wf_after_selection]
+type: logical
+doc: If true, saves the wave function after the selection, before the diagonalization
+interface: ezfio,provider,ocaml
+default: False
+
+[seniority_max]
+type: integer
+doc: Maximum number of allowed open shells. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
+[excitation_ref]
+type: integer
+doc: 1: Hartree-Fock determinant, 2:All determinants of the dominant configuration
+interface: ezfio,ocaml,provider
+default: 1
+
+[excitation_max]
+type: integer
+doc: Maximum number of excitation with respect to the Hartree-Fock determinant. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
+[excitation_alpha_max]
+type: integer
+doc: Maximum number of excitation for alpha determinants with respect to the Hartree-Fock determinant. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
+[excitation_beta_max]
+type: integer
+doc: Maximum number of excitation for beta determinants with respect to the Hartree-Fock determinant. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
--- a/src/cipsi_tc_bi_ortho/NEED
+++ b/src/cipsi_tc_bi_ortho/NEED
@ -0,0 +1,6 @@
+mpi
+perturbation
+zmq
+iterations_tc
+csf
+tc_bi_ortho
--- a/src/cipsi_tc_bi_ortho/cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/cipsi.irp.f
@ -0,0 +1,136 @@
+subroutine run_cipsi
+
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with deterministic selection and
+  ! stochastic PT2.
+  END_DOC
+
+  use selection_types
+
+  implicit none
+
+  integer                        :: i,j,k,ndet
+  type(pt2_type)                 :: pt2_data, pt2_data_err
+  double precision, allocatable  :: zeros(:)
+  integer                        :: to_select
+  logical, external :: qp_stop
+
+  double precision :: threshold_generators_save
+  double precision :: rss
+  double precision, external :: memory_of_double
+  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
+
+  PROVIDE H_apply_buffer_allocated distributed_davidson
+
+  print*,'Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+  enddo
+
+  N_iter = 1
+  threshold_generators = 1.d0
+  SOFT_TOUCH threshold_generators
+
+  rss = memory_of_double(N_states)*4.d0
+  call check_mem(rss,irp_here)
+
+  allocate (zeros(N_states))
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+
+  double precision               :: hf_energy_ref
+  logical                        :: has, print_pt2
+  double precision               :: relative_error
+
+  relative_error=PT2_relative_error
+
+  zeros = 0.d0
+  pt2_data % pt2  = -huge(1.e0)
+  pt2_data % rpt2 = -huge(1.e0)
+  pt2_data % overlap(:,:) = 0.d0
+  pt2_data % variance = huge(1.e0)
+
+  if (s2_eig) then
+    call make_s2_eigenfunction
+  endif
+  print_pt2 = .False.
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+
+  call ezfio_has_hartree_fock_energy(has)
+  if (has) then
+    call ezfio_get_hartree_fock_energy(hf_energy_ref)
+  else
+    hf_energy_ref = ref_bitmask_energy
+  endif
+
+  if (N_det > N_det_max) then
+    psi_det(1:N_int,1:2,1:N_det) = psi_det_sorted_tc_gen(1:N_int,1:2,1:N_det)
+    psi_coef(1:N_det,1:N_states) = psi_coef_sorted_tc_gen(1:N_det,1:N_states)
+    N_det = N_det_max
+    soft_touch N_det psi_det psi_coef
+    if (s2_eig) then
+      call make_s2_eigenfunction
+    endif
+    print_pt2 = .False.
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!    call routine_save_right
+  endif
+
+  correlation_energy_ratio = 0.d0
+
+  print_pt2 = .True.
+  do while (                                                         &
+        (N_det < N_det_max) .and.                                    &
+        (maxval(abs(pt2_data % pt2(1:N_states))) > pt2_max)          &
+        )
+      write(*,'(A)')  '--------------------------------------------------------------------------------'
+
+
+    to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
+    to_select = max(N_states_diag, to_select)
+
+    E_denom = E_tc ! TC Energy of the current wave function 
+    if (do_pt2) then
+      call pt2_dealloc(pt2_data)
+      call pt2_dealloc(pt2_data_err)
+      call pt2_alloc(pt2_data, N_states)
+      call pt2_alloc(pt2_data_err, N_states)
+      threshold_generators_save = threshold_generators
+      threshold_generators = 1.d0
+      SOFT_TOUCH threshold_generators
+      call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
+      threshold_generators = threshold_generators_save
+      SOFT_TOUCH threshold_generators
+    else
+      call pt2_dealloc(pt2_data)
+      call pt2_alloc(pt2_data, N_states)
+      call ZMQ_selection(to_select, pt2_data)
+    endif
+
+    N_iter += 1
+
+    if (qp_stop()) exit
+
+    ! Add selected determinants
+    call copy_H_apply_buffer_to_wf()
+
+    if (save_wf_after_selection) then
+      call save_wavefunction
+    endif
+
+    PROVIDE  psi_coef
+    PROVIDE  psi_det
+    PROVIDE  psi_det_sorted_tc
+
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+    if (qp_stop()) exit
+  enddo
+
+  call pt2_dealloc(pt2_data)
+  call pt2_dealloc(pt2_data_err)
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+  call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+
+end
--- a/src/cipsi_tc_bi_ortho/energy.irp.f
+++ b/src/cipsi_tc_bi_ortho/energy.irp.f
@ -0,0 +1,51 @@
+BEGIN_PROVIDER [ logical, initialize_pt2_E0_denominator ]
+ implicit none
+ BEGIN_DOC
+ ! If true, initialize pt2_E0_denominator
+ END_DOC
+ initialize_pt2_E0_denominator = .True.
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, pt2_E0_denominator, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! E0 in the denominator of the PT2
+ END_DOC
+ integer :: i,j
+
+  pt2_E0_denominator = eigval_right_tc_bi_orth
+
+! if (initialize_pt2_E0_denominator) then
+!   if (h0_type == "EN") then
+!     pt2_E0_denominator(1:N_states) = psi_energy(1:N_states)
+!   else if (h0_type == "HF") then
+!     do i=1,N_states
+!       j = maxloc(abs(psi_coef(:,i)),1)
+!       pt2_E0_denominator(i) = psi_det_hii(j)
+!     enddo
+!   else if (h0_type == "Barycentric") then
+!     pt2_E0_denominator(1:N_states) = barycentric_electronic_energy(1:N_states)
+!   else if (h0_type == "CFG") then
+!     pt2_E0_denominator(1:N_states) = psi_energy(1:N_states)
+!   else
+!     print *,  h0_type, ' not implemented'
+!     stop
+!   endif
+!  do i=1,N_states
+!    call write_double(6,pt2_E0_denominator(i)+nuclear_repulsion, 'PT2 Energy denominator')
+!  enddo
+! else
+!   pt2_E0_denominator = -huge(1.d0)
+! endif
+
+END_PROVIDER
+
+
+BEGIN_PROVIDER [ double precision, pt2_overlap, (N_states, N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Overlap between the perturbed wave functions
+ END_DOC
+ pt2_overlap(1:N_states,1:N_states) = 0.d0
+END_PROVIDER
+
--- a/src/cipsi_tc_bi_ortho/environment.irp.f
+++ b/src/cipsi_tc_bi_ortho/environment.irp.f
@ -0,0 +1,14 @@
+BEGIN_PROVIDER [ integer, nthreads_pt2 ]
+ implicit none
+ BEGIN_DOC
+ ! Number of threads for Davidson
+ END_DOC
+ nthreads_pt2 = nproc
+ character*(32) :: env
+ call getenv('QP_NTHREADS_PT2',env)
+ if (trim(env) /= '') then
+   read(env,*) nthreads_pt2
+   call write_int(6,nthreads_pt2,'Target number of threads for PT2')
+ endif
+END_PROVIDER
+
--- a/src/cipsi_tc_bi_ortho/get_d.irp.f
+++ b/src/cipsi_tc_bi_ortho/get_d.irp.f
--- a/src/cipsi_tc_bi_ortho/lock_2rdm.irp.f
+++ b/src/cipsi_tc_bi_ortho/lock_2rdm.irp.f
--- a/src/cipsi_tc_bi_ortho/pt2.irp.f
+++ b/src/cipsi_tc_bi_ortho/pt2.irp.f
@ -0,0 +1,89 @@
+subroutine pt2_tc_bi_ortho
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+  integer                        :: i,j,k,ndet
+  double precision, allocatable  :: zeros(:)
+  integer                        :: to_select
+  type(pt2_type)                 :: pt2_data, pt2_data_err
+  logical, external              :: qp_stop
+  logical                        :: print_pt2
+
+  double precision :: rss
+  double precision, external :: memory_of_double
+  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
+  double precision, allocatable :: ept2(:), pt1(:),extrap_energy(:)
+  PROVIDE H_apply_buffer_allocated distributed_davidson mo_two_e_integrals_in_map
+
+  print*,'Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+  enddo
+  N_iter = 1
+  threshold_generators = 1.d0
+  SOFT_TOUCH threshold_generators
+
+  rss = memory_of_double(N_states)*4.d0
+  call check_mem(rss,irp_here)
+
+  allocate (zeros(N_states))
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+
+  double precision               :: hf_energy_ref
+  logical                        :: has
+  double precision               :: relative_error
+
+  relative_error=PT2_relative_error
+
+  zeros = 0.d0
+  pt2_data % pt2   = -huge(1.e0)
+  pt2_data % rpt2  = -huge(1.e0)
+  pt2_data % overlap= 0.d0
+  pt2_data % variance = huge(1.e0)
+
+  if (s2_eig) then
+    call make_s2_eigenfunction
+  endif
+  print_pt2 = .False.
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  call routine_save_right
+
+  if (N_det > N_det_max) then
+    psi_det(1:N_int,1:2,1:N_det) = psi_det_sorted_tc_gen(1:N_int,1:2,1:N_det)
+    psi_coef(1:N_det,1:N_states) = psi_coef_sorted_tc_gen(1:N_det,1:N_states)
+    N_det = N_det_max
+    soft_touch N_det psi_det psi_coef
+    if (s2_eig) then
+      call make_s2_eigenfunction
+    endif
+    print_pt2 = .False.
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+  endif
+
+  allocate(ept2(1000),pt1(1000),extrap_energy(100))
+
+  correlation_energy_ratio = 0.d0
+
+! thresh_it_dav  = 5.d-5
+! soft_touch thresh_it_dav
+
+  print_pt2 = .True.
+  to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
+  to_select = max(N_states_diag, to_select)
+
+  E_denom = E_tc ! TC Energy of the current wave function 
+  call pt2_dealloc(pt2_data)
+  call pt2_dealloc(pt2_data_err)
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+  call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
+
+  N_iter += 1
+
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+
+end
+
--- a/src/cipsi_tc_bi_ortho/pt2_stoch_routines.irp.f
+++ b/src/cipsi_tc_bi_ortho/pt2_stoch_routines.irp.f
@ -0,0 +1,869 @@
+BEGIN_PROVIDER [ integer, pt2_stoch_istate ]
+ implicit none
+ BEGIN_DOC
+ ! State for stochatsic PT2
+ END_DOC
+ pt2_stoch_istate = 1
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer, pt2_F, (N_det_generators) ]
+&BEGIN_PROVIDER [ integer, pt2_n_tasks_max ]
+  implicit none
+  logical, external :: testTeethBuilding
+  integer :: i,j
+  pt2_n_tasks_max = elec_alpha_num*elec_alpha_num + elec_alpha_num*elec_beta_num  - n_core_orb*2
+  pt2_n_tasks_max = min(pt2_n_tasks_max,1+N_det_generators/10000)
+  call write_int(6,pt2_n_tasks_max,'pt2_n_tasks_max')
+
+  pt2_F(:) = max(int(sqrt(float(pt2_n_tasks_max))),1)
+  do i=1,pt2_n_0(1+pt2_N_teeth/4)
+    pt2_F(i) = pt2_n_tasks_max*pt2_min_parallel_tasks
+  enddo
+  do i=1+pt2_n_0(pt2_N_teeth-pt2_N_teeth/4), pt2_n_0(pt2_N_teeth-pt2_N_teeth/10)
+    pt2_F(i) = pt2_min_parallel_tasks
+  enddo
+  do i=1+pt2_n_0(pt2_N_teeth-pt2_N_teeth/10), N_det_generators
+    pt2_F(i) = 1
+  enddo
+
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer, pt2_N_teeth ]
+&BEGIN_PROVIDER [ integer, pt2_minDetInFirstTeeth ]
+  implicit none
+  logical, external :: testTeethBuilding
+
+  if(N_det_generators < 500) then
+    pt2_minDetInFirstTeeth = 1
+    pt2_N_teeth = 1
+  else
+    pt2_minDetInFirstTeeth = min(5, N_det_generators)
+    do pt2_N_teeth=100,2,-1
+      if(testTeethBuilding(pt2_minDetInFirstTeeth, pt2_N_teeth)) exit
+    end do
+  end if
+  call write_int(6,pt2_N_teeth,'Number of comb teeth')
+END_PROVIDER
+
+
+logical function testTeethBuilding(minF, N)
+  implicit none
+  integer, intent(in) :: minF, N
+  integer :: n0, i
+  double precision :: u0, Wt, r
+
+  double precision, allocatable :: tilde_w(:), tilde_cW(:)
+  integer, external :: dress_find_sample
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+
+  rss = memory_of_double(2*N_det_generators+1)
+  call check_mem(rss,irp_here)
+
+  allocate(tilde_w(N_det_generators), tilde_cW(0:N_det_generators))
+
+  double precision :: norm2
+  norm2 = 0.d0
+  do i=N_det_generators,1,-1
+    tilde_w(i)  = psi_coef_sorted_tc_gen(i,pt2_stoch_istate) * &
+                  psi_coef_sorted_tc_gen(i,pt2_stoch_istate)
+    norm2 = norm2 + tilde_w(i)
+  enddo
+
+  f = 1.d0/norm2
+  tilde_w(:) = tilde_w(:) * f
+
+  tilde_cW(0) = -1.d0
+  do i=1,N_det_generators
+    tilde_cW(i) = tilde_cW(i-1) + tilde_w(i)
+  enddo
+  tilde_cW(:) = tilde_cW(:) + 1.d0
+  deallocate(tilde_w)
+
+  n0 = 0
+  testTeethBuilding = .false.
+  double precision :: f
+  integer :: minFN
+  minFN = N_det_generators - minF * N
+  f = 1.d0/dble(N)
+  do
+    u0 = tilde_cW(n0)
+    r = tilde_cW(n0 + minF)
+    Wt = (1d0 - u0) * f
+    if (dabs(Wt) <= 1.d-3) then
+      exit
+    endif
+    if(Wt >= r - u0) then
+       testTeethBuilding = .true.
+       exit
+    end if
+    n0 += 1
+    if(n0 > minFN) then
+      exit
+    end if
+  end do
+  deallocate(tilde_cW)
+
+end function
+
+
+
+subroutine ZMQ_pt2(E, pt2_data, pt2_data_err, relative_error, N_in)
+  use f77_zmq
+  use selection_types
+
+  implicit none
+
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket, zmq_socket_pull
+  integer, intent(in)            :: N_in
+!  integer, intent(inout)         :: N_in
+  double precision, intent(in)   :: relative_error, E(N_states)
+  type(pt2_type), intent(inout)  :: pt2_data, pt2_data_err
+!
+  integer                        :: i, N
+
+  double precision               :: state_average_weight_save(N_states), w(N_states,4)
+  integer(ZMQ_PTR), external     :: new_zmq_to_qp_run_socket
+  type(selection_buffer)         :: b
+
+  PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+  PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+  PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+  PROVIDE psi_bilinear_matrix_transp_order psi_selectors_coef_transp psi_det_sorted_tc
+  PROVIDE psi_det_hii selection_weight pseudo_sym
+  PROVIDE n_act_orb n_inact_orb n_core_orb n_virt_orb n_del_orb seniority_max
+  PROVIDE excitation_beta_max  excitation_alpha_max excitation_max
+  PROVIDE psi_selectors_rcoef_bi_orth_transp psi_selectors_lcoef_bi_orth_transp
+
+  if (h0_type == 'CFG') then
+    PROVIDE psi_configuration_hii det_to_configuration
+  endif
+
+  if (N_det <= max(4,N_states) .or. pt2_N_teeth < 2) then
+    print*,'ZMQ_selection'
+    call ZMQ_selection(N_in, pt2_data)
+  else
+    print*,'else ZMQ_selection'
+
+    N = max(N_in,1) * N_states
+    state_average_weight_save(:) = state_average_weight(:)
+    if (int(N,8)*2_8 > huge(1)) then
+      print *,  irp_here, ': integer too large'
+      stop -1
+    endif
+    call create_selection_buffer(N, N*2, b)
+    ASSERT (associated(b%det))
+    ASSERT (associated(b%val))
+
+    do pt2_stoch_istate=1,N_states
+      state_average_weight(:) = 0.d0
+      state_average_weight(pt2_stoch_istate) = 1.d0
+      TOUCH state_average_weight pt2_stoch_istate selection_weight
+
+      PROVIDE nproc pt2_F mo_two_e_integrals_in_map mo_one_e_integrals pt2_w
+      PROVIDE psi_selectors pt2_u pt2_J pt2_R
+      call new_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'pt2')
+
+      integer, external              :: zmq_put_psi
+      integer, external              :: zmq_put_N_det_generators
+      integer, external              :: zmq_put_N_det_selectors
+      integer, external              :: zmq_put_dvector
+      integer, external              :: zmq_put_ivector
+      if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+        stop 'Unable to put psi on ZMQ server'
+      endif
+      if (zmq_put_N_det_generators(zmq_to_qp_run_socket, 1) == -1) then
+        stop 'Unable to put N_det_generators on ZMQ server'
+      endif
+      if (zmq_put_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) then
+        stop 'Unable to put N_det_selectors on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',pt2_e0_denominator,size(pt2_e0_denominator)) == -1) then
+        stop 'Unable to put energy on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) then
+        stop 'Unable to put state_average_weight on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) then
+        stop 'Unable to put selection_weight on ZMQ server'
+      endif
+      if (zmq_put_ivector(zmq_to_qp_run_socket,1,'pt2_stoch_istate',pt2_stoch_istate,1) == -1) then
+        stop 'Unable to put pt2_stoch_istate on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) then
+        stop 'Unable to put threshold_generators on ZMQ server'
+      endif
+
+
+      integer, external :: add_task_to_taskserver
+      character(300000) :: task
+
+      integer :: j,k,ipos,ifirst
+      ifirst=0
+
+      ipos=0
+      do i=1,N_det_generators
+        if (pt2_F(i) > 1) then
+          ipos += 1
+        endif
+      enddo
+      call write_int(6,sum(pt2_F),'Number of tasks')
+      call write_int(6,ipos,'Number of fragmented tasks')
+
+      ipos=1
+      do i= 1, N_det_generators
+        do j=1,pt2_F(pt2_J(i))
+          write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, pt2_J(i), N_in
+          ipos += 30
+          if (ipos > 300000-30) then
+            if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+              stop 'Unable to add task to task server'
+            endif
+            ipos=1
+            if (ifirst == 0) then
+              ifirst=1
+              if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+                print *,  irp_here, ': Failed in zmq_set_running'
+              endif
+            endif
+          endif
+        end do
+      enddo
+      if (ipos > 1) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task to task server'
+        endif
+      endif
+
+      integer, external :: zmq_set_running
+      if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+        print *,  irp_here, ': Failed in zmq_set_running'
+      endif
+
+
+      double precision :: mem_collector, mem, rss
+
+      call resident_memory(rss)
+
+      mem_collector = 8.d0 *                  & ! bytes
+            ( 1.d0*pt2_n_tasks_max            & ! task_id, index
+            + 0.635d0*N_det_generators        & ! f,d
+            + pt2_n_tasks_max*pt2_type_size(N_states) & ! pt2_data_task
+            + N_det_generators*pt2_type_size(N_states)  & ! pt2_data_I
+            + 4.d0*(pt2_N_teeth+1)            & ! S, S2, T2, T3
+            + 1.d0*(N_int*2.d0*N + N)         & ! selection buffer
+            + 1.d0*(N_int*2.d0*N + N)         & ! sort selection buffer
+            ) / 1024.d0**3
+
+      integer :: nproc_target, ii
+      nproc_target = nthreads_pt2
+      ii = min(N_det, (elec_alpha_num*(mo_num-elec_alpha_num))**2)
+
+      do
+        mem = mem_collector +                   & !
+              nproc_target * 8.d0 *             & ! bytes
+              ( 0.5d0*pt2_n_tasks_max           & ! task_id
+              + 64.d0*pt2_n_tasks_max           & ! task
+              + pt2_type_size(N_states)*pt2_n_tasks_max*N_states   & ! pt2, variance, overlap
+              + 1.d0*pt2_n_tasks_max            & ! i_generator, subset
+              + 1.d0*(N_int*2.d0*ii+ ii)        & ! selection buffer
+              + 1.d0*(N_int*2.d0*ii+ ii)        & ! sort selection buffer
+              + 2.0d0*(ii)                      & ! preinteresting, interesting,
+                                                  ! prefullinteresting, fullinteresting
+              + 2.0d0*(N_int*2*ii)              & ! minilist, fullminilist
+              + 1.0d0*(N_states*mo_num*mo_num)  & ! mat
+              ) / 1024.d0**3
+
+        if (nproc_target == 0) then
+          call check_mem(mem,irp_here)
+          nproc_target = 1
+          exit
+        endif
+
+        if (mem+rss < qp_max_mem) then
+          exit
+        endif
+
+        nproc_target = nproc_target - 1
+
+      enddo
+      call write_int(6,nproc_target,'Number of threads for PT2')
+      call write_double(6,mem,'Memory (Gb)')
+
+      call omp_set_max_active_levels(1)
+
+
+      print '(A)', '========== ======================= ===================== ===================== ==========='
+      print '(A)', ' Samples          Energy                Variance               Norm^2          Seconds'
+      print '(A)', '========== ======================= ===================== ===================== ==========='
+
+      PROVIDE global_selection_buffer
+
+      !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc_target+1)            &
+          !$OMP  PRIVATE(i)
+      i = omp_get_thread_num()
+      if (i==0) then
+
+        call pt2_collector(zmq_socket_pull, E(pt2_stoch_istate),relative_error, pt2_data, pt2_data_err, b, N)
+        pt2_data % rpt2(pt2_stoch_istate) =  &
+          pt2_data % pt2(pt2_stoch_istate)/(1.d0+pt2_data % overlap(pt2_stoch_istate,pt2_stoch_istate))
+
+        !TODO : We should use here the correct formula for the error of X/Y
+        pt2_data_err % rpt2(pt2_stoch_istate) =  &
+          pt2_data_err % pt2(pt2_stoch_istate)/(1.d0 + pt2_data % overlap(pt2_stoch_istate,pt2_stoch_istate))
+
+      else
+        call pt2_slave_inproc(i)
+      endif
+      !$OMP END PARALLEL
+      call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'pt2')
+      call omp_set_max_active_levels(8)
+
+      print '(A)', '========== ======================= ===================== ===================== ==========='
+
+    do k=1,N_states
+      pt2_overlap(pt2_stoch_istate,k) = pt2_data % overlap(k,pt2_stoch_istate)
+    enddo
+    SOFT_TOUCH pt2_overlap
+
+    enddo
+    FREE pt2_stoch_istate
+
+    ! Symmetrize overlap
+    do j=2,N_states
+     do i=1,j-1
+       pt2_overlap(i,j) = 0.5d0 * (pt2_overlap(i,j) + pt2_overlap(j,i))
+       pt2_overlap(j,i) = pt2_overlap(i,j)
+     enddo
+    enddo
+
+    print *, 'Overlap of perturbed states:'
+    do k=1,N_states
+      print *, pt2_overlap(k,:)
+    enddo
+    print *, '-------'
+
+    if (N_in > 0) then
+      b%cur = min(N_in,b%cur)
+      if (s2_eig) then
+        call make_selection_buffer_s2(b)
+      else
+        call remove_duplicates_in_selection_buffer(b)
+      endif
+      call fill_H_apply_buffer_no_selection(b%cur,b%det,N_int,0)
+    endif
+    call delete_selection_buffer(b)
+
+    state_average_weight(:) = state_average_weight_save(:)
+    TOUCH state_average_weight
+    call update_pt2_and_variance_weights(pt2_data, N_states)
+  endif
+
+
+end subroutine
+
+
+subroutine pt2_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  PROVIDE global_selection_buffer
+  call run_pt2_slave(1,i,pt2_e0_denominator)
+end
+
+
+subroutine pt2_collector(zmq_socket_pull, E, relative_error, pt2_data, pt2_data_err, b, N_)
+  use f77_zmq
+  use selection_types
+  use bitmasks
+  implicit none
+
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  double precision, intent(in)   :: relative_error, E
+  type(pt2_type), intent(inout)  :: pt2_data, pt2_data_err
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(in)            :: N_
+
+  type(pt2_type), allocatable    :: pt2_data_task(:)
+  type(pt2_type), allocatable    :: pt2_data_I(:)
+  type(pt2_type), allocatable    :: pt2_data_S(:)
+  type(pt2_type), allocatable    :: pt2_data_S2(:)
+  type(pt2_type)                 :: pt2_data_teeth
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+  integer, external :: zmq_delete_tasks_async_send
+  integer, external :: zmq_delete_tasks_async_recv
+  integer, external :: zmq_abort
+  integer, external :: pt2_find_sample_lr
+
+  PROVIDE pt2_stoch_istate
+
+  integer :: more, n, i, p, c, t, n_tasks, U
+  integer, allocatable :: task_id(:)
+  integer, allocatable :: index(:)
+
+  double precision :: v, x, x2, x3, avg, avg2, avg3(N_states), eqt, E0, v0, n0(N_states)
+  double precision :: eqta(N_states)
+  double precision :: time, time1, time0
+
+  integer, allocatable :: f(:)
+  logical, allocatable :: d(:)
+  logical :: do_exit, stop_now, sending
+  logical, external :: qp_stop
+  type(selection_buffer) :: b2
+
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+
+  sending =.False.
+
+  rss  = memory_of_int(pt2_n_tasks_max*2+N_det_generators*2)
+  rss += memory_of_double(N_states*N_det_generators)*3.d0
+  rss += memory_of_double(N_states*pt2_n_tasks_max)*3.d0
+  rss += memory_of_double(pt2_N_teeth+1)*4.d0
+  call check_mem(rss,irp_here)
+
+  ! If an allocation is added here, the estimate of the memory should also be
+  ! updated in ZMQ_pt2
+  allocate(task_id(pt2_n_tasks_max), index(pt2_n_tasks_max), f(N_det_generators))
+  allocate(d(N_det_generators+1))
+  allocate(pt2_data_task(pt2_n_tasks_max))
+  allocate(pt2_data_I(N_det_generators))
+  allocate(pt2_data_S(pt2_N_teeth+1))
+  allocate(pt2_data_S2(pt2_N_teeth+1))
+
+
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+  call create_selection_buffer(N_, N_*2, b2)
+
+
+  pt2_data % pt2(pt2_stoch_istate) = -huge(1.)
+  pt2_data_err % pt2(pt2_stoch_istate) = huge(1.)
+  pt2_data % variance(pt2_stoch_istate) = huge(1.)
+  pt2_data_err % variance(pt2_stoch_istate) = huge(1.)
+  pt2_data % overlap(:,pt2_stoch_istate) = 0.d0
+  pt2_data_err % overlap(:,pt2_stoch_istate) = huge(1.)
+  n = 1
+  t = 0
+  U = 0
+  do i=1,pt2_n_tasks_max
+    call pt2_alloc(pt2_data_task(i),N_states)
+  enddo
+  do i=1,pt2_N_teeth+1
+    call pt2_alloc(pt2_data_S(i),N_states)
+    call pt2_alloc(pt2_data_S2(i),N_states)
+  enddo
+  do i=1,N_det_generators
+    call pt2_alloc(pt2_data_I(i),N_states)
+  enddo
+  f(:) = pt2_F(:)
+  d(:) = .false.
+  n_tasks = 0
+  E0 = E
+  v0 = 0.d0
+  n0(:) = 0.d0
+  more = 1
+  call wall_time(time0)
+  time1 = time0
+
+  do_exit = .false.
+  stop_now = .false.
+  do while (n <= N_det_generators)
+    if(f(pt2_J(n)) == 0) then
+      d(pt2_J(n)) = .true.
+      do while(d(U+1))
+        U += 1
+      end do
+
+      ! Deterministic part
+      do while(t <= pt2_N_teeth)
+        if(U >= pt2_n_0(t+1)) then
+          t=t+1
+          E0 = 0.d0
+          v0 = 0.d0
+          n0(:) = 0.d0
+          do i=pt2_n_0(t),1,-1
+            E0 += pt2_data_I(i) % pt2(pt2_stoch_istate)
+            v0 += pt2_data_I(i) % variance(pt2_stoch_istate)
+            n0(:) += pt2_data_I(i) % overlap(:,pt2_stoch_istate)
+          end do
+        else
+          exit
+        end if
+      end do
+
+      ! Add Stochastic part
+      c = pt2_R(n)
+      if(c > 0) then
+
+        call pt2_alloc(pt2_data_teeth,N_states)
+        do p=pt2_N_teeth, 1, -1
+          v = pt2_u_0 + pt2_W_T * (pt2_u(c) + dble(p-1))
+          i = pt2_find_sample_lr(v, pt2_cW,pt2_n_0(p),pt2_n_0(p+1))
+          v = pt2_W_T / pt2_w(i)
+          call pt2_add ( pt2_data_teeth,  v,  pt2_data_I(i) )
+          call pt2_add ( pt2_data_S(p),  1.d0,  pt2_data_teeth )
+          call pt2_add2( pt2_data_S2(p), 1.d0,  pt2_data_teeth )
+        enddo
+        call pt2_dealloc(pt2_data_teeth)
+
+        avg  = E0 + pt2_data_S(t) % pt2(pt2_stoch_istate) / dble(c)
+        avg2 = v0 + pt2_data_S(t) % variance(pt2_stoch_istate) / dble(c)
+        avg3(:) = n0(:) + pt2_data_S(t) % overlap(:,pt2_stoch_istate) / dble(c)
+        if ((avg /= 0.d0) .or. (n == N_det_generators) ) then
+          do_exit = .true.
+        endif
+        if (qp_stop()) then
+          stop_now = .True.
+        endif
+        pt2_data % pt2(pt2_stoch_istate) = avg
+        pt2_data % variance(pt2_stoch_istate) = avg2
+        pt2_data % overlap(:,pt2_stoch_istate) = avg3(:)
+        call wall_time(time)
+        ! 1/(N-1.5) : see  Brugger, The American Statistician (23) 4 p. 32 (1969)
+        if(c > 2) then
+          eqt = dabs((pt2_data_S2(t) % pt2(pt2_stoch_istate) / c) - (pt2_data_S(t) % pt2(pt2_stoch_istate)/c)**2) ! dabs for numerical stability
+          eqt = sqrt(eqt / (dble(c) - 1.5d0))
+          pt2_data_err % pt2(pt2_stoch_istate) = eqt
+
+          eqt = dabs((pt2_data_S2(t) % variance(pt2_stoch_istate) / c) - (pt2_data_S(t) % variance(pt2_stoch_istate)/c)**2) ! dabs for numerical stability
+          eqt = sqrt(eqt / (dble(c) - 1.5d0))
+          pt2_data_err % variance(pt2_stoch_istate) = eqt
+
+          eqta(:) = dabs((pt2_data_S2(t) % overlap(:,pt2_stoch_istate) / c) - (pt2_data_S(t) % overlap(:,pt2_stoch_istate)/c)**2) ! dabs for numerical stability
+          eqta(:) = sqrt(eqta(:) / (dble(c) - 1.5d0))
+          pt2_data_err % overlap(:,pt2_stoch_istate) = eqta(:)
+
+
+          if ((time - time1 > 1.d0) .or. (n==N_det_generators)) then
+            time1 = time
+            print '(I10, X, F12.6, X, G10.3, X, F10.6, X, G10.3, X, F10.6, X, G10.3, X, F10.4)', c, &
+              pt2_data     % pt2(pt2_stoch_istate) +E, &
+              pt2_data_err % pt2(pt2_stoch_istate), &
+              pt2_data     % variance(pt2_stoch_istate), &
+              pt2_data_err % variance(pt2_stoch_istate), &
+              pt2_data     % overlap(pt2_stoch_istate,pt2_stoch_istate), &
+              pt2_data_err % overlap(pt2_stoch_istate,pt2_stoch_istate), &
+              time-time0
+            if (stop_now .or. (                                      &
+                  (do_exit .and. (dabs(pt2_data_err % pt2(pt2_stoch_istate)) /    &
+                  (1.d-20 + dabs(pt2_data % pt2(pt2_stoch_istate)) ) <= relative_error))) ) then
+              if (zmq_abort(zmq_to_qp_run_socket) == -1) then
+                call sleep(10)
+                if (zmq_abort(zmq_to_qp_run_socket) == -1) then
+                  print *, irp_here, ': Error in sending abort signal (2)'
+                endif
+              endif
+            endif
+          endif
+        endif
+      end if
+      n += 1
+    else if(more == 0) then
+      exit
+    else
+      call pull_pt2_results(zmq_socket_pull, index, pt2_data_task, task_id, n_tasks, b2)
+      if(n_tasks > pt2_n_tasks_max)then
+       print*,'PB !!!'
+       print*,'If you see this, send a bug report with the following content'
+       print*,irp_here
+       print*,'n_tasks,pt2_n_tasks_max = ',n_tasks,pt2_n_tasks_max
+       stop -1
+      endif
+      if (zmq_delete_tasks_async_send(zmq_to_qp_run_socket,task_id,n_tasks,sending) == -1) then
+          stop 'PT2: Unable to delete tasks (send)'
+      endif
+      do i=1,n_tasks
+        if(index(i).gt.size(pt2_data_I,1).or.index(i).lt.1)then
+         print*,'PB !!!'
+         print*,'If you see this, send a bug report with the following content'
+         print*,irp_here
+         print*,'i,index(i),size(pt2_data_I,1) = ',i,index(i),size(pt2_data_I,1)
+         stop -1
+        endif
+        call pt2_add(pt2_data_I(index(i)),1.d0,pt2_data_task(i))
+        f(index(i)) -= 1
+      end do
+      do i=1, b2%cur
+        ! We assume the pulled buffer is sorted
+        if (b2%val(i) > b%mini) exit
+        call add_to_selection_buffer(b, b2%det(1,1,i), b2%val(i))
+      end do
+      if (zmq_delete_tasks_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
+          stop 'PT2: Unable to delete tasks (recv)'
+      endif
+    end if
+  end do
+  do i=1,N_det_generators
+    call pt2_dealloc(pt2_data_I(i))
+  enddo
+  do i=1,pt2_N_teeth+1
+    call pt2_dealloc(pt2_data_S(i))
+    call pt2_dealloc(pt2_data_S2(i))
+  enddo
+  do i=1,pt2_n_tasks_max
+    call pt2_dealloc(pt2_data_task(i))
+  enddo
+!print *,  'deleting b2'
+  call delete_selection_buffer(b2)
+!print *,  'sorting b'
+  call sort_selection_buffer(b)
+!print *,  'done'
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+
+end subroutine
+
+
+integer function pt2_find_sample(v, w)
+  implicit none
+  double precision, intent(in) :: v, w(0:N_det_generators)
+  integer, external :: pt2_find_sample_lr
+
+  pt2_find_sample = pt2_find_sample_lr(v, w, 0, N_det_generators)
+end function
+
+
+integer function pt2_find_sample_lr(v, w, l_in, r_in)
+  implicit none
+  double precision, intent(in) :: v, w(0:N_det_generators)
+  integer, intent(in) :: l_in,r_in
+  integer :: i,l,r
+
+  l=l_in
+  r=r_in
+
+  do while(r-l > 1)
+    i = shiftr(r+l,1)
+    if(w(i) < v) then
+      l = i
+    else
+      r = i
+    end if
+  end do
+  i = r
+  do r=i+1,N_det_generators
+    if (w(r) /= w(i)) then
+      exit
+    endif
+  enddo
+  pt2_find_sample_lr = r-1
+end function
+
+
+BEGIN_PROVIDER [ integer, pt2_n_tasks ]
+ implicit none
+ BEGIN_DOC
+ ! Number of parallel tasks for the Monte Carlo
+ END_DOC
+ pt2_n_tasks = N_det_generators
+END_PROVIDER
+
+BEGIN_PROVIDER[ double precision, pt2_u, (N_det_generators)]
+  implicit none
+  integer, allocatable :: seed(:)
+  integer :: m,i
+  call random_seed(size=m)
+  allocate(seed(m))
+  do i=1,m
+    seed(i) = i
+  enddo
+  call random_seed(put=seed)
+  deallocate(seed)
+
+  call RANDOM_NUMBER(pt2_u)
+ END_PROVIDER
+
+ BEGIN_PROVIDER[ integer, pt2_J, (N_det_generators)]
+&BEGIN_PROVIDER[ integer, pt2_R, (N_det_generators)]
+  implicit none
+  BEGIN_DOC
+! pt2_J contains the list of generators after ordering them according to the
+! Monte Carlo sampling.
+!
+! pt2_R(i) is the number of combs drawn when determinant i is computed.
+  END_DOC
+  integer                :: N_c, N_j
+  integer                :: U, t, i
+  double precision       :: v
+  integer, external      :: pt2_find_sample_lr
+
+  logical, allocatable :: pt2_d(:)
+  integer :: m,l,r,k
+  integer :: ncache
+  integer, allocatable :: ii(:,:)
+  double precision :: dt
+
+  ncache = min(N_det_generators,10000)
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+  rss = memory_of_int(ncache)*dble(pt2_N_teeth) + memory_of_int(N_det_generators)
+  call check_mem(rss,irp_here)
+
+  allocate(ii(pt2_N_teeth,ncache),pt2_d(N_det_generators))
+
+  pt2_R(:) = 0
+  pt2_d(:) = .false.
+  N_c = 0
+  N_j = pt2_n_0(1)
+  do i=1,N_j
+      pt2_d(i) = .true.
+      pt2_J(i) = i
+  end do
+
+  U = 0
+  do while(N_j < pt2_n_tasks)
+
+    if (N_c+ncache > N_det_generators) then
+      ncache = N_det_generators - N_c
+    endif
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(dt,v,t,k)
+    do k=1, ncache
+      dt = pt2_u_0
+      do t=1, pt2_N_teeth
+        v = dt + pt2_W_T *pt2_u(N_c+k)
+        dt = dt + pt2_W_T
+        ii(t,k) = pt2_find_sample_lr(v, pt2_cW,pt2_n_0(t),pt2_n_0(t+1))
+      end do
+    enddo
+    !$OMP END PARALLEL DO
+
+    do k=1,ncache
+      !ADD_COMB
+      N_c = N_c+1
+      do t=1, pt2_N_teeth
+        i = ii(t,k)
+        if(.not. pt2_d(i)) then
+          N_j += 1
+          pt2_J(N_j) = i
+          pt2_d(i) = .true.
+        end if
+      end do
+
+      pt2_R(N_j) = N_c
+
+      !FILL_TOOTH
+      do while(U < N_det_generators)
+        U += 1
+        if(.not. pt2_d(U)) then
+          N_j += 1
+          pt2_J(N_j) = U
+          pt2_d(U) = .true.
+          exit
+        end if
+      end do
+      if (N_j >= pt2_n_tasks) exit
+    end do
+  enddo
+
+  if(N_det_generators > 1) then
+    pt2_R(N_det_generators-1) = 0
+    pt2_R(N_det_generators) = N_c
+  end if
+
+  deallocate(ii,pt2_d)
+
+END_PROVIDER
+
+
+
+ BEGIN_PROVIDER [ double precision, pt2_w, (N_det_generators) ]
+&BEGIN_PROVIDER [ double precision, pt2_cW, (0:N_det_generators) ]
+&BEGIN_PROVIDER [ double precision, pt2_W_T ]
+&BEGIN_PROVIDER [ double precision, pt2_u_0 ]
+&BEGIN_PROVIDER [ integer,          pt2_n_0, (pt2_N_teeth+1) ]
+   implicit none
+   integer                        :: i, t
+   double precision, allocatable  :: tilde_w(:), tilde_cW(:)
+   double precision               :: r, tooth_width
+   integer, external              :: pt2_find_sample
+
+   double precision               :: rss
+   double precision, external     :: memory_of_double, memory_of_int
+   rss = memory_of_double(2*N_det_generators+1)
+   call check_mem(rss,irp_here)
+
+   if (N_det_generators == 1) then
+
+     pt2_w(1)   = 1.d0
+     pt2_cw(1)  = 1.d0
+     pt2_u_0    = 1.d0
+     pt2_W_T    = 0.d0
+     pt2_n_0(1) = 0
+     pt2_n_0(2) = 1
+
+   else
+
+     allocate(tilde_w(N_det_generators), tilde_cW(0:N_det_generators))
+
+     tilde_cW(0) = 0d0
+
+     do i=1,N_det_generators
+       tilde_w(i)  = psi_coef_sorted_tc_gen(i,pt2_stoch_istate)**2 !+ 1.d-20
+     enddo
+
+     double precision               :: norm2
+     norm2 = 0.d0
+     do i=N_det_generators,1,-1
+       norm2 += tilde_w(i)
+     enddo
+
+     tilde_w(:) = tilde_w(:) / norm2
+
+     tilde_cW(0) = -1.d0
+     do i=1,N_det_generators
+       tilde_cW(i) = tilde_cW(i-1) + tilde_w(i)
+     enddo
+     tilde_cW(:) = tilde_cW(:) + 1.d0
+
+     pt2_n_0(1) = 0
+     do
+     pt2_u_0 = tilde_cW(pt2_n_0(1))
+     r = tilde_cW(pt2_n_0(1) + pt2_minDetInFirstTeeth)
+     pt2_W_T = (1d0 - pt2_u_0) / dble(pt2_N_teeth)
+     if(pt2_W_T >= r - pt2_u_0) then
+       exit
+     end if
+     pt2_n_0(1) += 1
+     if(N_det_generators - pt2_n_0(1) < pt2_minDetInFirstTeeth * pt2_N_teeth) then
+       print *, "teeth building failed"
+       stop -1
+     end if
+   end do
+
+   do t=2, pt2_N_teeth
+     r = pt2_u_0 + pt2_W_T * dble(t-1)
+     pt2_n_0(t) = pt2_find_sample(r, tilde_cW)
+   end do
+   pt2_n_0(pt2_N_teeth+1) = N_det_generators
+
+   pt2_w(:pt2_n_0(1)) = tilde_w(:pt2_n_0(1))
+   do t=1, pt2_N_teeth
+     tooth_width = tilde_cW(pt2_n_0(t+1)) - tilde_cW(pt2_n_0(t))
+     if (tooth_width == 0.d0) then
+       tooth_width = sum(tilde_w(pt2_n_0(t):pt2_n_0(t+1)))
+     endif
+     ASSERT(tooth_width > 0.d0)
+     do i=pt2_n_0(t)+1, pt2_n_0(t+1)
+       pt2_w(i) = tilde_w(i) * pt2_W_T / tooth_width
+     end do
+   end do
+
+   pt2_cW(0) = 0d0
+   do i=1,N_det_generators
+     pt2_cW(i) = pt2_cW(i-1) + pt2_w(i)
+   end do
+   pt2_n_0(pt2_N_teeth+1) = N_det_generators
+
+ endif
+END_PROVIDER
+
+
+
+
+
--- a/src/cipsi_tc_bi_ortho/pt2_type.irp.f
+++ b/src/cipsi_tc_bi_ortho/pt2_type.irp.f
@ -0,0 +1,128 @@
+subroutine pt2_alloc(pt2_data,N)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(inout) :: pt2_data
+  integer, intent(in)           :: N
+  integer :: k
+
+  allocate(pt2_data % pt2(N)           &
+          ,pt2_data % variance(N)      &
+          ,pt2_data % rpt2(N)          &
+          ,pt2_data % overlap(N,N)     &
+          )
+
+  pt2_data % pt2(:)           = 0.d0
+  pt2_data % variance(:)      = 0.d0
+  pt2_data % rpt2(:)          = 0.d0
+  pt2_data % overlap(:,:)     = 0.d0
+
+end subroutine
+
+subroutine pt2_dealloc(pt2_data)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(inout) :: pt2_data
+  deallocate(pt2_data % pt2         &
+            ,pt2_data % variance    &
+            ,pt2_data % rpt2        &
+            ,pt2_data % overlap     &
+            )
+end subroutine
+
+subroutine pt2_add(p1, w, p2)
+  implicit none
+  use selection_types
+  BEGIN_DOC
+! p1 += w * p2
+  END_DOC
+  type(pt2_type), intent(inout) :: p1
+  double precision, intent(in)  :: w
+  type(pt2_type), intent(in)    :: p2
+
+  if (w == 1.d0) then
+
+    p1 % pt2(:)            = p1 % pt2(:)           + p2 % pt2(:)
+    p1 % rpt2(:)           = p1 % rpt2(:)          + p2 % rpt2(:)
+    p1 % variance(:)       = p1 % variance(:)      + p2 % variance(:)
+    p1 % overlap(:,:)      = p1 % overlap(:,:)     + p2 % overlap(:,:)
+
+  else
+
+    p1 % pt2(:)            = p1 % pt2(:)           + w * p2 % pt2(:)
+    p1 % rpt2(:)           = p1 % rpt2(:)          + w * p2 % rpt2(:)
+    p1 % variance(:)       = p1 % variance(:)      + w * p2 % variance(:)
+    p1 % overlap(:,:)      = p1 % overlap(:,:)     + w * p2 % overlap(:,:)
+
+  endif
+
+end subroutine
+
+
+subroutine pt2_add2(p1, w, p2)
+  implicit none
+  use selection_types
+  BEGIN_DOC
+! p1 += w * p2**2
+  END_DOC
+  type(pt2_type), intent(inout) :: p1
+  double precision, intent(in)  :: w
+  type(pt2_type), intent(in)    :: p2
+
+  if (w == 1.d0) then
+
+    p1 % pt2(:)           = p1 % pt2(:)           + p2 % pt2(:)           * p2 % pt2(:)
+    p1 % rpt2(:)          = p1 % rpt2(:)          + p2 % rpt2(:)          * p2 % rpt2(:)
+    p1 % variance(:)      = p1 % variance(:)      + p2 % variance(:)      * p2 % variance(:)
+    p1 % overlap(:,:)     = p1 % overlap(:,:)     + p2 % overlap(:,:)     * p2 % overlap(:,:)
+
+  else
+
+    p1 % pt2(:)           = p1 % pt2(:)           + w * p2 % pt2(:)           * p2 % pt2(:)
+    p1 % rpt2(:)          = p1 % rpt2(:)          + w * p2 % rpt2(:)          * p2 % rpt2(:)
+    p1 % variance(:)      = p1 % variance(:)      + w * p2 % variance(:)      * p2 % variance(:)
+    p1 % overlap(:,:)     = p1 % overlap(:,:)     + w * p2 % overlap(:,:)     * p2 % overlap(:,:)
+
+  endif
+
+end subroutine
+
+
+subroutine pt2_serialize(pt2_data, n, x)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(in)    :: pt2_data
+  integer, intent(in)           :: n
+  double precision, intent(out) :: x(*)
+
+  integer :: i,k,n2
+
+  n2 = n*n
+  x(1:n)           =  pt2_data % pt2(1:n)
+  k=n
+  x(k+1:k+n)     =  pt2_data % rpt2(1:n)
+  k=k+n
+  x(k+1:k+n)     =  pt2_data % variance(1:n)
+  k=k+n
+  x(k+1:k+n2)  =  reshape(pt2_data % overlap(1:n,1:n), (/ n2 /))
+
+end
+
+subroutine pt2_deserialize(pt2_data, n, x)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(inout) :: pt2_data
+  integer, intent(in)           :: n
+  double precision, intent(in)  :: x(*)
+
+  integer :: i,k,n2
+
+  n2 = n*n
+  pt2_data % pt2(1:n)           =   x(1:n)
+  k=n
+  pt2_data % rpt2(1:n)          =   x(k+1:k+n)
+  k=k+n
+  pt2_data % variance(1:n)      =   x(k+1:k+n)
+  k=k+n
+  pt2_data % overlap(1:n,1:n) = reshape(x(k+1:k+n2), (/ n, n /))
+
+end
--- a/src/cipsi_tc_bi_ortho/run_pt2_slave.irp.f
+++ b/src/cipsi_tc_bi_ortho/run_pt2_slave.irp.f
@ -0,0 +1,549 @@
+ use omp_lib
+ use selection_types
+ use f77_zmq
+BEGIN_PROVIDER [ integer(omp_lock_kind), global_selection_buffer_lock ]
+ use omp_lib
+ implicit none
+ BEGIN_DOC
+ ! Global buffer for the OpenMP selection
+ END_DOC
+ call omp_init_lock(global_selection_buffer_lock)
+END_PROVIDER
+
+BEGIN_PROVIDER [ type(selection_buffer), global_selection_buffer ]
+ use omp_lib
+ implicit none
+ BEGIN_DOC
+ ! Global buffer for the OpenMP selection
+ END_DOC
+ call omp_set_lock(global_selection_buffer_lock)
+ call delete_selection_buffer(global_selection_buffer)
+ call create_selection_buffer(N_det_generators, 2*N_det_generators, &
+    global_selection_buffer)
+ call omp_unset_lock(global_selection_buffer_lock)
+END_PROVIDER
+
+
+subroutine run_pt2_slave(thread,iproc,energy)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  double precision, intent(in)    :: energy(N_states_diag)
+  integer,  intent(in)            :: thread, iproc
+  if (N_det > 100000 ) then
+    call run_pt2_slave_large(thread,iproc,energy)
+  else
+    call run_pt2_slave_small(thread,iproc,energy)
+  endif
+end
+
+subroutine run_pt2_slave_small(thread,iproc,energy)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  double precision, intent(in)    :: energy(N_states_diag)
+  integer,  intent(in)            :: thread, iproc
+  integer                         :: rc, i
+
+  integer                        :: worker_id, ctask, ltask
+  character*(512), allocatable   :: task(:)
+  integer, allocatable           :: task_id(:)
+
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  type(selection_buffer) :: b
+  logical :: done, buffer_ready
+
+  type(pt2_type), allocatable :: pt2_data(:)
+  integer :: n_tasks, k, N
+  integer, allocatable :: i_generator(:), subset(:)
+
+  double precision, external :: memory_of_double, memory_of_int
+  integer :: bsize ! Size of selection buffers
+
+  allocate(task_id(pt2_n_tasks_max), task(pt2_n_tasks_max))
+  allocate(pt2_data(pt2_n_tasks_max), i_generator(pt2_n_tasks_max), subset(pt2_n_tasks_max))
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  integer, external :: connect_to_taskserver
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  b%N = 0
+  buffer_ready = .False.
+  n_tasks = 1
+
+  done = .False.
+  do while (.not.done)
+
+    n_tasks = max(1,n_tasks)
+    n_tasks = min(pt2_n_tasks_max,n_tasks)
+
+    integer, external :: get_tasks_from_taskserver
+    if (get_tasks_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task, n_tasks) == -1) then
+      exit
+    endif
+    done = task_id(n_tasks) == 0
+    if (done) then
+      n_tasks = n_tasks-1
+    endif
+    if (n_tasks == 0) exit
+
+    do k=1,n_tasks
+      call sscanf_ddd(task(k), subset(k), i_generator(k), N)
+    enddo
+    if (b%N == 0) then
+      ! Only first time
+      bsize = min(N, (elec_alpha_num * (mo_num-elec_alpha_num))**2)
+      call create_selection_buffer(bsize, bsize*2, b)
+      buffer_ready = .True.
+    else
+      ASSERT (b%N == bsize)
+    endif
+
+    double precision :: time0, time1
+    call wall_time(time0)
+    do k=1,n_tasks
+      call pt2_alloc(pt2_data(k),N_states)
+      b%cur = 0
+      call select_connected(i_generator(k),energy,pt2_data(k),b,subset(k),pt2_F(i_generator(k)))
+    enddo
+    call wall_time(time1)
+
+    integer, external :: tasks_done_to_taskserver
+    if (tasks_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id,n_tasks) == -1) then
+      done = .true.
+    endif
+    call sort_selection_buffer(b)
+    call push_pt2_results(zmq_socket_push, i_generator, pt2_data, b, task_id, n_tasks)
+    do k=1,n_tasks
+      call pt2_dealloc(pt2_data(k))
+    enddo
+    b%cur=0
+
+!    ! Try to adjust n_tasks around nproc/2 seconds per job
+    n_tasks = min(2*n_tasks,int( dble(n_tasks * nproc/2) / (time1 - time0 + 1.d0)))
+    n_tasks = min(n_tasks, pt2_n_tasks_max)
+!    n_tasks = 1
+  end do
+
+  integer, external :: disconnect_from_taskserver
+  do i=1,300
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) /= -2) exit
+    call usleep(500)
+    print *,  'Retry disconnect...'
+  end do
+
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  if (buffer_ready) then
+    call delete_selection_buffer(b)
+  endif
+  deallocate(pt2_data)
+end subroutine
+
+
+subroutine run_pt2_slave_large(thread,iproc,energy)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  double precision, intent(in)    :: energy(N_states_diag)
+  integer,  intent(in)            :: thread, iproc
+  integer                         :: rc, i
+
+  integer                        :: worker_id, ctask, ltask
+  character*(512)                :: task
+  integer                        :: task_id(1)
+
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  type(selection_buffer) :: b
+  logical :: done, buffer_ready
+
+  type(pt2_type) :: pt2_data
+  integer :: n_tasks, k, N
+  integer :: i_generator, subset
+
+  integer :: bsize ! Size of selection buffers
+  logical :: sending
+  double precision :: time_shift
+
+  PROVIDE global_selection_buffer global_selection_buffer_lock
+
+  call random_number(time_shift)
+  time_shift = time_shift*15.d0
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  integer, external :: connect_to_taskserver
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  b%N = 0
+  buffer_ready = .False.
+  n_tasks = 1
+
+  sending = .False.
+  done = .False.
+  double precision :: time0, time1
+  call wall_time(time0)
+  time0 = time0+time_shift
+  do while (.not.done)
+
+    integer, external :: get_tasks_from_taskserver
+    if (get_tasks_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task, n_tasks) == -1) then
+      exit
+    endif
+    done = task_id(1) == 0
+    if (done) then
+      n_tasks = n_tasks-1
+    endif
+    if (n_tasks == 0) exit
+
+    call sscanf_ddd(task, subset, i_generator, N)
+    if( pt2_F(i_generator) <= 0 .or. pt2_F(i_generator) > N_det ) then
+      print *,  irp_here
+      stop 'bug in selection'
+    endif
+    if (b%N == 0) then
+      ! Only first time
+      bsize = min(N, (elec_alpha_num * (mo_num-elec_alpha_num))**2)
+      call create_selection_buffer(bsize, bsize*2, b)
+      buffer_ready = .True.
+    else
+      ASSERT (b%N == bsize)
+    endif
+
+    call pt2_alloc(pt2_data,N_states)
+    b%cur = 0
+    call select_connected(i_generator,energy,pt2_data,b,subset,pt2_F(i_generator))
+
+    integer, external :: tasks_done_to_taskserver
+    if (tasks_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id,n_tasks) == -1) then
+      done = .true.
+    endif
+    call sort_selection_buffer(b)
+
+    call wall_time(time1)
+!    if (time1-time0 > 15.d0) then
+      call omp_set_lock(global_selection_buffer_lock)
+      global_selection_buffer%mini = b%mini
+      call merge_selection_buffers(b,global_selection_buffer)
+      b%cur=0
+      call omp_unset_lock(global_selection_buffer_lock)
+      call wall_time(time0)
+!    endif
+
+    call push_pt2_results_async_recv(zmq_socket_push,b%mini,sending)
+    if ( iproc == 1 .or. i_generator < 100 .or. done) then
+      call omp_set_lock(global_selection_buffer_lock)
+      call push_pt2_results_async_send(zmq_socket_push, (/i_generator/), (/pt2_data/), global_selection_buffer, (/task_id/), 1,sending)
+      global_selection_buffer%cur = 0
+      call omp_unset_lock(global_selection_buffer_lock)
+    else
+      call push_pt2_results_async_send(zmq_socket_push, (/i_generator/), (/pt2_data/), b, (/task_id/), 1,sending)
+    endif
+
+    call pt2_dealloc(pt2_data)
+  end do
+  call push_pt2_results_async_recv(zmq_socket_push,b%mini,sending)
+
+  integer, external :: disconnect_from_taskserver
+  do i=1,300
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) /= -2) exit
+    call sleep(1)
+    print *,  'Retry disconnect...'
+  end do
+
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  if (buffer_ready) then
+    call delete_selection_buffer(b)
+  endif
+  FREE global_selection_buffer
+end subroutine
+
+
+subroutine push_pt2_results(zmq_socket_push, index, pt2_data, b, task_id, n_tasks)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_push
+  type(pt2_type), intent(in)     :: pt2_data(n_tasks)
+  integer, intent(in) :: n_tasks, index(n_tasks), task_id(n_tasks)
+  type(selection_buffer), intent(inout) :: b
+
+  logical :: sending
+  sending = .False.
+  call push_pt2_results_async_send(zmq_socket_push, index, pt2_data, b, task_id, n_tasks, sending)
+  call push_pt2_results_async_recv(zmq_socket_push, b%mini, sending)
+end subroutine
+
+
+subroutine push_pt2_results_async_send(zmq_socket_push, index, pt2_data, b, task_id, n_tasks, sending)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_push
+  type(pt2_type), intent(in)     :: pt2_data(n_tasks)
+  integer, intent(in) :: n_tasks, index(n_tasks), task_id(n_tasks)
+  type(selection_buffer), intent(inout) :: b
+  logical, intent(inout) :: sending
+  integer :: rc, i
+  integer*8 :: rc8
+  double precision, allocatable :: pt2_serialized(:,:)
+
+  if (sending) then
+    print *,  irp_here, ': sending is true'
+    stop -1
+  endif
+  sending = .True.
+
+  rc = f77_zmq_send( zmq_socket_push, n_tasks, 4, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 1
+    return
+  else if(rc /= 4) then
+    stop 'push'
+  endif
+
+
+  rc = f77_zmq_send( zmq_socket_push, index, 4*n_tasks, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 2
+    return
+  else if(rc /= 4*n_tasks) then
+    stop 'push'
+  endif
+
+
+  allocate(pt2_serialized (pt2_type_size(N_states),n_tasks) )
+  do i=1,n_tasks
+    call pt2_serialize(pt2_data(i),N_states,pt2_serialized(1,i))
+  enddo
+
+  rc = f77_zmq_send( zmq_socket_push, pt2_serialized, size(pt2_serialized)*8, ZMQ_SNDMORE)
+  deallocate(pt2_serialized)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 3
+    return
+  else if(rc /= size(pt2_serialized)*8) then
+    stop 'push'
+  endif
+
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, n_tasks*4, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 6
+    return
+  else if(rc /= 4*n_tasks) then
+    stop 'push'
+  endif
+
+
+  if (b%cur == 0) then
+
+    rc = f77_zmq_send( zmq_socket_push, b%cur, 4, 0)
+    if (rc == -1) then
+      print *,  irp_here, ': error sending result'
+      stop 7
+      return
+    else if(rc /= 4) then
+      stop 'push'
+    endif
+
+  else
+
+    rc = f77_zmq_send( zmq_socket_push, b%cur, 4, ZMQ_SNDMORE)
+    if (rc == -1) then
+      print *,  irp_here, ': error sending result'
+      stop 7
+      return
+    else if(rc /= 4) then
+      stop 'push'
+    endif
+
+
+    rc8 = f77_zmq_send8( zmq_socket_push, b%val, 8_8*int(b%cur,8), ZMQ_SNDMORE)
+    if (rc8 == -1_8) then
+      print *,  irp_here, ': error sending result'
+      stop 8
+      return
+    else if(rc8 /= 8_8*int(b%cur,8)) then
+      stop 'push'
+    endif
+
+
+    rc8 = f77_zmq_send8( zmq_socket_push, b%det, int(bit_kind*N_int*2,8)*int(b%cur,8), 0)
+    if (rc8 == -1_8) then
+      print *,  irp_here, ': error sending result'
+      stop 9
+      return
+    else if(rc8 /= int(N_int*2*8,8)*int(b%cur,8)) then
+      stop 'push'
+    endif
+
+  endif
+
+end subroutine
+
+subroutine push_pt2_results_async_recv(zmq_socket_push,mini,sending)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)    :: zmq_socket_push
+  double precision, intent(out) :: mini
+  logical, intent(inout) :: sending
+  integer :: rc
+
+  if (.not.sending) return
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 10
+    return
+  else if ((rc /= 2).and.(ok(1:2) /= 'ok')) then
+    print *,  irp_here//': error in receiving ok'
+    stop -1
+  endif
+  rc = f77_zmq_recv( zmq_socket_push, mini, 8, 0)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 11
+    return
+  else if (rc /= 8) then
+    print *,  irp_here//': error in receiving mini'
+    stop 12
+  endif
+IRP_ENDIF
+  sending = .False.
+end subroutine
+
+
+
+subroutine pull_pt2_results(zmq_socket_pull, index, pt2_data, task_id, n_tasks, b)
+ use selection_types
+ use f77_zmq
+  implicit none
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  type(pt2_type), intent(inout)  :: pt2_data(*)
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(out) :: index(*)
+  integer, intent(out) :: n_tasks, task_id(*)
+  integer :: rc, rn, i
+  integer*8 :: rc8
+  double precision, allocatable :: pt2_serialized(:,:)
+
+  rc = f77_zmq_recv( zmq_socket_pull, n_tasks, 4, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4) then
+    stop 'pull'
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, index, 4*n_tasks, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4*n_tasks) then
+    stop 'pull'
+  endif
+
+  allocate(pt2_serialized (pt2_type_size(N_states),n_tasks) )
+  rc = f77_zmq_recv( zmq_socket_pull, pt2_serialized, 8*size(pt2_serialized)*n_tasks, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 8*size(pt2_serialized)) then
+    stop 'pull'
+  endif
+
+  do i=1,n_tasks
+    call pt2_deserialize(pt2_data(i),N_states,pt2_serialized(1,i))
+  enddo
+  deallocate(pt2_serialized)
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id, n_tasks*4, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4*n_tasks) then
+    stop 'pull'
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, b%cur, 4, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4) then
+    stop 'pull'
+  endif
+
+  if (b%cur > 0) then
+
+    rc8 = f77_zmq_recv8( zmq_socket_pull, b%val, 8_8*int(b%cur,8), 0)
+    if (rc8 == -1_8) then
+      n_tasks = 1
+      task_id(1) = 0
+    else if(rc8 /= 8_8*int(b%cur,8)) then
+      stop 'pull'
+    endif
+
+    rc8 = f77_zmq_recv8( zmq_socket_pull, b%det, int(bit_kind*N_int*2,8)*int(b%cur,8), 0)
+    if (rc8 == -1_8) then
+      n_tasks = 1
+      task_id(1) = 0
+    else if(rc8 /= int(N_int*2*8,8)*int(b%cur,8)) then
+      stop 'pull'
+    endif
+
+  endif
+
+! Activate is zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, ZMQ_SNDMORE)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if (rc /= 2) then
+    print *,  irp_here//': error in sending ok'
+    stop -1
+  endif
+  rc = f77_zmq_send( zmq_socket_pull, b%mini, 8, 0)
+IRP_ENDIF
+
+end subroutine
+
--- a/src/cipsi_tc_bi_ortho/run_selection_slave.irp.f
+++ b/src/cipsi_tc_bi_ortho/run_selection_slave.irp.f
@ -0,0 +1,255 @@
+subroutine run_selection_slave(thread, iproc, energy)
+
+  use f77_zmq
+  use selection_types
+
+  implicit none
+
+  double precision, intent(in) :: energy(N_states)
+  integer,          intent(in) :: thread, iproc
+
+  integer                      :: rc, i
+  integer                      :: worker_id, task_id(1), ctask, ltask
+  character*(512)              :: task
+  integer(ZMQ_PTR)             :: zmq_to_qp_run_socket
+  integer(ZMQ_PTR)             :: zmq_socket_push
+  integer(ZMQ_PTR), external   :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR), external   :: new_zmq_push_socket
+  type(selection_buffer)       :: buf, buf2
+  type(pt2_type)               :: pt2_data
+  logical                      :: done, buffer_ready
+
+  PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+  PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+  PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+  PROVIDE psi_bilinear_matrix_transp_order N_int pt2_F pseudo_sym
+  PROVIDE psi_selectors_coef_transp psi_det_sorted_tc weight_selection
+
+  call pt2_alloc(pt2_data,N_states)
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  integer, external :: connect_to_taskserver
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  buf%N = 0
+  buffer_ready = .False.
+  ctask = 1
+
+  do
+    integer, external :: get_task_from_taskserver
+    if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id(ctask), task) == -1) then
+      exit
+    endif
+    done = task_id(ctask) == 0
+    if (done) then
+      ctask = ctask - 1
+    else
+      integer :: i_generator, N, subset, bsize
+      call sscanf_ddd(task, subset, i_generator, N)
+      if(buf%N == 0) then
+        ! Only first time
+        call create_selection_buffer(N, N*2, buf)
+        buffer_ready = .True.
+      else
+        if (N /= buf%N) then
+          print *, 'N=', N
+          print *, 'buf%N=', buf%N
+          print *, 'bug in ', irp_here
+          stop '-1'
+        end if
+      end if
+      call select_connected(i_generator, energy, pt2_data, buf,subset, pt2_F(i_generator))
+    endif
+
+    integer, external :: task_done_to_taskserver
+
+    if(done .or. ctask == size(task_id)) then
+      do i=1, ctask
+         if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) == -1) then
+           call usleep(100)
+          if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) == -1) then
+            ctask = 0
+            done = .true.
+            exit
+          endif
+         endif
+      end do
+      if(ctask > 0) then
+        call sort_selection_buffer(buf)
+!        call merge_selection_buffers(buf,buf2)
+        call push_selection_results(zmq_socket_push, pt2_data, buf, task_id(1), ctask)
+        call pt2_dealloc(pt2_data)
+        call pt2_alloc(pt2_data,N_states)
+!        buf%mini = buf2%mini
+        buf%cur = 0
+      end if
+      ctask = 0
+    end if
+
+    if(done) exit
+    ctask = ctask + 1
+  end do
+
+  if(ctask > 0) then
+    call sort_selection_buffer(buf)
+!    call merge_selection_buffers(buf,buf2)
+    call push_selection_results(zmq_socket_push, pt2_data, buf, task_id(1), ctask)
+!    buf%mini = buf2%mini
+    buf%cur = 0
+  end if
+  ctask = 0
+  call pt2_dealloc(pt2_data)
+
+  integer, external :: disconnect_from_taskserver
+  if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+    continue
+  endif
+
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  if (buffer_ready) then
+    call delete_selection_buffer(buf)
+!    call delete_selection_buffer(buf2)
+  endif
+end subroutine
+
+
+subroutine push_selection_results(zmq_socket_push, pt2_data, b, task_id, ntasks)
+  use f77_zmq
+  use selection_types
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_push
+  type(pt2_type), intent(in)     :: pt2_data
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(in) :: ntasks, task_id(*)
+  integer :: rc
+  double precision, allocatable :: pt2_serialized(:)
+
+  rc = f77_zmq_send( zmq_socket_push, b%cur, 4, ZMQ_SNDMORE)
+  if(rc /= 4) then
+    print *,  'f77_zmq_send( zmq_socket_push, b%cur, 4, ZMQ_SNDMORE)'
+  endif
+
+
+  allocate(pt2_serialized (pt2_type_size(N_states)) )
+  call pt2_serialize(pt2_data,N_states,pt2_serialized)
+
+  rc = f77_zmq_send( zmq_socket_push, pt2_serialized, size(pt2_serialized)*8, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 3
+    return
+  else if(rc /= size(pt2_serialized)*8) then
+    stop 'push'
+  endif
+  deallocate(pt2_serialized)
+
+  if (b%cur > 0) then
+
+      rc = f77_zmq_send( zmq_socket_push, b%val(1), 8*b%cur, ZMQ_SNDMORE)
+      if(rc /= 8*b%cur) then
+        print *,  'f77_zmq_send( zmq_socket_push, b%val(1), 8*b%cur, ZMQ_SNDMORE)'
+      endif
+
+      rc = f77_zmq_send( zmq_socket_push, b%det(1,1,1), bit_kind*N_int*2*b%cur, ZMQ_SNDMORE)
+      if(rc /= bit_kind*N_int*2*b%cur) then
+        print *,  'f77_zmq_send( zmq_socket_push, b%det(1,1,1), bit_kind*N_int*2*b%cur, ZMQ_SNDMORE)'
+      endif
+
+  endif
+
+  rc = f77_zmq_send( zmq_socket_push, ntasks, 4, ZMQ_SNDMORE)
+  if(rc /= 4) then
+    print *,  'f77_zmq_send( zmq_socket_push, ntasks, 4, ZMQ_SNDMORE)'
+  endif
+
+  rc = f77_zmq_send( zmq_socket_push, task_id(1), ntasks*4, 0)
+  if(rc /= 4*ntasks) then
+    print *,  'f77_zmq_send( zmq_socket_push, task_id(1), ntasks*4, 0)'
+  endif
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if ((rc /= 2).and.(ok(1:2) /= 'ok')) then
+    print *,  irp_here//': error in receiving ok'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+
+subroutine pull_selection_results(zmq_socket_pull, pt2_data, val, det, N, task_id, ntasks)
+  use f77_zmq
+  use selection_types
+  implicit none
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  type(pt2_type), intent(inout) :: pt2_data
+  double precision, intent(out) :: val(*)
+  integer(bit_kind), intent(out) :: det(N_int, 2, *)
+  integer, intent(out) :: N, ntasks, task_id(*)
+  integer :: rc, rn, i
+  double precision, allocatable :: pt2_serialized(:)
+
+  rc = f77_zmq_recv( zmq_socket_pull, N, 4, 0)
+  if(rc /= 4) then
+    print *,  'f77_zmq_recv( zmq_socket_pull, N, 4, 0)'
+  endif
+
+  allocate(pt2_serialized (pt2_type_size(N_states)) )
+  rc = f77_zmq_recv( zmq_socket_pull, pt2_serialized, 8*size(pt2_serialized), 0)
+  if (rc == -1) then
+    ntasks = 1
+    task_id(1) = 0
+  else if(rc /= 8*size(pt2_serialized)) then
+    stop 'pull'
+  endif
+
+  call pt2_deserialize(pt2_data,N_states,pt2_serialized)
+  deallocate(pt2_serialized)
+
+  if (N>0) then
+      rc = f77_zmq_recv( zmq_socket_pull, val(1), 8*N, 0)
+      if(rc /= 8*N) then
+        print *,  'f77_zmq_recv( zmq_socket_pull, val(1), 8*N, 0)'
+      endif
+
+      rc = f77_zmq_recv( zmq_socket_pull, det(1,1,1), bit_kind*N_int*2*N, 0)
+      if(rc /= bit_kind*N_int*2*N) then
+        print *,  'f77_zmq_recv( zmq_socket_pull, det(1,1,1), bit_kind*N_int*2*N, 0)'
+      endif
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, ntasks, 4, 0)
+  if(rc /= 4) then
+    print *,  'f77_zmq_recv( zmq_socket_pull, ntasks, 4, 0)'
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id(1), ntasks*4, 0)
+  if(rc /= 4*ntasks) then
+    print *,  'f77_zmq_recv( zmq_socket_pull, task_id(1), ntasks*4, 0)'
+  endif
+
+! Activate is zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
+  if (rc /= 2) then
+    print *,  irp_here//': error in sending ok'
+    stop -1
+  endif
+IRP_ENDIF
+end subroutine
+
+
+
--- a/src/cipsi_tc_bi_ortho/selection.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection.irp.f
--- a/src/cipsi_tc_bi_ortho/selection_buffer.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection_buffer.irp.f
@ -0,0 +1,416 @@
+
+subroutine create_selection_buffer(N, size_in, res)
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Allocates the memory for a selection buffer.
+! The arrays have dimension size_in and the maximum number of elements is N
+  END_DOC
+
+  integer, intent(in) :: N, size_in
+  type(selection_buffer), intent(out) :: res
+
+  integer :: siz
+  siz = max(size_in,1)
+
+  double precision :: rss
+  double precision, external :: memory_of_double
+  rss = memory_of_double(siz)*(N_int*2+1)
+  call check_mem(rss,irp_here)
+
+  allocate(res%det(N_int, 2, siz), res%val(siz))
+
+  res%val(:) = 0d0
+  res%det(:,:,:) = 0_8
+  res%N = N
+  res%mini = 0d0
+  res%cur = 0
+end subroutine
+
+subroutine delete_selection_buffer(b)
+  use selection_types
+  implicit none
+  type(selection_buffer), intent(inout) :: b
+  if (associated(b%det)) then
+    deallocate(b%det)
+  endif
+  if (associated(b%val)) then
+    deallocate(b%val)
+  endif
+  NULLIFY(b%det)
+  NULLIFY(b%val)
+  b%cur = 0
+  b%mini = 0.d0
+  b%N = 0
+end
+
+
+subroutine add_to_selection_buffer(b, det, val)
+  use selection_types
+  implicit none
+
+  type(selection_buffer), intent(inout) :: b
+  integer(bit_kind), intent(in) :: det(N_int, 2)
+  double precision, intent(in) :: val
+  integer :: i
+
+  if(b%N > 0 .and. val <= b%mini) then
+    b%cur += 1
+    b%det(1:N_int,1:2,b%cur) = det(1:N_int,1:2)
+    b%val(b%cur) = val
+    if(b%cur == size(b%val)) then
+      call sort_selection_buffer(b)
+    end if
+  end if
+end subroutine
+
+subroutine merge_selection_buffers(b1, b2)
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Merges the selection buffers b1 and b2 into b2
+  END_DOC
+  type(selection_buffer), intent(inout) :: b1
+  type(selection_buffer), intent(inout) :: b2
+  integer(bit_kind), pointer     :: detmp(:,:,:)
+  double precision, pointer      :: val(:)
+  integer                        :: i, i1, i2, k, nmwen, sze
+  if (b1%cur == 0) return
+  do while (b1%val(b1%cur) > b2%mini)
+    b1%cur = b1%cur-1
+    if (b1%cur == 0) then
+      return
+    endif
+  enddo
+  nmwen = min(b1%N, b1%cur+b2%cur)
+  double precision :: rss
+  double precision, external :: memory_of_double
+  sze = max(size(b1%val), size(b2%val))
+  rss = memory_of_double(sze) + 2*N_int*memory_of_double(sze)
+  call check_mem(rss,irp_here)
+  allocate(val(sze), detmp(N_int, 2, sze))
+  i1=1
+  i2=1
+  do i=1,nmwen
+    if ( (i1 > b1%cur).and.(i2 > b2%cur) ) then
+      exit
+    else if (i1 > b1%cur) then
+        val(i) = b2%val(i2)
+        detmp(1:N_int,1,i) = b2%det(1:N_int,1,i2)
+        detmp(1:N_int,2,i) = b2%det(1:N_int,2,i2)
+        i2=i2+1
+    else if (i2 > b2%cur) then
+        val(i) = b1%val(i1)
+        detmp(1:N_int,1,i) = b1%det(1:N_int,1,i1)
+        detmp(1:N_int,2,i) = b1%det(1:N_int,2,i1)
+        i1=i1+1
+    else
+      if (b1%val(i1) <= b2%val(i2)) then
+        val(i) = b1%val(i1)
+        detmp(1:N_int,1,i) = b1%det(1:N_int,1,i1)
+        detmp(1:N_int,2,i) = b1%det(1:N_int,2,i1)
+        i1=i1+1
+      else
+        val(i) = b2%val(i2)
+        detmp(1:N_int,1,i) = b2%det(1:N_int,1,i2)
+        detmp(1:N_int,2,i) = b2%det(1:N_int,2,i2)
+        i2=i2+1
+      endif
+    endif
+  enddo
+  deallocate(b2%det, b2%val)
+  do i=nmwen+1,b2%N
+    val(i) = 0.d0
+    detmp(1:N_int,1:2,i) = 0_bit_kind
+  enddo
+  b2%det => detmp
+  b2%val => val
+  b2%mini = min(b2%mini,b2%val(b2%N))
+  b2%cur = nmwen
+end
+
+
+subroutine sort_selection_buffer(b)
+  use selection_types
+  implicit none
+
+  type(selection_buffer), intent(inout) :: b
+  integer, allocatable :: iorder(:)
+  integer(bit_kind), pointer :: detmp(:,:,:)
+  integer :: i, nmwen
+  logical, external :: detEq
+  if (b%N == 0 .or. b%cur == 0) return
+  nmwen = min(b%N, b%cur)
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+  rss = memory_of_int(b%cur) + 2*N_int*memory_of_double(size(b%det,3))
+  call check_mem(rss,irp_here)
+  allocate(iorder(b%cur), detmp(N_int, 2, size(b%det,3)))
+  do i=1,b%cur
+    iorder(i) = i
+  end do
+  call dsort(b%val, iorder, b%cur)
+  do i=1, nmwen
+    detmp(1:N_int,1,i) = b%det(1:N_int,1,iorder(i))
+    detmp(1:N_int,2,i) = b%det(1:N_int,2,iorder(i))
+  end do
+  deallocate(b%det,iorder)
+  b%det => detmp
+  b%mini = min(b%mini,b%val(b%N))
+  b%cur = nmwen
+end subroutine
+
+subroutine make_selection_buffer_s2(b)
+  use selection_types
+  type(selection_buffer), intent(inout) :: b
+
+  integer(bit_kind), allocatable :: o(:,:,:)
+  double precision, allocatable  :: val(:)
+
+  integer :: n_d
+  integer :: i,k,sze,n_alpha,j,n
+  logical                        :: dup
+
+  ! Sort
+  integer, allocatable           :: iorder(:)
+  integer*8, allocatable         :: bit_tmp(:)
+  integer*8, external            :: configuration_search_key
+  integer(bit_kind), allocatable :: tmp_array(:,:,:)
+  logical, allocatable           :: duplicate(:)
+
+  n_d = b%cur
+  double precision :: rss
+  double precision, external :: memory_of_double
+  rss = (4*N_int+4)*memory_of_double(n_d)
+  call check_mem(rss,irp_here)
+  allocate(o(N_int,2,n_d), iorder(n_d), duplicate(n_d), bit_tmp(n_d), &
+           tmp_array(N_int,2,n_d), val(n_d) )
+
+  do i=1,n_d
+    do k=1,N_int
+      o(k,1,i) = ieor(b%det(k,1,i), b%det(k,2,i))
+      o(k,2,i) = iand(b%det(k,1,i), b%det(k,2,i))
+    enddo
+    iorder(i) = i
+    bit_tmp(i) = configuration_search_key(o(1,1,i),N_int)
+  enddo
+
+  deallocate(b%det)
+
+  call i8sort(bit_tmp,iorder,n_d)
+
+  do i=1,n_d
+    do k=1,N_int
+      tmp_array(k,1,i) = o(k,1,iorder(i))
+      tmp_array(k,2,i) = o(k,2,iorder(i))
+    enddo
+    val(i) = b%val(iorder(i))
+    duplicate(i) = .False.
+  enddo
+
+  ! Find duplicates
+  do i=1,n_d-1
+    if (duplicate(i)) then
+      cycle
+    endif
+    j = i+1
+    do while (bit_tmp(j)==bit_tmp(i))
+      if (duplicate(j)) then
+        j+=1
+        if (j>n_d) then
+          exit
+        endif
+        cycle
+      endif
+      dup = .True.
+      do k=1,N_int
+        if ( (tmp_array(k,1,i) /= tmp_array(k,1,j))                   &
+              .or. (tmp_array(k,2,i) /= tmp_array(k,2,j)) ) then
+          dup = .False.
+          exit
+        endif
+      enddo
+      if (dup) then
+        val(i) = max(val(i), val(j))
+        duplicate(j) = .True.
+      endif
+      j+=1
+      if (j>n_d) then
+        exit
+      endif
+    enddo
+  enddo
+
+  deallocate (b%val)
+  ! Copy filtered result
+  integer :: n_p
+  n_p=0
+  do i=1,n_d
+    if (duplicate(i)) then
+      cycle
+    endif
+    n_p = n_p + 1
+    do k=1,N_int
+      o(k,1,n_p) = tmp_array(k,1,i)
+      o(k,2,n_p) = tmp_array(k,2,i)
+    enddo
+    val(n_p) = val(i)
+  enddo
+
+  ! Sort by importance
+  do i=1,n_p
+    iorder(i) = i
+  end do
+  call dsort(val,iorder,n_p)
+  do i=1,n_p
+    do k=1,N_int
+      tmp_array(k,1,i) = o(k,1,iorder(i))
+      tmp_array(k,2,i) = o(k,2,iorder(i))
+    enddo
+  enddo
+  do i=1,n_p
+    do k=1,N_int
+      o(k,1,i) = tmp_array(k,1,i)
+      o(k,2,i) = tmp_array(k,2,i)
+    enddo
+  enddo
+
+  ! Create determinants
+  n_d = 0
+  do i=1,n_p
+    call configuration_to_dets_size(o(1,1,i),sze,elec_alpha_num,N_int)
+    n_d = n_d + sze
+    if (n_d > b%cur) then
+!      if (n_d - b%cur > b%cur - n_d + sze) then
+!        n_d = n_d - sze
+!      endif
+      exit
+    endif
+  enddo
+
+  rss = (4*N_int+2)*memory_of_double(n_d)
+  call check_mem(rss,irp_here)
+  allocate(b%det(N_int,2,2*n_d), b%val(2*n_d))
+  k=1
+  do i=1,n_p
+    n=n_d
+    call configuration_to_dets_size(o(1,1,i),n,elec_alpha_num,N_int)
+    call configuration_to_dets(o(1,1,i),b%det(1,1,k),n,elec_alpha_num,N_int)
+    do j=k,k+n-1
+      b%val(j) = val(i)
+    enddo
+    k = k+n
+    if (k > n_d) exit
+  enddo
+  deallocate(o)
+  b%cur = n_d
+  b%N = n_d
+end
+
+
+
+
+subroutine remove_duplicates_in_selection_buffer(b)
+  use selection_types
+  type(selection_buffer), intent(inout) :: b
+
+  integer(bit_kind), allocatable :: o(:,:,:)
+  double precision, allocatable  :: val(:)
+
+  integer :: n_d
+  integer :: i,k,sze,n_alpha,j,n
+  logical                        :: dup
+
+  ! Sort
+  integer, allocatable           :: iorder(:)
+  integer*8, allocatable         :: bit_tmp(:)
+  integer*8, external            :: det_search_key
+  integer(bit_kind), allocatable :: tmp_array(:,:,:)
+  logical, allocatable           :: duplicate(:)
+
+  n_d = b%cur
+  logical                        :: found_duplicates
+  double precision               :: rss
+  double precision, external     :: memory_of_double
+  rss = (4*N_int+4)*memory_of_double(n_d)
+  call check_mem(rss,irp_here)
+
+  found_duplicates = .False.
+  allocate(iorder(n_d), duplicate(n_d), bit_tmp(n_d), &
+           tmp_array(N_int,2,n_d), val(n_d) )
+
+  do i=1,n_d
+    iorder(i) = i
+    bit_tmp(i) = det_search_key(b%det(1,1,i),N_int)
+  enddo
+
+  call i8sort(bit_tmp,iorder,n_d)
+
+  do i=1,n_d
+    do k=1,N_int
+      tmp_array(k,1,i) = b%det(k,1,iorder(i))
+      tmp_array(k,2,i) = b%det(k,2,iorder(i))
+    enddo
+    val(i) = b%val(iorder(i))
+    duplicate(i) = .False.
+  enddo
+
+  ! Find duplicates
+  do i=1,n_d-1
+    if (duplicate(i)) then
+      cycle
+    endif
+    j = i+1
+    do while (bit_tmp(j)==bit_tmp(i))
+      if (duplicate(j)) then
+        j+=1
+        if (j>n_d) then
+          exit
+        endif
+        cycle
+      endif
+      dup = .True.
+      do k=1,N_int
+        if ( (tmp_array(k,1,i) /= tmp_array(k,1,j))                   &
+              .or. (tmp_array(k,2,i) /= tmp_array(k,2,j)) ) then
+          dup = .False.
+          exit
+        endif
+      enddo
+      if (dup) then
+        duplicate(j) = .True.
+        found_duplicates = .True.
+      endif
+      j+=1
+      if (j>n_d) then
+        exit
+      endif
+    enddo
+  enddo
+
+  if (found_duplicates) then
+
+    ! Copy filtered result
+    integer :: n_p
+    n_p=0
+    do i=1,n_d
+      if (duplicate(i)) then
+        cycle
+      endif
+      n_p = n_p + 1
+      do k=1,N_int
+        b%det(k,1,n_p) = tmp_array(k,1,i)
+        b%det(k,2,n_p) = tmp_array(k,2,i)
+      enddo
+      val(n_p) = val(i)
+    enddo
+    b%cur=n_p
+    b%N=n_p
+
+  endif
+
+end
+
+
+
--- a/src/cipsi_tc_bi_ortho/selection_weight.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection_weight.irp.f
@ -0,0 +1,134 @@
+BEGIN_PROVIDER [ double precision, pt2_match_weight, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Weights adjusted along the selection to make the PT2 contributions
+ ! of each state coincide.
+ END_DOC
+ pt2_match_weight(:) = 1.d0
+END_PROVIDER
+
+
+
+BEGIN_PROVIDER [ double precision, variance_match_weight, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Weights adjusted along the selection to make the variances
+ ! of each state coincide.
+ END_DOC
+ variance_match_weight(:) = 1.d0
+END_PROVIDER
+
+
+
+subroutine update_pt2_and_variance_weights(pt2_data, N_st)
+  implicit none
+  use selection_types
+  BEGIN_DOC
+! Updates the PT2- and Variance- matching weights.
+  END_DOC
+  integer, intent(in)          :: N_st
+  type(pt2_type), intent(in)   :: pt2_data
+  double precision             :: pt2(N_st)
+  double precision             :: variance(N_st)
+
+  double precision :: avg, element, dt, x
+  integer          :: k
+  pt2(:)      = pt2_data % pt2(:)
+  variance(:) = pt2_data % variance(:)
+
+  avg = sum(pt2(1:N_st)) / dble(N_st) + 1.d-32 ! Avoid future division by zero
+
+  dt = 8.d0 !* selection_factor
+  do k=1,N_st
+    element = exp(dt*(pt2(k)/avg - 1.d0))
+    element = min(2.0d0 , element)
+    element = max(0.5d0 , element)
+    pt2_match_weight(k) *= element
+  enddo
+
+
+  avg = sum(variance(1:N_st)) / dble(N_st) + 1.d-32 ! Avoid future division by zero
+
+  do k=1,N_st
+    element = exp(dt*(variance(k)/avg -1.d0))
+    element = min(2.0d0 , element)
+    element = max(0.5d0 , element)
+    variance_match_weight(k) *= element
+  enddo
+
+  if (N_det < 100) then
+    ! For tiny wave functions, weights are 1.d0
+    pt2_match_weight(:) = 1.d0
+    variance_match_weight(:) = 1.d0
+  endif
+
+  threshold_davidson_pt2 = min(1.d-6, &
+     max(threshold_davidson, 1.e-1 * PT2_relative_error * minval(abs(pt2(1:N_states)))) )
+
+  SOFT_TOUCH pt2_match_weight variance_match_weight threshold_davidson_pt2
+end
+
+
+
+
+BEGIN_PROVIDER [ double precision, selection_weight, (N_states) ]
+   implicit none
+   BEGIN_DOC
+   ! Weights used in the selection criterion
+   END_DOC
+   select case (weight_selection)
+
+     case (0)
+      print *,  'Using input weights in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * state_average_weight(1:N_states)
+
+     case (1)
+      print *,  'Using 1/c_max^2 weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states)
+
+     case (2)
+      print *,  'Using pt2-matching weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * pt2_match_weight(1:N_states)
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+
+     case (3)
+      print *,  'Using variance-matching weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * variance_match_weight(1:N_states)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (4)
+      print *,  'Using variance- and pt2-matching weights in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * sqrt(variance_match_weight(1:N_states) * pt2_match_weight(1:N_states))
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (5)
+      print *,  'Using variance-matching weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * variance_match_weight(1:N_states)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (6)
+      print *,  'Using CI coefficient-based selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states)
+
+     case (7)
+      print *,  'Input weights multiplied by variance- and pt2-matching'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * sqrt(variance_match_weight(1:N_states) * pt2_match_weight(1:N_states)) * state_average_weight(1:N_states)
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (8)
+      print *,  'Input weights multiplied by pt2-matching'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * pt2_match_weight(1:N_states) * state_average_weight(1:N_states)
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+
+     case (9)
+      print *,  'Input weights multiplied by variance-matching'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * variance_match_weight(1:N_states) * state_average_weight(1:N_states)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+    end select
+     print *, '# Total weight ', real(selection_weight(:),4)
+
+END_PROVIDER
+
--- a/src/cipsi_tc_bi_ortho/slave_cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/slave_cipsi.irp.f
@ -0,0 +1,350 @@
+subroutine run_slave_cipsi
+
+  BEGIN_DOC
+  ! Helper program for distributed parallelism
+  END_DOC
+
+  implicit none
+
+  call omp_set_max_active_levels(1)
+  distributed_davidson = .False.
+  read_wf = .False.
+  SOFT_TOUCH read_wf distributed_davidson
+  call provide_everything
+  call switch_qp_run_to_master
+  call run_slave_main
+end
+
+subroutine provide_everything
+  PROVIDE H_apply_buffer_allocated mo_two_e_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context N_states_diag
+  PROVIDE psi_selectors_rcoef_bi_orth_transp psi_selectors_lcoef_bi_orth_transp
+
+  PROVIDE pt2_e0_denominator mo_num N_int ci_energy mpi_master zmq_state zmq_context
+  PROVIDE psi_det psi_coef threshold_generators state_average_weight
+  PROVIDE N_det_selectors pt2_stoch_istate N_det selection_weight pseudo_sym
+end
+
+
+subroutine run_slave_main
+
+  use f77_zmq
+
+  implicit none
+  IRP_IF MPI
+    include 'mpif.h'
+  IRP_ENDIF
+
+  integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket
+  double precision :: energy(N_states)
+  character*(64) :: states(10)
+  character*(64) :: old_state
+  integer :: rc, i, ierr
+  double precision :: t0, t1
+
+  integer, external              :: zmq_get_dvector, zmq_get_N_det_generators
+  integer, external              :: zmq_get8_dvector
+  integer, external              :: zmq_get_ivector
+  integer, external              :: zmq_get_psi, zmq_get_N_det_selectors, zmq_get_psi_bilinear
+  integer, external              :: zmq_get_psi_notouch
+  integer, external              :: zmq_get_N_states_diag
+
+  zmq_context = f77_zmq_ctx_new ()
+  states(1) = 'selection'
+  states(2) = 'davidson'
+  states(3) = 'pt2'
+  old_state = 'Waiting'
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  PROVIDE psi_det psi_coef threshold_generators state_average_weight mpi_master
+  PROVIDE zmq_state N_det_selectors pt2_stoch_istate N_det pt2_e0_denominator
+  PROVIDE N_det_generators N_states N_states_diag pt2_e0_denominator mpi_rank
+
+  IRP_IF MPI
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  do
+
+    if (mpi_master) then
+      call wait_for_states(states,zmq_state,size(states))
+      if (zmq_state(1:64) == old_state(1:64)) then
+        call usleep(200)
+        cycle
+      else
+        old_state(1:64) = zmq_state(1:64)
+      endif
+      print *,  trim(zmq_state)
+    endif
+
+    IRP_IF MPI_DEBUG
+      print *,  irp_here, mpi_rank
+      call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+    IRP_ENDIF
+    IRP_IF MPI
+      call MPI_BCAST (zmq_state, 128, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr)
+      if (ierr /= MPI_SUCCESS) then
+        print *,  irp_here, 'error in broadcast of zmq_state'
+      endif
+    IRP_ENDIF
+
+    if(zmq_state(1:7) == 'Stopped') then
+      exit
+    endif
+
+
+    if (zmq_state(1:9) == 'selection') then
+
+      ! Selection
+      ! ---------
+
+      call wall_time(t0)
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_psi')
+      IRP_ENDIF
+      if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector threshold_generators')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector energy')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_generators')
+      IRP_ENDIF
+      if (zmq_get_N_det_generators (zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_selectors')
+      IRP_ENDIF
+      if (zmq_get_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector state_average_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector selection_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) cycle
+      pt2_e0_denominator(1:N_states) = energy(1:N_states)
+      TOUCH pt2_e0_denominator state_average_weight threshold_generators selection_weight psi_det psi_coef
+
+      if (mpi_master) then
+        print *,  'N_det', N_det
+        print *,  'N_det_generators', N_det_generators
+        print *,  'N_det_selectors', N_det_selectors
+        print *,  'pt2_e0_denominator', pt2_e0_denominator
+        print *,  'pt2_stoch_istate', pt2_stoch_istate
+        print *,  'state_average_weight', state_average_weight
+        print *,  'selection_weight', selection_weight
+      endif
+      call wall_time(t1)
+      call write_double(6,(t1-t0),'Broadcast time')
+
+      IRP_IF MPI_DEBUG
+        call mpi_print('Entering OpenMP section')
+      IRP_ENDIF
+      !$OMP PARALLEL PRIVATE(i)
+      i = omp_get_thread_num()
+      call run_selection_slave(0,i,energy)
+      !$OMP END PARALLEL
+      print *,  mpi_rank, ': Selection done'
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call mpi_print('----------')
+
+    else if (zmq_state(1:8) == 'davidson') then
+
+      ! Davidson
+      ! --------
+
+      call wall_time(t0)
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_states_diag')
+      IRP_ENDIF
+      if (zmq_get_N_states_diag(zmq_to_qp_run_socket,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_psi')
+      IRP_ENDIF
+      if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle
+
+      call wall_time(t1)
+      call write_double(6,(t1-t0),'Broadcast time')
+
+      !---
+      call omp_set_max_active_levels(8)
+      call davidson_slave_tcp(0)
+      call omp_set_max_active_levels(1)
+      print *,  mpi_rank, ': Davidson done'
+      !---
+
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call mpi_print('----------')
+
+    else if (zmq_state(1:3) == 'pt2') then
+
+      ! PT2
+      ! ---
+
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call wall_time(t0)
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_psi')
+      IRP_ENDIF
+      if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_generators')
+      IRP_ENDIF
+      if (zmq_get_N_det_generators (zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_selectors')
+      IRP_ENDIF
+      if (zmq_get_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector threshold_generators')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector energy')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_ivector pt2_stoch_istate')
+      IRP_ENDIF
+      if (zmq_get_ivector(zmq_to_qp_run_socket,1,'pt2_stoch_istate',pt2_stoch_istate,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector state_average_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector selection_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) cycle
+      pt2_e0_denominator(1:N_states) = energy(1:N_states)
+      SOFT_TOUCH pt2_e0_denominator state_average_weight pt2_stoch_istate threshold_generators selection_weight psi_det psi_coef N_det_generators N_det_selectors
+
+
+      call wall_time(t1)
+      call write_double(6,(t1-t0),'Broadcast time')
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+
+
+      IRP_IF MPI_DEBUG
+        call mpi_print('Entering OpenMP section')
+      IRP_ENDIF
+      if (.true.) then
+        integer :: nproc_target, ii
+        double precision :: mem_collector, mem, rss
+
+        call resident_memory(rss)
+
+        nproc_target = nthreads_pt2
+        ii = min(N_det, (elec_alpha_num*(mo_num-elec_alpha_num))**2)
+
+        do
+          mem = rss +                             & !
+                nproc_target * 8.d0 *             & ! bytes
+                ( 0.5d0*pt2_n_tasks_max           & ! task_id
+                + 64.d0*pt2_n_tasks_max           & ! task
+                + 3.d0*pt2_n_tasks_max*N_states   & ! pt2, variance, norm
+                + 1.d0*pt2_n_tasks_max            & ! i_generator, subset
+                + 3.d0*(N_int*2.d0*ii+ ii)        & ! selection buffer
+                + 1.d0*(N_int*2.d0*ii+ ii)        & ! sort selection buffer
+                + 2.0d0*(ii)                      & ! preinteresting, interesting,
+                                                    ! prefullinteresting, fullinteresting
+                + 2.0d0*(N_int*2*ii)              & ! minilist, fullminilist
+                + 1.0d0*(N_states*mo_num*mo_num)  & ! mat
+                ) / 1024.d0**3
+
+          if (nproc_target == 0) then
+            call check_mem(mem,irp_here)
+            nproc_target = 1
+            exit
+          endif
+
+          if (mem+rss < qp_max_mem) then
+            exit
+          endif
+
+          nproc_target = nproc_target - 1
+
+        enddo
+        
+        if (N_det > 100000) then
+
+          if (mpi_master) then
+            print *,  'N_det', N_det
+            print *,  'N_det_generators', N_det_generators
+            print *,  'N_det_selectors', N_det_selectors
+            print *,  'pt2_e0_denominator', pt2_e0_denominator
+            print *,  'pt2_stoch_istate', pt2_stoch_istate
+            print *,  'state_average_weight', state_average_weight
+            print *,  'selection_weight', selection_weight
+            print *,  'Number of threads', nproc_target
+          endif
+
+          if (h0_type == 'CFG') then
+            PROVIDE det_to_configuration
+          endif
+
+          PROVIDE global_selection_buffer pt2_N_teeth pt2_F N_det_generators
+          PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+          PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+          PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+          PROVIDE psi_bilinear_matrix_transp_order psi_selectors_coef_transp psi_det_sorted_tc
+          PROVIDE psi_selectors_rcoef_bi_orth_transp psi_selectors_lcoef_bi_orth_transp
+
+          PROVIDE psi_det_hii selection_weight pseudo_sym pt2_min_parallel_tasks
+
+          if (mpi_master) then
+            print *,  'Running PT2'
+          endif
+          !$OMP PARALLEL PRIVATE(i) NUM_THREADS(nproc_target+1)
+          i = omp_get_thread_num()
+          call run_pt2_slave(0,i,pt2_e0_denominator)
+          !$OMP END PARALLEL
+          FREE state_average_weight
+          print *,  mpi_rank, ': PT2 done'
+          print *,  '-------'
+
+        endif
+      endif
+
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call mpi_print('----------')
+
+    endif
+
+  end do
+  IRP_IF MPI
+    call MPI_finalize(ierr)
+  IRP_ENDIF
+end
+
+
+
--- a/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@ -0,0 +1,147 @@
+subroutine run_stochastic_cipsi
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+  integer                        :: i,j,k,ndet
+  double precision, allocatable  :: zeros(:)
+  integer                        :: to_select
+  type(pt2_type)                 :: pt2_data, pt2_data_err
+  logical, external              :: qp_stop
+  logical                        :: print_pt2
+
+  double precision :: rss
+  double precision, external :: memory_of_double
+  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
+  double precision, allocatable :: ept2(:), pt1(:),extrap_energy(:)
+  PROVIDE H_apply_buffer_allocated distributed_davidson 
+
+  print*,'Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+  enddo
+  N_iter = 1
+  threshold_generators = 1.d0
+  SOFT_TOUCH threshold_generators
+
+  rss = memory_of_double(N_states)*4.d0
+  call check_mem(rss,irp_here)
+
+  allocate (zeros(N_states))
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+
+  double precision               :: hf_energy_ref
+  logical                        :: has
+  double precision               :: relative_error
+
+  relative_error=PT2_relative_error
+
+  zeros = 0.d0
+  pt2_data % pt2   = -huge(1.e0)
+  pt2_data % rpt2  = -huge(1.e0)
+  pt2_data % overlap= 0.d0
+  pt2_data % variance = huge(1.e0)
+
+  if (s2_eig) then
+    call make_s2_eigenfunction
+  endif
+  print_pt2 = .False.
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  call routine_save_right
+
+
+  if (N_det > N_det_max) then
+    psi_det(1:N_int,1:2,1:N_det) = psi_det_sorted_tc_gen(1:N_int,1:2,1:N_det)
+    psi_coef(1:N_det,1:N_states) = psi_coef_sorted_tc_gen(1:N_det,1:N_states)
+    N_det = N_det_max
+    soft_touch N_det psi_det psi_coef
+    if (s2_eig) then
+      call make_s2_eigenfunction
+    endif
+    print_pt2 = .False.
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!    call routine_save_right
+  endif
+
+  allocate(ept2(1000),pt1(1000),extrap_energy(100))
+
+  correlation_energy_ratio = 0.d0
+
+! thresh_it_dav  = 5.d-5
+! soft_touch thresh_it_dav
+
+  print_pt2 = .True.
+  do while (                                                         &
+        (N_det < N_det_max) .and.                                    &
+        (maxval(abs(pt2_data % pt2(1:N_states))) > pt2_max)          &
+        )
+      write(*,'(A)')  '--------------------------------------------------------------------------------'
+
+
+    to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
+    to_select = max(N_states_diag, to_select)
+
+    E_denom = E_tc ! TC Energy of the current wave function 
+    call pt2_dealloc(pt2_data)
+    call pt2_dealloc(pt2_data_err)
+    call pt2_alloc(pt2_data, N_states)
+    call pt2_alloc(pt2_data_err, N_states)
+    call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
+
+    N_iter += 1
+
+    if (qp_stop()) exit
+
+    ! Add selected determinants
+    call copy_H_apply_buffer_to_wf_tc()
+
+    PROVIDE  psi_l_coef_bi_ortho psi_r_coef_bi_ortho
+    PROVIDE  psi_det
+    PROVIDE  psi_det_sorted_tc
+
+    ept2(N_iter-1) = E_tc + nuclear_repulsion + (pt2_data % pt2(1))/norm
+    pt1(N_iter-1) = dsqrt(pt2_data % overlap(1,1))
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+    if (qp_stop()) exit
+  enddo
+!  print*,'data to extrapolate '
+!  do i = 2, N_iter
+!   print*,'iteration ',i
+!   print*,'pt1,Ept2',pt1(i),ept2(i)
+!   call get_extrapolated_energy(i-1,ept2(i),pt1(i),extrap_energy(i))
+!   do j = 2, i
+!    print*,'j,e,energy',j,extrap_energy(j)
+!   enddo
+!  enddo
+
+! thresh_it_dav  = 5.d-6
+! soft_touch thresh_it_dav
+
+  call pt2_dealloc(pt2_data)
+  call pt2_dealloc(pt2_data_err)
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+  call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  if (.not.qp_stop()) then
+!    if (N_det < N_det_max) then
+!     thresh_it_dav  = 5.d-7
+!     soft_touch thresh_it_dav
+!     call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!    endif
+!
+!    call pt2_dealloc(pt2_data)
+!    call pt2_dealloc(pt2_data_err)
+!    call pt2_alloc(pt2_data, N_states)
+!    call pt2_alloc(pt2_data_err, N_states)
+!    call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error, 0) ! Stochastic PT2
+!    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  endif
+!  call pt2_dealloc(pt2_data)
+!  call pt2_dealloc(pt2_data_err)
+!  call routine_save_right
+
+end
+
--- a/src/cipsi_tc_bi_ortho/zmq_selection.irp.f
+++ b/src/cipsi_tc_bi_ortho/zmq_selection.irp.f
@ -0,0 +1,235 @@
+subroutine ZMQ_selection(N_in, pt2_data)
+  use f77_zmq
+  use selection_types
+
+  implicit none
+
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket , zmq_socket_pull
+  integer, intent(in)            :: N_in
+  type(selection_buffer)         :: b
+  integer                        :: i, l, N
+  integer, external              :: omp_get_thread_num
+  type(pt2_type), intent(inout)  :: pt2_data
+
+  PROVIDE psi_det psi_coef N_det qp_max_mem N_states pt2_F s2_eig N_det_generators
+
+  N = max(N_in,1)
+  N = min(N, (elec_alpha_num * (mo_num-elec_alpha_num))**2)
+  if (.True.) then
+    PROVIDE pt2_e0_denominator nproc
+    PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+    PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+    PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+    PROVIDE psi_bilinear_matrix_transp_order selection_weight pseudo_sym
+    PROVIDE n_act_orb n_inact_orb n_core_orb n_virt_orb n_del_orb seniority_max
+    PROVIDE excitation_beta_max  excitation_alpha_max excitation_max
+
+    call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'selection')
+
+    integer, external              :: zmq_put_psi
+    integer, external              :: zmq_put_N_det_generators
+    integer, external              :: zmq_put_N_det_selectors
+    integer, external              :: zmq_put_dvector
+
+    if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+      stop 'Unable to put psi on ZMQ server'
+    endif
+    if (zmq_put_N_det_generators(zmq_to_qp_run_socket, 1) == -1) then
+      stop 'Unable to put N_det_generators on ZMQ server'
+    endif
+    if (zmq_put_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) then
+      stop 'Unable to put N_det_selectors on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',pt2_e0_denominator,size(pt2_e0_denominator)) == -1) then
+      stop 'Unable to put energy on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) then
+      stop 'Unable to put state_average_weight on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) then
+      stop 'Unable to put selection_weight on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) then
+      stop 'Unable to put threshold_generators on ZMQ server'
+    endif
+    call create_selection_buffer(N, N*2, b)
+  endif
+
+  integer, external :: add_task_to_taskserver
+  character(len=100000)           :: task
+  integer :: j,k,ipos
+  ipos=1
+  task = ' '
+
+ 
+ do i= 1, N_det_generators
+    do j=1,pt2_F(i)
+      write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, i, N
+      ipos += 30
+      if (ipos > 100000-30) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task to task server'
+        endif
+        ipos=1
+      endif
+    end do
+  enddo
+  if (ipos > 1) then
+    if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+      stop 'Unable to add task to task server'
+    endif
+  endif
+  N = max(N_in,1)
+
+
+  ASSERT (associated(b%det))
+  ASSERT (associated(b%val))
+
+  integer, external :: zmq_set_running
+  if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+    print *,  irp_here, ': Failed in zmq_set_running'
+  endif
+
+  integer :: nproc_target
+  if (N_det < 3*nproc) then
+    nproc_target = N_det/4
+  else
+    nproc_target = nproc
+  endif
+  double precision :: mem
+  mem = 8.d0 * N_det * (N_int * 2.d0 * 3.d0 +  3.d0 + 5.d0) / (1024.d0**3)
+  call write_double(6,mem,'Estimated memory/thread (Gb)')
+  if (qp_max_mem > 0) then
+    nproc_target = max(1,int(dble(qp_max_mem)/(0.1d0 + mem)))
+    nproc_target = min(nproc_target,nproc)
+  endif
+
+  f(:) = 1.d0
+  if (.not.do_pt2) then
+    double precision :: f(N_states), u_dot_u
+    do k=1,min(N_det,N_states)
+     f(k) = 1.d0 / u_dot_u(psi_selectors_coef(1,k), N_det_selectors)
+    enddo
+  endif
+
+  !$OMP PARALLEL DEFAULT(shared)  SHARED(b, pt2_data)  PRIVATE(i) NUM_THREADS(nproc_target+1)
+  i = omp_get_thread_num()
+  if (i==0) then
+    call selection_collector(zmq_socket_pull, b, N, pt2_data)
+  else
+    call selection_slave_inproc(i)
+  endif
+  !$OMP END PARALLEL
+
+  call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'selection')
+  if (N_in > 0) then
+    if (s2_eig) then
+      call make_selection_buffer_s2(b)
+    endif
+    call fill_H_apply_buffer_no_selection(b%cur,b%det,N_int,0)
+  endif
+  call delete_selection_buffer(b)
+
+  do k=1,N_states
+    pt2_data % pt2(k) = pt2_data % pt2(k) * f(k)
+    pt2_data % variance(k) = pt2_data % variance(k) * f(k)
+    do l=1,N_states
+      pt2_data % overlap(k,l) = pt2_data % overlap(k,l) * dsqrt(f(k)*f(l))
+      pt2_data % overlap(l,k) = pt2_data % overlap(l,k) * dsqrt(f(k)*f(l))
+    enddo
+
+    pt2_data % rpt2(k) =  &
+       pt2_data % pt2(k)/(1.d0 + pt2_data % overlap(k,k))
+  enddo
+
+  pt2_overlap(:,:) = pt2_data % overlap(:,:)
+
+  print *, 'Overlap of perturbed states:'
+  do l=1,N_states
+    print *, pt2_overlap(l,:)
+  enddo
+  print *, '-------'
+  SOFT_TOUCH pt2_overlap
+  call update_pt2_and_variance_weights(pt2_data, N_states)
+
+end subroutine
+
+
+subroutine selection_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  call run_selection_slave(1,i,pt2_e0_denominator)
+end
+
+subroutine selection_collector(zmq_socket_pull, b, N, pt2_data)
+  use f77_zmq
+  use selection_types
+  use bitmasks
+  implicit none
+
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(in)            :: N
+  type(pt2_type), intent(inout)  :: pt2_data
+  type(pt2_type)                 :: pt2_data_tmp
+
+  double precision               :: pt2_mwen(N_states)
+  double precision               :: variance_mwen(N_states)
+  double precision               :: norm2_mwen(N_states)
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_pull_socket
+
+  integer :: msg_size, rc, more
+  integer :: acc, i, j, robin, ntask
+  double precision, pointer :: val(:)
+  integer(bit_kind), pointer :: det(:,:,:)
+  integer, allocatable :: task_id(:)
+  type(selection_buffer) :: b2
+
+
+
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+  call create_selection_buffer(N, N*2, b2)
+  integer :: k
+  double precision :: rss
+  double precision, external :: memory_of_int
+  rss = memory_of_int(N_det_generators)
+  call check_mem(rss,irp_here)
+  allocate(task_id(N_det_generators))
+  more = 1
+  pt2_data % pt2(:)      = 0d0
+  pt2_data % variance(:) = 0.d0
+  pt2_data % overlap(:,:) = 0.d0
+  call pt2_alloc(pt2_data_tmp,N_states)
+  do while (more == 1)
+    call pull_selection_results(zmq_socket_pull, pt2_data_tmp, b2%val(1), b2%det(1,1,1), b2%cur, task_id, ntask)
+
+    call pt2_add(pt2_data, 1.d0, pt2_data_tmp)
+    do i=1, b2%cur
+      call add_to_selection_buffer(b, b2%det(1,1,i), b2%val(i))
+      if (b2%val(i) > b%mini) exit
+    end do
+
+    do i=1, ntask
+      if(task_id(i) == 0) then
+          print *,  "Error in collector"
+      endif
+      integer, external :: zmq_delete_task
+      if (zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) == -1) then
+        stop 'Unable to delete task'
+      endif
+    end do
+  end do
+  call pt2_dealloc(pt2_data_tmp)
+
+
+  call delete_selection_buffer(b2)
+  call sort_selection_buffer(b)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+end subroutine
+
--- a/src/cosgtos_ao_int/EZFIO.cfg
+++ b/src/cosgtos_ao_int/EZFIO.cfg
@ -0,0 +1,19 @@
+[ao_expoim_cosgtos]
+type: double precision
+doc: imag part for Exponents for each primitive of each cosGTOs |AO|
+size: (ao_basis.ao_num,ao_basis.ao_prim_num_max)
+interface: ezfio, provider
+
+[use_cosgtos]
+type: logical
+doc: If true, use cosgtos for AO integrals
+interface: ezfio,provider,ocaml
+default: False
+
+[ao_integrals_threshold]
+type: Threshold
+doc: If | (pq|rs) | < `ao_integrals_threshold` then (pq|rs) is zero
+interface: ezfio,provider,ocaml
+default: 1.e-15
+ezfio_name: threshold_ao
+
--- a/src/cosgtos_ao_int/NEED
+++ b/src/cosgtos_ao_int/NEED
@ -0,0 +1 @@
+ao_basis
--- a/src/cosgtos_ao_int/README.rst
+++ b/src/cosgtos_ao_int/README.rst
@ -0,0 +1,4 @@
+==============
+cosgtos_ao_int
+==============
+
--- a/src/cosgtos_ao_int/aos_cosgtos.irp.f
+++ b/src/cosgtos_ao_int/aos_cosgtos.irp.f
@ -0,0 +1,210 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, ao_coef_norm_ord_transp_cosgtos, (ao_prim_num_max, ao_num) ]
+
+  implicit none
+  integer :: i, j
+
+  do j = 1, ao_num
+    do i = 1, ao_prim_num_max
+      ao_coef_norm_ord_transp_cosgtos(i,j) = ao_coef_norm_ord_cosgtos(j,i)
+    enddo
+  enddo
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ complex*16, ao_expo_ord_transp_cosgtos, (ao_prim_num_max, ao_num) ]
+
+  implicit none
+  integer :: i, j
+
+  do j = 1, ao_num
+    do i = 1, ao_prim_num_max
+      ao_expo_ord_transp_cosgtos(i,j) = ao_expo_ord_cosgtos(j,i)
+    enddo
+  enddo
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, ao_coef_norm_cosgtos, (ao_num, ao_prim_num_max) ]
+
+  implicit none
+
+  integer          :: i, j, powA(3), nz
+  double precision :: norm
+  complex*16       :: overlap_x, overlap_y, overlap_z, C_A(3)
+  complex*16       :: integ1, integ2, expo
+
+  nz = 100
+
+  C_A(1) = (0.d0, 0.d0)
+  C_A(2) = (0.d0, 0.d0)
+  C_A(3) = (0.d0, 0.d0)
+
+  ao_coef_norm_cosgtos = 0.d0
+
+  do i = 1, ao_num
+
+    powA(1) = ao_power(i,1)
+    powA(2) = ao_power(i,2)
+    powA(3) = ao_power(i,3)
+
+    ! Normalization of the primitives
+    if(primitives_normalized) then
+
+      do j = 1, ao_prim_num(i)
+
+        expo = ao_expo(i,j) + (0.d0, 1.d0) * ao_expoim_cosgtos(i,j)
+
+        call overlap_cgaussian_xyz(C_A, C_A,        expo, expo, powA, powA, overlap_x, overlap_y, overlap_z, integ1, nz)
+        call overlap_cgaussian_xyz(C_A, C_A, conjg(expo), expo, powA, powA, overlap_x, overlap_y, overlap_z, integ2, nz)
+
+        norm = 2.d0 * real( integ1 + integ2 )
+
+        ao_coef_norm_cosgtos(i,j) = ao_coef(i,j) / dsqrt(norm)
+      enddo
+
+    else
+
+      do j = 1, ao_prim_num(i)
+        ao_coef_norm_cosgtos(i,j) = ao_coef(i,j)
+      enddo
+
+    endif
+
+  enddo
+
+END_PROVIDER
+
+! ---
+
+ BEGIN_PROVIDER [ double precision, ao_coef_norm_ord_cosgtos, (ao_num, ao_prim_num_max) ]
+&BEGIN_PROVIDER [ complex*16      , ao_expo_ord_cosgtos,      (ao_num, ao_prim_num_max) ]
+
+  implicit none
+  integer          :: i, j
+  integer          :: iorder(ao_prim_num_max)
+  double precision :: d(ao_prim_num_max,3)
+
+  d = 0.d0
+
+  do i = 1, ao_num
+
+    do j = 1, ao_prim_num(i)
+      iorder(j) = j
+      d(j,1) = ao_expo(i,j)
+      d(j,2) = ao_coef_norm_cosgtos(i,j)
+      d(j,3) = ao_expoim_cosgtos(i,j)
+    enddo
+
+    call dsort     (d(1,1), iorder, ao_prim_num(i))
+    call dset_order(d(1,2), iorder, ao_prim_num(i))
+    call dset_order(d(1,3), iorder, ao_prim_num(i))
+
+    do j = 1, ao_prim_num(i)
+      ao_expo_ord_cosgtos     (i,j) = d(j,1) + (0.d0, 1.d0) * d(j,3)
+      ao_coef_norm_ord_cosgtos(i,j) = d(j,2)
+    enddo
+
+  enddo
+
+END_PROVIDER
+
+! ---
+
+ BEGIN_PROVIDER [ double precision, ao_overlap_cosgtos,   (ao_num, ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_overlap_cosgtos_x, (ao_num, ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_overlap_cosgtos_y, (ao_num, ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_overlap_cosgtos_z, (ao_num, ao_num) ]
+
+  implicit none
+  integer          :: i, j, n, l, dim1, power_A(3), power_B(3)
+  double precision :: c, overlap, overlap_x, overlap_y, overlap_z
+  complex*16       :: alpha, beta, A_center(3), B_center(3)
+  complex*16       :: overlap1, overlap_x1, overlap_y1, overlap_z1
+  complex*16       :: overlap2, overlap_x2, overlap_y2, overlap_z2
+
+  ao_overlap_cosgtos   = 0.d0
+  ao_overlap_cosgtos_x = 0.d0
+  ao_overlap_cosgtos_y = 0.d0
+  ao_overlap_cosgtos_z = 0.d0
+
+  dim1 = 100
+
+ !$OMP PARALLEL DO SCHEDULE(GUIDED)                                                                 &
+ !$OMP DEFAULT(NONE)                                                                                &
+ !$OMP PRIVATE( A_center, B_center, power_A, power_B, alpha, beta, i, j, n, l, c                    &
+ !$OMP        , overlap_x , overlap_y , overlap_z , overlap                                         &
+ !$OMP        , overlap_x1, overlap_y1, overlap_z1, overlap1                                        &
+ !$OMP        , overlap_x2, overlap_y2, overlap_z2, overlap2 )                                      &
+ !$OMP SHARED( nucl_coord, ao_power, ao_prim_num, ao_num, ao_nucl, dim1                             &
+ !$OMP       , ao_overlap_cosgtos_x, ao_overlap_cosgtos_y, ao_overlap_cosgtos_z, ao_overlap_cosgtos &
+ !$OMP       , ao_coef_norm_ord_transp_cosgtos, ao_expo_ord_transp_cosgtos )
+
+  do j = 1, ao_num
+
+    A_center(1) = nucl_coord(ao_nucl(j),1) * (1.d0, 0.d0) 
+    A_center(2) = nucl_coord(ao_nucl(j),2) * (1.d0, 0.d0)
+    A_center(3) = nucl_coord(ao_nucl(j),3) * (1.d0, 0.d0)
+    power_A(1)  = ao_power(j,1)
+    power_A(2)  = ao_power(j,2)
+    power_A(3)  = ao_power(j,3)
+
+    do i = 1, ao_num
+
+      B_center(1) = nucl_coord(ao_nucl(i),1) * (1.d0, 0.d0) 
+      B_center(2) = nucl_coord(ao_nucl(i),2) * (1.d0, 0.d0)
+      B_center(3) = nucl_coord(ao_nucl(i),3) * (1.d0, 0.d0)
+      power_B(1)  = ao_power(i,1)
+      power_B(2)  = ao_power(i,2)
+      power_B(3)  = ao_power(i,3)
+
+      do n = 1, ao_prim_num(j)
+        alpha = ao_expo_ord_transp_cosgtos(n,j)
+
+        do l = 1, ao_prim_num(i)
+          c    = ao_coef_norm_ord_transp_cosgtos(n,j) * ao_coef_norm_ord_transp_cosgtos(l,i)
+          beta = ao_expo_ord_transp_cosgtos(l,i)
+
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                    , overlap_x1, overlap_y1, overlap_z1, overlap1, dim1 )
+
+          call overlap_cgaussian_xyz( A_center, B_center, conjg(alpha), beta, power_A, power_B &
+                                    , overlap_x2, overlap_y2, overlap_z2, overlap2, dim1       )
+
+          overlap_x = 2.d0 * real( overlap_x1 + overlap_x2 )
+          overlap_y = 2.d0 * real( overlap_y1 + overlap_y2 )
+          overlap_z = 2.d0 * real( overlap_z1 + overlap_z2 )
+          overlap   = 2.d0 * real( overlap1   + overlap2   )
+
+          ao_overlap_cosgtos(i,j) = ao_overlap_cosgtos(i,j) + c * overlap
+
+          if( isnan(ao_overlap_cosgtos(i,j)) ) then
+            print*,'i, j', i, j
+            print*,'l, n', l, n
+            print*,'c, overlap', c, overlap
+            print*, overlap_x, overlap_y, overlap_z
+            stop
+          endif
+
+          ao_overlap_cosgtos_x(i,j) = ao_overlap_cosgtos_x(i,j) + c * overlap_x
+          ao_overlap_cosgtos_y(i,j) = ao_overlap_cosgtos_y(i,j) + c * overlap_y
+          ao_overlap_cosgtos_z(i,j) = ao_overlap_cosgtos_z(i,j) + c * overlap_z
+
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END PARALLEL DO
+
+END_PROVIDER
+
+! ---
+
+
+
--- a/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
+++ b/src/cosgtos_ao_int/cosgtos_ao_int.irp.f
@ -0,0 +1,7 @@
+program cosgtos_ao_int
+  implicit none
+  BEGIN_DOC
+! TODO : Put the documentation of the program here
+  END_DOC
+  print *, 'Hello world'
+end
--- a/src/cosgtos_ao_int/expoim_opt.py
+++ b/src/cosgtos_ao_int/expoim_opt.py
@ -0,0 +1,172 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import subprocess
+from datetime import datetime
+import time
+import numpy as np
+from modif_powell_imp import my_fmin_powell
+
+QP_PATH=os.environ["QP_ROOT"]
+sys.path.insert(0, QP_PATH+"external/ezfio/Python")
+from ezfio import ezfio
+
+
+
+
+
+#------------------------------------------------------------------------------
+#
+def get_expoim():
+
+    expo_im = np.array(ezfio.get_cosgtos_ao_int_ao_expoim_cosgtos()).T
+    #print(expo_im.shape)
+
+    x = []
+    for i in range(ao_num):
+        for j in range(ao_prim_num[i]):
+            x.append(expo_im[i,j])
+
+    return x
+
+# ---
+
+def set_expoim(x):
+
+    expo_im = np.zeros((ao_num, ao_prim_num_max))
+
+    ii = 0
+    for i in range(ao_num):
+        for j in range(ao_prim_num[i]):
+            expo_im[i,j] = x[ii]
+            ii = ii + 1
+
+    ezfio.set_cosgtos_ao_int_ao_expoim_cosgtos(expo_im.T)
+#
+#------------------------------------------------------------------------------
+
+
+#------------------------------------------------------------------------------
+#
+def save_res(results, file_output):
+
+    lines = results.splitlines()
+    with open(file_output, "w") as f:
+        for line in lines:
+            f.write(f"{line}\n")
+
+#
+#------------------------------------------------------------------------------
+
+
+
+#------------------------------------------------------------------------------
+#
+def get_scfenergy(results):
+
+    scf_energy = 0.0
+
+    lines = results.splitlines()
+    for line in lines:
+        if("SCF energy" in line):
+            scf_energy = float(line.split()[-1])
+
+    return scf_energy
+#
+#------------------------------------------------------------------------------
+
+
+
+
+#------------------------------------------------------------------------------
+#
+def run_scf():
+
+    return subprocess.check_output( ['qp_run', 'scf', EZFIO_file]
+                                  , encoding = "utf-8" )
+#
+#------------------------------------------------------------------------------
+
+
+
+#------------------------------------------------------------------------------
+#
+def f_scf(x):
+
+    global i_call
+    i_call += 1
+
+    #print(x)
+
+    # set expo
+    set_expoim(x)
+
+    # run scf
+    results = run_scf()
+    #save_res(results, "scf_"+str(i_call))
+    
+
+    # get scf_energy
+    scf_energy = get_scfenergy(results)
+    print( scf_energy )
+    sys.stdout.flush()
+
+    return scf_energy
+#
+#------------------------------------------------------------------------------
+
+
+
+
+if __name__ == '__main__':
+
+    t0 = time.time()
+
+    EZFIO_file = sys.argv[1]
+    ezfio.set_file(EZFIO_file)
+    print(" Today's date:", datetime.now() )
+    print(" EZFIO file = {}".format(EZFIO_file))
+
+
+    ao_num = ezfio.get_ao_basis_ao_num()
+    print(f" ao_num = {ao_num}")
+
+    ao_prim_num = ezfio.get_ao_basis_ao_prim_num()
+
+    ao_prim_num_max = np.amax(ao_prim_num)
+    print(f" ao_prim_num_max = {ao_prim_num_max}")
+
+    ezfio.set_ao_basis_ao_prim_num_max(ao_prim_num_max)
+
+    x = get_expoim()
+
+    n_par = len(x)
+    print(' nb of parameters = {}'.format(n_par))
+
+    sys.stdout.flush()
+
+    #x    = (np.random.rand(n_par) - 0.5) * 1.0
+    x     = [ (+0.00) for _ in range(n_par)]
+
+    x_min = [ (-10.0) for _ in range(n_par)]
+    x_max = [ (+10.0) for _ in range(n_par)]
+
+    i_call = 0
+    memo_val = {'fmin': 100.}
+
+    opt = my_fmin_powell( f_scf
+                        , x, x_min, x_max
+                        #, xtol        = 1e-1
+                        #, ftol        = 1e-1
+                        , maxfev      = 1e8
+                        , full_output = 1
+                        , verbose     = 1 )
+
+
+    print(" x = " + str(opt))
+
+    print(" end after {:.3f} minutes".format((time.time()-t0)/60.) )
+
+    # !!!
+# !!!
--- a/src/cosgtos_ao_int/gauss_legendre.irp.f
+++ b/src/cosgtos_ao_int/gauss_legendre.irp.f
@ -0,0 +1,57 @@
+ BEGIN_PROVIDER [ double precision, gauleg_t2, (n_pt_max_integrals,n_pt_max_integrals/2) ]
+&BEGIN_PROVIDER [ double precision, gauleg_w, (n_pt_max_integrals,n_pt_max_integrals/2) ]
+   implicit none
+   BEGIN_DOC
+   ! t_w(i,1,k) = w(i)
+   ! t_w(i,2,k) = t(i)
+   END_DOC
+   integer                        :: i,j,l
+   l=0
+   do i = 2,n_pt_max_integrals,2
+     l = l+1
+     call gauleg(0.d0,1.d0,gauleg_t2(1,l),gauleg_w(1,l),i)
+     do j=1,i
+       gauleg_t2(j,l) *= gauleg_t2(j,l)
+     enddo
+   enddo
+
+END_PROVIDER
+
+subroutine gauleg(x1,x2,x,w,n)
+   implicit none
+   BEGIN_DOC
+   ! Gauss-Legendre
+   END_DOC
+   integer, intent(in)            :: n
+   double precision, intent(in)   :: x1, x2
+   double precision, intent (out) :: x(n),w(n)
+   double precision, parameter    :: eps=3.d-14
+
+   integer                        :: m,i,j
+   double precision               :: xm, xl, z, z1, p1, p2, p3, pp, dn
+   m=(n+1)/2
+   xm=0.5d0*(x2+x1)
+   xl=0.5d0*(x2-x1)
+   dn = dble(n)
+   do i=1,m
+     z=dcos(3.141592654d0*(dble(i)-.25d0)/(dble(n)+.5d0))
+     z1 = z+1.d0
+     do while (dabs(z-z1) > eps)
+       p1=1.d0
+       p2=0.d0
+       do j=1,n
+         p3=p2
+         p2=p1
+         p1=(dble(j+j-1)*z*p2-dble(j-1)*p3)/j
+       enddo
+       pp=dn*(z*p1-p2)/(z*z-1.d0)
+       z1=z
+       z=z1-p1/pp
+     end do
+     x(i)=xm-xl*z
+     x(n+1-i)=xm+xl*z
+     w(i)=(xl+xl)/((1.d0-z*z)*pp*pp)
+     w(n+1-i)=w(i)
+   enddo
+end
+
--- a/src/cosgtos_ao_int/one_e_Coul_integrals.irp.f
+++ b/src/cosgtos_ao_int/one_e_Coul_integrals.irp.f
@ -0,0 +1,535 @@
+
+! ---
+
+BEGIN_PROVIDER [ double precision, ao_integrals_n_e_cosgtos, (ao_num, ao_num)]
+
+  BEGIN_DOC
+  !
+  !  Nucleus-electron interaction, in the cosgtos |AO| basis set.
+  !
+  !  :math:`\langle \chi_i | -\sum_A \frac{1}{|r-R_A|} | \chi_j \rangle`
+  !
+  END_DOC
+
+  implicit none
+  integer          :: num_A, num_B, power_A(3), power_B(3)
+  integer          :: i, j, k, l, n_pt_in, m
+  double precision :: c, Z, A_center(3), B_center(3), C_center(3)
+  complex*16       :: alpha, beta, c1, c2
+
+  complex*16       :: NAI_pol_mult_cosgtos
+
+  ao_integrals_n_e_cosgtos = 0.d0
+
+ !$OMP PARALLEL                                                                           &
+ !$OMP DEFAULT (NONE)                                                                     &
+ !$OMP PRIVATE ( i, j, k, l, m, alpha, beta, A_center, B_center, C_center                 &
+ !$OMP         , power_A, power_B, num_A, num_B, Z, c, c1, c2, n_pt_in )                  &
+ !$OMP SHARED ( ao_num, ao_prim_num, ao_nucl, nucl_coord, ao_power, nucl_num, nucl_charge &
+ !$OMP        , ao_expo_ord_transp_cosgtos, ao_coef_norm_ord_transp_cosgtos               &
+ !$OMP        , n_pt_max_integrals, ao_integrals_n_e_cosgtos )
+
+ n_pt_in = n_pt_max_integrals
+
+ !$OMP DO SCHEDULE (dynamic)
+
+  do j = 1, ao_num
+    num_A         = ao_nucl(j)
+    power_A(1:3)  = ao_power(j,1:3)
+    A_center(1:3) = nucl_coord(num_A,1:3)
+
+    do i = 1, ao_num
+      num_B         = ao_nucl(i)
+      power_B(1:3)  = ao_power(i,1:3)
+      B_center(1:3) = nucl_coord(num_B,1:3)
+
+      do l = 1, ao_prim_num(j)
+        alpha = ao_expo_ord_transp_cosgtos(l,j)
+
+        do m = 1, ao_prim_num(i)
+          beta = ao_expo_ord_transp_cosgtos(m,i)
+
+          c = 0.d0
+          do k = 1, nucl_num
+
+            Z = nucl_charge(k)
+
+            C_center(1:3) = nucl_coord(k,1:3)
+
+            !print *, ' '
+            !print *, A_center, B_center, C_center, power_A, power_B
+            !print *, real(alpha), real(beta)
+
+            c1 = NAI_pol_mult_cosgtos( A_center, B_center, power_A, power_B &
+                                     , alpha, beta, C_center, n_pt_in )
+
+            !c2 = c1
+            c2 = NAI_pol_mult_cosgtos( A_center, B_center, power_A, power_B  &
+                                     , conjg(alpha), beta, C_center, n_pt_in )
+
+            !print *, ' c1 = ', real(c1)
+            !print *, ' c2 = ', real(c2)
+
+            c = c - Z * 2.d0 * real(c1 + c2)
+
+          enddo
+          ao_integrals_n_e_cosgtos(i,j) = ao_integrals_n_e_cosgtos(i,j) &
+                      + ao_coef_norm_ord_transp_cosgtos(l,j)            &
+                      * ao_coef_norm_ord_transp_cosgtos(m,i) * c
+        enddo
+      enddo
+    enddo
+  enddo
+
+ !$OMP END DO
+ !$OMP END PARALLEL
+
+END_PROVIDER
+
+! ---
+
+complex*16 function NAI_pol_mult_cosgtos(A_center, B_center, power_A, power_B, alpha, beta, C_center, n_pt_in)
+
+  BEGIN_DOC
+  !
+  ! Computes the electron-nucleus attraction with two primitves cosgtos.
+  !
+  ! :math:`\langle g_i | \frac{1}{|r-R_c|} | g_j \rangle`
+  !
+  END_DOC
+
+  implicit none
+  include 'utils/constants.include.F'
+
+  integer,          intent(in) :: n_pt_in, power_A(3), power_B(3)
+  double precision, intent(in) :: C_center(3), A_center(3), B_center(3)
+  complex*16,       intent(in) :: alpha, beta
+
+  integer                      :: i, n_pt, n_pt_out
+  double precision             :: dist, const_mod
+  complex*16                   :: p, p_inv, rho, dist_integral, const, const_factor, coeff, factor
+  complex*16                   :: accu, P_center(3)
+  complex*16                   :: d(0:n_pt_in)
+
+  complex*16                   :: V_n_e_cosgtos
+  complex*16                   :: crint
+
+  if ( (A_center(1)/=B_center(1)) .or. (A_center(2)/=B_center(2)) .or. (A_center(3)/=B_center(3)) .or. &
+       (A_center(1)/=C_center(1)) .or. (A_center(2)/=C_center(2)) .or. (A_center(3)/=C_center(3)) ) then
+
+    continue
+
+  else
+
+    NAI_pol_mult_cosgtos = V_n_e_cosgtos( power_A(1), power_A(2), power_A(3) &
+                                        , power_B(1), power_B(2), power_B(3) &
+                                        , alpha, beta )
+    return
+
+  endif
+
+  p     = alpha + beta
+  p_inv = (1.d0, 0.d0) / p
+  rho   = alpha * beta * p_inv
+
+  dist          = 0.d0
+  dist_integral = (0.d0, 0.d0)
+  do i = 1, 3
+    P_center(i)    = (alpha * A_center(i) + beta * B_center(i)) * p_inv
+    dist          += (A_center(i) - B_center(i)) * (A_center(i) - B_center(i))
+    dist_integral += (P_center(i) - C_center(i)) * (P_center(i) - C_center(i))
+  enddo
+
+  const_factor = dist * rho
+  const        = p * dist_integral
+
+  const_mod = dsqrt(real(const_factor)*real(const_factor) + aimag(const_factor)*aimag(const_factor))
+  if(const_mod > 80.d0) then
+    NAI_pol_mult_cosgtos = (0.d0, 0.d0)
+    return
+  endif
+
+  factor = zexp(-const_factor)
+  coeff  = dtwo_pi * factor * p_inv
+
+  do i = 0, n_pt_in
+    d(i) = (0.d0, 0.d0)
+  enddo
+
+  n_pt = 2 * ( (power_A(1) + power_B(1)) + (power_A(2) + power_B(2)) + (power_A(3) + power_B(3)) )
+  if(n_pt == 0) then
+    NAI_pol_mult_cosgtos = coeff * crint(0, const)
+    return
+  endif
+
+  call give_cpolynomial_mult_center_one_e( A_center, B_center, alpha, beta &
+                                         , power_A, power_B, C_center, n_pt_in, d, n_pt_out)
+
+  if(n_pt_out < 0) then
+    NAI_pol_mult_cosgtos = (0.d0, 0.d0)
+    return
+  endif
+
+  accu = (0.d0, 0.d0)
+  do i = 0, n_pt_out, 2
+    accu += crint(shiftr(i, 1), const) * d(i)
+
+!    print *, shiftr(i, 1), real(const), real(d(i)), real(crint(shiftr(i, 1), const))
+  enddo
+  NAI_pol_mult_cosgtos = accu * coeff
+
+end function NAI_pol_mult_cosgtos
+
+! ---
+
+subroutine give_cpolynomial_mult_center_one_e( A_center, B_center, alpha, beta &
+                                             , power_A, power_B, C_center, n_pt_in, d, n_pt_out)
+
+  BEGIN_DOC
+  ! Returns the explicit polynomial in terms of the "t" variable of the following
+  !
+  ! $I_{x1}(a_x, d_x,p,q) \times I_{x1}(a_y, d_y,p,q) \times I_{x1}(a_z, d_z,p,q)$.
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in) :: n_pt_in, power_A(3), power_B(3)
+  double precision, intent(in) :: A_center(3), B_center(3), C_center(3)
+  complex*16,       intent(in) :: alpha, beta
+  integer,         intent(out) :: n_pt_out
+  complex*16,      intent(out) :: d(0:n_pt_in)
+
+  integer                      :: a_x, b_x, a_y, b_y, a_z, b_z
+  integer                      :: n_pt1, n_pt2, n_pt3, dim, i, n_pt_tmp
+  complex*16                   :: p, P_center(3), rho, p_inv, p_inv_2
+  complex*16                   :: R1x(0:2), B01(0:2), R1xp(0:2),R2x(0:2)
+  complex*16                   :: d1(0:n_pt_in), d2(0:n_pt_in), d3(0:n_pt_in)
+
+  ASSERT (n_pt_in > 1)
+
+  p       = alpha + beta
+  p_inv   = (1.d0, 0.d0) / p
+  p_inv_2 = 0.5d0 * p_inv
+
+  do i = 1, 3
+    P_center(i) = (alpha * A_center(i) + beta * B_center(i)) * p_inv
+  enddo
+
+  do i = 0, n_pt_in
+    d(i)  = (0.d0, 0.d0)
+    d1(i) = (0.d0, 0.d0)
+    d2(i) = (0.d0, 0.d0)
+    d3(i) = (0.d0, 0.d0)
+  enddo
+
+  ! ---
+
+  n_pt1 = n_pt_in
+
+  R1x(0)  =  (P_center(1) - A_center(1))
+  R1x(1)  =  (0.d0, 0.d0)
+  R1x(2)  = -(P_center(1) - C_center(1))
+
+  R1xp(0) =  (P_center(1) - B_center(1))
+  R1xp(1) =  (0.d0, 0.d0)
+  R1xp(2) = -(P_center(1) - C_center(1))
+
+  R2x(0)  =  p_inv_2
+  R2x(1)  = (0.d0, 0.d0)
+  R2x(2)  = -p_inv_2
+
+  a_x = power_A(1)
+  b_x = power_B(1)
+  call I_x1_pol_mult_one_e_cosgtos(a_x, b_x, R1x, R1xp, R2x, d1, n_pt1, n_pt_in)
+
+  if(n_pt1 < 0) then
+    n_pt_out = -1
+    do i = 0, n_pt_in
+      d(i) = (0.d0, 0.d0)
+    enddo
+    return
+  endif
+
+  ! ---
+
+  n_pt2 = n_pt_in
+
+  R1x(0)  =  (P_center(2) - A_center(2))
+  R1x(1)  =  (0.d0, 0.d0)
+  R1x(2)  = -(P_center(2) - C_center(2))
+
+  R1xp(0) =  (P_center(2) - B_center(2))
+  R1xp(1) =  (0.d0, 0.d0)
+  R1xp(2) = -(P_center(2) - C_center(2))
+
+  a_y = power_A(2)
+  b_y = power_B(2)
+  call I_x1_pol_mult_one_e_cosgtos(a_y, b_y, R1x, R1xp, R2x, d2, n_pt2, n_pt_in)
+
+  if(n_pt2 < 0) then
+    n_pt_out = -1
+    do i = 0, n_pt_in
+      d(i) = (0.d0, 0.d0)
+    enddo
+    return
+  endif
+
+  ! ---
+
+  n_pt3 = n_pt_in
+
+  R1x(0)  =  (P_center(3) - A_center(3))
+  R1x(1)  =  (0.d0, 0.d0)
+  R1x(2)  = -(P_center(3) - C_center(3))
+
+  R1xp(0) =  (P_center(3) - B_center(3))
+  R1xp(1) =  (0.d0, 0.d0)
+  R1xp(2) = -(P_center(3) - C_center(3))
+
+  a_z = power_A(3)
+  b_z = power_B(3)
+  call I_x1_pol_mult_one_e_cosgtos(a_z, b_z, R1x, R1xp, R2x, d3, n_pt3, n_pt_in)
+
+  if(n_pt3 < 0) then
+    n_pt_out = -1
+    do i = 0, n_pt_in
+      d(i) = (0.d0, 0.d0)
+    enddo
+    return
+  endif
+
+  ! ---
+
+  n_pt_tmp = 0
+  call multiply_cpoly(d1, n_pt1, d2, n_pt2, d, n_pt_tmp)
+  do i = 0, n_pt_tmp
+    d1(i) = (0.d0, 0.d0)
+  enddo
+
+  n_pt_out = 0
+  call multiply_cpoly(d, n_pt_tmp, d3, n_pt3, d1, n_pt_out)
+  do i = 0, n_pt_out
+    d(i) = d1(i)
+  enddo
+
+end subroutine give_cpolynomial_mult_center_one_e
+
+! ---
+
+recursive subroutine I_x1_pol_mult_one_e_cosgtos(a, c, R1x, R1xp, R2x, d, nd, n_pt_in)
+
+  BEGIN_DOC
+  !  Recursive routine involved in the electron-nucleus potential
+  END_DOC
+
+  implicit none
+  include 'utils/constants.include.F'
+
+  integer,    intent(in)    :: a, c, n_pt_in
+  complex*16, intent(in)    :: R1x(0:2), R1xp(0:2), R2x(0:2)
+  integer,    intent(inout) :: nd
+  complex*16, intent(inout) :: d(0:n_pt_in)
+
+  integer                   :: nx, ix, dim, iy, ny
+  complex*16                :: X(0:max_dim)
+  complex*16                :: Y(0:max_dim)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: X, Y
+
+  dim = n_pt_in
+
+  if( (a==0) .and. (c==0)) then
+
+    nd   = 0
+    d(0) = (1.d0, 0.d0)
+    return
+
+  elseif( (c < 0) .or. (nd < 0) ) then
+
+    nd = -1
+    return
+
+  elseif((a == 0) .and. (c .ne. 0)) then
+
+    call I_x2_pol_mult_one_e_cosgtos(c, R1x, R1xp, R2x, d, nd, n_pt_in)
+
+  elseif(a == 1) then
+
+    nx = nd
+    do ix = 0, n_pt_in
+      X(ix) = (0.d0, 0.d0)
+      Y(ix) = (0.d0, 0.d0)
+    enddo
+
+    call I_x2_pol_mult_one_e_cosgtos(c-1, R1x, R1xp, R2x, X, nx, n_pt_in)
+
+    do ix = 0, nx
+      X(ix) *= dble(c)
+    enddo
+
+    call multiply_cpoly(X, nx, R2x, 2, d, nd)
+
+    ny = 0
+    call I_x2_pol_mult_one_e_cosgtos(c, R1x, R1xp, R2x, Y, ny, n_pt_in)
+    call multiply_cpoly(Y, ny, R1x, 2, d, nd)
+
+  else
+
+    nx = 0
+    do ix = 0, n_pt_in
+      X(ix) = (0.d0, 0.d0)
+      Y(ix) = (0.d0, 0.d0)
+    enddo
+
+    call I_x1_pol_mult_one_e_cosgtos(a-2, c, R1x, R1xp, R2x, X, nx, n_pt_in)
+
+    do ix = 0, nx
+      X(ix) *= dble(a-1)
+    enddo
+    call multiply_cpoly(X, nx, R2x, 2, d, nd)
+
+    nx = nd
+    do ix = 0, n_pt_in
+      X(ix) = (0.d0, 0.d0)
+    enddo
+
+    call I_x1_pol_mult_one_e_cosgtos(a-1, c-1, R1x, R1xp, R2x, X, nx, n_pt_in)
+    do ix = 0, nx
+      X(ix) *= dble(c)
+    enddo
+
+    call multiply_cpoly(X, nx, R2x, 2, d, nd)
+
+    ny = 0
+    call I_x1_pol_mult_one_e_cosgtos(a-1, c, R1x, R1xp, R2x, Y, ny, n_pt_in)
+    call multiply_cpoly(Y, ny, R1x, 2, d, nd)
+
+  endif
+
+end subroutine I_x1_pol_mult_one_e_cosgtos
+
+! ---
+
+recursive subroutine I_x2_pol_mult_one_e_cosgtos(c, R1x, R1xp, R2x, d, nd, dim)
+
+  BEGIN_DOC
+  !  Recursive routine involved in the electron-nucleus potential
+  END_DOC
+
+  implicit none
+  include 'utils/constants.include.F'
+
+  integer,    intent(in)    :: dim, c
+  complex*16, intent(in)    :: R1x(0:2), R1xp(0:2), R2x(0:2)
+  integer,    intent(inout) :: nd
+  complex*16, intent(out)   :: d(0:max_dim)
+
+  integer                   :: i, nx, ix, ny
+  complex*16                :: X(0:max_dim), Y(0:max_dim)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: X, Y
+
+  if(c == 0) then
+
+    nd = 0
+    d(0) = (1.d0, 0.d0)
+    return
+
+  elseif((nd < 0) .or. (c < 0)) then
+
+    nd = -1
+    return
+
+  else
+
+    nx = 0
+    do ix = 0, dim
+      X(ix) = (0.d0, 0.d0)
+      Y(ix) = (0.d0, 0.d0)
+    enddo
+
+    call I_x1_pol_mult_one_e_cosgtos(0, c-2, R1x, R1xp, R2x, X, nx, dim)
+
+    do ix = 0, nx
+      X(ix) *= dble(c-1)
+    enddo
+
+    call multiply_cpoly(X, nx, R2x, 2, d, nd)
+
+    ny = 0
+    do ix = 0, dim
+      Y(ix) = (0.d0, 0.d0)
+    enddo
+
+    call I_x1_pol_mult_one_e_cosgtos(0, c-1, R1x, R1xp, R2x, Y, ny, dim)
+
+    if(ny .ge. 0) then
+      call multiply_cpoly(Y, ny, R1xp, 2, d, nd)
+    endif
+
+  endif
+
+end subroutine I_x2_pol_mult_one_e_cosgtos
+
+! ---
+
+complex*16 function V_n_e_cosgtos(a_x, a_y, a_z, b_x, b_y, b_z, alpha, beta)
+
+  BEGIN_DOC
+  ! Primitve nuclear attraction between the two primitves centered on the same atom.
+  !
+  ! $p_1 = x^{a_x} y^{a_y} z^{a_z} \exp(-\alpha r^2)$
+  !
+  ! $p_2 = x^{b_x} y^{b_y} z^{b_z} \exp(-\beta  r^2)$
+  END_DOC
+
+  implicit none
+
+  integer,    intent(in) :: a_x, a_y, a_z, b_x, b_y, b_z
+  complex*16, intent(in) :: alpha, beta
+
+  double precision       :: V_phi, V_theta
+  complex*16             :: V_r_cosgtos
+
+  if( (iand(a_x + b_x, 1) == 1) .or. &
+      (iand(a_y + b_y, 1) == 1) .or. &
+      (iand(a_z + b_z, 1) == 1) ) then
+
+    V_n_e_cosgtos = (0.d0, 0.d0)
+
+  else
+
+    V_n_e_cosgtos = V_r_cosgtos(a_x + b_x + a_y + b_y + a_z + b_z + 1, alpha + beta) &
+                  * V_phi(a_x + b_x, a_y + b_y)                                      &
+                  * V_theta(a_z + b_z, a_x + b_x + a_y + b_y + 1)
+  endif
+
+end function V_n_e_cosgtos
+
+! ---
+
+complex*16 function V_r_cosgtos(n, alpha)
+
+  BEGIN_DOC
+  ! Computes the radial part of the nuclear attraction integral:
+  !
+  ! $\int_{0}^{\infty} r^n  \exp(-\alpha  r^2)  dr$
+  !
+  END_DOC
+
+  implicit none
+  include 'utils/constants.include.F'
+
+  integer   , intent(in) :: n
+  complex*16, intent(in) :: alpha
+
+  double precision       :: fact
+
+  if(iand(n, 1) .eq. 1) then
+    V_r_cosgtos = 0.5d0 * fact(shiftr(n, 1)) / (alpha**(shiftr(n, 1) + 1))
+  else
+    V_r_cosgtos = sqpi * fact(n) / fact(shiftr(n, 1)) * (0.5d0/zsqrt(alpha))**(n+1)
+  endif
+
+end function V_r_cosgtos
+
+! ---
+
--- a/src/cosgtos_ao_int/one_e_kin_integrals.irp.f
+++ b/src/cosgtos_ao_int/one_e_kin_integrals.irp.f
@ -0,0 +1,223 @@
+
+! ---
+
+ BEGIN_PROVIDER [ double precision, ao_deriv2_cosgtos_x, (ao_num, ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_deriv2_cosgtos_y, (ao_num, ao_num) ]
+&BEGIN_PROVIDER [ double precision, ao_deriv2_cosgtos_z, (ao_num, ao_num) ]
+
+  implicit none
+  integer          :: i, j, n, l, dim1, power_A(3), power_B(3)
+  double precision :: c, deriv_tmp
+  complex*16       :: alpha, beta, A_center(3), B_center(3)
+  complex*16       :: overlap_x, overlap_y, overlap_z, overlap
+  complex*16       :: overlap_x0_1, overlap_y0_1, overlap_z0_1
+  complex*16       :: overlap_x0_2, overlap_y0_2, overlap_z0_2 
+  complex*16       :: overlap_m2_1, overlap_p2_1
+  complex*16       :: overlap_m2_2, overlap_p2_2
+  complex*16       :: deriv_tmp_1, deriv_tmp_2
+
+
+  dim1 = 100
+
+  ! -- Dummy call to provide everything
+
+  A_center(:) = (0.0d0, 0.d0)
+  B_center(:) = (1.0d0, 0.d0)
+  alpha       = (1.0d0, 0.d0)
+  beta        = (0.1d0, 0.d0)
+  power_A     = 1
+  power_B     = 0
+  call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B       &
+                            , overlap_x0_1, overlap_y0_1, overlap_z0_1, overlap, dim1 )
+
+  ! ---
+
+ !$OMP PARALLEL DO SCHEDULE(GUIDED)                                                                  &
+ !$OMP DEFAULT(NONE)                                                                                 &
+ !$OMP PRIVATE( A_center, B_center, power_A, power_B, alpha, beta, i, j, l, n, c                     &
+ !$OMP        , deriv_tmp, deriv_tmp_1, deriv_tmp_2                                                  &
+ !$OMP        , overlap_x, overlap_y, overlap_z, overlap                                             &
+ !$OMP        , overlap_m2_1, overlap_p2_1, overlap_m2_2, overlap_p2_2                               &
+ !$OMP        , overlap_x0_1, overlap_y0_1, overlap_z0_1, overlap_x0_2, overlap_y0_2, overlap_z0_2 ) &
+ !$OMP SHARED( nucl_coord, ao_power, ao_prim_num, ao_num, ao_nucl, dim1                              &
+ !$OMP       , ao_coef_norm_ord_transp_cosgtos, ao_expo_ord_transp_cosgtos                           & 
+ !$OMP       , ao_deriv2_cosgtos_x, ao_deriv2_cosgtos_y, ao_deriv2_cosgtos_z ) 
+
+  do j = 1, ao_num
+    A_center(1) = nucl_coord(ao_nucl(j),1) * (1.d0, 0.d0)
+    A_center(2) = nucl_coord(ao_nucl(j),2) * (1.d0, 0.d0)
+    A_center(3) = nucl_coord(ao_nucl(j),3) * (1.d0, 0.d0)
+    power_A(1)  = ao_power(j,1)
+    power_A(2)  = ao_power(j,2)
+    power_A(3)  = ao_power(j,3)
+
+    do i = 1, ao_num
+      B_center(1) = nucl_coord(ao_nucl(i),1) * (1.d0, 0.d0) 
+      B_center(2) = nucl_coord(ao_nucl(i),2) * (1.d0, 0.d0)
+      B_center(3) = nucl_coord(ao_nucl(i),3) * (1.d0, 0.d0)
+      power_B(1)  = ao_power(i,1)
+      power_B(2)  = ao_power(i,2)
+      power_B(3)  = ao_power(i,3)
+
+      ao_deriv2_cosgtos_x(i,j) = 0.d0
+      ao_deriv2_cosgtos_y(i,j) = 0.d0
+      ao_deriv2_cosgtos_z(i,j) = 0.d0
+
+      do n = 1, ao_prim_num(j)
+        alpha = ao_expo_ord_transp_cosgtos(n,j)
+
+        do l = 1, ao_prim_num(i)
+          c    = ao_coef_norm_ord_transp_cosgtos(n,j) * ao_coef_norm_ord_transp_cosgtos(l,i)
+          beta = ao_expo_ord_transp_cosgtos(l,i)
+
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B       &
+                                    , overlap_x0_1, overlap_y0_1, overlap_z0_1, overlap, dim1 )
+
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                    , overlap_x0_2, overlap_y0_2, overlap_z0_2, overlap, dim1  )
+
+          ! ---
+
+          power_A(1) = power_A(1) - 2
+          if(power_A(1) > -1) then
+            call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                      , overlap_m2_1, overlap_y, overlap_z, overlap, dim1 )
+
+            call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                      , overlap_m2_2, overlap_y, overlap_z, overlap, dim1        )
+          else
+            overlap_m2_1 = (0.d0, 0.d0)
+            overlap_m2_2 = (0.d0, 0.d0)
+          endif
+
+          power_A(1) = power_A(1) + 4
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                    , overlap_p2_1, overlap_y, overlap_z, overlap, dim1 )
+
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                    , overlap_p2_2, overlap_y, overlap_z, overlap, dim1        )
+
+          power_A(1) = power_A(1) - 2
+
+          deriv_tmp_1 = ( -2.d0 * alpha * (2.d0 * power_A(1) + 1.d0) * overlap_x0_1 &
+                        + power_A(1) * (power_A(1) - 1.d0) * overlap_m2_1           &
+                        + 4.d0 * alpha * alpha * overlap_p2_1 ) * overlap_y0_1 * overlap_z0_1
+
+          deriv_tmp_2 = ( -2.d0 * alpha * (2.d0 * power_A(1) + 1.d0) * overlap_x0_2 &
+                        + power_A(1) * (power_A(1) - 1.d0) * overlap_m2_2           &
+                        + 4.d0 * alpha * alpha * overlap_p2_2 ) * overlap_y0_2 * overlap_z0_2
+
+          deriv_tmp = 2.d0 * real(deriv_tmp_1 + deriv_tmp_2)
+
+          ao_deriv2_cosgtos_x(i,j) += c * deriv_tmp
+
+          ! ---
+
+          power_A(2) = power_A(2) - 2
+          if(power_A(2) > -1) then
+            call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                      , overlap_x, overlap_m2_1, overlap_y, overlap, dim1 )
+
+            call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                      , overlap_x, overlap_m2_2, overlap_y, overlap, dim1        )
+          else
+            overlap_m2_1 = (0.d0, 0.d0)
+            overlap_m2_2 = (0.d0, 0.d0)
+          endif
+
+          power_A(2) = power_A(2) + 4
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                    , overlap_x, overlap_p2_1, overlap_y, overlap, dim1 )
+
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                    , overlap_x, overlap_p2_2, overlap_y, overlap, dim1        )
+
+          power_A(2) = power_A(2) - 2
+
+          deriv_tmp_1 = ( -2.d0 * alpha * (2.d0 * power_A(2) + 1.d0) * overlap_y0_1 &
+                        + power_A(2) * (power_A(2) - 1.d0) * overlap_m2_1           &
+                        + 4.d0 * alpha * alpha * overlap_p2_1 ) * overlap_x0_1 * overlap_z0_1
+
+          deriv_tmp_2 = ( -2.d0 * alpha * (2.d0 * power_A(2) + 1.d0) * overlap_y0_2 &
+                        + power_A(2) * (power_A(2) - 1.d0) * overlap_m2_2           &
+                        + 4.d0 * alpha * alpha * overlap_p2_2 ) * overlap_x0_2 * overlap_z0_2
+
+          deriv_tmp = 2.d0 * real(deriv_tmp_1 + deriv_tmp_2)
+
+          ao_deriv2_cosgtos_y(i,j) += c * deriv_tmp
+
+          ! ---
+
+          power_A(3) = power_A(3) - 2
+          if(power_A(3) > -1) then
+            call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                      , overlap_x, overlap_y, overlap_m2_1, overlap, dim1 )
+
+            call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                      , overlap_x, overlap_y, overlap_m2_2, overlap, dim1        )
+          else
+            overlap_m2_1 = (0.d0, 0.d0)
+            overlap_m2_2 = (0.d0, 0.d0)
+          endif
+
+          power_A(3) = power_A(3) + 4
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, beta, power_A, power_B &
+                                    , overlap_x, overlap_y, overlap_p2_1, overlap, dim1 )
+
+          call overlap_cgaussian_xyz( A_center, B_center, alpha, conjg(beta), power_A, power_B &
+                                    , overlap_x, overlap_y, overlap_p2_2, overlap, dim1        )
+
+          power_A(3) = power_A(3) - 2
+          
+          deriv_tmp_1 = ( -2.d0 * alpha * (2.d0 * power_A(3) + 1.d0) * overlap_z0_1 &
+                        + power_A(3) * (power_A(3) - 1.d0) * overlap_m2_1           &
+                        + 4.d0 * alpha * alpha * overlap_p2_1 ) * overlap_x0_1 * overlap_y0_1
+
+          deriv_tmp_2 = ( -2.d0 * alpha * (2.d0 * power_A(3) + 1.d0) * overlap_z0_2 &
+                        + power_A(3) * (power_A(3) - 1.d0) * overlap_m2_2           &
+                        + 4.d0 * alpha * alpha * overlap_p2_2 ) * overlap_x0_2 * overlap_y0_2
+
+          deriv_tmp = 2.d0 * real(deriv_tmp_1 + deriv_tmp_2)
+
+          ao_deriv2_cosgtos_z(i,j) += c * deriv_tmp
+
+          ! ---
+
+        enddo
+      enddo
+    enddo
+  enddo
+ !$OMP END PARALLEL DO
+
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [double precision, ao_kinetic_integrals_cosgtos, (ao_num, ao_num)]
+
+  BEGIN_DOC
+  ! 
+  ! Kinetic energy integrals in the cosgtos |AO| basis.
+  !
+  ! $\langle \chi_i |\hat{T}| \chi_j \rangle$
+  !
+  END_DOC
+
+  implicit none
+  integer :: i, j
+
+ !$OMP PARALLEL DO DEFAULT(NONE) &
+ !$OMP  PRIVATE(i, j)            &
+ !$OMP  SHARED(ao_num, ao_kinetic_integrals_cosgtos, ao_deriv2_cosgtos_x, ao_deriv2_cosgtos_y, ao_deriv2_cosgtos_z)
+  do j = 1, ao_num
+    do i = 1, ao_num
+      ao_kinetic_integrals_cosgtos(i,j) = -0.5d0 * ( ao_deriv2_cosgtos_x(i,j) &
+                                                   + ao_deriv2_cosgtos_y(i,j) &
+                                                   + ao_deriv2_cosgtos_z(i,j) )
+    enddo
+  enddo
+ !$OMP END PARALLEL DO
+
+END_PROVIDER
+
+! ---
--- a/src/cosgtos_ao_int/two_e_Coul_integrals.irp.f
+++ b/src/cosgtos_ao_int/two_e_Coul_integrals.irp.f
--- a/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
+++ b/src/dav_general_mat/dav_ext_rout_nonsym_B1space.irp.f
@ -128,10 +128,10 @@ subroutine davidson_general_ext_rout_nonsym_b1space(u_in, H_jj, energies, sze, N

    if(itermax > 4) then
      itermax = itermax - 1
-    else if (m==1.and.disk_based_davidson) then
-      m = 0
-      disk_based = .True.
-      itermax = 6
+!    else if (m==1.and.disk_based_davidson) then
+!      m = 0
+!      disk_based = .True.
+!      itermax = 6
    else
      nproc_target = nproc_target - 1
    endif
--- a/src/davidson_keywords/EZFIO.cfg
+++ b/src/davidson_keywords/EZFIO.cfg
@ -0,0 +1,54 @@
+[threshold_davidson]
+type: Threshold
+doc: Thresholds of Davidson's algorithm if threshold_davidson_from_pt2 is false.
+interface: ezfio,provider,ocaml
+default: 1.e-10
+
+[threshold_nonsym_davidson]
+type: Threshold
+doc: Thresholds of non-symetric Davidson's algorithm
+interface: ezfio,provider,ocaml
+default: 1.e-12
+
+[davidson_sze_max]
+type: Strictly_positive_int
+doc: Number of micro-iterations before re-contracting
+default: 15
+interface: ezfio,provider,ocaml
+
+[state_following]
+type: logical
+doc: If |true|, the states are re-ordered to match the input states
+default: False
+interface: ezfio,provider,ocaml
+
+[disk_based_davidson]
+type: logical
+doc: If |true|, a memory-mapped file may be used to store the W and S2 vectors if not enough RAM is availabl
+default: True
+interface: ezfio,provider,ocaml
+
+[n_states_diag]
+type: States_number
+doc: Controls the number of states to consider during the Davdison diagonalization. The number of states is n_states * n_states_diag
+default: 4
+interface: ezfio,ocaml
+
+[n_det_max_full]
+type: Det_number_max
+doc: Maximum number of determinants where |H| is fully diagonalized
+interface: ezfio,provider,ocaml
+default: 1000
+
+[threshold_davidson_from_pt2]
+type: logical
+doc: Thresholds of Davidson's algorithm is set to E(rPT2)*threshold_davidson_from_pt2
+interface: ezfio,provider,ocaml
+default: false
+
+[distributed_davidson]
+type: logical
+doc: If |true|, use the distributed algorithm
+default: True
+interface: ezfio,provider,ocaml
+
--- a/src/davidson_keywords/NEED
+++ b/src/davidson_keywords/NEED
@ -0,0 +1 @@
+ezfio_files
--- a/src/davidson_keywords/README.rst
+++ b/src/davidson_keywords/README.rst
@ -0,0 +1,4 @@
+=================
+davidson_keywords
+=================
+
--- a/src/davidson_keywords/input.irp.f
+++ b/src/davidson_keywords/input.irp.f
@ -0,0 +1,43 @@
+
+! ---
+
+BEGIN_PROVIDER [ integer, n_states_diag  ]
+  implicit none
+  BEGIN_DOC
+! Number of states to consider during the Davdison diagonalization
+  END_DOC
+
+  logical                        :: has
+  PROVIDE ezfio_filename
+  if (mpi_master) then
+
+    call ezfio_has_davidson_keywords_n_states_diag(has)
+    if (has) then
+      call ezfio_get_davidson_keywords_n_states_diag(n_states_diag)
+    else
+      print *, 'davidson_keywords/n_states_diag not found in EZFIO file'
+      stop 1
+    endif
+    n_states_diag = max(2,N_states * N_states_diag)
+  endif
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    call MPI_BCAST( n_states_diag, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      stop 'Unable to read n_states_diag with MPI'
+    endif
+  IRP_ENDIF
+
+  call write_time(6)
+  if (mpi_master) then
+    write(6, *) 'Read  n_states_diag'
+  endif
+
+END_PROVIDER
+
+! ---
--- a/src/davidson_keywords/usef.irp.f
+++ b/src/davidson_keywords/usef.irp.f
@ -0,0 +1,33 @@
+use bitmasks
+use f77_zmq
+
+
+! ---
+
+BEGIN_PROVIDER [ integer, nthreads_davidson ]
+ implicit none
+ BEGIN_DOC
+ ! Number of threads for Davidson
+ END_DOC
+ nthreads_davidson = nproc
+ character*(32) :: env
+ call getenv('QP_NTHREADS_DAVIDSON',env)
+ if (trim(env) /= '') then
+   read(env,*) nthreads_davidson
+   call write_int(6,nthreads_davidson,'Target number of threads for <Psi|H|Psi>')
+ endif
+END_PROVIDER
+
+! ---
+
+BEGIN_PROVIDER [ double precision, threshold_davidson_pt2 ]
+ implicit none
+ BEGIN_DOC
+ ! Threshold of Davidson's algorithm, using PT2 as a guide
+ END_DOC
+ threshold_davidson_pt2 = threshold_davidson
+
+END_PROVIDER
+
+! ---
+
--- a/src/iterations_tc/EZFIO.cfg
+++ b/src/iterations_tc/EZFIO.cfg
@ -0,0 +1,24 @@
+[n_iter]
+interface: ezfio
+doc: Number of saved iterations
+type:integer
+default: 1
+
+[n_det_iterations]
+interface: ezfio, provider
+doc: Number of determinants at each iteration
+type: integer
+size: (100)
+ 
+[energy_iterations]
+interface: ezfio, provider
+doc: The variational energy at each iteration
+type: double precision 
+size: (determinants.n_states,100)
+
+[pt2_iterations]
+interface: ezfio, provider
+doc: The |PT2| correction at each iteration
+type: double precision 
+size: (determinants.n_states,100)
+
--- a/src/iterations_tc/NEED
+++ b/src/iterations_tc/NEED
--- a/src/iterations_tc/io.irp.f
+++ b/src/iterations_tc/io.irp.f
@ -0,0 +1,37 @@
+BEGIN_PROVIDER [ integer, n_iter  ]
+  implicit none
+  BEGIN_DOC
+! number of iterations
+  END_DOC
+
+  logical                        :: has
+  PROVIDE ezfio_filename
+  if (mpi_master) then
+
+      double precision :: zeros(N_states,100)
+      integer :: izeros(100)
+      zeros = 0.d0
+      izeros = 0
+      call ezfio_set_iterations_n_iter(0)
+      call ezfio_set_iterations_energy_iterations(zeros)
+      call ezfio_set_iterations_pt2_iterations(zeros)
+      call ezfio_set_iterations_n_det_iterations(izeros)
+      n_iter = 1
+  endif
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    call MPI_BCAST( n_iter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      stop 'Unable to read n_iter with MPI'
+    endif
+  IRP_ENDIF
+
+  call write_time(6)
+
+END_PROVIDER
+
--- a/src/iterations_tc/iterations.irp.f
+++ b/src/iterations_tc/iterations.irp.f
@ -0,0 +1,43 @@
+BEGIN_PROVIDER [ double precision, extrapolated_energy, (N_iter,N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Extrapolated energy, using E_var = f(PT2) where PT2=0
+ END_DOC
+! integer :: i
+ extrapolated_energy = 0.D0
+END_PROVIDER 
+
+ subroutine get_extrapolated_energy(Niter,ept2,pt1,extrap_energy)
+ implicit none
+ integer, intent(in)  :: Niter
+ double precision, intent(in) :: ept2(Niter),pt1(Niter),extrap_energy(Niter)
+ call extrapolate_data(Niter,ept2,pt1,extrap_energy)
+ end
+
+subroutine save_iterations(e_, pt2_,n_)
+  implicit none
+  BEGIN_DOC
+! Update the energy in the EZFIO file.
+  END_DOC
+  integer, intent(in) :: n_
+  double precision, intent(in) :: e_(N_states), pt2_(N_states)
+  integer :: i
+
+  if (N_iter == 101) then
+    do i=2,N_iter-1
+      energy_iterations(1:N_states,N_iter-1) = energy_iterations(1:N_states,N_iter)
+      pt2_iterations(1:N_states,N_iter-1) = pt2_iterations(1:N_states,N_iter) 
+    enddo
+    N_iter = N_iter-1
+    TOUCH N_iter
+  endif
+
+  energy_iterations(1:N_states,N_iter) = e_(1:N_states)
+     pt2_iterations(1:N_states,N_iter) = pt2_(1:N_states)
+  n_det_iterations(N_iter) = n_
+  call ezfio_set_iterations_N_iter(N_iter)
+  call ezfio_set_iterations_energy_iterations(energy_iterations)
+  call ezfio_set_iterations_pt2_iterations(pt2_iterations)
+  call ezfio_set_iterations_n_det_iterations(n_det_iterations)
+end
+
--- a/src/iterations_tc/print_extrapolation.irp.f
+++ b/src/iterations_tc/print_extrapolation.irp.f
@ -0,0 +1,46 @@
+subroutine print_extrapolated_energy
+  implicit none
+  BEGIN_DOC
+! Print the extrapolated energy in the output
+  END_DOC
+
+  integer :: i,k
+
+  if (N_iter< 2) then
+    return
+  endif
+  write(*,'(A)') ''
+  write(*,'(A)') 'Extrapolated energies'
+  write(*,'(A)') '------------------------'
+  write(*,'(A)') ''
+
+  print *,  ''
+  print *,  'State ', 1
+  print *,  ''
+  write(*,*)  '=========== ', '==================='
+  write(*,*)  'minimum PT2 ', 'Extrapolated energy'
+  write(*,*)  '=========== ', '==================='
+  do k=2,min(N_iter,8)
+    write(*,'(F11.4,2X,F18.8)') pt2_iterations(1,N_iter+1-k), extrapolated_energy(k,1)
+  enddo
+  write(*,*)  '=========== ', '==================='
+
+  do i=2, min(N_states,N_det)
+    print *,  ''
+    print *,  'State ', i
+    print *,  ''
+    write(*,*)  '=========== ', '=================== ', '=================== ', '==================='
+    write(*,*)  'minimum PT2 ', 'Extrapolated energy ', '  Excitation (a.u)  ', '  Excitation (eV)  '
+    write(*,*)  '=========== ', '=================== ', '=================== ', '==================='
+    do k=2,min(N_iter,8)
+      write(*,'(F11.4,X,3(X,F18.8))') pt2_iterations(i,N_iter+1-k), extrapolated_energy(k,i), &
+          extrapolated_energy(k,i) - extrapolated_energy(k,1), &
+          (extrapolated_energy(k,i) - extrapolated_energy(k,1) ) * 27.211396641308d0
+    enddo
+    write(*,*)  '=========== ', '=================== ', '=================== ', '==================='
+  enddo
+
+  print *,  ''
+
+end subroutine
+
--- a/src/iterations_tc/print_summary.irp.f
+++ b/src/iterations_tc/print_summary.irp.f
@ -0,0 +1,104 @@
+subroutine print_summary(e_,pt2_data,pt2_data_err,n_det_,n_configuration_,n_st,s2_)
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Print the extrapolated energy in the output
+  END_DOC
+
+  integer, intent(in)            :: n_det_, n_configuration_, n_st
+  double precision, intent(in)   :: e_(n_st), s2_(n_st)
+  type(pt2_type)  , intent(in)   :: pt2_data, pt2_data_err
+  integer                        :: i, k
+  integer                        :: N_states_p
+  character*(9)                  :: pt2_string
+  character*(512)                :: fmt
+
+  if (do_pt2) then
+    pt2_string = '        '
+  else
+    pt2_string = '(approx)'
+  endif
+
+  N_states_p = min(N_det_,n_st)
+
+  print *, ''
+  print '(A,I12)',  'Summary at N_det = ', N_det_
+  print '(A)',      '-----------------------------------'
+  print *, ''
+
+  write(fmt,*) '(''# ============'',', N_states_p, '(1X,''=============================''))'
+  write(*,fmt)
+  write(fmt,*) '(13X,', N_states_p, '(6X,A7,1X,I6,10X))'
+  write(*,fmt) ('State',k, k=1,N_states_p)
+  write(fmt,*) '(''# ============'',', N_states_p, '(1X,''=============================''))'
+  write(*,fmt)
+  write(fmt,*) '(A13,', N_states_p, '(1X,F14.8,15X))'
+  write(*,fmt) '# E          ', e_(1:N_states_p)
+  if (N_states_p > 1) then
+    write(*,fmt) '# Excit. (au)', e_(1:N_states_p)-e_(1)
+    write(*,fmt) '# Excit. (eV)', (e_(1:N_states_p)-e_(1))*27.211396641308d0
+  endif
+  write(fmt,*) '(A13,', 2*N_states_p, '(1X,F14.8))'
+  write(*,fmt) '# PT2 '//pt2_string, (pt2_data % pt2(k), pt2_data_err % pt2(k), k=1,N_states_p)
+  write(*,fmt) '# rPT2'//pt2_string, (pt2_data % rpt2(k), pt2_data_err % rpt2(k), k=1,N_states_p)
+  write(*,'(A)') '#'
+  write(*,fmt) '# E+PT2      ', (e_(k)+pt2_data % pt2(k),pt2_data_err % pt2(k), k=1,N_states_p)
+  write(*,fmt) '# E+rPT2     ', (e_(k)+pt2_data % rpt2(k),pt2_data_err % rpt2(k), k=1,N_states_p)
+  if (N_states_p > 1) then
+    write(*,fmt) '# Excit. (au)', ( (e_(k)+pt2_data % pt2(k)-e_(1)-pt2_data % pt2(1)), &
+      dsqrt(pt2_data_err % pt2(k)*pt2_data_err % pt2(k)+pt2_data_err % pt2(1)*pt2_data_err % pt2(1)), k=1,N_states_p)
+    write(*,fmt) '# Excit. (eV)', ( (e_(k)+pt2_data % pt2(k)-e_(1)-pt2_data % pt2(1))*27.211396641308d0, &
+      dsqrt(pt2_data_err % pt2(k)*pt2_data_err % pt2(k)+pt2_data_err % pt2(1)*pt2_data_err % pt2(1))*27.211396641308d0, k=1,N_states_p)
+  endif
+  write(fmt,*) '(''# ============'',', N_states_p, '(1X,''=============================''))'
+  write(*,fmt)
+  print *,  ''
+
+  print *,  'N_det             = ', N_det_
+  print *,  'N_states          = ', n_st
+  if (s2_eig) then
+    print *,  'N_cfg             = ', N_configuration_
+    if (only_expected_s2) then
+      print *,  'N_csf             = ', N_csf
+    endif
+  endif
+  print *,  ''
+
+  do k=1, N_states_p
+    print*,'* State ',k
+    print *,  '< S^2 >         = ', s2_(k)
+    print *,  'E               = ', e_(k)
+    print *,  'Variance        = ', pt2_data % variance(k), ' +/- ', pt2_data_err % variance(k)
+    print *,  'PT norm         = ', dsqrt(pt2_data % overlap(k,k)), ' +/- ', 0.5d0*dsqrt(pt2_data % overlap(k,k)) * pt2_data_err % overlap(k,k) / (pt2_data % overlap(k,k))
+    print *,  'PT2             = ', pt2_data % pt2(k), ' +/- ', pt2_data_err % pt2(k)
+    print *,  'rPT2            = ', pt2_data % rpt2(k), ' +/- ', pt2_data_err % rpt2(k)
+    print *,  'E+PT2 '//pt2_string//' = ', e_(k)+pt2_data % pt2(k), ' +/- ', pt2_data_err % pt2(k)
+    print *,  'E+rPT2'//pt2_string//' = ', e_(k)+pt2_data % rpt2(k), ' +/- ', pt2_data_err % rpt2(k)
+    print *,  ''
+  enddo
+
+  print *,  '-----'
+  if(n_st.gt.1)then
+    print *, 'Variational Energy difference (au | eV)'
+    do i=2, N_states_p
+      print*,'Delta E = ', (e_(i) - e_(1)), &
+        (e_(i) - e_(1)) * 27.211396641308d0
+    enddo
+    print *,  '-----'
+    print*, 'Variational + perturbative Energy difference (au | eV)'
+    do i=2, N_states_p
+      print*,'Delta E = ', (e_(i)+ pt2_data % pt2(i) - (e_(1) + pt2_data % pt2(1))), &
+        (e_(i)+ pt2_data % pt2(i) - (e_(1) + pt2_data % pt2(1))) * 27.211396641308d0
+    enddo
+    print *,  '-----'
+    print*, 'Variational + renormalized perturbative Energy difference (au | eV)'
+    do i=2, N_states_p
+      print*,'Delta E = ', (e_(i)+ pt2_data % rpt2(i) - (e_(1) + pt2_data % rpt2(1))), &
+        (e_(i)+ pt2_data % rpt2(i) - (e_(1) + pt2_data % rpt2(1))) * 27.211396641308d0
+    enddo
+  endif
+
+!  call print_energy_components()
+
+end subroutine
+
--- a/src/non_h_ints_mu/README.rst
+++ b/src/non_h_ints_mu/README.rst
@ -9,3 +9,7 @@ The two providers are :
 +) ao_non_hermit_term_chemist which returns the non hermitian part of the two-electron TC Hamiltonian on the MO basis. 
 +) mo_non_hermit_term_chemist which returns the non hermitian part of the two-electron TC Hamiltonian on the BI-ORTHO MO basis. 

+
+!\sum_mm = 1,3 \sum_R phi_i(R) \phi_k(R) grad_1_u_ij_mu(j,l,R,mm)  grad_1_u_ij_mu(m,n,R,mm)
+!\sum_mm+= 1,3 \sum_R phi_j(R) \phi_l(R) grad_1_u_ij_mu(i,k,R,mm)  grad_1_u_ij_mu(m,n,R,mm)
+!\sum_mm+= 1,3 \sum_R phi_m(R) \phi_n(R) grad_1_u_ij_mu(i,k,R,mm)  grad_1_u_ij_mu(j,l,R,mm)
--- a/src/non_h_ints_mu/grad_squared.irp.f
+++ b/src/non_h_ints_mu/grad_squared.irp.f
@ -155,6 +155,103 @@ END_PROVIDER

 ! ---

+ BEGIN_PROVIDER [ double precision, grad_1_squared_u_ij_mu_new, (n_points_final_grid, ao_num, ao_num)]
+ implicit none
+ integer :: ipoint,i,j,m,igauss
+ BEGIN_DOC
+ ! grad_1_squared_u_ij_mu(j,i,ipoint) = -1/2 \int dr2 phi_j(r2) phi_i(r2) |\grad_r1 u(r1,r2,\mu)|^2 
+ ! |\grad_r1 u(r1,r2,\mu)|^2 = 1/4 * (1 - erf(mu*r12))^2 
+ ! ! (1 - erf(mu*r12))^2 = \sum_i coef_gauss_1_erf_x_2(i) * exp(-expo_gauss_1_erf_x_2(i) * r12^2)
+ END_DOC
+  include 'constants.include.F'
+ double precision :: r(3),delta,coef
+ double precision :: overlap_gauss_r12_ao,time0,time1
+ integer :: num_a,num_b,power_A(3), power_B(3),l,k
+ double precision :: A_center(3), B_center(3),overlap_gauss_r12,alpha,beta,analytical_j
+ double precision  :: A_new(0:max_dim,3)! new polynom 
+ double precision  :: A_center_new(3)   ! new center
+ integer           :: iorder_a_new(3)   ! i_order(i) = order of the new polynom ==> should be equal to power_A
+ double precision  :: alpha_new         ! new exponent
+ double precision  :: fact_a_new, coef_i, coef_j, k_ab,center_new(3),p_new,c_tmp,coef_last        ! constant factor
+ double precision  :: coefxy, coefx, coefy, coefz,coefxyz
+ integer           :: d(3),lx,ly,lz,iorder_tmp(3),dim1
+ double precision  :: overlap,overlap_x,overlap_y,overlap_z,thr
+ dim1=100
+ thr = 0.d0
+ print*,'providing grad_1_squared_u_ij_mu_new ...'
+ grad_1_squared_u_ij_mu_new = 0.d0
+ call wall_time(time0)
+ !TODO : strong optmization : write the loops in a different way
+ !     : for each couple of AO, the gaussian product are done once for all 
+ d = 0
+ do i = 1, ao_num
+  do j = 1, ao_num
+   ! \int dr2 phi_j(r2) phi_i(r2) (1 - erf(mu*r12))^2 
+   ! = \sum_i coef_gauss_1_erf_x_2(i) \int dr2 phi_j(r2) phi_i(r2) exp(-expo_gauss_1_erf_x_2(i) * (r_1 - r_2)^2)
+   if(ao_overlap_abs(j,i).lt.1.d-12)then
+    cycle
+   endif
+   num_A = ao_nucl(i)
+   power_A(1:3)= ao_power(i,1:3)
+   A_center(1:3) = nucl_coord(num_A,1:3)
+   num_B = ao_nucl(j)
+   power_B(1:3)= ao_power(j,1:3)
+   B_center(1:3) = nucl_coord(num_B,1:3)
+   do l=1,ao_prim_num(i)
+    coef_i = ao_coef_normalized_ordered_transp(l,i)  
+    alpha = ao_expo_ordered_transp(l,i)     
+    do k=1,ao_prim_num(j)
+     beta = ao_expo_ordered_transp(k,j)     
+     coef_j = ao_coef_normalized_ordered_transp(k,j) 
+
+     ! New gaussian/polynom defined by :: new pol new center new expo   cst fact new order                                
+     ! from gaussian_A * gaussian_B
+     call give_explicit_poly_and_gaussian(A_new , A_center_new , alpha_new, fact_a_new , iorder_a_new , & 
+                                      beta,alpha,power_B,power_A,B_center,A_center,n_pt_max_integrals)
+     c_tmp = coef_i*coef_j*fact_a_new
+     if(dabs(c_tmp).lt.thr)cycle
+     do ipoint = 1, n_points_final_grid
+      r(1) = final_grid_points(1,ipoint)
+      r(2) = final_grid_points(2,ipoint)
+      r(3) = final_grid_points(3,ipoint)
+      do igauss = 1, n_max_fit_slat
+       delta = expo_gauss_1_erf_x_2(igauss)
+       coef  = coef_gauss_1_erf_x_2(igauss)
+       coef_last = c_tmp * coef
+       if(dabs(coef_last).lt.thr)cycle
+       do lx = 0, iorder_a_new(1)
+        coefx = A_new(lx,1)
+        coefx *= coef_last 
+!        if(dabs(coefx).lt.thr)cycle
+        iorder_tmp(1) = lx
+        do ly = 0, iorder_a_new(2)
+         coefy = A_new(ly,2)
+         coefxy= coefx*coefy
+!         if(dabs(coefxy).lt.thr)cycle
+         iorder_tmp(2) = ly
+         do lz = 0, iorder_a_new(3)
+          coefz = A_new(lz,3)
+          coefxyz = coefz * coefxy
+!          if(dabs(coefxyz).lt.thr)cycle
+          iorder_tmp(3) = lz
+!          call gaussian_product(alpha_new,A_center_new,delta,r,k_ab,p_new,center_new)
+!          if(dabs(coef_last*k_ab).lt.thr)cycle
+          call overlap_gaussian_xyz(A_center_new,r,alpha_new,delta,iorder_tmp,d,overlap_x,overlap_y,overlap_z,overlap,dim1)
+          grad_1_squared_u_ij_mu_new(ipoint,j,i) += -0.25 * coefxyz * overlap
+         enddo ! igauss
+        enddo ! ipoint 
+       enddo ! lz
+      enddo ! ly
+     enddo ! lx 
+    enddo ! k
+   enddo ! l
+  enddo ! j
+ enddo ! i
+ call wall_time(time1)
+ print*,'Wall time for grad_1_squared_u_ij_mu_new = ',time1 - time0
+ END_PROVIDER 
+
+
 BEGIN_PROVIDER [double precision, tc_grad_square_ao, (ao_num, ao_num, ao_num, ao_num)]

  BEGIN_DOC
--- a/src/non_h_ints_mu/test_non_h_ints.irp.f
+++ b/src/non_h_ints_mu/test_non_h_ints.irp.f
@ -1,102 +0,0 @@
-program test_non_h
- implicit none
-  my_grid_becke  = .True.
-  my_n_pt_r_grid = 50
-  my_n_pt_a_grid = 74
-!  my_n_pt_r_grid = 10 ! small grid for quick debug
-!  my_n_pt_a_grid = 26 ! small grid for quick debug
-  touch my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-!call routine_grad_squared
- call routine_fit
-end
-
-subroutine routine_lapl_grad
- implicit none
- integer :: i,j,k,l
- double precision :: grad_lapl, get_ao_tc_sym_two_e_pot,new,accu,contrib
- double precision :: ao_two_e_integral_erf,get_ao_two_e_integral,count_n,accu_relat
-! !!!!!!!!!!!!!!!!!!!!! WARNING
-! THIS ROUTINE MAKES SENSE ONLY IF HAND MODIFIED coef_gauss_eff_pot(1:n_max_fit_slat) = 0. to cancel (1-erf(mu*r12))^2
- accu = 0.d0
- accu_relat = 0.d0
- count_n = 0.d0
- do i = 1, ao_num
-  do j = 1, ao_num
-   do k = 1, ao_num
-    do l = 1, ao_num
-     grad_lapl  = get_ao_tc_sym_two_e_pot(i,j,k,l,ao_tc_sym_two_e_pot_map) ! pure gaussian part : comes from Lapl
-     grad_lapl += ao_two_e_integral_erf(i, k, j, l)                        ! erf(mu r12)/r12    : comes from Lapl
-     grad_lapl += ao_non_hermit_term_chemist(k,i,l,j)                      ! \grad u(r12) . grad
-     new        = tc_grad_and_lapl_ao(k,i,l,j)
-     new       += get_ao_two_e_integral(i,j,k,l,ao_integrals_map)
-     contrib    = dabs(new - grad_lapl)
-     if(dabs(grad_lapl).gt.1.d-12)then
-      count_n += 1.d0
-      accu_relat += 2.0d0 * contrib/dabs(grad_lapl+new)
-     endif
-     if(contrib.gt.1.d-10)then
-      print*,i,j,k,l
-      print*,grad_lapl,new,contrib
-      print*,2.0d0*contrib/dabs(grad_lapl+new+1.d-12)
-     endif 
-     accu += contrib
-    enddo
-   enddo
-  enddo
- enddo
- print*,'accu      = ',accu/count_n
- print*,'accu/rel  = ',accu_relat/count_n
-
-end
-
-subroutine routine_grad_squared
- implicit none
- integer :: i,j,k,l
- double precision :: grad_squared, get_ao_tc_sym_two_e_pot,new,accu,contrib
- double precision :: count_n,accu_relat
-! !!!!!!!!!!!!!!!!!!!!! WARNING
-! THIS ROUTINE MAKES SENSE ONLY IF HAND MODIFIED coef_gauss_eff_pot(n_max_fit_slat:n_max_fit_slat+1) = 0. to cancel exp(-'mu*r12)^2)
- accu = 0.d0
- accu_relat = 0.d0
- count_n = 0.d0
- do i = 1, ao_num
-  do j = 1, ao_num
-   do k = 1, ao_num
-    do l = 1, ao_num
-     grad_squared  = get_ao_tc_sym_two_e_pot(i,j,k,l,ao_tc_sym_two_e_pot_map) ! pure gaussian part : comes from Lapl
-     new        = tc_grad_square_ao(k,i,l,j)
-     contrib    = dabs(new - grad_squared)
-     if(dabs(grad_squared).gt.1.d-12)then
-      count_n += 1.d0
-      accu_relat += 2.0d0 * contrib/dabs(grad_squared+new)
-     endif
-     if(contrib.gt.1.d-10)then
-      print*,i,j,k,l
-      print*,grad_squared,new,contrib
-      print*,2.0d0*contrib/dabs(grad_squared+new+1.d-12)
-     endif 
-     accu += contrib
-    enddo
-   enddo
-  enddo
- enddo
- print*,'accu      = ',accu/count_n
- print*,'accu/rel  = ',accu_relat/count_n
-
-end
-
-subroutine routine_fit
- implicit none
- integer :: i,nx
- double precision :: dx,xmax,x,j_mu,j_mu_F_x_j,j_mu_fit_gauss
- nx = 500
- xmax = 5.d0
- dx = xmax/dble(nx)
- x = 0.d0
- print*,'coucou',mu_erf
- do i = 1, nx
-  write(33,'(100(F16.10,X))') x,j_mu(x),j_mu_F_x_j(x),j_mu_fit_gauss(x)
-  x += dx
- enddo
-
-end
--- a/src/non_hermit_dav/NEED
+++ b/src/non_hermit_dav/NEED
@ -0,0 +1 @@
+utils
--- a/src/non_hermit_dav/biorthog.irp.f
+++ b/src/non_hermit_dav/biorthog.irp.f
--- a/src/non_hermit_dav/gram_schmit.irp.f
+++ b/src/non_hermit_dav/gram_schmit.irp.f
@ -0,0 +1,56 @@
+subroutine bi_ortho_gram_schmidt(wi,vi,n,ni,wk,wk_schmidt)
+ implicit none
+ BEGIN_DOC
+! you enter with a set of "ni" BI-ORTHONORMAL vectors of length "n" 
+!
+! vi(j,i) = <j|vi>, wi(j,i) = <j|wi>, <vi|wj> = delta_{ij} S_ii, S_ii =<vi|wi>
+!
+! and a vector vk(j) = <j|vk>
+!
+! you go out with a vector vk_schmidt(j) = <j|vk_schmidt> 
+!
+! which is Gram-Schmidt orthonormalized with respect to the "vi"
+!
+! <vi|wk_schmidt> = 0 
+!
+! |wk_schmidt> = |wk> - \sum_{i=1}^ni (<vi|wk>/<vi|wi>) |wi> 
+!
+! according to Eq. (5), (6) of Computers Structures, Vol 56, No. 4, pp 605-613, 1995
+!
+! https://doi.org/10.1016/0045-7949(94)00565-K
+ END_DOC
+ integer, intent(in) :: n,ni
+ double precision, intent(in) :: wi(n,ni),vi(n,ni),wk(n)
+ double precision, intent(out):: wk_schmidt(n)
+ double precision :: vi_wk,u_dot_v,tmp,u_dot_u
+ double precision, allocatable :: sii(:)
+ integer :: i,j
+ allocate( sii(ni) )
+ wk_schmidt = wk
+ do i = 1, ni
+  sii(i) = u_dot_v(vi(1,i),wi(1,i),n)
+ enddo
+! do i = 1, n
+!  print*,i,'wk',wk(i)
+! enddo
+! print*,''
+! print*,''
+ do i = 1, ni
+!  print*,'i',i
+  ! Gram-Schmidt 
+  vi_wk = u_dot_v(vi(1,i),wk,n)
+  vi_wk = vi_wk / sii(i)
+!  print*,''
+  do j = 1, n
+!   print*,j,vi_wk,wi(j,i)
+   wk_schmidt(j) -= vi_wk * wi(j,i)
+  enddo
+ enddo
+ tmp = u_dot_u(wk_schmidt,n)
+ tmp = 1.d0/dsqrt(tmp)
+ wk_schmidt = tmp * wk_schmidt
+! do j = 1, n
+!  print*,j,'wk_scc',wk_schmidt(j)
+! enddo
+! pause
+end
--- a/src/non_hermit_dav/htilde_mat.irp.f
+++ b/src/non_hermit_dav/htilde_mat.irp.f
@ -0,0 +1,93 @@
+BEGIN_PROVIDER [ integer, n_mat]
+ implicit none
+ n_mat = 2
+END_PROVIDER 
+
+ BEGIN_PROVIDER [ double precision, h_non_hermit, (n_mat, n_mat)]
+&BEGIN_PROVIDER [ double precision, h_non_hermit_transp, (n_mat, n_mat)]
+&BEGIN_PROVIDER [ double precision, reigvec_ht, (n_mat, n_mat)]
+&BEGIN_PROVIDER [ double precision, leigvec_ht, (n_mat, n_mat)]
+&BEGIN_PROVIDER [ double precision, eigval_ht, (n_mat)]
+&BEGIN_PROVIDER [ integer, n_real_ht, (n_mat)]
+ implicit none
+ integer :: i,j
+ do i = 1, n_mat
+  read(33,*)h_non_hermit(i,1:n_mat)
+ enddo
+ print*,''
+ print*,'H_mat '
+ print*,''
+ do i = 1, n_mat
+  write(*,'(1000(F16.10,X))')h_non_hermit(i,:)
+ enddo
+ do i = 1, n_mat
+  do j = 1, n_mat
+   h_non_hermit_transp(j,i) = h_non_hermit(i,j)
+  enddo
+ enddo
+ call non_hrmt_real_diag(n_mat,h_non_hermit,reigvec_ht,leigvec_ht,n_real_ht,eigval_ht)
+
+
+END_PROVIDER 
+
+
+subroutine hcalc_r_tmp(v,u,N_st,sze) ! v = H u
+  implicit none
+  BEGIN_DOC
+  ! Template of routine for the application of H
+  !
+  ! Here, it is done with the Hamiltonian matrix 
+  !
+  ! on the set of determinants of psi_det 
+  !
+  ! Computes $v = H | u \rangle$ 
+  !
+  END_DOC
+  integer, intent(in)              :: N_st,sze
+  double precision, intent(in)     :: u(sze,N_st)
+  double precision, intent(inout)  :: v(sze,N_st)
+  integer :: i,j,istate
+  v = 0.d0
+  do istate = 1, N_st
+   do j = 1, sze
+    do i = 1, sze
+      v(i,istate) += h_non_hermit(i,j) * u(j,istate)
+!      print*,i,j,h_non_hermit(i,j),u(j,istate)
+    enddo
+   enddo
+  enddo
+  print*,'HU'
+  do i = 1, sze
+   print*,v(i,1)
+  enddo
+end
+
+subroutine hcalc_l_tmp(v,u,N_st,sze) ! v = H^\dagger u
+  implicit none
+  BEGIN_DOC
+  ! Template of routine for the application of H
+  !
+  ! Here, it is done with the Hamiltonian matrix 
+  !
+  ! on the set of determinants of psi_det 
+  !
+  ! Computes $v = H | u \rangle$ 
+  !
+  END_DOC
+  integer, intent(in)              :: N_st,sze
+  double precision, intent(in)     :: u(sze,N_st)
+  double precision, intent(inout)  :: v(sze,N_st)
+  integer :: i,j,istate
+  v = 0.d0
+  do istate = 1, N_st
+   do j = 1, sze
+    do i = 1, sze
+      v(i,istate) += h_non_hermit_transp(i,j) * u(j,istate)
+    enddo
+   enddo
+  enddo
+  print*,'HU'
+  do i = 1, sze
+   print*,v(i,1)
+  enddo
+end
--- a/src/non_hermit_dav/lapack_diag_non_hermit.irp.f
+++ b/src/non_hermit_dav/lapack_diag_non_hermit.irp.f
--- a/src/non_hermit_dav/new_routines.irp.f
+++ b/src/non_hermit_dav/new_routines.irp.f
@ -0,0 +1,669 @@
+subroutine non_hrmt_diag_split_degen_bi_orthog(n, A, leigvec, reigvec, n_real_eigv, eigval)
+
+  BEGIN_DOC
+  !
+  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
+  !
+  ! of a non hermitian matrix A(n,n)
+  !
+  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
+  !
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in)  :: n
+  double precision, intent(in)  :: A(n,n)
+  integer,          intent(out) :: n_real_eigv
+  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
+  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
+
+  integer                       :: i, j, n_degen,k , iteration
+  double precision              :: shift_current
+  double precision              :: r,thr,accu_d, accu_nd
+  integer,          allocatable :: iorder_origin(:),iorder(:)
+  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
+  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
+  double precision, allocatable :: im_part(:),re_part(:)
+  double precision :: accu,thr_cut, thr_norm=1d0
+
+
+  thr_cut = 1.d-15
+  print*,'Computing the left/right eigenvectors ...'
+  print*,'Using the degeneracy splitting algorithm'
+ ! initialization
+  shift_current = 1.d-15
+  iteration = 0 
+  print*,'***** iteration = ',iteration
+
+
+  ! pre-processing the matrix :: sorting by diagonal elements
+  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
+  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
+!  print*,'Aw'
+  do i = 1, n
+   iorder_origin(i) = i
+   diag_elem(i) = A(i,i)
+!   write(*,'(100(F16.10,X))')A(:,i)
+  enddo
+  call dsort(diag_elem, iorder_origin, n)
+  do i = 1, n
+   do j = 1, n
+    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
+   enddo
+  enddo
+
+  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
+  allocate(im_part(n),iorder(n))
+  allocate( S(n,n) )
+
+
+  Aw = A_save
+  call cancel_small_elmts(aw,n,thr_cut)
+  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
+  do i = 1, n
+   im_part(i) = -dabs(WI(i))
+   iorder(i) = i
+  enddo
+  call dsort(im_part, iorder, n)
+  n_real_eigv = 0
+  do i = 1, n
+    if(dabs(WI(i)).lt.1.d-20)then
+      n_real_eigv += 1
+    else
+!      print*,'Found an imaginary component to eigenvalue'
+!      print*,'Re(i) + Im(i)',WR(i),WI(i)
+    endif
+  enddo
+  if(n_real_eigv.ne.n)then
+   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
+   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
+   print*,'Splitting the degeneracies by ',shift_current
+  else
+   print*,'All eigenvalues are real !'
+  endif
+
+
+  do while(n_real_eigv.ne.n)
+   iteration += 1
+   print*,'***** iteration = ',iteration
+   if(shift_current.gt.1.d-3)then
+    print*,'shift_current > 1.d-3 !!'
+    print*,'Your matrix intrinsically contains complex eigenvalues'
+    stop
+   endif
+   Aw = A_save
+   call cancel_small_elmts(Aw,n,thr_cut)
+   call split_matrix_degen(Aw,n,shift_current)
+   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
+   n_real_eigv = 0
+   do i = 1, n
+     if(dabs(WI(i)).lt.1.d-20)then
+       n_real_eigv+= 1
+     else
+!       print*,'Found an imaginary component to eigenvalue'
+!       print*,'Re(i) + Im(i)',WR(i),WI(i)
+     endif
+   enddo
+   if(n_real_eigv.ne.n)then
+    do i = 1, n
+     im_part(i) = -dabs(WI(i))
+     iorder(i) = i
+    enddo
+    call dsort(im_part, iorder, n)
+    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
+    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
+    print*,'Splitting the degeneracies by ',shift_current
+   else
+    print*,'All eigenvalues are real !'
+   endif
+  enddo
+  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
+  do i = 1, n
+   eigval(i) = WR(i)
+   iorder(i) = i
+  enddo
+  call dsort(eigval,iorder,n)
+  do i = 1, n
+!   print*,'eigval(i) = ',eigval(i)
+   reigvec_tmp(:,i) = VR(:,iorder(i))
+   leigvec_tmp(:,i) = Vl(:,iorder(i))
+  enddo
+
+!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
+  !                               check bi-orthogonality
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+  print *, ' accu_nd bi-orthog = ', accu_nd
+  if( accu_nd .lt. 1d-10 ) then
+    print *, ' bi-orthogonality: ok'
+  else
+    print *, ' '
+    print *, ' bi-orthogonality: not imposed yet'
+    print *, ' '
+    print *, ' '
+    print *, ' orthog between degen eigenvect' 
+    print *, ' '
+    double precision, allocatable :: S_nh_inv_half(:,:)
+    allocate(S_nh_inv_half(n,n))
+    logical :: complex_root
+    deallocate(S_nh_inv_half)
+    call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp)
+    call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+    if( accu_nd .lt. 1d-10 ) then
+      print *, ' bi-orthogonality: ok'
+    else 
+     print*,'New vectors not bi-orthonormals at ',accu_nd
+     call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S)
+     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+     if( accu_nd .lt. 1d-10 ) then
+       print *, ' bi-orthogonality: ok'
+     else 
+      print*,'New vectors not bi-orthonormals at ',accu_nd
+      print*,'Must be a deep problem ...'
+      stop
+     endif
+    endif
+  endif
+ 
+  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
+  do i = 1, n
+   do j = 1, n
+    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
+    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
+   enddo
+  enddo
+
+  !! RECOMPUTING THE EIGENVALUES 
+  eigval = 0.d0
+  do i = 1, n
+   iorder(i) = i
+   accu = 0.d0
+   do j = 1, n
+    accu += VL(j,i) * VR(j,i) 
+    do k = 1, n
+     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
+    enddo
+   enddo
+   eigval(i) *= 1.d0/accu
+!   print*,'eigval(i) = ',eigval(i)
+  enddo
+  !! RESORT JUST TO BE SURE
+  call dsort(eigval, iorder, n)
+  do i = 1, n
+   do j = 1, n
+    reigvec(j,i) = VR(j,iorder(i))
+    leigvec(j,i) = VL(j,iorder(i))
+   enddo
+  enddo
+  print*,'Checking for final reigvec/leigvec'
+  shift_current = max(1.d-10,shift_current)
+  print*,'Thr for eigenvectors = ',shift_current
+  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, .false.)
+  print *, ' accu_nd bi-orthog = ', accu_nd
+  
+  if( accu_nd .lt. 1d-10 ) then
+    print *, ' bi-orthogonality: ok'
+  else 
+   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
+   print*,'Eigenvectors are not bi orthonormal ..'
+   print*,'accu_nd = ',accu_nd
+   stop
+  endif
+
+end 
+
+
+
+subroutine non_hrmt_diag_split_degen_s_inv_half(n, A, leigvec, reigvec, n_real_eigv, eigval)
+
+  BEGIN_DOC
+  !
+  ! routine which returns the sorted REAL EIGENVALUES ONLY and corresponding LEFT/RIGHT eigenvetors 
+  !
+  ! of a non hermitian matrix A(n,n)
+  !
+  ! n_real_eigv is the number of real eigenvalues, which might be smaller than the dimension "n" 
+  !
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in)  :: n
+  double precision, intent(in)  :: A(n,n)
+  integer,          intent(out) :: n_real_eigv
+  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
+  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
+
+  integer                       :: i, j, n_degen,k , iteration
+  double precision              :: shift_current
+  double precision              :: r,thr,accu_d, accu_nd
+  integer,          allocatable :: iorder_origin(:),iorder(:)
+  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
+  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
+  double precision, allocatable :: im_part(:),re_part(:)
+  double precision :: accu,thr_cut, thr_norm=1.d0
+  double precision, allocatable :: S_nh_inv_half(:,:)
+  logical :: complex_root
+
+
+  thr_cut = 1.d-15
+  print*,'Computing the left/right eigenvectors ...'
+  print*,'Using the degeneracy splitting algorithm'
+ ! initialization
+  shift_current = 1.d-15
+  iteration = 0 
+  print*,'***** iteration = ',iteration
+
+
+  ! pre-processing the matrix :: sorting by diagonal elements
+  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
+  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
+!  print*,'Aw'
+  do i = 1, n
+   iorder_origin(i) = i
+   diag_elem(i) = A(i,i)
+!   write(*,'(100(F16.10,X))')A(:,i)
+  enddo
+  call dsort(diag_elem, iorder_origin, n)
+  do i = 1, n
+   do j = 1, n
+    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
+   enddo
+  enddo
+
+  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
+  allocate(im_part(n),iorder(n))
+  allocate( S(n,n) )
+  allocate(S_nh_inv_half(n,n))
+
+
+  Aw = A_save
+  call cancel_small_elmts(aw,n,thr_cut)
+  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
+  do i = 1, n
+   im_part(i) = -dabs(WI(i))
+   iorder(i) = i
+  enddo
+  call dsort(im_part, iorder, n)
+  n_real_eigv = 0
+  do i = 1, n
+    if(dabs(WI(i)).lt.1.d-20)then
+      n_real_eigv += 1
+    else
+!      print*,'Found an imaginary component to eigenvalue'
+!      print*,'Re(i) + Im(i)',WR(i),WI(i)
+    endif
+  enddo
+  if(n_real_eigv.ne.n)then
+   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
+   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
+   print*,'Splitting the degeneracies by ',shift_current
+  else
+   print*,'All eigenvalues are real !'
+  endif
+
+
+  do while(n_real_eigv.ne.n)
+   iteration += 1
+   print*,'***** iteration = ',iteration
+   if(shift_current.gt.1.d-3)then
+    print*,'shift_current > 1.d-3 !!'
+    print*,'Your matrix intrinsically contains complex eigenvalues'
+    stop
+   endif
+   Aw = A_save
+!   thr_cut = shift_current
+   call cancel_small_elmts(Aw,n,thr_cut)
+   call split_matrix_degen(Aw,n,shift_current)
+   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
+   n_real_eigv = 0
+   do i = 1, n
+     if(dabs(WI(i)).lt.1.d-20)then
+       n_real_eigv+= 1
+     else
+!       print*,'Found an imaginary component to eigenvalue'
+!       print*,'Re(i) + Im(i)',WR(i),WI(i)
+     endif
+   enddo
+   if(n_real_eigv.ne.n)then
+    do i = 1, n
+     im_part(i) = -dabs(WI(i))
+     iorder(i) = i
+    enddo
+    call dsort(im_part, iorder, n)
+    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
+    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
+    print*,'Splitting the degeneracies by ',shift_current
+   else
+    print*,'All eigenvalues are real !'
+   endif
+  enddo
+  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
+  do i = 1, n
+   eigval(i) = WR(i)
+   iorder(i) = i
+  enddo
+  call dsort(eigval,iorder,n)
+  do i = 1, n
+!   print*,'eigval(i) = ',eigval(i)
+   reigvec_tmp(:,i) = VR(:,iorder(i))
+   leigvec_tmp(:,i) = Vl(:,iorder(i))
+  enddo
+
+!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
+  !                               check bi-orthogonality
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+  print *, ' accu_nd bi-orthog = ', accu_nd
+  if( accu_nd .lt. 1d-10 ) then
+    print *, ' bi-orthogonality: ok'
+  else
+    print *, ' '
+    print *, ' bi-orthogonality: not imposed yet'
+    if(complex_root)then 
+     print *, ' '
+     print *, ' '
+     print *, ' orthog between degen eigenvect' 
+     print *, ' '
+     ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
+     call impose_orthog_degen_eigvec(n, eigval, reigvec_tmp) ! orthogonalization of reigvec
+     call impose_orthog_degen_eigvec(n, eigval, leigvec_tmp) ! orthogonalization of leigvec
+     call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S) 
+
+     if( accu_nd .lt. 1d-10 ) then
+       print *, ' bi-orthogonality: ok'
+     else 
+      print*,'New vectors not bi-orthonormals at ', accu_nd
+      call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half, complex_root)
+      if(complex_root)then 
+       call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp) ! bi-orthonormalization using QR
+      else
+       print*,'S^{-1/2} exists !!'
+       call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
+      endif
+     endif
+    else ! the matrix S^{-1/2} exists
+     print*,'S^{-1/2} exists !!'
+     call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
+    endif
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S, .false.)
+    if( accu_nd .lt. 1d-10 ) then
+      print *, ' bi-orthogonality: ok'
+    else 
+     print*,'New vectors not bi-orthonormals at ',accu_nd
+     print*,'Must be a deep problem ...'
+     stop
+    endif
+  endif
+ 
+  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
+  do i = 1, n
+   do j = 1, n
+    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
+    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
+   enddo
+  enddo
+
+  !! RECOMPUTING THE EIGENVALUES 
+  eigval = 0.d0
+  do i = 1, n
+   iorder(i) = i
+   accu = 0.d0
+   do j = 1, n
+    accu += VL(j,i) * VR(j,i) 
+    do k = 1, n
+     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
+    enddo
+   enddo
+   eigval(i) *= 1.d0/accu
+!   print*,'eigval(i) = ',eigval(i)
+  enddo
+  !! RESORT JUST TO BE SURE
+  call dsort(eigval, iorder, n)
+  do i = 1, n
+   do j = 1, n
+    reigvec(j,i) = VR(j,iorder(i))
+    leigvec(j,i) = VL(j,iorder(i))
+   enddo
+  enddo
+  print*,'Checking for final reigvec/leigvec'
+  shift_current = max(1.d-10,shift_current)
+  print*,'Thr for eigenvectors = ',shift_current
+  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec, shift_current, thr_norm, .false.)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S, .false.)
+  print *, ' accu_nd bi-orthog = ', accu_nd
+  
+  if( accu_nd .lt. 1d-10 ) then
+    print *, ' bi-orthogonality: ok'
+  else 
+   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
+   print*,'Eigenvectors are not bi orthonormal ..'
+   print*,'accu_nd = ',accu_nd
+   stop
+  endif
+
+end 
+
+
+subroutine non_hrmt_fock_mat(n, A, leigvec, reigvec, n_real_eigv, eigval)
+
+  BEGIN_DOC
+  !
+  ! routine returning the eigenvalues and left/right eigenvectors of the TC fock matrix 
+  !
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in)  :: n
+  double precision, intent(in)  :: A(n,n)
+  integer,          intent(out) :: n_real_eigv
+  double precision, intent(out) :: reigvec(n,n), leigvec(n,n), eigval(n)
+  double precision, allocatable :: reigvec_tmp(:,:), leigvec_tmp(:,:)
+
+  integer                       :: i, j, n_degen,k , iteration
+  double precision              :: shift_current
+  double precision              :: r,thr,accu_d, accu_nd
+  integer,          allocatable :: iorder_origin(:),iorder(:)
+  double precision, allocatable :: WR(:), WI(:), Vl(:,:), VR(:,:),S(:,:)
+  double precision, allocatable :: Aw(:,:),diag_elem(:),A_save(:,:)
+  double precision, allocatable :: im_part(:),re_part(:)
+  double precision :: accu,thr_cut
+  double precision, allocatable :: S_nh_inv_half(:,:)
+  logical :: complex_root
+
+
+  thr_cut = 1.d-15
+  print*,'Computing the left/right eigenvectors ...'
+  print*,'Using the degeneracy splitting algorithm'
+ ! initialization
+  shift_current = 1.d-15
+  iteration = 0 
+  print*,'***** iteration = ',iteration
+
+
+  ! pre-processing the matrix :: sorting by diagonal elements
+  allocate(reigvec_tmp(n,n), leigvec_tmp(n,n))
+  allocate(diag_elem(n),iorder_origin(n),A_save(n,n))
+!  print*,'Aw'
+  do i = 1, n
+   iorder_origin(i) = i
+   diag_elem(i) = A(i,i)
+!   write(*,'(100(F16.10,X))')A(:,i)
+  enddo
+  call dsort(diag_elem, iorder_origin, n)
+  do i = 1, n
+   do j = 1, n
+    A_save(j,i) = A(iorder_origin(j),iorder_origin(i))
+   enddo
+  enddo
+
+  allocate(WR(n), WI(n), VL(n,n), VR(n,n), Aw(n,n))
+  allocate(im_part(n),iorder(n))
+  allocate( S(n,n) )
+  allocate(S_nh_inv_half(n,n))
+
+
+  Aw = A_save
+  call cancel_small_elmts(aw,n,thr_cut)
+  call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
+  do i = 1, n
+   im_part(i) = -dabs(WI(i))
+   iorder(i) = i
+  enddo
+  call dsort(im_part, iorder, n)
+  n_real_eigv = 0
+  do i = 1, n
+    if(dabs(WI(i)).lt.1.d-20)then
+      n_real_eigv += 1
+    else
+!      print*,'Found an imaginary component to eigenvalue'
+!      print*,'Re(i) + Im(i)',WR(i),WI(i)
+    endif
+  enddo
+  if(n_real_eigv.ne.n)then
+   shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
+   print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
+   print*,'Splitting the degeneracies by ',shift_current
+  else
+   print*,'All eigenvalues are real !'
+  endif
+
+
+  do while(n_real_eigv.ne.n)
+   iteration += 1
+   print*,'***** iteration = ',iteration
+   if(shift_current.gt.1.d-3)then
+    print*,'shift_current > 1.d-3 !!'
+    print*,'Your matrix intrinsically contains complex eigenvalues'
+    stop
+   endif
+   Aw = A_save
+!   thr_cut = shift_current
+   call cancel_small_elmts(Aw,n,thr_cut)
+   call split_matrix_degen(Aw,n,shift_current)
+   call lapack_diag_non_sym(n,Aw,WR,WI,VL,VR)
+   n_real_eigv = 0
+   do i = 1, n
+     if(dabs(WI(i)).lt.1.d-20)then
+       n_real_eigv+= 1
+     else
+!       print*,'Found an imaginary component to eigenvalue'
+!       print*,'Re(i) + Im(i)',WR(i),WI(i)
+     endif
+   enddo
+   if(n_real_eigv.ne.n)then
+    do i = 1, n
+     im_part(i) = -dabs(WI(i))
+     iorder(i) = i
+    enddo
+    call dsort(im_part, iorder, n)
+    shift_current = max(10.d0 * dabs(im_part(1)),shift_current*10.d0)
+    print*,'Largest imaginary part found in eigenvalues = ',im_part(1)
+    print*,'Splitting the degeneracies by ',shift_current
+   else
+    print*,'All eigenvalues are real !'
+   endif
+  enddo
+  !!!!!!!!!!!!!!!! SORTING THE EIGENVALUES 
+  do i = 1, n
+   eigval(i) = WR(i)
+   iorder(i) = i
+  enddo
+  call dsort(eigval,iorder,n)
+  do i = 1, n
+!   print*,'eigval(i) = ',eigval(i)
+   reigvec_tmp(:,i) = VR(:,iorder(i))
+   leigvec_tmp(:,i) = Vl(:,iorder(i))
+  enddo
+
+!!! ONCE ALL EIGENVALUES ARE REAL ::: CHECK BI-ORTHONORMALITY
+  !                               check bi-orthogonality
+  call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+  print *, ' accu_nd bi-orthog = ', accu_nd
+  if( accu_nd .lt. 1d-10 ) then
+    print *, ' bi-orthogonality: ok'
+  else
+    print *, ' '
+    print *, ' bi-orthogonality: not imposed yet'
+    print *, ' '
+    print *, ' '
+    print *, ' Using impose_unique_biorthog_degen_eigvec' 
+    print *, ' '
+    ! bi-orthonormalization using orthogonalization of left, right and then QR between left and right
+    call impose_unique_biorthog_degen_eigvec(n, eigval, mo_coef, leigvec_tmp, reigvec_tmp)
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+    print*,'accu_nd = ',accu_nd
+    if( accu_nd .lt. 1d-10 ) then
+      print *, ' bi-orthogonality: ok'
+    else 
+     print*,'New vectors not bi-orthonormals at ',accu_nd
+     call get_inv_half_nonsymmat_diago(S, n, S_nh_inv_half,complex_root)
+     if(complex_root)then 
+      print*,'S^{-1/2} does not exits, using QR bi-orthogonalization'
+      call impose_biorthog_qr(n, n, leigvec_tmp, reigvec_tmp, S) ! bi-orthonormalization using QR
+     else
+      print*,'S^{-1/2} exists !!'
+      call bi_ortho_s_inv_half(n,leigvec_tmp,reigvec_tmp,S_nh_inv_half) ! use of S^{-1/2} bi-orthonormalization 
+     endif
+    endif
+    call check_biorthog(n, n, leigvec_tmp, reigvec_tmp, accu_d, accu_nd, S)
+    if( accu_nd .lt. 1d-10 ) then
+      print *, ' bi-orthogonality: ok'
+    else 
+     print*,'New vectors not bi-orthonormals at ',accu_nd
+     print*,'Must be a deep problem ...'
+     stop
+    endif
+  endif
+ 
+  !! EIGENVECTORS SORTED AND BI-ORTHONORMAL
+  do i = 1, n
+   do j = 1, n
+    VR(iorder_origin(j),i) = reigvec_tmp(j,i)
+    VL(iorder_origin(j),i) = leigvec_tmp(j,i)
+   enddo
+  enddo
+
+  !! RECOMPUTING THE EIGENVALUES 
+  eigval = 0.d0
+  do i = 1, n
+   iorder(i) = i
+   accu = 0.d0
+   do j = 1, n
+    accu += VL(j,i) * VR(j,i) 
+    do k = 1, n
+     eigval(i) +=  VL(j,i) * A(j,k) * VR(k,i) 
+    enddo
+   enddo
+   eigval(i) *= 1.d0/accu
+!   print*,'eigval(i) = ',eigval(i)
+  enddo
+  !! RESORT JUST TO BE SURE
+  call dsort(eigval, iorder, n)
+  do i = 1, n
+   do j = 1, n
+    reigvec(j,i) = VR(j,iorder(i))
+    leigvec(j,i) = VL(j,iorder(i))
+   enddo
+  enddo
+  print*,'Checking for final reigvec/leigvec'
+  shift_current = max(1.d-10,shift_current)
+  print*,'Thr for eigenvectors = ',shift_current
+  call check_EIGVEC(n, n, A, eigval, leigvec, reigvec,shift_current)
+  call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S)
+  print *, ' accu_nd bi-orthog = ', accu_nd
+  
+  if( accu_nd .lt. 1d-10 ) then
+    print *, ' bi-orthogonality: ok'
+  else 
+   print*,'Something went wrong in non_hrmt_diag_split_degen_bi_orthog'
+   print*,'Eigenvectors are not bi orthonormal ..'
+   print*,'accu_nd = ',accu_nd
+   stop
+  endif
+
+end 
+
+
--- a/src/non_hermit_dav/project.irp.f
+++ b/src/non_hermit_dav/project.irp.f
@ -0,0 +1,53 @@
+subroutine h_non_hermite(v,u,Hmat,a,N_st,sze) 
+ implicit none
+ BEGIN_DOC
+ ! Template of routine for the application of H
+ !
+ ! Here, it is done with the Hamiltonian matrix 
+ !
+ ! on the set of determinants of psi_det 
+ !
+ ! Computes $v = a * H | u \rangle$ 
+ !
+ END_DOC
+ integer, intent(in)              :: N_st,sze
+ double precision, intent(in)     :: u(sze,N_st), Hmat(sze,sze), a
+ double precision, intent(inout)  :: v(sze,N_st)
+ integer :: i,j,k 
+ do k = 1, N_st
+  do j = 1, sze
+   do i = 1, sze
+    v(i,k) += a * u(j,k) * Hmat(i,j)
+   enddo
+  enddo
+ enddo
+end
+
+
+subroutine exp_tau_H(u,v,hmat,tau,et,N_st,sze)
+ implicit none
+ BEGIN_DOC
+! realises v = (1 - tau (H - et)) u
+ END_DOC
+ integer, intent(in) :: N_st,sze
+ double precision, intent(in) :: hmat(sze,sze), u(sze,N_st), tau, et
+ double precision, intent(out):: v(sze,N_st)
+ double precision :: a
+ integer :: i,j
+ v = (1.d0 + tau * et) * u 
+ a = -1.d0 * tau
+ call h_non_hermite(v,u,Hmat,a,N_st,sze)
+end
+
+double precision function project_phi0(u,Hmat0,N_st,sze)
+ implicit none
+ integer, intent(in)              :: N_st,sze
+ double precision, intent(in)     :: u(sze,N_st), Hmat0(sze)
+ integer :: j
+ project_phi0 = 0.d0
+ do j = 1, sze
+  project_phi0 += u(j,1) * Hmat0(j) 
+ enddo
+ project_phi0 *= 1.d0 / u(1,1)
+end
+
--- a/src/non_hermit_dav/utils.irp.f
+++ b/src/non_hermit_dav/utils.irp.f
@ -0,0 +1,325 @@
+
+subroutine get_inv_half_svd(matrix, n, matrix_inv_half)
+
+  BEGIN_DOC
+  !   :math:`X = S^{-1/2}` obtained by SVD
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in)  :: n
+  double precision, intent(in)  :: matrix(n,n)
+  double precision, intent(out) :: matrix_inv_half(n,n)
+
+  integer                       :: num_linear_dependencies
+  integer                       :: LDA, LDC
+  integer                       :: info, i, j, k
+  double precision, parameter   :: threshold = 1.d-6
+  double precision, allocatable :: U(:,:),Vt(:,:), D(:),matrix_half(:,:),D_half(:)
+
+  double precision :: accu_d,accu_nd
+
+  LDA = size(matrix, 1)
+  LDC = size(matrix_inv_half, 1)
+  if(LDA .ne. LDC) then
+    print*, ' LDA != LDC'
+    stop
+  endif
+
+  print*, ' n   = ', n
+  print*, ' LDA = ', LDA
+  print*, ' LDC = ', LDC
+
+  double precision,allocatable :: WR(:),WI(:),VL(:,:),VR(:,:)
+  allocate(WR(n),WI(n),VL(n,n),VR(n,n))
+  call lapack_diag_non_sym(n,matrix,WR,WI,VL,VR)
+  do i = 1, n
+    print*,'WR,WI',WR(i),WI(i)
+  enddo
+
+
+  allocate(U(LDC,n), Vt(LDA,n), D(n))
+
+  call svd(matrix, LDA, U, LDC, D, Vt, LDA, n, n)
+  double precision, allocatable :: tmp1(:,:),tmp2(:,:),D_mat(:,:)
+  allocate(tmp1(n,n),tmp2(n,n),D_mat(n,n),matrix_half(n,n),D_half(n))
+  D_mat = 0.d0
+  do i = 1,n
+   D_mat(i,i) = D(i)
+  enddo
+  ! matrix = U D Vt
+  ! tmp1 = U D
+  tmp1 = 0.d0
+  call dgemm( 'N', 'N', n, n, n, 1.d0                                            &
+            , U, size(U, 1), D_mat, size(D_mat, 1) &
+            , 0.d0, tmp1, size(tmp1, 1) )
+  ! tmp2 = tmp1 X Vt = matrix
+  tmp2 = 0.d0
+  call dgemm( 'N', 'N', n, n, n, 1.d0                                            &
+            , tmp1, size(tmp1, 1), Vt, size(Vt, 1) &
+            , 0.d0, tmp2, size(tmp2, 1) )
+  print*,'Checking the recomposition of the matrix'
+  accu_nd = 0.d0
+  accu_d  = 0.d0
+  do i = 1, n
+   accu_d += dabs(tmp2(i,i) - matrix(i,i))
+   do j = 1, n
+    if(i==j)cycle 
+    accu_nd += dabs(tmp2(j,i) - matrix(j,i))
+   enddo
+  enddo
+  print*,'accu_d  =',accu_d
+  print*,'accu_nd =',accu_nd
+  print*,'passed the recomposition'
+
+  num_linear_dependencies = 0
+  do i = 1, n
+    if(abs(D(i)) <= threshold) then
+      D(i) = 0.d0
+      num_linear_dependencies += 1
+    else
+      ASSERT (D(i) > 0.d0)
+      D_half(i) = dsqrt(D(i))
+      D(i) = 1.d0 / dsqrt(D(i))
+    endif
+  enddo
+  write(*,*) ' linear dependencies', num_linear_dependencies
+
+  matrix_inv_half = 0.d0
+  matrix_half = 0.d0
+  do k = 1, n
+    if(D(k) /= 0.d0) then
+      do j = 1, n
+        do i = 1, n
+!          matrix_inv_half(i,j) = matrix_inv_half(i,j) + U(i,k) * D(k) * Vt(k,j)
+           matrix_inv_half(i,j) = matrix_inv_half(i,j) + U(i,k) * D(k) * Vt(j,k)
+           matrix_half(i,j) = matrix_half(i,j) + U(i,k) * D_half(k) * Vt(j,k)
+        enddo
+      enddo
+    endif
+  enddo
+  print*,'testing S^1/2 * S^1/2= S'
+  ! tmp1 = S^1/2 X S^1/2
+  tmp1 = 0.d0
+  call dgemm( 'N', 'N', n, n, n, 1.d0                                            &
+            , matrix_half, size(matrix_half, 1), matrix_half, size(matrix_half, 1) &
+            , 0.d0, tmp1, size(tmp1, 1) )
+  accu_nd = 0.d0
+  accu_d  = 0.d0
+  do i = 1, n
+   accu_d += dabs(tmp1(i,i) - matrix(i,i))
+   do j = 1, n
+    if(i==j)cycle 
+    accu_nd += dabs(tmp1(j,i) - matrix(j,i))
+   enddo
+  enddo
+  print*,'accu_d  =',accu_d
+  print*,'accu_nd =',accu_nd
+
+!  print*,'S inv half'
+!  do i = 1, n
+!   write(*, '(1000(F16.10,X))') matrix_inv_half(i,:)
+!  enddo
+
+  double precision, allocatable :: pseudo_inverse(:,:),identity(:,:)
+  allocate( pseudo_inverse(n,n),identity(n,n))
+  call get_pseudo_inverse(matrix,n,n,n,pseudo_inverse,n,threshold)
+ 
+  ! S^-1 X S = 1
+!  identity = 0.d0
+!  call dgemm( 'N', 'N', n, n, n, 1.d0                                            &
+!            , matrix, size(matrix, 1), pseudo_inverse, size(pseudo_inverse, 1) &
+!            , 0.d0, identity, size(identity, 1) )
+  print*,'Checking  S^-1/2 X S^-1/2 = S^-1 ?'
+  ! S^-1/2 X S^-1/2 = S^-1 ?
+  tmp1 = 0.d0
+  call dgemm( 'N', 'N', n, n, n, 1.d0                                            &
+            ,matrix_inv_half, size(matrix_inv_half, 1), matrix_inv_half, size(matrix_inv_half, 1) &
+            , 0.d0, tmp1, size(tmp1, 1) )
+  accu_nd = 0.d0
+  accu_d  = 0.d0
+  do i = 1, n
+   accu_d += dabs(1.d0 - pseudo_inverse(i,i))
+   do j = 1, n
+    if(i==j)cycle 
+    accu_nd += dabs(tmp1(j,i) - pseudo_inverse(j,i))
+   enddo
+  enddo
+  print*,'accu_d  =',accu_d
+  print*,'accu_nd =',accu_nd
+
+  stop
+!
+!  ! ( S^-1/2 x S ) x S^-1/2
+!  Stmp2 = 0.d0
+!  call dgemm( 'N', 'N', n, n, n, 1.d0                                        &
+!            , Stmp, size(Stmp, 1), matrix_inv_half, size(matrix_inv_half, 1) &
+!            , 0.d0, Stmp2, size(Stmp2, 1) )
+
+  ! S^-1/2 x ( S^-1/2 x S )
+!  Stmp2 = 0.d0
+!  call dgemm( 'N', 'N', n, n, n, 1.d0                                        &
+!            , matrix_inv_half, size(matrix_inv_half, 1), Stmp, size(Stmp, 1) &
+!            , 0.d0, Stmp2, size(Stmp2, 1) )
+ 
+!  do i = 1, n
+!    do j = 1, n
+!      if(i==j) then
+!       accu_d += Stmp2(j,i)
+!      else 
+!       accu_nd = accu_nd + Stmp2(j,i) * Stmp2(j,i)
+!      endif
+!    enddo
+!  enddo
+!  accu_nd = dsqrt(accu_nd)
+!  print*, ' after S^-1/2: sum of off-diag S elements = ', accu_nd
+!  print*, ' after S^-1/2: sum of     diag S elements = ', accu_d
+!  do i = 1, n
+!   write(*,'(1000(F16.10,X))') Stmp2(i,:)
+!  enddo
+
+  !double precision  :: thresh
+  !thresh = 1.d-10
+  !if( accu_nd.gt.thresh .or. dabs(accu_d-dble(n)).gt.thresh) then
+  !  stop
+  !endif
+
+end subroutine get_inv_half_svd
+
+! ---
+
+subroutine get_inv_half_nonsymmat_diago(matrix, n, matrix_inv_half, complex_root)
+
+  BEGIN_DOC
+  ! input:  S = matrix
+  ! output: S^{-1/2} = matrix_inv_half obtained by diagonalization
+  !
+  ! S = VR D VL^T
+  !   = VR D^{1/2} D^{1/2} VL^T
+  !   = VR D^{1/2} VL^T VR D^{1/2} VL^T
+  !   = S^{1/2} S^{1/2} with S = VR D^{1/2} VL^T 
+  !
+  ! == > S^{-1/2} = VR D^{-1/2} VL^T
+  !
+  END_DOC
+
+  implicit none
+
+  integer,          intent(in)  :: n
+  double precision, intent(in)  :: matrix(n,n)
+  logical,          intent(out) :: complex_root
+  double precision, intent(out) :: matrix_inv_half(n,n)
+
+  integer                       :: i, j
+  double precision              :: accu_d, accu_nd
+  double precision, allocatable :: WR(:), WI(:), VL(:,:), VR(:,:), S(:,:), S_diag(:)
+  double precision, allocatable :: tmp1(:,:), D_mat(:,:)
+
+  complex_root = .False.
+
+  matrix_inv_half = 0.D0
+  print*,'Computing S^{-1/2}'
+
+  allocate(WR(n), WI(n), VL(n,n), VR(n,n))
+  call lapack_diag_non_sym(n, matrix, WR, WI, VL, VR)
+
+  allocate(S(n,n))
+  call check_biorthog(n, n, VL, VR, accu_d, accu_nd, S)
+  print*,'accu_nd S^{-1/2}',accu_nd
+  if(accu_nd.gt.1.d-10) then
+    complex_root = .True. ! if vectors are not bi-orthogonal return 
+    print*,'Eigenvectors of S are not bi-orthonormal, skipping S^{-1/2}'
+    return
+  endif
+
+  allocate(S_diag(n))
+  do i = 1, n
+    S_diag(i) = 1.d0/dsqrt(S(i,i))
+    if(dabs(WI(i)).gt.1.d-20.or.WR(i).lt.0.d0)then ! check that eigenvalues are real and positive 
+     complex_root = .True.
+     print*,'Eigenvalues of S have imaginary part '
+     print*,'WR(i),WI(i)',WR(i), WR(i)
+     print*,'Skipping S^{-1/2}'
+     return
+    endif
+  enddo
+  deallocate(S)
+
+  if(complex_root) return
+
+  ! normalization of vectors 
+  do i = 1, n
+    if(S_diag(i).eq.1.d0) cycle
+    do j = 1,n
+      VL(j,i) *= S_diag(i)
+      VR(j,i) *= S_diag(i)
+    enddo
+  enddo
+  deallocate(S_diag)
+
+  allocate(tmp1(n,n), D_mat(n,n))
+
+  D_mat = 0.d0
+  do i = 1, n
+    D_mat(i,i) = 1.d0/dsqrt(WR(i))
+  enddo
+  deallocate(WR, WI)
+
+  ! tmp1 = VR D^{-1/2} 
+  tmp1 = 0.d0
+  call dgemm( 'N', 'N', n, n, n, 1.d0                &
+            , VR, size(VR, 1), D_mat, size(D_mat, 1) &
+            , 0.d0, tmp1, size(tmp1, 1) )
+  deallocate(VR, D_mat)
+
+  ! S^{-1/2} = tmp1 X VL^T 
+  matrix_inv_half = 0.d0
+  call dgemm( 'N', 'T', n, n, n, 1.d0              &
+            , tmp1, size(tmp1, 1), VL, size(VL, 1) &
+            , 0.d0, matrix_inv_half, size(matrix_inv_half, 1) )
+  deallocate(tmp1, VL)
+
+end
+
+! ---
+
+subroutine bi_ortho_s_inv_half(n,leigvec,reigvec,S_nh_inv_half)
+ implicit  none
+ integer, intent(in) :: n
+ double precision, intent(in) :: S_nh_inv_half(n,n)
+ double precision, intent(inout) :: leigvec(n,n),reigvec(n,n)
+ BEGIN_DOC 
+ ! bi-orthonormalization of left and right vectors 
+ ! 
+ ! S = VL^T VR 
+ !
+ ! S^{-1/2} S S^{-1/2} = 1 = S^{-1/2} VL^T VR S^{-1/2} = VL_new^T VR_new
+ !
+ ! VL_new = VL (S^{-1/2})^T
+ ! 
+ ! VR_new = VR S^{^{-1/2}}
+ END_DOC
+ double precision,allocatable :: vl_tmp(:,:),vr_tmp(:,:)
+ print*,'Bi-orthonormalization using S^{-1/2}'
+ allocate(vl_tmp(n,n),vr_tmp(n,n))
+ vl_tmp = leigvec
+ vr_tmp = reigvec
+ ! VL_new = VL (S^{-1/2})^T
+ call dgemm( 'N', 'T', n, n, n, 1.d0                                            &
+             , vl_tmp, size(vl_tmp, 1), S_nh_inv_half, size(S_nh_inv_half, 1) &
+             , 0.d0, leigvec, size(leigvec, 1) )
+ ! VR_new = VR S^{^{-1/2}}
+ call dgemm( 'N', 'N', n, n, n, 1.d0                                            &
+             , vr_tmp, size(vr_tmp, 1), S_nh_inv_half, size(S_nh_inv_half, 1) &
+             , 0.d0, reigvec, size(reigvec, 1) )
+ double precision :: accu_d, accu_nd
+ double precision,allocatable :: S(:,:) 
+ allocate(S(n,n)) 
+ call check_biorthog(n, n, leigvec, reigvec, accu_d, accu_nd, S)
+ if(dabs(accu_d - n).gt.1.d-10 .or. accu_nd .gt.1.d-8 )then
+  print*,'Pb in bi_ortho_s_inv_half !!'
+  print*,'accu_d =',accu_d
+  print*,'accu_nd =',accu_nd
+  stop
+ endif
+end
--- a/src/tc_bi_ortho/compute_deltamu_right.irp.f
+++ b/src/tc_bi_ortho/compute_deltamu_right.irp.f
@ -42,7 +42,7 @@ subroutine delta_right()

  enddo

-  call ezfio_set_dmc_dress_dmc_delta_h(delta)
+!  call ezfio_set_dmc_dress_dmc_delta_h(delta)

  deallocate(delta)

--- a/src/tc_bi_ortho/print_tc_wf.irp.f
+++ b/src/tc_bi_ortho/print_tc_wf.irp.f
@ -24,7 +24,7 @@ subroutine write_l_r_wf
 output=trim(ezfio_filename)//'.tc_wf'
 i_unit_output = getUnitAndOpen(output,'w')
 integer :: i
- print*,'Writing the left-right wf'
+ print*,'Writing the left-right wf, and usual psi'
 do i = 1, N_det
  write(i_unit_output,*)i,psi_l_coef_sorted_bi_ortho_left(i),psi_r_coef_sorted_bi_ortho_right(i)
 enddo
--- a/src/tc_bi_ortho/select_dets_bi_ortho.irp.f
+++ b/src/tc_bi_ortho/select_dets_bi_ortho.irp.f
@ -1,61 +0,0 @@
-program tc_bi_ortho
-  implicit none
-  BEGIN_DOC
-! TODO : Put the documentation of the program here
-  END_DOC
-  print *, 'Hello world'
-  my_grid_becke = .True.
-  my_n_pt_r_grid = 30
-  my_n_pt_a_grid = 50
-  read_wf = .True.
-  touch read_wf
-  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid
-
-  !!!!!!!!!!!!!!! WARNING NO 3-BODY
-  !!!!!!!!!!!!!!! WARNING NO 3-BODY
-  three_body_h_tc = .False.
-  touch three_body_h_tc 
-  !!!!!!!!!!!!!!! WARNING NO 3-BODY
-  !!!!!!!!!!!!!!! WARNING NO 3-BODY
-
-  call routine_test
-! call test
-end
-
-subroutine routine_test
- implicit none
- use bitmasks ! you need to include the bitmasks_module.f90 features
- integer :: i,n_good,degree
- integer(bit_kind), allocatable :: dets(:,:,:)
- integer, allocatable :: iorder(:)
- double precision, allocatable :: coef(:),coef_new(:,:)
- double precision :: thr
- allocate(coef(N_det), iorder(N_det))
- do i = 1, N_det
-  iorder(i) = i
-  call get_excitation_degree(HF_bitmask,psi_det(1,1,i),degree,N_int)
-  if(degree==1)then
-   coef(i) = -0.5d0
-  else
-   coef(i) = -dabs(coef_pt1_bi_ortho(i))
-  endif
- enddo
- call dsort(coef,iorder,N_det)
- !thr = save_threshold
- thr = 1d-15
- n_good = 0
- do i = 1, N_det
-  if(dabs(coef(i)).gt.thr)then
-   n_good += 1
-  endif
- enddo
- print*,'n_good = ',n_good
- allocate(dets(N_int,2,n_good),coef_new(n_good,n_states))
- do i = 1, n_good
-  dets(:,:,i) = psi_det(:,:,iorder(i))
-  coef_new(i,:) = psi_coef(iorder(i),:)
- enddo
- call save_wavefunction_general(n_good,n_states,dets,n_good,coef_new)
-
-
-end
--- a/src/tc_keywords/EZFIO.cfg
+++ b/src/tc_keywords/EZFIO.cfg
@ -105,3 +105,21 @@ type: integer
 doc: if +1: only positive is selected, -1: only negative is selected, :0 both positive and negative
 interface: ezfio,provider,ocaml
 default: 0
+
+[j1b_pen]
+type: double precision
+doc: exponents of the 1-body Jastrow
+interface: ezfio
+size: (nuclei.nucl_num)
+
+[j1b_coeff]
+type: double precision
+doc: coeff of the 1-body Jastrow
+interface: ezfio
+size: (nuclei.nucl_num)
+
+[j1b_type]
+type: integer
+doc: type of 1-body Jastrow
+interface: ezfio, provider, ocaml
+default: 0
--- a/src/tc_keywords/NEED
+++ b/src/tc_keywords/NEED
@ -1 +1,2 @@
 ezfio_files
+nuclei
--- a/src/ao_tc_eff_map/j1b_pen.irp.f
+++ b/src/ao_tc_eff_map/j1b_pen.irp.f
@ -13,7 +13,7 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
  PROVIDE ezfio_filename

  if (mpi_master) then
-    call ezfio_has_ao_tc_eff_map_j1b_pen(exists)
+    call ezfio_has_tc_keywords_j1b_pen(exists)
  endif

  IRP_IF MPI_DEBUG
@ -34,7 +34,7 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]

    if (mpi_master) then
      write(6,'(A)') '.. >>>>> [ IO READ: j1b_pen ] <<<<< ..'
-      call ezfio_get_ao_tc_eff_map_j1b_pen(j1b_pen)
+      call ezfio_get_tc_keywords_j1b_pen(j1b_pen)
      IRP_IF MPI
        call MPI_BCAST(j1b_pen, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
        if (ierr /= MPI_SUCCESS) then
@ -47,10 +47,11 @@ BEGIN_PROVIDER [ double precision, j1b_pen, (nucl_num) ]
 
    integer :: i
    do i = 1, nucl_num
-      j1b_pen(i) = 1d5
+      j1b_pen(i) = 100.d0
    enddo

  endif
+  print*,'j1b_pen = ',j1b_pen

 END_PROVIDER

@ -68,7 +69,7 @@ BEGIN_PROVIDER [ double precision, j1b_coeff, (nucl_num) ]
  PROVIDE ezfio_filename

  if (mpi_master) then
-    call ezfio_has_ao_tc_eff_map_j1b_coeff(exists)
+    call ezfio_has_tc_keywords_j1b_coeff(exists)
  endif

  IRP_IF MPI_DEBUG
@ -89,7 +90,7 @@ BEGIN_PROVIDER [ double precision, j1b_coeff, (nucl_num) ]

    if (mpi_master) then
      write(6,'(A)') '.. >>>>> [ IO READ: j1b_coeff ] <<<<< ..'
-      call ezfio_get_ao_tc_eff_map_j1b_coeff(j1b_coeff)
+      call ezfio_get_tc_keywords_j1b_coeff(j1b_coeff)
      IRP_IF MPI
        call MPI_BCAST(j1b_coeff, (nucl_num), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
        if (ierr /= MPI_SUCCESS) then
@ -110,3 +111,4 @@ BEGIN_PROVIDER [ double precision, j1b_coeff, (nucl_num) ]
 END_PROVIDER

 ! ---
+
--- a/src/tc_keywords/tc_keywords.irp.f
+++ b/src/tc_keywords/tc_keywords.irp.f
@ -4,4 +4,5 @@ program tc_keywords
 ! TODO : Put the documentation of the program here
  END_DOC
  print *, 'Hello world'
+ provide j1b_pen
 end