added fci_tc and cipsi_tc_bi_ortho

2025-04-27 02:34:44 +02:00 · 2023-02-07 17:28:11 +01:00 · 2023-02-07 17:28:11 +01:00 · b258a2f154
commit b258a2f154
parent 80b66dee79
40 changed files with 8201 additions and 0 deletions
--- a/src/cipsi_tc_bi_ortho/EZFIO.cfg
+++ b/src/cipsi_tc_bi_ortho/EZFIO.cfg
@ -0,0 +1,36 @@
+[save_wf_after_selection]
+type: logical
+doc: If true, saves the wave function after the selection, before the diagonalization
+interface: ezfio,provider,ocaml
+default: False
+
+[seniority_max]
+type: integer
+doc: Maximum number of allowed open shells. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
+[excitation_ref]
+type: integer
+doc: 1: Hartree-Fock determinant, 2:All determinants of the dominant configuration
+interface: ezfio,ocaml,provider
+default: 1
+
+[excitation_max]
+type: integer
+doc: Maximum number of excitation with respect to the Hartree-Fock determinant. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
+[excitation_alpha_max]
+type: integer
+doc: Maximum number of excitation for alpha determinants with respect to the Hartree-Fock determinant. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
+[excitation_beta_max]
+type: integer
+doc: Maximum number of excitation for beta determinants with respect to the Hartree-Fock determinant. Using -1 selects all determinants
+interface: ezfio,ocaml,provider
+default: -1
+
--- a/src/cipsi_tc_bi_ortho/NEED
+++ b/src/cipsi_tc_bi_ortho/NEED
@ -0,0 +1,6 @@
+mpi
+perturbation
+zmq
+iterations_tc
+csf
+tc_bi_ortho
--- a/src/cipsi_tc_bi_ortho/cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/cipsi.irp.f
@ -0,0 +1,136 @@
+subroutine run_cipsi
+
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with deterministic selection and
+  ! stochastic PT2.
+  END_DOC
+
+  use selection_types
+
+  implicit none
+
+  integer                        :: i,j,k,ndet
+  type(pt2_type)                 :: pt2_data, pt2_data_err
+  double precision, allocatable  :: zeros(:)
+  integer                        :: to_select
+  logical, external :: qp_stop
+
+  double precision :: threshold_generators_save
+  double precision :: rss
+  double precision, external :: memory_of_double
+  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
+
+  PROVIDE H_apply_buffer_allocated distributed_davidson
+
+  print*,'Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+  enddo
+
+  N_iter = 1
+  threshold_generators = 1.d0
+  SOFT_TOUCH threshold_generators
+
+  rss = memory_of_double(N_states)*4.d0
+  call check_mem(rss,irp_here)
+
+  allocate (zeros(N_states))
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+
+  double precision               :: hf_energy_ref
+  logical                        :: has, print_pt2
+  double precision               :: relative_error
+
+  relative_error=PT2_relative_error
+
+  zeros = 0.d0
+  pt2_data % pt2  = -huge(1.e0)
+  pt2_data % rpt2 = -huge(1.e0)
+  pt2_data % overlap(:,:) = 0.d0
+  pt2_data % variance = huge(1.e0)
+
+  if (s2_eig) then
+    call make_s2_eigenfunction
+  endif
+  print_pt2 = .False.
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+
+  call ezfio_has_hartree_fock_energy(has)
+  if (has) then
+    call ezfio_get_hartree_fock_energy(hf_energy_ref)
+  else
+    hf_energy_ref = ref_bitmask_energy
+  endif
+
+  if (N_det > N_det_max) then
+    psi_det(1:N_int,1:2,1:N_det) = psi_det_sorted_tc_gen(1:N_int,1:2,1:N_det)
+    psi_coef(1:N_det,1:N_states) = psi_coef_sorted_tc_gen(1:N_det,1:N_states)
+    N_det = N_det_max
+    soft_touch N_det psi_det psi_coef
+    if (s2_eig) then
+      call make_s2_eigenfunction
+    endif
+    print_pt2 = .False.
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!    call routine_save_right
+  endif
+
+  correlation_energy_ratio = 0.d0
+
+  print_pt2 = .True.
+  do while (                                                         &
+        (N_det < N_det_max) .and.                                    &
+        (maxval(abs(pt2_data % pt2(1:N_states))) > pt2_max)          &
+        )
+      write(*,'(A)')  '--------------------------------------------------------------------------------'
+
+
+    to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
+    to_select = max(N_states_diag, to_select)
+
+    E_denom = E_tc ! TC Energy of the current wave function 
+    if (do_pt2) then
+      call pt2_dealloc(pt2_data)
+      call pt2_dealloc(pt2_data_err)
+      call pt2_alloc(pt2_data, N_states)
+      call pt2_alloc(pt2_data_err, N_states)
+      threshold_generators_save = threshold_generators
+      threshold_generators = 1.d0
+      SOFT_TOUCH threshold_generators
+      call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
+      threshold_generators = threshold_generators_save
+      SOFT_TOUCH threshold_generators
+    else
+      call pt2_dealloc(pt2_data)
+      call pt2_alloc(pt2_data, N_states)
+      call ZMQ_selection(to_select, pt2_data)
+    endif
+
+    N_iter += 1
+
+    if (qp_stop()) exit
+
+    ! Add selected determinants
+    call copy_H_apply_buffer_to_wf()
+
+    if (save_wf_after_selection) then
+      call save_wavefunction
+    endif
+
+    PROVIDE  psi_coef
+    PROVIDE  psi_det
+    PROVIDE  psi_det_sorted_tc
+
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+    if (qp_stop()) exit
+  enddo
+
+  call pt2_dealloc(pt2_data)
+  call pt2_dealloc(pt2_data_err)
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+  call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+
+end
--- a/src/cipsi_tc_bi_ortho/energy.irp.f
+++ b/src/cipsi_tc_bi_ortho/energy.irp.f
@ -0,0 +1,51 @@
+BEGIN_PROVIDER [ logical, initialize_pt2_E0_denominator ]
+ implicit none
+ BEGIN_DOC
+ ! If true, initialize pt2_E0_denominator
+ END_DOC
+ initialize_pt2_E0_denominator = .True.
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, pt2_E0_denominator, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! E0 in the denominator of the PT2
+ END_DOC
+ integer :: i,j
+
+  pt2_E0_denominator = eigval_right_tc_bi_orth
+
+! if (initialize_pt2_E0_denominator) then
+!   if (h0_type == "EN") then
+!     pt2_E0_denominator(1:N_states) = psi_energy(1:N_states)
+!   else if (h0_type == "HF") then
+!     do i=1,N_states
+!       j = maxloc(abs(psi_coef(:,i)),1)
+!       pt2_E0_denominator(i) = psi_det_hii(j)
+!     enddo
+!   else if (h0_type == "Barycentric") then
+!     pt2_E0_denominator(1:N_states) = barycentric_electronic_energy(1:N_states)
+!   else if (h0_type == "CFG") then
+!     pt2_E0_denominator(1:N_states) = psi_energy(1:N_states)
+!   else
+!     print *,  h0_type, ' not implemented'
+!     stop
+!   endif
+!  do i=1,N_states
+!    call write_double(6,pt2_E0_denominator(i)+nuclear_repulsion, 'PT2 Energy denominator')
+!  enddo
+! else
+!   pt2_E0_denominator = -huge(1.d0)
+! endif
+
+END_PROVIDER
+
+
+BEGIN_PROVIDER [ double precision, pt2_overlap, (N_states, N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Overlap between the perturbed wave functions
+ END_DOC
+ pt2_overlap(1:N_states,1:N_states) = 0.d0
+END_PROVIDER
+
--- a/src/cipsi_tc_bi_ortho/environment.irp.f
+++ b/src/cipsi_tc_bi_ortho/environment.irp.f
@ -0,0 +1,14 @@
+BEGIN_PROVIDER [ integer, nthreads_pt2 ]
+ implicit none
+ BEGIN_DOC
+ ! Number of threads for Davidson
+ END_DOC
+ nthreads_pt2 = nproc
+ character*(32) :: env
+ call getenv('QP_NTHREADS_PT2',env)
+ if (trim(env) /= '') then
+   read(env,*) nthreads_pt2
+   call write_int(6,nthreads_pt2,'Target number of threads for PT2')
+ endif
+END_PROVIDER
+
--- a/src/cipsi_tc_bi_ortho/fock_diag.irp.f
+++ b/src/cipsi_tc_bi_ortho/fock_diag.irp.f
@ -0,0 +1,95 @@
+subroutine build_fock_tmp_tc(fock_diag_tmp,det_ref,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+! Build the diagonal of the Fock matrix corresponding to a generator
+! determinant. $F_{00}$ is $\langle i|H|i \rangle = E_0$.
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer(bit_kind), intent(in)  :: det_ref(Nint,2)
+  double precision, intent(out)  :: fock_diag_tmp(2,mo_num+1)
+
+  integer                        :: occ(Nint*bit_kind_size,2)
+  integer                        :: ne(2), i, j, ii, jj
+  double precision               :: E0
+
+  ! Compute Fock matrix diagonal elements
+  call bitstring_to_list_ab(det_ref,occ,Ne,Nint)
+
+  fock_diag_tmp = 0.d0
+  E0 = 0.d0
+
+  if (Ne(1) /= elec_alpha_num) then
+    print *,  'Error in build_fock_tmp_tc (alpha)', Ne(1), Ne(2)
+    call debug_det(det_ref,N_int)
+    stop -1
+  endif
+  if (Ne(2) /= elec_beta_num) then
+    print *, 'Error in build_fock_tmp_tc (beta)', Ne(1), Ne(2)
+    call debug_det(det_ref,N_int)
+    stop -1
+  endif
+
+  ! Occupied MOs
+  do ii=1,elec_alpha_num
+    i = occ(ii,1)
+    fock_diag_tmp(1,i) = fock_diag_tmp(1,i) + mo_one_e_integrals(i,i)
+    E0 = E0 + mo_one_e_integrals(i,i)
+    do jj=1,elec_alpha_num
+      j = occ(jj,1)
+      if (i==j) cycle
+      fock_diag_tmp(1,i) = fock_diag_tmp(1,i) + mo_two_e_integrals_jj_anti(i,j)
+      E0 = E0 + 0.5d0*mo_two_e_integrals_jj_anti(i,j)
+    enddo
+    do jj=1,elec_beta_num
+      j = occ(jj,2)
+      fock_diag_tmp(1,i) = fock_diag_tmp(1,i) + mo_two_e_integrals_jj(i,j)
+      E0 = E0 + mo_two_e_integrals_jj(i,j)
+    enddo
+  enddo
+  do ii=1,elec_beta_num
+    i = occ(ii,2)
+    fock_diag_tmp(2,i) = fock_diag_tmp(2,i) + mo_one_e_integrals(i,i)
+    E0 = E0 + mo_one_e_integrals(i,i)
+    do jj=1,elec_beta_num
+      j = occ(jj,2)
+      if (i==j) cycle
+      fock_diag_tmp(2,i) = fock_diag_tmp(2,i) + mo_two_e_integrals_jj_anti(i,j)
+      E0 = E0 + 0.5d0*mo_two_e_integrals_jj_anti(i,j)
+    enddo
+    do jj=1,elec_alpha_num
+      j = occ(jj,1)
+      fock_diag_tmp(2,i) = fock_diag_tmp(2,i) + mo_two_e_integrals_jj(i,j)
+    enddo
+  enddo
+
+  ! Virtual MOs
+  do i=1,mo_num
+    if (fock_diag_tmp(1,i) /= 0.d0) cycle
+    fock_diag_tmp(1,i) = fock_diag_tmp(1,i) + mo_one_e_integrals(i,i)
+    do jj=1,elec_alpha_num
+      j = occ(jj,1)
+      fock_diag_tmp(1,i) = fock_diag_tmp(1,i) + mo_two_e_integrals_jj_anti(i,j)
+    enddo
+    do jj=1,elec_beta_num
+      j = occ(jj,2)
+      fock_diag_tmp(1,i) = fock_diag_tmp(1,i) + mo_two_e_integrals_jj(i,j)
+    enddo
+  enddo
+  do i=1,mo_num
+    if (fock_diag_tmp(2,i) /= 0.d0) cycle
+    fock_diag_tmp(2,i) = fock_diag_tmp(2,i) + mo_one_e_integrals(i,i)
+    do jj=1,elec_beta_num
+      j = occ(jj,2)
+      fock_diag_tmp(2,i) = fock_diag_tmp(2,i) + mo_two_e_integrals_jj_anti(i,j)
+    enddo
+    do jj=1,elec_alpha_num
+      j = occ(jj,1)
+      fock_diag_tmp(2,i) = fock_diag_tmp(2,i) + mo_two_e_integrals_jj(i,j)
+    enddo
+  enddo
+
+  fock_diag_tmp(1,mo_num+1) = E0
+  fock_diag_tmp(2,mo_num+1) = E0
+
+end
--- a/src/cipsi_tc_bi_ortho/get_d.irp.f
+++ b/src/cipsi_tc_bi_ortho/get_d.irp.f
--- a/src/cipsi_tc_bi_ortho/get_d0_good.irp.f
+++ b/src/cipsi_tc_bi_ortho/get_d0_good.irp.f
@ -0,0 +1,139 @@
+subroutine get_d0_new(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices/conjg should be okay for complex
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in) :: gen(N_int, 2), mask(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind) :: det(N_int, 2)
+  double precision, intent(in) :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+
+  integer :: i, j, k, s, h1, h2, p1, p2, puti, putj, mm
+  double precision :: phase
+  double precision :: hij,hji
+  double precision, external :: get_phase_bi
+  logical :: ok
+
+  integer, parameter :: bant=1
+  double precision, allocatable :: hij_cache1(:), hij_cache2(:)
+  allocate (hij_cache1(mo_num),hij_cache2(mo_num))
+  double precision, allocatable :: hji_cache1(:), hji_cache2(:)
+  allocate (hji_cache1(mo_num),hji_cache2(mo_num))
+!  print*,'in get_d0_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  if(sp == 3) then ! AB
+    h1 = p(1,1)
+    h2 = p(1,2)
+    do p1=1, mo_num
+      if(bannedOrb(p1, 1)) cycle
+!      call get_mo_two_e_integrals_complex(p1,h2,h1,mo_num,hij_cache1,mo_integrals_map)
+      do mm = 1, mo_num
+       hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,p1,h2,h1)
+       hji_cache1(mm) = mo_bi_ortho_tc_two_e(h2,h1,mm,p1)
+      enddo
+      !!!!!!!!!! <alpha|H|psi>
+      do p2=1, mo_num
+        if(bannedOrb(p2,2)) cycle
+        if(banned(p1, p2, bant)) cycle ! rentable?
+        if(p1 == h1 .or. p2 == h2) then
+          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
+          ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(det,gen,N_int, hij)
+        else
+          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+          hij = hij_cache1(p2) * phase
+        end if
+        if (hij == (0.d0,0.d0)) cycle
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,1) * hij  ! HOTSPOT
+        enddo
+      end do
+      !!!!!!!!!! <phi|H|alpha>
+      do p2=1, mo_num
+        if(bannedOrb(p2,2)) cycle
+        if(banned(p1, p2, bant)) cycle ! rentable?
+        if(p1 == h1 .or. p2 == h2) then
+          call apply_particles(mask, 1,p1,2,p2, det, ok, N_int)
+          ! call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(gen,det,N_int, hji)
+        else
+          phase = get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+          hji = hji_cache1(p2) * phase
+        end if
+        if (hji == (0.d0,0.d0)) cycle
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,2) * hji  ! HOTSPOT
+        enddo
+      end do
+    end do
+
+  else ! AA BB
+    p1 = p(1,sp)
+    p2 = p(2,sp)
+    do puti=1, mo_num
+      if(bannedOrb(puti, sp)) cycle
+!      call get_mo_two_e_integrals_complex(puti,p2,p1,mo_num,hij_cache1,mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(puti,p1,p2,mo_num,hij_cache2,mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache1(mm) = mo_bi_ortho_tc_two_e(mm,puti,p2,p1)
+       hij_cache2(mm) = mo_bi_ortho_tc_two_e(mm,puti,p1,p2)
+       hji_cache1(mm) = mo_bi_ortho_tc_two_e(p2,p1,mm,puti)
+       hji_cache2(mm) = mo_bi_ortho_tc_two_e(p1,p2,mm,puti)
+      enddo
+      !!!!!!!!!! <alpha|H|psi>
+      do putj=puti+1, mo_num
+        if(bannedOrb(putj, sp)) cycle
+        if(banned(puti, putj, bant)) cycle ! rentable?
+        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
+          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
+          !call i_h_j_complex(gen, det, N_int, hij) ! need to take conjugate of this
+!          call i_h_j_complex(det, gen, N_int, hij)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(det,gen,N_int, hij)
+          if (hij == 0.d0) cycle
+        else
+!          hij = (mo_two_e_integral_complex(p1, p2, puti, putj) -  mo_two_e_integral_complex(p2, p1, puti, putj))
+!          hij = (mo_bi_ortho_tc_two_e(p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e(p2, p1, puti, putj))
+          hij = (mo_bi_ortho_tc_two_e(puti, putj, p1, p2) -  mo_bi_ortho_tc_two_e(puti, putj, p2, p1))
+          if (hij == 0.d0) cycle
+          hij = (hij) * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+        end if
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,1) * hij
+        enddo
+      end do
+
+      !!!!!!!!!! <phi|H|alpha>
+      do putj=puti+1, mo_num
+        if(bannedOrb(putj, sp)) cycle
+        if(banned(puti, putj, bant)) cycle ! rentable?
+        if(puti == p1 .or. putj == p2 .or. puti == p2 .or. putj == p1) then
+          call apply_particles(mask, sp,puti,sp,putj, det, ok, N_int)
+          call htilde_mu_mat_opt_bi_ortho_no_3e(gen,det,N_int, hji)
+          if (hji == 0.d0) cycle
+        else
+          hji = (mo_bi_ortho_tc_two_e( p1, p2, puti, putj) -  mo_bi_ortho_tc_two_e( p2, p1, puti, putj))
+          if (hji == 0.d0) cycle
+          hji = (hji) * get_phase_bi(phasemask, sp, sp, puti, p1 , putj, p2, N_int)
+        end if
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,2) * hji
+        enddo
+      end do
+    end do
+  end if
+
+  deallocate(hij_cache1,hij_cache2)
+end
+
--- a/src/cipsi_tc_bi_ortho/get_d1_good.irp.f
+++ b/src/cipsi_tc_bi_ortho/get_d1_good.irp.f
@ -0,0 +1,454 @@
+subroutine get_d1_new(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices should be okay for complex?
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in)  :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in)  :: phasemask(N_int,2)
+  logical, intent(in)            :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  integer(bit_kind)              :: det(N_int, 2)
+  double precision, intent(in)   :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  integer, intent(in)            :: h(0:2,2), p(0:4,2), sp
+  double precision, external     :: get_phase_bi
+  double precision, external     :: mo_two_e_integral_complex
+  logical                        :: ok
+
+  logical, allocatable           :: lbanned(:,:)
+  integer                        :: puti, putj, ma, mi, s1, s2, i, i1, i2, j
+  integer                        :: hfix, pfix, h1, h2, p1, p2, ib, k, l, mm
+
+  integer, parameter             :: turn2(2) = (/2,1/)
+  integer, parameter             :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer                        :: bant
+  double precision, allocatable :: hij_cache(:,:)
+  double precision               :: hij, tmp_rowij(N_states, mo_num), tmp_rowij2(N_states, mo_num)
+  double precision, allocatable :: hji_cache(:,:)
+  double precision               :: hji, tmp_rowji(N_states, mo_num), tmp_rowji2(N_states, mo_num)
+!  PROVIDE mo_integrals_map N_int
+!  print*,'in get_d1_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  allocate (lbanned(mo_num, 2))
+  allocate (hij_cache(mo_num,2))
+  allocate (hji_cache(mo_num,2))
+  lbanned = bannedOrb
+
+  do i=1, p(0,1)
+    lbanned(p(i,1), 1) = .true.
+  end do
+  do i=1, p(0,2)
+    lbanned(p(i,2), 2) = .true.
+  end do
+
+  ma = 1
+  if(p(0,2) >= 2) ma = 2
+  mi = turn2(ma)
+
+  bant = 1
+
+  if(sp == 3) then
+    !move MA
+    if(ma == 2) bant = 2
+    puti = p(1,mi)
+    hfix = h(1,ma)
+    p1 = p(1,ma)
+    p2 = p(2,ma)
+    if(.not. bannedOrb(puti, mi)) then
+!      call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2)
+       hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
+       hji_cache(mm,1) = mo_bi_ortho_tc_two_e(p1,p2,mm,hfix)
+       hji_cache(mm,2) = mo_bi_ortho_tc_two_e(p2,p1,mm,hfix)
+      enddo
+      !! <alpha|H|psi>
+      tmp_rowij = 0.d0
+      do putj=1, hfix-1
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hij = hij_cache(putj,1) - hij_cache(putj,2)
+        if (hij /= 0.d0) then
+          hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,1)
+          enddo
+        endif
+      end do
+      do putj=hfix+1, mo_num
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hij = hij_cache(putj,2) - hij_cache(putj,1)
+        if (hij /= 0.d0) then
+          hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,putj) = tmp_rowij(k,putj) + hij * coefs(k,1)
+          enddo
+        endif
+      end do
+
+      if(ma == 1) then
+        mat_r(1:N_states,1:mo_num,puti) = mat_r(1:N_states,1:mo_num,puti) + tmp_rowij(1:N_states,1:mo_num)
+      else
+        do l=1,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k,puti,l) = mat_r(k,puti,l) + tmp_rowij(k,l)
+          enddo
+        enddo
+      end if
+
+      !! <phi|H|alpha>
+      tmp_rowji = 0.d0
+      do putj=1, hfix-1
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hji = hji_cache(putj,1) - hji_cache(putj,2)
+        if (hji /= 0.d0) then
+          hji = hji * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,2)
+          enddo
+        endif
+      end do
+      do putj=hfix+1, mo_num
+        if(lbanned(putj, ma)) cycle
+        if(banned(putj, puti,bant)) cycle
+        hji = hji_cache(putj,2) - hji_cache(putj,1)
+        if (hji /= 0.d0) then
+          hji = hji * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowji(k,putj) = tmp_rowji(k,putj) + hji * coefs(k,2)
+          enddo
+        endif
+      end do
+
+      if(ma == 1) then
+        mat_l(1:N_states,1:mo_num,puti) = mat_l(1:N_states,1:mo_num,puti) + tmp_rowji(1:N_states,1:mo_num)
+      else
+        do l=1,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k,puti,l) = mat_l(k,puti,l) + tmp_rowji(k,l)
+          enddo
+        enddo
+      end if
+    end if
+
+    !MOVE MI
+    pfix = p(1,mi)
+    tmp_rowij  = 0.d0
+    tmp_rowij2 = 0.d0
+    tmp_rowji  = 0.d0
+    tmp_rowji2 = 0.d0
+!    call get_mo_two_e_integrals_complex(hfix,pfix,p1,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!    call get_mo_two_e_integrals_complex(hfix,pfix,p2,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+    do mm = 1, mo_num
+     hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p1)
+     hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,pfix,p2)
+     hji_cache(mm,1) = mo_bi_ortho_tc_two_e(pfix,p1,mm,hfix)
+     hji_cache(mm,2) = mo_bi_ortho_tc_two_e(pfix,p2,mm,hfix)
+    enddo
+    putj = p1
+    !! <alpha|H|psi>
+    do puti=1,mo_num !HOT
+      if(lbanned(puti,mi)) cycle
+      !p1 fixed
+      putj = p1
+      if(.not. banned(putj,puti,bant)) then
+        hij = hij_cache(puti,2)
+        if (hij /= 0.d0) then
+          hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,1)
+          enddo
+        endif
+      end if
+!      
+      putj = p2
+      if(.not. banned(putj,puti,bant)) then
+        hij = hij_cache(puti,1)
+        if (hij /= 0.d0) then
+          hij = hij * get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
+          do k=1,N_states
+            tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,1)
+          enddo
+        endif
+      end if
+    end do
+
+    if(mi == 1) then
+      mat_r(:,:,p1) = mat_r(:,:,p1) + tmp_rowij(:,:)
+      mat_r(:,:,p2) = mat_r(:,:,p2) + tmp_rowij2(:,:)
+    else
+      do l=1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij(k,l)
+          mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij2(k,l)
+        enddo
+      enddo
+    end if
+
+    putj = p1
+    !! <phi|H|alpha>
+    do puti=1,mo_num !HOT
+      if(lbanned(puti,mi)) cycle
+      !p1 fixed
+      putj = p1
+      if(.not. banned(putj,puti,bant)) then
+        hji = hji_cache(puti,2)
+        if (hji /= 0.d0) then
+          hji = hji * get_phase_bi(phasemask, ma, mi, hfix, p2, puti, pfix, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,2)
+          enddo
+        endif
+      end if
+!      
+      putj = p2
+      if(.not. banned(putj,puti,bant)) then
+        hji = hji_cache(puti,1)
+        if (hji /= 0.d0) then
+          hji = hji * get_phase_bi(phasemask, ma, mi, hfix, p1, puti, pfix, N_int)
+          do k=1,N_states
+            tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,2)
+          enddo
+        endif
+      end if
+    end do
+
+    if(mi == 1) then
+      mat_l(:,:,p1) = mat_l(:,:,p1) + tmp_rowji(:,:)
+      mat_l(:,:,p2) = mat_l(:,:,p2) + tmp_rowji2(:,:)
+    else
+      do l=1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji(k,l)
+          mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji2(k,l)
+        enddo
+      enddo
+    end if
+
+  else  ! sp /= 3
+
+    if(p(0,ma) == 3) then
+      do i=1,3
+        hfix = h(1,ma)
+        puti = p(i, ma)
+        p1 = p(turn3(1,i), ma)
+        p2 = p(turn3(2,i), ma)
+!        call get_mo_two_e_integrals_complex(hfix,p1,p2,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!        call get_mo_two_e_integrals_complex(hfix,p2,p1,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+        do mm = 1, mo_num
+         hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,p2)
+         hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,p1)
+         hji_cache(mm,1) = mo_bi_ortho_tc_two_e(p1,p2,mm,hfix)
+         hji_cache(mm,2) = mo_bi_ortho_tc_two_e(p2,p1,mm,hfix)
+        enddo
+    !! <alpha|H|psi>
+        tmp_rowij = 0.d0
+        do putj=1,hfix-1
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hij = hij_cache(putj,1) - hij_cache(putj,2)
+          if (hij /= 0.d0) then
+            hij = hij * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+            tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,1)
+          endif
+        end do
+        do putj=hfix+1,mo_num
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hij = hij_cache(putj,2) - hij_cache(putj,1)
+          if (hij /= 0.d0) then
+            hij = hij * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+            tmp_rowij(:,putj) = tmp_rowij(:,putj) + hij * coefs(:,1)
+          endif
+        end do
+
+        mat_r(:, :puti-1, puti) = mat_r(:, :puti-1, puti) + tmp_rowij(:,:puti-1)
+        do l=puti,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, l) = mat_r(k, puti,l) + tmp_rowij(k,l)
+          enddo
+        enddo
+    !! <phi|H|alpha>
+        tmp_rowji = 0.d0
+        do putj=1,hfix-1
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hji = hji_cache(putj,1) - hji_cache(putj,2)
+          if (hji /= 0.d0) then
+            hji = hji * get_phase_bi(phasemask, ma, ma, putj, p1, hfix, p2, N_int)
+            tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,2)
+          endif
+        end do
+        do putj=hfix+1,mo_num
+          if(banned(putj,puti,1)) cycle
+          if(lbanned(putj,ma)) cycle
+          hji = hji_cache(putj,2) - hji_cache(putj,1)
+          if (hji /= 0.d0) then
+            hji = hji * get_phase_bi(phasemask, ma, ma, hfix, p1, putj, p2, N_int)
+            tmp_rowji(:,putj) = tmp_rowji(:,putj) + hji * coefs(:,2)
+          endif
+        end do
+
+        mat_l(:, :puti-1, puti) = mat_l(:, :puti-1, puti) + tmp_rowji(:,:puti-1)
+        do l=puti,mo_num
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, puti, l) = mat_l(k, puti,l) + tmp_rowji(k,l)
+          enddo
+        enddo
+      end do
+    else
+      hfix = h(1,mi)
+      pfix = p(1,mi)
+      p1 = p(1,ma)
+      p2 = p(2,ma)
+      tmp_rowij =  0.d0
+      tmp_rowij2 = 0.d0
+      tmp_rowji =  0.d0
+      tmp_rowji2 = 0.d0
+!      call get_mo_two_e_integrals_complex(hfix,p1,pfix,mo_num,hij_cache(1,1),mo_integrals_map,mo_integrals_map_2)
+!      call get_mo_two_e_integrals_complex(hfix,p2,pfix,mo_num,hij_cache(1,2),mo_integrals_map,mo_integrals_map_2)
+      do mm = 1, mo_num
+       hij_cache(mm,1) = mo_bi_ortho_tc_two_e(mm,hfix,p1,pfix)
+       hij_cache(mm,2) = mo_bi_ortho_tc_two_e(mm,hfix,p2,pfix)
+       hji_cache(mm,1) = mo_bi_ortho_tc_two_e(p1,pfix,mm,hfix)
+       hji_cache(mm,2) = mo_bi_ortho_tc_two_e(p2,pfix,mm,hfix)
+      enddo
+      putj = p2
+    !! <alpha|H|psi>
+      do puti=1,mo_num
+        if(lbanned(puti,ma)) cycle
+        putj = p2
+        if(.not. banned(puti,putj,1)) then
+          hij = hij_cache(puti,1)
+          if (hij /= 0.d0) then
+            hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              tmp_rowij(k,puti) = tmp_rowij(k,puti) + hij * coefs(k,1)
+            enddo
+          endif
+        end if
+
+        putj = p1
+        if(.not. banned(puti,putj,1)) then
+          hij = hij_cache(puti,2)
+          if (hij /= 0.d0) then
+            hij = hij * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
+            do k=1,N_states
+              tmp_rowij2(k,puti) = tmp_rowij2(k,puti) + hij * coefs(k,1)
+            enddo
+          endif
+        end if
+      end do
+      mat_r(:,:p2-1,p2) = mat_r(:,:p2-1,p2) + tmp_rowij(:,:p2-1)
+      do l=p2,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p2,l) = mat_r(k,p2,l) + tmp_rowij(k,l)
+        enddo
+      enddo
+      mat_r(:,:p1-1,p1) = mat_r(:,:p1-1,p1) + tmp_rowij2(:,:p1-1)
+      do l=p1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_r(k,p1,l) = mat_r(k,p1,l) + tmp_rowij2(k,l)
+        enddo
+      enddo
+
+
+    !! <phi|H|alpha>
+      putj = p2
+      do puti=1,mo_num
+        if(lbanned(puti,ma)) cycle
+        putj = p2
+        if(.not. banned(puti,putj,1)) then
+          hji = hji_cache(puti,1)
+          if (hji /= 0.d0) then
+            hji = hji * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p1, N_int)
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              tmp_rowji(k,puti) = tmp_rowji(k,puti) + hji * coefs(k,2)
+            enddo
+          endif
+        end if
+
+        putj = p1
+        if(.not. banned(puti,putj,1)) then
+          hji = hji_cache(puti,2)
+          if (hji /= 0.d0) then
+            hji = hji * get_phase_bi(phasemask, mi, ma, hfix, pfix, puti, p2, N_int)
+            do k=1,N_states
+              tmp_rowji2(k,puti) = tmp_rowji2(k,puti) + hji * coefs(k,2)
+            enddo
+          endif
+        end if
+      end do
+      mat_l(:,:p2-1,p2) = mat_l(:,:p2-1,p2) + tmp_rowji(:,:p2-1)
+      do l=p2,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k,p2,l) = mat_l(k,p2,l) + tmp_rowji(k,l)
+        enddo
+      enddo
+      mat_l(:,:p1-1,p1) = mat_l(:,:p1-1,p1) + tmp_rowji2(:,:p1-1)
+      do l=p1,mo_num
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          mat_l(k,p1,l) = mat_l(k,p1,l) + tmp_rowji2(k,l)
+        enddo
+      enddo
+    end if
+  end if
+  deallocate(lbanned,hij_cache, hji_cache)
+
+ !! MONO
+    if(sp == 3) then
+      s1 = 1
+      s2 = 2
+    else
+      s1 = sp
+      s2 = sp
+    end if
+
+    do i1=1,p(0,s1)
+      ib = 1
+      if(s1 == s2) ib = i1+1
+      do i2=ib,p(0,s2)
+        p1 = p(i1,s1)
+        p2 = p(i2,s2)
+        if(bannedOrb(p1, s1) .or. bannedOrb(p2, s2) .or. banned(p1, p2, 1)) cycle
+        call apply_particles(mask, s1, p1, s2, p2, det, ok, N_int)
+        ! gen is a selector; mask is ionized generator; det is alpha
+        ! hij is contribution to <psi|H|alpha>
+!        call i_h_j_complex(gen, det, N_int, hij)
+        call htilde_mu_mat_opt_bi_ortho_no_3e(det, gen, N_int, hij)
+        call htilde_mu_mat_opt_bi_ortho_no_3e(gen, det, N_int, hji)
+        !DIR$ LOOP COUNT AVG(4)
+        do k=1,N_states
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,1) * dconjg(hij)
+          mat_r(k, p1, p2) = mat_r(k, p1, p2) + coefs(k,1) * hij
+          mat_l(k, p1, p2) = mat_l(k, p1, p2) + coefs(k,2) * hji
+        enddo
+      end do
+    end do
+end
+
--- a/src/cipsi_tc_bi_ortho/get_d2_good.irp.f
+++ b/src/cipsi_tc_bi_ortho/get_d2_good.irp.f
@ -0,0 +1,308 @@
+
+subroutine get_d2_new(gen, phasemask, bannedOrb, banned, mat_l, mat_r, mask, h, p, sp, coefs)
+  !todo: indices/conjg should be correct for complex
+  use bitmasks
+  implicit none
+
+  integer(bit_kind), intent(in) :: mask(N_int, 2), gen(N_int, 2)
+  integer(bit_kind), intent(in) :: phasemask(N_int,2)
+  logical, intent(in) :: bannedOrb(mo_num, 2), banned(mo_num, mo_num,2)
+  double precision, intent(in) :: coefs(N_states,2)
+  double precision, intent(inout) :: mat_r(N_states, mo_num, mo_num)
+  double precision, intent(inout) :: mat_l(N_states, mo_num, mo_num)
+  integer, intent(in) :: h(0:2,2), p(0:4,2), sp
+
+  double precision, external :: get_phase_bi
+
+  integer :: i, j, k, tip, ma, mi, puti, putj
+  integer :: h1, h2, p1, p2, i1, i2
+  double precision :: phase
+  double precision :: hij,hji
+
+  integer, parameter:: turn2d(2,3,4) = reshape((/0,0, 0,0, 0,0,  3,4, 0,0, 0,0,  2,4, 1,4, 0,0,  2,3, 1,3, 1,2 /), (/2,3,4/))
+  integer, parameter :: turn2(2) = (/2, 1/)
+  integer, parameter :: turn3(2,3) = reshape((/2,3,  1,3, 1,2/), (/2,3/))
+
+  integer :: bant
+  bant = 1
+!  print*, 'in get_d2_new'
+!  call debug_det(gen,N_int)
+!  print*,'coefs',coefs(1,:)
+
+  tip = p(0,1) * p(0,2) ! number of alpha particles times number of beta particles
+
+  ma = sp !1:(alpha,alpha); 2:(b,b); 3:(a,b)
+  if(p(0,1) > p(0,2)) ma = 1 ! more alpha particles than beta particles
+  if(p(0,1) < p(0,2)) ma = 2 ! fewer alpha particles than beta particles
+  mi = mod(ma, 2) + 1
+
+  if(sp == 3) then ! if one alpha and one beta xhole 
+    !(where xholes refer to the ionizations from the generator, not the holes occupied in the ionized generator)
+    if(ma == 2) bant = 2 ! if more beta particles than alpha particles
+
+    if(tip == 3) then ! if 3 of one particle spin and 1 of the other particle spin
+      puti = p(1, mi)
+      if(bannedOrb(puti, mi)) return
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+
+      !! <alpha|H|psi>
+      do i = 1, 3    ! loop over all 3 combinations of 2 particles with spin ma
+        putj = p(i, ma)
+        if(banned(putj,puti,bant)) cycle
+        i1 = turn3(1,i)
+        i2 = turn3(2,i)
+        p1 = p(i1, ma)
+        p2 = p(i2, ma)
+        
+     ! |G> = |psi_{gen,i}>
+     ! |G'> = a_{x1} a_{x2} |G>
+     ! |alpha> = a_{puti}^{\dagger} a_{putj}^{\dagger} |G'>
+     ! |alpha> = t_{x1,x2}^{puti,putj} |G>
+     ! hij = <psi_{selectors,i}|H|alpha>
+     ! |alpha> = t_{p1,p2}^{h1,h2}|psi_{selectors,i}>
+        !todo: <i|H|j>  =  (<h1,h2|p1,p2> - <h1,h2|p2,p1>) * phase
+        !    <psi|H|j> +=  dconjg(c_i) * <i|H|j>
+        !      <j|H|i>  =  (<p1,p2|h1,h2> - <p2,p1|h1,h2>) * phase
+        !    <j|H|psi> +=  <j|H|i> * c_i
+!        hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2, p1, h1, h2)
+
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e( h1, h2, p2, p1)
+        if (hij == 0.d0) cycle
+
+        ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!        hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+        hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+
+        if(ma == 1) then ! if particle spins are (alpha,alpha,alpha,beta), then puti is beta and putj is alpha
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,1) * hij
+          enddo
+        else            ! if particle spins are (beta,beta,beta,alpha), then puti is alpha and putj is beta
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,1) * hij
+          enddo
+        end if
+      end do
+      !! <phi|H|alpha>
+      do i = 1, 3    ! loop over all 3 combinations of 2 particles with spin ma
+        putj = p(i, ma)
+        if(banned(putj,puti,bant)) cycle
+        i1 = turn3(1,i)
+        i2 = turn3(2,i)
+        p1 = p(i1, ma)
+        p2 = p(i2, ma)
+        hji = mo_bi_ortho_tc_two_e(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e( p2, p1, h1, h2)
+        if (hji == 0.d0) cycle
+        hji = hji * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+
+        if(ma == 1) then ! if particle spins are (alpha,alpha,alpha,beta), then puti is beta and putj is alpha
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,2) * hji
+          enddo
+        else            ! if particle spins are (beta,beta,beta,alpha), then puti is alpha and putj is beta
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,2) * hji
+          enddo
+        end if
+      end do
+    else ! if 2 alpha and 2 beta particles
+      h1 = h(1,1)
+      h2 = h(1,2)
+      !! <alpha|H|psi>
+      do j = 1,2 ! loop over all 4 combinations of one alpha and one beta particle
+        putj = p(j, 2)
+        if(bannedOrb(putj, 2)) cycle
+        p2 = p(turn2(j), 2)
+        do i = 1,2
+          puti = p(i, 1)
+          if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle
+          p1 = p(turn2(i), 1)
+    ! hij = <psi_{selectors,i}|H|alpha> 
+!          hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+          hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2 )
+          if (hij /= 0.d0) then
+            ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!            hij = dconjg(hij) * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            hij = hij * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,1) * hij
+            enddo
+          endif
+        end do
+      end do
+      !! <phi|H|alpha>
+      do j = 1,2 ! loop over all 4 combinations of one alpha and one beta particle
+        putj = p(j, 2)
+        if(bannedOrb(putj, 2)) cycle
+        p2 = p(turn2(j), 2)
+        do i = 1,2
+          puti = p(i, 1)
+          if(banned(puti,putj,bant) .or. bannedOrb(puti,1)) cycle
+          p1 = p(turn2(i), 1)
+          hji = mo_bi_ortho_tc_two_e( p1, p2, h1, h2)
+          if (hji /= 0.d0) then
+            hji = hji * get_phase_bi(phasemask, 1, 2, h1, p1, h2, p2, N_int)
+            !DIR$ LOOP COUNT AVG(4)
+            do k=1,N_states
+              mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,2) * hji
+            enddo
+          endif
+        end do
+      end do
+    end if
+
+  else ! if holes are (a,a) or (b,b)
+    if(tip == 0) then ! if particles are (a,a,a,a) or (b,b,b,b)
+      h1 = h(1, ma)
+      h2 = h(2, ma)
+      !! <alpha|H|psi>
+      do i=1,3
+        puti = p(i, ma)
+        if(bannedOrb(puti,ma)) cycle
+        do j=i+1,4
+          putj = p(j, ma)
+          if(bannedOrb(putj,ma)) cycle
+          if(banned(puti,putj,1)) cycle
+
+          i1 = turn2d(1, i, j)
+          i2 = turn2d(2, i, j)
+          p1 = p(i1, ma)
+          p2 = p(i2, ma)
+!          hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+          hij = mo_bi_ortho_tc_two_e(h1, h2, p1, p2) - mo_bi_ortho_tc_two_e(h1, h2, p2,p1 )
+          if (hij == 0.d0) cycle
+
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          hij = dconjg(hij) * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          hij = hij * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) +coefs(k,1) * hij
+          enddo
+        end do
+      end do
+      !! <phi|H|alpha>
+      do i=1,3
+        puti = p(i, ma)
+        if(bannedOrb(puti,ma)) cycle
+        do j=i+1,4
+          putj = p(j, ma)
+          if(bannedOrb(putj,ma)) cycle
+          if(banned(puti,putj,1)) cycle
+          i1 = turn2d(1, i, j)
+          i2 = turn2d(2, i, j)
+          p1 = p(i1, ma)
+          p2 = p(i2, ma)
+          hji = mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1,h1, h2 )
+          if (hji == 0.d0) cycle
+          hji = hji * get_phase_bi(phasemask, ma, ma, h1, p1, h2, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) +coefs(k,2) * hji
+          enddo
+        end do
+      end do
+    else if(tip == 3) then ! if particles are (a,a,a,b) (ma=1,mi=2) or (a,b,b,b) (ma=2,mi=1)
+      h1 = h(1, mi)
+      h2 = h(1, ma)
+      p1 = p(1, mi)
+      !! <alpha|H|psi>
+      do i=1,3
+        puti = p(turn3(1,i), ma)
+        if(bannedOrb(puti,ma)) cycle
+        putj = p(turn3(2,i), ma)
+        if(bannedOrb(putj,ma)) cycle
+        if(banned(puti,putj,1)) cycle
+        p2 = p(i, ma)
+
+!        hij = mo_bi_ortho_tc_two_e(p1, p2, h1, h2)
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = mo_bi_ortho_tc_two_e(h1, h2,p1, p2 )
+        if (hij == 0.d0) cycle
+
+        ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!        hij = dconjg(hij) * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        hij = hij * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        if (puti < putj) then
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,1) * hij
+          enddo
+        else
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, putj, puti) = mat_r(k, putj, puti) + coefs(k,1) * hij
+          enddo
+        endif
+      end do
+      !! <phi|H|alpha>
+      do i=1,3
+        puti = p(turn3(1,i), ma)
+        if(bannedOrb(puti,ma)) cycle
+        putj = p(turn3(2,i), ma)
+        if(bannedOrb(putj,ma)) cycle
+        if(banned(puti,putj,1)) cycle
+        p2 = p(i, ma)
+        hji = mo_bi_ortho_tc_two_e(p1, p2,h1, h2)
+        if (hji == 0.d0) cycle
+        hji = hji * get_phase_bi(phasemask, mi, ma, h1, p1, h2, p2, N_int)
+        if (puti < putj) then
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,2) * hji
+          enddo
+        else
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, putj, puti) = mat_l(k, putj, puti) + coefs(k,2) * hji
+          enddo
+        endif
+      end do
+    else ! tip == 4  (a,a,b,b)
+      puti = p(1, sp)
+      putj = p(2, sp)
+      if(.not. banned(puti,putj,1)) then
+        p1 = p(1, mi)
+        p2 = p(2, mi)
+        h1 = h(1, mi)
+        h2 = h(2, mi)
+      !! <alpha|H|psi>
+!        hij = (mo_bi_ortho_tc_two_e(p1, p2, h1, h2) - mo_bi_ortho_tc_two_e(p2,p1, h1, h2))
+!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!! 
+        ! take the transpose of what's written above because later use the complex conjugate 
+        hij = (mo_bi_ortho_tc_two_e(h1, h2,p1, p2) - mo_bi_ortho_tc_two_e(h1, h2, p2,p1))
+        if (hij /= 0.d0) then
+          ! take conjugate to get contribution to <alpha|H|psi> instead of <psi|H|alpha>
+!          hij = dconjg(hij) * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          hij = hij * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_r(k, puti, putj) = mat_r(k, puti, putj) + coefs(k,1) * hij
+          enddo
+        end if
+      !! <phi|H|alpha>
+        hji = (mo_bi_ortho_tc_two_e(p1, p2,h1, h2) - mo_bi_ortho_tc_two_e( p2,p1, h1, h2))
+        if (hji /= 0.d0) then
+          hji = hji * get_phase_bi(phasemask, mi, mi, h1, p1, h2, p2, N_int)
+          !DIR$ LOOP COUNT AVG(4)
+          do k=1,N_states
+            mat_l(k, puti, putj) = mat_l(k, puti, putj) + coefs(k,2) * hji
+          enddo
+        end if
+      end if
+    end if
+  end if
+end
--- a/src/cipsi_tc_bi_ortho/lock_2rdm.irp.f
+++ b/src/cipsi_tc_bi_ortho/lock_2rdm.irp.f
--- a/src/cipsi_tc_bi_ortho/pouet
+++ b/src/cipsi_tc_bi_ortho/pouet
@ -0,0 +1,33 @@
+
+          if(dabs(psi_h_alpha*alpha_h_psi - psi_h_alpha_tmp*alpha_h_psi_tmp).gt.1.d-10)then
+!!!          print*,'---'
+!!!          print*,psi_h_alpha    *alpha_h_psi,    psi_h_alpha,    alpha_h_psi  
+!!!          print*,psi_h_alpha_tmp*alpha_h_psi_tmp,psi_h_alpha_tmp,alpha_h_psi_tmp  
+           call debug_det(det,N_int)
+            print*,dabs(psi_h_alpha*alpha_h_psi - psi_h_alpha_tmp*alpha_h_psi_tmp),psi_h_alpha    *alpha_h_psi,psi_h_alpha_tmp*alpha_h_psi_tmp
+            print*,'-- Good '
+            print*,   psi_h_alpha,    alpha_h_psi  
+            print*,'-- bad '
+            print*,psi_h_alpha_tmp,alpha_h_psi_tmp  
+            print*,'-- details good'
+          double precision :: accu_1, accu_2
+          accu_1 = 0.d0
+          accu_2 = 0.d0
+          do iii = 1, N_det
+            call get_excitation_degree( psi_det(1,1,iii), det, degree, N_int)
+            call htilde_mu_mat_bi_ortho_tot(psi_det(1,1,iii), det, N_int, i_h_alpha)
+            call htilde_mu_mat_bi_ortho_tot(det, psi_det(1,1,iii), N_int, alpha_h_i)
+            print*,iii,degree,i_h_alpha,alpha_h_i
+            accu_1 += i_h_alpha
+            accu_2 += alpha_h_i
+            print*,accu_1,accu_2
+            
+          enddo
+!!!          if(dabs(psi_h_alpha*alpha_h_psi).gt.1.d-10)then
+!!!          print*,p1,p2
+!!!          print*,det(1,1), det(1,2)
+!!!          call debug_det(det,N_int)
+!!!          print*,psi_h_alpha    *alpha_h_psi,    psi_h_alpha,    alpha_h_psi  
+!!!          print*,psi_h_alpha_tmp*alpha_h_psi_tmp,psi_h_alpha_tmp,alpha_h_psi_tmp  
+!!!          print*, dabs(psi_h_alpha*alpha_h_psi - psi_h_alpha_tmp*alpha_h_psi_tmp),& 
+!!!                   psi_h_alpha    *alpha_h_psi,psi_h_alpha_tmp*alpha_h_psi_tmp
--- a/src/cipsi_tc_bi_ortho/pt2.irp.f
+++ b/src/cipsi_tc_bi_ortho/pt2.irp.f
@ -0,0 +1,89 @@
+subroutine pt2_tc_bi_ortho
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+  integer                        :: i,j,k,ndet
+  double precision, allocatable  :: zeros(:)
+  integer                        :: to_select
+  type(pt2_type)                 :: pt2_data, pt2_data_err
+  logical, external              :: qp_stop
+  logical                        :: print_pt2
+
+  double precision :: rss
+  double precision, external :: memory_of_double
+  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
+  double precision, allocatable :: ept2(:), pt1(:),extrap_energy(:)
+  PROVIDE H_apply_buffer_allocated distributed_davidson mo_two_e_integrals_in_map
+
+  print*,'Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+  enddo
+  N_iter = 1
+  threshold_generators = 1.d0
+  SOFT_TOUCH threshold_generators
+
+  rss = memory_of_double(N_states)*4.d0
+  call check_mem(rss,irp_here)
+
+  allocate (zeros(N_states))
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+
+  double precision               :: hf_energy_ref
+  logical                        :: has
+  double precision               :: relative_error
+
+  relative_error=PT2_relative_error
+
+  zeros = 0.d0
+  pt2_data % pt2   = -huge(1.e0)
+  pt2_data % rpt2  = -huge(1.e0)
+  pt2_data % overlap= 0.d0
+  pt2_data % variance = huge(1.e0)
+
+  if (s2_eig) then
+    call make_s2_eigenfunction
+  endif
+  print_pt2 = .False.
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  call routine_save_right
+
+  if (N_det > N_det_max) then
+    psi_det(1:N_int,1:2,1:N_det) = psi_det_sorted_tc_gen(1:N_int,1:2,1:N_det)
+    psi_coef(1:N_det,1:N_states) = psi_coef_sorted_tc_gen(1:N_det,1:N_states)
+    N_det = N_det_max
+    soft_touch N_det psi_det psi_coef
+    if (s2_eig) then
+      call make_s2_eigenfunction
+    endif
+    print_pt2 = .False.
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+  endif
+
+  allocate(ept2(1000),pt1(1000),extrap_energy(100))
+
+  correlation_energy_ratio = 0.d0
+
+! thresh_it_dav  = 5.d-5
+! soft_touch thresh_it_dav
+
+  print_pt2 = .True.
+  to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
+  to_select = max(N_states_diag, to_select)
+
+  E_denom = E_tc ! TC Energy of the current wave function 
+  call pt2_dealloc(pt2_data)
+  call pt2_dealloc(pt2_data_err)
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+  call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
+
+  N_iter += 1
+
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+
+end
+
--- a/src/cipsi_tc_bi_ortho/pt2_stoch_routines.irp.f
+++ b/src/cipsi_tc_bi_ortho/pt2_stoch_routines.irp.f
@ -0,0 +1,869 @@
+BEGIN_PROVIDER [ integer, pt2_stoch_istate ]
+ implicit none
+ BEGIN_DOC
+ ! State for stochatsic PT2
+ END_DOC
+ pt2_stoch_istate = 1
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer, pt2_F, (N_det_generators) ]
+&BEGIN_PROVIDER [ integer, pt2_n_tasks_max ]
+  implicit none
+  logical, external :: testTeethBuilding
+  integer :: i,j
+  pt2_n_tasks_max = elec_alpha_num*elec_alpha_num + elec_alpha_num*elec_beta_num  - n_core_orb*2
+  pt2_n_tasks_max = min(pt2_n_tasks_max,1+N_det_generators/10000)
+  call write_int(6,pt2_n_tasks_max,'pt2_n_tasks_max')
+
+  pt2_F(:) = max(int(sqrt(float(pt2_n_tasks_max))),1)
+  do i=1,pt2_n_0(1+pt2_N_teeth/4)
+    pt2_F(i) = pt2_n_tasks_max*pt2_min_parallel_tasks
+  enddo
+  do i=1+pt2_n_0(pt2_N_teeth-pt2_N_teeth/4), pt2_n_0(pt2_N_teeth-pt2_N_teeth/10)
+    pt2_F(i) = pt2_min_parallel_tasks
+  enddo
+  do i=1+pt2_n_0(pt2_N_teeth-pt2_N_teeth/10), N_det_generators
+    pt2_F(i) = 1
+  enddo
+
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer, pt2_N_teeth ]
+&BEGIN_PROVIDER [ integer, pt2_minDetInFirstTeeth ]
+  implicit none
+  logical, external :: testTeethBuilding
+
+  if(N_det_generators < 500) then
+    pt2_minDetInFirstTeeth = 1
+    pt2_N_teeth = 1
+  else
+    pt2_minDetInFirstTeeth = min(5, N_det_generators)
+    do pt2_N_teeth=100,2,-1
+      if(testTeethBuilding(pt2_minDetInFirstTeeth, pt2_N_teeth)) exit
+    end do
+  end if
+  call write_int(6,pt2_N_teeth,'Number of comb teeth')
+END_PROVIDER
+
+
+logical function testTeethBuilding(minF, N)
+  implicit none
+  integer, intent(in) :: minF, N
+  integer :: n0, i
+  double precision :: u0, Wt, r
+
+  double precision, allocatable :: tilde_w(:), tilde_cW(:)
+  integer, external :: dress_find_sample
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+
+  rss = memory_of_double(2*N_det_generators+1)
+  call check_mem(rss,irp_here)
+
+  allocate(tilde_w(N_det_generators), tilde_cW(0:N_det_generators))
+
+  double precision :: norm2
+  norm2 = 0.d0
+  do i=N_det_generators,1,-1
+    tilde_w(i)  = psi_coef_sorted_tc_gen(i,pt2_stoch_istate) * &
+                  psi_coef_sorted_tc_gen(i,pt2_stoch_istate)
+    norm2 = norm2 + tilde_w(i)
+  enddo
+
+  f = 1.d0/norm2
+  tilde_w(:) = tilde_w(:) * f
+
+  tilde_cW(0) = -1.d0
+  do i=1,N_det_generators
+    tilde_cW(i) = tilde_cW(i-1) + tilde_w(i)
+  enddo
+  tilde_cW(:) = tilde_cW(:) + 1.d0
+  deallocate(tilde_w)
+
+  n0 = 0
+  testTeethBuilding = .false.
+  double precision :: f
+  integer :: minFN
+  minFN = N_det_generators - minF * N
+  f = 1.d0/dble(N)
+  do
+    u0 = tilde_cW(n0)
+    r = tilde_cW(n0 + minF)
+    Wt = (1d0 - u0) * f
+    if (dabs(Wt) <= 1.d-3) then
+      exit
+    endif
+    if(Wt >= r - u0) then
+       testTeethBuilding = .true.
+       exit
+    end if
+    n0 += 1
+    if(n0 > minFN) then
+      exit
+    end if
+  end do
+  deallocate(tilde_cW)
+
+end function
+
+
+
+subroutine ZMQ_pt2(E, pt2_data, pt2_data_err, relative_error, N_in)
+  use f77_zmq
+  use selection_types
+
+  implicit none
+
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket, zmq_socket_pull
+  integer, intent(in)            :: N_in
+!  integer, intent(inout)         :: N_in
+  double precision, intent(in)   :: relative_error, E(N_states)
+  type(pt2_type), intent(inout)  :: pt2_data, pt2_data_err
+!
+  integer                        :: i, N
+
+  double precision               :: state_average_weight_save(N_states), w(N_states,4)
+  integer(ZMQ_PTR), external     :: new_zmq_to_qp_run_socket
+  type(selection_buffer)         :: b
+
+  PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+  PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+  PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+  PROVIDE psi_bilinear_matrix_transp_order psi_selectors_coef_transp_tc psi_det_sorted_tc
+  PROVIDE psi_det_hii selection_weight pseudo_sym
+  PROVIDE n_act_orb n_inact_orb n_core_orb n_virt_orb n_del_orb seniority_max
+  PROVIDE excitation_beta_max  excitation_alpha_max excitation_max
+  PROVIDE psi_selectors_rcoef_bi_orth_transp psi_selectors_lcoef_bi_orth_transp
+
+  if (h0_type == 'CFG') then
+    PROVIDE psi_configuration_hii det_to_configuration
+  endif
+
+  if (N_det <= max(4,N_states) .or. pt2_N_teeth < 2) then
+    print*,'ZMQ_selection'
+    call ZMQ_selection(N_in, pt2_data)
+  else
+    print*,'else ZMQ_selection'
+
+    N = max(N_in,1) * N_states
+    state_average_weight_save(:) = state_average_weight(:)
+    if (int(N,8)*2_8 > huge(1)) then
+      print *,  irp_here, ': integer too large'
+      stop -1
+    endif
+    call create_selection_buffer(N, N*2, b)
+    ASSERT (associated(b%det))
+    ASSERT (associated(b%val))
+
+    do pt2_stoch_istate=1,N_states
+      state_average_weight(:) = 0.d0
+      state_average_weight(pt2_stoch_istate) = 1.d0
+      TOUCH state_average_weight pt2_stoch_istate selection_weight
+
+      PROVIDE nproc pt2_F mo_two_e_integrals_in_map mo_one_e_integrals pt2_w
+      PROVIDE psi_selectors pt2_u pt2_J pt2_R
+      call new_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'pt2')
+
+      integer, external              :: zmq_put_psi
+      integer, external              :: zmq_put_N_det_generators
+      integer, external              :: zmq_put_N_det_selectors
+      integer, external              :: zmq_put_dvector
+      integer, external              :: zmq_put_ivector
+      if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+        stop 'Unable to put psi on ZMQ server'
+      endif
+      if (zmq_put_N_det_generators(zmq_to_qp_run_socket, 1) == -1) then
+        stop 'Unable to put N_det_generators on ZMQ server'
+      endif
+      if (zmq_put_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) then
+        stop 'Unable to put N_det_selectors on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',pt2_e0_denominator,size(pt2_e0_denominator)) == -1) then
+        stop 'Unable to put energy on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) then
+        stop 'Unable to put state_average_weight on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) then
+        stop 'Unable to put selection_weight on ZMQ server'
+      endif
+      if (zmq_put_ivector(zmq_to_qp_run_socket,1,'pt2_stoch_istate',pt2_stoch_istate,1) == -1) then
+        stop 'Unable to put pt2_stoch_istate on ZMQ server'
+      endif
+      if (zmq_put_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) then
+        stop 'Unable to put threshold_generators on ZMQ server'
+      endif
+
+
+      integer, external :: add_task_to_taskserver
+      character(300000) :: task
+
+      integer :: j,k,ipos,ifirst
+      ifirst=0
+
+      ipos=0
+      do i=1,N_det_generators
+        if (pt2_F(i) > 1) then
+          ipos += 1
+        endif
+      enddo
+      call write_int(6,sum(pt2_F),'Number of tasks')
+      call write_int(6,ipos,'Number of fragmented tasks')
+
+      ipos=1
+      do i= 1, N_det_generators
+        do j=1,pt2_F(pt2_J(i))
+          write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, pt2_J(i), N_in
+          ipos += 30
+          if (ipos > 300000-30) then
+            if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+              stop 'Unable to add task to task server'
+            endif
+            ipos=1
+            if (ifirst == 0) then
+              ifirst=1
+              if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+                print *,  irp_here, ': Failed in zmq_set_running'
+              endif
+            endif
+          endif
+        end do
+      enddo
+      if (ipos > 1) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task to task server'
+        endif
+      endif
+
+      integer, external :: zmq_set_running
+      if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+        print *,  irp_here, ': Failed in zmq_set_running'
+      endif
+
+
+      double precision :: mem_collector, mem, rss
+
+      call resident_memory(rss)
+
+      mem_collector = 8.d0 *                  & ! bytes
+            ( 1.d0*pt2_n_tasks_max            & ! task_id, index
+            + 0.635d0*N_det_generators        & ! f,d
+            + pt2_n_tasks_max*pt2_type_size(N_states) & ! pt2_data_task
+            + N_det_generators*pt2_type_size(N_states)  & ! pt2_data_I
+            + 4.d0*(pt2_N_teeth+1)            & ! S, S2, T2, T3
+            + 1.d0*(N_int*2.d0*N + N)         & ! selection buffer
+            + 1.d0*(N_int*2.d0*N + N)         & ! sort selection buffer
+            ) / 1024.d0**3
+
+      integer :: nproc_target, ii
+      nproc_target = nthreads_pt2
+      ii = min(N_det, (elec_alpha_num*(mo_num-elec_alpha_num))**2)
+
+      do
+        mem = mem_collector +                   & !
+              nproc_target * 8.d0 *             & ! bytes
+              ( 0.5d0*pt2_n_tasks_max           & ! task_id
+              + 64.d0*pt2_n_tasks_max           & ! task
+              + pt2_type_size(N_states)*pt2_n_tasks_max*N_states   & ! pt2, variance, overlap
+              + 1.d0*pt2_n_tasks_max            & ! i_generator, subset
+              + 1.d0*(N_int*2.d0*ii+ ii)        & ! selection buffer
+              + 1.d0*(N_int*2.d0*ii+ ii)        & ! sort selection buffer
+              + 2.0d0*(ii)                      & ! preinteresting, interesting,
+                                                  ! prefullinteresting, fullinteresting
+              + 2.0d0*(N_int*2*ii)              & ! minilist, fullminilist
+              + 1.0d0*(N_states*mo_num*mo_num)  & ! mat
+              ) / 1024.d0**3
+
+        if (nproc_target == 0) then
+          call check_mem(mem,irp_here)
+          nproc_target = 1
+          exit
+        endif
+
+        if (mem+rss < qp_max_mem) then
+          exit
+        endif
+
+        nproc_target = nproc_target - 1
+
+      enddo
+      call write_int(6,nproc_target,'Number of threads for PT2')
+      call write_double(6,mem,'Memory (Gb)')
+
+      call omp_set_max_active_levels(1)
+
+
+      print '(A)', '========== ======================= ===================== ===================== ==========='
+      print '(A)', ' Samples          Energy                Variance               Norm^2          Seconds'
+      print '(A)', '========== ======================= ===================== ===================== ==========='
+
+      PROVIDE global_selection_buffer
+
+      !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(nproc_target+1)            &
+          !$OMP  PRIVATE(i)
+      i = omp_get_thread_num()
+      if (i==0) then
+
+        call pt2_collector(zmq_socket_pull, E(pt2_stoch_istate),relative_error, pt2_data, pt2_data_err, b, N)
+        pt2_data % rpt2(pt2_stoch_istate) =  &
+          pt2_data % pt2(pt2_stoch_istate)/(1.d0+pt2_data % overlap(pt2_stoch_istate,pt2_stoch_istate))
+
+        !TODO : We should use here the correct formula for the error of X/Y
+        pt2_data_err % rpt2(pt2_stoch_istate) =  &
+          pt2_data_err % pt2(pt2_stoch_istate)/(1.d0 + pt2_data % overlap(pt2_stoch_istate,pt2_stoch_istate))
+
+      else
+        call pt2_slave_inproc(i)
+      endif
+      !$OMP END PARALLEL
+      call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'pt2')
+      call omp_set_max_active_levels(8)
+
+      print '(A)', '========== ======================= ===================== ===================== ==========='
+
+    do k=1,N_states
+      pt2_overlap(pt2_stoch_istate,k) = pt2_data % overlap(k,pt2_stoch_istate)
+    enddo
+    SOFT_TOUCH pt2_overlap
+
+    enddo
+    FREE pt2_stoch_istate
+
+    ! Symmetrize overlap
+    do j=2,N_states
+     do i=1,j-1
+       pt2_overlap(i,j) = 0.5d0 * (pt2_overlap(i,j) + pt2_overlap(j,i))
+       pt2_overlap(j,i) = pt2_overlap(i,j)
+     enddo
+    enddo
+
+    print *, 'Overlap of perturbed states:'
+    do k=1,N_states
+      print *, pt2_overlap(k,:)
+    enddo
+    print *, '-------'
+
+    if (N_in > 0) then
+      b%cur = min(N_in,b%cur)
+      if (s2_eig) then
+        call make_selection_buffer_s2(b)
+      else
+        call remove_duplicates_in_selection_buffer(b)
+      endif
+      call fill_H_apply_buffer_no_selection(b%cur,b%det,N_int,0)
+    endif
+    call delete_selection_buffer(b)
+
+    state_average_weight(:) = state_average_weight_save(:)
+    TOUCH state_average_weight
+    call update_pt2_and_variance_weights(pt2_data, N_states)
+  endif
+
+
+end subroutine
+
+
+subroutine pt2_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  PROVIDE global_selection_buffer
+  call run_pt2_slave(1,i,pt2_e0_denominator)
+end
+
+
+subroutine pt2_collector(zmq_socket_pull, E, relative_error, pt2_data, pt2_data_err, b, N_)
+  use f77_zmq
+  use selection_types
+  use bitmasks
+  implicit none
+
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  double precision, intent(in)   :: relative_error, E
+  type(pt2_type), intent(inout)  :: pt2_data, pt2_data_err
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(in)            :: N_
+
+  type(pt2_type), allocatable    :: pt2_data_task(:)
+  type(pt2_type), allocatable    :: pt2_data_I(:)
+  type(pt2_type), allocatable    :: pt2_data_S(:)
+  type(pt2_type), allocatable    :: pt2_data_S2(:)
+  type(pt2_type)                 :: pt2_data_teeth
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+  integer, external :: zmq_delete_tasks_async_send
+  integer, external :: zmq_delete_tasks_async_recv
+  integer, external :: zmq_abort
+  integer, external :: pt2_find_sample_lr
+
+  PROVIDE pt2_stoch_istate
+
+  integer :: more, n, i, p, c, t, n_tasks, U
+  integer, allocatable :: task_id(:)
+  integer, allocatable :: index(:)
+
+  double precision :: v, x, x2, x3, avg, avg2, avg3(N_states), eqt, E0, v0, n0(N_states)
+  double precision :: eqta(N_states)
+  double precision :: time, time1, time0
+
+  integer, allocatable :: f(:)
+  logical, allocatable :: d(:)
+  logical :: do_exit, stop_now, sending
+  logical, external :: qp_stop
+  type(selection_buffer) :: b2
+
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+
+  sending =.False.
+
+  rss  = memory_of_int(pt2_n_tasks_max*2+N_det_generators*2)
+  rss += memory_of_double(N_states*N_det_generators)*3.d0
+  rss += memory_of_double(N_states*pt2_n_tasks_max)*3.d0
+  rss += memory_of_double(pt2_N_teeth+1)*4.d0
+  call check_mem(rss,irp_here)
+
+  ! If an allocation is added here, the estimate of the memory should also be
+  ! updated in ZMQ_pt2
+  allocate(task_id(pt2_n_tasks_max), index(pt2_n_tasks_max), f(N_det_generators))
+  allocate(d(N_det_generators+1))
+  allocate(pt2_data_task(pt2_n_tasks_max))
+  allocate(pt2_data_I(N_det_generators))
+  allocate(pt2_data_S(pt2_N_teeth+1))
+  allocate(pt2_data_S2(pt2_N_teeth+1))
+
+
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+  call create_selection_buffer(N_, N_*2, b2)
+
+
+  pt2_data % pt2(pt2_stoch_istate) = -huge(1.)
+  pt2_data_err % pt2(pt2_stoch_istate) = huge(1.)
+  pt2_data % variance(pt2_stoch_istate) = huge(1.)
+  pt2_data_err % variance(pt2_stoch_istate) = huge(1.)
+  pt2_data % overlap(:,pt2_stoch_istate) = 0.d0
+  pt2_data_err % overlap(:,pt2_stoch_istate) = huge(1.)
+  n = 1
+  t = 0
+  U = 0
+  do i=1,pt2_n_tasks_max
+    call pt2_alloc(pt2_data_task(i),N_states)
+  enddo
+  do i=1,pt2_N_teeth+1
+    call pt2_alloc(pt2_data_S(i),N_states)
+    call pt2_alloc(pt2_data_S2(i),N_states)
+  enddo
+  do i=1,N_det_generators
+    call pt2_alloc(pt2_data_I(i),N_states)
+  enddo
+  f(:) = pt2_F(:)
+  d(:) = .false.
+  n_tasks = 0
+  E0 = E
+  v0 = 0.d0
+  n0(:) = 0.d0
+  more = 1
+  call wall_time(time0)
+  time1 = time0
+
+  do_exit = .false.
+  stop_now = .false.
+  do while (n <= N_det_generators)
+    if(f(pt2_J(n)) == 0) then
+      d(pt2_J(n)) = .true.
+      do while(d(U+1))
+        U += 1
+      end do
+
+      ! Deterministic part
+      do while(t <= pt2_N_teeth)
+        if(U >= pt2_n_0(t+1)) then
+          t=t+1
+          E0 = 0.d0
+          v0 = 0.d0
+          n0(:) = 0.d0
+          do i=pt2_n_0(t),1,-1
+            E0 += pt2_data_I(i) % pt2(pt2_stoch_istate)
+            v0 += pt2_data_I(i) % variance(pt2_stoch_istate)
+            n0(:) += pt2_data_I(i) % overlap(:,pt2_stoch_istate)
+          end do
+        else
+          exit
+        end if
+      end do
+
+      ! Add Stochastic part
+      c = pt2_R(n)
+      if(c > 0) then
+
+        call pt2_alloc(pt2_data_teeth,N_states)
+        do p=pt2_N_teeth, 1, -1
+          v = pt2_u_0 + pt2_W_T * (pt2_u(c) + dble(p-1))
+          i = pt2_find_sample_lr(v, pt2_cW,pt2_n_0(p),pt2_n_0(p+1))
+          v = pt2_W_T / pt2_w(i)
+          call pt2_add ( pt2_data_teeth,  v,  pt2_data_I(i) )
+          call pt2_add ( pt2_data_S(p),  1.d0,  pt2_data_teeth )
+          call pt2_add2( pt2_data_S2(p), 1.d0,  pt2_data_teeth )
+        enddo
+        call pt2_dealloc(pt2_data_teeth)
+
+        avg  = E0 + pt2_data_S(t) % pt2(pt2_stoch_istate) / dble(c)
+        avg2 = v0 + pt2_data_S(t) % variance(pt2_stoch_istate) / dble(c)
+        avg3(:) = n0(:) + pt2_data_S(t) % overlap(:,pt2_stoch_istate) / dble(c)
+        if ((avg /= 0.d0) .or. (n == N_det_generators) ) then
+          do_exit = .true.
+        endif
+        if (qp_stop()) then
+          stop_now = .True.
+        endif
+        pt2_data % pt2(pt2_stoch_istate) = avg
+        pt2_data % variance(pt2_stoch_istate) = avg2
+        pt2_data % overlap(:,pt2_stoch_istate) = avg3(:)
+        call wall_time(time)
+        ! 1/(N-1.5) : see  Brugger, The American Statistician (23) 4 p. 32 (1969)
+        if(c > 2) then
+          eqt = dabs((pt2_data_S2(t) % pt2(pt2_stoch_istate) / c) - (pt2_data_S(t) % pt2(pt2_stoch_istate)/c)**2) ! dabs for numerical stability
+          eqt = sqrt(eqt / (dble(c) - 1.5d0))
+          pt2_data_err % pt2(pt2_stoch_istate) = eqt
+
+          eqt = dabs((pt2_data_S2(t) % variance(pt2_stoch_istate) / c) - (pt2_data_S(t) % variance(pt2_stoch_istate)/c)**2) ! dabs for numerical stability
+          eqt = sqrt(eqt / (dble(c) - 1.5d0))
+          pt2_data_err % variance(pt2_stoch_istate) = eqt
+
+          eqta(:) = dabs((pt2_data_S2(t) % overlap(:,pt2_stoch_istate) / c) - (pt2_data_S(t) % overlap(:,pt2_stoch_istate)/c)**2) ! dabs for numerical stability
+          eqta(:) = sqrt(eqta(:) / (dble(c) - 1.5d0))
+          pt2_data_err % overlap(:,pt2_stoch_istate) = eqta(:)
+
+
+          if ((time - time1 > 1.d0) .or. (n==N_det_generators)) then
+            time1 = time
+            print '(I10, X, F12.6, X, G10.3, X, F10.6, X, G10.3, X, F10.6, X, G10.3, X, F10.4)', c, &
+              pt2_data     % pt2(pt2_stoch_istate) +E, &
+              pt2_data_err % pt2(pt2_stoch_istate), &
+              pt2_data     % variance(pt2_stoch_istate), &
+              pt2_data_err % variance(pt2_stoch_istate), &
+              pt2_data     % overlap(pt2_stoch_istate,pt2_stoch_istate), &
+              pt2_data_err % overlap(pt2_stoch_istate,pt2_stoch_istate), &
+              time-time0
+            if (stop_now .or. (                                      &
+                  (do_exit .and. (dabs(pt2_data_err % pt2(pt2_stoch_istate)) /    &
+                  (1.d-20 + dabs(pt2_data % pt2(pt2_stoch_istate)) ) <= relative_error))) ) then
+              if (zmq_abort(zmq_to_qp_run_socket) == -1) then
+                call sleep(10)
+                if (zmq_abort(zmq_to_qp_run_socket) == -1) then
+                  print *, irp_here, ': Error in sending abort signal (2)'
+                endif
+              endif
+            endif
+          endif
+        endif
+      end if
+      n += 1
+    else if(more == 0) then
+      exit
+    else
+      call pull_pt2_results(zmq_socket_pull, index, pt2_data_task, task_id, n_tasks, b2)
+      if(n_tasks > pt2_n_tasks_max)then
+       print*,'PB !!!'
+       print*,'If you see this, send a bug report with the following content'
+       print*,irp_here
+       print*,'n_tasks,pt2_n_tasks_max = ',n_tasks,pt2_n_tasks_max
+       stop -1
+      endif
+      if (zmq_delete_tasks_async_send(zmq_to_qp_run_socket,task_id,n_tasks,sending) == -1) then
+          stop 'PT2: Unable to delete tasks (send)'
+      endif
+      do i=1,n_tasks
+        if(index(i).gt.size(pt2_data_I,1).or.index(i).lt.1)then
+         print*,'PB !!!'
+         print*,'If you see this, send a bug report with the following content'
+         print*,irp_here
+         print*,'i,index(i),size(pt2_data_I,1) = ',i,index(i),size(pt2_data_I,1)
+         stop -1
+        endif
+        call pt2_add(pt2_data_I(index(i)),1.d0,pt2_data_task(i))
+        f(index(i)) -= 1
+      end do
+      do i=1, b2%cur
+        ! We assume the pulled buffer is sorted
+        if (b2%val(i) > b%mini) exit
+        call add_to_selection_buffer(b, b2%det(1,1,i), b2%val(i))
+      end do
+      if (zmq_delete_tasks_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
+          stop 'PT2: Unable to delete tasks (recv)'
+      endif
+    end if
+  end do
+  do i=1,N_det_generators
+    call pt2_dealloc(pt2_data_I(i))
+  enddo
+  do i=1,pt2_N_teeth+1
+    call pt2_dealloc(pt2_data_S(i))
+    call pt2_dealloc(pt2_data_S2(i))
+  enddo
+  do i=1,pt2_n_tasks_max
+    call pt2_dealloc(pt2_data_task(i))
+  enddo
+!print *,  'deleting b2'
+  call delete_selection_buffer(b2)
+!print *,  'sorting b'
+  call sort_selection_buffer(b)
+!print *,  'done'
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+
+end subroutine
+
+
+integer function pt2_find_sample(v, w)
+  implicit none
+  double precision, intent(in) :: v, w(0:N_det_generators)
+  integer, external :: pt2_find_sample_lr
+
+  pt2_find_sample = pt2_find_sample_lr(v, w, 0, N_det_generators)
+end function
+
+
+integer function pt2_find_sample_lr(v, w, l_in, r_in)
+  implicit none
+  double precision, intent(in) :: v, w(0:N_det_generators)
+  integer, intent(in) :: l_in,r_in
+  integer :: i,l,r
+
+  l=l_in
+  r=r_in
+
+  do while(r-l > 1)
+    i = shiftr(r+l,1)
+    if(w(i) < v) then
+      l = i
+    else
+      r = i
+    end if
+  end do
+  i = r
+  do r=i+1,N_det_generators
+    if (w(r) /= w(i)) then
+      exit
+    endif
+  enddo
+  pt2_find_sample_lr = r-1
+end function
+
+
+BEGIN_PROVIDER [ integer, pt2_n_tasks ]
+ implicit none
+ BEGIN_DOC
+ ! Number of parallel tasks for the Monte Carlo
+ END_DOC
+ pt2_n_tasks = N_det_generators
+END_PROVIDER
+
+BEGIN_PROVIDER[ double precision, pt2_u, (N_det_generators)]
+  implicit none
+  integer, allocatable :: seed(:)
+  integer :: m,i
+  call random_seed(size=m)
+  allocate(seed(m))
+  do i=1,m
+    seed(i) = i
+  enddo
+  call random_seed(put=seed)
+  deallocate(seed)
+
+  call RANDOM_NUMBER(pt2_u)
+ END_PROVIDER
+
+ BEGIN_PROVIDER[ integer, pt2_J, (N_det_generators)]
+&BEGIN_PROVIDER[ integer, pt2_R, (N_det_generators)]
+  implicit none
+  BEGIN_DOC
+! pt2_J contains the list of generators after ordering them according to the
+! Monte Carlo sampling.
+!
+! pt2_R(i) is the number of combs drawn when determinant i is computed.
+  END_DOC
+  integer                :: N_c, N_j
+  integer                :: U, t, i
+  double precision       :: v
+  integer, external      :: pt2_find_sample_lr
+
+  logical, allocatable :: pt2_d(:)
+  integer :: m,l,r,k
+  integer :: ncache
+  integer, allocatable :: ii(:,:)
+  double precision :: dt
+
+  ncache = min(N_det_generators,10000)
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+  rss = memory_of_int(ncache)*dble(pt2_N_teeth) + memory_of_int(N_det_generators)
+  call check_mem(rss,irp_here)
+
+  allocate(ii(pt2_N_teeth,ncache),pt2_d(N_det_generators))
+
+  pt2_R(:) = 0
+  pt2_d(:) = .false.
+  N_c = 0
+  N_j = pt2_n_0(1)
+  do i=1,N_j
+      pt2_d(i) = .true.
+      pt2_J(i) = i
+  end do
+
+  U = 0
+  do while(N_j < pt2_n_tasks)
+
+    if (N_c+ncache > N_det_generators) then
+      ncache = N_det_generators - N_c
+    endif
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(dt,v,t,k)
+    do k=1, ncache
+      dt = pt2_u_0
+      do t=1, pt2_N_teeth
+        v = dt + pt2_W_T *pt2_u(N_c+k)
+        dt = dt + pt2_W_T
+        ii(t,k) = pt2_find_sample_lr(v, pt2_cW,pt2_n_0(t),pt2_n_0(t+1))
+      end do
+    enddo
+    !$OMP END PARALLEL DO
+
+    do k=1,ncache
+      !ADD_COMB
+      N_c = N_c+1
+      do t=1, pt2_N_teeth
+        i = ii(t,k)
+        if(.not. pt2_d(i)) then
+          N_j += 1
+          pt2_J(N_j) = i
+          pt2_d(i) = .true.
+        end if
+      end do
+
+      pt2_R(N_j) = N_c
+
+      !FILL_TOOTH
+      do while(U < N_det_generators)
+        U += 1
+        if(.not. pt2_d(U)) then
+          N_j += 1
+          pt2_J(N_j) = U
+          pt2_d(U) = .true.
+          exit
+        end if
+      end do
+      if (N_j >= pt2_n_tasks) exit
+    end do
+  enddo
+
+  if(N_det_generators > 1) then
+    pt2_R(N_det_generators-1) = 0
+    pt2_R(N_det_generators) = N_c
+  end if
+
+  deallocate(ii,pt2_d)
+
+END_PROVIDER
+
+
+
+ BEGIN_PROVIDER [ double precision, pt2_w, (N_det_generators) ]
+&BEGIN_PROVIDER [ double precision, pt2_cW, (0:N_det_generators) ]
+&BEGIN_PROVIDER [ double precision, pt2_W_T ]
+&BEGIN_PROVIDER [ double precision, pt2_u_0 ]
+&BEGIN_PROVIDER [ integer,          pt2_n_0, (pt2_N_teeth+1) ]
+   implicit none
+   integer                        :: i, t
+   double precision, allocatable  :: tilde_w(:), tilde_cW(:)
+   double precision               :: r, tooth_width
+   integer, external              :: pt2_find_sample
+
+   double precision               :: rss
+   double precision, external     :: memory_of_double, memory_of_int
+   rss = memory_of_double(2*N_det_generators+1)
+   call check_mem(rss,irp_here)
+
+   if (N_det_generators == 1) then
+
+     pt2_w(1)   = 1.d0
+     pt2_cw(1)  = 1.d0
+     pt2_u_0    = 1.d0
+     pt2_W_T    = 0.d0
+     pt2_n_0(1) = 0
+     pt2_n_0(2) = 1
+
+   else
+
+     allocate(tilde_w(N_det_generators), tilde_cW(0:N_det_generators))
+
+     tilde_cW(0) = 0d0
+
+     do i=1,N_det_generators
+       tilde_w(i)  = psi_coef_sorted_tc_gen(i,pt2_stoch_istate)**2 !+ 1.d-20
+     enddo
+
+     double precision               :: norm2
+     norm2 = 0.d0
+     do i=N_det_generators,1,-1
+       norm2 += tilde_w(i)
+     enddo
+
+     tilde_w(:) = tilde_w(:) / norm2
+
+     tilde_cW(0) = -1.d0
+     do i=1,N_det_generators
+       tilde_cW(i) = tilde_cW(i-1) + tilde_w(i)
+     enddo
+     tilde_cW(:) = tilde_cW(:) + 1.d0
+
+     pt2_n_0(1) = 0
+     do
+     pt2_u_0 = tilde_cW(pt2_n_0(1))
+     r = tilde_cW(pt2_n_0(1) + pt2_minDetInFirstTeeth)
+     pt2_W_T = (1d0 - pt2_u_0) / dble(pt2_N_teeth)
+     if(pt2_W_T >= r - pt2_u_0) then
+       exit
+     end if
+     pt2_n_0(1) += 1
+     if(N_det_generators - pt2_n_0(1) < pt2_minDetInFirstTeeth * pt2_N_teeth) then
+       print *, "teeth building failed"
+       stop -1
+     end if
+   end do
+
+   do t=2, pt2_N_teeth
+     r = pt2_u_0 + pt2_W_T * dble(t-1)
+     pt2_n_0(t) = pt2_find_sample(r, tilde_cW)
+   end do
+   pt2_n_0(pt2_N_teeth+1) = N_det_generators
+
+   pt2_w(:pt2_n_0(1)) = tilde_w(:pt2_n_0(1))
+   do t=1, pt2_N_teeth
+     tooth_width = tilde_cW(pt2_n_0(t+1)) - tilde_cW(pt2_n_0(t))
+     if (tooth_width == 0.d0) then
+       tooth_width = sum(tilde_w(pt2_n_0(t):pt2_n_0(t+1)))
+     endif
+     ASSERT(tooth_width > 0.d0)
+     do i=pt2_n_0(t)+1, pt2_n_0(t+1)
+       pt2_w(i) = tilde_w(i) * pt2_W_T / tooth_width
+     end do
+   end do
+
+   pt2_cW(0) = 0d0
+   do i=1,N_det_generators
+     pt2_cW(i) = pt2_cW(i-1) + pt2_w(i)
+   end do
+   pt2_n_0(pt2_N_teeth+1) = N_det_generators
+
+ endif
+END_PROVIDER
+
+
+
+
+
--- a/src/cipsi_tc_bi_ortho/pt2_type.irp.f
+++ b/src/cipsi_tc_bi_ortho/pt2_type.irp.f
@ -0,0 +1,128 @@
+subroutine pt2_alloc(pt2_data,N)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(inout) :: pt2_data
+  integer, intent(in)           :: N
+  integer :: k
+
+  allocate(pt2_data % pt2(N)           &
+          ,pt2_data % variance(N)      &
+          ,pt2_data % rpt2(N)          &
+          ,pt2_data % overlap(N,N)     &
+          )
+
+  pt2_data % pt2(:)           = 0.d0
+  pt2_data % variance(:)      = 0.d0
+  pt2_data % rpt2(:)          = 0.d0
+  pt2_data % overlap(:,:)     = 0.d0
+
+end subroutine
+
+subroutine pt2_dealloc(pt2_data)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(inout) :: pt2_data
+  deallocate(pt2_data % pt2         &
+            ,pt2_data % variance    &
+            ,pt2_data % rpt2        &
+            ,pt2_data % overlap     &
+            )
+end subroutine
+
+subroutine pt2_add(p1, w, p2)
+  implicit none
+  use selection_types
+  BEGIN_DOC
+! p1 += w * p2
+  END_DOC
+  type(pt2_type), intent(inout) :: p1
+  double precision, intent(in)  :: w
+  type(pt2_type), intent(in)    :: p2
+
+  if (w == 1.d0) then
+
+    p1 % pt2(:)            = p1 % pt2(:)           + p2 % pt2(:)
+    p1 % rpt2(:)           = p1 % rpt2(:)          + p2 % rpt2(:)
+    p1 % variance(:)       = p1 % variance(:)      + p2 % variance(:)
+    p1 % overlap(:,:)      = p1 % overlap(:,:)     + p2 % overlap(:,:)
+
+  else
+
+    p1 % pt2(:)            = p1 % pt2(:)           + w * p2 % pt2(:)
+    p1 % rpt2(:)           = p1 % rpt2(:)          + w * p2 % rpt2(:)
+    p1 % variance(:)       = p1 % variance(:)      + w * p2 % variance(:)
+    p1 % overlap(:,:)      = p1 % overlap(:,:)     + w * p2 % overlap(:,:)
+
+  endif
+
+end subroutine
+
+
+subroutine pt2_add2(p1, w, p2)
+  implicit none
+  use selection_types
+  BEGIN_DOC
+! p1 += w * p2**2
+  END_DOC
+  type(pt2_type), intent(inout) :: p1
+  double precision, intent(in)  :: w
+  type(pt2_type), intent(in)    :: p2
+
+  if (w == 1.d0) then
+
+    p1 % pt2(:)           = p1 % pt2(:)           + p2 % pt2(:)           * p2 % pt2(:)
+    p1 % rpt2(:)          = p1 % rpt2(:)          + p2 % rpt2(:)          * p2 % rpt2(:)
+    p1 % variance(:)      = p1 % variance(:)      + p2 % variance(:)      * p2 % variance(:)
+    p1 % overlap(:,:)     = p1 % overlap(:,:)     + p2 % overlap(:,:)     * p2 % overlap(:,:)
+
+  else
+
+    p1 % pt2(:)           = p1 % pt2(:)           + w * p2 % pt2(:)           * p2 % pt2(:)
+    p1 % rpt2(:)          = p1 % rpt2(:)          + w * p2 % rpt2(:)          * p2 % rpt2(:)
+    p1 % variance(:)      = p1 % variance(:)      + w * p2 % variance(:)      * p2 % variance(:)
+    p1 % overlap(:,:)     = p1 % overlap(:,:)     + w * p2 % overlap(:,:)     * p2 % overlap(:,:)
+
+  endif
+
+end subroutine
+
+
+subroutine pt2_serialize(pt2_data, n, x)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(in)    :: pt2_data
+  integer, intent(in)           :: n
+  double precision, intent(out) :: x(*)
+
+  integer :: i,k,n2
+
+  n2 = n*n
+  x(1:n)           =  pt2_data % pt2(1:n)
+  k=n
+  x(k+1:k+n)     =  pt2_data % rpt2(1:n)
+  k=k+n
+  x(k+1:k+n)     =  pt2_data % variance(1:n)
+  k=k+n
+  x(k+1:k+n2)  =  reshape(pt2_data % overlap(1:n,1:n), (/ n2 /))
+
+end
+
+subroutine pt2_deserialize(pt2_data, n, x)
+  implicit none
+  use selection_types
+  type(pt2_type), intent(inout) :: pt2_data
+  integer, intent(in)           :: n
+  double precision, intent(in)  :: x(*)
+
+  integer :: i,k,n2
+
+  n2 = n*n
+  pt2_data % pt2(1:n)           =   x(1:n)
+  k=n
+  pt2_data % rpt2(1:n)          =   x(k+1:k+n)
+  k=k+n
+  pt2_data % variance(1:n)      =   x(k+1:k+n)
+  k=k+n
+  pt2_data % overlap(1:n,1:n) = reshape(x(k+1:k+n2), (/ n, n /))
+
+end
--- a/src/cipsi_tc_bi_ortho/run_pt2_slave.irp.f
+++ b/src/cipsi_tc_bi_ortho/run_pt2_slave.irp.f
@ -0,0 +1,549 @@
+ use omp_lib
+ use selection_types
+ use f77_zmq
+BEGIN_PROVIDER [ integer(omp_lock_kind), global_selection_buffer_lock ]
+ use omp_lib
+ implicit none
+ BEGIN_DOC
+ ! Global buffer for the OpenMP selection
+ END_DOC
+ call omp_init_lock(global_selection_buffer_lock)
+END_PROVIDER
+
+BEGIN_PROVIDER [ type(selection_buffer), global_selection_buffer ]
+ use omp_lib
+ implicit none
+ BEGIN_DOC
+ ! Global buffer for the OpenMP selection
+ END_DOC
+ call omp_set_lock(global_selection_buffer_lock)
+ call delete_selection_buffer(global_selection_buffer)
+ call create_selection_buffer(N_det_generators, 2*N_det_generators, &
+    global_selection_buffer)
+ call omp_unset_lock(global_selection_buffer_lock)
+END_PROVIDER
+
+
+subroutine run_pt2_slave(thread,iproc,energy)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  double precision, intent(in)    :: energy(N_states_diag)
+  integer,  intent(in)            :: thread, iproc
+  if (N_det > 100000 ) then
+    call run_pt2_slave_large(thread,iproc,energy)
+  else
+    call run_pt2_slave_small(thread,iproc,energy)
+  endif
+end
+
+subroutine run_pt2_slave_small(thread,iproc,energy)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  double precision, intent(in)    :: energy(N_states_diag)
+  integer,  intent(in)            :: thread, iproc
+  integer                         :: rc, i
+
+  integer                        :: worker_id, ctask, ltask
+  character*(512), allocatable   :: task(:)
+  integer, allocatable           :: task_id(:)
+
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  type(selection_buffer) :: b
+  logical :: done, buffer_ready
+
+  type(pt2_type), allocatable :: pt2_data(:)
+  integer :: n_tasks, k, N
+  integer, allocatable :: i_generator(:), subset(:)
+
+  double precision, external :: memory_of_double, memory_of_int
+  integer :: bsize ! Size of selection buffers
+
+  allocate(task_id(pt2_n_tasks_max), task(pt2_n_tasks_max))
+  allocate(pt2_data(pt2_n_tasks_max), i_generator(pt2_n_tasks_max), subset(pt2_n_tasks_max))
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  integer, external :: connect_to_taskserver
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  b%N = 0
+  buffer_ready = .False.
+  n_tasks = 1
+
+  done = .False.
+  do while (.not.done)
+
+    n_tasks = max(1,n_tasks)
+    n_tasks = min(pt2_n_tasks_max,n_tasks)
+
+    integer, external :: get_tasks_from_taskserver
+    if (get_tasks_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task, n_tasks) == -1) then
+      exit
+    endif
+    done = task_id(n_tasks) == 0
+    if (done) then
+      n_tasks = n_tasks-1
+    endif
+    if (n_tasks == 0) exit
+
+    do k=1,n_tasks
+      call sscanf_ddd(task(k), subset(k), i_generator(k), N)
+    enddo
+    if (b%N == 0) then
+      ! Only first time
+      bsize = min(N, (elec_alpha_num * (mo_num-elec_alpha_num))**2)
+      call create_selection_buffer(bsize, bsize*2, b)
+      buffer_ready = .True.
+    else
+      ASSERT (b%N == bsize)
+    endif
+
+    double precision :: time0, time1
+    call wall_time(time0)
+    do k=1,n_tasks
+      call pt2_alloc(pt2_data(k),N_states)
+      b%cur = 0
+      call select_connected(i_generator(k),energy,pt2_data(k),b,subset(k),pt2_F(i_generator(k)))
+    enddo
+    call wall_time(time1)
+
+    integer, external :: tasks_done_to_taskserver
+    if (tasks_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id,n_tasks) == -1) then
+      done = .true.
+    endif
+    call sort_selection_buffer(b)
+    call push_pt2_results(zmq_socket_push, i_generator, pt2_data, b, task_id, n_tasks)
+    do k=1,n_tasks
+      call pt2_dealloc(pt2_data(k))
+    enddo
+    b%cur=0
+
+!    ! Try to adjust n_tasks around nproc/2 seconds per job
+    n_tasks = min(2*n_tasks,int( dble(n_tasks * nproc/2) / (time1 - time0 + 1.d0)))
+    n_tasks = min(n_tasks, pt2_n_tasks_max)
+!    n_tasks = 1
+  end do
+
+  integer, external :: disconnect_from_taskserver
+  do i=1,300
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) /= -2) exit
+    call usleep(500)
+    print *,  'Retry disconnect...'
+  end do
+
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  if (buffer_ready) then
+    call delete_selection_buffer(b)
+  endif
+  deallocate(pt2_data)
+end subroutine
+
+
+subroutine run_pt2_slave_large(thread,iproc,energy)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  double precision, intent(in)    :: energy(N_states_diag)
+  integer,  intent(in)            :: thread, iproc
+  integer                         :: rc, i
+
+  integer                        :: worker_id, ctask, ltask
+  character*(512)                :: task
+  integer                        :: task_id(1)
+
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  type(selection_buffer) :: b
+  logical :: done, buffer_ready
+
+  type(pt2_type) :: pt2_data
+  integer :: n_tasks, k, N
+  integer :: i_generator, subset
+
+  integer :: bsize ! Size of selection buffers
+  logical :: sending
+  double precision :: time_shift
+
+  PROVIDE global_selection_buffer global_selection_buffer_lock
+
+  call random_number(time_shift)
+  time_shift = time_shift*15.d0
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  integer, external :: connect_to_taskserver
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  b%N = 0
+  buffer_ready = .False.
+  n_tasks = 1
+
+  sending = .False.
+  done = .False.
+  double precision :: time0, time1
+  call wall_time(time0)
+  time0 = time0+time_shift
+  do while (.not.done)
+
+    integer, external :: get_tasks_from_taskserver
+    if (get_tasks_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, task, n_tasks) == -1) then
+      exit
+    endif
+    done = task_id(1) == 0
+    if (done) then
+      n_tasks = n_tasks-1
+    endif
+    if (n_tasks == 0) exit
+
+    call sscanf_ddd(task, subset, i_generator, N)
+    if( pt2_F(i_generator) <= 0 .or. pt2_F(i_generator) > N_det ) then
+      print *,  irp_here
+      stop 'bug in selection'
+    endif
+    if (b%N == 0) then
+      ! Only first time
+      bsize = min(N, (elec_alpha_num * (mo_num-elec_alpha_num))**2)
+      call create_selection_buffer(bsize, bsize*2, b)
+      buffer_ready = .True.
+    else
+      ASSERT (b%N == bsize)
+    endif
+
+    call pt2_alloc(pt2_data,N_states)
+    b%cur = 0
+    call select_connected(i_generator,energy,pt2_data,b,subset,pt2_F(i_generator))
+
+    integer, external :: tasks_done_to_taskserver
+    if (tasks_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id,n_tasks) == -1) then
+      done = .true.
+    endif
+    call sort_selection_buffer(b)
+
+    call wall_time(time1)
+!    if (time1-time0 > 15.d0) then
+      call omp_set_lock(global_selection_buffer_lock)
+      global_selection_buffer%mini = b%mini
+      call merge_selection_buffers(b,global_selection_buffer)
+      b%cur=0
+      call omp_unset_lock(global_selection_buffer_lock)
+      call wall_time(time0)
+!    endif
+
+    call push_pt2_results_async_recv(zmq_socket_push,b%mini,sending)
+    if ( iproc == 1 .or. i_generator < 100 .or. done) then
+      call omp_set_lock(global_selection_buffer_lock)
+      call push_pt2_results_async_send(zmq_socket_push, (/i_generator/), (/pt2_data/), global_selection_buffer, (/task_id/), 1,sending)
+      global_selection_buffer%cur = 0
+      call omp_unset_lock(global_selection_buffer_lock)
+    else
+      call push_pt2_results_async_send(zmq_socket_push, (/i_generator/), (/pt2_data/), b, (/task_id/), 1,sending)
+    endif
+
+    call pt2_dealloc(pt2_data)
+  end do
+  call push_pt2_results_async_recv(zmq_socket_push,b%mini,sending)
+
+  integer, external :: disconnect_from_taskserver
+  do i=1,300
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) /= -2) exit
+    call sleep(1)
+    print *,  'Retry disconnect...'
+  end do
+
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  if (buffer_ready) then
+    call delete_selection_buffer(b)
+  endif
+  FREE global_selection_buffer
+end subroutine
+
+
+subroutine push_pt2_results(zmq_socket_push, index, pt2_data, b, task_id, n_tasks)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_push
+  type(pt2_type), intent(in)     :: pt2_data(n_tasks)
+  integer, intent(in) :: n_tasks, index(n_tasks), task_id(n_tasks)
+  type(selection_buffer), intent(inout) :: b
+
+  logical :: sending
+  sending = .False.
+  call push_pt2_results_async_send(zmq_socket_push, index, pt2_data, b, task_id, n_tasks, sending)
+  call push_pt2_results_async_recv(zmq_socket_push, b%mini, sending)
+end subroutine
+
+
+subroutine push_pt2_results_async_send(zmq_socket_push, index, pt2_data, b, task_id, n_tasks, sending)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_push
+  type(pt2_type), intent(in)     :: pt2_data(n_tasks)
+  integer, intent(in) :: n_tasks, index(n_tasks), task_id(n_tasks)
+  type(selection_buffer), intent(inout) :: b
+  logical, intent(inout) :: sending
+  integer :: rc, i
+  integer*8 :: rc8
+  double precision, allocatable :: pt2_serialized(:,:)
+
+  if (sending) then
+    print *,  irp_here, ': sending is true'
+    stop -1
+  endif
+  sending = .True.
+
+  rc = f77_zmq_send( zmq_socket_push, n_tasks, 4, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 1
+    return
+  else if(rc /= 4) then
+    stop 'push'
+  endif
+
+
+  rc = f77_zmq_send( zmq_socket_push, index, 4*n_tasks, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 2
+    return
+  else if(rc /= 4*n_tasks) then
+    stop 'push'
+  endif
+
+
+  allocate(pt2_serialized (pt2_type_size(N_states),n_tasks) )
+  do i=1,n_tasks
+    call pt2_serialize(pt2_data(i),N_states,pt2_serialized(1,i))
+  enddo
+
+  rc = f77_zmq_send( zmq_socket_push, pt2_serialized, size(pt2_serialized)*8, ZMQ_SNDMORE)
+  deallocate(pt2_serialized)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 3
+    return
+  else if(rc /= size(pt2_serialized)*8) then
+    stop 'push'
+  endif
+
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, n_tasks*4, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 6
+    return
+  else if(rc /= 4*n_tasks) then
+    stop 'push'
+  endif
+
+
+  if (b%cur == 0) then
+
+    rc = f77_zmq_send( zmq_socket_push, b%cur, 4, 0)
+    if (rc == -1) then
+      print *,  irp_here, ': error sending result'
+      stop 7
+      return
+    else if(rc /= 4) then
+      stop 'push'
+    endif
+
+  else
+
+    rc = f77_zmq_send( zmq_socket_push, b%cur, 4, ZMQ_SNDMORE)
+    if (rc == -1) then
+      print *,  irp_here, ': error sending result'
+      stop 7
+      return
+    else if(rc /= 4) then
+      stop 'push'
+    endif
+
+
+    rc8 = f77_zmq_send8( zmq_socket_push, b%val, 8_8*int(b%cur,8), ZMQ_SNDMORE)
+    if (rc8 == -1_8) then
+      print *,  irp_here, ': error sending result'
+      stop 8
+      return
+    else if(rc8 /= 8_8*int(b%cur,8)) then
+      stop 'push'
+    endif
+
+
+    rc8 = f77_zmq_send8( zmq_socket_push, b%det, int(bit_kind*N_int*2,8)*int(b%cur,8), 0)
+    if (rc8 == -1_8) then
+      print *,  irp_here, ': error sending result'
+      stop 9
+      return
+    else if(rc8 /= int(N_int*2*8,8)*int(b%cur,8)) then
+      stop 'push'
+    endif
+
+  endif
+
+end subroutine
+
+subroutine push_pt2_results_async_recv(zmq_socket_push,mini,sending)
+ use selection_types
+ use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)    :: zmq_socket_push
+  double precision, intent(out) :: mini
+  logical, intent(inout) :: sending
+  integer :: rc
+
+  if (.not.sending) return
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 10
+    return
+  else if ((rc /= 2).and.(ok(1:2) /= 'ok')) then
+    print *,  irp_here//': error in receiving ok'
+    stop -1
+  endif
+  rc = f77_zmq_recv( zmq_socket_push, mini, 8, 0)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 11
+    return
+  else if (rc /= 8) then
+    print *,  irp_here//': error in receiving mini'
+    stop 12
+  endif
+IRP_ENDIF
+  sending = .False.
+end subroutine
+
+
+
+subroutine pull_pt2_results(zmq_socket_pull, index, pt2_data, task_id, n_tasks, b)
+ use selection_types
+ use f77_zmq
+  implicit none
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  type(pt2_type), intent(inout)  :: pt2_data(*)
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(out) :: index(*)
+  integer, intent(out) :: n_tasks, task_id(*)
+  integer :: rc, rn, i
+  integer*8 :: rc8
+  double precision, allocatable :: pt2_serialized(:,:)
+
+  rc = f77_zmq_recv( zmq_socket_pull, n_tasks, 4, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4) then
+    stop 'pull'
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, index, 4*n_tasks, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4*n_tasks) then
+    stop 'pull'
+  endif
+
+  allocate(pt2_serialized (pt2_type_size(N_states),n_tasks) )
+  rc = f77_zmq_recv( zmq_socket_pull, pt2_serialized, 8*size(pt2_serialized)*n_tasks, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 8*size(pt2_serialized)) then
+    stop 'pull'
+  endif
+
+  do i=1,n_tasks
+    call pt2_deserialize(pt2_data(i),N_states,pt2_serialized(1,i))
+  enddo
+  deallocate(pt2_serialized)
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id, n_tasks*4, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4*n_tasks) then
+    stop 'pull'
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, b%cur, 4, 0)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if(rc /= 4) then
+    stop 'pull'
+  endif
+
+  if (b%cur > 0) then
+
+    rc8 = f77_zmq_recv8( zmq_socket_pull, b%val, 8_8*int(b%cur,8), 0)
+    if (rc8 == -1_8) then
+      n_tasks = 1
+      task_id(1) = 0
+    else if(rc8 /= 8_8*int(b%cur,8)) then
+      stop 'pull'
+    endif
+
+    rc8 = f77_zmq_recv8( zmq_socket_pull, b%det, int(bit_kind*N_int*2,8)*int(b%cur,8), 0)
+    if (rc8 == -1_8) then
+      n_tasks = 1
+      task_id(1) = 0
+    else if(rc8 /= int(N_int*2*8,8)*int(b%cur,8)) then
+      stop 'pull'
+    endif
+
+  endif
+
+! Activate is zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, ZMQ_SNDMORE)
+  if (rc == -1) then
+    n_tasks = 1
+    task_id(1) = 0
+  else if (rc /= 2) then
+    print *,  irp_here//': error in sending ok'
+    stop -1
+  endif
+  rc = f77_zmq_send( zmq_socket_pull, b%mini, 8, 0)
+IRP_ENDIF
+
+end subroutine
+
--- a/src/cipsi_tc_bi_ortho/run_selection_slave.irp.f
+++ b/src/cipsi_tc_bi_ortho/run_selection_slave.irp.f
@ -0,0 +1,255 @@
+subroutine run_selection_slave(thread, iproc, energy)
+
+  use f77_zmq
+  use selection_types
+
+  implicit none
+
+  double precision, intent(in) :: energy(N_states)
+  integer,          intent(in) :: thread, iproc
+
+  integer                      :: rc, i
+  integer                      :: worker_id, task_id(1), ctask, ltask
+  character*(512)              :: task
+  integer(ZMQ_PTR)             :: zmq_to_qp_run_socket
+  integer(ZMQ_PTR)             :: zmq_socket_push
+  integer(ZMQ_PTR), external   :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR), external   :: new_zmq_push_socket
+  type(selection_buffer)       :: buf, buf2
+  type(pt2_type)               :: pt2_data
+  logical                      :: done, buffer_ready
+
+  PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+  PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+  PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+  PROVIDE psi_bilinear_matrix_transp_order N_int pt2_F pseudo_sym
+  PROVIDE psi_selectors_coef_transp_tc psi_det_sorted_tc weight_selection
+
+  call pt2_alloc(pt2_data,N_states)
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  integer, external :: connect_to_taskserver
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  buf%N = 0
+  buffer_ready = .False.
+  ctask = 1
+
+  do
+    integer, external :: get_task_from_taskserver
+    if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id(ctask), task) == -1) then
+      exit
+    endif
+    done = task_id(ctask) == 0
+    if (done) then
+      ctask = ctask - 1
+    else
+      integer :: i_generator, N, subset, bsize
+      call sscanf_ddd(task, subset, i_generator, N)
+      if(buf%N == 0) then
+        ! Only first time
+        call create_selection_buffer(N, N*2, buf)
+        buffer_ready = .True.
+      else
+        if (N /= buf%N) then
+          print *, 'N=', N
+          print *, 'buf%N=', buf%N
+          print *, 'bug in ', irp_here
+          stop '-1'
+        end if
+      end if
+      call select_connected(i_generator, energy, pt2_data, buf,subset, pt2_F(i_generator))
+    endif
+
+    integer, external :: task_done_to_taskserver
+
+    if(done .or. ctask == size(task_id)) then
+      do i=1, ctask
+         if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) == -1) then
+           call usleep(100)
+          if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id(i)) == -1) then
+            ctask = 0
+            done = .true.
+            exit
+          endif
+         endif
+      end do
+      if(ctask > 0) then
+        call sort_selection_buffer(buf)
+!        call merge_selection_buffers(buf,buf2)
+        call push_selection_results(zmq_socket_push, pt2_data, buf, task_id(1), ctask)
+        call pt2_dealloc(pt2_data)
+        call pt2_alloc(pt2_data,N_states)
+!        buf%mini = buf2%mini
+        buf%cur = 0
+      end if
+      ctask = 0
+    end if
+
+    if(done) exit
+    ctask = ctask + 1
+  end do
+
+  if(ctask > 0) then
+    call sort_selection_buffer(buf)
+!    call merge_selection_buffers(buf,buf2)
+    call push_selection_results(zmq_socket_push, pt2_data, buf, task_id(1), ctask)
+!    buf%mini = buf2%mini
+    buf%cur = 0
+  end if
+  ctask = 0
+  call pt2_dealloc(pt2_data)
+
+  integer, external :: disconnect_from_taskserver
+  if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+    continue
+  endif
+
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  call end_zmq_push_socket(zmq_socket_push,thread)
+  if (buffer_ready) then
+    call delete_selection_buffer(buf)
+!    call delete_selection_buffer(buf2)
+  endif
+end subroutine
+
+
+subroutine push_selection_results(zmq_socket_push, pt2_data, b, task_id, ntasks)
+  use f77_zmq
+  use selection_types
+  implicit none
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_push
+  type(pt2_type), intent(in)     :: pt2_data
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(in) :: ntasks, task_id(*)
+  integer :: rc
+  double precision, allocatable :: pt2_serialized(:)
+
+  rc = f77_zmq_send( zmq_socket_push, b%cur, 4, ZMQ_SNDMORE)
+  if(rc /= 4) then
+    print *,  'f77_zmq_send( zmq_socket_push, b%cur, 4, ZMQ_SNDMORE)'
+  endif
+
+
+  allocate(pt2_serialized (pt2_type_size(N_states)) )
+  call pt2_serialize(pt2_data,N_states,pt2_serialized)
+
+  rc = f77_zmq_send( zmq_socket_push, pt2_serialized, size(pt2_serialized)*8, ZMQ_SNDMORE)
+  if (rc == -1) then
+    print *,  irp_here, ': error sending result'
+    stop 3
+    return
+  else if(rc /= size(pt2_serialized)*8) then
+    stop 'push'
+  endif
+  deallocate(pt2_serialized)
+
+  if (b%cur > 0) then
+
+      rc = f77_zmq_send( zmq_socket_push, b%val(1), 8*b%cur, ZMQ_SNDMORE)
+      if(rc /= 8*b%cur) then
+        print *,  'f77_zmq_send( zmq_socket_push, b%val(1), 8*b%cur, ZMQ_SNDMORE)'
+      endif
+
+      rc = f77_zmq_send( zmq_socket_push, b%det(1,1,1), bit_kind*N_int*2*b%cur, ZMQ_SNDMORE)
+      if(rc /= bit_kind*N_int*2*b%cur) then
+        print *,  'f77_zmq_send( zmq_socket_push, b%det(1,1,1), bit_kind*N_int*2*b%cur, ZMQ_SNDMORE)'
+      endif
+
+  endif
+
+  rc = f77_zmq_send( zmq_socket_push, ntasks, 4, ZMQ_SNDMORE)
+  if(rc /= 4) then
+    print *,  'f77_zmq_send( zmq_socket_push, ntasks, 4, ZMQ_SNDMORE)'
+  endif
+
+  rc = f77_zmq_send( zmq_socket_push, task_id(1), ntasks*4, 0)
+  if(rc /= 4*ntasks) then
+    print *,  'f77_zmq_send( zmq_socket_push, task_id(1), ntasks*4, 0)'
+  endif
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if ((rc /= 2).and.(ok(1:2) /= 'ok')) then
+    print *,  irp_here//': error in receiving ok'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+
+subroutine pull_selection_results(zmq_socket_pull, pt2_data, val, det, N, task_id, ntasks)
+  use f77_zmq
+  use selection_types
+  implicit none
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  type(pt2_type), intent(inout) :: pt2_data
+  double precision, intent(out) :: val(*)
+  integer(bit_kind), intent(out) :: det(N_int, 2, *)
+  integer, intent(out) :: N, ntasks, task_id(*)
+  integer :: rc, rn, i
+  double precision, allocatable :: pt2_serialized(:)
+
+  rc = f77_zmq_recv( zmq_socket_pull, N, 4, 0)
+  if(rc /= 4) then
+    print *,  'f77_zmq_recv( zmq_socket_pull, N, 4, 0)'
+  endif
+
+  allocate(pt2_serialized (pt2_type_size(N_states)) )
+  rc = f77_zmq_recv( zmq_socket_pull, pt2_serialized, 8*size(pt2_serialized), 0)
+  if (rc == -1) then
+    ntasks = 1
+    task_id(1) = 0
+  else if(rc /= 8*size(pt2_serialized)) then
+    stop 'pull'
+  endif
+
+  call pt2_deserialize(pt2_data,N_states,pt2_serialized)
+  deallocate(pt2_serialized)
+
+  if (N>0) then
+      rc = f77_zmq_recv( zmq_socket_pull, val(1), 8*N, 0)
+      if(rc /= 8*N) then
+        print *,  'f77_zmq_recv( zmq_socket_pull, val(1), 8*N, 0)'
+      endif
+
+      rc = f77_zmq_recv( zmq_socket_pull, det(1,1,1), bit_kind*N_int*2*N, 0)
+      if(rc /= bit_kind*N_int*2*N) then
+        print *,  'f77_zmq_recv( zmq_socket_pull, det(1,1,1), bit_kind*N_int*2*N, 0)'
+      endif
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, ntasks, 4, 0)
+  if(rc /= 4) then
+    print *,  'f77_zmq_recv( zmq_socket_pull, ntasks, 4, 0)'
+  endif
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id(1), ntasks*4, 0)
+  if(rc /= 4*ntasks) then
+    print *,  'f77_zmq_recv( zmq_socket_pull, task_id(1), ntasks*4, 0)'
+  endif
+
+! Activate is zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
+  if (rc /= 2) then
+    print *,  irp_here//': error in sending ok'
+    stop -1
+  endif
+IRP_ENDIF
+end subroutine
+
+
+
--- a/src/cipsi_tc_bi_ortho/selection.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection.irp.f
--- a/src/cipsi_tc_bi_ortho/selection_buffer.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection_buffer.irp.f
@ -0,0 +1,416 @@
+
+subroutine create_selection_buffer(N, size_in, res)
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Allocates the memory for a selection buffer.
+! The arrays have dimension size_in and the maximum number of elements is N
+  END_DOC
+
+  integer, intent(in) :: N, size_in
+  type(selection_buffer), intent(out) :: res
+
+  integer :: siz
+  siz = max(size_in,1)
+
+  double precision :: rss
+  double precision, external :: memory_of_double
+  rss = memory_of_double(siz)*(N_int*2+1)
+  call check_mem(rss,irp_here)
+
+  allocate(res%det(N_int, 2, siz), res%val(siz))
+
+  res%val(:) = 0d0
+  res%det(:,:,:) = 0_8
+  res%N = N
+  res%mini = 0d0
+  res%cur = 0
+end subroutine
+
+subroutine delete_selection_buffer(b)
+  use selection_types
+  implicit none
+  type(selection_buffer), intent(inout) :: b
+  if (associated(b%det)) then
+    deallocate(b%det)
+  endif
+  if (associated(b%val)) then
+    deallocate(b%val)
+  endif
+  NULLIFY(b%det)
+  NULLIFY(b%val)
+  b%cur = 0
+  b%mini = 0.d0
+  b%N = 0
+end
+
+
+subroutine add_to_selection_buffer(b, det, val)
+  use selection_types
+  implicit none
+
+  type(selection_buffer), intent(inout) :: b
+  integer(bit_kind), intent(in) :: det(N_int, 2)
+  double precision, intent(in) :: val
+  integer :: i
+
+  if(b%N > 0 .and. val <= b%mini) then
+    b%cur += 1
+    b%det(1:N_int,1:2,b%cur) = det(1:N_int,1:2)
+    b%val(b%cur) = val
+    if(b%cur == size(b%val)) then
+      call sort_selection_buffer(b)
+    end if
+  end if
+end subroutine
+
+subroutine merge_selection_buffers(b1, b2)
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Merges the selection buffers b1 and b2 into b2
+  END_DOC
+  type(selection_buffer), intent(inout) :: b1
+  type(selection_buffer), intent(inout) :: b2
+  integer(bit_kind), pointer     :: detmp(:,:,:)
+  double precision, pointer      :: val(:)
+  integer                        :: i, i1, i2, k, nmwen, sze
+  if (b1%cur == 0) return
+  do while (b1%val(b1%cur) > b2%mini)
+    b1%cur = b1%cur-1
+    if (b1%cur == 0) then
+      return
+    endif
+  enddo
+  nmwen = min(b1%N, b1%cur+b2%cur)
+  double precision :: rss
+  double precision, external :: memory_of_double
+  sze = max(size(b1%val), size(b2%val))
+  rss = memory_of_double(sze) + 2*N_int*memory_of_double(sze)
+  call check_mem(rss,irp_here)
+  allocate(val(sze), detmp(N_int, 2, sze))
+  i1=1
+  i2=1
+  do i=1,nmwen
+    if ( (i1 > b1%cur).and.(i2 > b2%cur) ) then
+      exit
+    else if (i1 > b1%cur) then
+        val(i) = b2%val(i2)
+        detmp(1:N_int,1,i) = b2%det(1:N_int,1,i2)
+        detmp(1:N_int,2,i) = b2%det(1:N_int,2,i2)
+        i2=i2+1
+    else if (i2 > b2%cur) then
+        val(i) = b1%val(i1)
+        detmp(1:N_int,1,i) = b1%det(1:N_int,1,i1)
+        detmp(1:N_int,2,i) = b1%det(1:N_int,2,i1)
+        i1=i1+1
+    else
+      if (b1%val(i1) <= b2%val(i2)) then
+        val(i) = b1%val(i1)
+        detmp(1:N_int,1,i) = b1%det(1:N_int,1,i1)
+        detmp(1:N_int,2,i) = b1%det(1:N_int,2,i1)
+        i1=i1+1
+      else
+        val(i) = b2%val(i2)
+        detmp(1:N_int,1,i) = b2%det(1:N_int,1,i2)
+        detmp(1:N_int,2,i) = b2%det(1:N_int,2,i2)
+        i2=i2+1
+      endif
+    endif
+  enddo
+  deallocate(b2%det, b2%val)
+  do i=nmwen+1,b2%N
+    val(i) = 0.d0
+    detmp(1:N_int,1:2,i) = 0_bit_kind
+  enddo
+  b2%det => detmp
+  b2%val => val
+  b2%mini = min(b2%mini,b2%val(b2%N))
+  b2%cur = nmwen
+end
+
+
+subroutine sort_selection_buffer(b)
+  use selection_types
+  implicit none
+
+  type(selection_buffer), intent(inout) :: b
+  integer, allocatable :: iorder(:)
+  integer(bit_kind), pointer :: detmp(:,:,:)
+  integer :: i, nmwen
+  logical, external :: detEq
+  if (b%N == 0 .or. b%cur == 0) return
+  nmwen = min(b%N, b%cur)
+
+  double precision :: rss
+  double precision, external :: memory_of_double, memory_of_int
+  rss = memory_of_int(b%cur) + 2*N_int*memory_of_double(size(b%det,3))
+  call check_mem(rss,irp_here)
+  allocate(iorder(b%cur), detmp(N_int, 2, size(b%det,3)))
+  do i=1,b%cur
+    iorder(i) = i
+  end do
+  call dsort(b%val, iorder, b%cur)
+  do i=1, nmwen
+    detmp(1:N_int,1,i) = b%det(1:N_int,1,iorder(i))
+    detmp(1:N_int,2,i) = b%det(1:N_int,2,iorder(i))
+  end do
+  deallocate(b%det,iorder)
+  b%det => detmp
+  b%mini = min(b%mini,b%val(b%N))
+  b%cur = nmwen
+end subroutine
+
+subroutine make_selection_buffer_s2(b)
+  use selection_types
+  type(selection_buffer), intent(inout) :: b
+
+  integer(bit_kind), allocatable :: o(:,:,:)
+  double precision, allocatable  :: val(:)
+
+  integer :: n_d
+  integer :: i,k,sze,n_alpha,j,n
+  logical                        :: dup
+
+  ! Sort
+  integer, allocatable           :: iorder(:)
+  integer*8, allocatable         :: bit_tmp(:)
+  integer*8, external            :: configuration_search_key
+  integer(bit_kind), allocatable :: tmp_array(:,:,:)
+  logical, allocatable           :: duplicate(:)
+
+  n_d = b%cur
+  double precision :: rss
+  double precision, external :: memory_of_double
+  rss = (4*N_int+4)*memory_of_double(n_d)
+  call check_mem(rss,irp_here)
+  allocate(o(N_int,2,n_d), iorder(n_d), duplicate(n_d), bit_tmp(n_d), &
+           tmp_array(N_int,2,n_d), val(n_d) )
+
+  do i=1,n_d
+    do k=1,N_int
+      o(k,1,i) = ieor(b%det(k,1,i), b%det(k,2,i))
+      o(k,2,i) = iand(b%det(k,1,i), b%det(k,2,i))
+    enddo
+    iorder(i) = i
+    bit_tmp(i) = configuration_search_key(o(1,1,i),N_int)
+  enddo
+
+  deallocate(b%det)
+
+  call i8sort(bit_tmp,iorder,n_d)
+
+  do i=1,n_d
+    do k=1,N_int
+      tmp_array(k,1,i) = o(k,1,iorder(i))
+      tmp_array(k,2,i) = o(k,2,iorder(i))
+    enddo
+    val(i) = b%val(iorder(i))
+    duplicate(i) = .False.
+  enddo
+
+  ! Find duplicates
+  do i=1,n_d-1
+    if (duplicate(i)) then
+      cycle
+    endif
+    j = i+1
+    do while (bit_tmp(j)==bit_tmp(i))
+      if (duplicate(j)) then
+        j+=1
+        if (j>n_d) then
+          exit
+        endif
+        cycle
+      endif
+      dup = .True.
+      do k=1,N_int
+        if ( (tmp_array(k,1,i) /= tmp_array(k,1,j))                   &
+              .or. (tmp_array(k,2,i) /= tmp_array(k,2,j)) ) then
+          dup = .False.
+          exit
+        endif
+      enddo
+      if (dup) then
+        val(i) = max(val(i), val(j))
+        duplicate(j) = .True.
+      endif
+      j+=1
+      if (j>n_d) then
+        exit
+      endif
+    enddo
+  enddo
+
+  deallocate (b%val)
+  ! Copy filtered result
+  integer :: n_p
+  n_p=0
+  do i=1,n_d
+    if (duplicate(i)) then
+      cycle
+    endif
+    n_p = n_p + 1
+    do k=1,N_int
+      o(k,1,n_p) = tmp_array(k,1,i)
+      o(k,2,n_p) = tmp_array(k,2,i)
+    enddo
+    val(n_p) = val(i)
+  enddo
+
+  ! Sort by importance
+  do i=1,n_p
+    iorder(i) = i
+  end do
+  call dsort(val,iorder,n_p)
+  do i=1,n_p
+    do k=1,N_int
+      tmp_array(k,1,i) = o(k,1,iorder(i))
+      tmp_array(k,2,i) = o(k,2,iorder(i))
+    enddo
+  enddo
+  do i=1,n_p
+    do k=1,N_int
+      o(k,1,i) = tmp_array(k,1,i)
+      o(k,2,i) = tmp_array(k,2,i)
+    enddo
+  enddo
+
+  ! Create determinants
+  n_d = 0
+  do i=1,n_p
+    call configuration_to_dets_size(o(1,1,i),sze,elec_alpha_num,N_int)
+    n_d = n_d + sze
+    if (n_d > b%cur) then
+!      if (n_d - b%cur > b%cur - n_d + sze) then
+!        n_d = n_d - sze
+!      endif
+      exit
+    endif
+  enddo
+
+  rss = (4*N_int+2)*memory_of_double(n_d)
+  call check_mem(rss,irp_here)
+  allocate(b%det(N_int,2,2*n_d), b%val(2*n_d))
+  k=1
+  do i=1,n_p
+    n=n_d
+    call configuration_to_dets_size(o(1,1,i),n,elec_alpha_num,N_int)
+    call configuration_to_dets(o(1,1,i),b%det(1,1,k),n,elec_alpha_num,N_int)
+    do j=k,k+n-1
+      b%val(j) = val(i)
+    enddo
+    k = k+n
+    if (k > n_d) exit
+  enddo
+  deallocate(o)
+  b%cur = n_d
+  b%N = n_d
+end
+
+
+
+
+subroutine remove_duplicates_in_selection_buffer(b)
+  use selection_types
+  type(selection_buffer), intent(inout) :: b
+
+  integer(bit_kind), allocatable :: o(:,:,:)
+  double precision, allocatable  :: val(:)
+
+  integer :: n_d
+  integer :: i,k,sze,n_alpha,j,n
+  logical                        :: dup
+
+  ! Sort
+  integer, allocatable           :: iorder(:)
+  integer*8, allocatable         :: bit_tmp(:)
+  integer*8, external            :: det_search_key
+  integer(bit_kind), allocatable :: tmp_array(:,:,:)
+  logical, allocatable           :: duplicate(:)
+
+  n_d = b%cur
+  logical                        :: found_duplicates
+  double precision               :: rss
+  double precision, external     :: memory_of_double
+  rss = (4*N_int+4)*memory_of_double(n_d)
+  call check_mem(rss,irp_here)
+
+  found_duplicates = .False.
+  allocate(iorder(n_d), duplicate(n_d), bit_tmp(n_d), &
+           tmp_array(N_int,2,n_d), val(n_d) )
+
+  do i=1,n_d
+    iorder(i) = i
+    bit_tmp(i) = det_search_key(b%det(1,1,i),N_int)
+  enddo
+
+  call i8sort(bit_tmp,iorder,n_d)
+
+  do i=1,n_d
+    do k=1,N_int
+      tmp_array(k,1,i) = b%det(k,1,iorder(i))
+      tmp_array(k,2,i) = b%det(k,2,iorder(i))
+    enddo
+    val(i) = b%val(iorder(i))
+    duplicate(i) = .False.
+  enddo
+
+  ! Find duplicates
+  do i=1,n_d-1
+    if (duplicate(i)) then
+      cycle
+    endif
+    j = i+1
+    do while (bit_tmp(j)==bit_tmp(i))
+      if (duplicate(j)) then
+        j+=1
+        if (j>n_d) then
+          exit
+        endif
+        cycle
+      endif
+      dup = .True.
+      do k=1,N_int
+        if ( (tmp_array(k,1,i) /= tmp_array(k,1,j))                   &
+              .or. (tmp_array(k,2,i) /= tmp_array(k,2,j)) ) then
+          dup = .False.
+          exit
+        endif
+      enddo
+      if (dup) then
+        duplicate(j) = .True.
+        found_duplicates = .True.
+      endif
+      j+=1
+      if (j>n_d) then
+        exit
+      endif
+    enddo
+  enddo
+
+  if (found_duplicates) then
+
+    ! Copy filtered result
+    integer :: n_p
+    n_p=0
+    do i=1,n_d
+      if (duplicate(i)) then
+        cycle
+      endif
+      n_p = n_p + 1
+      do k=1,N_int
+        b%det(k,1,n_p) = tmp_array(k,1,i)
+        b%det(k,2,n_p) = tmp_array(k,2,i)
+      enddo
+      val(n_p) = val(i)
+    enddo
+    b%cur=n_p
+    b%N=n_p
+
+  endif
+
+end
+
+
+
--- a/src/cipsi_tc_bi_ortho/selection_types.f90
+++ b/src/cipsi_tc_bi_ortho/selection_types.f90
@ -0,0 +1,25 @@
+module selection_types
+  type selection_buffer
+    integer :: N, cur
+    integer(8)      , pointer :: det(:,:,:)
+    double precision, pointer :: val(:)
+    double precision          :: mini
+  endtype
+
+  type pt2_type
+    double precision, allocatable :: pt2(:)
+    double precision, allocatable :: rpt2(:)
+    double precision, allocatable :: variance(:)
+    double precision, allocatable :: overlap(:,:)
+  endtype
+
+  contains
+
+  integer function pt2_type_size(N)
+    implicit none
+    integer, intent(in) :: N
+    pt2_type_size = (3*n + n*n)
+  end function
+
+end module
+
--- a/src/cipsi_tc_bi_ortho/selection_weight.irp.f
+++ b/src/cipsi_tc_bi_ortho/selection_weight.irp.f
@ -0,0 +1,134 @@
+BEGIN_PROVIDER [ double precision, pt2_match_weight, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Weights adjusted along the selection to make the PT2 contributions
+ ! of each state coincide.
+ END_DOC
+ pt2_match_weight(:) = 1.d0
+END_PROVIDER
+
+
+
+BEGIN_PROVIDER [ double precision, variance_match_weight, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Weights adjusted along the selection to make the variances
+ ! of each state coincide.
+ END_DOC
+ variance_match_weight(:) = 1.d0
+END_PROVIDER
+
+
+
+subroutine update_pt2_and_variance_weights(pt2_data, N_st)
+  implicit none
+  use selection_types
+  BEGIN_DOC
+! Updates the PT2- and Variance- matching weights.
+  END_DOC
+  integer, intent(in)          :: N_st
+  type(pt2_type), intent(in)   :: pt2_data
+  double precision             :: pt2(N_st)
+  double precision             :: variance(N_st)
+
+  double precision :: avg, element, dt, x
+  integer          :: k
+  pt2(:)      = pt2_data % pt2(:)
+  variance(:) = pt2_data % variance(:)
+
+  avg = sum(pt2(1:N_st)) / dble(N_st) + 1.d-32 ! Avoid future division by zero
+
+  dt = 8.d0 !* selection_factor
+  do k=1,N_st
+    element = exp(dt*(pt2(k)/avg - 1.d0))
+    element = min(2.0d0 , element)
+    element = max(0.5d0 , element)
+    pt2_match_weight(k) *= element
+  enddo
+
+
+  avg = sum(variance(1:N_st)) / dble(N_st) + 1.d-32 ! Avoid future division by zero
+
+  do k=1,N_st
+    element = exp(dt*(variance(k)/avg -1.d0))
+    element = min(2.0d0 , element)
+    element = max(0.5d0 , element)
+    variance_match_weight(k) *= element
+  enddo
+
+  if (N_det < 100) then
+    ! For tiny wave functions, weights are 1.d0
+    pt2_match_weight(:) = 1.d0
+    variance_match_weight(:) = 1.d0
+  endif
+
+  threshold_davidson_pt2 = min(1.d-6, &
+     max(threshold_davidson, 1.e-1 * PT2_relative_error * minval(abs(pt2(1:N_states)))) )
+
+  SOFT_TOUCH pt2_match_weight variance_match_weight threshold_davidson_pt2
+end
+
+
+
+
+BEGIN_PROVIDER [ double precision, selection_weight, (N_states) ]
+   implicit none
+   BEGIN_DOC
+   ! Weights used in the selection criterion
+   END_DOC
+   select case (weight_selection)
+
+     case (0)
+      print *,  'Using input weights in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * state_average_weight(1:N_states)
+
+     case (1)
+      print *,  'Using 1/c_max^2 weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states)
+
+     case (2)
+      print *,  'Using pt2-matching weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * pt2_match_weight(1:N_states)
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+
+     case (3)
+      print *,  'Using variance-matching weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * variance_match_weight(1:N_states)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (4)
+      print *,  'Using variance- and pt2-matching weights in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * sqrt(variance_match_weight(1:N_states) * pt2_match_weight(1:N_states))
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (5)
+      print *,  'Using variance-matching weight in selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * variance_match_weight(1:N_states)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (6)
+      print *,  'Using CI coefficient-based selection'
+      selection_weight(1:N_states) = c0_weight(1:N_states)
+
+     case (7)
+      print *,  'Input weights multiplied by variance- and pt2-matching'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * sqrt(variance_match_weight(1:N_states) * pt2_match_weight(1:N_states)) * state_average_weight(1:N_states)
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+     case (8)
+      print *,  'Input weights multiplied by pt2-matching'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * pt2_match_weight(1:N_states) * state_average_weight(1:N_states)
+      print *, '# PT2 weight ', real(pt2_match_weight(:),4)
+
+     case (9)
+      print *,  'Input weights multiplied by variance-matching'
+      selection_weight(1:N_states) = c0_weight(1:N_states) * variance_match_weight(1:N_states) * state_average_weight(1:N_states)
+      print *, '# var weight ', real(variance_match_weight(:),4)
+
+    end select
+     print *, '# Total weight ', real(selection_weight(:),4)
+
+END_PROVIDER
+
--- a/src/cipsi_tc_bi_ortho/slave_cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/slave_cipsi.irp.f
@ -0,0 +1,350 @@
+subroutine run_slave_cipsi
+
+  BEGIN_DOC
+  ! Helper program for distributed parallelism
+  END_DOC
+
+  implicit none
+
+  call omp_set_max_active_levels(1)
+  distributed_davidson = .False.
+  read_wf = .False.
+  SOFT_TOUCH read_wf distributed_davidson
+  call provide_everything
+  call switch_qp_run_to_master
+  call run_slave_main
+end
+
+subroutine provide_everything
+  PROVIDE H_apply_buffer_allocated mo_two_e_integrals_in_map psi_det_generators psi_coef_generators psi_det_sorted_bit psi_selectors n_det_generators n_states generators_bitmask zmq_context N_states_diag
+  PROVIDE psi_selectors_rcoef_bi_orth_transp psi_selectors_lcoef_bi_orth_transp
+
+  PROVIDE pt2_e0_denominator mo_num N_int ci_energy mpi_master zmq_state zmq_context
+  PROVIDE psi_det psi_coef threshold_generators state_average_weight
+  PROVIDE N_det_selectors pt2_stoch_istate N_det selection_weight pseudo_sym
+end
+
+
+subroutine run_slave_main
+
+  use f77_zmq
+
+  implicit none
+  IRP_IF MPI
+    include 'mpif.h'
+  IRP_ENDIF
+
+  integer(ZMQ_PTR), external :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket
+  double precision :: energy(N_states)
+  character*(64) :: states(10)
+  character*(64) :: old_state
+  integer :: rc, i, ierr
+  double precision :: t0, t1
+
+  integer, external              :: zmq_get_dvector, zmq_get_N_det_generators
+  integer, external              :: zmq_get8_dvector
+  integer, external              :: zmq_get_ivector
+  integer, external              :: zmq_get_psi, zmq_get_N_det_selectors, zmq_get_psi_bilinear
+  integer, external              :: zmq_get_psi_notouch
+  integer, external              :: zmq_get_N_states_diag
+
+  zmq_context = f77_zmq_ctx_new ()
+  states(1) = 'selection'
+  states(2) = 'davidson'
+  states(3) = 'pt2'
+  old_state = 'Waiting'
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  PROVIDE psi_det psi_coef threshold_generators state_average_weight mpi_master
+  PROVIDE zmq_state N_det_selectors pt2_stoch_istate N_det pt2_e0_denominator
+  PROVIDE N_det_generators N_states N_states_diag pt2_e0_denominator mpi_rank
+
+  IRP_IF MPI
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  do
+
+    if (mpi_master) then
+      call wait_for_states(states,zmq_state,size(states))
+      if (zmq_state(1:64) == old_state(1:64)) then
+        call usleep(200)
+        cycle
+      else
+        old_state(1:64) = zmq_state(1:64)
+      endif
+      print *,  trim(zmq_state)
+    endif
+
+    IRP_IF MPI_DEBUG
+      print *,  irp_here, mpi_rank
+      call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+    IRP_ENDIF
+    IRP_IF MPI
+      call MPI_BCAST (zmq_state, 128, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr)
+      if (ierr /= MPI_SUCCESS) then
+        print *,  irp_here, 'error in broadcast of zmq_state'
+      endif
+    IRP_ENDIF
+
+    if(zmq_state(1:7) == 'Stopped') then
+      exit
+    endif
+
+
+    if (zmq_state(1:9) == 'selection') then
+
+      ! Selection
+      ! ---------
+
+      call wall_time(t0)
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_psi')
+      IRP_ENDIF
+      if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector threshold_generators')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector energy')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_generators')
+      IRP_ENDIF
+      if (zmq_get_N_det_generators (zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_selectors')
+      IRP_ENDIF
+      if (zmq_get_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector state_average_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector selection_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) cycle
+      pt2_e0_denominator(1:N_states) = energy(1:N_states)
+      TOUCH pt2_e0_denominator state_average_weight threshold_generators selection_weight psi_det psi_coef
+
+      if (mpi_master) then
+        print *,  'N_det', N_det
+        print *,  'N_det_generators', N_det_generators
+        print *,  'N_det_selectors', N_det_selectors
+        print *,  'pt2_e0_denominator', pt2_e0_denominator
+        print *,  'pt2_stoch_istate', pt2_stoch_istate
+        print *,  'state_average_weight', state_average_weight
+        print *,  'selection_weight', selection_weight
+      endif
+      call wall_time(t1)
+      call write_double(6,(t1-t0),'Broadcast time')
+
+      IRP_IF MPI_DEBUG
+        call mpi_print('Entering OpenMP section')
+      IRP_ENDIF
+      !$OMP PARALLEL PRIVATE(i)
+      i = omp_get_thread_num()
+      call run_selection_slave(0,i,energy)
+      !$OMP END PARALLEL
+      print *,  mpi_rank, ': Selection done'
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call mpi_print('----------')
+
+    else if (zmq_state(1:8) == 'davidson') then
+
+      ! Davidson
+      ! --------
+
+      call wall_time(t0)
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_states_diag')
+      IRP_ENDIF
+      if (zmq_get_N_states_diag(zmq_to_qp_run_socket,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_psi')
+      IRP_ENDIF
+      if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle
+
+      call wall_time(t1)
+      call write_double(6,(t1-t0),'Broadcast time')
+
+      !---
+      call omp_set_max_active_levels(8)
+      call davidson_slave_tcp(0)
+      call omp_set_max_active_levels(1)
+      print *,  mpi_rank, ': Davidson done'
+      !---
+
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call mpi_print('----------')
+
+    else if (zmq_state(1:3) == 'pt2') then
+
+      ! PT2
+      ! ---
+
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call wall_time(t0)
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_psi')
+      IRP_ENDIF
+      if (zmq_get_psi(zmq_to_qp_run_socket,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_generators')
+      IRP_ENDIF
+      if (zmq_get_N_det_generators (zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_N_det_selectors')
+      IRP_ENDIF
+      if (zmq_get_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector threshold_generators')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector energy')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'energy',energy,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_ivector pt2_stoch_istate')
+      IRP_ENDIF
+      if (zmq_get_ivector(zmq_to_qp_run_socket,1,'pt2_stoch_istate',pt2_stoch_istate,1) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector state_average_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) cycle
+      IRP_IF MPI_DEBUG
+        call mpi_print('zmq_get_dvector selection_weight')
+      IRP_ENDIF
+      if (zmq_get_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) cycle
+      pt2_e0_denominator(1:N_states) = energy(1:N_states)
+      SOFT_TOUCH pt2_e0_denominator state_average_weight pt2_stoch_istate threshold_generators selection_weight psi_det psi_coef N_det_generators N_det_selectors
+
+
+      call wall_time(t1)
+      call write_double(6,(t1-t0),'Broadcast time')
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+
+
+      IRP_IF MPI_DEBUG
+        call mpi_print('Entering OpenMP section')
+      IRP_ENDIF
+      if (.true.) then
+        integer :: nproc_target, ii
+        double precision :: mem_collector, mem, rss
+
+        call resident_memory(rss)
+
+        nproc_target = nthreads_pt2
+        ii = min(N_det, (elec_alpha_num*(mo_num-elec_alpha_num))**2)
+
+        do
+          mem = rss +                             & !
+                nproc_target * 8.d0 *             & ! bytes
+                ( 0.5d0*pt2_n_tasks_max           & ! task_id
+                + 64.d0*pt2_n_tasks_max           & ! task
+                + 3.d0*pt2_n_tasks_max*N_states   & ! pt2, variance, norm
+                + 1.d0*pt2_n_tasks_max            & ! i_generator, subset
+                + 3.d0*(N_int*2.d0*ii+ ii)        & ! selection buffer
+                + 1.d0*(N_int*2.d0*ii+ ii)        & ! sort selection buffer
+                + 2.0d0*(ii)                      & ! preinteresting, interesting,
+                                                    ! prefullinteresting, fullinteresting
+                + 2.0d0*(N_int*2*ii)              & ! minilist, fullminilist
+                + 1.0d0*(N_states*mo_num*mo_num)  & ! mat
+                ) / 1024.d0**3
+
+          if (nproc_target == 0) then
+            call check_mem(mem,irp_here)
+            nproc_target = 1
+            exit
+          endif
+
+          if (mem+rss < qp_max_mem) then
+            exit
+          endif
+
+          nproc_target = nproc_target - 1
+
+        enddo
+        
+        if (N_det > 100000) then
+
+          if (mpi_master) then
+            print *,  'N_det', N_det
+            print *,  'N_det_generators', N_det_generators
+            print *,  'N_det_selectors', N_det_selectors
+            print *,  'pt2_e0_denominator', pt2_e0_denominator
+            print *,  'pt2_stoch_istate', pt2_stoch_istate
+            print *,  'state_average_weight', state_average_weight
+            print *,  'selection_weight', selection_weight
+            print *,  'Number of threads', nproc_target
+          endif
+
+          if (h0_type == 'CFG') then
+            PROVIDE det_to_configuration
+          endif
+
+          PROVIDE global_selection_buffer pt2_N_teeth pt2_F N_det_generators
+          PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+          PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+          PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+          PROVIDE psi_bilinear_matrix_transp_order psi_selectors_coef_transp psi_det_sorted_tc
+          PROVIDE psi_selectors_rcoef_bi_orth_transp psi_selectors_lcoef_bi_orth_transp
+
+          PROVIDE psi_det_hii selection_weight pseudo_sym pt2_min_parallel_tasks
+
+          if (mpi_master) then
+            print *,  'Running PT2'
+          endif
+          !$OMP PARALLEL PRIVATE(i) NUM_THREADS(nproc_target+1)
+          i = omp_get_thread_num()
+          call run_pt2_slave(0,i,pt2_e0_denominator)
+          !$OMP END PARALLEL
+          FREE state_average_weight
+          print *,  mpi_rank, ': PT2 done'
+          print *,  '-------'
+
+        endif
+      endif
+
+      IRP_IF MPI
+        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+        if (ierr /= MPI_SUCCESS) then
+          print *,  irp_here, 'error in barrier'
+        endif
+      IRP_ENDIF
+      call mpi_print('----------')
+
+    endif
+
+  end do
+  IRP_IF MPI
+    call MPI_finalize(ierr)
+  IRP_ENDIF
+end
+
+
+
--- a/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/src/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@ -0,0 +1,149 @@
+subroutine run_stochastic_cipsi
+  use selection_types
+  implicit none
+  BEGIN_DOC
+! Selected Full Configuration Interaction with Stochastic selection and PT2.
+  END_DOC
+  integer                        :: i,j,k,ndet
+  double precision, allocatable  :: zeros(:)
+  integer                        :: to_select
+  type(pt2_type)                 :: pt2_data, pt2_data_err
+  logical, external              :: qp_stop
+  logical                        :: print_pt2
+
+  double precision :: rss
+  double precision, external :: memory_of_double
+  double precision :: correlation_energy_ratio,E_denom,E_tc,norm
+  double precision, allocatable :: ept2(:), pt1(:),extrap_energy(:)
+  PROVIDE H_apply_buffer_allocated distributed_davidson 
+
+  print*,'Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+   write(*,*)i,Fock_matrix_tc_mo_tot(i,i)
+  enddo
+  N_iter = 1
+  threshold_generators = 1.d0
+  SOFT_TOUCH threshold_generators
+
+  rss = memory_of_double(N_states)*4.d0
+  call check_mem(rss,irp_here)
+
+  allocate (zeros(N_states))
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+
+  double precision               :: hf_energy_ref
+  logical                        :: has
+  double precision               :: relative_error
+
+  relative_error=PT2_relative_error
+
+  zeros = 0.d0
+  pt2_data % pt2   = -huge(1.e0)
+  pt2_data % rpt2  = -huge(1.e0)
+  pt2_data % overlap= 0.d0
+  pt2_data % variance = huge(1.e0)
+
+  !!!! WARNING  !!!! SEEMS TO BE PROBLEM WTH make_s2_eigenfunction !!!! THE DETERMINANTS CAN APPEAR TWICE IN THE WFT DURING SELECTION
+!  if (s2_eig) then
+!    call make_s2_eigenfunction
+!  endif
+  print_pt2 = .False.
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  call routine_save_right
+
+
+  if (N_det > N_det_max) then
+    psi_det(1:N_int,1:2,1:N_det) = psi_det_sorted_tc_gen(1:N_int,1:2,1:N_det)
+    psi_coef(1:N_det,1:N_states) = psi_coef_sorted_tc_gen(1:N_det,1:N_states)
+    N_det = N_det_max
+    soft_touch N_det psi_det psi_coef
+    if (s2_eig) then
+      call make_s2_eigenfunction
+    endif
+    print_pt2 = .False.
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!    call routine_save_right
+  endif
+
+  allocate(ept2(1000),pt1(1000),extrap_energy(100))
+
+  correlation_energy_ratio = 0.d0
+
+! thresh_it_dav  = 5.d-5
+! soft_touch thresh_it_dav
+
+  print_pt2 = .True.
+  do while (                                                         &
+        (N_det < N_det_max) .and.                                    &
+        (maxval(abs(pt2_data % pt2(1:N_states))) > pt2_max)          &
+        )
+      write(*,'(A)')  '--------------------------------------------------------------------------------'
+
+
+    to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
+    to_select = max(N_states_diag, to_select)
+
+    E_denom = E_tc ! TC Energy of the current wave function 
+    call pt2_dealloc(pt2_data)
+    call pt2_dealloc(pt2_data_err)
+    call pt2_alloc(pt2_data, N_states)
+    call pt2_alloc(pt2_data_err, N_states)
+    call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error,to_select) ! Stochastic PT2 and selection
+!    stop
+
+    N_iter += 1
+
+    if (qp_stop()) exit
+
+    ! Add selected determinants
+    call copy_H_apply_buffer_to_wf_tc()
+
+    PROVIDE  psi_l_coef_bi_ortho psi_r_coef_bi_ortho
+    PROVIDE  psi_det
+    PROVIDE  psi_det_sorted_tc
+
+    ept2(N_iter-1) = E_tc + nuclear_repulsion + (pt2_data % pt2(1))/norm
+    pt1(N_iter-1) = dsqrt(pt2_data % overlap(1,1))
+    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+    if (qp_stop()) exit
+  enddo
+!  print*,'data to extrapolate '
+!  do i = 2, N_iter
+!   print*,'iteration ',i
+!   print*,'pt1,Ept2',pt1(i),ept2(i)
+!   call get_extrapolated_energy(i-1,ept2(i),pt1(i),extrap_energy(i))
+!   do j = 2, i
+!    print*,'j,e,energy',j,extrap_energy(j)
+!   enddo
+!  enddo
+
+! thresh_it_dav  = 5.d-6
+! soft_touch thresh_it_dav
+
+  call pt2_dealloc(pt2_data)
+  call pt2_dealloc(pt2_data_err)
+  call pt2_alloc(pt2_data, N_states)
+  call pt2_alloc(pt2_data_err, N_states)
+  call ZMQ_pt2(E_tc, pt2_data, pt2_data_err, relative_error,0) ! Stochastic PT2 and selection
+  call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  if (.not.qp_stop()) then
+!    if (N_det < N_det_max) then
+!     thresh_it_dav  = 5.d-7
+!     soft_touch thresh_it_dav
+!     call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!    endif
+!
+!    call pt2_dealloc(pt2_data)
+!    call pt2_dealloc(pt2_data_err)
+!    call pt2_alloc(pt2_data, N_states)
+!    call pt2_alloc(pt2_data_err, N_states)
+!    call ZMQ_pt2(E_denom, pt2_data, pt2_data_err, relative_error, 0) ! Stochastic PT2
+!    call diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+!  endif
+!  call pt2_dealloc(pt2_data)
+!  call pt2_dealloc(pt2_data_err)
+!  call routine_save_right
+
+end
+
--- a/src/cipsi_tc_bi_ortho/zmq_selection.irp.f
+++ b/src/cipsi_tc_bi_ortho/zmq_selection.irp.f
@ -0,0 +1,235 @@
+subroutine ZMQ_selection(N_in, pt2_data)
+  use f77_zmq
+  use selection_types
+
+  implicit none
+
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket , zmq_socket_pull
+  integer, intent(in)            :: N_in
+  type(selection_buffer)         :: b
+  integer                        :: i, l, N
+  integer, external              :: omp_get_thread_num
+  type(pt2_type), intent(inout)  :: pt2_data
+
+  PROVIDE psi_det psi_coef N_det qp_max_mem N_states pt2_F s2_eig N_det_generators
+
+  N = max(N_in,1)
+  N = min(N, (elec_alpha_num * (mo_num-elec_alpha_num))**2)
+  if (.True.) then
+    PROVIDE pt2_e0_denominator nproc
+    PROVIDE psi_bilinear_matrix_columns_loc psi_det_alpha_unique psi_det_beta_unique
+    PROVIDE psi_bilinear_matrix_rows psi_det_sorted_tc_order psi_bilinear_matrix_order
+    PROVIDE psi_bilinear_matrix_transp_rows_loc psi_bilinear_matrix_transp_columns
+    PROVIDE psi_bilinear_matrix_transp_order selection_weight pseudo_sym
+    PROVIDE n_act_orb n_inact_orb n_core_orb n_virt_orb n_del_orb seniority_max
+    PROVIDE excitation_beta_max  excitation_alpha_max excitation_max
+
+    call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'selection')
+
+    integer, external              :: zmq_put_psi
+    integer, external              :: zmq_put_N_det_generators
+    integer, external              :: zmq_put_N_det_selectors
+    integer, external              :: zmq_put_dvector
+
+    if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+      stop 'Unable to put psi on ZMQ server'
+    endif
+    if (zmq_put_N_det_generators(zmq_to_qp_run_socket, 1) == -1) then
+      stop 'Unable to put N_det_generators on ZMQ server'
+    endif
+    if (zmq_put_N_det_selectors(zmq_to_qp_run_socket, 1) == -1) then
+      stop 'Unable to put N_det_selectors on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',pt2_e0_denominator,size(pt2_e0_denominator)) == -1) then
+      stop 'Unable to put energy on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'state_average_weight',state_average_weight,N_states) == -1) then
+      stop 'Unable to put state_average_weight on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'selection_weight',selection_weight,N_states) == -1) then
+      stop 'Unable to put selection_weight on ZMQ server'
+    endif
+    if (zmq_put_dvector(zmq_to_qp_run_socket,1,'threshold_generators',(/threshold_generators/),1) == -1) then
+      stop 'Unable to put threshold_generators on ZMQ server'
+    endif
+    call create_selection_buffer(N, N*2, b)
+  endif
+
+  integer, external :: add_task_to_taskserver
+  character(len=100000)           :: task
+  integer :: j,k,ipos
+  ipos=1
+  task = ' '
+
+ 
+ do i= 1, N_det_generators
+    do j=1,pt2_F(i)
+      write(task(ipos:ipos+30),'(I9,1X,I9,1X,I9,''|'')') j, i, N
+      ipos += 30
+      if (ipos > 100000-30) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task to task server'
+        endif
+        ipos=1
+      endif
+    end do
+  enddo
+  if (ipos > 1) then
+    if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+      stop 'Unable to add task to task server'
+    endif
+  endif
+  N = max(N_in,1)
+
+
+  ASSERT (associated(b%det))
+  ASSERT (associated(b%val))
+
+  integer, external :: zmq_set_running
+  if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+    print *,  irp_here, ': Failed in zmq_set_running'
+  endif
+
+  integer :: nproc_target
+  if (N_det < 3*nproc) then
+    nproc_target = N_det/4
+  else
+    nproc_target = nproc
+  endif
+  double precision :: mem
+  mem = 8.d0 * N_det * (N_int * 2.d0 * 3.d0 +  3.d0 + 5.d0) / (1024.d0**3)
+  call write_double(6,mem,'Estimated memory/thread (Gb)')
+  if (qp_max_mem > 0) then
+    nproc_target = max(1,int(dble(qp_max_mem)/(0.1d0 + mem)))
+    nproc_target = min(nproc_target,nproc)
+  endif
+
+  f(:) = 1.d0
+  if (.not.do_pt2) then
+    double precision :: f(N_states), u_dot_u
+    do k=1,min(N_det,N_states)
+     f(k) = 1.d0 / u_dot_u(psi_selectors_coef(1,k), N_det_selectors)
+    enddo
+  endif
+
+  !$OMP PARALLEL DEFAULT(shared)  SHARED(b, pt2_data)  PRIVATE(i) NUM_THREADS(nproc_target+1)
+  i = omp_get_thread_num()
+  if (i==0) then
+    call selection_collector(zmq_socket_pull, b, N, pt2_data)
+  else
+    call selection_slave_inproc(i)
+  endif
+  !$OMP END PARALLEL
+
+  call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'selection')
+  if (N_in > 0) then
+    if (s2_eig) then
+      call make_selection_buffer_s2(b)
+    endif
+    call fill_H_apply_buffer_no_selection(b%cur,b%det,N_int,0)
+  endif
+  call delete_selection_buffer(b)
+
+  do k=1,N_states
+    pt2_data % pt2(k) = pt2_data % pt2(k) * f(k)
+    pt2_data % variance(k) = pt2_data % variance(k) * f(k)
+    do l=1,N_states
+      pt2_data % overlap(k,l) = pt2_data % overlap(k,l) * dsqrt(f(k)*f(l))
+      pt2_data % overlap(l,k) = pt2_data % overlap(l,k) * dsqrt(f(k)*f(l))
+    enddo
+
+    pt2_data % rpt2(k) =  &
+       pt2_data % pt2(k)/(1.d0 + pt2_data % overlap(k,k))
+  enddo
+
+  pt2_overlap(:,:) = pt2_data % overlap(:,:)
+
+  print *, 'Overlap of perturbed states:'
+  do l=1,N_states
+    print *, pt2_overlap(l,:)
+  enddo
+  print *, '-------'
+  SOFT_TOUCH pt2_overlap
+  call update_pt2_and_variance_weights(pt2_data, N_states)
+
+end subroutine
+
+
+subroutine selection_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  call run_selection_slave(1,i,pt2_e0_denominator)
+end
+
+subroutine selection_collector(zmq_socket_pull, b, N, pt2_data)
+  use f77_zmq
+  use selection_types
+  use bitmasks
+  implicit none
+
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  type(selection_buffer), intent(inout) :: b
+  integer, intent(in)            :: N
+  type(pt2_type), intent(inout)  :: pt2_data
+  type(pt2_type)                 :: pt2_data_tmp
+
+  double precision               :: pt2_mwen(N_states)
+  double precision               :: variance_mwen(N_states)
+  double precision               :: norm2_mwen(N_states)
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_pull_socket
+
+  integer :: msg_size, rc, more
+  integer :: acc, i, j, robin, ntask
+  double precision, pointer :: val(:)
+  integer(bit_kind), pointer :: det(:,:,:)
+  integer, allocatable :: task_id(:)
+  type(selection_buffer) :: b2
+
+
+
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+  call create_selection_buffer(N, N*2, b2)
+  integer :: k
+  double precision :: rss
+  double precision, external :: memory_of_int
+  rss = memory_of_int(N_det_generators)
+  call check_mem(rss,irp_here)
+  allocate(task_id(N_det_generators))
+  more = 1
+  pt2_data % pt2(:)      = 0d0
+  pt2_data % variance(:) = 0.d0
+  pt2_data % overlap(:,:) = 0.d0
+  call pt2_alloc(pt2_data_tmp,N_states)
+  do while (more == 1)
+    call pull_selection_results(zmq_socket_pull, pt2_data_tmp, b2%val(1), b2%det(1,1,1), b2%cur, task_id, ntask)
+
+    call pt2_add(pt2_data, 1.d0, pt2_data_tmp)
+    do i=1, b2%cur
+      call add_to_selection_buffer(b, b2%det(1,1,i), b2%val(i))
+      if (b2%val(i) > b%mini) exit
+    end do
+
+    do i=1, ntask
+      if(task_id(i) == 0) then
+          print *,  "Error in collector"
+      endif
+      integer, external :: zmq_delete_task
+      if (zmq_delete_task(zmq_to_qp_run_socket,zmq_socket_pull,task_id(i),more) == -1) then
+        stop 'Unable to delete task'
+      endif
+    end do
+  end do
+  call pt2_dealloc(pt2_data_tmp)
+
+
+  call delete_selection_buffer(b2)
+  call sort_selection_buffer(b)
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+end subroutine
+
--- a/src/fci_tc_bi/EZFIO.cfg
+++ b/src/fci_tc_bi/EZFIO.cfg
@ -0,0 +1,17 @@
+[energy]
+type: double precision
+doc: Calculated Selected |FCI| energy
+interface: ezfio
+size: (determinants.n_states)
+
+[energy_pt2]
+type: double precision
+doc: Calculated |FCI| energy + |PT2|
+interface: ezfio
+size: (determinants.n_states)
+
+[cipsi_tc]
+type: character*(32)
+doc: TODO
+interface: ezfio,provider,ocaml
+default: h_tc
--- a/src/fci_tc_bi/NEED
+++ b/src/fci_tc_bi/NEED
@ -0,0 +1,3 @@
+tc_bi_ortho
+davidson_undressed
+cipsi_tc_bi_ortho
--- a/src/fci_tc_bi/class.irp.f
+++ b/src/fci_tc_bi/class.irp.f
@ -0,0 +1,12 @@
+ BEGIN_PROVIDER [ logical, do_only_1h1p ]
+&BEGIN_PROVIDER [ logical, do_only_cas  ]
+&BEGIN_PROVIDER [ logical, do_ddci ]
+ implicit none
+ BEGIN_DOC
+ ! In the FCI case, all those are always false
+ END_DOC
+ do_only_1h1p = .False.
+ do_only_cas  = .False.
+ do_ddci = .False.
+END_PROVIDER
+
--- a/src/fci_tc_bi/copy_wf.irp.f
+++ b/src/fci_tc_bi/copy_wf.irp.f
@ -0,0 +1,215 @@
+
+use bitmasks
+
+subroutine copy_H_apply_buffer_to_wf_tc
+  use omp_lib
+  implicit none
+  BEGIN_DOC
+! Copies the H_apply buffer to psi_coef.
+! After calling this subroutine, N_det, psi_det and psi_coef need to be touched
+  END_DOC
+  integer(bit_kind), allocatable :: buffer_det(:,:,:)
+  double precision, allocatable  :: buffer_r_coef(:,:), buffer_l_coef(:,:)
+  integer                        :: i,j,k
+  integer                        :: N_det_old
+
+  PROVIDE H_apply_buffer_allocated
+
+
+  ASSERT (N_int > 0)
+  ASSERT (N_det > 0)
+
+  allocate ( buffer_det(N_int,2,N_det), buffer_r_coef(N_det,N_states), buffer_l_coef(N_det,N_states)  )
+
+  ! Backup determinants
+  j=0
+  do i=1,N_det
+!    if (pruned(i)) cycle  ! Pruned determinants
+    j+=1
+    ASSERT (sum(popcnt(psi_det(:,1,i))) == elec_alpha_num)
+    ASSERT (sum(popcnt(psi_det(:,2,i))) == elec_beta_num)
+    buffer_det(:,:,j) = psi_det(:,:,i)
+  enddo
+  N_det_old = j
+
+  ! Backup coefficients
+  do k=1,N_states
+    j=0
+    do i=1,N_det
+!      if (pruned(i)) cycle  ! Pruned determinants
+      j += 1
+      buffer_r_coef(j,k) = psi_r_coef_bi_ortho(i,k)
+      buffer_l_coef(j,k) = psi_l_coef_bi_ortho(i,k)
+    enddo
+    ASSERT ( j == N_det_old )
+  enddo
+
+  ! Update N_det
+  N_det = N_det_old
+  do j=0,nproc-1
+    N_det = N_det + H_apply_buffer(j)%N_det
+  enddo
+
+  ! Update array sizes
+  if (psi_det_size < N_det) then
+    psi_det_size = N_det
+    TOUCH psi_det_size
+  endif
+
+  ! Restore backup in resized array
+  do i=1,N_det_old
+    psi_det(:,:,i) = buffer_det(:,:,i)
+    ASSERT (sum(popcnt(psi_det(:,1,i))) == elec_alpha_num)
+    ASSERT (sum(popcnt(psi_det(:,2,i))) == elec_beta_num )
+  enddo
+  do k=1,N_states
+    do i=1,N_det_old
+      psi_r_coef_bi_ortho(i,k) = buffer_r_coef(i,k)
+      psi_l_coef_bi_ortho(i,k) = buffer_l_coef(i,k)
+    enddo
+  enddo
+
+  ! Copy new buffers
+
+  !$OMP PARALLEL DEFAULT(SHARED)                                     &
+      !$OMP PRIVATE(j,k,i) FIRSTPRIVATE(N_det_old)                   &
+      !$OMP SHARED(N_int,H_apply_buffer,psi_det,psi_r_coef_bi_ortho,psi_l_coef_bi_ortho,N_states,psi_det_size)
+  j=0
+  !$ j=omp_get_thread_num()
+  do k=0,j-1
+    N_det_old += H_apply_buffer(k)%N_det
+  enddo
+  do i=1,H_apply_buffer(j)%N_det
+    do k=1,N_int
+      psi_det(k,1,i+N_det_old) = H_apply_buffer(j)%det(k,1,i)
+      psi_det(k,2,i+N_det_old) = H_apply_buffer(j)%det(k,2,i)
+    enddo
+    ASSERT (sum(popcnt(psi_det(:,1,i+N_det_old))) == elec_alpha_num)
+    ASSERT (sum(popcnt(psi_det(:,2,i+N_det_old))) == elec_beta_num )
+  enddo
+  do k=1,N_states
+    do i=1,H_apply_buffer(j)%N_det
+      psi_r_coef_bi_ortho(i+N_det_old,k) = H_apply_buffer(j)%coef(i,k)
+      psi_l_coef_bi_ortho(i+N_det_old,k) = 0.d0
+    enddo
+  enddo
+  !$OMP BARRIER
+  H_apply_buffer(j)%N_det = 0
+  !$OMP END PARALLEL
+  SOFT_TOUCH N_det psi_det psi_r_coef_bi_ortho psi_l_coef_bi_ortho 
+
+  logical :: found_duplicates
+  call remove_duplicates_in_psi_det_tc(found_duplicates)
+  call bi_normalize(psi_l_coef_bi_ortho,psi_r_coef_bi_ortho,N_det,size(psi_l_coef_bi_ortho,1),N_states)
+  SOFT_TOUCH N_det psi_det psi_r_coef_bi_ortho psi_l_coef_bi_ortho
+
+end
+
+subroutine remove_duplicates_in_psi_det_tc(found_duplicates)
+  implicit none
+  logical, intent(out) :: found_duplicates
+  BEGIN_DOC
+! Removes duplicate determinants in the wave function.
+  END_DOC
+  integer                        :: i,j,k
+  integer(bit_kind), allocatable :: bit_tmp(:)
+  logical,allocatable            :: duplicate(:)
+  logical                        :: dup
+
+  allocate (duplicate(N_det), bit_tmp(N_det))
+
+  found_duplicates = .False.
+
+  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k,dup)
+
+  !$OMP DO
+  do i=1,N_det
+    integer, external            :: det_search_key
+    !$DIR FORCEINLINE
+    bit_tmp(i) = det_search_key(psi_det_sorted_bit_tc(1,1,i),N_int)
+    duplicate(i) = .False.
+  enddo
+  !$OMP END DO
+
+  !$OMP DO schedule(dynamic,1024)
+  do i=1,N_det-1
+    if (duplicate(i)) then
+      cycle
+    endif
+    j = i+1
+    do while (bit_tmp(j)==bit_tmp(i))
+      if (duplicate(j)) then
+        j = j+1
+        if (j > N_det) then
+          exit
+        else
+          cycle
+        endif
+      endif
+      dup = .True.
+      do k=1,N_int
+        if ( (psi_det_sorted_bit_tc(k,1,i) /= psi_det_sorted_bit_tc(k,1,j) ) &
+        .or. (psi_det_sorted_bit_tc(k,2,i) /= psi_det_sorted_bit_tc(k,2,j) ) ) then
+          dup = .False.
+          exit
+        endif
+      enddo
+      if (dup) then
+        duplicate(j) = .True.
+        found_duplicates = .True.
+      endif
+      j += 1
+      if (j > N_det) then
+        exit
+      endif
+    enddo
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  if (found_duplicates) then
+    k=0
+    do i=1,N_det
+      if (.not.duplicate(i)) then
+        k += 1
+        psi_det(:,:,k) = psi_det_sorted_bit_tc (:,:,i)
+        psi_r_coef_bi_ortho(k,:)  = psi_r_coef_sorted_bit(i,:)
+        psi_l_coef_bi_ortho(k,:)  = psi_l_coef_sorted_bit(i,:)
+      else
+        if (sum(abs(psi_r_coef_sorted_bit(i,:))) /= 0.d0 ) then
+          psi_r_coef_bi_ortho(k,:)  = psi_r_coef_sorted_bit(i,:)
+          psi_l_coef_bi_ortho(k,:)  = psi_l_coef_sorted_bit(i,:)
+        endif
+      endif
+    enddo
+    N_det = k
+    psi_det_sorted_bit_tc(:,:,1:N_det) = psi_det(:,:,1:N_det)
+    psi_r_coef_sorted_bit(1:N_det,:) = psi_r_coef_bi_ortho(1:N_det,:)
+    psi_l_coef_sorted_bit(1:N_det,:) = psi_l_coef_bi_ortho(1:N_det,:)
+    TOUCH N_det psi_det psi_det_sorted_bit_tc c0_weight psi_r_coef_sorted_bit psi_l_coef_sorted_bit
+  endif
+  psi_det = psi_det_sorted_tc
+  psi_r_coef_bi_ortho = psi_r_coef_sorted_bi_ortho
+  psi_l_coef_bi_ortho = psi_l_coef_sorted_bi_ortho
+  SOFT_TOUCH psi_det psi_r_coef_bi_ortho psi_l_coef_bi_ortho psi_det_sorted_bit_tc psi_r_coef_sorted_bit psi_l_coef_sorted_bit
+  deallocate (duplicate,bit_tmp)
+end
+
+
+ BEGIN_PROVIDER [ integer(bit_kind), psi_det_sorted_bit_tc, (N_int,2,psi_det_size) ]
+&BEGIN_PROVIDER [ double precision, psi_r_coef_sorted_bit, (N_det,N_states) ]
+&BEGIN_PROVIDER [ double precision, psi_l_coef_sorted_bit, (N_det,N_states) ]
+   implicit none
+   BEGIN_DOC
+   ! Determinants on which we apply $\langle i|H|psi \rangle$ for perturbation.
+   ! They are sorted by determinants interpreted as integers. Useful
+   ! to accelerate the search of a random determinant in the wave
+   ! function.
+   END_DOC
+
+   call sort_dets_by_det_search_key(N_det, psi_det, psi_r_coef_bi_ortho, size(psi_r_coef_bi_ortho,1),       &
+       psi_det_sorted_bit_tc, psi_r_coef_sorted_bit, N_states)
+   call sort_dets_by_det_search_key(N_det, psi_det, psi_l_coef_bi_ortho, size(psi_l_coef_bi_ortho,1),       &
+       psi_det_sorted_bit_tc, psi_l_coef_sorted_bit, N_states)
+
+END_PROVIDER
--- a/src/fci_tc_bi/diagonalize_ci.irp.f
+++ b/src/fci_tc_bi/diagonalize_ci.irp.f
@ -0,0 +1,100 @@
+
+subroutine diagonalize_CI_tc_bi_ortho(ndet, E_tc,norm,pt2_data,print_pt2)
+  use selection_types
+  implicit none
+  integer, intent(inout)              :: ndet      ! number of determinants from before 
+  double precision, intent(inout)     :: E_tc,norm ! E and norm from previous wave function 
+  type(pt2_type)  , intent(in)        :: pt2_data  ! PT2 from previous wave function 
+  logical, intent(in) :: print_pt2
+  BEGIN_DOC
+!  Replace the coefficients of the CI states by the coefficients of the
+!  eigenstates of the CI matrix
+  END_DOC
+  integer :: i,j
+ double precision :: pt2_tmp,pt1_norm,rpt2_tmp,abs_pt2
+ pt2_tmp = pt2_data % pt2(1)
+ abs_pt2 = pt2_data % variance(1)
+ pt1_norm = pt2_data % overlap(1,1)
+ rpt2_tmp = pt2_tmp/(1.d0 + pt1_norm)
+  print*,'*****'
+  print*,'New wave function information'
+  print*,'N_det tc               = ',N_det
+  print*,'norm_ground_left_right_bi_orth = ',norm_ground_left_right_bi_orth
+  print*,'eigval_right_tc = ',eigval_right_tc_bi_orth(1)
+  print*,'Ndet, E_tc = ',N_det,eigval_right_tc_bi_orth(1)
+  print*,'*****'
+  if(print_pt2)then
+   print*,'*****'
+   print*,'previous wave function info'
+   print*,'norm(before)      = ',norm
+   print*,'E(before)         = ',E_tc 
+   print*,'PT1 norm          = ',dsqrt(pt1_norm)
+   print*,'PT2               = ',pt2_tmp
+   print*,'rPT2              = ',rpt2_tmp
+   print*,'|PT2|             = ',abs_pt2
+   print*,'Positive PT2      = ',(pt2_tmp + abs_pt2)*0.5d0
+   print*,'Negative PT2      = ',(pt2_tmp - abs_pt2)*0.5d0
+   print*,'E(before) + PT2   = ',E_tc + pt2_tmp/norm
+   print*,'E(before) +rPT2   = ',E_tc + rpt2_tmp/norm
+   write(*,'(A28,X,I10,X,100(F16.8,X))')'Ndet,E,E+PT2,E+RPT2,|PT2|=',ndet,E_tc ,E_tc  + pt2_tmp/norm,E_tc  + rpt2_tmp/norm,abs_pt2
+   print*,'*****'
+  endif
+  E_tc  = eigval_right_tc_bi_orth(1)
+  norm  = norm_ground_left_right_bi_orth
+  ndet  = N_det
+  do j=1,N_states
+    do i=1,N_det
+      psi_l_coef_bi_ortho(i,j) = leigvec_tc_bi_orth(i,j)
+      psi_r_coef_bi_ortho(i,j) = reigvec_tc_bi_orth(i,j)
+      psi_coef(i,j) = dabs(psi_l_coef_bi_ortho(i,j) * psi_r_coef_bi_ortho(i,j))   
+    enddo
+  enddo
+  SOFT_TOUCH  eigval_left_tc_bi_orth  eigval_right_tc_bi_orth  leigvec_tc_bi_orth  reigvec_tc_bi_orth norm_ground_left_right_bi_orth psi_coef  psi_l_coef_bi_ortho psi_r_coef_bi_ortho
+
+
+
+  call save_tc_bi_ortho_wavefunction
+end
+
+subroutine print_CI_dressed(ndet, E_tc,norm,pt2_data,print_pt2)
+  use selection_types
+  implicit none
+  integer, intent(inout)              :: ndet      ! number of determinants from before 
+  double precision, intent(inout)     :: E_tc,norm ! E and norm from previous wave function 
+  type(pt2_type)  , intent(in)        :: pt2_data  ! PT2 from previous wave function 
+  logical, intent(in) :: print_pt2
+  BEGIN_DOC
+!  Replace the coefficients of the CI states by the coefficients of the
+!  eigenstates of the CI matrix
+  END_DOC
+  integer :: i,j
+  print*,'*****'
+  print*,'New wave function information'
+  print*,'N_det tc               = ',N_det
+  print*,'norm_ground_left_right_bi_orth = ',norm_ground_left_right_bi_orth
+  print*,'eigval_right_tc = ',eigval_right_tc_bi_orth(1)
+  print*,'Ndet, E_tc = ',N_det,eigval_right_tc_bi_orth(1)
+  print*,'*****'
+  if(print_pt2)then
+   print*,'*****'
+   print*,'previous wave function info'
+   print*,'norm(before)      = ',norm
+   print*,'E(before)         = ',E_tc
+   print*,'PT1 norm          = ',dsqrt(pt2_data % overlap(1,1))
+   print*,'E(before) + PT2   = ',E_tc + (pt2_data % pt2(1))/norm
+   print*,'PT2               = ',pt2_data % pt2(1)
+   print*,'Ndet, E_tc, E+PT2 = ',ndet,E_tc,E_tc + (pt2_data % pt2(1))/norm,dsqrt(pt2_data % overlap(1,1))
+   print*,'*****'
+  endif
+  E_tc  = eigval_right_tc_bi_orth(1)
+  norm  = norm_ground_left_right_bi_orth
+  ndet  = N_det
+  do j=1,N_states
+    do i=1,N_det
+      psi_coef(i,j) = reigvec_tc_bi_orth(i,j)
+    enddo
+  enddo
+  SOFT_TOUCH  eigval_left_tc_bi_orth  eigval_right_tc_bi_orth  leigvec_tc_bi_orth  norm_ground_left_right_bi_orth  psi_coef  reigvec_tc_bi_orth 
+
+end
+
--- a/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/src/fci_tc_bi/fci_tc_bi_ortho.irp.f
@ -0,0 +1,85 @@
+program fci
+  implicit none
+  BEGIN_DOC
+  ! Selected Full Configuration Interaction with stochastic selection
+  ! and PT2.
+  !
+  ! This program performs a |CIPSI|-like selected |CI| using a
+  ! stochastic scheme for both the selection of the important Slater
+  ! determinants and the computation of the |PT2| correction. This
+  ! |CIPSI|-like algorithm will be performed for the lowest states of
+  ! the variational space (see :option:`determinants n_states`). The
+  ! |FCI| program will stop when reaching at least one the two following
+  ! conditions:
+  !
+  ! * number of Slater determinants > :option:`determinants n_det_max`
+  ! * abs(|PT2|) less than :option:`perturbation pt2_max`
+  !
+  ! The following other options can be of interest:
+  !
+  ! :option:`determinants read_wf`
+  !   When set to |false|, the program starts with a ROHF-like Slater
+  !   determinant as a guess wave function. When set to |true|, the
+  !   program starts with the wave function(s) stored in the |EZFIO|
+  !   directory as guess wave function(s).
+  !
+  ! :option:`determinants s2_eig`
+  !   When set to |true|, the selection will systematically add all the
+  !   necessary Slater determinants in order to have a pure spin wave
+  !   function with an |S^2| value corresponding to
+  !   :option:`determinants expected_s2`.
+  !
+  ! For excited states calculations, it is recommended to start with
+  ! :ref:`cis` or :ref:`cisd` guess wave functions, eventually in
+  ! a restricted set of |MOs|, and to set :option:`determinants s2_eig`
+  ! to |true|.
+  !
+  END_DOC
+
+
+  my_grid_becke = .True.
+  my_n_pt_r_grid = 30
+  my_n_pt_a_grid = 50
+  touch  my_grid_becke my_n_pt_r_grid my_n_pt_a_grid 
+  pruning = -1.d0
+  touch pruning
+!  pt2_relative_error = 0.01d0
+!  touch pt2_relative_error
+  call run_cipsi_tc
+
+end
+
+
+subroutine run_cipsi_tc
+
+  implicit none
+
+  if (.not.is_zmq_slave) then
+    PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    if(elec_alpha_num+elec_beta_num.ge.3)then
+      if(three_body_h_tc)then
+        call provide_all_three_ints_bi_ortho
+      endif
+    endif
+    ! ---
+
+    if (do_pt2) then
+      call run_stochastic_cipsi
+    else
+      call run_cipsi
+    endif
+
+  else
+    PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    if(elec_alpha_num+elec_beta_num.ge.3)then
+      if(three_body_h_tc)then
+        call provide_all_three_ints_bi_ortho
+      endif
+    endif
+    ! ---
+
+    call run_slave_cipsi
+
+  endif
+
+end
--- a/src/fci_tc_bi/generators.irp.f
+++ b/src/fci_tc_bi/generators.irp.f
@ -0,0 +1,51 @@
+use bitmasks
+
+BEGIN_PROVIDER [ integer, N_det_generators ]
+ implicit none
+ BEGIN_DOC
+ ! For Single reference wave functions, the number of generators is 1 : the
+ ! Hartree-Fock determinant
+ END_DOC
+ integer :: i
+ double precision :: norm
+ call write_time(6)
+ norm = 1.d0
+ N_det_generators = N_det
+ do i=1,N_det
+   norm = norm - psi_average_norm_contrib_sorted_tc(i)
+   if (norm - 1.d-10 < 1.d0 - threshold_generators) then
+     N_det_generators = i
+     exit
+   endif
+ enddo
+ N_det_generators = max(N_det_generators,1)
+ call write_int(6,N_det_generators,'Number of generators')
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer(bit_kind), psi_det_generators, (N_int,2,psi_det_size) ]
+&BEGIN_PROVIDER [ double precision, psi_coef_generators, (psi_det_size,N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! For Single reference wave functions, the generator is the
+ ! Hartree-Fock determinant
+ END_DOC
+ psi_det_generators(1:N_int,1:2,1:N_det) = psi_det_sorted_tc(1:N_int,1:2,1:N_det)
+ psi_coef_generators(1:N_det,1:N_states) = psi_coef_sorted_tc(1:N_det,1:N_states)
+
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer(bit_kind), psi_det_sorted_tc_gen, (N_int,2,psi_det_size) ]
+&BEGIN_PROVIDER [ double precision, psi_coef_sorted_tc_gen, (psi_det_size,N_states) ]
+&BEGIN_PROVIDER [ integer, psi_det_sorted_tc_gen_order,     (psi_det_size)  ]
+
+ implicit none
+ BEGIN_DOC
+ ! For Single reference wave functions, the generator is the
+ ! Hartree-Fock determinant
+ END_DOC
+ psi_det_sorted_tc_gen = psi_det_sorted_tc
+ psi_coef_sorted_tc_gen = psi_coef_sorted_tc
+ psi_det_sorted_tc_gen_order = psi_det_sorted_tc_order
+END_PROVIDER
+
+
--- a/src/fci_tc_bi/save_energy.irp.f
+++ b/src/fci_tc_bi/save_energy.irp.f
@ -0,0 +1,9 @@
+subroutine save_energy(E,pt2)
+  implicit none
+  BEGIN_DOC
+! Saves the energy in |EZFIO|.
+  END_DOC
+  double precision, intent(in) :: E(N_states), pt2(N_states)
+  call ezfio_set_fci_tc_energy(E(1:N_states))
+  call ezfio_set_fci_tc_energy_pt2(E(1:N_states)+pt2(1:N_states))
+end
--- a/src/fci_tc_bi/scripts_fci_tc/CH2.xyz
+++ b/src/fci_tc_bi/scripts_fci_tc/CH2.xyz
@ -0,0 +1,6 @@
+3
+
+C                    6.000000     0.000000     0.000000     0.173480    
+H                    1.000000     0.000000    -0.861500    -0.520430
+H                    1.000000     0.000000     0.861500    -0.520430
+
--- a/src/fci_tc_bi/scripts_fci_tc/FH.xyz
+++ b/src/fci_tc_bi/scripts_fci_tc/FH.xyz
@ -0,0 +1,5 @@
+2 
+
+H                    0.000000     0.000000    -0.825120
+F                    0.000000     0.000000     0.091680
+
--- a/src/fci_tc_bi/scripts_fci_tc/extract_tables.sh
+++ b/src/fci_tc_bi/scripts_fci_tc/extract_tables.sh
@ -0,0 +1,16 @@
+
+input=h2o
+basis=dz
+EZFIO=${input}_${basis}_bi_ortho
+file=${EZFIO}.tc_fci.out 
+grep "Ndet,E,E+PT2,E+RPT2,|PT2|=" ${file} | cut -d "=" -f 2 > data_${EZFIO}
+file=${EZFIO}.tc_fci_normal_order.out
+grep "Ndet,E,E+PT2,E+RPT2=" ${file} | cut -d "=" -f 2 > data_${EZFIO}_normal
+
+#EZFIO=${input}_${basis}_ortho
+#file=${EZFIO}.tc_fci.out 
+#grep "Ndet, E_tc, E+PT2 =" ${file} | cut -d "=" -f 2 > data_${EZFIO}
+#file=${EZFIO}.tc_fci_normal_order.out
+#grep "Ndet, E_tc, E+PT2 =" ${file} | cut -d "=" -f 2 > data_${EZFIO}_normal
+
+#zip data_${input}_${basis}.zip data*
--- a/src/fci_tc_bi/scripts_fci_tc/h2o.sh
+++ b/src/fci_tc_bi/scripts_fci_tc/h2o.sh
@ -0,0 +1,41 @@
+#!/bin/bash
+# This is a sample PBS script
+# temps CPU a ajuster au calcul
+   #PBS -l cput=2000:00:00
+   #PBS -l nodes=1:ppn=16
+# memoire a ajuster au calcul
+   #PBS -l vmem=100gb
+# a changer
+# Pour savoir sur quel noeud on est
+#echo $HOSTNAME
+# Startdir = ou sont les fichiers d'input, par defaut HOMEDIR
+#
+StartDir=$PBS_O_WORKDIR
+echo $StartDir
+#
+# SCRATCHDIR = espace temporaire (local au noeud et a vider apres le calcul)
+# NE PAS MODIFIER
+ulimit -s unlimited
+export SCRATCHDIR=/scratch/$USER/$PBS_JOBID
+#
+cd $StartDir
+
+
+############################################################################
+#### EXAMPLE OF SCRIPT TO RUN A CIPSI CALCULATION ON 5 STATES ON THE Ne^+ CATION
+#### USING NATURAL ORBITALS OF A SMALL CIPSI AS MOS 
+#### ALL STATES WILL HAVE THE SAME SPIN SIMETRY : A DOUBLET 
+
+####### YOU PUT THE PATH TO YOUR 
+QP_ROOT=/home_lct/eginer/programs/qp2
+source ${QP_ROOT}/quantum_package.rc 
+####### YOU LOAD SOME LIBRARIES 
+alias python3='/programmes/installation/Python/3.7.1/bin/python3'
+type -a python3
+
+export OMP_NUM_THREADS=16
+
+module load intel2016_OMPI-V2
+
+source ~/programs/qp2/quantum_package.rc
+./script.sh h2o dz O 1
--- a/src/fci_tc_bi/scripts_fci_tc/h2o.xyz
+++ b/src/fci_tc_bi/scripts_fci_tc/h2o.xyz
@ -0,0 +1,6 @@
+3
+
+O      0.000000     0.000000     0.000000
+H      0.000000     0.000000     0.957200
+H     -0.926627     0.000000    -0.239987
+
--- a/src/fci_tc_bi/scripts_fci_tc/script.sh
+++ b/src/fci_tc_bi/scripts_fci_tc/script.sh
@ -0,0 +1,31 @@
+source /home_lct/eginer/qp2/quantum_package.rc
+input=$1
+ basis=$2
+ atom=$3
+ mul=$4
+ EXPORT_OMP_NUM_THREADS=16
+ dir=${input}_${basis}
+ mkdir ${dir}
+ cp ${input}.xyz ${dir}/
+ cd $dir
+ EZFIO=${input}_${basis}_bi_ortho
+ qp create_ezfio -b "${atom}:cc-pcv${basis}|H:cc-pv${basis}" ${input}.xyz -m $mul -o $EZFIO
+ qp run scf
+ # Getting THE GOOD VALUE OF MU
+ qp run print_mu_av_tc  | tee ${EZFIO_FILE}.mu_av.out
+ mu=`grep "average_mu_rs_c_lda  =" ${EZFIO_FILE}.mu_av.out | cut -d "=" -f 2`
+ qp set ao_two_e_erf_ints mu_erf $mu
+ # Carrying the BI-ORTHO TC-SCF 
+ qp run tc_scf | tee ${EZFIO_FILE}.tc_scf.out
+ #   Three body terms without normal order
+ ### THREE E TERMS FOR FCI
+ qp set tc_keywords three_body_h_tc True  
+ qp set tc_keywords double_normal_ord False
+ qp set perturbation pt2_max 0.003
+ qp run fci_tc_bi_ortho | tee ${EZFIO_FILE}.tc_fci.out
+ #   Three body terms with normal order
+ qp set tc_keywords double_normal_ord True
+ qp run fci_tc_bi_ortho | tee ${EZFIO_FILE}.tc_fci_normal_order.out
+
+cd ../
+
--- a/src/fci_tc_bi/selectors.irp.f
+++ b/src/fci_tc_bi/selectors.irp.f
@ -0,0 +1,100 @@
+use bitmasks
+
+BEGIN_PROVIDER [ double precision, threshold_selectors ]
+ implicit none
+ BEGIN_DOC
+ ! Thresholds on selectors (fraction of the square of the norm)
+ END_DOC
+ threshold_selectors = dsqrt(threshold_generators)
+END_PROVIDER
+
+BEGIN_PROVIDER [ integer, N_det_selectors]
+  implicit none
+  BEGIN_DOC
+  ! For Single reference wave functions, the number of selectors is 1 : the
+  ! Hartree-Fock determinant
+  END_DOC
+  integer                        :: i
+  double precision               :: norm, norm_max
+  call write_time(6)
+  N_det_selectors = N_det
+  norm = 1.d0
+  do i=1,N_det
+    norm = norm - psi_average_norm_contrib_tc(i)
+    if (norm - 1.d-10 < 1.d0 - threshold_selectors) then
+      N_det_selectors = i
+      exit
+    endif
+  enddo
+  N_det_selectors = max(N_det_selectors,N_det_generators)
+  call write_int(6,N_det_selectors,'Number of selectors')
+END_PROVIDER
+
+ BEGIN_PROVIDER [ integer(bit_kind), psi_selectors, (N_int,2,psi_selectors_size) ]
+&BEGIN_PROVIDER [ double precision, psi_selectors_coef, (psi_selectors_size,N_states) ]
+&BEGIN_PROVIDER [ double precision, psi_selectors_coef_tc, (psi_selectors_size,2,N_states) ]
+  implicit none
+  BEGIN_DOC
+  ! Determinants on which we apply <i|H|psi> for perturbation.
+  END_DOC
+  integer                        :: i,k
+
+  do i=1,N_det_selectors
+    do k=1,N_int
+      psi_selectors(k,1,i) = psi_det_sorted_tc(k,1,i)
+      psi_selectors(k,2,i) = psi_det_sorted_tc(k,2,i)
+    enddo
+  enddo
+  do k=1,N_states
+    do i=1,N_det_selectors
+      psi_selectors_coef(i,k) = psi_coef_sorted_tc_gen(i,k)
+      psi_selectors_coef_tc(i,1,k) = psi_l_coef_sorted_bi_ortho(i,k)
+      psi_selectors_coef_tc(i,2,k) = psi_r_coef_sorted_bi_ortho(i,k)
+!      psi_selectors_coef_tc(i,1,k) = 1.d0
+!      psi_selectors_coef_tc(i,2,k) = 1.d0
+    enddo
+  enddo
+
+END_PROVIDER
+
+ BEGIN_PROVIDER [ double precision, psi_selectors_coef_transp, (N_states,psi_selectors_size) ]
+&BEGIN_PROVIDER [ double precision, psi_selectors_coef_transp_tc, (N_states,2,psi_selectors_size) ]
+  implicit none
+  BEGIN_DOC
+  ! Transposed psi_selectors
+  END_DOC
+  integer                        :: i,k
+
+  do i=1,N_det_selectors
+    do k=1,N_states
+      psi_selectors_coef_transp(k,i) = psi_selectors_coef(i,k)
+      psi_selectors_coef_transp_tc(k,1,i) = psi_selectors_coef_tc(i,1,k)
+      psi_selectors_coef_transp_tc(k,2,i) = psi_selectors_coef_tc(i,2,k)
+    enddo
+  enddo
+END_PROVIDER
+
+ BEGIN_PROVIDER [ double precision, psi_selectors_rcoef_bi_orth_transp, (N_states, psi_det_size) ]
+&BEGIN_PROVIDER [ double precision, psi_selectors_lcoef_bi_orth_transp, (N_states, psi_det_size) ]
+
+  implicit none
+  integer :: i, k
+
+  psi_selectors_rcoef_bi_orth_transp = 0.d0
+  psi_selectors_lcoef_bi_orth_transp = 0.d0
+
+  print*,'N_det,N_det_selectors',N_det,N_det_selectors
+  do i = 1, N_det_selectors
+    do k = 1, N_states
+      psi_selectors_rcoef_bi_orth_transp(k,i) = psi_r_coef_sorted_bi_ortho(i,k)
+      psi_selectors_lcoef_bi_orth_transp(k,i) = psi_l_coef_sorted_bi_ortho(i,k)
+    enddo
+  enddo
+
+END_PROVIDER
+
+BEGIN_PROVIDER [ integer, psi_selectors_size ]
+ implicit none
+ psi_selectors_size = psi_det_size
+END_PROVIDER
+
--- a/src/fci_tc_bi/zmq.irp.f
+++ b/src/fci_tc_bi/zmq.irp.f
@ -0,0 +1,103 @@
+BEGIN_TEMPLATE
+
+integer function zmq_put_$X(zmq_to_qp_run_socket,worker_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Put $X on the qp_run scheduler
+  END_DOC
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+  integer, intent(in)            :: worker_id
+  integer                        :: rc
+  character*(256)                :: msg
+
+  zmq_put_$X = 0
+
+  write(msg,'(A,1X,I8,1X,A200)') 'put_data '//trim(zmq_state), worker_id, '$X'
+  rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),ZMQ_SNDMORE)
+  if (rc /= len(trim(msg))) then
+    zmq_put_$X = -1
+    return
+  endif
+
+  rc = f77_zmq_send(zmq_to_qp_run_socket,$X,4,0)
+  if (rc /= 4) then
+    zmq_put_$X = -1
+    return
+  endif
+
+  rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0)
+  if (msg(1:rc) /= 'put_data_reply ok') then
+    zmq_put_$X = -1
+    return
+  endif
+
+end
+
+integer function zmq_get_$X(zmq_to_qp_run_socket, worker_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Get $X from the qp_run scheduler
+  END_DOC
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+  integer, intent(in)            :: worker_id
+  integer                        :: rc
+  character*(256)                :: msg
+
+  PROVIDE zmq_state
+  zmq_get_$X = 0
+  if (mpi_master) then
+
+    write(msg,'(A,1X,I8,1X,A200)') 'get_data '//trim(zmq_state), worker_id, '$X'
+    rc = f77_zmq_send(zmq_to_qp_run_socket,trim(msg),len(trim(msg)),0)
+    if (rc /= len(trim(msg))) then
+      zmq_get_$X = -1
+      go to 10
+    endif
+
+    rc = f77_zmq_recv(zmq_to_qp_run_socket,msg,len(msg),0)
+    if (msg(1:14) /= 'get_data_reply') then
+      zmq_get_$X = -1
+      go to 10
+    endif
+
+    rc = f77_zmq_recv(zmq_to_qp_run_socket,$X,4,0)
+    if (rc /= 4) then
+      zmq_get_$X = -1
+      go to 10
+    endif
+
+  endif
+
+  10 continue
+
+  IRP_IF MPI_DEBUG
+    print *,  irp_here, mpi_rank
+    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
+  IRP_ENDIF
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+
+    call MPI_BCAST (zmq_get_$X, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      print *,  irp_here//': Unable to broadcast N_det_generators'
+      stop -1
+    endif
+    call MPI_BCAST ($X, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      print *,  irp_here//': Unable to broadcast N_det_generators'
+      stop -1
+    endif
+  IRP_ENDIF
+
+end
+
+SUBST [ X ]
+
+N_det_generators ;;
+N_det_selectors ;;
+
+END_TEMPLATE
+