Added davidson without S2

2025-04-24 17:24:54 +02:00 · 2021-02-17 00:46:58 +01:00 · 2021-02-17 00:46:58 +01:00 · 554579492b
commit 554579492b
parent 7aa191523d
17 changed files with 2243 additions and 416 deletions
--- a/src/cipsi/NEED
+++ b/src/cipsi/NEED
@ -4,3 +4,4 @@ mpi
 davidson_undressed
 iterations
 two_body_rdm
+csf
--- a/src/cipsi/selection.irp.f
+++ b/src/cipsi/selection.irp.f
@ -700,7 +700,7 @@ subroutine fill_buffer_double(i_generator, sp, h1, h2, bannedOrb, banned, fock_d
        endif
      enddo

-      do_diag = sum(dabs(coef)) > 0.001d0
+      do_diag = sum(dabs(coef)) > 0.001d0 .and. N_states > 1

      double precision :: eigvalues(N_states+1)
      double precision :: work(1+6*(N_states+1)+2*(N_states+1)**2)
--- a/src/csf/NEED
+++ b/src/csf/NEED
@ -0,0 +1 @@
+determinants
--- a/src/determinants/configuration_CI_sigma_helpers.irp.f
+++ b/src/determinants/configuration_CI_sigma_helpers.irp.f
--- a/src/determinants/configuration_CI_sigma_helpers.org
+++ b/src/determinants/configuration_CI_sigma_helpers.org
--- a/src/determinants/configurations.irp.f
+++ b/src/determinants/configurations.irp.f
--- a/src/csf/create_excitations.irp.f
+++ b/src/csf/create_excitations.irp.f
@ -0,0 +1,279 @@
+subroutine do_single_excitation_cfg(key_in,key_out,i_hole,i_particle,ok)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Applies the single excitation operator to a configuration
+  ! If the excitation is possible, ok is True
+  END_DOC
+  integer, intent(in)            :: i_hole,i_particle
+  integer(bit_kind), intent(in)  :: key_in(N_int,2)
+  logical , intent(out)          :: ok
+  integer                        :: k,j,i
+  integer(bit_kind)              :: mask
+  integer(bit_kind)              :: key_out(N_int,2)
+
+  ASSERT (i_hole > 0)
+  ASSERT (i_particle <= mo_num)
+
+  ok = .True.
+  key_out(:,:) = key_in(:,:)
+
+  ! hole
+  k = shiftr(i_hole-1,bit_kind_shift)+1
+  j = i_hole-shiftl(k-1,bit_kind_shift)-1
+  mask = ibset(0_bit_kind,j)
+
+  ! Check if the position j is singly occupied
+  ! 1  ->  0  (SOMO)
+  ! 0      0  (DOMO)
+  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
+     key_out(k,1) = ibclr(key_out(k,1),j)
+
+  ! Check if the position j is doubly occupied
+  ! 0  ->  1  (SOMO)
+  ! 1      0  (DOMO)
+  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
+     key_out(k,1) = ibset(key_out(k,1),j)
+     key_out(k,2) = ibclr(key_out(k,2),j)
+
+  ! The position j is unoccupied: Not OK
+  ! 0  ->  0  (SOMO)
+  ! 0      0  (DOMO)
+  else
+     ok =.False.
+     return
+  endif
+
+
+  ! particle
+  k = shiftr(i_particle-1,bit_kind_shift)+1
+  j = i_particle-shiftl(k-1,bit_kind_shift)-1
+  mask = ibset(0_bit_kind,j)
+
+  ! Check if the position j is singly occupied
+  ! 1  ->  0  (SOMO)
+  ! 0      1  (DOMO)
+  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
+     key_out(k,1) = ibclr(key_out(k,1),j)
+     key_out(k,2) = ibset(key_out(k,2),j)
+
+  ! Check if the position j is doubly occupied : Not OK
+  ! 0  ->  1  (SOMO)
+  ! 1      0  (DOMO)
+  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
+     ok = .False.
+     return
+
+  ! Position at j is unoccupied
+  ! 0  ->  0  (SOMO)
+  ! 0      0  (DOMO)
+  else
+     key_out(k,1) = ibset(key_out(k,1),j)
+  endif
+
+end
+
+subroutine do_single_excitation_cfg_with_type(key_in,key_out,i_hole,i_particle,ex_type,ok)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Applies the single excitation operator to a configuration
+  ! Returns the type of excitation in ex_type
+  ! where the following convention is used
+  ! 1 = (SOMO -> SOMO) 1 change in Nsomo
+  ! 2 = (DOMO ->  VMO) 1 change in Nsomo
+  ! 3 = (SOMO ->  VMO) 0 change in Nsomo
+  ! 4 = (DOMO -> SOMO) 0 change in Nsomo
+  ! If the excitation is possible, ok is True
+  END_DOC
+  integer, intent(in)            :: i_hole,i_particle
+  integer(bit_kind), intent(in)  :: key_in(N_int,2)
+  integer , intent(out)          :: ex_type
+  logical , intent(out)          :: ok
+  integer                        :: k,j,i
+  integer(bit_kind)              :: mask
+  integer(bit_kind)              :: key_out(N_int,2)
+  logical                        :: isholeSOMO
+  logical                        :: isparticleSOMO
+  logical                        :: isholeDOMO
+  logical                        :: isparticleVMO
+  isholeSOMO = .False.
+  isholeDOMO = .False.
+  isparticleSOMO = .False.
+  isparticleVMO = .False.
+
+  ASSERT (i_hole > 0)
+  ASSERT (i_particle <= mo_num)
+
+  ok = .True.
+  key_out(:,:) = key_in(:,:)
+
+  ! hole
+  k = shiftr(i_hole-1,bit_kind_shift)+1
+  j = i_hole-shiftl(k-1,bit_kind_shift)-1
+  mask = ibset(0_bit_kind,j)
+
+  ! Check if the position j is singly occupied
+  ! 1  ->  0  (SOMO)
+  ! 0      0  (DOMO)
+  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
+     key_out(k,1) = ibclr(key_out(k,1),j)
+     isholeSOMO = .True.
+
+  ! Check if the position j is doubly occupied
+  ! 0  ->  1  (SOMO)
+  ! 1      0  (DOMO)
+  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
+     key_out(k,1) = ibset(key_out(k,1),j)
+     key_out(k,2) = ibclr(key_out(k,2),j)
+     isholeDOMO = .True.
+
+  ! The position j is unoccupied: Not OK
+  ! 0  ->  0  (SOMO)
+  ! 0      0  (DOMO)
+  else
+     ok =.False.
+     return
+  endif
+
+
+  ! particle
+  k = shiftr(i_particle-1,bit_kind_shift)+1
+  j = i_particle-shiftl(k-1,bit_kind_shift)-1
+  mask = ibset(0_bit_kind,j)
+
+  ! Check if the position j is singly occupied
+  ! 1  ->  0  (SOMO)
+  ! 0      1  (DOMO)
+  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
+     key_out(k,1) = ibclr(key_out(k,1),j)
+     key_out(k,2) = ibset(key_out(k,2),j)
+     isparticleSOMO = .True.
+
+  ! Check if the position j is doubly occupied : Not OK
+  ! 0  ->  1  (SOMO)
+  ! 1      0  (DOMO)
+  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
+     ok = .False.
+     return
+
+  ! Position at j is unoccupied
+  ! 0  ->  0  (SOMO)
+  ! 0      0  (DOMO)
+  else
+     key_out(k,1) = ibset(key_out(k,1),j)
+     isparticleVMO = .True.
+  endif
+
+  if(isholeSOMO) then
+      ! two possibilities
+      ! particle is SOMO or VMO
+      if(isparticleSOMO) then
+          ! SOMO -> SOMO
+          ex_type = 1
+      else
+          ! SOMO -> VMO
+          ex_type = 3
+      endif 
+   else
+       ! two possibilities
+       ! particle is SOMO or VMO
+       if(isparticleSOMO) then
+           ! DOMO -> SOMO
+           ex_type = 4
+       else
+           ! DOMO -> VMO
+           ex_type = 2
+       endif
+    endif
+
+end
+
+subroutine generate_all_singles_cfg(cfg,singles,n_singles,Nint)
+  implicit none
+  use bitmasks
+  BEGIN_DOC
+  ! Generate all single excitation wrt a configuration
+  !
+  ! n_singles : on input, max number of singles :
+  !   elec_alpha_num * (mo_num - elec_beta_num)
+  !   on output, number of generated singles
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer, intent(inout)         :: n_singles
+  integer(bit_kind), intent(in)  :: cfg(Nint,2)
+  integer(bit_kind), intent(out) :: singles(Nint,2,*)
+
+  integer           :: i,k, n_singles_ma, i_hole, i_particle
+  integer(bit_kind) :: single(Nint,2)
+  logical           :: i_ok
+
+  n_singles = 0
+  !TODO
+  !Make list of Somo  and Domo for holes
+  !Make list of Unocc and Somo for particles
+  do i_hole = 1, mo_num
+    do i_particle = 1, mo_num
+      call do_single_excitation_cfg(cfg,single,i_hole,i_particle,i_ok)
+      if (i_ok) then
+        n_singles = n_singles + 1
+        do k=1,Nint
+          singles(k,1,n_singles) = single(k,1)
+          singles(k,2,n_singles) = single(k,2)
+        enddo
+      endif
+    enddo
+  enddo
+end
+
+subroutine generate_all_singles_cfg_with_type(cfgInp,singles,idxs_singles,pq_singles,ex_type_singles,n_singles,Nint)
+  implicit none
+  use bitmasks
+  BEGIN_DOC
+  ! Generate all single excitation wrt a configuration
+  !
+  ! n_singles : on input, max number of singles :
+  !   elec_alpha_num * (mo_num - elec_beta_num)
+  !   on output, number of generated singles
+  ! ex_type_singles : on output contains type of excitations  :
+  !
+  END_DOC
+  integer, intent(in)            :: Nint
+  integer, intent(inout)         :: n_singles
+  integer, intent(out)           :: idxs_singles(*)
+  integer, intent(out)           :: ex_type_singles(*)
+  integer, intent(out)           :: pq_singles(2,*)
+  integer(bit_kind), intent(in)  :: cfgInp(Nint,2)
+  integer(bit_kind), intent(out) :: singles(Nint,2,*)
+  integer(bit_kind)              :: Jdet(Nint,2)
+
+  integer           :: i,k, n_singles_ma, i_hole, i_particle, ex_type, addcfg
+  integer(bit_kind) :: single(Nint,2)
+  logical           :: i_ok
+
+  n_singles = 0
+  !TODO
+  !Make list of Somo  and Domo for holes
+  !Make list of Unocc and Somo for particles
+  do i_hole = 1+n_core_orb, n_core_orb + n_act_orb
+    do i_particle = 1+n_core_orb, n_core_orb + n_act_orb
+      if(i_hole .EQ. i_particle) cycle
+      addcfg = -1
+      call do_single_excitation_cfg_with_type(cfgInp,single,i_hole,i_particle,ex_type,i_ok)
+      if (i_ok) then
+        call binary_search_cfg(single,addcfg)
+        if(addcfg .EQ. -1) cycle
+        n_singles = n_singles + 1
+        do k=1,Nint
+          singles(k,1,n_singles) = single(k,1)
+          singles(k,2,n_singles) = single(k,2)
+          ex_type_singles(n_singles) = ex_type
+          pq_singles(1,n_singles) = i_hole     ! p
+          pq_singles(2,n_singles) = i_particle ! q
+          idxs_singles(n_singles) = addcfg
+        enddo
+      endif
+    enddo
+  enddo
+end
+
--- a/src/davidson/NEED
+++ b/src/davidson/NEED
@ -1 +1 @@
-determinants
+csf
--- a/src/davidson/davidson_parallel.irp.f
+++ b/src/davidson/davidson_parallel.irp.f
@ -329,6 +329,7 @@ end subroutine



+
 subroutine davidson_collector(zmq_to_qp_run_socket, zmq_socket_pull, v0, s0, sze, N_st)
  use f77_zmq
  implicit none
@ -377,7 +378,6 @@ end subroutine



-
 subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
  use omp_lib
  use bitmasks
@ -538,6 +538,10 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
 end


+
+
+
+
 BEGIN_PROVIDER [ integer, nthreads_davidson ]
 implicit none
 BEGIN_DOC
--- a/src/davidson/davidson_parallel_nos2.irp.f
+++ b/src/davidson/davidson_parallel_nos2.irp.f
@ -0,0 +1,495 @@
+use bitmasks
+use f77_zmq
+
+
+subroutine davidson_nos2_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  call davidson_nos2_run_slave(1,i)
+end
+
+
+subroutine davidson_nos2_slave_tcp(i)
+  implicit none
+  integer, intent(in)            :: i
+  call davidson_nos2_run_slave(0,i)
+end
+
+
+
+subroutine davidson_nos2_run_slave(thread,iproc)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Slave routine for Davidson's diagonalization.
+  END_DOC
+
+  integer,  intent(in)           :: thread, iproc
+
+  integer                        :: worker_id, task_id, blockb
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  integer, external              :: connect_to_taskserver
+  integer                        :: doexit, send, receive
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  doexit = 0
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    doexit=1
+  endif
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    send = doexit
+    call MPI_AllReduce(send, receive, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      doexit=1
+    endif
+    doexit = receive
+  IRP_ENDIF
+  if (doexit>0) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  call davidson_nos2_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_states_diag, N_det, worker_id)
+
+  integer, external :: disconnect_from_taskserver
+  if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+    call sleep(1)
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+      print *,  irp_here, ': disconnect failed'
+      continue
+    endif
+  endif
+
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  call end_zmq_push_socket(zmq_socket_push)
+
+end subroutine
+
+
+
+subroutine davidson_nos2_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, worker_id)
+  use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR),intent(in)   :: zmq_to_qp_run_socket
+  integer(ZMQ_PTR),intent(in)   :: zmq_socket_push
+  integer,intent(in)             :: worker_id, N_st, sze
+  integer                        :: task_id
+  character*(512)                :: msg
+  integer                        :: imin, imax, ishift, istep
+
+  integer, allocatable           :: psi_det_read(:,:,:)
+  double precision, allocatable  :: v_t(:,:), u_t(:,:)
+
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t
+
+  ! Get wave function (u_t)
+  ! -----------------------
+
+  integer                        :: rc, ni, nj
+  integer*8                      :: rc8
+  integer                        :: N_states_read, N_det_read, psi_det_size_read
+  integer                        :: N_det_selectors_read, N_det_generators_read
+
+  integer, external :: zmq_get_dvector
+  integer, external :: zmq_get_dmatrix
+
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  allocate(u_t(N_st,N_det))
+
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (size(u_t,kind=8) < 8388608_8) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = int(size(u_t,kind=8)/8388608_8,4) + 1
+  endif
+
+  do while (zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1)
+    print *,  'mpi_rank, N_states_diag, N_det'
+    print *,  mpi_rank, N_states_diag, N_det
+    stop 'u_t'
+  enddo
+
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+
+    call broadcast_chunks_double(u_t,size(u_t,kind=8))
+
+  IRP_ENDIF
+
+  ! Run tasks
+  ! ---------
+
+  logical :: sending
+  sending=.False.
+
+  allocate(v_t(N_st,N_det))
+  do
+    integer, external :: get_task_from_taskserver
+    integer, external :: task_done_to_taskserver
+    if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
+      exit
+    endif
+    if(task_id == 0) exit
+    read (msg,*) imin, imax, ishift, istep
+    integer :: k
+    do k=imin,imax
+      v_t(:,k) = 0.d0
+    enddo
+    call H_u_0_nstates_openmp_work(v_t,u_t,N_st,N_det,imin,imax,ishift,istep)
+    if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
+        print *,  irp_here, 'Unable to send task_done'
+    endif
+    call davidson_push_results_async_recv(zmq_socket_push, sending)
+    call davidson_nos2_push_results_async_send(zmq_socket_push, v_t, imin, imax, task_id, sending)
+  end do
+  deallocate(u_t,v_t)
+  call davidson_push_results_async_recv(zmq_socket_push, sending)
+
+end subroutine
+
+
+
+subroutine davidson_nos2_push_results(zmq_socket_push, v_t, imin, imax, task_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  double precision    ,intent(in)    :: v_t(N_states_diag,N_det)
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_nos2_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_nos2_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_nos2_push_results failed to push imax'
+
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_nos2_push_results failed to push vt'
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if ((rc /= 2).and.(ok(1:2)/='ok')) then
+    print *, irp_here, ': f77_zmq_recv( zmq_socket_push, ok, 2, 0)'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+subroutine davidson_nos2_push_results_async_send(zmq_socket_push, v_t, imin, imax, task_id,sending)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  double precision    ,intent(in)    :: v_t(N_states_diag,N_det)
+  logical             ,intent(inout) :: sending
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  if (sending) then
+    print *,  irp_here, ': sending=true'
+    stop -1
+  endif
+  sending = .True.
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_nos2_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_nos2_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_nos2_push_results failed to push imax'
+
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_nos2_push_results failed to push vt'
+
+
+end subroutine
+
+
+
+
+subroutine davidson_nos2_pull_results(zmq_socket_pull, v_t, imin, imax, task_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Pull the results of $H | U \rangle$ on the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)     :: zmq_socket_pull
+  integer             ,intent(out)    :: task_id, imin, imax
+  double precision    ,intent(out)    :: v_t(N_states_diag,N_det)
+
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
+  if(rc /= 4) stop 'davidson_nos2_pull_results failed to pull task_id'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
+  if(rc /= 4) stop 'davidson_nos2_pull_results failed to pull imin'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
+  if(rc /= 4) stop 'davidson_nos2_pull_results failed to pull imax'
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc8 = f77_zmq_recv8( zmq_socket_pull, v_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8*sz) stop 'davidson_nos2_pull_results failed to pull v_t'
+
+! Activate if zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
+  if (rc /= 2) then
+    print *,  irp_here, ' : f77_zmq_send (zmq_socket_pull,...'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+
+
+
+subroutine davidson_nos2_collector(zmq_to_qp_run_socket, zmq_socket_pull, v0, sze, N_st)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Routine collecting the results of the workers in Davidson's algorithm.
+  END_DOC
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  integer, intent(in)            :: sze, N_st
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+
+  double precision    ,intent(inout) :: v0(sze, N_st)
+
+  integer                          :: more, task_id, imin, imax
+
+  double precision, allocatable :: v_t(:,:)
+  logical :: sending
+  integer :: i,j
+  integer, external :: zmq_delete_task_async_send
+  integer, external :: zmq_delete_task_async_recv
+
+  allocate(v_t(N_st,N_det))
+  v0 = 0.d0
+  more = 1
+  sending = .False.
+  do while (more == 1)
+    call davidson_nos2_pull_results(zmq_socket_pull, v_t, imin, imax, task_id)
+    if (zmq_delete_task_async_send(zmq_to_qp_run_socket,task_id,sending) == -1) then
+      stop 'davidson: Unable to delete task (send)'
+    endif
+    do j=1,N_st
+      do i=imin,imax
+        v0(i,j) = v0(i,j) + v_t(j,i)
+      enddo
+    enddo
+    if (zmq_delete_task_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
+      stop 'davidson: Unable to delete task (recv)'
+    endif
+  end do
+  deallocate(v_t)
+
+end subroutine
+
+
+
+
+
+subroutine H_u_0_nstates_zmq(v_0,u_0,N_st,sze)
+  use omp_lib
+  use bitmasks
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_0 = H | u_0\rangle$
+  !
+  ! n : number of determinants
+  !
+  ! H_jj : array of $\langle j | H | j \rangle$
+  END_DOC
+  integer, intent(in)            :: N_st, sze
+  double precision, intent(out)  :: v_0(sze,N_st)
+  double precision, intent(inout):: u_0(sze,N_st)
+  integer                        :: i,j,k
+  integer                        :: ithread
+  double precision, allocatable  :: u_t(:,:)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson')
+
+!  integer :: N_states_diag_save
+!  N_states_diag_save = N_states_diag
+!  N_states_diag = N_st
+  if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
+    stop 'Unable to put N_states_diag on ZMQ server'
+  endif
+
+  if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+    stop 'Unable to put psi on ZMQ server'
+  endif
+  energy = 0.d0
+  if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then
+    stop 'Unable to put energy on ZMQ server'
+  endif
+
+
+  ! Create tasks
+  ! ============
+
+  integer :: istep, imin, imax, ishift, ipos
+  integer, external :: add_task_to_taskserver
+  integer, parameter :: tasksize=20000
+  character*(100000) :: task
+  istep=1
+  ishift=0
+  imin=1
+
+
+  ipos=1
+  do imin=1,N_det,tasksize
+    imax = min(N_det,imin-1+tasksize)
+    if (imin==1) then
+      istep = 2
+    else
+      istep = 1
+    endif
+    do ishift=0,istep-1
+      write(task(ipos:ipos+50),'(4(I11,1X),1X,1A)') imin, imax, ishift, istep, '|'
+      ipos = ipos+50
+      if (ipos > 100000-50) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task'
+        endif
+        ipos=1
+      endif
+    enddo
+  enddo
+
+  if (ipos > 1) then
+    if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+      stop 'Unable to add task'
+    endif
+    ipos=1
+  endif
+
+  allocate(u_t(N_st,N_det))
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+
+  call dtranspose(                                                   &
+      u_0,                                                           &
+      size(u_0, 1),                                                  &
+      u_t,                                                           &
+      size(u_t, 1),                                                  &
+      N_det, N_st)
+
+
+  ASSERT (N_st == N_states_diag)
+  ASSERT (sze >= N_det)
+
+  integer :: rc, ni, nj
+  integer*8 :: rc8
+  double precision :: energy(N_st)
+
+  integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
+  integer, external :: zmq_put_dmatrix
+
+  if (size(u_t) < 8388608) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = size(u_t)/8388608 + 1
+  endif
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (zmq_put_dmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
+    stop 'Unable to put u_t on ZMQ server'
+  endif
+
+  deallocate(u_t)
+
+  integer, external :: zmq_set_running
+  if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+    print *,  irp_here, ': Failed in zmq_set_running'
+  endif
+
+  call omp_set_max_active_levels(4)
+  !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(2) PRIVATE(ithread)
+  ithread = omp_get_thread_num()
+  if (ithread == 0 ) then
+    call davidson_nos2_collector(zmq_to_qp_run_socket, zmq_socket_pull, v_0, N_det, N_st)
+  else
+    call davidson_nos2_slave_inproc(1)
+  endif
+  !$OMP END PARALLEL
+  call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'davidson')
+
+  !$OMP PARALLEL
+  !$OMP SINGLE
+  do k=1,N_st
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+  enddo
+  !$OMP END SINGLE
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+!  N_states_diag = N_states_diag_save
+!  SOFT_TOUCH N_states_diag
+end
+
--- a/src/davidson/diagonalization_h_dressed.irp.f
+++ b/src/davidson/diagonalization_h_dressed.irp.f
@ -0,0 +1,568 @@
+subroutine davidson_diag_h(dets_in,u_in,dim_in,energies,sze,N_st,N_st_diag,Nint,dressing_state,converged)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization.
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, N_st, N_st_diag, Nint
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag)
+  double precision, intent(out)  :: energies(N_st_diag)
+  integer, intent(in)            :: dressing_state
+  logical, intent(out)           :: converged
+  double precision, allocatable  :: H_jj(:)
+
+  double precision, external     :: diag_H_mat_elem, diag_S_mat_elem
+  integer                        :: i,k
+  ASSERT (N_st > 0)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+  PROVIDE mo_two_e_integrals_in_map
+  allocate(H_jj(sze))
+
+  H_jj(1) = diag_h_mat_elem(dets_in(1,1,1),Nint)
+  !$OMP PARALLEL DEFAULT(NONE)                                       &
+      !$OMP  SHARED(sze,H_jj, dets_in,Nint)                    &
+      !$OMP  PRIVATE(i)
+  !$OMP DO SCHEDULE(static)
+  do i=2,sze
+    H_jj(i)  = diag_H_mat_elem(dets_in(1,1,i),Nint)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  if (dressing_state > 0) then
+    do k=1,N_st
+      do i=1,sze
+        H_jj(i)  += u_in(i,k) * dressing_column_h(i,k)
+      enddo
+    enddo
+  endif
+
+  call davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag,Nint,dressing_state,converged)
+  deallocate (H_jj)
+end
+
+
+subroutine davidson_diag_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,N_st,N_st_diag_in,Nint,dressing_state,converged)
+  use bitmasks
+  use mmap_module
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization with specific diagonal elements of the H matrix
+  !
+  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! N_st_diag_in : Number of states in which H is diagonalized. Assumed > sze
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, N_st, N_st_diag_in, Nint
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision,  intent(in)  :: H_jj(sze)
+  integer, intent(in)            :: dressing_state
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag_in)
+  double precision, intent(out)  :: energies(N_st_diag_in)
+
+  integer                        :: iter, N_st_diag
+  integer                        :: i,j,k,l,m
+  logical, intent(inout)         :: converged
+
+  double precision, external     :: u_dot_v, u_dot_u
+
+  integer                        :: k_pairs, kl
+
+  integer                        :: iter2, itertot
+  double precision, allocatable  :: y(:,:), h(:,:), lambda(:)
+  double precision, allocatable  :: s_tmp(:,:)
+  double precision               :: diag_h_mat_elem
+  double precision, allocatable  :: residual_norm(:)
+  character*(16384)              :: write_buffer
+  double precision               :: to_print(2,N_st)
+  double precision               :: cpu, wall
+  integer                        :: shift, shift2, itermax, istate
+  double precision               :: r1, r2, alpha
+  logical                        :: state_ok(N_st_diag_in*davidson_sze_max)
+  integer                        :: nproc_target
+  integer                        :: order(N_st_diag_in)
+  double precision               :: cmax
+  double precision, allocatable  :: U(:,:), overlap(:,:)
+  double precision, pointer      :: W(:,:)
+  logical                        :: disk_based
+  double precision               :: energy_shift(N_st_diag_in*davidson_sze_max)
+
+  include 'constants.include.F'
+
+  N_st_diag = N_st_diag_in
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, S, y, S_d, h, lambda
+  if (N_st_diag*3 > sze) then
+    print *,  'error in Davidson :'
+    print *,  'Increase n_det_max_full to ', N_st_diag*3
+    stop -1
+  endif
+
+  itermax = max(2,min(davidson_sze_max, sze/N_st_diag))+1
+  itertot = 0
+
+  if (state_following) then
+    allocate(overlap(N_st_diag*itermax, N_st_diag*itermax))
+  else
+    allocate(overlap(1,1))  ! avoid 'if' for deallocate
+  endif
+  overlap = 0.d0
+
+  PROVIDE nuclear_repulsion expected_s2 psi_bilinear_matrix_order psi_bilinear_matrix_order_reverse threshold_davidson_pt2 threshold_davidson_from_pt2
+
+  call write_time(6)
+  write(6,'(A)') ''
+  write(6,'(A)') 'Davidson Diagonalization'
+  write(6,'(A)') '------------------------'
+  write(6,'(A)') ''
+
+  ! Find max number of cores to fit in memory
+  ! -----------------------------------------
+
+  nproc_target = nproc
+  double precision :: rss
+  integer :: maxab
+  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
+
+  m=1
+  disk_based = .False.
+  call resident_memory(rss)
+  do
+    r1 = 8.d0 *                                   &! bytes
+         ( dble(sze)*(N_st_diag*itermax)          &! U
+         + 1.0d0*dble(sze*m)*(N_st_diag*itermax)  &! W
+         + 3.0d0*(N_st_diag*itermax)**2           &! h,y,s_tmp
+         + 1.d0*(N_st_diag*itermax)               &! lambda
+         + 1.d0*(N_st_diag)                       &! residual_norm
+                                                   ! In H_u_0_nstates_zmq
+         + 2.d0*(N_st_diag*N_det)                 &! u_t, v_t, on collector
+         + 2.d0*(N_st_diag*N_det)                 &! u_t, v_t, on slave
+         + 0.5d0*maxab                            &! idx0 in H_u_0_nstates_openmp_work_*
+         + nproc_target *                         &! In OMP section
+           ( 1.d0*(N_int*maxab)                   &! buffer
+           + 3.5d0*(maxab) )                      &! singles_a, singles_b, doubles, idx
+         ) / 1024.d0**3
+
+    if (nproc_target == 0) then
+      call check_mem(r1,irp_here)
+      nproc_target = 1
+      exit
+    endif
+
+    if (r1+rss < qp_max_mem) then
+      exit
+    endif
+
+    if (itermax > 4) then
+      itermax = itermax - 1
+    else if (m==1.and.disk_based_davidson) then
+      m=0
+      disk_based = .True.
+      itermax = 6
+    else
+      nproc_target = nproc_target - 1
+    endif
+
+  enddo
+  nthreads_davidson = nproc_target
+  TOUCH nthreads_davidson
+  call write_int(6,N_st,'Number of states')
+  call write_int(6,N_st_diag,'Number of states in diagonalization')
+  call write_int(6,sze,'Number of determinants')
+  call write_int(6,nproc_target,'Number of threads for diagonalization')
+  call write_double(6, r1, 'Memory(Gb)')
+  if (disk_based) then
+    print *, 'Using swap space to reduce RAM'
+  endif
+
+  !---------------
+
+  write(6,'(A)') ''
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = 'Iter'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//'       Energy        Residual '
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+
+
+  if (disk_based) then
+    ! Create memory-mapped files for W and S
+    type(c_ptr) :: ptr_w, ptr_s
+    integer :: fd_s, fd_w
+    call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
+        8, fd_w, .False., ptr_w)
+    call c_f_pointer(ptr_w, w, (/sze,N_st_diag*itermax/))
+  else
+    allocate(W(sze,N_st_diag*itermax))
+  endif
+
+  allocate(                                                          &
+      ! Large
+      U(sze,N_st_diag*itermax),                                      &
+
+      ! Small
+      h(N_st_diag*itermax,N_st_diag*itermax),                        &
+      y(N_st_diag*itermax,N_st_diag*itermax),                        &
+      s_tmp(N_st_diag*itermax,N_st_diag*itermax),                    &
+      residual_norm(N_st_diag),                                      &
+      lambda(N_st_diag*itermax))
+
+  h = 0.d0
+  U = 0.d0
+  y = 0.d0
+  s_tmp = 0.d0
+
+
+  ASSERT (N_st > 0)
+  ASSERT (N_st_diag >= N_st)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+
+  ! Davidson iterations
+  ! ===================
+
+  converged = .False.
+
+  do k=N_st+1,N_st_diag
+    do i=1,sze
+        call random_number(r1)
+        call random_number(r2)
+        r1 = dsqrt(-2.d0*dlog(r1))
+        r2 = dtwo_pi*r2
+        u_in(i,k) = r1*dcos(r2) * u_in(i,k-N_st)
+    enddo
+    u_in(k,k) = u_in(k,k) + 10.d0
+  enddo
+  do k=1,N_st_diag
+    call normalize(u_in(1,k),sze)
+  enddo
+
+  do k=1,N_st_diag
+    do i=1,sze
+      U(i,k) = u_in(i,k)
+    enddo
+  enddo
+
+
+  do while (.not.converged)
+    itertot = itertot+1
+    if (itertot == 8) then
+      exit
+    endif
+
+    do iter=1,itermax-1
+
+      shift  = N_st_diag*(iter-1)
+      shift2 = N_st_diag*iter
+
+      if ((iter > 1).or.(itertot == 1)) then
+        ! Compute |W_k> = \sum_i |i><i|H|u_k>
+        ! -----------------------------------
+
+        if (disk_based) then
+          call ortho_qr_unblocked(U,size(U,1),sze,shift2)
+          call ortho_qr_unblocked(U,size(U,1),sze,shift2)
+        else
+          call ortho_qr(U,size(U,1),sze,shift2)
+          call ortho_qr(U,size(U,1),sze,shift2)
+        endif
+
+        if ((sze > 100000).and.distributed_davidson) then
+            call H_u_0_nstates_zmq   (W(1,shift+1),U(1,shift+1),N_st_diag,sze)
+        else
+            call H_u_0_nstates_openmp(W(1,shift+1),U(1,shift+1),N_st_diag,sze)
+        endif
+      else
+         ! Already computed in update below
+         continue
+      endif
+
+      if (dressing_state > 0) then
+
+        if (N_st == 1) then
+
+          l = dressed_column_idx(1)
+          double precision :: f
+          f = 1.0d0/psi_coef(l,1)
+          do istate=1,N_st_diag
+            do i=1,sze
+              W(i,shift+istate) += dressing_column_h(i,1) *f * U(l,shift+istate)
+              W(l,shift+istate) += dressing_column_h(i,1) *f * U(i,shift+istate)
+            enddo
+
+          enddo
+
+        else
+
+          call dgemm('T','N', N_st, N_st_diag, sze, 1.d0, &
+            psi_coef, size(psi_coef,1), &
+            U(1,shift+1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+
+          call dgemm('N','N', sze, N_st_diag, N_st, 1.0d0, &
+            dressing_column_h, size(dressing_column_h,1), s_tmp, size(s_tmp,1), &
+            1.d0, W(1,shift+1), size(W,1))
+
+
+          call dgemm('T','N', N_st, N_st_diag, sze, 1.d0, &
+            dressing_column_h, size(dressing_column_h,1), &
+            U(1,shift+1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+
+          call dgemm('N','N', sze, N_st_diag, N_st, 1.0d0, &
+            psi_coef, size(psi_coef,1), s_tmp, size(s_tmp,1), &
+            1.d0, W(1,shift+1), size(W,1))
+
+        endif
+      endif
+
+      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
+      ! -------------------------------------------
+
+      call dgemm('T','N', shift2, shift2, sze,                       &
+          1.d0, U, size(U,1), W, size(W,1),                          &
+          0.d0, h, size(h,1))
+
+      ! Diagonalize h
+      ! ---------------
+
+      call lapack_diag(lambda,y,h,size(h,1),shift2)
+
+      ! Compute Energy for each eigenvector
+      ! -----------------------------------
+
+      call dgemm('N','N',shift2,shift2,shift2,                       &
+          1.d0, h, size(h,1), y, size(y,1),                          &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      call dgemm('T','N',shift2,shift2,shift2,                       &
+          1.d0, y, size(y,1), s_tmp, size(s_tmp,1),                  &
+          0.d0, h, size(h,1))
+
+      do k=1,shift2
+        lambda(k) = h(k,k)
+      enddo
+
+      if (state_following) then
+
+        overlap = -1.d0
+        do k=1,shift2
+          do i=1,shift2
+            overlap(k,i) = dabs(y(k,i))
+          enddo
+        enddo
+        do k=1,N_st
+          cmax = -1.d0
+          do i=1,N_st
+            if (overlap(i,k) > cmax) then
+              cmax = overlap(i,k)
+              order(k) = i
+            endif
+          enddo
+          do i=1,N_st_diag
+            overlap(order(k),i) = -1.d0
+          enddo
+        enddo
+        overlap = y
+        do k=1,N_st
+          l = order(k)
+          if (k /= l) then
+            y(1:shift2,k) = overlap(1:shift2,l)
+          endif
+        enddo
+        do k=1,N_st
+          overlap(k,1) = lambda(k)
+        enddo
+
+      endif
+
+
+      ! Express eigenvectors of h in the determinant basis
+      ! --------------------------------------------------
+
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, U, size(U,1), y, size(y,1), 0.d0, U(1,shift2+1), size(U,1))
+      call dgemm('N','N', sze, N_st_diag, shift2,                    &
+          1.d0, W, size(W,1), y, size(y,1), 0.d0, W(1,shift2+1), size(W,1))
+
+      ! Compute residual vector and davidson step
+      ! -----------------------------------------
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k)
+      do k=1,N_st_diag
+        do i=1,sze
+          U(i,shift2+k) =  &
+            (lambda(k) * U(i,shift2+k) - W(i,shift2+k) )      &
+              /max(H_jj(i) - lambda (k),1.d-2)
+        enddo
+
+        if (k <= N_st) then
+          residual_norm(k) = u_dot_u(U(1,shift2+k),sze)
+          to_print(1,k) = lambda(k) + nuclear_repulsion
+          to_print(2,k) = residual_norm(k)
+        endif
+      enddo
+      !$OMP END PARALLEL DO
+
+
+      if ((itertot>1).and.(iter == 1)) then
+        !don't print
+        continue
+      else
+        write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st)
+      endif
+
+      ! Check convergence
+      if (iter > 1) then
+        if (threshold_davidson_from_pt2) then
+          converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson_pt2
+        else
+          converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson
+        endif
+      endif
+
+      do k=1,N_st
+        if (residual_norm(k) > 1.e8) then
+          print *, 'Davidson failed'
+          stop -1
+        endif
+      enddo
+      if (converged) then
+        exit
+      endif
+
+      logical, external :: qp_stop
+      if (qp_stop()) then
+        converged = .True.
+        exit
+      endif
+
+
+    enddo
+
+    ! Re-contract U and update W
+    ! --------------------------------
+
+    call dgemm('N','N', sze, N_st_diag, shift2, 1.d0,      &
+        W, size(W,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+    do k=1,N_st_diag
+      do i=1,sze
+        W(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    call dgemm('N','N', sze, N_st_diag, shift2, 1.d0,      &
+        U, size(U,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+
+    do k=1,N_st_diag
+      do i=1,sze
+        U(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    if (disk_based) then
+      call ortho_qr_unblocked(U,size(U,1),sze,N_st_diag)
+      call ortho_qr_unblocked(U,size(U,1),sze,N_st_diag)
+    else
+      call ortho_qr(U,size(U,1),sze,N_st_diag)
+      call ortho_qr(U,size(U,1),sze,N_st_diag)
+    endif
+
+    ! Adjust the phase
+    do j=1,N_st_diag
+      ! Find first non-zero
+      k=1
+      do while ((k<sze).and.(U(k,j) == 0.d0))
+        k = k+1
+      enddo
+      ! Check sign
+      if (U(k,j) * u_in(k,j) < 0.d0) then
+        do i=1,sze
+          W(i,j) = -W(i,j)
+        enddo
+      endif
+    enddo
+
+  enddo
+
+
+  call nullify_small_elements(sze,N_st_diag,U,size(U,1),threshold_davidson_pt2)
+  do k=1,N_st_diag
+    do i=1,sze
+      u_in(i,k) = U(i,k)
+    enddo
+  enddo
+
+  do k=1,N_st_diag
+    energies(k) = lambda(k)
+  enddo
+  write_buffer = '======'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') trim(write_buffer)
+  write(6,'(A)') ''
+  call write_time(6)
+
+  if (disk_based)then
+    ! Remove temp files
+    integer, external :: getUnitAndOpen
+    call munmap( (/int(sze,8),int(N_st_diag*itermax,8)/), 8, fd_w, ptr_w )
+    fd_w = getUnitAndOpen(trim(ezfio_work_dir)//'davidson_w','r')
+    close(fd_w,status='delete')
+  else
+    deallocate(W)
+  endif
+
+  deallocate (                                                       &
+      residual_norm,                                                 &
+      U, overlap,                                                    &
+      h, y, s_tmp,                                                   &
+      lambda                                                         &
+      )
+  FREE nthreads_davidson
+end
+
+
+
+
+
+
+
--- a/src/davidson/diagonalize_ci.irp.f
+++ b/src/davidson/diagonalize_ci.irp.f
@ -57,9 +57,15 @@ END_PROVIDER

   if (diag_algorithm == "Davidson") then

-     call davidson_diag_HS2(psi_det,CI_eigenvectors, CI_s2, &
+     if (s2_eig.and.only_expected_s2) then
+       call davidson_diag_H(psi_det,CI_eigenvectors, &
         size(CI_eigenvectors,1),CI_electronic_energy,               &
         N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+     else
+       call davidson_diag_HS2(psi_det,CI_eigenvectors, CI_s2, &
+         size(CI_eigenvectors,1),CI_electronic_energy,               &
+         N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+     endif

     integer :: N_states_diag_save
     N_states_diag_save = N_states_diag
@ -71,25 +77,48 @@ END_PROVIDER
        N_states_diag  *= 2
        TOUCH N_states_diag

-        allocate (CI_electronic_energy_tmp (N_states_diag) )
-        allocate (CI_eigenvectors_tmp (N_det,N_states_diag) )
-        allocate (CI_s2_tmp (N_states_diag) )
+        if (s2_eig.and.only_expected_s2) then

-        CI_electronic_energy_tmp(1:N_states_diag_save) = CI_electronic_energy(1:N_states_diag_save)
-        CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save) = CI_eigenvectors(1:N_det,1:N_states_diag_save)
-        CI_s2_tmp(1:N_states_diag_save) = CI_s2(1:N_states_diag_save)
+          allocate (CI_electronic_energy_tmp (N_states_diag) )
+          allocate (CI_eigenvectors_tmp (N_det,N_states_diag) )

-        call davidson_diag_HS2(psi_det,CI_eigenvectors_tmp, CI_s2_tmp, &
+          CI_electronic_energy_tmp(1:N_states_diag_save) = CI_electronic_energy(1:N_states_diag_save)
+          CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save) = CI_eigenvectors(1:N_det,1:N_states_diag_save)
+
+          call davidson_diag_H(psi_det,CI_eigenvectors_tmp, &
            size(CI_eigenvectors_tmp,1),CI_electronic_energy_tmp,               &
            N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)

-        CI_electronic_energy(1:N_states_diag_save) = CI_electronic_energy_tmp(1:N_states_diag_save)
-        CI_eigenvectors(1:N_det,1:N_states_diag_save) = CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save)
-        CI_s2(1:N_states_diag_save) = CI_s2_tmp(1:N_states_diag_save)
+          CI_electronic_energy(1:N_states_diag_save) = CI_electronic_energy_tmp(1:N_states_diag_save)
+          CI_eigenvectors(1:N_det,1:N_states_diag_save) = CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save)
+
+          deallocate (CI_electronic_energy_tmp)
+          deallocate (CI_eigenvectors_tmp)
+
+        else
+
+          allocate (CI_electronic_energy_tmp (N_states_diag) )
+          allocate (CI_eigenvectors_tmp (N_det,N_states_diag) )
+          allocate (CI_s2_tmp (N_states_diag) )
+
+          CI_electronic_energy_tmp(1:N_states_diag_save) = CI_electronic_energy(1:N_states_diag_save)
+          CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save) = CI_eigenvectors(1:N_det,1:N_states_diag_save)
+          CI_s2_tmp(1:N_states_diag_save) = CI_s2(1:N_states_diag_save)
+
+          call davidson_diag_HS2(psi_det,CI_eigenvectors_tmp, CI_s2_tmp, &
+            size(CI_eigenvectors_tmp,1),CI_electronic_energy_tmp,               &
+            N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+
+          CI_electronic_energy(1:N_states_diag_save) = CI_electronic_energy_tmp(1:N_states_diag_save)
+          CI_eigenvectors(1:N_det,1:N_states_diag_save) = CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save)
+          CI_s2(1:N_states_diag_save) = CI_s2_tmp(1:N_states_diag_save)
+
+          deallocate (CI_electronic_energy_tmp)
+          deallocate (CI_eigenvectors_tmp)
+          deallocate (CI_s2_tmp)
+
+        endif

-        deallocate (CI_electronic_energy_tmp)
-        deallocate (CI_eigenvectors_tmp)
-        deallocate (CI_s2_tmp)
     enddo
     if (N_states_diag > N_states_diag_save) then
       N_states_diag = N_states_diag_save
--- a/src/davidson/u0_h_u0.irp.f
+++ b/src/davidson/u0_h_u0.irp.f
@ -1,72 +1,43 @@
- BEGIN_PROVIDER [ double precision, psi_energy, (N_states) ]
-&BEGIN_PROVIDER [ double precision, psi_s2, (N_states) ]
-  implicit none
-  BEGIN_DOC
-! psi_energy(i) = $\langle \Psi_i | H | \Psi_i \rangle$
-!
-! psi_s2(i) = $\langle \Psi_i | S^2 | \Psi_i \rangle$
-  END_DOC
-  call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size)
-  integer :: i
-  do i=N_det+1,N_states
-    psi_energy(i) = 0.d0
-    psi_s2(i) = 0.d0
-  enddo
-END_PROVIDER
-
-BEGIN_PROVIDER [ double precision, psi_energy_with_nucl_rep, (N_states) ]
- implicit none
- BEGIN_DOC
- ! Energy of the wave function with the nuclear repulsion energy.
- END_DOC
- psi_energy_with_nucl_rep(1:N_states) = psi_energy(1:N_states) + nuclear_repulsion
-END_PROVIDER
-
-
-subroutine u_0_H_u_0(e_0,s_0,u_0,n,keys_tmp,Nint,N_st,sze)
+subroutine u_0_H_u_0(e_0,u_0,n,keys_tmp,Nint,N_st,sze)
  use bitmasks
  implicit none
  BEGIN_DOC
  ! Computes $E_0 = \frac{\langle u_0 | H | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
  !
-  ! and      $S_0 = \frac{\langle u_0 | S^2 | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
-  !
  ! n : number of determinants
  !
  END_DOC
  integer, intent(in)             :: n,Nint, N_st, sze
-  double precision, intent(out)   :: e_0(N_st),s_0(N_st)
+  double precision, intent(out)   :: e_0(N_st)
  double precision, intent(inout) :: u_0(sze,N_st)
  integer(bit_kind),intent(in)    :: keys_tmp(Nint,2,n)

-  double precision, allocatable   :: v_0(:,:), s_vec(:,:), u_1(:,:)
+  double precision, allocatable   :: v_0(:,:), u_1(:,:)
  double precision                :: u_dot_u,u_dot_v,diag_H_mat_elem
  integer                         :: i,j, istate

  if ((n > 100000).and.distributed_davidson) then
-    allocate (v_0(n,N_states_diag),s_vec(n,N_states_diag), u_1(n,N_states_diag))
+    allocate (v_0(n,N_states_diag), u_1(n,N_states_diag))
    u_1(:,:) = 0.d0
    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
-    call H_S2_u_0_nstates_zmq(v_0,s_vec,u_1,N_states_diag,n)
+    call H_u_0_nstates_zmq(v_0,u_1,N_states_diag,n)
  else if (n < n_det_max_full) then
-    allocate (v_0(n,N_st),s_vec(n,N_st), u_1(n,N_st))
+    allocate (v_0(n,N_st), u_1(n,N_st))
    v_0(:,:) = 0.d0
    u_1(:,:) = 0.d0
-    s_vec(:,:) = 0.d0
    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
    do istate = 1,N_st
      do j=1,n
        do i=1,n
          v_0(i,istate) = v_0(i,istate) + h_matrix_all_dets(i,j) * u_0(j,istate)
-          s_vec(i,istate) = s_vec(i,istate) + S2_matrix_all_dets(i,j) * u_0(j,istate)
        enddo
      enddo
    enddo
  else
-    allocate (v_0(n,N_st),s_vec(n,N_st),u_1(n,N_st))
+    allocate (v_0(n,N_st),u_1(n,N_st))
    u_1(:,:) = 0.d0
    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
-    call H_S2_u_0_nstates_openmp(v_0,s_vec,u_1,N_st,n)
+    call H_u_0_nstates_openmp(v_0,u_1,N_st,n)
  endif
  u_0(1:n,1:N_st) = u_1(1:n,1:N_st)
  deallocate(u_1)
@ -76,42 +47,39 @@ subroutine u_0_H_u_0(e_0,s_0,u_0,n,keys_tmp,Nint,N_st,sze)
    norm = u_dot_u(u_0(1,i),n)
    if (norm /= 0.d0) then
      e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)
-      s_0(i) = u_dot_v(s_vec(1,i),u_0(1,i),n)
    else
      e_0(i) = 0.d0
-      s_0(i) = 0.d0
    endif
  enddo
  !$OMP END PARALLEL DO
-  deallocate (s_vec, v_0)
+  deallocate (v_0)
 end





-subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
+subroutine H_u_0_nstates_openmp(v_0,u_0,N_st,sze)
  use bitmasks
  implicit none
  BEGIN_DOC
-  ! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2  | u_0\rangle$.
+  ! Computes $v_0 = H | u_0\rangle$.
  !
  ! Assumes that the determinants are in psi_det
  !
  ! istart, iend, ishift, istep are used in ZMQ parallelization.
  END_DOC
  integer, intent(in)            :: N_st,sze
-  double precision, intent(inout)  :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st)
+  double precision, intent(inout)  :: v_0(sze,N_st), u_0(sze,N_st)
  integer :: k
-  double precision, allocatable  :: u_t(:,:), v_t(:,:), s_t(:,:)
+  double precision, allocatable  :: u_t(:,:), v_t(:,:)
  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
-  allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det))
+  allocate(u_t(N_st,N_det),v_t(N_st,N_det))

  do k=1,N_st
    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
  enddo
  v_t = 0.d0
-  s_t = 0.d0
  call dtranspose(                                                   &
      u_0,                                                           &
      size(u_0, 1),                                                  &
@ -119,7 +87,7 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
      size(u_t, 1),                                                  &
      N_det, N_st)

-  call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1)
+  call H_u_0_nstates_openmp_work(v_t,u_t,N_st,sze,1,N_det,0,1)
  deallocate(u_t)

  call dtranspose(                                                   &
@ -128,66 +96,59 @@ subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
      v_0,                                                           &
      size(v_0, 1),                                                  &
      N_st, N_det)
-  call dtranspose(                                                   &
-      s_t,                                                           &
-      size(s_t, 1),                                                  &
-      s_0,                                                           &
-      size(s_0, 1),                                                  &
-      N_st, N_det)
-  deallocate(v_t,s_t)
+  deallocate(v_t)

  do k=1,N_st
    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
-    call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
  enddo

 end


-subroutine H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+subroutine H_u_0_nstates_openmp_work(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
  use bitmasks
  implicit none
  BEGIN_DOC
-  ! Computes $v_t = H | u_t\rangle$ and $s_t = S^2  | u_t\rangle$
+  ! Computes $v_t = H | u_t\rangle$ 
  !
  ! Default should be 1,N_det,0,1
  END_DOC
  integer, intent(in)            :: N_st,sze,istart,iend,ishift,istep
  double precision, intent(in)   :: u_t(N_st,N_det)
-  double precision, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)
+  double precision, intent(out)  :: v_t(N_st,sze)


  PROVIDE ref_bitmask_energy N_int

  select case (N_int)
    case (1)
-      call H_S2_u_0_nstates_openmp_work_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+      call H_u_0_nstates_openmp_work_1(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
    case (2)
-      call H_S2_u_0_nstates_openmp_work_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+      call H_u_0_nstates_openmp_work_2(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
    case (3)
-      call H_S2_u_0_nstates_openmp_work_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+      call H_u_0_nstates_openmp_work_3(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
    case (4)
-      call H_S2_u_0_nstates_openmp_work_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+      call H_u_0_nstates_openmp_work_4(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
    case default
-      call H_S2_u_0_nstates_openmp_work_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+      call H_u_0_nstates_openmp_work_N_int(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
  end select
 end
 BEGIN_TEMPLATE

-subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+subroutine H_u_0_nstates_openmp_work_$N_int(v_t,u_t,N_st,sze,istart,iend,ishift,istep)
  use bitmasks
  implicit none
  BEGIN_DOC
-  ! Computes $v_t = H | u_t \\rangle$ and $s_t = S^2 | u_t\\rangle$
+  ! Computes $v_t = H | u_t \\rangle$ 
  !
  ! Default should be 1,N_det,0,1
  END_DOC
  integer, intent(in)            :: N_st,sze,istart,iend,ishift,istep
  double precision, intent(in)   :: u_t(N_st,N_det)
-  double precision, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)
+  double precision, intent(out)  :: v_t(N_st,sze)

-  double precision               :: hij, sij
+  double precision               :: hij
  integer                        :: i,j,k,l,kk
  integer                        :: k_a, k_b, l_a, l_b, m_a, m_b
  integer                        :: istate
@ -246,14 +207,14 @@ compute_singles=.True.
      !$OMP          psi_bilinear_matrix_order_transp_reverse,       &
      !$OMP          psi_bilinear_matrix_columns_loc,                &
      !$OMP          psi_bilinear_matrix_transp_rows_loc,            &
-      !$OMP          istart, iend, istep, irp_here, v_t, s_t,        &
+      !$OMP          istart, iend, istep, irp_here, v_t,             &
      !$OMP          ishift, idx0, u_t, maxab, compute_singles,      &
      !$OMP          singles_alpha_csc,singles_alpha_csc_idx,        &
      !$OMP          singles_beta_csc,singles_beta_csc_idx)          &
      !$OMP   PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i,     &
      !$OMP          lcol, lrow, l_a, l_b, utl, kk, u_is_sparse,     &
      !$OMP          buffer, doubles, n_doubles, umax,               &
-      !$OMP          tmp_det2, hij, sij, idx, l, kcol_prev,          &
+      !$OMP          tmp_det2, hij, idx, l, kcol_prev,          &
      !$OMP          singles_a, n_singles_a, singles_b, ratio,       &
      !$OMP          n_singles_b, k8, last_found,left,right,right_max)

@ -453,11 +414,9 @@ compute_singles=.True.

          tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
          call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij)
-          call get_s2(tmp_det,tmp_det2,$N_int,sij)
          !DIR$ LOOP COUNT AVG(4)
          do l=1,N_st
            v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
-            s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1)
          enddo
        enddo
      enddo
@ -559,7 +518,6 @@ compute_singles=.True.
        !DIR$ LOOP COUNT AVG(4)
        do l=1,N_st
          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
-          ! single => sij = 0
        enddo
      enddo
    enddo
@ -604,7 +562,6 @@ compute_singles=.True.
        !DIR$ LOOP COUNT AVG(4)
        do l=1,N_st
          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
-          ! same spin => sij = 0
        enddo
      enddo
    enddo
@ -697,7 +654,6 @@ compute_singles=.True.
        !DIR$ LOOP COUNT AVG(4)
        do l=1,N_st
          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
-          ! single => sij = 0
        enddo
      enddo
    enddo
@ -745,7 +701,6 @@ compute_singles=.True.
        !DIR$ LOOP COUNT AVG(4)
        do l=1,N_st
          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
-          ! same spin => sij = 0
        enddo
      enddo
    enddo
@ -777,14 +732,12 @@ compute_singles=.True.
    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)

-    double precision, external :: diag_H_mat_elem, diag_S_mat_elem
+    double precision, external :: diag_H_mat_elem

    hij = diag_H_mat_elem(tmp_det,$N_int)
-    sij = diag_S_mat_elem(tmp_det,$N_int)
    !DIR$ LOOP COUNT AVG(4)
    do l=1,N_st
      v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a)
-      s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a)
    enddo

  end do
--- a/src/davidson/u0_hs2_u0.irp.f
+++ b/src/davidson/u0_hs2_u0.irp.f
@ -0,0 +1,807 @@
+ BEGIN_PROVIDER [ double precision, psi_energy, (N_states) ]
+&BEGIN_PROVIDER [ double precision, psi_s2, (N_states) ]
+  implicit none
+  BEGIN_DOC
+! psi_energy(i) = $\langle \Psi_i | H | \Psi_i \rangle$
+!
+! psi_s2(i) = $\langle \Psi_i | S^2 | \Psi_i \rangle$
+  END_DOC
+  call u_0_HS2_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size)
+  integer :: i
+  do i=N_det+1,N_states
+    psi_energy(i) = 0.d0
+    psi_s2(i) = 0.d0
+  enddo
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, psi_energy_with_nucl_rep, (N_states) ]
+ implicit none
+ BEGIN_DOC
+ ! Energy of the wave function with the nuclear repulsion energy.
+ END_DOC
+ psi_energy_with_nucl_rep(1:N_states) = psi_energy(1:N_states) + nuclear_repulsion
+END_PROVIDER
+
+
+subroutine u_0_HS2_u_0(e_0,s_0,u_0,n,keys_tmp,Nint,N_st,sze)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $E_0 = \frac{\langle u_0 | H | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
+  !
+  ! and      $S_0 = \frac{\langle u_0 | S^2 | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
+  !
+  ! n : number of determinants
+  !
+  END_DOC
+  integer, intent(in)             :: n,Nint, N_st, sze
+  double precision, intent(out)   :: e_0(N_st),s_0(N_st)
+  double precision, intent(inout) :: u_0(sze,N_st)
+  integer(bit_kind),intent(in)    :: keys_tmp(Nint,2,n)
+
+  double precision, allocatable   :: v_0(:,:), s_vec(:,:), u_1(:,:)
+  double precision                :: u_dot_u,u_dot_v,diag_H_mat_elem
+  integer                         :: i,j, istate
+
+  if ((n > 100000).and.distributed_davidson) then
+    allocate (v_0(n,N_states_diag),s_vec(n,N_states_diag), u_1(n,N_states_diag))
+    u_1(:,:) = 0.d0
+    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
+    call H_S2_u_0_nstates_zmq(v_0,s_vec,u_1,N_states_diag,n)
+  else if (n < n_det_max_full) then
+    allocate (v_0(n,N_st),s_vec(n,N_st), u_1(n,N_st))
+    v_0(:,:) = 0.d0
+    u_1(:,:) = 0.d0
+    s_vec(:,:) = 0.d0
+    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
+    do istate = 1,N_st
+      do j=1,n
+        do i=1,n
+          v_0(i,istate) = v_0(i,istate) + h_matrix_all_dets(i,j) * u_0(j,istate)
+          s_vec(i,istate) = s_vec(i,istate) + S2_matrix_all_dets(i,j) * u_0(j,istate)
+        enddo
+      enddo
+    enddo
+  else
+    allocate (v_0(n,N_st),s_vec(n,N_st),u_1(n,N_st))
+    u_1(:,:) = 0.d0
+    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
+    call H_S2_u_0_nstates_openmp(v_0,s_vec,u_1,N_st,n)
+  endif
+  u_0(1:n,1:N_st) = u_1(1:n,1:N_st)
+  deallocate(u_1)
+  double precision :: norm
+  !$OMP PARALLEL DO PRIVATE(i,norm) DEFAULT(SHARED)
+  do i=1,N_st
+    norm = u_dot_u(u_0(1,i),n)
+    if (norm /= 0.d0) then
+      e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)
+      s_0(i) = u_dot_v(s_vec(1,i),u_0(1,i),n)
+    else
+      e_0(i) = 0.d0
+      s_0(i) = 0.d0
+    endif
+  enddo
+  !$OMP END PARALLEL DO
+  deallocate (s_vec, v_0)
+end
+
+
+
+
+
+subroutine H_S2_u_0_nstates_openmp(v_0,s_0,u_0,N_st,sze)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2  | u_0\rangle$.
+  !
+  ! Assumes that the determinants are in psi_det
+  !
+  ! istart, iend, ishift, istep are used in ZMQ parallelization.
+  END_DOC
+  integer, intent(in)            :: N_st,sze
+  double precision, intent(inout)  :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st)
+  integer :: k
+  double precision, allocatable  :: u_t(:,:), v_t(:,:), s_t(:,:)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
+  allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det))
+
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+  v_t = 0.d0
+  s_t = 0.d0
+  call dtranspose(                                                   &
+      u_0,                                                           &
+      size(u_0, 1),                                                  &
+      u_t,                                                           &
+      size(u_t, 1),                                                  &
+      N_det, N_st)
+
+  call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1)
+  deallocate(u_t)
+
+  call dtranspose(                                                   &
+      v_t,                                                           &
+      size(v_t, 1),                                                  &
+      v_0,                                                           &
+      size(v_0, 1),                                                  &
+      N_st, N_det)
+  call dtranspose(                                                   &
+      s_t,                                                           &
+      size(s_t, 1),                                                  &
+      s_0,                                                           &
+      size(s_0, 1),                                                  &
+      N_st, N_det)
+  deallocate(v_t,s_t)
+
+  do k=1,N_st
+    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+  enddo
+
+end
+
+
+subroutine H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_t = H | u_t\rangle$ and $s_t = S^2  | u_t\rangle$
+  !
+  ! Default should be 1,N_det,0,1
+  END_DOC
+  integer, intent(in)            :: N_st,sze,istart,iend,ishift,istep
+  double precision, intent(in)   :: u_t(N_st,N_det)
+  double precision, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)
+
+
+  PROVIDE ref_bitmask_energy N_int
+
+  select case (N_int)
+    case (1)
+      call H_S2_u_0_nstates_openmp_work_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case (2)
+      call H_S2_u_0_nstates_openmp_work_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case (3)
+      call H_S2_u_0_nstates_openmp_work_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case (4)
+      call H_S2_u_0_nstates_openmp_work_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case default
+      call H_S2_u_0_nstates_openmp_work_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+  end select
+end
+BEGIN_TEMPLATE
+
+subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_t = H | u_t \\rangle$ and $s_t = S^2 | u_t\\rangle$
+  !
+  ! Default should be 1,N_det,0,1
+  END_DOC
+  integer, intent(in)            :: N_st,sze,istart,iend,ishift,istep
+  double precision, intent(in)   :: u_t(N_st,N_det)
+  double precision, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)
+
+  double precision               :: hij, sij
+  integer                        :: i,j,k,l,kk
+  integer                        :: k_a, k_b, l_a, l_b, m_a, m_b
+  integer                        :: istate
+  integer                        :: krow, kcol, krow_b, kcol_b
+  integer                        :: lrow, lcol
+  integer                        :: mrow, mcol
+  integer(bit_kind)              :: spindet($N_int)
+  integer(bit_kind)              :: tmp_det($N_int,2)
+  integer(bit_kind)              :: tmp_det2($N_int,2)
+  integer(bit_kind)              :: tmp_det3($N_int,2)
+  integer(bit_kind), allocatable :: buffer(:,:)
+  integer                        :: n_doubles
+  integer, allocatable           :: doubles(:)
+  integer, allocatable           :: singles_a(:)
+  integer, allocatable           :: singles_b(:)
+  integer, allocatable           :: idx(:), idx0(:)
+  integer                        :: maxab, n_singles_a, n_singles_b, kcol_prev
+  integer*8                      :: k8
+  logical                        :: compute_singles
+  integer*8                      :: last_found, left, right, right_max
+  double precision               :: rss, mem, ratio
+  double precision, allocatable  :: utl(:,:)
+  integer, parameter             :: block_size=128
+  logical                        :: u_is_sparse
+
+!  call resident_memory(rss)
+!  mem = dble(singles_beta_csc_size) / 1024.d0**3
+!
+!  compute_singles = (mem+rss > qp_max_mem)
+!
+!  if (.not.compute_singles) then
+!    provide singles_beta_csc
+!  endif
+compute_singles=.True.
+
+
+  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
+  allocate(idx0(maxab))
+
+  do i=1,maxab
+    idx0(i) = i
+  enddo
+
+  ! Prepare the array of all alpha single excitations
+  ! -------------------------------------------------
+
+  PROVIDE N_int nthreads_davidson
+  !$OMP PARALLEL DEFAULT(SHARED) NUM_THREADS(nthreads_davidson)        &
+      !$OMP   SHARED(psi_bilinear_matrix_rows, N_det,                &
+      !$OMP          psi_bilinear_matrix_columns,                    &
+      !$OMP          psi_det_alpha_unique, psi_det_beta_unique,      &
+      !$OMP          n_det_alpha_unique, n_det_beta_unique, N_int,   &
+      !$OMP          psi_bilinear_matrix_transp_rows,                &
+      !$OMP          psi_bilinear_matrix_transp_columns,             &
+      !$OMP          psi_bilinear_matrix_transp_order, N_st,         &
+      !$OMP          psi_bilinear_matrix_order_transp_reverse,       &
+      !$OMP          psi_bilinear_matrix_columns_loc,                &
+      !$OMP          psi_bilinear_matrix_transp_rows_loc,            &
+      !$OMP          istart, iend, istep, irp_here, v_t, s_t,        &
+      !$OMP          ishift, idx0, u_t, maxab, compute_singles,      &
+      !$OMP          singles_alpha_csc,singles_alpha_csc_idx,        &
+      !$OMP          singles_beta_csc,singles_beta_csc_idx)          &
+      !$OMP   PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i,     &
+      !$OMP          lcol, lrow, l_a, l_b, utl, kk, u_is_sparse,     &
+      !$OMP          buffer, doubles, n_doubles, umax,               &
+      !$OMP          tmp_det2, hij, sij, idx, l, kcol_prev,          &
+      !$OMP          singles_a, n_singles_a, singles_b, ratio,       &
+      !$OMP          n_singles_b, k8, last_found,left,right,right_max)
+
+  ! Alpha/Beta double excitations
+  ! =============================
+
+  allocate( buffer($N_int,maxab),                                     &
+      singles_a(maxab),                                              &
+      singles_b(maxab),                                              &
+      doubles(maxab),                                                &
+      idx(maxab), utl(N_st,block_size))
+
+  kcol_prev=-1
+
+  ! Check if u has multiple zeros
+  kk=1 ! Avoid division by zero
+  !$OMP DO
+  do k=1,N_det
+    umax = 0.d0
+    do l=1,N_st
+      umax = max(umax, dabs(u_t(l,k)))
+    enddo
+    if (umax < 1.d-20) then
+      !$OMP ATOMIC
+      kk = kk+1
+    endif
+  enddo
+  !$OMP END DO
+  u_is_sparse = N_det / kk < 20  ! 5%
+
+  ASSERT (iend <= N_det)
+  ASSERT (istart > 0)
+  ASSERT (istep  > 0)
+
+  !$OMP DO SCHEDULE(guided,64)
+  do k_a=istart+ishift,iend,istep
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    ASSERT (krow <= N_det_alpha_unique)
+
+    kcol = psi_bilinear_matrix_columns(k_a)
+    ASSERT (kcol <= N_det_beta_unique)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+
+    if (kcol /= kcol_prev) then
+      tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+      if (compute_singles) then
+        call get_all_spin_singles_$N_int(                              &
+            psi_det_beta_unique, idx0,                                 &
+            tmp_det(1,2), N_det_beta_unique,                           &
+            singles_b, n_singles_b)
+      else
+        n_singles_b = 0
+        !DIR$ LOOP COUNT avg(1000)
+        do k8=singles_beta_csc_idx(kcol),singles_beta_csc_idx(kcol+1)-1
+          n_singles_b = n_singles_b+1
+          singles_b(n_singles_b) = singles_beta_csc(k8)
+        enddo
+      endif
+    endif
+    kcol_prev = kcol
+
+    ! Loop over singly excited beta columns
+    ! -------------------------------------
+
+    !DIR$ LOOP COUNT avg(1000)
+    do i=1,n_singles_b
+      lcol = singles_b(i)
+
+      tmp_det2(1:$N_int,2) = psi_det_beta_unique(1:$N_int, lcol)
+
+!---
+!      if (compute_singles) then
+
+        l_a = psi_bilinear_matrix_columns_loc(lcol)
+        ASSERT (l_a <= N_det)
+
+        !DIR$ UNROLL(8)
+        !DIR$ LOOP COUNT avg(50000)
+        do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
+          lrow = psi_bilinear_matrix_rows(l_a)
+          ASSERT (lrow <= N_det_alpha_unique)
+
+          buffer(1:$N_int,j) = psi_det_alpha_unique(1:$N_int, lrow)  ! hot spot
+
+          ASSERT (l_a <= N_det)
+          idx(j) = l_a
+          l_a = l_a+1
+        enddo
+        j = j-1
+
+        call get_all_spin_singles_$N_int(                              &
+            buffer, idx, tmp_det(1,1), j,                              &
+            singles_a, n_singles_a )
+
+!-----
+!      else
+!
+! ! Search for singles
+!
+!call cpu_time(time0)
+! ! Right boundary
+!        l_a = psi_bilinear_matrix_columns_loc(lcol+1)-1
+!        ASSERT (l_a <= N_det)
+!        do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
+!          lrow = psi_bilinear_matrix_rows(l_a)
+!          ASSERT (lrow <= N_det_alpha_unique)
+!
+!          left = singles_alpha_csc_idx(krow)
+!          right_max = -1_8
+!          right = singles_alpha_csc_idx(krow+1)
+!          do while (right-left>0_8)
+!            k8 = shiftr(right+left,1)
+!            if (singles_alpha_csc(k8) > lrow) then
+!              right = k8
+!            else if (singles_alpha_csc(k8) < lrow) then
+!              left = k8 + 1_8
+!            else
+!              right_max = k8+1_8
+!              exit
+!            endif
+!          enddo
+!          if (right_max > 0_8) exit
+!          l_a = l_a-1
+!        enddo
+!        if (right_max < 0_8) right_max = singles_alpha_csc_idx(krow)
+!
+! ! Search
+!        n_singles_a = 0
+!        l_a = psi_bilinear_matrix_columns_loc(lcol)
+!        ASSERT (l_a <= N_det)
+!
+!        last_found = singles_alpha_csc_idx(krow)
+!        do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
+!          lrow = psi_bilinear_matrix_rows(l_a)
+!          ASSERT (lrow <= N_det_alpha_unique)
+!
+!          left = last_found
+!          right = right_max
+!          do while (right-left>0_8)
+!            k8 = shiftr(right+left,1)
+!            if (singles_alpha_csc(k8) > lrow) then
+!              right = k8
+!            else if (singles_alpha_csc(k8) < lrow) then
+!              left = k8 + 1_8
+!            else
+!              n_singles_a += 1
+!              singles_a(n_singles_a) = l_a
+!              last_found = k8+1_8
+!              exit
+!            endif
+!          enddo
+!          l_a = l_a+1
+!        enddo
+!        j = j-1
+!
+!      endif
+!-----
+
+      ! Loop over alpha singles
+      ! -----------------------
+
+      double precision :: umax
+
+      !DIR$ LOOP COUNT avg(1000)
+      do k = 1,n_singles_a,block_size
+        umax = 0.d0
+        ! Prefetch u_t(:,l_a)
+        if (u_is_sparse) then
+          do kk=0,block_size-1
+            if (k+kk > n_singles_a) exit
+            l_a = singles_a(k+kk)
+            ASSERT (l_a <= N_det)
+
+            do l=1,N_st
+              utl(l,kk+1) = u_t(l,l_a)
+              umax = max(umax, dabs(utl(l,kk+1)))
+            enddo
+          enddo
+        else
+          do kk=0,block_size-1
+            if (k+kk > n_singles_a) exit
+            l_a = singles_a(k+kk)
+            ASSERT (l_a <= N_det)
+            utl(:,kk+1) = u_t(:,l_a)
+          enddo
+          umax = 1.d0
+        endif
+        if (umax < 1.d-20) cycle
+
+        do kk=0,block_size-1
+          if (k+kk > n_singles_a) exit
+          l_a = singles_a(k+kk)
+          lrow = psi_bilinear_matrix_rows(l_a)
+          ASSERT (lrow <= N_det_alpha_unique)
+
+          tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
+          call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij)
+          call get_s2(tmp_det,tmp_det2,$N_int,sij)
+          !DIR$ LOOP COUNT AVG(4)
+          do l=1,N_st
+            v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+            s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1)
+          enddo
+        enddo
+      enddo
+
+    enddo
+
+  enddo
+  !$OMP END DO
+
+  !$OMP DO SCHEDULE(guided,64)
+  do k_a=istart+ishift,iend,istep
+
+
+    ! Single and double alpha excitations
+    ! ===================================
+
+
+    ! Initial determinant is at k_a in alpha-major representation
+    ! -----------------------------------------------------------------------
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    ASSERT (krow <= N_det_alpha_unique)
+
+    kcol = psi_bilinear_matrix_columns(k_a)
+    ASSERT (kcol <= N_det_beta_unique)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+
+    ! Initial determinant is at k_b in beta-major representation
+    ! ----------------------------------------------------------------------
+
+    k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
+    ASSERT (k_b <= N_det)
+
+    spindet(1:$N_int) = tmp_det(1:$N_int,1)
+
+    ! Loop inside the beta column to gather all the connected alphas
+    lcol = psi_bilinear_matrix_columns(k_a)
+    l_a = psi_bilinear_matrix_columns_loc(lcol)
+
+    !DIR$ LOOP COUNT avg(200000)
+    do i=1,N_det_alpha_unique
+      if (l_a > N_det) exit
+      lcol = psi_bilinear_matrix_columns(l_a)
+      if (lcol /= kcol) exit
+      lrow = psi_bilinear_matrix_rows(l_a)
+      ASSERT (lrow <= N_det_alpha_unique)
+
+      buffer(1:$N_int,i) = psi_det_alpha_unique(1:$N_int, lrow) ! Hot spot
+      idx(i) = l_a
+      l_a = l_a+1
+    enddo
+    i = i-1
+
+    call get_all_spin_singles_and_doubles_$N_int(                    &
+        buffer, idx, spindet, i,                                     &
+        singles_a, doubles, n_singles_a, n_doubles )
+
+    ! Compute Hij for all alpha singles
+    ! ----------------------------------
+
+    tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+    !DIR$ LOOP COUNT avg(1000)
+    do i=1,n_singles_a,block_size
+      umax = 0.d0
+      ! Prefetch u_t(:,l_a)
+      if (u_is_sparse) then
+        do kk=0,block_size-1
+          if (i+kk > n_singles_a) exit
+          l_a = singles_a(i+kk)
+          ASSERT (l_a <= N_det)
+
+          do l=1,N_st
+            utl(l,kk+1) = u_t(l,l_a)
+            umax = max(umax, dabs(utl(l,kk+1)))
+          enddo
+        enddo
+      else
+        do kk=0,block_size-1
+          if (i+kk > n_singles_a) exit
+          l_a = singles_a(i+kk)
+          ASSERT (l_a <= N_det)
+          utl(:,kk+1) = u_t(:,l_a)
+        enddo
+        umax = 1.d0
+      endif
+      if (umax < 1.d-20) cycle
+
+      do kk=0,block_size-1
+        if (i+kk > n_singles_a) exit
+        l_a = singles_a(i+kk)
+        lrow = psi_bilinear_matrix_rows(l_a)
+        ASSERT (lrow <= N_det_alpha_unique)
+
+        tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
+        call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij)
+
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! single => sij = 0
+        enddo
+      enddo
+    enddo
+
+
+    ! Compute Hij for all alpha doubles
+    ! ----------------------------------
+
+    !DIR$ LOOP COUNT avg(50000)
+    do i=1,n_doubles,block_size
+      umax = 0.d0
+      ! Prefetch u_t(:,l_a)
+      if (u_is_sparse) then
+        do kk=0,block_size-1
+          if (i+kk > n_doubles) exit
+          l_a = doubles(i+kk)
+          ASSERT (l_a <= N_det)
+
+          do l=1,N_st
+            utl(l,kk+1) = u_t(l,l_a)
+            umax = max(umax, dabs(utl(l,kk+1)))
+          enddo
+        enddo
+      else
+        do kk=0,block_size-1
+          if (i+kk > n_doubles) exit
+          l_a = doubles(i+kk)
+          ASSERT (l_a <= N_det)
+          utl(:,kk+1) = u_t(:,l_a)
+        enddo
+        umax = 1.d0
+      endif
+      if (umax < 1.d-20) cycle
+
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_a = doubles(i+kk)
+        lrow = psi_bilinear_matrix_rows(l_a)
+        ASSERT (lrow <= N_det_alpha_unique)
+
+        call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij)
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! same spin => sij = 0
+        enddo
+      enddo
+    enddo
+
+
+    ! Single and double beta excitations
+    ! ==================================
+
+
+    ! Initial determinant is at k_a in alpha-major representation
+    ! -----------------------------------------------------------------------
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    kcol = psi_bilinear_matrix_columns(k_a)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+
+    spindet(1:$N_int) = tmp_det(1:$N_int,2)
+
+    ! Initial determinant is at k_b in beta-major representation
+    ! -----------------------------------------------------------------------
+
+    k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
+    ASSERT (k_b <= N_det)
+
+    ! Loop inside the alpha row to gather all the connected betas
+    lrow = psi_bilinear_matrix_transp_rows(k_b)
+    l_b = psi_bilinear_matrix_transp_rows_loc(lrow)
+    !DIR$ LOOP COUNT avg(200000)
+    do i=1,N_det_beta_unique
+      if (l_b > N_det) exit
+      lrow = psi_bilinear_matrix_transp_rows(l_b)
+      if (lrow /= krow) exit
+      lcol = psi_bilinear_matrix_transp_columns(l_b)
+      ASSERT (lcol <= N_det_beta_unique)
+
+      buffer(1:$N_int,i) = psi_det_beta_unique(1:$N_int, lcol)
+      idx(i) = l_b
+      l_b = l_b+1
+    enddo
+    i = i-1
+
+    call get_all_spin_singles_and_doubles_$N_int(                    &
+        buffer, idx, spindet, i,                                     &
+        singles_b, doubles, n_singles_b, n_doubles )
+
+    ! Compute Hij for all beta singles
+    ! ----------------------------------
+
+    tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    !DIR$ LOOP COUNT avg(1000)
+    do i=1,n_singles_b,block_size
+      umax = 0.d0
+      if (u_is_sparse) then
+        do kk=0,block_size-1
+          if (i+kk > n_singles_b) exit
+          l_b = singles_b(i+kk)
+          l_a = psi_bilinear_matrix_transp_order(l_b)
+          ASSERT (l_b <= N_det)
+          ASSERT (l_a <= N_det)
+
+          do l=1,N_st
+            utl(l,kk+1) = u_t(l,l_a)
+            umax = max(umax, dabs(utl(l,kk+1)))
+          enddo
+        enddo
+      else
+        do kk=0,block_size-1
+          if (i+kk > n_singles_b) exit
+          l_b = singles_b(i+kk)
+          l_a = psi_bilinear_matrix_transp_order(l_b)
+          ASSERT (l_b <= N_det)
+          ASSERT (l_a <= N_det)
+          utl(:,kk+1) = u_t(:,l_a)
+        enddo
+        umax = 1.d0
+      endif
+      if (umax < 1.d-20) cycle
+
+      do kk=0,block_size-1
+        if (i+kk > n_singles_b) exit
+        l_b = singles_b(i+kk)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        lcol = psi_bilinear_matrix_transp_columns(l_b)
+        ASSERT (lcol <= N_det_beta_unique)
+
+        tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol)
+        call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij)
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! single => sij = 0
+        enddo
+      enddo
+    enddo
+
+    ! Compute Hij for all beta doubles
+    ! ----------------------------------
+
+    !DIR$ LOOP COUNT avg(50000)
+    do i=1,n_doubles,block_size
+      umax = 0.d0
+      if (u_is_sparse) then
+        do kk=0,block_size-1
+          if (i+kk > n_doubles) exit
+          l_b = doubles(i+kk)
+          l_a = psi_bilinear_matrix_transp_order(l_b)
+          ASSERT (l_b <= N_det)
+          ASSERT (l_a <= N_det)
+          do l=1,N_st
+            utl(l,kk+1) = u_t(l,l_a)
+            umax = max(umax, dabs(utl(l,kk+1)))
+          enddo
+        enddo
+      else
+        do kk=0,block_size-1
+          if (i+kk > n_doubles) exit
+          l_b = doubles(i+kk)
+          l_a = psi_bilinear_matrix_transp_order(l_b)
+          ASSERT (l_b <= N_det)
+          ASSERT (l_a <= N_det)
+          utl(:,kk+1) = u_t(:,l_a)
+        enddo
+        umax = 1.d0
+      endif
+      if (umax < 1.d-20) cycle
+
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_b = doubles(i+kk)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        lcol = psi_bilinear_matrix_transp_columns(l_b)
+        ASSERT (lcol <= N_det_beta_unique)
+
+        call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij)
+
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! same spin => sij = 0
+        enddo
+      enddo
+    enddo
+
+
+    ! Diagonal contribution
+    ! =====================
+
+
+    ! Initial determinant is at k_a in alpha-major representation
+    ! -----------------------------------------------------------------------
+
+    if (u_is_sparse) then
+      umax = 0.d0
+      do l=1,N_st
+        umax = max(umax, dabs(u_t(l,k_a)))
+      enddo
+    else
+      umax = 1.d0
+    endif
+    if (umax < 1.d-20) cycle
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    ASSERT (krow <= N_det_alpha_unique)
+
+    kcol = psi_bilinear_matrix_columns(k_a)
+    ASSERT (kcol <= N_det_beta_unique)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+
+    double precision, external :: diag_H_mat_elem, diag_S_mat_elem
+
+    hij = diag_H_mat_elem(tmp_det,$N_int)
+    sij = diag_S_mat_elem(tmp_det,$N_int)
+    !DIR$ LOOP COUNT AVG(4)
+    do l=1,N_st
+      v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a)
+      s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a)
+    enddo
+
+  end do
+  !$OMP END DO
+  deallocate(buffer, singles_a, singles_b, doubles, idx, utl)
+  !$OMP END PARALLEL
+
+end
+
+SUBST [ N_int ]
+
+1;;
+2;;
+3;;
+4;;
+N_int;;
+
+END_TEMPLATE
+
+
--- a/src/determinants/connected_to_ref.irp.f
+++ b/src/determinants/connected_to_ref.irp.f
@ -12,36 +12,6 @@ integer*8 function det_search_key(det,Nint)
 end


-integer*8 function configuration_search_key(cfg,Nint)
-  use bitmasks
-  implicit none
-  BEGIN_DOC
-  ! Returns an integer*8 corresponding to a determinant index for searching.
-  ! The left-most 8 bits contain the number of open shells+1. This ensures that the CSF
-  ! are packed with the same seniority.
-  END_DOC
-  integer, intent(in) :: Nint
-  integer(bit_kind), intent(in) :: cfg(Nint,2)
-  integer :: i, n_open_shells
-  integer*8 :: mask
-
-  i = shiftr(elec_alpha_num, bit_kind_shift)+1
-  configuration_search_key = int(shiftr(ior(cfg(i,1),cfg(i,2)),1)+sum(cfg),8)
-
-  mask = X'00FFFFFFFFFFFFFF'
-  configuration_search_key = iand(mask,configuration_search_key)
-
-  n_open_shells = 1
-  do i=1,Nint
-    if (cfg(i,1) == 0_bit_kind) cycle
-    n_open_shells = n_open_shells + popcnt(cfg(i,1))
-  enddo
-  mask = n_open_shells
-  mask = shiftl(mask,56)
-  configuration_search_key = ior (mask,configuration_search_key)
-
-end
-


 logical function is_in_wavefunction(key,Nint)
--- a/src/determinants/create_excitations.irp.f
+++ b/src/determinants/create_excitations.irp.f
@ -107,283 +107,3 @@ logical function is_spin_flip_possible(key_in,i_flip,ispin)
    return
  endif
 end
-
-subroutine do_single_excitation_cfg(key_in,key_out,i_hole,i_particle,ok)
-  use bitmasks
-  implicit none
-  BEGIN_DOC
-  ! Applies the single excitation operator to a configuration
-  ! If the excitation is possible, ok is True
-  END_DOC
-  integer, intent(in)            :: i_hole,i_particle
-  integer(bit_kind), intent(in)  :: key_in(N_int,2)
-  logical , intent(out)          :: ok
-  integer                        :: k,j,i
-  integer(bit_kind)              :: mask
-  integer(bit_kind)              :: key_out(N_int,2)
-
-  ASSERT (i_hole > 0)
-  ASSERT (i_particle <= mo_num)
-
-  ok = .True.
-  key_out(:,:) = key_in(:,:)
-
-  ! hole
-  k = shiftr(i_hole-1,bit_kind_shift)+1
-  j = i_hole-shiftl(k-1,bit_kind_shift)-1
-  mask = ibset(0_bit_kind,j)
-
-  ! Check if the position j is singly occupied
-  ! 1  ->  0  (SOMO)
-  ! 0      0  (DOMO)
-  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
-     key_out(k,1) = ibclr(key_out(k,1),j)
-
-  ! Check if the position j is doubly occupied
-  ! 0  ->  1  (SOMO)
-  ! 1      0  (DOMO)
-  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
-     key_out(k,1) = ibset(key_out(k,1),j)
-     key_out(k,2) = ibclr(key_out(k,2),j)
-
-  ! The position j is unoccupied: Not OK
-  ! 0  ->  0  (SOMO)
-  ! 0      0  (DOMO)
-  else
-     ok =.False.
-     return
-  endif
-
-
-  ! particle
-  k = shiftr(i_particle-1,bit_kind_shift)+1
-  j = i_particle-shiftl(k-1,bit_kind_shift)-1
-  mask = ibset(0_bit_kind,j)
-
-  ! Check if the position j is singly occupied
-  ! 1  ->  0  (SOMO)
-  ! 0      1  (DOMO)
-  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
-     key_out(k,1) = ibclr(key_out(k,1),j)
-     key_out(k,2) = ibset(key_out(k,2),j)
-
-  ! Check if the position j is doubly occupied : Not OK
-  ! 0  ->  1  (SOMO)
-  ! 1      0  (DOMO)
-  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
-     ok = .False.
-     return
-
-  ! Position at j is unoccupied
-  ! 0  ->  0  (SOMO)
-  ! 0      0  (DOMO)
-  else
-     key_out(k,1) = ibset(key_out(k,1),j)
-  endif
-
-end
-
-subroutine do_single_excitation_cfg_with_type(key_in,key_out,i_hole,i_particle,ex_type,ok)
-  use bitmasks
-  implicit none
-  BEGIN_DOC
-  ! Applies the single excitation operator to a configuration
-  ! Returns the type of excitation in ex_type
-  ! where the following convention is used
-  ! 1 = (SOMO -> SOMO) 1 change in Nsomo
-  ! 2 = (DOMO ->  VMO) 1 change in Nsomo
-  ! 3 = (SOMO ->  VMO) 0 change in Nsomo
-  ! 4 = (DOMO -> SOMO) 0 change in Nsomo
-  ! If the excitation is possible, ok is True
-  END_DOC
-  integer, intent(in)            :: i_hole,i_particle
-  integer(bit_kind), intent(in)  :: key_in(N_int,2)
-  integer , intent(out)          :: ex_type
-  logical , intent(out)          :: ok
-  integer                        :: k,j,i
-  integer(bit_kind)              :: mask
-  integer(bit_kind)              :: key_out(N_int,2)
-  logical                        :: isholeSOMO
-  logical                        :: isparticleSOMO
-  logical                        :: isholeDOMO
-  logical                        :: isparticleVMO
-  isholeSOMO = .False.
-  isholeDOMO = .False.
-  isparticleSOMO = .False.
-  isparticleVMO = .False.
-
-  ASSERT (i_hole > 0)
-  ASSERT (i_particle <= mo_num)
-
-  ok = .True.
-  key_out(:,:) = key_in(:,:)
-
-  ! hole
-  k = shiftr(i_hole-1,bit_kind_shift)+1
-  j = i_hole-shiftl(k-1,bit_kind_shift)-1
-  mask = ibset(0_bit_kind,j)
-
-  ! Check if the position j is singly occupied
-  ! 1  ->  0  (SOMO)
-  ! 0      0  (DOMO)
-  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
-     key_out(k,1) = ibclr(key_out(k,1),j)
-     isholeSOMO = .True.
-
-  ! Check if the position j is doubly occupied
-  ! 0  ->  1  (SOMO)
-  ! 1      0  (DOMO)
-  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
-     key_out(k,1) = ibset(key_out(k,1),j)
-     key_out(k,2) = ibclr(key_out(k,2),j)
-     isholeDOMO = .True.
-
-  ! The position j is unoccupied: Not OK
-  ! 0  ->  0  (SOMO)
-  ! 0      0  (DOMO)
-  else
-     ok =.False.
-     return
-  endif
-
-
-  ! particle
-  k = shiftr(i_particle-1,bit_kind_shift)+1
-  j = i_particle-shiftl(k-1,bit_kind_shift)-1
-  mask = ibset(0_bit_kind,j)
-
-  ! Check if the position j is singly occupied
-  ! 1  ->  0  (SOMO)
-  ! 0      1  (DOMO)
-  if (iand(key_out(k,1),mask) /= 0_bit_kind) then
-     key_out(k,1) = ibclr(key_out(k,1),j)
-     key_out(k,2) = ibset(key_out(k,2),j)
-     isparticleSOMO = .True.
-
-  ! Check if the position j is doubly occupied : Not OK
-  ! 0  ->  1  (SOMO)
-  ! 1      0  (DOMO)
-  else if (iand(key_out(k,2),mask) /= 0_bit_kind) then
-     ok = .False.
-     return
-
-  ! Position at j is unoccupied
-  ! 0  ->  0  (SOMO)
-  ! 0      0  (DOMO)
-  else
-     key_out(k,1) = ibset(key_out(k,1),j)
-     isparticleVMO = .True.
-  endif
-
-  if(isholeSOMO) then
-      ! two possibilities
-      ! particle is SOMO or VMO
-      if(isparticleSOMO) then
-          ! SOMO -> SOMO
-          ex_type = 1
-      else
-          ! SOMO -> VMO
-          ex_type = 3
-      endif 
-   else
-       ! two possibilities
-       ! particle is SOMO or VMO
-       if(isparticleSOMO) then
-           ! DOMO -> SOMO
-           ex_type = 4
-       else
-           ! DOMO -> VMO
-           ex_type = 2
-       endif
-    endif
-
-end
-
-subroutine generate_all_singles_cfg(cfg,singles,n_singles,Nint)
-  implicit none
-  use bitmasks
-  BEGIN_DOC
-  ! Generate all single excitation wrt a configuration
-  !
-  ! n_singles : on input, max number of singles :
-  !   elec_alpha_num * (mo_num - elec_beta_num)
-  !   on output, number of generated singles
-  END_DOC
-  integer, intent(in)            :: Nint
-  integer, intent(inout)         :: n_singles
-  integer(bit_kind), intent(in)  :: cfg(Nint,2)
-  integer(bit_kind), intent(out) :: singles(Nint,2,*)
-
-  integer           :: i,k, n_singles_ma, i_hole, i_particle
-  integer(bit_kind) :: single(Nint,2)
-  logical           :: i_ok
-
-  n_singles = 0
-  !TODO
-  !Make list of Somo  and Domo for holes
-  !Make list of Unocc and Somo for particles
-  do i_hole = 1, mo_num
-    do i_particle = 1, mo_num
-      call do_single_excitation_cfg(cfg,single,i_hole,i_particle,i_ok)
-      if (i_ok) then
-        n_singles = n_singles + 1
-        do k=1,Nint
-          singles(k,1,n_singles) = single(k,1)
-          singles(k,2,n_singles) = single(k,2)
-        enddo
-      endif
-    enddo
-  enddo
-end
-
-subroutine generate_all_singles_cfg_with_type(cfgInp,singles,idxs_singles,pq_singles,ex_type_singles,n_singles,Nint)
-  implicit none
-  use bitmasks
-  BEGIN_DOC
-  ! Generate all single excitation wrt a configuration
-  !
-  ! n_singles : on input, max number of singles :
-  !   elec_alpha_num * (mo_num - elec_beta_num)
-  !   on output, number of generated singles
-  ! ex_type_singles : on output contains type of excitations  :
-  !
-  END_DOC
-  integer, intent(in)            :: Nint
-  integer, intent(inout)         :: n_singles
-  integer, intent(out)           :: idxs_singles(*)
-  integer, intent(out)           :: ex_type_singles(*)
-  integer, intent(out)           :: pq_singles(2,*)
-  integer(bit_kind), intent(in)  :: cfgInp(Nint,2)
-  integer(bit_kind), intent(out) :: singles(Nint,2,*)
-  integer(bit_kind)              :: Jdet(Nint,2)
-
-  integer           :: i,k, n_singles_ma, i_hole, i_particle, ex_type, addcfg
-  integer(bit_kind) :: single(Nint,2)
-  logical           :: i_ok
-
-  n_singles = 0
-  !TODO
-  !Make list of Somo  and Domo for holes
-  !Make list of Unocc and Somo for particles
-  do i_hole = 1+n_core_orb, n_core_orb + n_act_orb
-    do i_particle = 1+n_core_orb, n_core_orb + n_act_orb
-      if(i_hole .EQ. i_particle) cycle
-      addcfg = -1
-      call do_single_excitation_cfg_with_type(cfgInp,single,i_hole,i_particle,ex_type,i_ok)
-      if (i_ok) then
-        call binary_search_cfg(single,addcfg)
-        if(addcfg .EQ. -1) cycle
-        n_singles = n_singles + 1
-        do k=1,Nint
-          singles(k,1,n_singles) = single(k,1)
-          singles(k,2,n_singles) = single(k,2)
-          ex_type_singles(n_singles) = ex_type
-          pq_singles(1,n_singles) = i_hole     ! p
-          pq_singles(2,n_singles) = i_particle ! q
-          idxs_singles(n_singles) = addcfg
-        enddo
-      endif
-    enddo
-  enddo
-end
-
--- a/src/determinants/prune_wf.irp.f
+++ b/src/determinants/prune_wf.irp.f
@ -13,16 +13,16 @@ BEGIN_PROVIDER [ logical, pruned, (N_det) ]
 integer :: i,j,k,ndet_new,ncfg_max
 double precision :: thr

- if (s2_eig) then
-
-  ncfg_max = max(1,int ( dble(N_configuration) * (1.d0 - pruning) + 0.5d0 ))
-
-  do i=1,N_det
-    k = det_to_configuration(i)
-    pruned(i) = psi_configuration_sorted_order_reverse(k) > ncfg_max
-  enddo
-
- else
+! if (s2_eig) then
+!
+!  ncfg_max = max(1,int ( dble(N_configuration) * (1.d0 - pruning) + 0.5d0 ))
+!
+!  do i=1,N_det
+!    k = det_to_configuration(i)
+!    pruned(i) = psi_configuration_sorted_order_reverse(k) > ncfg_max
+!  enddo
+!
+! else

  ndet_new = max(1,int( dble(N_det) * (1.d0 - pruning) + 0.5d0 ))
  thr = psi_average_norm_contrib_sorted(ndet_new)
@ -30,6 +30,6 @@ BEGIN_PROVIDER [ logical, pruned, (N_det) ]
    pruned(i) = psi_average_norm_contrib(i) < thr
  enddo

- endif
+! endif

 END_PROVIDER