diff --git a/src/cipsi/cipsi.irp.f b/src/cipsi/cipsi.irp.f
index ba922c49..66881b28 100644
--- a/src/cipsi/cipsi.irp.f
+++ b/src/cipsi/cipsi.irp.f
@@ -45,13 +45,19 @@ subroutine run_cipsi
 
   if (N_det > N_det_max) then
     psi_det = psi_det_sorted
-    psi_coef = psi_coef_sorted
-    N_det = N_det_max
-    soft_touch N_det psi_det psi_coef
+    if (is_complex) then
+      psi_coef_complex = psi_coef_sorted_complex
+      N_det = N_det_max
+      soft_touch N_det psi_det psi_coef_complex
+    else
+      psi_coef = psi_coef_sorted
+      N_det = N_det_max
+      soft_touch N_det psi_det psi_coef
+    endif
     if (s2_eig) then
       call make_s2_eigenfunction
     endif
-    call diagonalize_CI
+    call diagonalize_ci
     call save_wavefunction
   endif
 
@@ -109,8 +115,11 @@ subroutine run_cipsi
     to_select = int(sqrt(dble(N_states))*dble(N_det)*selection_factor)
     to_select = max(N_states_diag, to_select)
     call ZMQ_selection(to_select, pt2, variance, norm)
-
-    PROVIDE  psi_coef
+    if (is_complex) then
+      PROVIDE  psi_coef_complex
+    else
+      PROVIDE  psi_coef
+    endif
     PROVIDE  psi_det
     PROVIDE  psi_det_sorted
 
diff --git a/src/davidson/davidson_parallel.irp.f b/src/davidson/davidson_parallel.irp.f
index c0d94b35..b98ec377 100644
--- a/src/davidson/davidson_parallel.irp.f
+++ b/src/davidson/davidson_parallel.irp.f
@@ -89,21 +89,93 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
   character*(512)                :: msg
   integer                        :: imin, imax, ishift, istep
 
-  integer, allocatable           :: psi_det_read(:,:,:)
-  double precision, allocatable  :: v_t(:,:), s_t(:,:), u_t(:,:)
-
-  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t
-
-  ! Get wave function (u_t)
-  ! -----------------------
-
   integer                        :: rc, ni, nj
   integer*8                      :: rc8
   integer                        :: N_states_read, N_det_read, psi_det_size_read
   integer                        :: N_det_selectors_read, N_det_generators_read
 
-  integer, external :: zmq_get_dvector
+  integer, allocatable           :: psi_det_read(:,:,:)
+  logical :: sending
+  integer, external :: get_task_from_taskserver
+  integer, external :: task_done_to_taskserver
+  integer :: k
+  integer :: ierr
+ 
+
+!  integer, external :: zmq_get_dvector
   integer, external :: zmq_get_dmatrix
+ 
+  if (is_complex) then
+    complex*16, allocatable  :: v_tc(:,:), s_tc(:,:), u_tc(:,:)
+
+    !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_tc, v_tc, s_tc
+
+
+    ! Get wave function (u_tc)
+    ! -----------------------
+
+    PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+    PROVIDE psi_bilinear_matrix_transp_values_complex psi_bilinear_matrix_values_complex psi_bilinear_matrix_columns_loc
+    PROVIDE ref_bitmask_energy nproc
+    PROVIDE mpi_initialized
+
+    allocate(u_tc(N_st,N_det))
+    
+    !todo: resize for complex?
+    ! Warning : dimensions are modified for efficiency, It is OK since we get the
+    ! full matrix
+    if (size(u_tc,kind=8) < 8388608_8) then
+      ni = size(u_tc)
+      nj = 1
+    else
+      ni = 8388608
+      nj = int(size(u_tc,kind=8)/8388608_8,4) + 1
+    endif
+
+    do while (zmq_get_cdmatrix(zmq_to_qp_run_socket, worker_id, 'u_tc', u_tc, ni, nj, size(u_tc,kind=8)) == -1)
+      print *,  'mpi_rank, N_states_diag, N_det'
+      print *,  mpi_rank, N_states_diag, N_det
+      stop 'u_t'
+    enddo
+
+    IRP_IF MPI
+      include 'mpif.h'
+      call broadcast_chunks_complex_double(u_tc,size(u_tc,kind=8))
+    IRP_ENDIF
+
+    ! Run tasks
+    ! ---------
+
+    sending=.False.
+
+    allocate(v_tc(N_st,N_det), s_tc(N_st,N_det))
+    do
+      if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
+        exit
+      endif
+      if(task_id == 0) exit
+      read (msg,*) imin, imax, ishift, istep
+      do k=imin,imax
+        v_tc(:,k) = (0.d0,0.d0)
+        s_tc(:,k) = (0.d0,0.d0)
+      enddo
+      call h_s2_u_0_nstates_openmp_work_complex(v_tc,s_tc,u_tc,N_st,N_det,imin,imax,ishift,istep)
+      if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
+          print *,  irp_here, 'Unable to send task_done'
+      endif
+      call davidson_push_results_async_recv(zmq_socket_push, sending)
+      call davidson_push_results_async_send_complex(zmq_socket_push, v_tc, s_tc, imin, imax, task_id, sending)
+    end do
+    deallocate(u_tc,v_tc, s_tc)
+    call davidson_push_results_async_recv(zmq_socket_push, sending)
+  else
+  double precision, allocatable  :: v_t(:,:), s_t(:,:), u_t(:,:)
+
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t, s_t
+
+
+  ! Get wave function (u_t)
+  ! -----------------------
 
   PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
   PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
@@ -130,28 +202,21 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
 
   IRP_IF MPI
     include 'mpif.h'
-    integer :: ierr
-
     call broadcast_chunks_double(u_t,size(u_t,kind=8))
-
   IRP_ENDIF
 
   ! Run tasks
   ! ---------
 
-  logical :: sending
   sending=.False.
 
   allocate(v_t(N_st,N_det), s_t(N_st,N_det))
   do
-    integer, external :: get_task_from_taskserver
-    integer, external :: task_done_to_taskserver
     if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
       exit
     endif
     if(task_id == 0) exit
     read (msg,*) imin, imax, ishift, istep
-    integer :: k
     do k=imin,imax
       v_t(:,k) = 0.d0
       s_t(:,k) = 0.d0
@@ -165,7 +230,7 @@ subroutine davidson_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze,
   end do
   deallocate(u_t,v_t, s_t)
   call davidson_push_results_async_recv(zmq_socket_push, sending)
-
+  endif
 end subroutine
 
 
@@ -533,6 +598,7 @@ subroutine H_S2_u_0_nstates_zmq(v_0,s_0,u_0,N_st,sze)
 end
 
 
+
 BEGIN_PROVIDER [ integer, nthreads_davidson ]
  implicit none
  BEGIN_DOC
@@ -643,3 +709,365 @@ integer function zmq_get_N_states_diag(zmq_to_qp_run_socket, worker_id)
   IRP_ENDIF
 end
 
+
+!==============================================================================!
+!                                                                              !
+!                                    Complex                                   !
+!                                                                              !
+!==============================================================================!
+
+subroutine davidson_push_results_complex(zmq_socket_push, v_t, s_t, imin, imax, task_id)
+  !todo: implement for complex; check double sz
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  complex*16          ,intent(in)    :: v_t(N_states_diag,N_det)
+  complex*16          ,intent(in)    :: s_t(N_states_diag,N_det)
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_push_results failed to push imax'
+
+  !todo: double sz for complex?
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push vt'
+
+  !todo: double sz for complex?
+  rc8 = f77_zmq_send8( zmq_socket_push, s_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push st'
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if ((rc /= 2).and.(ok(1:2)/='ok')) then
+    print *, irp_here, ': f77_zmq_recv( zmq_socket_push, ok, 2, 0)'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+subroutine davidson_push_results_async_send_complex(zmq_socket_push, v_t, s_t, imin, imax, task_id,sending)
+  !todo: implement for complex; check double sz
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  complex*16          ,intent(in)    :: v_t(N_states_diag,N_det)
+  complex*16          ,intent(in)    :: s_t(N_states_diag,N_det)
+  logical             ,intent(inout) :: sending
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  if (sending) then
+    print *,  irp_here, ': sending=true'
+    stop -1
+  endif
+  sending = .True.
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_push_results failed to push imax'
+
+  !todo: double sz for complex?
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push vt'
+
+  !todo: double sz for complex?
+  rc8 = f77_zmq_send8( zmq_socket_push, s_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8_8*sz) stop 'davidson_push_results failed to push st'
+
+end subroutine
+
+
+subroutine davidson_pull_results_complex(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
+  !todo: implement for complex; check double sz
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Pull the results of $H | U \rangle$ on the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)     :: zmq_socket_pull
+  integer             ,intent(out)    :: task_id, imin, imax
+  complex*16          ,intent(out)    :: v_t(N_states_diag,N_det)
+  complex*16          ,intent(out)    :: s_t(N_states_diag,N_det)
+
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
+  if(rc /= 4) stop 'davidson_pull_results failed to pull task_id'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
+  if(rc /= 4) stop 'davidson_pull_results failed to pull imin'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
+  if(rc /= 4) stop 'davidson_pull_results failed to pull imax'
+
+  sz = (imax-imin+1)*N_states_diag
+
+  !todo: double sz for complex?
+  rc8 = f77_zmq_recv8( zmq_socket_pull, v_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8*sz) stop 'davidson_pull_results failed to pull v_t'
+
+  !todo: double sz for complex?
+  rc8 = f77_zmq_recv8( zmq_socket_pull, s_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8*sz) stop 'davidson_pull_results failed to pull s_t'
+
+! Activate if zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
+  if (rc /= 2) then
+    print *,  irp_here, ' : f77_zmq_send (zmq_socket_pull,...'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+
+subroutine davidson_collector_complex(zmq_to_qp_run_socket, zmq_socket_pull, v0, s0, sze, N_st)
+  !todo: implement for complex; check conjg v_t s_t
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Routine collecting the results of the workers in Davidson's algorithm.
+  END_DOC
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  integer, intent(in)            :: sze, N_st
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+
+  complex*16      ,intent(inout) :: v0(sze, N_st)
+  complex*16      ,intent(inout) :: s0(sze, N_st)
+
+  integer                          :: more, task_id, imin, imax
+
+  complex*16, allocatable        :: v_t(:,:), s_t(:,:)
+  logical :: sending
+  integer :: i,j
+  integer, external :: zmq_delete_task_async_send
+  integer, external :: zmq_delete_task_async_recv
+
+  allocate(v_t(N_st,N_det), s_t(N_st,N_det))
+  v0 = (0.d0,0.d0)
+  s0 = (0.d0,0.d0)
+  more = 1
+  sending = .False.
+  do while (more == 1)
+    call davidson_pull_results_complex(zmq_socket_pull, v_t, s_t, imin, imax, task_id)
+    if (zmq_delete_task_async_send(zmq_to_qp_run_socket,task_id,sending) == -1) then
+      stop 'davidson: Unable to delete task (send)'
+    endif
+    do j=1,N_st
+      do i=imin,imax
+        !todo: conjg or no?
+        print*,irp_here,' not implemented for complex (conjg?)'
+        v0(i,j) = v0(i,j) + v_t(j,i)
+        s0(i,j) = s0(i,j) + s_t(j,i)
+      enddo
+    enddo
+    if (zmq_delete_task_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
+      stop 'davidson: Unable to delete task (recv)'
+    endif
+  end do
+  deallocate(v_t,s_t)
+
+end subroutine
+
+
+
+subroutine h_s2_u_0_nstates_zmq_complex(v_0,s_0,u_0,N_st,sze)
+  !todo: implement for complex
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use omp_lib
+  use bitmasks
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2  | u_0\rangle$
+  !
+  ! n : number of determinants
+  !
+  ! H_jj : array of $\langle j | H | j \rangle$
+  !
+  ! S2_jj : array of $\langle j | S^2 | j \rangle$
+  END_DOC
+  integer, intent(in)            :: N_st, sze
+  complex*16, intent(out)        :: v_0(sze,N_st), s_0(sze,N_st)
+  complex*16, intent(inout)      :: u_0(sze,N_st)
+  integer                        :: i,j,k
+  integer                        :: ithread
+  complex*16, allocatable        :: u_t(:,:)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values_complex psi_bilinear_matrix_values_complex psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson')
+
+!  integer :: N_states_diag_save
+!  N_states_diag_save = N_states_diag
+!  N_states_diag = N_st
+  if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
+    stop 'Unable to put N_states_diag on ZMQ server'
+  endif
+
+  if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+    stop 'Unable to put psi on ZMQ server'
+  endif
+  energy = 0.d0
+  if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then
+    stop 'Unable to put energy on ZMQ server'
+  endif
+
+
+  ! Create tasks
+  ! ============
+
+  integer :: istep, imin, imax, ishift, ipos
+  integer, external :: add_task_to_taskserver
+  integer, parameter :: tasksize=10000
+  character*(100000) :: task
+  istep=1
+  ishift=0
+  imin=1
+
+
+  ipos=1
+  do imin=1,N_det,tasksize
+    imax = min(N_det,imin-1+tasksize)
+    do ishift=0,istep-1
+      write(task(ipos:ipos+50),'(4(I11,1X),1X,1A)') imin, imax, ishift, istep, '|'
+      ipos = ipos+50
+      if (ipos > 100000-50) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task'
+        endif
+        ipos=1
+      endif
+    enddo
+  enddo
+
+  if (ipos > 1) then
+    if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+      stop 'Unable to add task'
+    endif
+    ipos=1
+  endif
+
+  allocate(u_t(N_st,N_det))
+  do k=1,N_st
+    call cdset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+
+  call cdtranspose(                                                   &
+      u_0,                                                           &
+      size(u_0, 1),                                                  &
+      u_t,                                                           &
+      size(u_t, 1),                                                  &
+      N_det, N_st)
+
+
+  ASSERT (N_st == N_states_diag)
+  ASSERT (sze >= N_det)
+
+  integer :: rc, ni, nj
+  integer*8 :: rc8
+  double precision :: energy(N_st)
+
+  integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
+  integer, external :: zmq_put_cdmatrix
+  !todo: size/2 for complex?
+  if (size(u_t) < 8388608) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = size(u_t)/8388608 + 1
+  endif
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (zmq_put_cdmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
+    stop 'Unable to put u_t on ZMQ server'
+  endif
+
+  deallocate(u_t)
+
+  integer, external :: zmq_set_running
+  if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+    print *,  irp_here, ': Failed in zmq_set_running'
+  endif
+
+  call omp_set_nested(.True.)
+  !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(2) PRIVATE(ithread)
+  ithread = omp_get_thread_num()
+  if (ithread == 0 ) then
+    call davidson_collector_complex(zmq_to_qp_run_socket, zmq_socket_pull, v_0, s_0, N_det, N_st)
+  else
+    call davidson_slave_inproc(1)
+  endif
+  !$OMP END PARALLEL
+  call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'davidson')
+
+  !$OMP PARALLEL
+  !$OMP SINGLE
+  do k=1,N_st
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call cdset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call cdset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call cdset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+  enddo
+  !$OMP END SINGLE
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+!  N_states_diag = N_states_diag_save
+!  SOFT_TOUCH N_states_diag
+end
+
diff --git a/src/davidson/u0_h_u0.irp.f b/src/davidson/u0_h_u0.irp.f
index 6117a13e..e3a2e1ed 100644
--- a/src/davidson/u0_h_u0.irp.f
+++ b/src/davidson/u0_h_u0.irp.f
@@ -6,7 +6,11 @@
 ! 
 ! psi_s2(i) = $\langle \Psi_i | S^2 | \Psi_i \rangle$
   END_DOC
-  call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size)
+  if (is_complex) then
+    call u_0_h_u_0_complex(psi_energy,psi_s2,psi_coef_complex,N_det,psi_det,N_int,N_states,psi_det_size)
+  else
+    call u_0_H_u_0(psi_energy,psi_s2,psi_coef,N_det,psi_det,N_int,N_states,psi_det_size)
+  endif
   integer :: i
   do i=N_det+1,N_states
     psi_energy(i) = 0.d0
@@ -708,3 +712,703 @@ N_int;;
 END_TEMPLATE
 
 
+!==============================================================================!
+!                                                                              !
+!                                    Complex                                   !
+!                                                                              !
+!==============================================================================!
+
+subroutine u_0_H_u_0_complex(e_0,s_0,u_0,n,keys_tmp,Nint,N_st,sze)
+  !todo: implement for complex
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $E_0 = \frac{\langle u_0 | H | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
+  !
+  ! and      $S_0 = \frac{\langle u_0 | S^2 | u_0 \rangle}{\langle u_0 | u_0 \rangle}$
+  !
+  ! n : number of determinants
+  !
+  END_DOC
+  integer, intent(in)             :: n,Nint, N_st, sze
+  double precision, intent(out)   :: e_0(N_st),s_0(N_st)
+  complex*16, intent(inout)       :: u_0(sze,N_st)
+  integer(bit_kind),intent(in)    :: keys_tmp(Nint,2,n)
+
+  complex*16, allocatable         :: v_0(:,:), s_vec(:,:), u_1(:,:)
+  double precision                :: u_dot_u_complex,diag_H_mat_elem
+  complex*16                      :: u_dot_v_complex
+  integer                         :: i,j, istate
+
+  if ((n > 100000).and.distributed_davidson) then
+    allocate (v_0(n,N_states_diag),s_vec(n,N_states_diag), u_1(n,N_states_diag))
+    u_1(:,:) = (0.d0,0.d0)
+    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
+    call h_s2_u_0_nstates_zmq(v_0,s_vec,u_1,N_states_diag,n)
+  else if (n < n_det_max_full) then
+    allocate (v_0(n,N_st),s_vec(n,N_st), u_1(n,N_st))
+    v_0(:,:) = 0.d0
+    u_1(:,:) = 0.d0
+    s_vec(:,:) = 0.d0
+    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
+    do istate = 1,N_st
+      do j=1,n
+        do i=1,n
+          v_0(i,istate) = v_0(i,istate) + h_matrix_all_dets(i,j) * u_0(j,istate)
+          s_vec(i,istate) = s_vec(i,istate) + S2_matrix_all_dets(i,j) * u_0(j,istate)
+        enddo
+      enddo
+    enddo
+  else
+    allocate (v_0(n,N_st),s_vec(n,N_st),u_1(n,N_st))
+    u_1(:,:) = 0.d0
+    u_1(1:n,1:N_st) = u_0(1:n,1:N_st)
+    call H_S2_u_0_nstates_openmp(v_0,s_vec,u_1,N_st,n)
+  endif
+  u_0(1:n,1:N_st) = u_1(1:n,1:N_st)
+  deallocate(u_1)
+  double precision :: norm
+  !$OMP PARALLEL DO PRIVATE(i,norm) DEFAULT(SHARED)
+  do i=1,N_st
+    norm = u_dot_u(u_0(1,i),n)
+    if (norm /= 0.d0) then
+      e_0(i) = u_dot_v(v_0(1,i),u_0(1,i),n)
+      s_0(i) = u_dot_v(s_vec(1,i),u_0(1,i),n)
+    else
+      e_0(i) = 0.d0
+      s_0(i) = 0.d0
+    endif
+  enddo
+  !$OMP END PARALLEL DO
+  deallocate (s_vec, v_0)
+end
+
+
+subroutine H_S2_u_0_nstates_openmp_complex(v_0,s_0,u_0,N_st,sze)
+  !todo: implement for complex
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_0 = H | u_0\rangle$ and $s_0 = S^2  | u_0\rangle$.
+  !
+  ! Assumes that the determinants are in psi_det
+  !
+  ! istart, iend, ishift, istep are used in ZMQ parallelization.
+  END_DOC
+  integer, intent(in)            :: N_st,sze
+  double precision, intent(inout)  :: v_0(sze,N_st), s_0(sze,N_st), u_0(sze,N_st)
+  integer :: k
+  double precision, allocatable  :: u_t(:,:), v_t(:,:), s_t(:,:)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
+  allocate(u_t(N_st,N_det),v_t(N_st,N_det),s_t(N_st,N_det))
+
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+  v_t = 0.d0
+  s_t = 0.d0
+  call dtranspose(                                                   &
+      u_0,                                                           &
+      size(u_0, 1),                                                  &
+      u_t,                                                           &
+      size(u_t, 1),                                                  &
+      N_det, N_st)
+
+  call H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,1,N_det,0,1)
+  deallocate(u_t)
+
+  call dtranspose(                                                   &
+      v_t,                                                           &
+      size(v_t, 1),                                                  &
+      v_0,                                                           &
+      size(v_0, 1),                                                  &
+      N_st, N_det)
+  call dtranspose(                                                   &
+      s_t,                                                           &
+      size(s_t, 1),                                                  &
+      s_0,                                                           &
+      size(s_0, 1),                                                  &
+      N_st, N_det)
+  deallocate(v_t,s_t)
+
+  do k=1,N_st
+    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    call dset_order(s_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+  enddo
+
+end
+subroutine H_S2_u_0_nstates_openmp_work_complex(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+  !todo: implement for complex
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_t = H | u_t\rangle$ and $s_t = S^2  | u_t\rangle$
+  !
+  ! Default should be 1,N_det,0,1
+  END_DOC
+  integer, intent(in)            :: N_st,sze,istart,iend,ishift,istep
+  complex*16, intent(in)   :: u_t(N_st,N_det)
+  complex*16, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)
+
+
+  PROVIDE ref_bitmask_energy N_int
+
+  select case (N_int)
+    case (1)
+      call H_S2_u_0_nstates_openmp_work_complex_1(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case (2)
+      call H_S2_u_0_nstates_openmp_work_complex_2(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case (3)
+      call H_S2_u_0_nstates_openmp_work_complex_3(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case (4)
+      call H_S2_u_0_nstates_openmp_work_complex_4(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+    case default
+      call H_S2_u_0_nstates_openmp_work_complex_N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+  end select
+end
+
+BEGIN_TEMPLATE
+
+subroutine H_S2_u_0_nstates_openmp_work_complex_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,istep)
+  !todo: implement for complex
+  print*,irp_here,' not implemented for complex'
+  stop -1
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_t = H | u_t \\rangle$ and $s_t = S^2 | u_t\\rangle$
+  !
+  ! Default should be 1,N_det,0,1
+  END_DOC
+  integer, intent(in)            :: N_st,sze,istart,iend,ishift,istep
+  complex*16, intent(in)         :: u_t(N_st,N_det)
+  complex*16, intent(out)        :: v_t(N_st,sze), s_t(N_st,sze)
+
+  complex*16                     :: hij, sij
+  integer                        :: i,j,k,l,kk
+  integer                        :: k_a, k_b, l_a, l_b, m_a, m_b
+  integer                        :: istate
+  integer                        :: krow, kcol, krow_b, kcol_b
+  integer                        :: lrow, lcol
+  integer                        :: mrow, mcol
+  integer(bit_kind)              :: spindet($N_int)
+  integer(bit_kind)              :: tmp_det($N_int,2)
+  integer(bit_kind)              :: tmp_det2($N_int,2)
+  integer(bit_kind)              :: tmp_det3($N_int,2)
+  integer(bit_kind), allocatable :: buffer(:,:)
+  integer                        :: n_doubles
+  integer, allocatable           :: doubles(:)
+  integer, allocatable           :: singles_a(:)
+  integer, allocatable           :: singles_b(:)
+  integer, allocatable           :: idx(:), idx0(:)
+  integer                        :: maxab, n_singles_a, n_singles_b, kcol_prev
+  integer*8                      :: k8
+  logical                        :: compute_singles
+  integer*8                      :: last_found, left, right, right_max
+  double precision               :: rss, mem, ratio
+  complex*16, allocatable        :: utl(:,:)
+  integer, parameter             :: block_size=128
+
+!  call resident_memory(rss)
+!  mem = dble(singles_beta_csc_size) / 1024.d0**3
+!
+!  compute_singles = (mem+rss > qp_max_mem)
+!
+!  if (.not.compute_singles) then
+!    provide singles_beta_csc
+!  endif
+compute_singles=.True.
+
+  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
+  allocate(idx0(maxab))
+
+  do i=1,maxab
+    idx0(i) = i
+  enddo
+
+  ! Prepare the array of all alpha single excitations
+  ! -------------------------------------------------
+
+  PROVIDE N_int nthreads_davidson
+  !$OMP PARALLEL DEFAULT(SHARED) NUM_THREADS(nthreads_davidson)        &
+      !$OMP   SHARED(psi_bilinear_matrix_rows, N_det,                &
+      !$OMP          psi_bilinear_matrix_columns,                    &
+      !$OMP          psi_det_alpha_unique, psi_det_beta_unique,      &
+      !$OMP          n_det_alpha_unique, n_det_beta_unique, N_int,   &
+      !$OMP          psi_bilinear_matrix_transp_rows,                &
+      !$OMP          psi_bilinear_matrix_transp_columns,             &
+      !$OMP          psi_bilinear_matrix_transp_order, N_st,         &
+      !$OMP          psi_bilinear_matrix_order_transp_reverse,       &
+      !$OMP          psi_bilinear_matrix_columns_loc,                &
+      !$OMP          psi_bilinear_matrix_transp_rows_loc,            &
+      !$OMP          istart, iend, istep, irp_here, v_t, s_t,        &
+      !$OMP          ishift, idx0, u_t, maxab, compute_singles,      &
+      !$OMP          singles_alpha_csc,singles_alpha_csc_idx,        &
+      !$OMP          singles_beta_csc,singles_beta_csc_idx)          &
+      !$OMP   PRIVATE(krow, kcol, tmp_det, spindet, k_a, k_b, i,     &
+      !$OMP          lcol, lrow, l_a, l_b, utl, kk,                  &
+      !$OMP          buffer, doubles, n_doubles,                     &
+      !$OMP          tmp_det2, hij, sij, idx, l, kcol_prev,          &
+      !$OMP          singles_a, n_singles_a, singles_b, ratio,       &
+      !$OMP          n_singles_b, k8, last_found,left,right,right_max)
+
+  ! Alpha/Beta double excitations
+  ! =============================
+
+  allocate( buffer($N_int,maxab),                                     &
+      singles_a(maxab),                                              &
+      singles_b(maxab),                                              &
+      doubles(maxab),                                                &
+      idx(maxab), utl(N_st,block_size))
+
+  kcol_prev=-1
+
+  ASSERT (iend <= N_det)
+  ASSERT (istart > 0)
+  ASSERT (istep  > 0)
+
+  !$OMP DO SCHEDULE(guided,64)
+  do k_a=istart+ishift,iend,istep
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    ASSERT (krow <= N_det_alpha_unique)
+
+    kcol = psi_bilinear_matrix_columns(k_a)
+    ASSERT (kcol <= N_det_beta_unique)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+
+    if (kcol /= kcol_prev) then
+      tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+      if (compute_singles) then
+        call get_all_spin_singles_$N_int(                              &
+            psi_det_beta_unique, idx0,                                 &
+            tmp_det(1,2), N_det_beta_unique,                           &
+            singles_b, n_singles_b)
+      else
+        n_singles_b = 0
+        !DIR$ LOOP COUNT avg(1000)
+        do k8=singles_beta_csc_idx(kcol),singles_beta_csc_idx(kcol+1)-1
+          n_singles_b = n_singles_b+1
+          singles_b(n_singles_b) = singles_beta_csc(k8)
+        enddo
+      endif
+    endif
+    kcol_prev = kcol
+
+    ! Loop over singly excited beta columns
+    ! -------------------------------------
+
+    !DIR$ LOOP COUNT avg(1000)
+    do i=1,n_singles_b
+      lcol = singles_b(i)
+
+      tmp_det2(1:$N_int,2) = psi_det_beta_unique(1:$N_int, lcol)
+
+!---
+!      if (compute_singles) then
+
+        l_a = psi_bilinear_matrix_columns_loc(lcol)
+        ASSERT (l_a <= N_det)
+
+        !DIR$ UNROLL(8)
+        !DIR$ LOOP COUNT avg(50000)
+        do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
+          lrow = psi_bilinear_matrix_rows(l_a)
+          ASSERT (lrow <= N_det_alpha_unique)
+
+          buffer(1:$N_int,j) = psi_det_alpha_unique(1:$N_int, lrow)  ! hot spot
+
+          ASSERT (l_a <= N_det)
+          idx(j) = l_a
+          l_a = l_a+1
+        enddo
+        j = j-1
+
+        call get_all_spin_singles_$N_int(                              &
+            buffer, idx, tmp_det(1,1), j,                              &
+            singles_a, n_singles_a )
+
+!-----
+!      else
+!
+! ! Search for singles
+!
+!call cpu_time(time0)
+! ! Right boundary
+!        l_a = psi_bilinear_matrix_columns_loc(lcol+1)-1
+!        ASSERT (l_a <= N_det)
+!        do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
+!          lrow = psi_bilinear_matrix_rows(l_a)
+!          ASSERT (lrow <= N_det_alpha_unique)
+!
+!          left = singles_alpha_csc_idx(krow)
+!          right_max = -1_8
+!          right = singles_alpha_csc_idx(krow+1)
+!          do while (right-left>0_8)
+!            k8 = shiftr(right+left,1)
+!            if (singles_alpha_csc(k8) > lrow) then
+!              right = k8
+!            else if (singles_alpha_csc(k8) < lrow) then
+!              left = k8 + 1_8
+!            else
+!              right_max = k8+1_8
+!              exit
+!            endif
+!          enddo
+!          if (right_max > 0_8) exit
+!          l_a = l_a-1
+!        enddo
+!        if (right_max < 0_8) right_max = singles_alpha_csc_idx(krow)
+!
+! ! Search
+!        n_singles_a = 0
+!        l_a = psi_bilinear_matrix_columns_loc(lcol)
+!        ASSERT (l_a <= N_det)
+!
+!        last_found = singles_alpha_csc_idx(krow)
+!        do j=1,psi_bilinear_matrix_columns_loc(lcol+1) - psi_bilinear_matrix_columns_loc(lcol)
+!          lrow = psi_bilinear_matrix_rows(l_a)
+!          ASSERT (lrow <= N_det_alpha_unique)
+!
+!          left = last_found
+!          right = right_max
+!          do while (right-left>0_8)
+!            k8 = shiftr(right+left,1)
+!            if (singles_alpha_csc(k8) > lrow) then
+!              right = k8
+!            else if (singles_alpha_csc(k8) < lrow) then
+!              left = k8 + 1_8
+!            else
+!              n_singles_a += 1
+!              singles_a(n_singles_a) = l_a
+!              last_found = k8+1_8
+!              exit
+!            endif
+!          enddo
+!          l_a = l_a+1
+!        enddo
+!        j = j-1
+!
+!      endif
+!-----
+
+      ! Loop over alpha singles
+      ! -----------------------
+
+      !DIR$ LOOP COUNT avg(1000)
+      do k = 1,n_singles_a,block_size
+        ! Prefetch u_t(:,l_a)
+        do kk=0,block_size-1
+          if (k+kk > n_singles_a) exit
+          l_a = singles_a(k+kk)
+          ASSERT (l_a <= N_det)
+
+          do l=1,N_st
+            utl(l,kk+1) = u_t(l,l_a)
+          enddo
+        enddo
+
+        do kk=0,block_size-1
+          if (k+kk > n_singles_a) exit
+          l_a = singles_a(k+kk)
+          lrow = psi_bilinear_matrix_rows(l_a)
+          ASSERT (lrow <= N_det_alpha_unique)
+
+          tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
+          call i_H_j_double_alpha_beta(tmp_det,tmp_det2,$N_int,hij)
+          call get_s2(tmp_det,tmp_det2,$N_int,sij)
+          !DIR$ LOOP COUNT AVG(4)
+          do l=1,N_st
+            v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+            s_t(l,k_a) = s_t(l,k_a) + sij * utl(l,kk+1)
+          enddo
+        enddo
+      enddo
+
+    enddo
+
+  enddo
+  !$OMP END DO
+
+  !$OMP DO SCHEDULE(guided,64)
+  do k_a=istart+ishift,iend,istep
+
+
+    ! Single and double alpha excitations
+    ! ===================================
+
+
+    ! Initial determinant is at k_a in alpha-major representation
+    ! -----------------------------------------------------------------------
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    ASSERT (krow <= N_det_alpha_unique)
+
+    kcol = psi_bilinear_matrix_columns(k_a)
+    ASSERT (kcol <= N_det_beta_unique)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+
+    ! Initial determinant is at k_b in beta-major representation
+    ! ----------------------------------------------------------------------
+
+    k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
+    ASSERT (k_b <= N_det)
+
+    spindet(1:$N_int) = tmp_det(1:$N_int,1)
+
+    ! Loop inside the beta column to gather all the connected alphas
+    lcol = psi_bilinear_matrix_columns(k_a)
+    l_a = psi_bilinear_matrix_columns_loc(lcol)
+
+    !DIR$ LOOP COUNT avg(200000)
+    do i=1,N_det_alpha_unique
+      if (l_a > N_det) exit
+      lcol = psi_bilinear_matrix_columns(l_a)
+      if (lcol /= kcol) exit
+      lrow = psi_bilinear_matrix_rows(l_a)
+      ASSERT (lrow <= N_det_alpha_unique)
+
+      buffer(1:$N_int,i) = psi_det_alpha_unique(1:$N_int, lrow) ! Hot spot
+      idx(i) = l_a
+      l_a = l_a+1
+    enddo
+    i = i-1
+
+    call get_all_spin_singles_and_doubles_$N_int(                    &
+        buffer, idx, spindet, i,                                     &
+        singles_a, doubles, n_singles_a, n_doubles )
+
+    ! Compute Hij for all alpha singles
+    ! ----------------------------------
+
+    tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+    !DIR$ LOOP COUNT avg(1000)
+    do i=1,n_singles_a,block_size
+      ! Prefetch u_t(:,l_a)
+      do kk=0,block_size-1
+        if (i+kk > n_singles_a) exit
+        l_a = singles_a(i+kk)
+        ASSERT (l_a <= N_det)
+
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
+
+      do kk=0,block_size-1
+        if (i+kk > n_singles_a) exit
+        l_a = singles_a(i+kk)
+        lrow = psi_bilinear_matrix_rows(l_a)
+        ASSERT (lrow <= N_det_alpha_unique)
+
+        tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, lrow)
+        call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 1, hij)
+
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! single => sij = 0
+        enddo
+      enddo
+    enddo
+
+
+    ! Compute Hij for all alpha doubles
+    ! ----------------------------------
+
+    !DIR$ LOOP COUNT avg(50000)
+    do i=1,n_doubles,block_size
+      ! Prefetch u_t(:,l_a)
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_a = doubles(i+kk)
+        ASSERT (l_a <= N_det)
+
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
+
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_a = doubles(i+kk)
+        lrow = psi_bilinear_matrix_rows(l_a)
+        ASSERT (lrow <= N_det_alpha_unique)
+
+        call i_H_j_double_spin( tmp_det(1,1), psi_det_alpha_unique(1, lrow), $N_int, hij)
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! same spin => sij = 0
+        enddo
+      enddo
+    enddo
+
+
+    ! Single and double beta excitations
+    ! ==================================
+
+
+    ! Initial determinant is at k_a in alpha-major representation
+    ! -----------------------------------------------------------------------
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    kcol = psi_bilinear_matrix_columns(k_a)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+
+    spindet(1:$N_int) = tmp_det(1:$N_int,2)
+
+    ! Initial determinant is at k_b in beta-major representation
+    ! -----------------------------------------------------------------------
+
+    k_b = psi_bilinear_matrix_order_transp_reverse(k_a)
+    ASSERT (k_b <= N_det)
+
+    ! Loop inside the alpha row to gather all the connected betas
+    lrow = psi_bilinear_matrix_transp_rows(k_b)
+    l_b = psi_bilinear_matrix_transp_rows_loc(lrow)
+    !DIR$ LOOP COUNT avg(200000)
+    do i=1,N_det_beta_unique
+      if (l_b > N_det) exit
+      lrow = psi_bilinear_matrix_transp_rows(l_b)
+      if (lrow /= krow) exit
+      lcol = psi_bilinear_matrix_transp_columns(l_b)
+      ASSERT (lcol <= N_det_beta_unique)
+
+      buffer(1:$N_int,i) = psi_det_beta_unique(1:$N_int, lcol)
+      idx(i) = l_b
+      l_b = l_b+1
+    enddo
+    i = i-1
+
+    call get_all_spin_singles_and_doubles_$N_int(                    &
+        buffer, idx, spindet, i,                                     &
+        singles_b, doubles, n_singles_b, n_doubles )
+
+    ! Compute Hij for all beta singles
+    ! ----------------------------------
+
+    tmp_det2(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    !DIR$ LOOP COUNT avg(1000)
+    do i=1,n_singles_b,block_size
+      do kk=0,block_size-1
+        if (i+kk > n_singles_b) exit
+        l_b = singles_b(i+kk)
+        ASSERT (l_b <= N_det)
+
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        ASSERT (l_a <= N_det)
+
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
+
+      do kk=0,block_size-1
+        if (i+kk > n_singles_b) exit
+        l_b = singles_b(i+kk)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        lcol = psi_bilinear_matrix_transp_columns(l_b)
+        ASSERT (lcol <= N_det_beta_unique)
+
+        tmp_det2(1:$N_int,2) = psi_det_beta_unique (1:$N_int, lcol)
+        call i_h_j_single_spin( tmp_det, tmp_det2, $N_int, 2, hij)
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! single => sij = 0
+        enddo
+      enddo
+    enddo
+
+    ! Compute Hij for all beta doubles
+    ! ----------------------------------
+
+    !DIR$ LOOP COUNT avg(50000)
+    do i=1,n_doubles,block_size
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_b = doubles(i+kk)
+        ASSERT (l_b <= N_det)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        ASSERT (l_a <= N_det)
+
+        do l=1,N_st
+          utl(l,kk+1) = u_t(l,l_a)
+        enddo
+      enddo
+
+      do kk=0,block_size-1
+        if (i+kk > n_doubles) exit
+        l_b = doubles(i+kk)
+        l_a = psi_bilinear_matrix_transp_order(l_b)
+        lcol = psi_bilinear_matrix_transp_columns(l_b)
+        ASSERT (lcol <= N_det_beta_unique)
+
+        call i_H_j_double_spin( tmp_det(1,2), psi_det_beta_unique(1, lcol), $N_int, hij)
+
+        !DIR$ LOOP COUNT AVG(4)
+        do l=1,N_st
+          v_t(l,k_a) = v_t(l,k_a) + hij * utl(l,kk+1)
+          ! same spin => sij = 0
+        enddo
+      enddo
+    enddo
+
+
+    ! Diagonal contribution
+    ! =====================
+
+
+    ! Initial determinant is at k_a in alpha-major representation
+    ! -----------------------------------------------------------------------
+
+    krow = psi_bilinear_matrix_rows(k_a)
+    ASSERT (krow <= N_det_alpha_unique)
+
+    kcol = psi_bilinear_matrix_columns(k_a)
+    ASSERT (kcol <= N_det_beta_unique)
+
+    tmp_det(1:$N_int,1) = psi_det_alpha_unique(1:$N_int, krow)
+    tmp_det(1:$N_int,2) = psi_det_beta_unique (1:$N_int, kcol)
+
+    double precision, external :: diag_H_mat_elem, diag_S_mat_elem
+
+    hij = diag_H_mat_elem(tmp_det,$N_int)
+    sij = diag_S_mat_elem(tmp_det,$N_int)
+    !DIR$ LOOP COUNT AVG(4)
+    do l=1,N_st
+      v_t(l,k_a) = v_t(l,k_a) + hij * u_t(l,k_a)
+      s_t(l,k_a) = s_t(l,k_a) + sij * u_t(l,k_a)
+    enddo
+
+  end do
+  !$OMP END DO
+  deallocate(buffer, singles_a, singles_b, doubles, idx, utl)
+  !$OMP END PARALLEL
+
+end
+
+SUBST [ N_int ]
+
+1;;
+2;;
+3;;
+4;;
+N_int;;
+
+END_TEMPLATE
+
+
diff --git a/src/utils/transpose.irp.f b/src/utils/transpose.irp.f
index 7c86f458..ddffb172 100644
--- a/src/utils/transpose.irp.f
+++ b/src/utils/transpose.irp.f
@@ -84,3 +84,100 @@ recursive subroutine dtranspose(A,LDA,B,LDB,d1,d2)
 
 end
 
+
+!DIR$ attributes forceinline :: cdtranspose
+recursive subroutine cdtranspose(A,LDA,B,LDB,d1,d2)
+ implicit none
+ BEGIN_DOC
+! Transpose input matrix A into output matrix B
+! don't take complex conjugate
+ END_DOC
+ integer, intent(in) :: d1, d2, LDA, LDB
+ complex*16, intent(in) :: A(LDA,d2)
+ complex*16, intent(out) :: B(LDB,d1)
+
+
+! do j=1,d1
+!   do i=1,d2
+!    B(i,j  ) = A(j  ,i)
+!   enddo
+! enddo
+! return
+
+ integer :: i,j,k
+ if ( d2 < 32 ) then
+   do j=1,d1
+     !DIR$ LOOP COUNT (16)
+     do i=1,d2
+      B(i,j  ) = A(j  ,i)
+     enddo
+    enddo
+   return
+ else if (d1 > d2) then
+ !DIR$ forceinline 
+   k=d1/2
+ !DIR$ forceinline recursive
+   call cdtranspose(A(1,1),LDA,B(1,1),LDB,k,d2)
+ !DIR$ forceinline recursive
+   call cdtranspose(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
+   return
+ else
+ !DIR$ forceinline
+   k=d2/2
+ !DIR$ forceinline recursive
+   call cdtranspose(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
+ !DIR$ forceinline recursive
+   call cdtranspose(A(1,1),LDA,B(1,1),LDB,d1,k)
+   return
+ endif
+
+end
+
+!DIR$ attributes forceinline :: cdadjoint
+recursive subroutine cdadjoint(A,LDA,B,LDB,d1,d2)
+ implicit none
+ BEGIN_DOC
+! Transpose input matrix A into output matrix B
+! and take complex conjugate
+ END_DOC
+ integer, intent(in) :: d1, d2, LDA, LDB
+ complex*16, intent(in) :: A(LDA,d2)
+ complex*16, intent(out) :: B(LDB,d1)
+
+
+! do j=1,d1
+!   do i=1,d2
+!    B(i,j  ) = A(j  ,i)
+!   enddo
+! enddo
+! return
+
+ integer :: i,j,k
+ if ( d2 < 32 ) then
+   do j=1,d1
+     !DIR$ LOOP COUNT (16)
+     do i=1,d2
+      B(i,j  ) = conjg(A(j  ,i))
+     enddo
+    enddo
+   return
+ else if (d1 > d2) then
+ !DIR$ forceinline 
+   k=d1/2
+ !DIR$ forceinline recursive
+   call cdadjoint(A(1,1),LDA,B(1,1),LDB,k,d2)
+ !DIR$ forceinline recursive
+   call cdadjoint(A(k+1,1),LDA,B(1,k+1),LDB,d1-k,d2)
+   return
+ else
+ !DIR$ forceinline
+   k=d2/2
+ !DIR$ forceinline recursive
+   call cdadjoint(A(1,k+1),LDA,B(k+1,1),LDB,d1,d2-k)
+ !DIR$ forceinline recursive
+   call cdadjoint(A(1,1),LDA,B(1,1),LDB,d1,k)
+   return
+ endif
+
+end
+
diff --git a/src/utils_complex/qp2-pbc-diff.txt b/src/utils_complex/qp2-pbc-diff.txt
index 9f3598e3..730a3970 100644
--- a/src/utils_complex/qp2-pbc-diff.txt
+++ b/src/utils_complex/qp2-pbc-diff.txt
@@ -1,6 +1,11 @@
 
 -------------------------------------------------------------------------------------
 current:
+
+  irp_align for complex?
+  zmq_put_psi_complex instead of branch inside zmq_put_psi?
+    are there cases where we call this without already being on a complex branch of code?
+
   h_apply.irp.f
     push/pull_pt2
       pt2,norm_pert,h_pert_diag
@@ -83,6 +88,27 @@ determinants:
            and broadcast_chunks_complex_double in mpi/mpi.irp.f
 
 
+davidson	
+  (****)  davidson_parallel.irp.f
+            davidson_slave_work
+              branch inside or outside?
+                (currently inside)
+              same broadcast size issue as in h_apply (2^23 elements)
+              needs h_s2_u_0_nstates_openmp_work_complex (be careful with transp/conjg)
+              needs davidson_push_results_async_send_complex
+            davidson_pu{sh,ll}_results{,_async_send}_complex
+              double sz?
+              does f77_zmq_send8 know about types, or just send raw data?
+            davidson_collector_complex
+              is {v,s}_t conjugate transpose or just transpose?
+  (****)  diagonalization_hs2_dressed.irp.f
+  (****)  diagonalize_ci.irp.f
+  (****)  EZFIO.cfg
+  (****)  ezfio_interface.irp.f
+  (****)  input.irp.f
+  (****)  print_e_components.irp.f
+  (****)  u0_h_u0.irp.f
+  (****)  u0_wee_u0.irp.f
 -------------------------------------------------------------------------------------
 
 for complex data, add extra dim (size 2) and treat as real in EZFIO.cfg
@@ -496,6 +522,3 @@ src/scf_utils/scf_density_matrix_ao_complex.irp.f
 		[ complex*16, SCF_density_matrix_ao_complex, (ao_num,ao_num) ]
 	
 
-	
-	
-