First CSF davidson working

2025-02-18 15:44:14 +01:00 · 2021-02-17 14:59:25 +01:00 · 2021-02-17 14:59:25 +01:00 · b87e87b740
commit b87e87b740
parent 554579492b
11 changed files with 4116 additions and 4 deletions
--- a/src/csf/cfgCI_interface.f90
+++ b/src/csf/cfgCI_interface.f90
@ -0,0 +1,49 @@
+module cfunctions
+        use, intrinsic :: ISO_C_BINDING
+      interface
+         subroutine printcfglist(nint, ncfgs, cfglist) bind(C, name='printCFGList')
+           import C_INT32_T, C_INT64_T
+         integer(kind=C_INT32_T) :: nint
+         integer(kind=C_INT32_T) :: ncfgs
+         integer(kind=C_INT64_T) :: cfglist(nint,2,ncfgs)
+       end subroutine printcfglist
+      end interface
+      interface
+         subroutine getApqIJMatrixDims(Isomo, Jsomo, MS, rowsout, colsout) &
+              bind(C, name='getApqIJMatrixDims')
+           import C_INT32_T, C_INT64_T
+           integer(kind=C_INT64_T),value,intent(in) :: Isomo ! CSFI
+           integer(kind=C_INT64_T),value,intent(in) :: Jsomo ! CSFJ
+           integer(kind=C_INT64_T),value,intent(in) :: MS    ! Ms = 2*Spin
+           integer(kind=C_INT32_T),intent(out):: rowsout
+           integer(kind=C_INT32_T),intent(out):: colsout
+         end subroutine getApqIJMatrixDims
+      end interface
+      interface
+         subroutine getApqIJMatrixDriver(Isomo, Jsomo, orbp, orbq,  &
+              MS, NMO, CSFICSFJApqIJ, rowsmax, colsmax) bind(C, name='getApqIJMatrixDriverArrayInp')
+           import C_INT32_T, C_INT64_T, C_DOUBLE
+           integer(kind=C_INT64_T),value,intent(in) :: Isomo
+           integer(kind=C_INT64_T),value,intent(in) :: Jsomo
+           integer(kind=C_INT32_T),value,intent(in) :: orbp
+           integer(kind=C_INT32_T),value,intent(in) :: orbq
+           integer(kind=C_INT64_T),value,intent(in) :: MS
+           integer(kind=C_INT64_T),value,intent(in) :: NMO
+           integer(kind=C_INT32_T),intent(in) :: rowsmax
+           integer(kind=C_INT32_T),intent(in) :: colsmax
+           real   (kind=C_DOUBLE ),intent(out) :: CSFICSFJApqIJ(rowsmax,colsmax)
+           !integer(kind=C_INT32_T),dimension(rowApqIJ,colApqIJ) :: ApqIJ
+         end subroutine getApqIJMatrixDriver
+      end interface
+       interface
+         subroutine getCSFtoDETTransformationMatrix(Isomo,&
+              MS, rowsmax, colsmax, csftodetmatrix) bind(C, name='convertCSFtoDetBasis')
+           import C_INT32_T, C_INT64_T, C_DOUBLE
+           integer(kind=C_INT64_T),value,intent(in) :: Isomo
+           integer(kind=C_INT64_T),value,intent(in) :: MS
+           integer(kind=C_INT32_T),intent(in) :: rowsmax
+           integer(kind=C_INT32_T),intent(in) :: colsmax
+           real   (kind=C_DOUBLE ),intent(out) :: csftodetmatrix(rowsmax,colsmax)
+         end subroutine getCSFtoDETTransformationMatrix
+      end interface
+    end module cfunctions
--- a/src/csf/cfgCI_utils.c
+++ b/src/csf/cfgCI_utils.c
--- a/src/csf/configurations.irp.f
+++ b/src/csf/configurations.irp.f
@ -1,4 +1,13 @@
 use bitmasks
+
+BEGIN_PROVIDER [ integer, spin_multiplicity ]
+ implicit none
+ BEGIN_DOC
+ ! n_alpha - n_beta + 1
+ END_DOC
+ spin_multiplicity = elec_alpha_num - elec_beta_num + 1
+END_PROVIDER
+
 subroutine configuration_of_det(d,o,Nint)
  use bitmasks
  implicit none
--- a/src/csf/connected_to_ref.irp.f
+++ b/src/csf/connected_to_ref.irp.f
@ -0,0 +1,29 @@
+integer*8 function configuration_search_key(cfg,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Returns an integer*8 corresponding to a determinant index for searching.
+  ! The left-most 8 bits contain the number of open shells+1. This ensures that the CSF
+  ! are packed with the same seniority.
+  END_DOC
+  integer, intent(in) :: Nint
+  integer(bit_kind), intent(in) :: cfg(Nint,2)
+  integer :: i, n_open_shells
+  integer*8 :: mask
+
+  i = shiftr(elec_alpha_num, bit_kind_shift)+1
+  configuration_search_key = int(shiftr(ior(cfg(i,1),cfg(i,2)),1)+sum(cfg),8)
+
+  mask = X'00FFFFFFFFFFFFFF'
+  configuration_search_key = iand(mask,configuration_search_key)
+
+  n_open_shells = 1
+  do i=1,Nint
+    if (cfg(i,1) == 0_bit_kind) cycle
+    n_open_shells = n_open_shells + popcnt(cfg(i,1))
+  enddo
+  mask = n_open_shells
+  mask = shiftl(mask,56)
+  configuration_search_key = ior (mask,configuration_search_key)
+
+end
--- a/src/csf/conversion.irp.f
+++ b/src/csf/conversion.irp.f
@ -0,0 +1,148 @@
+subroutine convertWFfromDETtoCSF(N_st,psi_coef_det_in, psi_coef_cfg_out)
+  use cfunctions
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Documentation for DetToCSFTransformationMatrix
+  ! Provides the matrix of transformatons for the
+  ! conversion between determinant to CSF basis (in BFs)
+  END_DOC
+  integer, intent(in)            :: N_st
+  double precision, intent(in)   :: psi_coef_det_in(N_det,N_st)
+  double precision, intent(out)  :: psi_coef_cfg_out(n_CSF,N_st)
+  integer*8                      :: Isomo, Idomo, mask
+  integer(bit_kind)              :: Ialpha(N_int) ,Ibeta(N_int)
+  integer                        :: rows, cols, i, j, k
+  integer                        :: startdet, enddet
+  integer                        :: ndetI
+  integer                        :: getNSOMO
+  double precision,allocatable   :: tempBuffer(:,:)
+  double precision,allocatable   :: tempCoeff(:,:)
+  double precision               :: phasedet
+  integer                        :: idx
+
+  ! initialization
+  psi_coef_cfg_out(:,1) = 0.d0
+
+  integer s, bfIcfg
+  integer countcsf
+  countcsf = 0
+  phasedet = 1.0d0
+  do i = 1,N_configuration
+    startdet = psi_configuration_to_psi_det(1,i)
+    enddet = psi_configuration_to_psi_det(2,i)
+    ndetI = enddet-startdet+1
+
+    allocate(tempCoeff(ndetI,N_st))
+    do j = startdet, enddet
+      idx = psi_configuration_to_psi_det_data(j)
+      Ialpha(:) = psi_det(:,1,idx)
+      Ibeta(:)  = psi_det(:,2,idx)
+      call get_phase_qp_to_cfg(Ialpha, Ibeta, phasedet)
+      do k=1,N_st
+        tempCoeff(j-startdet+1,k) = psi_coef_det_in(idx, k)*phasedet
+      enddo
+    enddo
+
+    s = 0
+    do k=1,N_int
+      if (psi_configuration(k,1,i) == 0_bit_kind) cycle
+      s = s + popcnt(psi_configuration(k,1,i))
+    enddo
+    bfIcfg = max(1,nint((binom(s,(s+1)/2)-binom(s,((s+1)/2)+1))))
+
+    ! perhaps blocking with CFGs of same seniority
+    ! can be more efficient
+    allocate(tempBuffer(bfIcfg,ndetI))
+    tempBuffer = DetToCSFTransformationMatrix(s,:bfIcfg,:ndetI)
+
+    call dgemm('N','N', bfIcfg, N_st, ndetI, 1.d0, tempBuffer, size(tempBuffer,1),&
+        tempCoeff, size(tempCoeff,1), 0.d0, psi_coef_cfg_out(countcsf+1,1),&
+        size(psi_coef_cfg_out,1))
+
+    deallocate(tempCoeff)
+    deallocate(tempBuffer)
+    countcsf += bfIcfg
+  enddo
+
+end
+
+
+subroutine convertWFfromCSFtoDET(N_st,psi_coef_cfg_in, psi_coef_det)
+  implicit none
+  BEGIN_DOC
+  ! Documentation for convertCSFtoDET
+  ! This function converts the wavefunction
+  ! in CFG basis to DET basis using the
+  ! transformation matrix provided before.
+  END_DOC
+  integer, intent(in)            :: N_st
+  double precision,intent(in)    :: psi_coef_cfg_in(n_CSF,N_st)
+  double precision,intent(out)   :: psi_coef_det(N_det,N_st)
+  double precision               :: tmp_psi_coef_det(maxDetDimPerBF,N_st)
+  integer                        :: s, bfIcfg
+  integer                        :: countcsf
+  integer(bit_kind)              :: Ialpha(N_int), Ibeta(N_int)
+  integer                        :: rows, cols, i, j, k
+  integer                        :: startdet, enddet
+  integer                        :: ndetI
+  integer                        :: getNSOMO
+  double precision,allocatable   :: tempBuffer(:,:)
+  double precision,allocatable   :: tempCoeff (:,:)
+  double precision               :: phasedet
+  integer                        :: idx
+
+  countcsf = 0
+
+  do i = 1,N_configuration
+    startdet = psi_configuration_to_psi_det(1,i)
+    enddet = psi_configuration_to_psi_det(2,i)
+    ndetI = enddet-startdet+1
+
+    s = 0
+    do k=1,N_int
+      if (psi_configuration(k,1,i) == 0_bit_kind) cycle
+      s = s + popcnt(psi_configuration(k,1,i))
+    enddo
+    bfIcfg = max(1,nint((binom(s,(s+1)/2)-binom(s,((s+1)/2)+1))))
+
+    allocate(tempCoeff(bfIcfg,N_st))
+
+    do k=1,N_st
+      do j = 1,bfIcfg
+        tempCoeff(j,k) = psi_coef_cfg_in(countcsf+j,k)
+      enddo
+    enddo
+
+    countcsf += bfIcfg
+    ! perhaps blocking with CFGs of same seniority
+    ! can be more efficient
+    allocate(tempBuffer(bfIcfg,ndetI))
+    tempBuffer = DetToCSFTransformationMatrix(s,:bfIcfg,:ndetI)
+
+    call dgemm('T','N', ndetI, N_st, bfIcfg, 1.d0, tempBuffer, size(tempBuffer,1),&
+        tempCoeff, size(tempCoeff,1), 0.d0, tmp_psi_coef_det,        &
+        size(tmp_psi_coef_det,1))
+
+    do j=startdet,enddet
+      idx = psi_configuration_to_psi_det_data(j)
+      Ialpha(:) = psi_det(:,1,idx)
+      Ibeta(:)  = psi_det(:,2,idx)
+      call get_phase_qp_to_cfg(Ialpha, Ibeta, phasedet)
+      do k=1,N_st
+        psi_coef_det(idx,k) = tmp_psi_coef_det(j-startdet+1,k) * phasedet
+      enddo
+    enddo
+
+    deallocate(tempCoeff)
+    deallocate(tempBuffer)
+  enddo
+
+end
+
+
+
+
+
+
+
--- a/src/csf/sigma_vector.irp.f
+++ b/src/csf/sigma_vector.irp.f
@ -0,0 +1,641 @@
+  BEGIN_PROVIDER [ integer, NSOMOMax]
+ &BEGIN_PROVIDER [ integer, NCSFMax]
+ &BEGIN_PROVIDER [ integer*8, NMO]
+ &BEGIN_PROVIDER [ integer, NBFMax]
+ &BEGIN_PROVIDER [ integer, n_CSF]
+ &BEGIN_PROVIDER [ integer, maxDetDimPerBF]
+  implicit none
+  BEGIN_DOC
+  ! Documentation for NSOMOMax
+  ! The maximum number of SOMOs for the current calculation.
+  ! required for the calculation of prototype arrays.
+  END_DOC
+  NSOMOMax = min(elec_num, cfg_nsomo_max + 2)
+  ! Note that here we need NSOMOMax + 2 sizes
+  NCSFMax = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2)-binom(NSOMOMax,((NSOMOMax+1)/2)+1)))) ! TODO: NCSFs for MS=0
+  NBFMax = NCSFMax
+  maxDetDimPerBF = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2))))
+  NMO = n_act_orb
+  integer i,j,k,l
+  integer startdet,enddet
+  integer ncfg,ncfgprev
+  integer NSOMO
+  integer dimcsfpercfg
+  integer detDimperBF
+  real*8 :: coeff
+  integer MS
+  integer ncfgpersomo
+  detDimperBF = 0
+  MS = elec_alpha_num-elec_beta_num
+  ! number of cfgs = number of dets for 0 somos
+  n_CSF = cfg_seniority_index(0)-1
+  ncfgprev = cfg_seniority_index(0)
+  do i = 0-iand(MS,1)+2, NSOMOMax,2
+     if(cfg_seniority_index(i) .EQ. -1)then
+        ncfgpersomo = N_configuration + 1
+     else
+        ncfgpersomo = cfg_seniority_index(i)
+     endif
+  ncfg = ncfgpersomo - ncfgprev
+  !detDimperBF = max(1,nint((binom(i,(i+1)/2))))
+  dimcsfpercfg = max(1,nint((binom(i-2,(i-2+1)/2)-binom(i-2,((i-2+1)/2)+1))))
+  n_CSF += ncfg * dimcsfpercfg
+  !if(cfg_seniority_index(i+2) == -1) EXIT
+  !if(detDimperBF > maxDetDimPerBF) maxDetDimPerBF = detDimperBF
+  ncfgprev = cfg_seniority_index(i)
+  enddo
+  END_PROVIDER
+
+
+subroutine get_phase_qp_to_cfg(Ialpha, Ibeta, phaseout)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Documentation for get_phase_qp_to_cfg
+  !
+  ! This function converts from (aaaa)(bbbb)
+  ! notation to (ab)(ab)(ab)(ab)
+  ! notation.
+  ! The cfgCI code works in (ab)(ab)(ab)(ab)
+  ! notation throughout.
+  END_DOC
+  integer(bit_kind),intent(in)   :: Ialpha(N_int)
+  integer(bit_kind),intent(in)   :: Ibeta(N_int)
+  real*8,intent(out)             :: phaseout
+  integer(bit_kind)              :: mask(N_int), deta(N_int), detb(N_int)
+  integer                        :: nbetas
+  integer                        :: count, k
+if (N_int >1 ) then
+   stop 'TODO: get_phase_qp_to_cfg '
+endif
+
+  nbetas = 0
+  mask = 0_bit_kind
+  count = 0
+  deta = Ialpha
+  detb = Ibeta
+  ! remove the domos
+  mask = IAND(deta,detb)
+  deta = IEOR(deta,mask)
+  detb = IEOR(detb,mask)
+  mask = 0
+  phaseout = 1.0
+  k = 1
+  do while((deta(k)).GT.0_8)
+     mask(k) = ISHFT(1_8,count)
+     if(POPCNT(IAND(deta(k),mask(k))).EQ.1)then
+        if(IAND(nbetas,1).EQ.0) then
+           phaseout *= 1.0d0
+        else
+           phaseout *= -1.0d0
+        endif
+        deta(k) = IEOR(deta(k),mask(k))
+     else
+        if(POPCNT(IAND(detb(k),mask(k))).EQ.1) then
+           nbetas += 1
+           detb(k) = IEOR(detb(k),mask(k))
+        endif
+     endif
+     count += 1
+  enddo
+end subroutine get_phase_qp_to_cfg
+
+
+
+  BEGIN_PROVIDER [ integer, AIJpqMatrixDimsList, (0:NSOMOMax,0:NSOMOMax,4,NSOMOMax,NSOMOMax,2)]
+ &BEGIN_PROVIDER [ integer, rowsmax]
+ &BEGIN_PROVIDER [ integer, colsmax]
+  use cfunctions
+  implicit none
+  BEGIN_DOC
+  ! Documentation for AIJpqMatrixList
+  ! The prototype matrix containing the <I|E_{pq}|J>
+  ! matrices for each I,J somo pair and orb ids.
+  END_DOC
+  integer i,j,k,l
+  integer*8 Isomo, Jsomo, tmpsomo
+  Isomo = 0
+  Jsomo = 0
+  integer rows, cols, nsomoi, nsomoj
+  rows = -1
+  cols = -1
+  integer*8 MS
+  MS = elec_alpha_num-elec_beta_num
+  integer nsomomin
+  nsomomin = elec_alpha_num-elec_beta_num
+  rowsmax = 0
+  colsmax = 0
+  !allocate(AIJpqMatrixDimsList(NSOMOMax,NSOMOMax,4,NSOMOMax,NSOMOMax,2))
+  ! Type
+  ! 1. SOMO -> SOMO
+  do i = 2-iand(nsomomin,1), NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i-2,i-2, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LT. 0) then
+           cycle
+        end if
+        do k = 1,i
+           do l = 1,i
+              ! Define Jsomo
+              if(k.NE.l)then
+                 Jsomo = IBCLR(Isomo, k-1)
+                 Jsomo = IBCLR(Jsomo, l-1)
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,i)-1
+                 nsomoi = i
+                 nsomoj = i
+              endif
+
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(nsomoi,nsomoj,1,k,l,1) = rows
+              AIJpqMatrixDimsList(nsomoi,nsomoj,1,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 2. DOMO -> VMO
+  do i = 0+iand(nsomomin,1), NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     tmpsomo = ISHFT(1_8,i+2)-1
+     do j = i+2,i+2, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LT. 0) then
+           cycle
+        end if
+        do k = 1,j
+           do l = 1,j
+              if(k .NE. l) then
+                 Isomo = IBCLR(tmpsomo,k-1)
+                 Isomo = IBCLR(Isomo,l-1)
+
+                 ! Define Jsomo
+                 Jsomo = ISHFT(1_8,j)-1;
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,j)-1
+                 Isomo = IBCLR(Isomo,1-1)
+                 Isomo = IBCLR(Isomo,j-1)
+                 Jsomo = ISHFT(1_8,j)-1
+                 Isomo = ISHFT(1_8,j)-1
+                 nsomoi = j
+                 nsomoj = j
+              endif
+
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(nsomoi,nsomoj,2,k,l,1) = rows
+              AIJpqMatrixDimsList(nsomoi,nsomoj,2,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 3. SOMO -> VMO
+  !print *,"Doing SOMO->VMO"
+  do i = 2-iand(nsomomin,1), NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i,i, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LE. 0) then
+           cycle
+        end if
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,l-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,k-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(i,j,3,k,l,1) = rows
+              AIJpqMatrixDimsList(i,j,3,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 4. DOMO -> VMO
+  !print *,"Doing DOMO->SOMO"
+  do i = 2-iand(nsomomin,1), NSOMOMax, 2
+     do j = i,i, 2
+        if(j .GT. NSOMOMax .OR. j .LE. 0) then
+           cycle
+        end if
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,k+1-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,l-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(i,j,4,k,l,1) = rows
+              AIJpqMatrixDimsList(i,j,4,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  END_PROVIDER
+
+  BEGIN_PROVIDER [ real*8, AIJpqContainer, (0:NSOMOMax,0:NSOMOMax,4,NSOMOMax,NSOMOMax,NBFMax,NBFMax)]
+  use cfunctions
+  implicit none
+  BEGIN_DOC
+  ! Documentation for AIJpqMatrixList
+  ! The prototype matrix containing the <I|E_{pq}|J>
+  ! matrices for each I,J somo pair and orb ids.
+  !
+  ! Due to the different types of excitations which
+  ! include DOMOs and VMOs two prototype DOMOs and two
+  ! prototype VMOs are needed. Therefore
+  ! the 4th and 5th dimensions are NSOMOMax+4 and NSOMOMax+4
+  ! respectively.
+  !
+  ! The type of excitations are ordered as follows:
+  ! Type 1 - SOMO -> SOMO
+  ! Type 2 - DOMO -> VMO
+  ! Type 3 - SOMO -> VMO
+  ! Type 4 - DOMO -> SOMO
+  END_DOC
+  integer i,j,k,l, orbp, orbq, ri, ci
+  orbp = 0
+  orbq = 0
+  integer*8 Isomo, Jsomo, tmpsomo
+  Isomo = 0
+  Jsomo = 0
+  integer rows, cols, nsomoi, nsomoj
+  rows = -1
+  cols = -1
+  integer*8 MS
+  MS = 0
+  touch AIJpqMatrixDimsList
+  real*8,dimension(:,:),allocatable :: meMatrix
+  integer maxdim
+  !maxdim = max(rowsmax,colsmax)
+  ! allocate matrix
+  !allocate(AIJpqMatrixDimsList(NSOMOMax,NSOMOMax,4,NSOMOMax,NSOMOMax,2))
+  ! Type
+  ! 1. SOMO -> SOMO
+  do i = 2, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i-2,i-2, 2
+        if(j .GT. NSOMOMax .OR. j .LT. 0) cycle
+        do k = 1,i
+           do l = 1,i
+
+              ! Define Jsomo
+              if(k .NE. l) then
+                 Jsomo = IBCLR(Isomo, k-1)
+                 Jsomo = IBCLR(Jsomo, l-1)
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,i)-1
+                 nsomoi = i
+                 nsomoj = i
+              endif
+
+              AIJpqContainer(nsomoi,nsomoj,1,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(nsomoi,nsomoj,1,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 2. DOMO -> VMO
+  do i = 0, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     tmpsomo = ISHFT(1_8,i+2)-1
+     do j = i+2,i+2, 2
+        if(j .GT. NSOMOMax .OR. j .LE. 0) cycle
+        Jsomo = ISHFT(1_8,j)-1
+        do k = 1,j
+           do l = 1,j
+              if(k .NE. l) then
+                 Isomo = IBCLR(tmpsomo,k-1)
+                 Isomo = IBCLR(Isomo,l-1)
+                 ! Define Jsomo
+                 Jsomo = ISHFT(1_8,j)-1;
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,j)-1
+                 Isomo = IBCLR(Isomo,1-1)
+                 Isomo = IBCLR(Isomo,j-1)
+                 Jsomo = ISHFT(1_8,j)-1
+                 Isomo = ISHFT(1_8,j)-1
+                 nsomoi = j
+                 nsomoj = j
+              endif
+
+              AIJpqContainer(nsomoi,nsomoj,2,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(nsomoi,nsomoj,2,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 3. SOMO -> VMO
+  do i = 2, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i,i, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LE. 0) cycle
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,l-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,k-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+
+              AIJpqContainer(i,j,3,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(i,j,3,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 4. DOMO -> SOMO
+  do i = 2, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i,i, 2
+        Jsomo = ISHFT(1_8,i)-1
+        if(j .GT. NSOMOMax .OR. j .LE. 0) cycle
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,k-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,l+1-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+
+              AIJpqContainer(i,j,4,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(i,j,4,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  END_PROVIDER
+
+
+!!!!!!
+
+  BEGIN_PROVIDER [ real*8, DetToCSFTransformationMatrix, (0:NSOMOMax,NBFMax,maxDetDimPerBF)]
+ &BEGIN_PROVIDER [ real*8, psi_coef_config,  (n_CSF)]
+ &BEGIN_PROVIDER [ integer, psi_config_data, (N_configuration,2)]
+  use cfunctions
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Documentation for DetToCSFTransformationMatrix
+  ! Provides the matrix of transformatons for the
+  ! conversion between determinant to CSF basis (in BFs)
+  END_DOC
+  integer(bit_kind) :: mask(N_int), Ialpha(N_int),Ibeta(N_int)
+  integer   :: rows, cols, i, j, k
+  integer   :: startdet, enddet
+  integer*8 MS, Isomo, Idomo
+  integer ndetI
+  integer :: getNSOMO
+  real*8,dimension(:,:),allocatable    :: tempBuffer
+  real*8,dimension(:),allocatable    :: tempCoeff
+  real*8  :: norm_det1, phasedet
+  norm_det1 = 0.d0
+  MS = elec_alpha_num - elec_beta_num
+  ! initialization
+  psi_coef_config = 0.d0
+  DetToCSFTransformationMatrix(0,:,:) = 1.d0
+  do i = 2-iand(elec_alpha_num-elec_beta_num,1), NSOMOMax,2
+    Isomo = IBSET(0_8, i) - 1_8
+    ! rows = Ncsfs
+    ! cols = Ndets
+    bfIcfg = max(1,nint((binom(i,(i+1)/2)-binom(i,((i+1)/2)+1))))
+    ndetI = max(1,nint((binom(i,(i+1)/2))))
+
+    allocate(tempBuffer(bfIcfg,ndetI))
+    call getCSFtoDETTransformationMatrix(Isomo, MS, NBFMax, maxDetDimPerBF, tempBuffer)
+    DetToCSFTransformationMatrix(i,1:bfIcfg,1:ndetI) =  tempBuffer(1:bfIcfg,1:ndetI)
+    deallocate(tempBuffer)
+  enddo
+
+  integer s, bfIcfg
+  integer countcsf
+  countcsf = 0
+  integer countdet
+  countdet = 0
+  integer idx
+  integer istate
+  istate = 1
+  phasedet = 1.0d0
+  do i = 1,N_configuration
+      startdet = psi_configuration_to_psi_det(1,i)
+      enddet = psi_configuration_to_psi_det(2,i)
+      ndetI = enddet-startdet+1
+
+      allocate(tempCoeff(ndetI))
+      countdet = 1
+      do j = startdet, enddet
+         idx = psi_configuration_to_psi_det_data(j)
+         Ialpha(:) = psi_det(:,1,idx)
+         Ibeta(:)  = psi_det(:,2,idx)
+         call get_phase_qp_to_cfg(Ialpha, Ibeta, phasedet)
+         tempCoeff(countdet) = psi_coef(idx, istate)*phasedet
+         norm_det1 += tempCoeff(countdet)*tempCoeff(countdet)
+         countdet += 1
+      enddo
+
+      s = 0
+      do k=1,N_int
+        if (psi_configuration(k,1,i) == 0_bit_kind) cycle
+        s = s + popcnt(psi_configuration(k,1,i))
+      enddo
+      bfIcfg = max(1,nint((binom(s,(s+1)/2)-binom(s,((s+1)/2)+1))))
+
+      ! perhaps blocking with CFGs of same seniority
+      ! can be more efficient
+      allocate(tempBuffer(bfIcfg,ndetI))
+      tempBuffer = DetToCSFTransformationMatrix(s,:bfIcfg,:ndetI)
+
+       call dgemm('N','N', bfIcfg, 1, ndetI, 1.d0, tempBuffer, size(tempBuffer,1), tempCoeff, size(tempCoeff,1), 0.d0, psi_coef_config(countcsf+1), size(psi_coef_config,1))
+       !call dgemv('N', NBFMax, maxDetDimPerBF, 1.d0, tempBuffer, size(tempBuffer,1), tempCoeff, 1, 0.d0, psi_coef_config(countcsf), 1)
+
+      deallocate(tempCoeff)
+      deallocate(tempBuffer)
+      psi_config_data(i,1) = countcsf + 1
+      countcsf += bfIcfg
+      psi_config_data(i,2) = countcsf
+  enddo
+
+  END_PROVIDER
--- a/src/csf/tree_utils.c
+++ b/src/csf/tree_utils.c
@ -0,0 +1,309 @@
+#include "tree_utils.h"
+
+void buildTree(Tree *bftree,
+               Node **inode,
+               int isomo,
+               int izeros,
+               int icpl,
+               int NSOMOMax,
+               int MSmax){
+
+    // Find the maximum parallel couplings 0
+    //      the maximum anti-parallel couplings 1
+    int zeromax = MSmax + (NSOMOMax-MSmax)/2;
+    int onemax = NSOMOMax - zeromax;
+
+    // Exit condition
+    if(isomo > NSOMOMax || icpl < 0 || izeros > zeromax ) return;
+
+    // If we find a valid BF assign its address
+    if(isomo == NSOMOMax){
+        (*inode)->addr = bftree->rootNode->addr;
+        bftree->rootNode->addr += 1;
+        return;
+    }
+
+    // Call 0 branch
+    if(((*inode)->C0) == NULL && izeros+1 <= zeromax){
+        ((*inode)->C0) = malloc(sizeof(Node));
+        (*(*inode)->C0) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 0, .iSOMO = isomo };
+        buildTree(bftree, &(*inode)->C0, isomo+1, izeros+1, icpl+1, NSOMOMax, MSmax);
+    }
+    else buildTree(bftree, &(*inode)->C0, isomo+1, izeros+1, icpl+1, NSOMOMax, MSmax);
+
+    // Call 1 branch
+    if(((*inode)->C1) == NULL && icpl-1 >= 0){
+        ((*inode)->C1) = malloc(sizeof(Node));
+        (*(*inode)->C1) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 1, .iSOMO = isomo };
+        buildTree(bftree, &(*inode)->C1, isomo+1, izeros+0, icpl-1, NSOMOMax, MSmax);
+    }
+    else buildTree(bftree, &(*inode)->C1, isomo+1, izeros+0, icpl-1, NSOMOMax, MSmax);
+
+    return;
+}
+
+void buildTreeDriver(Tree *bftree, int NSOMO, int MS, int *NBF){
+    int isomo = 0; // counts the total number of SOMO's
+    int izeros= 0; // Counts the total number of parallel coupings (i.e. 0's)
+    int icpl  = 0; // keep track of the ith ms (cannot be -ve)
+    int addr  = 0; // Counts the total BF's
+
+    buildTree(bftree, &(bftree->rootNode), isomo, izeros, icpl, NSOMO, MS);
+
+    *NBF = bftree->rootNode->addr;
+}
+
+
+void getIthBF(Node *inode, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF){
+    // Exit condition
+    if(foundBF) return;
+    if(isomo > NSOMOMax) return;
+    if(inode == NULL) return;
+
+    if(isomo == NSOMOMax){
+        if(inode->addr == getaddr){
+            for(int i = NSOMOMax-1; i > -1; i--){
+                vecBF[i] = inode->cpl;
+                inode = inode->PREV;
+            }
+            foundBF = true;
+            return;
+        }
+    }
+
+    // Recurse to C0
+    if(inode->C0 != NULL){
+        getIthBF(inode->C0, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+    // Recurse to C1
+    if(inode->C1 != NULL){
+        getIthBF(inode->C1, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+
+    return;
+}
+
+void getIthBFDriver(Tree *bftree, int NSOMOMax, int getaddr, int *vecBF){
+    int isomo = 0;
+    bool foundBF = false;
+    getIthBF((bftree->rootNode), isomo, foundBF, NSOMOMax, getaddr, vecBF);
+}
+
+void genDets(Tree *dettree,
+               Node **inode,
+               int isomo,
+               int izeros,
+               int icpl,
+               int NSOMOMax,
+               int MSmax){
+
+    // Find the maximum parallel couplings 0
+    //      the maximum anti-parallel couplings 1
+    int zeromax = MSmax + (NSOMOMax-MSmax)/2;
+    int onemax = NSOMOMax - zeromax;
+
+    // Exit condition
+    if(isomo > NSOMOMax || izeros > zeromax || abs(icpl) > onemax) return;
+
+    // If we find a valid BF assign its address
+    if(isomo == NSOMOMax){
+        (*inode)->addr = dettree->rootNode->addr;
+        dettree->rootNode->addr += 1;
+        return;
+    }
+
+    // Call 0 branch
+    if(((*inode)->C0) == NULL && izeros+1 <= zeromax){
+        ((*inode)->C0) = malloc(sizeof(Node));
+        (*(*inode)->C0) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 0, .iSOMO = isomo };
+        genDets(dettree, &(*inode)->C0, isomo+1, izeros+1, icpl+0, NSOMOMax, MSmax);
+    }
+    else genDets(dettree, &(*inode)->C0, isomo+1, izeros+1, icpl+0, NSOMOMax, MSmax);
+
+    // Call 1 branch
+    if(((*inode)->C1) == NULL && abs(icpl+1) <= onemax){
+        ((*inode)->C1) = malloc(sizeof(Node));
+        (*(*inode)->C1) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 1, .iSOMO = isomo };
+        genDets(dettree, &(*inode)->C1, isomo+1, izeros+0, icpl+1, NSOMOMax, MSmax);
+    }
+    else genDets(dettree, &(*inode)->C1, isomo+1, izeros+0, icpl+1, NSOMOMax, MSmax);
+
+    return;
+}
+
+void genDetsDriver(Tree *dettree, int NSOMO, int MS, int *Ndets){
+    int isomo = 0; // counts the total number of SOMO's
+    int izeros= 0; // Counts the total number of parallel coupings (i.e. 0's)
+    int icpl  = 0; // keep track of the ith ms (cannot be -ve)
+    int addr  = 0; // Counts the total BF's
+
+    genDets(dettree, &(dettree->rootNode), isomo, izeros, icpl, NSOMO, MS);
+
+    *Ndets = dettree->rootNode->addr;
+}
+
+void getIthDet(Node *inode, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF){
+    // Exit condition
+    if(foundBF) return;
+    if(isomo > NSOMOMax) return;
+    if(inode == NULL) return;
+
+    if(isomo == NSOMOMax){
+        if(inode->addr == getaddr){
+            for(int i = NSOMOMax-1; i > -1; i--){
+                vecBF[i] = inode->cpl;
+                inode = inode->PREV;
+            }
+            foundBF = true;
+            return;
+        }
+    }
+
+    // Recurse to C0
+    if(inode->C0 != NULL){
+        getIthDet(inode->C0, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+    // Recurse to C1
+    if(inode->C1 != NULL){
+        getIthDet(inode->C1, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+
+    return;
+}
+
+void getIthDetDriver(Tree *dettree, int NSOMOMax, int getaddr, int *vecBF){
+    int isomo = 0;
+    bool foundBF = false;
+    getIthDet((dettree->rootNode), isomo, foundBF, NSOMOMax, getaddr, vecBF);
+}
+
+void findAddofDet(Node *inode, int isomo, bool foundDet, int NSOMOMax, int *inpdet, int *addr){
+    // Exit condition
+    if(foundDet) return;
+    if(isomo == NSOMOMax){
+        foundDet = true;
+        *addr = inode->addr;
+        return;
+    }
+
+    // Recurse to C0
+    if(inpdet[isomo] == 0){
+        if(inode->C0 != NULL){
+            findAddofDet(inode->C0, isomo+1, foundDet, NSOMOMax, inpdet, addr);
+        }
+        else{
+            *addr = -1;
+            return;
+        }
+    }
+    else{
+        // Recurse to C1
+        if(inode->C1 != NULL){
+            findAddofDet(inode->C1, isomo+1, foundDet, NSOMOMax, inpdet, addr);
+        }
+        else{
+            *addr = -1;
+            return;
+        }
+    }
+
+    return;
+}
+
+void findAddofDetDriver(Tree *dettree, int NSOMOMax, int *inpdet, int *addr){
+    *addr = -1;
+    int isomo = 0;
+    bool foundDet = false;
+    findAddofDet((dettree->rootNode), isomo, foundDet, NSOMOMax, inpdet, addr);
+}
+
+void getDetlist(Node *inode, int isomo, int NSOMOMax, int *vecBF, int *detlist){
+    // Exit condition
+    if(isomo > NSOMOMax) return;
+    if(inode == NULL) return;
+
+    if(isomo == NSOMOMax){
+        int idet=0;
+        for(int k=0;k<NSOMOMax;k++){
+            if(vecBF[k] == 1) idet = idet | (1<<(NSOMOMax-1-k));
+        }
+        detlist[inode->addr]=idet;
+        return;
+    }
+
+
+    // Recurse to C0
+    if(inode->C0 != NULL){
+        vecBF[isomo] = 0;
+        getDetlist(inode->C0, isomo+1, NSOMOMax, vecBF, detlist);
+    }
+    // Recurse to C1
+    if(inode->C1 != NULL){
+        vecBF[isomo] = 1;
+        getDetlist(inode->C1, isomo+1, NSOMOMax, vecBF, detlist);
+    }
+
+    return;
+}
+
+void getDetlistDriver(Tree *dettree, int NSOMOMax, int *detlist){
+    int isomo = 0;
+    int vecBF[NSOMOMax];
+    if(dettree->rootNode->addr > 1) getDetlist((dettree->rootNode), isomo, NSOMOMax, vecBF, detlist);
+}
+
+void genDetBasis(Tree *dettree, int Isomo, int MS, int *ndets){
+
+    int NSOMO=0;
+    getSetBits(Isomo, &NSOMO);
+    genDetsDriver(dettree, NSOMO, MS, ndets);
+
+}
+
+void callBlasMatxMat(double *A, int rowA, int colA, double *B, int rowB, int colB, double *C, bool transA, bool transB){
+    int m = rowA;
+    int k = colA;
+    int n = colB;
+    double alpha = 1.0;
+    double beta  = 0.0;
+    int val = 0;
+    if (transA) val |= 0x1;
+    if (transB) val |= 0x2;
+
+    switch (val) {
+        case 0: // notransA, notransB
+            m = rowA;
+            n = colB;
+            k = colA;
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                        m, n, k, alpha, A, k, B, n, beta, C, n);
+            break;
+        case 1: // transA, notransB
+            m = colA;
+            n = colB;
+            k = rowA;
+            cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                        m, n, k, alpha, A, colA, B, n, beta, C, n);
+            break;
+        case 2: // notransA, transB
+            //m = rowA;
+            //n = rowB;
+            //k = colB;
+            m = rowA;
+            n = rowB;
+            k = colA;
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        m, n, k, alpha, A, k, B, colB, beta, C, n);
+            break;
+        case 3: // transA, transB
+            m = colA;
+            n = rowB;
+            k = rowA;
+            cblas_dgemm(CblasRowMajor, CblasTrans, CblasTrans,
+                        m, n, k, alpha, A, colA, B, colB, beta, C, n);
+            break;
+        default:
+            printf("Impossible !!!!\n");
+            break;
+    }
+}
--- a/src/csf/tree_utils.h
+++ b/src/csf/tree_utils.h
@ -0,0 +1,87 @@
+#ifndef TREE_UTILS_H
+#define TREE_UTILS_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <math.h>
+
+typedef struct bin_node Node;
+typedef struct bin_tree Tree;
+struct bin_node {
+    Node *C0;
+    Node *C1;
+    Node *PREV;
+    int addr;
+    int cpl;
+    int iSOMO;
+};
+
+struct bin_tree {
+    Node *rootNode;
+    int NBF;
+};
+
+#include "/usr/include/x86_64-linux-gnu/cblas.h"
+
+#define MAX_SOMO 32
+
+void buildTreeDriver(Tree *bftree, int NSOMO, int MS, int *NBF);
+
+void buildTree(Tree *bftree, Node **inode, int isomo, int izeros, int icpl, int NSOMOMax, int MSmax);
+
+void printTreeDriver(Tree *bftree, int NSOMOMax);
+void printTree(Node *bftree, int isomo, int NSOMOMax, int *vecBF);
+
+void getIthBF(Node *node, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF);
+void getIthBFDriver(Tree *bftree, int NSOMOMax, int getaddr, int *vecBF);
+
+void getBFIndexList(int NSOMO, int *BF1, int *IdxListBF1);
+void getIslands(int NSOMO, int *BF1, int *BF2, int *nislands, int *phasefactor);
+
+void generateAllBFs(int64_t Isomo, int64_t MS, Tree *bftree, int *NBF, int *NSOMO);
+void getSetBits(int64_t n, int *nsetbits);
+void getOverlapMatrix(int64_t Isomo, int64_t MS, double **overlapMatrixptr, int *rows, int *cols, int *NSOMOout);
+void getOverlapMatrix_withDet(double *bftodetmatrixI, int rowsbftodetI, int colsbftodetI, int64_t Isomo, int64_t MS, double **overlapMatrixI, int *rowsI, int *colsI, int *NSOMO);
+void gramSchmidt(double *overlapMatrix, int rows, int cols, double *orthoMatrix);
+
+
+void calculateMETypeSOMOSOMO(int *BF1, int *BF2, int moi, int moj, double *factor, int *phasefactor);
+void getOneElMETypeSOMOSOMO(int64_t Isomo, int64_t Jsomos, int moi, int moj, int MS, double **oneElMatrixElementsptr, int *rows, int *cols);
+
+/***********************
+
+Determinant Tree utils
+***********************/
+
+
+void genDets(Tree *dettree,
+               Node **inode,
+               int isomo,
+               int izeros,
+               int icpl,
+               int NSOMOMax,
+               int MSmax);
+void genDetsDriver(Tree *dettree, int NSOMO, int MS, int *Ndets);
+
+void getIthDet(Node *inode, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF);
+void getIthDetDriver(Tree *dettree, int NSOMOMax, int getaddr, int *vecBF);
+void getDetlistDriver(Tree *dettree, int NSOMOMax, int *detlist);
+void findAddofDet(Node *inode, int isomo, bool foundDet, int NSOMOMax, int *inpdet, int *addr);
+void findAddofDetDriver(Tree *dettree, int NSOMOMax, int *inpdet, int *addr);
+
+
+/************************/
+
+void genDetBasis(Tree *dettree, int Isomo, int MS, int *ndets);
+void getbftodetfunction(Tree *dettree, int NSOMO, int MS, int *BF1, double *rowvec);
+void convertBFtoDetBasis(int64_t Isomo, int MS, double **bftodetmatrixptr, int *rows, int *cols);
+
+// Misc utils
+void int_to_bin_digit(int64_t in, int count, int* out);
+void printRealMatrix(double *orthoMatrix, int rows, int cols);
+void callBlasMatxMat(double *A, int rowA, int colA, double *B, int rowB, int colB, double *C, bool transA, bool transB);
+void get_phase_cfg_to_qp_inpList(int *inpdet, int NSOMO, int *phaseout);
+void get_phase_cfg_to_qp_inpInt(int inpdet, double *phaseout);
+
+#endif
--- a/src/davidson/davidson_parallel_csf.irp.f
+++ b/src/davidson/davidson_parallel_csf.irp.f
@ -0,0 +1,495 @@
+use bitmasks
+use f77_zmq
+
+
+subroutine davidson_csf_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  call davidson_csf_run_slave(1,i)
+end
+
+
+subroutine davidson_csf_slave_tcp(i)
+  implicit none
+  integer, intent(in)            :: i
+  call davidson_csf_run_slave(0,i)
+end
+
+
+
+subroutine davidson_csf_run_slave(thread,iproc)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Slave routine for Davidson's diagonalization.
+  END_DOC
+
+  integer,  intent(in)           :: thread, iproc
+
+  integer                        :: worker_id, task_id, blockb
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  integer, external              :: connect_to_taskserver
+  integer                        :: doexit, send, receive
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  doexit = 0
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    doexit=1
+  endif
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    send = doexit
+    call MPI_AllReduce(send, receive, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      doexit=1
+    endif
+    doexit = receive
+  IRP_ENDIF
+  if (doexit>0) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  call davidson_csf_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_states_diag, N_det, worker_id)
+
+  integer, external :: disconnect_from_taskserver
+  if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+    call sleep(1)
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+      print *,  irp_here, ': disconnect failed'
+      continue
+    endif
+  endif
+
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  call end_zmq_push_socket(zmq_socket_push)
+
+end subroutine
+
+
+
+subroutine davidson_csf_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, worker_id)
+  use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR),intent(in)   :: zmq_to_qp_run_socket
+  integer(ZMQ_PTR),intent(in)   :: zmq_socket_push
+  integer,intent(in)             :: worker_id, N_st, sze
+  integer                        :: task_id
+  character*(512)                :: msg
+  integer                        :: imin, imax, ishift, istep
+
+  integer, allocatable           :: psi_det_read(:,:,:)
+  double precision, allocatable  :: v_t(:,:), u_t(:,:)
+
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t
+
+  ! Get wave function (u_t)
+  ! -----------------------
+
+  integer                        :: rc, ni, nj
+  integer*8                      :: rc8
+  integer                        :: N_states_read, N_det_read, psi_det_size_read
+  integer                        :: N_det_selectors_read, N_det_generators_read
+
+  integer, external :: zmq_get_dvector
+  integer, external :: zmq_get_dmatrix
+
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  allocate(u_t(N_st,N_det))
+
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (size(u_t,kind=8) < 8388608_8) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = int(size(u_t,kind=8)/8388608_8,4) + 1
+  endif
+
+  do while (zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1)
+    print *,  'mpi_rank, N_states_diag, N_det'
+    print *,  mpi_rank, N_states_diag, N_det
+    stop 'u_t'
+  enddo
+
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+
+    call broadcast_chunks_double(u_t,size(u_t,kind=8))
+
+  IRP_ENDIF
+
+  ! Run tasks
+  ! ---------
+
+  logical :: sending
+  sending=.False.
+
+  allocate(v_t(N_st,N_det))
+  do
+    integer, external :: get_task_from_taskserver
+    integer, external :: task_done_to_taskserver
+    if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
+      exit
+    endif
+    if(task_id == 0) exit
+    read (msg,*) imin, imax, ishift, istep
+    integer :: k
+    do k=imin,imax
+      v_t(:,k) = 0.d0
+    enddo
+    call H_u_0_nstates_openmp_work(v_t,u_t,N_st,N_det,imin,imax,ishift,istep)
+    if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
+        print *,  irp_here, 'Unable to send task_done'
+    endif
+    call davidson_push_results_async_recv(zmq_socket_push, sending)
+    call davidson_csf_push_results_async_send(zmq_socket_push, v_t, imin, imax, task_id, sending)
+  end do
+  deallocate(u_t,v_t)
+  call davidson_push_results_async_recv(zmq_socket_push, sending)
+
+end subroutine
+
+
+
+subroutine davidson_csf_push_results(zmq_socket_push, v_t, imin, imax, task_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  double precision    ,intent(in)    :: v_t(N_states_diag,N_det)
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imax'
+
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_csf_push_results failed to push vt'
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if ((rc /= 2).and.(ok(1:2)/='ok')) then
+    print *, irp_here, ': f77_zmq_recv( zmq_socket_push, ok, 2, 0)'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+subroutine davidson_csf_push_results_async_send(zmq_socket_push, v_t, imin, imax, task_id,sending)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  double precision    ,intent(in)    :: v_t(N_states_diag,N_det)
+  logical             ,intent(inout) :: sending
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  if (sending) then
+    print *,  irp_here, ': sending=true'
+    stop -1
+  endif
+  sending = .True.
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imax'
+
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_csf_push_results failed to push vt'
+
+
+end subroutine
+
+
+
+
+subroutine davidson_csf_pull_results(zmq_socket_pull, v_t, imin, imax, task_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Pull the results of $H | U \rangle$ on the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)     :: zmq_socket_pull
+  integer             ,intent(out)    :: task_id, imin, imax
+  double precision    ,intent(out)    :: v_t(N_states_diag,N_det)
+
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
+  if(rc /= 4) stop 'davidson_csf_pull_results failed to pull task_id'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
+  if(rc /= 4) stop 'davidson_csf_pull_results failed to pull imin'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
+  if(rc /= 4) stop 'davidson_csf_pull_results failed to pull imax'
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc8 = f77_zmq_recv8( zmq_socket_pull, v_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8*sz) stop 'davidson_csf_pull_results failed to pull v_t'
+
+! Activate if zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
+  if (rc /= 2) then
+    print *,  irp_here, ' : f77_zmq_send (zmq_socket_pull,...'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+
+
+
+subroutine davidson_csf_collector(zmq_to_qp_run_socket, zmq_socket_pull, v0, sze, N_st)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Routine collecting the results of the workers in Davidson's algorithm.
+  END_DOC
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  integer, intent(in)            :: sze, N_st
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+
+  double precision    ,intent(inout) :: v0(sze, N_st)
+
+  integer                          :: more, task_id, imin, imax
+
+  double precision, allocatable :: v_t(:,:)
+  logical :: sending
+  integer :: i,j
+  integer, external :: zmq_delete_task_async_send
+  integer, external :: zmq_delete_task_async_recv
+
+  allocate(v_t(N_st,N_det))
+  v0 = 0.d0
+  more = 1
+  sending = .False.
+  do while (more == 1)
+    call davidson_csf_pull_results(zmq_socket_pull, v_t, imin, imax, task_id)
+    if (zmq_delete_task_async_send(zmq_to_qp_run_socket,task_id,sending) == -1) then
+      stop 'davidson: Unable to delete task (send)'
+    endif
+    do j=1,N_st
+      do i=imin,imax
+        v0(i,j) = v0(i,j) + v_t(j,i)
+      enddo
+    enddo
+    if (zmq_delete_task_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
+      stop 'davidson: Unable to delete task (recv)'
+    endif
+  end do
+  deallocate(v_t)
+
+end subroutine
+
+
+
+
+
+subroutine H_u_0_nstates_zmq(v_0,u_0,N_st,sze)
+  use omp_lib
+  use bitmasks
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_0 = H | u_0\rangle$
+  !
+  ! n : number of determinants
+  !
+  ! H_jj : array of $\langle j | H | j \rangle$
+  END_DOC
+  integer, intent(in)            :: N_st, sze
+  double precision, intent(out)  :: v_0(sze,N_st)
+  double precision, intent(inout):: u_0(sze,N_st)
+  integer                        :: i,j,k
+  integer                        :: ithread
+  double precision, allocatable  :: u_t(:,:)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson')
+
+!  integer :: N_states_diag_save
+!  N_states_diag_save = N_states_diag
+!  N_states_diag = N_st
+  if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
+    stop 'Unable to put N_states_diag on ZMQ server'
+  endif
+
+  if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+    stop 'Unable to put psi on ZMQ server'
+  endif
+  energy = 0.d0
+  if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then
+    stop 'Unable to put energy on ZMQ server'
+  endif
+
+
+  ! Create tasks
+  ! ============
+
+  integer :: istep, imin, imax, ishift, ipos
+  integer, external :: add_task_to_taskserver
+  integer, parameter :: tasksize=20000
+  character*(100000) :: task
+  istep=1
+  ishift=0
+  imin=1
+
+
+  ipos=1
+  do imin=1,N_det,tasksize
+    imax = min(N_det,imin-1+tasksize)
+    if (imin==1) then
+      istep = 2
+    else
+      istep = 1
+    endif
+    do ishift=0,istep-1
+      write(task(ipos:ipos+50),'(4(I11,1X),1X,1A)') imin, imax, ishift, istep, '|'
+      ipos = ipos+50
+      if (ipos > 100000-50) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task'
+        endif
+        ipos=1
+      endif
+    enddo
+  enddo
+
+  if (ipos > 1) then
+    if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+      stop 'Unable to add task'
+    endif
+    ipos=1
+  endif
+
+  allocate(u_t(N_st,N_det))
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+
+  call dtranspose(                                                   &
+      u_0,                                                           &
+      size(u_0, 1),                                                  &
+      u_t,                                                           &
+      size(u_t, 1),                                                  &
+      N_det, N_st)
+
+
+  ASSERT (N_st == N_states_diag)
+  ASSERT (sze >= N_det)
+
+  integer :: rc, ni, nj
+  integer*8 :: rc8
+  double precision :: energy(N_st)
+
+  integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
+  integer, external :: zmq_put_dmatrix
+
+  if (size(u_t) < 8388608) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = size(u_t)/8388608 + 1
+  endif
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (zmq_put_dmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
+    stop 'Unable to put u_t on ZMQ server'
+  endif
+
+  deallocate(u_t)
+
+  integer, external :: zmq_set_running
+  if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+    print *,  irp_here, ': Failed in zmq_set_running'
+  endif
+
+  call omp_set_max_active_levels(4)
+  !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(2) PRIVATE(ithread)
+  ithread = omp_get_thread_num()
+  if (ithread == 0 ) then
+    call davidson_csf_collector(zmq_to_qp_run_socket, zmq_socket_pull, v_0, N_det, N_st)
+  else
+    call davidson_csf_slave_inproc(1)
+  endif
+  !$OMP END PARALLEL
+  call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'davidson')
+
+  !$OMP PARALLEL
+  !$OMP SINGLE
+  do k=1,N_st
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+  enddo
+  !$OMP END SINGLE
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+!  N_states_diag = N_states_diag_save
+!  SOFT_TOUCH N_states_diag
+end
+
--- a/src/davidson/diagonalization_hcsf_dressed.irp.f
+++ b/src/davidson/diagonalization_hcsf_dressed.irp.f
@ -0,0 +1,582 @@
+subroutine davidson_diag_h_csf(dets_in,u_in,dim_in,energies,sze,sze_csf,N_st,N_st_diag,Nint,dressing_state,converged)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization.
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, sze_csf, N_st, N_st_diag, Nint
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag)
+  double precision, intent(out)  :: energies(N_st_diag)
+  integer, intent(in)            :: dressing_state
+  logical, intent(out)           :: converged
+  double precision, allocatable  :: H_jj(:)
+
+  double precision, external     :: diag_H_mat_elem, diag_S_mat_elem
+  integer                        :: i,k
+  ASSERT (N_st > 0)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+  PROVIDE mo_two_e_integrals_in_map
+  allocate(H_jj(sze))
+
+  H_jj(1) = diag_h_mat_elem(dets_in(1,1,1),Nint)
+  !$OMP PARALLEL DEFAULT(NONE)                                       &
+      !$OMP  SHARED(sze,H_jj, dets_in,Nint)                    &
+      !$OMP  PRIVATE(i)
+  !$OMP DO SCHEDULE(static)
+  do i=2,sze
+    H_jj(i)  = diag_H_mat_elem(dets_in(1,1,i),Nint)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  if (dressing_state > 0) then
+    do k=1,N_st
+      do i=1,sze
+        H_jj(i)  += u_in(i,k) * dressing_column_h(i,k)
+      enddo
+    enddo
+  endif
+
+  call davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N_st,N_st_diag,Nint,dressing_state,converged)
+  deallocate(H_jj)
+end
+
+
+subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N_st,N_st_diag_in,Nint,dressing_state,converged)
+  use bitmasks
+  use mmap_module
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization with specific diagonal elements of the H matrix
+  !
+  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! N_st_diag_in : Number of states in which H is diagonalized. Assumed > sze
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, sze_csf, N_st, N_st_diag_in, Nint
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision,  intent(in)  :: H_jj(sze)
+  integer, intent(in)            :: dressing_state
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag_in)
+  double precision, intent(out)  :: energies(N_st_diag_in)
+
+  integer                        :: iter, N_st_diag
+  integer                        :: i,j,k,l,m
+  logical, intent(inout)         :: converged
+
+  double precision, external     :: u_dot_v, u_dot_u
+
+  integer                        :: k_pairs, kl
+
+  integer                        :: iter2, itertot
+  double precision, allocatable  :: y(:,:), h(:,:), lambda(:)
+  double precision, allocatable  :: s_tmp(:,:)
+  double precision               :: diag_h_mat_elem
+  double precision, allocatable  :: residual_norm(:)
+  character*(16384)              :: write_buffer
+  double precision               :: to_print(2,N_st)
+  double precision               :: cpu, wall
+  integer                        :: shift, shift2, itermax, istate
+  double precision               :: r1, r2, alpha
+  logical                        :: state_ok(N_st_diag_in*davidson_sze_max)
+  integer                        :: nproc_target
+  integer                        :: order(N_st_diag_in)
+  double precision               :: cmax
+  double precision, allocatable  :: U(:,:), U_csf(:,:), overlap(:,:)
+  double precision, pointer      :: W(:,:), W_csf(:,:)
+  logical                        :: disk_based
+  double precision               :: energy_shift(N_st_diag_in*davidson_sze_max)
+
+  include 'constants.include.F'
+
+  N_st_diag = N_st_diag_in
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, S, y, S_d, h, lambda
+  if (N_st_diag*3 > sze) then
+    print *,  'error in Davidson :'
+    print *,  'Increase n_det_max_full to ', N_st_diag*3
+    stop -1
+  endif
+
+  itermax = max(2,min(davidson_sze_max, sze/N_st_diag))+1
+  itertot = 0
+
+  if (state_following) then
+    allocate(overlap(N_st_diag*itermax, N_st_diag*itermax))
+  else
+    allocate(overlap(1,1))  ! avoid 'if' for deallocate
+  endif
+  overlap = 0.d0
+
+  PROVIDE nuclear_repulsion expected_s2 psi_bilinear_matrix_order psi_bilinear_matrix_order_reverse threshold_davidson_pt2 threshold_davidson_from_pt2
+
+  call write_time(6)
+  write(6,'(A)') ''
+  write(6,'(A)') 'Davidson Diagonalization'
+  write(6,'(A)') '------------------------'
+  write(6,'(A)') ''
+
+  ! Find max number of cores to fit in memory
+  ! -----------------------------------------
+
+  nproc_target = nproc
+  double precision :: rss
+  integer :: maxab
+  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
+
+  m=1
+  disk_based = .False.
+  call resident_memory(rss)
+  do
+    r1 = 8.d0 *                                   &! bytes
+         ( dble(sze)*(N_st_diag*itermax)          &! U
+         + 1.0d0*dble(sze*m)*(N_st_diag*itermax)  &! W
+         + 3.0d0*(N_st_diag*itermax)**2           &! h,y,s_tmp
+         + 1.d0*(N_st_diag*itermax)               &! lambda
+         + 1.d0*(N_st_diag)                       &! residual_norm
+                                                   ! In H_u_0_nstates_zmq
+         + 2.d0*(N_st_diag*N_det)                 &! u_t, v_t, on collector
+         + 2.d0*(N_st_diag*N_det)                 &! u_t, v_t, on slave
+         + 0.5d0*maxab                            &! idx0 in H_u_0_nstates_openmp_work_*
+         + nproc_target *                         &! In OMP section
+           ( 1.d0*(N_int*maxab)                   &! buffer
+           + 3.5d0*(maxab) )                      &! singles_a, singles_b, doubles, idx
+         ) / 1024.d0**3
+
+    if (nproc_target == 0) then
+      call check_mem(r1,irp_here)
+      nproc_target = 1
+      exit
+    endif
+
+    if (r1+rss < qp_max_mem) then
+      exit
+    endif
+
+    if (itermax > 4) then
+      itermax = itermax - 1
+    else if (m==1.and.disk_based_davidson) then
+      m=0
+      disk_based = .True.
+      itermax = 6
+    else
+      nproc_target = nproc_target - 1
+    endif
+
+  enddo
+  nthreads_davidson = nproc_target
+  TOUCH nthreads_davidson
+  call write_int(6,N_st,'Number of states')
+  call write_int(6,N_st_diag,'Number of states in diagonalization')
+  call write_int(6,sze,'Number of determinants')
+  call write_int(6,nproc_target,'Number of threads for diagonalization')
+  call write_double(6, r1, 'Memory(Gb)')
+  if (disk_based) then
+    print *, 'Using swap space to reduce RAM'
+  endif
+
+  !---------------
+
+  write(6,'(A)') ''
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = 'Iter'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//'       Energy        Residual '
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+
+
+  if (disk_based) then
+    ! Create memory-mapped files for W and S
+    type(c_ptr) :: ptr_w, ptr_s
+    integer :: fd_s, fd_w
+    call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
+        8, fd_w, .False., ptr_w)
+    call c_f_pointer(ptr_w, W_csf, (/sze_csf,N_st_diag*itermax/))
+  else
+    allocate(W(sze,N_st_diag),W_csf(sze_csf,N_st_diag*itermax),)
+  endif
+
+  allocate(                                                          &
+      ! Large
+      U(sze,N_st_diag),                                              &
+      U_csf(sze_csf,N_st_diag*itermax),                              &
+
+      ! Small
+      h(N_st_diag*itermax,N_st_diag*itermax),                        &
+      y(N_st_diag*itermax,N_st_diag*itermax),                        &
+      s_tmp(N_st_diag*itermax,N_st_diag*itermax),                    &
+      residual_norm(N_st_diag),                                      &
+      lambda(N_st_diag*itermax))
+
+  h = 0.d0
+  U = 0.d0
+  y = 0.d0
+  s_tmp = 0.d0
+
+
+  ASSERT (N_st > 0)
+  ASSERT (N_st_diag >= N_st)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+
+  ! Davidson iterations
+  ! ===================
+
+  converged = .False.
+
+  do k=N_st+1,N_st_diag
+    do i=1,sze
+        call random_number(r1)
+        call random_number(r2)
+        r1 = dsqrt(-2.d0*dlog(r1))
+        r2 = dtwo_pi*r2
+        u_in(i,k) = r1*dcos(r2) * u_in(i,k-N_st)
+    enddo
+    u_in(k,k) = u_in(k,k) + 10.d0
+  enddo
+  do k=1,N_st_diag
+    call normalize(u_in(1,k),sze)
+  enddo
+
+  do k=1,N_st_diag
+    do i=1,sze
+      U(i,k) = u_in(i,k)
+    enddo
+  enddo
+
+  ! Make random verctors eigenstates of S2
+  call convertWFfromDETtoCSF(N_st_diag,U,U_csf)
+  call convertWFfromCSFtoDET(N_st_diag,U_csf,U)
+
+  do while (.not.converged)
+    itertot = itertot+1
+    if (itertot == 8) then
+      exit
+    endif
+
+    do iter=1,itermax-1
+
+      shift  = N_st_diag*(iter-1)
+      shift2 = N_st_diag*iter
+
+      if ((iter > 1).or.(itertot == 1)) then
+        ! Compute |W_k> = \sum_i |i><i|H|u_k>
+        ! -----------------------------------
+
+        if (disk_based) then
+          call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,shift2)
+          call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,shift2)
+        else
+          call ortho_qr(U_csf,size(U_csf,1),sze_csf,shift2)
+          call ortho_qr(U_csf,size(U_csf,1),sze_csf,shift2)
+        endif
+
+        call convertWFfromCSFtoDET(N_st_diag,U_csf(1,shift+1),U)
+        if ((sze > 100000).and.distributed_davidson) then
+            call H_u_0_nstates_zmq   (W,U,N_st_diag,sze)
+        else
+            call H_u_0_nstates_openmp(W,U,N_st_diag,sze)
+        endif
+      else
+         ! Already computed in update below
+         continue
+      endif
+
+      if (dressing_state > 0) then
+
+        if (N_st == 1) then
+
+          l = dressed_column_idx(1)
+          double precision :: f
+          f = 1.0d0/psi_coef(l,1)
+          do istate=1,N_st_diag
+            do i=1,sze
+              W(i,istate) += dressing_column_h(i,1) *f * U(l,istate)
+              W(l,istate) += dressing_column_h(i,1) *f * U(i,istate)
+            enddo
+
+          enddo
+
+        else
+
+          call dgemm('T','N', N_st, N_st_diag, sze, 1.d0, &
+            psi_coef, size(psi_coef,1), &
+            U(1,1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+
+          call dgemm('N','N', sze, N_st_diag, N_st, 1.0d0, &
+            dressing_column_h, size(dressing_column_h,1), s_tmp, size(s_tmp,1), &
+            1.d0, W(1,1), size(W,1))
+
+
+          call dgemm('T','N', N_st, N_st_diag, sze, 1.d0, &
+            dressing_column_h, size(dressing_column_h,1), &
+            U(1,1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+
+          call dgemm('N','N', sze, N_st_diag, N_st, 1.0d0, &
+            psi_coef, size(psi_coef,1), s_tmp, size(s_tmp,1), &
+            1.d0, W(1,1), size(W,1))
+
+        endif
+      endif
+
+      call convertWFfromDETtoCSF(N_st_diag,W,W_csf(1,shift+1))
+
+      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
+      ! -------------------------------------------
+
+      call dgemm('T','N', shift2, shift2, sze_csf,                       &
+          1.d0, U_csf, size(U_csf,1), W_csf, size(W_csf,1),                          &
+          0.d0, h, size(h,1))
+
+      ! Diagonalize h
+      ! ---------------
+
+      call lapack_diag(lambda,y,h,size(h,1),shift2)
+
+      ! Compute Energy for each eigenvector
+      ! -----------------------------------
+
+      call dgemm('N','N',shift2,shift2,shift2,                       &
+          1.d0, h, size(h,1), y, size(y,1),                          &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      call dgemm('T','N',shift2,shift2,shift2,                       &
+          1.d0, y, size(y,1), s_tmp, size(s_tmp,1),                  &
+          0.d0, h, size(h,1))
+
+      do k=1,shift2
+        lambda(k) = h(k,k)
+      enddo
+
+      if (state_following) then
+
+        overlap = -1.d0
+        do k=1,shift2
+          do i=1,shift2
+            overlap(k,i) = dabs(y(k,i))
+          enddo
+        enddo
+        do k=1,N_st
+          cmax = -1.d0
+          do i=1,N_st
+            if (overlap(i,k) > cmax) then
+              cmax = overlap(i,k)
+              order(k) = i
+            endif
+          enddo
+          do i=1,N_st_diag
+            overlap(order(k),i) = -1.d0
+          enddo
+        enddo
+        overlap = y
+        do k=1,N_st
+          l = order(k)
+          if (k /= l) then
+            y(1:shift2,k) = overlap(1:shift2,l)
+          endif
+        enddo
+        do k=1,N_st
+          overlap(k,1) = lambda(k)
+        enddo
+
+      endif
+
+
+      ! Express eigenvectors of h in the csf basis
+      ! ------------------------------------------
+
+      call dgemm('N','N', sze_csf, N_st_diag, shift2,                    &
+          1.d0, U_csf, size(U_csf,1), y, size(y,1), 0.d0, U_csf(1,shift2+1), size(U_csf,1))
+      call dgemm('N','N', sze_csf, N_st_diag, shift2,                    &
+          1.d0, W_csf, size(W_csf,1), y, size(y,1), 0.d0, W_csf(1,shift2+1), size(W_csf,1))
+
+      ! Compute residual vector and davidson step
+      ! -----------------------------------------
+
+      call convertWFfromCSFtoDET(N_st_diag,U_csf(1,shift2+1),U)
+      call convertWFfromCSFtoDET(N_st_diag,W_csf(1,shift2+1),W)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k)
+      do k=1,N_st_diag
+        do i=1,sze
+!           U_csf(i,shift2+k) =  &
+!            (lambda(k) * U_csf(i,shift2+k) - W_csf(i,shift2+k) )      &
+!              /max(H_jj_csf(i) - lambda (k),1.d-2)
+           U(i,k) =  (lambda(k) * U(i,k) - W(i,k) )      &
+              /max(H_jj(i) - lambda (k),1.d-2)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      do k=1,N_st
+        residual_norm(k) = u_dot_u(U(1,k),sze)
+        to_print(1,k) = lambda(k) + nuclear_repulsion
+        to_print(2,k) = residual_norm(k)
+      enddo
+      call convertWFfromDETtoCSF(N_st_diag,U,U_csf(1,shift2+1))
+
+      if ((itertot>1).and.(iter == 1)) then
+        !don't print
+        continue
+      else
+        write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st)
+      endif
+
+      ! Check convergence
+      if (iter > 1) then
+        if (threshold_davidson_from_pt2) then
+          converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson_pt2
+        else
+          converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson
+        endif
+      endif
+
+      do k=1,N_st
+        if (residual_norm(k) > 1.d8) then
+          print *, 'Davidson failed'
+          stop -1
+        endif
+      enddo
+      if (converged) then
+        exit
+      endif
+
+      logical, external :: qp_stop
+      if (qp_stop()) then
+        converged = .True.
+        exit
+      endif
+
+
+    enddo
+
+    ! Re-contract U and update W
+    ! --------------------------------
+
+    call dgemm('N','N', sze_csf, N_st_diag, shift2, 1.d0,      &
+        W_csf, size(W_csf,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+    do k=1,N_st_diag
+      do i=1,sze_csf
+        W_csf(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    call dgemm('N','N', sze_csf, N_st_diag, shift2, 1.d0,      &
+        U_csf, size(U_csf,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+
+    do k=1,N_st_diag
+      do i=1,sze_csf
+        U_csf(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    if (disk_based) then
+      call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+      call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+    else
+      call ortho_qr(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+      call ortho_qr(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+    endif
+
+    call convertWFfromCSFtoDET(N_st_diag,U_csf,U)
+
+    ! Adjust the phase
+    do j=1,N_st_diag
+      ! Find first non-zero
+      k=1
+      do while ((k<sze).and.(U(k,j) == 0.d0))
+        k = k+1
+      enddo
+      ! Check sign
+      if (U(k,j) * u_in(k,j) < 0.d0) then
+        do i=1,sze
+          W(i,j) = -W(i,j)
+        enddo
+      endif
+    enddo
+
+  enddo
+
+
+  call nullify_small_elements(sze,N_st_diag,U,size(U,1),threshold_davidson_pt2)
+  do k=1,N_st_diag
+    do i=1,sze
+      u_in(i,k) = U(i,k)
+    enddo
+  enddo
+
+  do k=1,N_st_diag
+    energies(k) = lambda(k)
+  enddo
+  write_buffer = '======'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') trim(write_buffer)
+  write(6,'(A)') ''
+  call write_time(6)
+
+  if (disk_based)then
+    ! Remove temp files
+    integer, external :: getUnitAndOpen
+    call munmap( (/int(sze,8),int(N_st_diag*itermax,8)/), 8, fd_w, ptr_w )
+    fd_w = getUnitAndOpen(trim(ezfio_work_dir)//'davidson_w','r')
+    close(fd_w,status='delete')
+  else
+    deallocate(W, W_csf)
+  endif
+
+  deallocate (                                                       &
+      residual_norm,                                                 &
+      U, U_csf, overlap,                                             &
+      h, y, s_tmp,                                                   &
+      lambda                                                         &
+      )
+  FREE nthreads_davidson
+end
+
+
+
+
+
+
+
--- a/src/davidson/diagonalize_ci.irp.f
+++ b/src/davidson/diagonalize_ci.irp.f
@ -58,9 +58,9 @@ END_PROVIDER
   if (diag_algorithm == "Davidson") then

     if (s2_eig.and.only_expected_s2) then
-       call davidson_diag_H(psi_det,CI_eigenvectors, &
+       call davidson_diag_H_csf(psi_det,CI_eigenvectors, &
         size(CI_eigenvectors,1),CI_electronic_energy,               &
-         N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+         N_det,N_csf,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
     else
       call davidson_diag_HS2(psi_det,CI_eigenvectors, CI_s2, &
         size(CI_eigenvectors,1),CI_electronic_energy,               &
@ -85,9 +85,9 @@ END_PROVIDER
          CI_electronic_energy_tmp(1:N_states_diag_save) = CI_electronic_energy(1:N_states_diag_save)
          CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save) = CI_eigenvectors(1:N_det,1:N_states_diag_save)

-          call davidson_diag_H(psi_det,CI_eigenvectors_tmp, &
+          call davidson_diag_H_csf(psi_det,CI_eigenvectors_tmp, &
            size(CI_eigenvectors_tmp,1),CI_electronic_energy_tmp,               &
-            N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+            N_det,N_csf,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)

          CI_electronic_energy(1:N_states_diag_save) = CI_electronic_energy_tmp(1:N_states_diag_save)
          CI_eigenvectors(1:N_det,1:N_states_diag_save) = CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save)