qp_plugins_scemama/devel/svdwf/buildpsi_eff.irp.f

program buildpsi_eff

  implicit none

  BEGIN_DOC
  ! study efficiency for different way to build | psi >
  END_DOC

  read_wf = .True.
  TOUCH read_wf

  PROVIDE N_int

  call run()
end


subroutine run

  USE OMP_LIB

  implicit none

  integer(bit_kind)             :: det1(N_int,2), det2(N_int,2)
  integer                       :: degree, i_state
  double precision              :: h12

  integer                       :: i, j, k, l, ii, jj, nn, n, na, nb, m, ma, mb

  double precision              :: norm_psi, inv_sqrt_norm_psi
  double precision, allocatable :: Uref(:,:), Dref(:), Vtref(:,:), Aref(:,:), Vref(:,:)

  double precision              :: E0_av, E0_ap, E0pt2
  double precision              :: err0, err_tmp, e_tmp, E0, overlop, E0_old, tol_energy
  double precision              :: ctmp, htmp, Ept2
  double precision              :: E0_postsvd, overlop_postsvd
  double precision              :: norm_coeff_psi, inv_sqrt_norm_coeff_psi
  double precision              :: overlopU, overlopU_mat, overlopV, overlopV_mat, overlop_psi

  double precision, allocatable :: H(:,:,:,:)
  double precision, allocatable :: Hdiag(:), Hkl(:,:), H0(:,:)
  double precision, allocatable :: psi_postsvd(:), coeff_psi_perturb(:)

  integer                       :: it_svd, it_svd_max

  integer                       :: n_TSVD, n_FSVD, n_selected, n_toselect
  integer,          allocatable :: numalpha_selected(:), numbeta_selected(:)
  integer,          allocatable :: numalpha_toselect(:), numbeta_toselect(:)

  integer(kind=8)               :: W_tbeg, W_tend, W_tbeg_it, W_tend_it, W_tbeg_step, W_tend_step, W_ir
  real(kind=8)                  :: W_tot_time, W_tot_time_it, W_tot_time_step
  real(kind=8)                  :: CPU_tbeg, CPU_tend, CPU_tbeg_it, CPU_tend_it, CPU_tbeg_step, CPU_tend_step
  real(kind=8)                  :: CPU_tot_time, CPU_tot_time_it, CPU_tot_time_step
  real(kind=8)                  :: speedup, speedup_it, speedup_step
  integer                       :: nb_taches

  !$OMP PARALLEL
  nb_taches = OMP_GET_NUM_THREADS()
  !$OMP END PARALLEL

  call CPU_TIME(CPU_tbeg)
  call SYSTEM_CLOCK(COUNT=W_tbeg, COUNT_RATE=W_ir)

  i_state = 1


  det1(:,1) = psi_det_alpha_unique(:,1)
  det2(:,1) = psi_det_alpha_unique(:,1)
  call get_excitation_degree_spin(det1(1,1),det2(1,1),degree,N_int)
  det1(:,2) = psi_det_beta_unique(:,1)
  det2(:,2) = psi_det_beta_unique(:,1)
  call get_excitation_degree(det1,det2,degree,N_int)
  call i_H_j(det1, det2, N_int, h12)
  ! ---------------------------------------------------------------------------------------
  !                      construct the initial CISD matrix

  print *, ' ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~'
  print *, ' CISD matrix:', n_det_alpha_unique,'x',n_det_beta_unique
  print *, ' N det      :', N_det
  print *, ' ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~'

  norm_psi = 0.d0
  do k = 1, N_det
    norm_psi = norm_psi + psi_bilinear_matrix_values(k,i_state) &
                        * psi_bilinear_matrix_values(k,i_state)
  enddo
  print *, ' initial norm = ', norm_psi

  allocate( Aref(n_det_alpha_unique,n_det_beta_unique) )
  Aref(:,:) = 0.d0
  do k = 1, N_det
    i         = psi_bilinear_matrix_rows(k)
    j         = psi_bilinear_matrix_columns(k)
    Aref(i,j) = psi_bilinear_matrix_values(k,i_state)
  enddo

  ! ---------------------------------------------------------------------------------------


  ! ---------------------------------------------------------------------------------------
  !                                  perform a Full SVD

  allocate( Uref(n_det_alpha_unique,n_det_beta_unique) )
  allocate( Dref(n_det_beta_unique) )
  allocate( Vref(n_det_beta_unique,n_det_beta_unique)  )
  allocate( Vtref(n_det_beta_unique,n_det_beta_unique) )
  call svd_s(Aref, size(Aref,1), Uref, size(Uref,1), Dref, Vtref, size(Vtref,1)          &
       , n_det_alpha_unique, n_det_beta_unique)

  print *, ' ~~~~~~~~~~~~~~~~~~~~~~~~~~~ '
  print *, ' ---   First SVD: ok    ---  '
  print *, ' ~~~~~~~~~~~~~~~~~~~~~~~~~~~ '

  do l = 1, n_det_beta_unique
    do i = 1, n_det_beta_unique
      Vref(i,l) = Vtref(l,i)
    enddo
  enddo
  deallocate( Vtref )

  ! check Truncate SVD error
  err0 = 0.d0
  do j = 1, n_det_beta_unique
    do i = 1, n_det_alpha_unique
      err_tmp = 0.d0
      do l = 1, n_det_beta_unique
        err_tmp = err_tmp + Dref(l) * Uref(i,l) * Vref(j,l)
      enddo
      err_tmp = Aref(i,j) - err_tmp
      err0   += err_tmp * err_tmp
    enddo
  enddo
  print *, ' Full SVD err (%) = ', 100.d0 * dsqrt(err0/norm_psi)

  ! ---------------------------------------------------------------------------------------

  nn = n_det_beta_unique

  ! ---------------------------------------------------------------------------------------
  !                               numerote vectors

  ! Full rank
  n_FSVD = nn * nn
  print*, ' Full psi space rank = ', n_FSVD


  ! Truncated rank
  n_TSVD = 25
  print*, ' initial psi space rank = ', n_TSVD

  ! check Truncate SVD error
  err0 = 0.d0
  do j = 1, n_det_beta_unique
    do i = 1, n_det_alpha_unique
      err_tmp = 0.d0
      do l = 1, n_TSVD
        err_tmp = err_tmp + Dref(l) * Uref(i,l) * Vref(j,l)
      enddo
      err_tmp = Aref(i,j) - err_tmp
      err0   += err_tmp * err_tmp
    enddo
  enddo
  deallocate( Aref )
  print *, ' Truncate SVD err (%) = ', 100.d0 * dsqrt(err0/norm_psi)

  n_selected = n_TSVD * n_TSVD
  allocate( numalpha_selected(n_selected) , numbeta_selected(n_selected) )
  k = 0
  ! first diagonal bloc
  do i = 1, n_TSVD
    do j = 1, n_TSVD
      k = k + 1
      numalpha_selected(k) = j
      numbeta_selected (k) = i
    enddo
  enddo
  ! check size
  if( k.ne.n_selected ) then
    print*, ' error in numeroting: selected '
    print*, ' k          = ', k
    print*, ' n_selected = ', n_selected
    stop
  endif


  ! perturbative space rank
  k          = 0
  n_toselect = n_FSVD - n_selected
  print*, ' perturbative psi space rank = ', n_toselect
  allocate( numalpha_toselect(n_toselect) , numbeta_toselect(n_toselect) )
  ! nondiagonal blocs
  do i = 1, n_TSVD
    do j = n_TSVD+1, nn
      k = k + 1
      numalpha_toselect(k) = j
      numbeta_toselect (k) = i
    enddo
  enddo
  do j = 1, n_TSVD
    do i = n_TSVD+1, nn
      k = k + 1
      numalpha_toselect(k) = j
      numbeta_toselect (k) = i
    enddo
  enddo
  ! diagonal bloc
  do i = n_TSVD+1, nn
    do j = n_TSVD+1, nn
      k = k + 1
      numalpha_toselect(k) = j
      numbeta_toselect (k) = i
    enddo
  enddo
  ! check size
  if( k.ne.n_toselect ) then
    print*, ' error in numeroting: to select '
    print*, ' k          = ', k
    print*, ' n_toselect = ', n_toselect
    stop
  endif

  ! ---------------------------------------------------------------------------------------


  !________________________________________________________________________________________________________
  !
  !                                   loop over SVD iterations
  !________________________________________________________________________________________________________

  E0_old     = 0.d0
  tol_energy = 1.d0
  it_svd     = 0
  it_svd_max = 1

  do while( ( it_svd .lt. it_svd_max) .and. ( tol_energy .gt. 1d-8 ) )

    call CPU_TIME(CPU_tbeg_it)
    call SYSTEM_CLOCK(COUNT=W_tbeg_it, COUNT_RATE=W_ir)

    it_svd = it_svd + 1
    print*, '+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +'
    print*, '                                                                         '
    print*, '                                                                         '
    print*, '                                                                         '
    print*, ' iteration', it_svd

    norm_coeff_psi = 0.d0
    do j = 1, n_TSVD
      norm_coeff_psi += Dref(j) * Dref(j)
    enddo
    inv_sqrt_norm_coeff_psi = 1.d0 / dsqrt(norm_coeff_psi)
    do j = 1, n_TSVD
      Dref(j) = Dref(j) * inv_sqrt_norm_coeff_psi
    enddo

    allocate( H0(n_selected,n_selected) )


    !---------------------------------------------------------------------------------------------------------

    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    call const_psihpsi_postsvd_H0(n_selected, numalpha_selected, numbeta_selected, Uref, Vref, H0)
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ''
    print *,  ' +++ CONST_PSIHPSI_POSTSVD_H0 time +++ ', W_tot_time_step
    ! avant SVD
    E0 = 0.d0
    do i = 1, n_TSVD
      ii = (i-1)*n_TSVD + i
      do j = 1, n_TSVD
        jj = (j-1)*n_TSVD + j
        E0 += Dref(j) * H0(jj,ii) * Dref(i)
      enddo
    enddo
    E0_av = E0 + nuclear_repulsion
    print *,' E0 (avant SVD) =', E0_av

    ! modified version
    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    call const_psihpsi_postsvd_H0_modif(n_selected, numalpha_selected, numbeta_selected, Uref, Vref, H0)
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ' +++ CONST_PSIHPSI_POSTSVD_H0_MODIF time +++ ', W_tot_time_step
    ! avant SVD
    E0 = 0.d0
    do i = 1, n_TSVD
      ii = (i-1)*n_TSVD + i
      do j = 1, n_TSVD
        jj = (j-1)*n_TSVD + j
        E0 += Dref(j) * H0(jj,ii) * Dref(i)
      enddo
    enddo
    E0_av = E0 + nuclear_repulsion
    print *,' E0 (avant SVD) =', E0_av
    print *,  ''

    !---------------------------------------------------------------------------------------------------------

    allocate( psi_postsvd(n_selected) )
    print *,  ' --- Diag post-SVD --- '
    call diag_postsvd(n_TSVD, n_selected, Dref, H0, E0_postsvd, overlop_postsvd, psi_postsvd)
    print*, ' postsvd energy  = ', E0_postsvd

    deallocate( H0 )

    ! post-SVD
    !Dref(:) = 0.d0
    call perform_newpostSVD(n_TSVD, n_selected, psi_postsvd, Uref, Vref, Dref)

    deallocate( psi_postsvd )


    print *,  ''
    print *,  ''
    print *,  ' --- Compute H first way --- '
    !----------------------------------------------------------------------------------------------------------------------------
    !                                                    first way
    !

    allocate( H(n_det_beta_unique,n_det_beta_unique,n_det_beta_unique,n_det_beta_unique)  )
    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    call const_H_uv(Uref, Vref, H)
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ''
    print *,  ' +++ CONST_H_UV +++ ', W_tot_time_step

    allocate( H0(n_selected,n_selected), Hdiag(n_toselect), Hkl(n_selected,n_toselect) )

    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    do n = 1, n_selected
      na = numalpha_selected(n)
      nb = numalpha_selected(n)
      do m = 1, n_selected
        ma = numalpha_selected(m)
        mb = numalpha_selected(m)
        H0(m,n) = H(ma,mb,na,nb)
      enddo
    enddo
    do n = 1, n_toselect
      na = numalpha_toselect(n)
      nb = numbeta_toselect (n)
      ! diagonal part
      Hdiag(n) = H(na,nb,na,nb)
      do m = 1, n_selected
        ma = numalpha_selected(m)
        mb = numalpha_selected(m)
        ! 3 blocs treated perturbatively
        Hkl(m,n) = H(ma,mb,na,nb)
      enddo
    enddo
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ' +++ CONST_H_UV +++ ', W_tot_time_step
    print *,  ''
    deallocate( H )

    E0             = 0.d0
    norm_coeff_psi = 0.d0
    do i = 1, n_TSVD
      ii = (i-1)*n_TSVD + i
      do j = 1, n_TSVD
        jj = (j-1)*n_TSVD + j
        E0 += Dref(j) * H0(jj,ii) * Dref(i)
      enddo
      norm_coeff_psi += Dref(i) * Dref(i)
    enddo
    E0_ap = E0 + nuclear_repulsion
    print *,' E0 (apres SVD) =', E0_ap

    deallocate(H0)

    print *,  ' --- Perturbation --- '
    allocate( coeff_psi_perturb(n_toselect) )
    ept2 = 0.d0
    do ii = 1, n_toselect
      ctmp = 0.d0
      do i = 1, n_TSVD
        l = (i-1)*n_TSVD + i
        ctmp += Dref(i) * Hkl(l,ii)
      enddo
      coeff_psi_perturb(ii) = ctmp / ( E0_ap - (Hdiag(ii)+nuclear_repulsion) )
      ept2            += ctmp*ctmp / ( E0_ap - (Hdiag(ii)+nuclear_repulsion) )
    enddo
    E0pt2 = E0_ap + ept2
    print *, ' perturb energy = ', E0pt2, ept2
    tol_energy = 100.d0 * dabs(E0pt2-E0_old) / dabs(E0pt2)
    E0_old     = E0pt2

    deallocate( Hdiag, Hkl)

    !----------------------------------------------------------------------------------------------------------------------------


    print *,  ''
    print *,  ''
    print *,  ' --- Compute H second way --- '
    !----------------------------------------------------------------------------------------------------------------------------
    !                                                    second way
    !

    allocate( H0(n_selected,n_selected), Hdiag(n_toselect), Hkl(n_selected,n_toselect) )

    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    call const_psihpsi_postsvd_H0_modif(n_selected, numalpha_selected, numbeta_selected, Uref, Vref, H0)
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ''
    print *,  ' +++ CONST_PSIHPSI_POSTSVD_H0_MODIF time +++ ', W_tot_time_step


    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    call const_Hdiag_Hkl(n_selected, n_toselect, Uref, Vref, numalpha_selected, numbeta_selected &
                                                           , numalpha_toselect, numbeta_toselect, Hdiag, Hkl)
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ' +++ CONST_HDIAG_HKL time +++ ', W_tot_time_step
    print *,  ''

    E0             = 0.d0
    norm_coeff_psi = 0.d0
    do i = 1, n_TSVD
      ii = (i-1)*n_TSVD + i
      do j = 1, n_TSVD
        jj = (j-1)*n_TSVD + j
        E0 += Dref(j) * H0(jj,ii) * Dref(i)
      enddo
      norm_coeff_psi += Dref(i) * Dref(i)
    enddo
    E0_ap = E0 + nuclear_repulsion
    print *,' E0 (apres SVD) =', E0_ap

    deallocate(H0)

    print *,  ' --- Perturbation --- '
    deallocate( coeff_psi_perturb )
    allocate( coeff_psi_perturb(n_toselect) )
    ept2 = 0.d0
    do ii = 1, n_toselect
      ctmp = 0.d0
      do i = 1, n_TSVD
        l = (i-1)*n_TSVD + i
        ctmp += Dref(i) * Hkl(l,ii)
      enddo
      coeff_psi_perturb(ii) = ctmp / ( E0_ap - (Hdiag(ii)+nuclear_repulsion) )
      ept2            += ctmp*ctmp / ( E0_ap - (Hdiag(ii)+nuclear_repulsion) )
    enddo
    E0pt2 = E0_ap + ept2
    print *, ' perturb energy = ', E0pt2, ept2
    tol_energy = 100.d0 * dabs(E0pt2-E0_old) / dabs(E0pt2)
    E0_old     = E0pt2

    deallocate( Hdiag, Hkl)

    !----------------------------------------------------------------------------------------------------------------------------


    print *,  ''
    print *,  ''
    print *,  ' --- Compute H third way --- '
    !----------------------------------------------------------------------------------------------------------------------------
    !                                                    second way
    !

    allocate( H0(n_selected,n_selected), Hdiag(n_toselect), Hkl(n_selected,n_toselect) )

    call SYSTEM_CLOCK(COUNT=W_tbeg_step, COUNT_RATE=W_ir)
    call const_Hdiag_Hkl_H0(n_selected, n_toselect, Uref, Vref, numalpha_selected, numbeta_selected &
                            , numalpha_toselect, numbeta_toselect, Hdiag, Hkl, H0)
    call SYSTEM_CLOCK(COUNT=W_tend_step, COUNT_RATE=W_ir)
    W_tot_time_step = real(W_tend_step-W_tbeg_step, kind=8) / real(W_ir, kind=8)
    print *,  ''
    print *,  ' +++ CONST_HDIAG_HKL_H0 time +++ ', W_tot_time_step
    print *,  ''

    E0             = 0.d0
    norm_coeff_psi = 0.d0
    do i = 1, n_TSVD
      ii = (i-1)*n_TSVD + i
      do j = 1, n_TSVD
        jj = (j-1)*n_TSVD + j
        E0 += Dref(j) * H0(jj,ii) * Dref(i)
      enddo
      norm_coeff_psi += Dref(i) * Dref(i)
    enddo
    E0_ap = E0 + nuclear_repulsion
    print *,' E0 (apres SVD) =', E0_ap

    deallocate(H0)

    print *,  ' --- Perturbation --- '
    deallocate( coeff_psi_perturb )
    allocate( coeff_psi_perturb(n_toselect) )
    ept2 = 0.d0
    do ii = 1, n_toselect
      ctmp = 0.d0
      do i = 1, n_TSVD
        l = (i-1)*n_TSVD + i
        ctmp += Dref(i) * Hkl(l,ii)
      enddo
      coeff_psi_perturb(ii) = ctmp / ( E0_ap - (Hdiag(ii)+nuclear_repulsion) )
      ept2            += ctmp*ctmp / ( E0_ap - (Hdiag(ii)+nuclear_repulsion) )
    enddo
    E0pt2 = E0_ap + ept2
    print *, ' perturb energy = ', E0pt2, ept2
    tol_energy = 100.d0 * dabs(E0pt2-E0_old) / dabs(E0pt2)
    E0_old     = E0pt2

    deallocate( Hdiag, Hkl)

    !----------------------------------------------------------------------------------------------------------------------------


    print *,  ' --- SVD --- '
    call perform_newSVD(n_toselect, numalpha_toselect, numbeta_toselect, coeff_psi_perturb, Uref, Vref, Dref)

    deallocate( coeff_psi_perturb )

    write(11,'(i5,4x,4(f22.15,2x))') it_svd, E0_av, E0_postsvd, E0_ap, E0pt2

    call CPU_TIME(CPU_tend_it)
    call SYSTEM_CLOCK(COUNT=W_tend_it, COUNT_RATE=W_ir)
    CPU_tot_time_it = CPU_tend_it - CPU_tbeg_it
    W_tot_time_it   = real(W_tend_it-W_tbeg_it, kind=8) / real(W_ir, kind=8)
    speedup_it      = CPU_tot_time_it / W_tot_time_it
    print '(//, 3X, "elapsed time = ", 1PE10.3, " min.", /,  &
              & 3X, "CPU time     = ", 1PE10.3, " min.", /,  &
              & 3X, "speed up     = ", 1PE10.3,//)', W_tot_time_it/60.d0, CPU_tot_time_it/60.d0, speedup_it

  end do
  !________________________________________________________________________________________________________
  !________________________________________________________________________________________________________


  deallocate( Uref, Vref, Dref )


  call CPU_TIME(CPU_tend)
  call SYSTEM_CLOCK(COUNT=W_tend, COUNT_RATE=W_ir)
  CPU_tot_time = CPU_tend - CPU_tbeg
  W_tot_time   = real(W_tend - W_tbeg, kind=8) / real(W_ir, kind=8)
  speedup      = CPU_tot_time / W_tot_time
  print *,' ___________________________________________________________________'
  print '(//,3X,"Execution avec ",i2," threads")',nb_taches
  print '(//, 3X, "elapsed time = ", 1PE10.3, " min.", /,  &
            & 3X, "CPU time     = ", 1PE10.3, " min.", /,  &
            & 3X, "speed up     = ", 1PE10.3 ,// )', W_tot_time/60.d0, CPU_tot_time/60.d0, speedup
  print *,' ___________________________________________________________________'


end


subroutine const_psihpsi_postsvd_H0(n_selected, numalpha_selected, numbeta_selected, Uref, Vref, H0)

  USE OMP_LIB

  implicit none

  integer, intent(in)           :: n_selected
  integer, intent(in)           :: numalpha_selected(n_selected), numbeta_selected(n_selected)
  double precision, intent(in)  :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(in)  :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(out) :: H0(n_selected,n_selected)

  integer(bit_kind)             :: det1(N_int,2), det2(N_int,2)
  integer                       :: i, j, k, l, degree
  integer                       :: n, na, nb, m , ma, mb
  double precision              :: h12, x
  double precision, allocatable :: H0_tmp(:,:)

  H0(:,:) = 0.d0

 !$OMP PARALLEL DEFAULT(NONE)                                                                         &
 !$OMP          PRIVATE(i,j,k,l,n,na,nb,m,ma,mb,x,h12,det1,det2,H0_tmp,degree)                        &
 !$OMP          SHARED(n_det_alpha_unique,n_det_beta_unique,psi_det_alpha_unique,psi_det_beta_unique, &
 !$OMP                 N_int,n_selected,Uref,Vref,H0,numalpha_selected,numbeta_selected )
  allocate( H0_tmp(n_selected,n_selected) )
  H0_tmp(:,:) = 0.d0
 !$OMP DO COLLAPSE(2) SCHEDULE(DYNAMIC,20)
  do i = 1, n_det_alpha_unique
    do k = 1, n_det_alpha_unique
      det1(:,1) = psi_det_alpha_unique(:,i)
      det2(:,1) = psi_det_alpha_unique(:,k)
      call get_excitation_degree_spin(det1(1,1),det2(1,1),degree,N_int)
      if (degree .gt. 2) then
        cycle
      endif
      do j = 1, n_det_beta_unique
        det1(:,2) = psi_det_beta_unique(:,j)
        do l = 1, n_det_beta_unique
          det2(:,2) = psi_det_beta_unique(:,l)
          call get_excitation_degree(det1,det2,degree,N_int)
          if (degree .gt. 2) then
            cycle
          endif
          ! !!!
          call i_H_j(det1, det2, N_int, h12)
          ! !!!
          ! ~~~            H0                  ~~~
          do n  = 1, n_selected
            na = numalpha_selected(n)
            nb = numbeta_selected (n)
            x   = Uref(k,na) * Vref(l,nb) * h12
            do m  = 1, n_selected
              ma = numalpha_selected(m)
              mb = numbeta_selected (m)
              H0_tmp(m,n) += Uref(i,ma) * Vref(j,mb) * x
            enddo
          enddo
          ! ~~~          ~~~~~~                ~~~
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP CRITICAL
  do n = 1, n_selected
     do m = 1, n_selected
       H0(m,n) += H0_tmp(m,n)
     enddo
  enddo
 !$OMP END CRITICAL
  deallocate( H0_tmp )
 !$OMP END PARALLEL

  return
end subroutine const_psihpsi_postsvd_H0


subroutine const_psihpsi_postsvd_H0_modif(n_selected, numalpha_selected, numbeta_selected, Uref, Vref, H0)

  USE OMP_LIB

  implicit none

  integer, intent(in)           :: n_selected
  integer, intent(in)           :: numalpha_selected(n_selected), numbeta_selected(n_selected)
  double precision, intent(in)  :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(in)  :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(out) :: H0(n_selected,n_selected)

  integer(bit_kind)             :: det1(N_int,2), det2(N_int,2)
  integer                       :: i, j, k, l, degree
  integer                       :: n, na, nb, m , ma, mb
  double precision              :: h12, x
  double precision, allocatable :: Htot(:,:,:,:), H1(:,:,:)

  H0(:,:)       = 0.d0

 !$OMP PARALLEL DEFAULT(NONE)                                                                         &
 !$OMP          PRIVATE(i,j,k,l,n,na,nb,m,ma,mb,x,h12,det1,det2,degree)                               &
 !$OMP          SHARED(n_det_alpha_unique,n_det_beta_unique,psi_det_alpha_unique,psi_det_beta_unique, &
 !$OMP                 N_int,n_selected,Uref,Vref,H0,Htot,H1,numalpha_selected,numbeta_selected )

 !$OMP SINGLE
  allocate( Htot(n_det_alpha_unique,n_det_beta_unique,n_det_alpha_unique,n_det_beta_unique) )
  Htot(:,:,:,:) = 0.d0
 !$OMP END SINGLE

 !$OMP DO COLLAPSE(2) SCHEDULE(DYNAMIC,20)
  do i = 1, n_det_alpha_unique
    do k = 1, n_det_alpha_unique
      det1(:,1) = psi_det_alpha_unique(:,i)
      det2(:,1) = psi_det_alpha_unique(:,k)
      call get_excitation_degree_spin(det1(1,1),det2(1,1),degree,N_int)
      if (degree .gt. 2) then
        cycle
      endif
      do j = 1, n_det_beta_unique
        det1(:,2) = psi_det_beta_unique(:,j)
        do l = 1, n_det_beta_unique
          det2(:,2) = psi_det_beta_unique(:,l)
          call get_excitation_degree(det1,det2,degree,N_int)
          if (degree .gt. 2) then
            cycle
          endif
          ! !!!
          call i_H_j(det1, det2, N_int, Htot(k,l,i,j))
          ! !!!
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP SINGLE
  allocate( H1(n_det_alpha_unique,n_det_beta_unique,n_selected) )
  H1(:,:,:) = 0.d0
 !$OMP END SINGLE

 !$OMP DO
  do n = 1, n_selected
    na = numalpha_selected(n)
    nb = numbeta_selected (n)
    do i = 1, n_det_alpha_unique
      do j = 1, n_det_beta_unique
        do l = 1, n_det_beta_unique
          do k = 1, n_det_alpha_unique
            H1(k,l,n) += Htot(k,l,i,j) * Uref(i,na) * Vref(j,nb)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP SINGLE
  deallocate( Htot )
 !$OMP END SINGLE

 !$OMP DO
  do m = 1, n_selected
    ma = numalpha_selected(m)
    mb = numbeta_selected (m)
    do n = 1, n_selected
      do k = 1, n_det_alpha_unique
        do l = 1, n_det_beta_unique
          H0(m,n) += H1(k,l,n) * Uref(k,ma) * Vref(l,mb)
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP SINGLE
  deallocate( H1 )
 !$OMP END SINGLE

 !$OMP END PARALLEL

  return
end subroutine const_psihpsi_postsvd_H0_modif


subroutine diag_postsvd(n_TSVD, n_selected, Dref, H0, E0, overlop, psi_postsvd )

  USE OMP_LIB

  implicit none

  integer, intent(in)           :: n_TSVD, n_selected
  double precision, intent(in)  :: H0(n_selected,n_selected)
  double precision, intent(in)  :: Dref(n_det_beta_unique)
  double precision, intent(out) :: E0, overlop, psi_postsvd(n_selected)

  integer(bit_kind)             :: det1(N_int,2), det2(N_int,2)
  integer                       :: i, j, k, l, degree
  integer                       :: ii0, jj0, ii, jj, n, m, np, mp
  integer                       :: nn0, mm0, nn, mm, ind_gs
  double precision              :: h12, x

  double precision, allocatable :: eigvec0(:,:), eigval0(:), check_ov(:)

  ! diagonalize H0
  allocate( eigvec0(n_selected,n_selected), eigval0(n_selected) )
  call lapack_diag(eigval0, eigvec0, H0, n_selected, n_selected)

  ! get the postsvd ground state
  allocate( check_ov(n_selected) )
  do l = 1, n_selected
    overlop = 0.d0
    do i = 1, n_TSVD
      ii      = n_TSVD*(i-1) + i
      overlop = overlop + eigvec0(ii,l) * Dref(i)
    enddo
    check_ov(l) = dabs(overlop)
  enddo
  ind_gs      = MAXLOC( check_ov, DIM=1 )
  !ind_gs      = 1
  overlop     = check_ov(ind_gs)
  E0          = eigval0(ind_gs)+nuclear_repulsion
  psi_postsvd = eigvec0(:,ind_gs)

  deallocate( check_ov, eigvec0, eigval0 )

  return
end subroutine diag_postsvd


subroutine const_H_uv(Uref, Vref, H)

  USE OMP_LIB

  implicit none

  double precision, intent(in)  :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(in)  :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(out) :: H(n_det_beta_unique,n_det_beta_unique,n_det_beta_unique,n_det_beta_unique)

  integer(bit_kind)             :: det1(N_int,2)
  integer(bit_kind)             :: det2(N_int,2)
  integer                       :: degree

  integer                       :: na, nb
  integer                       :: i, j, k, l
  integer                       :: p, q, r, s

  double precision, allocatable :: H0(:,:,:,:), H1(:,:,:,:), H2(:,:,:,:), H3(:,:,:,:)

  na = n_det_alpha_unique
  nb = n_det_beta_unique

  allocate( H0(na,nb,na,nb) , H1(na,nb,na,nb) )
  allocate( H2(na,nb,nb,nb) , H3(na,nb,nb,nb) )

  H(:,:,:,:)  = 0.d0
  H0(:,:,:,:) = 0.d0
  H1(:,:,:,:) = 0.d0
  H2(:,:,:,:) = 0.d0
  H3(:,:,:,:) = 0.d0

 !$OMP PARALLEL DEFAULT(NONE)                                          &
 !$OMP          PRIVATE(p,q,r,s,i,j,k,l,det1,det2,degree)              &
 !$OMP          SHARED(na,nb,psi_det_alpha_unique,psi_det_beta_unique, &
 !$OMP                 N_int,Uref,Vref,H0,H1,H2,H3,H)

 !$OMP DO COLLAPSE(2) SCHEDULE(DYNAMIC,20)
  do i = 1, na
    do k = 1, na
      det1(:,1) = psi_det_alpha_unique(:,i)
      det2(:,1) = psi_det_alpha_unique(:,k)
      call get_excitation_degree_spin(det1(1,1),det2(1,1),degree,N_int)
      if (degree .gt. 2) then
        cycle
      endif
      do j = 1, nb
        det1(:,2) = psi_det_beta_unique(:,j)
        do l = 1, nb
          det2(:,2) = psi_det_beta_unique(:,l)
          call get_excitation_degree(det1,det2,degree,N_int)
          if (degree .gt. 2) then
            cycle
          endif
          call i_H_j(det1, det2, N_int, H0(k,l,i,j))
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

! !$OMP SINGLE
!  allocate( H1(na,nb,na,nb) )
!  H1(:,:,:,:) = 0.d0
! !$OMP END SINGLE
 !$OMP DO
  do s = 1, nb
    do l = 1, nb
      do k = 1, na
        do j = 1, nb
          do i = 1, na
            H1(i,j,k,s) += H0(i,j,k,l) * Vref(l,s)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP DO
  do s = 1, nb
    do r = 1, nb
      do k = 1, na
        do j = 1, nb
          do i = 1, na
            H2(i,j,r,s) += H1(i,j,k,s) * Uref(k,r)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP DO
  do s = 1, nb
    do j = 1, nb
      do r = 1, na
        do q = 1, nb
          do i = 1, na
            H3(i,q,r,s) += H2(i,j,r,s) * Vref(j,q)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP DO
  do s = 1, nb
    do r = 1, nb
      do q = 1, nb
        do p = 1, nb
          do i = 1, na
            H(p,q,r,s) += H3(i,q,r,s) * Uref(i,p)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP END PARALLEL

  deallocate( H0, H1, H2, H3 )

  return

end subroutine const_H_uv


subroutine perform_newSVD(n_toselect, numalpha_toselect, numbeta_toselect, coeff_psi_perturb, Uref, Vref, Dref)

  USE OMP_LIB

  integer, intent(in)             :: n_toselect
  integer, intent(in)             :: numalpha_toselect(n_toselect), numbeta_toselect(n_toselect)
  double precision, intent(in)    :: coeff_psi_perturb(n_toselect)
  double precision, intent(inout) :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(inout) :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(inout) :: Dref(n_det_beta_unique)

  integer                         :: mm, nn, i, j, ii0, ii, l, jj, na, nb
  double precision                :: err0, err_norm, err_tmp, norm_tmp
  double precision                :: overlopU_mat, overlopV_mat, overlopU, overlopV
  double precision, allocatable   :: S_mat(:,:), SxVt(:,:)
  double precision, allocatable   :: U_svd(:,:), V_svd(:,:)
  double precision, allocatable   :: U_newsvd(:,:), V_newsvd(:,:), Vt_newsvd(:,:), D_newsvd(:), A_newsvd(:,:)

  mm = n_det_alpha_unique
  nn = n_det_beta_unique

  allocate( U_svd(mm,nn) , V_svd(nn,nn) , S_mat(nn,nn) )

  U_svd(:,:) = Uref(:,:)
  V_svd(:,:) = Vref(:,:)
  S_mat(:,:) = 0.d0
  norm_tmp   = 0.d0
  do j  = 1, n_det_beta_unique
    S_mat(j,j) = Dref(j)
    norm_tmp  += S_mat(j,j)*S_mat(j,j)
  enddo
  do l = 1, n_toselect
    na = numalpha_toselect(l)
    nb = numbeta_toselect (l)
    S_mat(na,nb) = coeff_psi_perturb(l)
    norm_tmp    += S_mat(na,nb)*S_mat(na,nb)
  enddo

  print*, ' norm de S_mat =', norm_tmp
  !norm_tmp = 1.d0/dsqrt(norm_tmp)
  !do i = 1, nn
  !  do j = 1, nn
  !    S_mat(j,i) = S_mat(j,i) * norm_tmp
  !  enddo
  !enddo


  ! first compute S_mat x transpose(V_svd)
  allocate( SxVt(nn,nn) )
  call dgemm( 'N', 'T', nn, nn, nn, 1.d0     &
             , S_mat     , size(S_mat,1)     &
             , V_svd     , size(V_svd,1)     &
             , 0.d0, SxVt, size(SxVt ,1)     )
  ! then compute U_svd x SxVt
  allocate( A_newsvd(mm,nn) )
  call dgemm( 'N', 'N', mm, nn, nn, 1.d0            &
             , U_svd         , size(U_svd   ,1)     &
             , SxVt          , size(SxVt    ,1)     &
             , 0.d0, A_newsvd, size(A_newsvd,1)     )
  deallocate( SxVt )

  ! perform new SVD
  allocate( U_newsvd(mm,nn), Vt_newsvd(nn,nn), D_newsvd(nn) )
  call svd_s( A_newsvd, size(A_newsvd,1), U_newsvd, size(U_newsvd,1), D_newsvd, Vt_newsvd, size(Vt_newsvd,1), mm, nn)
  print *, '    +++ new perturbative SVD is performed +++       '
  allocate( V_newsvd(nn,nn) )
  do l = 1, nn
    do j = 1, nn
      V_newsvd(j,l) = Vt_newsvd(l,j)
    enddo
  enddo

  ! check SVD error
  err0     = 0.d0
  err_norm = 0.d0
  do j = 1, nn
    do i = 1, mm
      err_tmp  = 0.d0
      do l = 1, nn
        err_tmp = err_tmp + D_newsvd(l) * U_newsvd(i,l) * V_newsvd(j,l)
      enddo
      err_tmp   = A_newsvd(i,j) - err_tmp
      err0     += err_tmp * err_tmp
      err_norm += A_newsvd(i,j) * A_newsvd(i,j)
    enddo
  enddo
  print *, ' SVD err (%)              = ', 100.d0 * dsqrt(err0/err_norm)
  print *, ' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '


  do l = 1, nn
    Dref(l)   = D_newsvd(l)
    Uref(:,l) = U_newsvd(:,l)
    Vref(:,l) = V_newsvd(:,l)
  enddo
  !print *, Dref(:)


  overlopU_mat = 0.d0
  overlopV_mat = 0.d0
  do i = 1, nn
    do j = 1, nn
      overlopU = 0.d0
      do ii = 1, mm
        overlopU += Uref(ii,j) * Uref(ii,i)
      enddo
      overlopU_mat += overlopU
      overlopV = 0.d0
      do ii = 1, nn
        overlopV += Vref(ii,j) * Vref(ii,i)
      enddo
      overlopV_mat += overlopV
    enddo
  enddo
  print *, 'overlop U =', overlopU_mat
  print *, 'overlop V =', overlopV_mat


  deallocate( U_newsvd, V_newsvd, Vt_newsvd, D_newsvd, A_newsvd )

  return

end subroutine perform_newSVD


subroutine perform_newpostSVD(n_TSVD, n_selected, psi_postsvd, Uref, Vref, Dref)

  ! TODO: general case wherer we we don't consider the first trucated block
  USE OMP_LIB

  integer, intent(in)             :: n_TSVD, n_selected
  double precision, intent(in)    :: psi_postsvd(n_selected)
  double precision, intent(inout) :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(inout) :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(inout) :: Dref(n_det_beta_unique)

  integer                         :: mm, nn, i, j, ii0, ii, l, jj, na, nb
  double precision                :: err0, err_norm, err_tmp, norm_tmp
  double precision                :: overlopU_mat, overlopV_mat, overlopU, overlopV
  double precision, allocatable   :: S_mat(:,:), SxVt(:,:)
  double precision, allocatable   :: U_svd(:,:), V_svd(:,:)
  double precision, allocatable   :: U_newsvd(:,:), V_newsvd(:,:), Vt_newsvd(:,:), D_newsvd(:), A_newsvd(:,:)

  mm = n_det_alpha_unique
  nn = n_det_beta_unique

  allocate( U_svd(mm,n_TSVD) , V_svd(nn,n_TSVD) , S_mat(n_TSVD,n_TSVD) )

  U_svd(:,:) = Uref(:,1:n_TSVD)
  V_svd(:,:) = Vref(:,1:n_TSVD)
  S_mat(:,:) = 0.d0
  do i = 1, n_TSVD
    ii = (i-1)*n_TSVD
    do j = 1, n_TSVD
      jj = ii + j
      S_mat(j,i) = psi_postsvd(jj)
    enddo
  enddo

  ! first compute S_mat x transpose(V_svd)
  allocate( SxVt(n_TSVD,nn) )
  call dgemm( 'N', 'T', n_TSVD, nn, n_TSVD, 1.d0     &
             , S_mat     , size(S_mat,1)             &
             , V_svd     , size(V_svd,1)             &
             , 0.d0, SxVt, size(SxVt ,1)             )
  ! then compute U_svd x SxVt
  allocate( A_newsvd(mm,nn) )
  call dgemm( 'N', 'N', mm, nn, n_TSVD, 1.d0         &
             , U_svd         , size(U_svd   ,1)      &
             , SxVt          , size(SxVt    ,1)      &
             , 0.d0, A_newsvd, size(A_newsvd,1)      )
  deallocate( SxVt )

  ! perform new SVD
  allocate( U_newsvd(mm,nn), Vt_newsvd(nn,nn), D_newsvd(nn) )
  call svd_s( A_newsvd, size(A_newsvd,1), U_newsvd, size(U_newsvd,1), D_newsvd, Vt_newsvd, size(Vt_newsvd,1), mm, nn)
  print *, '    +++ new SVD is performed +++       '
  allocate( V_newsvd(nn,nn) )
  do l = 1, nn
    do j = 1, nn
      V_newsvd(j,l) = Vt_newsvd(l,j)
    enddo
  enddo

  ! check SVD error
  err0     = 0.d0
  err_norm = 0.d0
  do j = 1, nn
    do i = 1, mm
      err_tmp  = 0.d0
      do l = 1, n_TSVD
        err_tmp = err_tmp + D_newsvd(l) * U_newsvd(i,l) * V_newsvd(j,l)
      enddo
      err_tmp   = A_newsvd(i,j) - err_tmp
      err0     += err_tmp * err_tmp
      err_norm += A_newsvd(i,j) * A_newsvd(i,j)
    enddo
  enddo
  print *, ' SVD err (%) = ', 100.d0 * dsqrt(err0/err_norm)
  print *, ' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '

  do l = 1, n_TSVD
    Dref(l)   = D_newsvd(l)
    Uref(:,l) = U_newsvd(:,l)
    Vref(:,l) = V_newsvd(:,l)
  enddo
!  print *, Dref(:)

  overlopU_mat = 0.d0
  overlopV_mat = 0.d0
  do i = 1, nn
    do j = 1, nn
      overlopU = 0.d0
      do ii = 1, mm
        overlopU += Uref(ii,j) * Uref(ii,i)
      enddo
      overlopU_mat += overlopU
      overlopV = 0.d0
      do ii = 1, nn
        overlopV += Vref(ii,j) * Vref(ii,i)
      enddo
      overlopV_mat += overlopV
    enddo
  enddo
  print *, 'overlop U =', overlopU_mat
  print *, 'overlop V =', overlopV_mat


  deallocate( U_newsvd, V_newsvd, Vt_newsvd, D_newsvd, A_newsvd )

  return

end subroutine perform_newpostSVD


subroutine const_Hdiag_Hkl(n_selected, n_toselect, Uref, Vref, numalpha_selected, numbeta_selected &
                                                             , numalpha_toselect, numbeta_toselect, Hdiag, Hkl)

  implicit none

  integer, intent(in)           :: n_selected, n_toselect
  integer, intent(in)           :: numalpha_selected(n_selected), numbeta_selected(n_selected)
  integer, intent(in)           :: numalpha_toselect(n_toselect), numbeta_toselect(n_toselect)
  double precision, intent(in)  :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(in)  :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(out) :: Hdiag(n_toselect), Hkl(n_selected,n_toselect)

  integer(bit_kind)             :: det1(N_int,2)
  integer(bit_kind)             :: det2(N_int,2)
  integer                       :: degree

  integer                       :: i, j, k, l
  integer                       :: n, na, nb, m, ma, mb
  double precision              :: h12, y

  double precision, allocatable :: Hdiag_tmp(:), Hkl_tmp(:,:)


  Hdiag(:) = 0.d0
  Hkl(:,:) = 0.d0

 !$OMP PARALLEL DEFAULT(NONE)                                                                         &
 !$OMP          PRIVATE(i,j,k,l,n,na,nb,m,ma,mb,y,h12,det1,det2,Hdiag_tmp,Hkl_tmp,degree)             &
 !$OMP          SHARED(n_det_alpha_unique,n_det_beta_unique,psi_det_alpha_unique,psi_det_beta_unique, &
 !$OMP                 N_int,n_selected,n_toselect,Uref,Vref,numalpha_toselect,numbeta_toselect,      &
 !$OMP                 Hkl,Hdiag,numalpha_selected, numbeta_selected )
  allocate( Hdiag_tmp(n_toselect), Hkl_tmp(n_selected,n_toselect) )
  Hdiag_tmp(:) = 0.d0
  Hkl_tmp(:,:) = 0.d0
 !$OMP DO COLLAPSE(2) SCHEDULE(DYNAMIC,20)
  do i = 1, n_det_alpha_unique
    do k = 1, n_det_alpha_unique
      det1(:,1) = psi_det_alpha_unique(:,i)
      det2(:,1) = psi_det_alpha_unique(:,k)
      ! !!!
      ! ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
      call get_excitation_degree_spin(det1(1,1),det2(1,1),degree,N_int)
      if (degree .gt. 2) then
        cycle
      endif
      ! ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
      ! !!!
      do j = 1, n_det_beta_unique
        det1(:,2) = psi_det_beta_unique(:,j)
        do l = 1, n_det_beta_unique
          det2(:,2) = psi_det_beta_unique(:,l)
          ! !!!
          ! ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
          call get_excitation_degree(det1,det2,degree,N_int)
          if (degree .gt. 2) then
            cycle
          endif
          ! ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
          ! !!!
          call i_H_j(det1, det2, N_int, h12)
          ! ~ ~ ~               H                ~ ~ ~
          do n = 1, n_toselect
            na = numalpha_toselect(n)
            nb = numbeta_toselect (n)
            y  = Uref(k,na) * Vref(l,nb) * h12
            ! Hdiag
            Hdiag_tmp(n) += y * Uref(i,na) * Vref(j,nb)
            do m = 1, n_selected
              ma = numalpha_selected(m)
              mb = numbeta_selected (m)
              ! Hkl
              Hkl_tmp(m,n) += Uref(i,ma) * Vref(j,mb) * y
            enddo
          enddo
          ! ~ ~ ~             ! ! !              ~ ~ ~
        enddo
      enddo
      ! !!!
    enddo
  enddo
 !$OMP END DO
 !$OMP CRITICAL
  do n = 1, n_toselect
     Hdiag(n) += Hdiag_tmp(n)
     do m = 1, n_selected
       Hkl(m,n) += Hkl_tmp(m,n)
     enddo
  enddo
 !$OMP END CRITICAL
  deallocate( Hdiag_tmp,Hkl_tmp )
 !$OMP END PARALLEL

  return

end subroutine const_Hdiag_Hkl


subroutine const_Hdiag_Hkl_H0(n_selected, n_toselect, Uref, Vref, numalpha_selected, numbeta_selected &
                            , numalpha_toselect, numbeta_toselect, Hdiag, Hkl, H0)

  implicit none

  integer, intent(in)           :: n_selected, n_toselect
  integer, intent(in)           :: numalpha_selected(n_selected), numbeta_selected(n_selected)
  integer, intent(in)           :: numalpha_toselect(n_toselect), numbeta_toselect(n_toselect)
  double precision, intent(in)  :: Uref(n_det_alpha_unique,n_det_beta_unique)
  double precision, intent(in)  :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(out) :: Hdiag(n_toselect), Hkl(n_selected,n_toselect), H0(n_selected,n_selected)

  integer(bit_kind)             :: det1(N_int,2)
  integer(bit_kind)             :: det2(N_int,2)
  integer                       :: degree

  integer                       :: i, j, k, l
  integer                       :: n, na, nb, m, ma, mb
  double precision, allocatable :: Htot(:,:,:,:), H1(:,:,:), H2(:,:,:)

  Hdiag(:) = 0.d0
  Hkl(:,:) = 0.d0
  H0(:,:)  = 0.d0

 !$OMP PARALLEL DEFAULT(NONE)                                                                         &
 !$OMP          PRIVATE(i,j,k,l,n,na,nb,m,ma,mb,det1,det2,degree)                                     &
 !$OMP          SHARED(n_det_alpha_unique,n_det_beta_unique,psi_det_alpha_unique,psi_det_beta_unique, &
 !$OMP                 N_int,n_selected,n_toselect,Uref,Vref,H0,Htot,H1,H2,Hdiag,Hkl,                 &
 !$OMP                 numalpha_selected,numbeta_selected,numalpha_toselect,numbeta_toselect )

 !$OMP SINGLE
  allocate( Htot(n_det_alpha_unique,n_det_beta_unique,n_det_alpha_unique,n_det_beta_unique) )
  Htot(:,:,:,:) = 0.d0
 !$OMP END SINGLE

 !$OMP DO COLLAPSE(2) SCHEDULE(DYNAMIC,20)
  do i = 1, n_det_alpha_unique
    do k = 1, n_det_alpha_unique
      det1(:,1) = psi_det_alpha_unique(:,i)
      det2(:,1) = psi_det_alpha_unique(:,k)
      call get_excitation_degree_spin(det1(1,1),det2(1,1),degree,N_int)
      if (degree .gt. 2) then
        cycle
      endif
      do j = 1, n_det_beta_unique
        det1(:,2) = psi_det_beta_unique(:,j)
        do l = 1, n_det_beta_unique
          det2(:,2) = psi_det_beta_unique(:,l)
          call get_excitation_degree(det1,det2,degree,N_int)
          if (degree .gt. 2) then
            cycle
          endif
          ! !!!
          call i_H_j(det1, det2, N_int, Htot(k,l,i,j))
          ! !!!
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP SINGLE
  allocate( H1(n_det_alpha_unique,n_det_beta_unique,n_selected) )
  H1(:,:,:) = 0.d0
 !$OMP END SINGLE
 !$OMP DO
  do n = 1, n_selected
    na = numalpha_selected(n)
    nb = numbeta_selected (n)
    do i = 1, n_det_alpha_unique
      do j = 1, n_det_beta_unique
        do l = 1, n_det_beta_unique
          do k = 1, n_det_alpha_unique
            H1(k,l,n) += Htot(k,l,i,j) * Uref(i,na) * Vref(j,nb)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP SINGLE
  allocate( H2(n_det_alpha_unique,n_det_beta_unique,n_toselect) )
  H2(:,:,:) = 0.d0
 !$OMP END SINGLE
 !$OMP DO
  do n = 1, n_toselect
    na = numalpha_toselect(n)
    nb = numbeta_toselect (n)
    do i = 1, n_det_alpha_unique
      do j = 1, n_det_beta_unique
        do l = 1, n_det_beta_unique
          do k = 1, n_det_alpha_unique
            H2(k,l,n) += Htot(k,l,i,j) * Uref(i,na) * Vref(j,nb)
            Hdiag(n)  += Htot(k,l,i,j) * Uref(i,na) * Vref(j,nb) * Uref(k,na) * Vref(l,nb)
          enddo
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO

 !$OMP SINGLE
  deallocate( Htot )
 !$OMP END SINGLE

 !$OMP DO
  do m = 1, n_selected
    ma = numalpha_selected(m)
    mb = numbeta_selected (m)
    do n = 1, n_toselect
      do k = 1, n_det_alpha_unique
        do l = 1, n_det_beta_unique
          Hkl(m,n) += H2(k,l,n) * Uref(k,ma) * Vref(l,mb)
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP SINGLE
  deallocate( H2 )
 !$OMP END SINGLE

 !$OMP DO
  do m = 1, n_selected
    ma = numalpha_selected(m)
    mb = numbeta_selected (m)
    do n = 1, n_selected
      do k = 1, n_det_alpha_unique
        do l = 1, n_det_beta_unique
          H0(m,n) += H1(k,l,n) * Uref(k,ma) * Vref(l,mb)
        enddo
      enddo
    enddo
  enddo
 !$OMP END DO
 !$OMP SINGLE
  deallocate( H1 )
 !$OMP END SINGLE

 !$OMP END PARALLEL

  return

end subroutine const_Hdiag_Hkl_H0