program buildpsi_diagSVDit_modif_v2

  implicit none

  BEGIN_DOC
  ! perturbative approach to build psi_postsvd
  END_DOC

  read_wf = .True.
  TOUCH read_wf

  PROVIDE N_int

  call run()
end


subroutine run

  USE OMP_LIB

  implicit none

  integer(bit_kind)             :: det1(N_int,2), det2(N_int,2)
  integer                       :: degree, i_state

  integer                       :: i, j, k, l, ii, jj, na, nb

  double precision              :: norm_psi, inv_sqrt_norm_psi
  double precision, allocatable :: Uref(:,:), Dref(:), Vtref(:,:), Aref(:,:), Vref(:,:)

  double precision              :: err0, err_tmp, e_tmp, E0, overlap, E0_old, tol_energy
  double precision              :: ctmp, htmp, Ept2
  double precision              :: E0_postsvd, overlap_postsvd, E_prev
  double precision              :: norm_coeff_psi, inv_sqrt_norm_coeff_psi
  double precision              :: overlapU, overlapU_mat, overlapV, overlapV_mat, overlap_psi

  double precision, allocatable :: H_diag(:,:), Hkl(:,:), H0(:,:), H(:,:,:,:)
  double precision, allocatable :: psi_postsvd(:,:), coeff_psi_perturb(:)

  integer                       :: n_FSVD, n_selected, n_toselect, n_tmp, it_svd, it_svd_max
  integer                       :: n_selected2
  integer,          allocatable :: numalpha_selected(:), numbeta_selected(:)
  integer,          allocatable :: numalpha_toselect(:), numbeta_toselect(:)
  integer,          allocatable :: numalpha_tmp(:), numbeta_tmp(:)

  integer(kind=8)               :: W_tbeg, W_tend, W_tbeg_it, W_tend_it, W_ir
  real(kind=8)                  :: W_tot_time, W_tot_time_it
  real(kind=8)                  :: CPU_tbeg, CPU_tend, CPU_tbeg_it, CPU_tend_it
  real(kind=8)                  :: CPU_tot_time, CPU_tot_time_it
  real(kind=8)                  :: speedup, speedup_it
  integer                       :: nb_taches

  !$OMP PARALLEL
  nb_taches = OMP_GET_NUM_THREADS()
  !$OMP END PARALLEL

  call CPU_TIME(CPU_tbeg)
  call SYSTEM_CLOCK(COUNT=W_tbeg, COUNT_RATE=W_ir)

  i_state = 1

  ! ---------------------------------------------------------------------------------------
  !                      construct the initial CISD matrix

  print *, ' ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~'
  print *, ' CI matrix:', n_det_alpha_unique,'x',n_det_beta_unique
  print *, ' N det    :', N_det
  print *, ' ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~'

  norm_psi = 0.d0
  do k = 1, N_det
    norm_psi = norm_psi + psi_bilinear_matrix_values(k,i_state) &
                        * psi_bilinear_matrix_values(k,i_state)
  enddo
  print *, ' initial norm = ', norm_psi

  allocate( Aref(n_det_alpha_unique,n_det_beta_unique) )
  Aref(:,:) = 0.d0
  do k = 1, N_det
    i         = psi_bilinear_matrix_rows(k)
    j         = psi_bilinear_matrix_columns(k)
    Aref(i,j) = psi_bilinear_matrix_values(k,i_state)
  enddo

  ! ---------------------------------------------------------------------------------------


  ! ---------------------------------------------------------------------------------------
  !                                  perform a Full SVD

  allocate( Uref(n_det_alpha_unique,n_det_alpha_unique) )
  allocate( Dref(max(n_det_beta_unique,n_det_alpha_unique)) )
  allocate( Vref(n_det_beta_unique,n_det_beta_unique)  )
  allocate( Vtref(n_det_beta_unique,n_det_beta_unique) )
  call svd_s(Aref, size(Aref,1), Uref, size(Uref,1), Dref, Vtref, size(Vtref,1)          &
       , n_det_alpha_unique, n_det_beta_unique)

  print *, ' ~~~~~~~~~~~~~~~~~~~~~~~~~~~ '

  do l = 1, n_det_beta_unique
    do i = 1, n_det_beta_unique
      Vref(i,l) = Vtref(l,i)
    enddo
  enddo
  deallocate( Vtref )

  ! Truncated rank
  n_selected  = n_svd
  call write_int(6,n_svd, 'Rank of psi')


  !________________________________________________________________________________________________________
  !
  !                                   loop over SVD iterations
  !________________________________________________________________________________________________________

  tol_energy = 1.d0
  it_svd     = 0
  it_svd_max = 100
  E_prev = 0.d0

  allocate(H(n_selected,n_selected,n_det_alpha_unique,n_det_beta_unique))
  allocate(H_diag(n_det_alpha_unique,n_det_beta_unique))
  allocate(psi_postsvd(n_det_alpha_unique,n_det_beta_unique))
  do while( ( it_svd .lt. it_svd_max) .and. ( tol_energy .gt. 1d-6 ) )

    call CPU_TIME(CPU_tbeg_it)
    call SYSTEM_CLOCK(COUNT=W_tbeg_it, COUNT_RATE=W_ir)

    it_svd = it_svd + 1

    double precision :: norm
    norm = 0.d0
    do j = 1, n_selected
      norm = norm + Dref(j)*Dref(j)
    enddo
    Dref = Dref / dsqrt(norm)

!    print *,  '-- Compute H --'
    call const_H_uv(Uref, Vref, H, H_diag, n_selected)

    ! H0(i,j) = < u_i v_j | H | u_i v_j >
    ! E0 = < psi_0 | H | psi_0 >
    E0 = 0.d0
    do j = 1, n_selected
      do i = 1, n_selected
         E0 = E0 + Dref(i) * H(i,i,j,j) * Dref(j)
      enddo
    enddo
    E0 = E0 + nuclear_repulsion
!    print *,' E0   =', E0

    double precision, allocatable :: eigval0(:)
    double precision, allocatable :: eigvec0(:,:,:)
    double precision, allocatable :: H_tmp(:,:,:,:)

    allocate( H_tmp(n_selected,n_selected,n_selected,n_selected) )
    do l=1,n_selected
     do k=1,n_selected
      do j=1,n_selected
       do i=1,n_selected
        H_tmp(i,j,k,l) = H(i,j,k,l)
       enddo
      enddo
     enddo
    enddo
    allocate( eigval0(n_selected**2),eigvec0(n_selected,n_selected,n_selected**2))
    eigvec0 = 0.d0

!    print *,  ' --- Diag post-SVD --- '
    call lapack_diag(eigval0, eigvec0, H_tmp, n_selected**2, n_selected**2)
!    print *, 'eig =', eigval0(1) + nuclear_repulsion
    deallocate(H_tmp, eigval0)

!    print *,  ' --- SVD --- '
    Dref = 0.d0
    call perform_newpostSVD(n_selected, eigvec0(1,1,1), Uref, Vref, Dref)
    deallocate(eigvec0)

!    print *,  ' --- Compute H --- '
    call const_H_uv(Uref, Vref, H, H_diag, n_selected)

    ! H0(i,j) = < u_i v_j | H | u_i v_j >
    ! E0 = < psi_0 | H | psi_0 >
    E0 = 0.d0
    norm = 0.d0
    do j = 1, n_det_beta_unique
      do i = 1, n_selected
         E0 = E0 + Dref(i) * H(i,i,j,j) * Dref(j)
      enddo
      norm = norm + Dref(j)*Dref(j)
    enddo
    E0 = E0 + nuclear_repulsion
!    print *,' E0   =', E0
!    print *,' norm =', norm

!    print *,  ' --- Perturbation --- '
    psi_postsvd = 0.d0
    do i=1,n_selected
      psi_postsvd(i,i) = Dref(i)
    enddo

    Ept2 = 0.d0
    do j=1,n_selected
      do i=n_selected+1,n_det_alpha_unique
        ctmp = 0.d0
        do l=1,n_selected
          do k=1,n_selected
            ctmp = ctmp + H(k,l,i,j) * psi_postsvd(k,l)
          enddo
        enddo
        psi_postsvd(i,j) = ctmp / (E0 - (H_diag(i,j)+nuclear_repulsion) )
        Ept2            += ctmp*ctmp / (E0 - (H_diag(i,j)+nuclear_repulsion) )
      enddo
    enddo

    do j=n_selected+1,n_det_beta_unique
      do i=1,n_selected
        ctmp = 0.d0
        do l=1,n_selected
          do k=1,n_selected
            ctmp = ctmp + H(k,l,i,j) * psi_postsvd(k,l)
          enddo
        enddo
        psi_postsvd(i,j) = ctmp / (E0 - (H_diag(i,j)+nuclear_repulsion) )
        Ept2            += ctmp*ctmp / (E0 - (H_diag(i,j)+nuclear_repulsion) )
      enddo
    enddo
!    do j=n_selected+1,n_det_beta_unique
!      do i=n_selected+1,n_det_alpha_unique
!        ctmp = 0.d0
!        do l=1,n_selected
!          do k=1,n_selected
!            ctmp = ctmp + H(k,l,i,j) * psi_postsvd(k,l)
!          enddo
!        enddo
!        psi_postsvd(i,j) = ctmp / (E0 - (H(i,j,i,j)+nuclear_repulsion) )
!        Ept2            += ctmp*ctmp / (E0 - (H(i,j,i,j)+nuclear_repulsion) )
!      enddo
!    enddo

    tol_energy = dabs(E_prev - E0)
    print '(I5, 3(3X, F20.10))',  it_svd, E0, E0 + Ept2, tol_energy
    E_prev = E0

!    print *,  ' --- SVD --- '
    call perform_newpostSVD(n_det_beta_unique, psi_postsvd, Uref, Vref, Dref)

  end do


end


subroutine perform_newpostSVD(n_selected, psi_postsvd, Uref, Vref, Dref)

  USE OMP_LIB

  integer, intent(in)             :: n_selected
  double precision, intent(in)    :: psi_postsvd(n_selected,n_selected)
  double precision, intent(inout) :: Uref(n_det_alpha_unique,n_det_alpha_unique)
  double precision, intent(inout) :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(inout) :: Dref(max(n_det_beta_unique,n_det_alpha_unique))

  integer                         :: mm, nn, i, j, ii0, ii, l, jj, na, nb
  double precision                :: err0, err_norm, err_tmp, norm_tmp
  double precision                :: overlapU_mat, overlapV_mat, overlapU, overlapV
  double precision, allocatable   :: S_mat(:,:), SxVt(:,:)
  double precision, allocatable   :: U_svd(:,:), V_svd(:,:)
  double precision, allocatable   :: U_newsvd(:,:), V_newsvd(:,:), Vt_newsvd(:,:), D_newsvd(:), A_newsvd(:,:)

  mm = n_det_alpha_unique
  nn = n_det_beta_unique

  allocate( U_svd(mm,n_selected) , V_svd(nn,n_selected) , S_mat(n_selected,n_selected) )

  U_svd(1:mm,1:n_selected) = Uref(1:mm,1:n_selected)
  V_svd(1:nn,1:n_selected) = Vref(1:nn,1:n_selected)

  S_mat(1:n_selected,1:n_selected) = psi_postsvd(1:n_selected,1:n_selected)

  ! first compute S_mat x transpose(V_svd)
  allocate( SxVt(n_selected,nn) )
  call dgemm( 'N', 'T', n_selected, nn, n_selected, 1.d0     &
             , S_mat     , size(S_mat,1)     &
             , V_svd     , size(V_svd,1)     &
             , 0.d0, SxVt, size(SxVt ,1)     )
  deallocate(S_mat)

  ! then compute U_svd x SxVt
  allocate( A_newsvd(mm,nn) )
  call dgemm( 'N', 'N', mm, nn, n_selected, 1.d0            &
             , U_svd         , size(U_svd   ,1)     &
             , SxVt          , size(SxVt    ,1)     &
             , 0.d0, A_newsvd, size(A_newsvd,1)     )
  deallocate( SxVt )

  ! perform new SVD
  allocate( U_newsvd(mm,mm), Vt_newsvd(nn,nn), D_newsvd(max(mm,nn)) )
  call svd_s( A_newsvd, size(A_newsvd,1), &
              U_newsvd, size(U_newsvd,1), &
              D_newsvd, &
              Vt_newsvd, size(Vt_newsvd,1), &
              mm, nn)
  deallocate(A_newsvd)

  allocate( V_newsvd(nn,nn) )
  do l = 1, nn
    do j = 1, nn
      V_newsvd(j,l) = Vt_newsvd(l,j)
    enddo
  enddo
  deallocate(Vt_newsvd)

  do l = 1, n_selected
    Dref(l)   = D_newsvd(l)
    Uref(1:mm,l) = U_newsvd(1:mm,l)
    Vref(1:nn,l) = V_newsvd(1:nn,l)
  enddo

  deallocate(U_newsvd)
  deallocate(V_newsvd)
  deallocate(D_newsvd)

end subroutine perform_newpostSVD


subroutine const_H_uv(Uref, Vref, H, H_diag, n_selected)

  USE OMP_LIB

  implicit none

  integer, intent(in)            :: n_selected
  double precision, intent(in)   :: Uref(n_det_alpha_unique,n_det_alpha_unique)
  double precision, intent(in)   :: Vref(n_det_beta_unique ,n_det_beta_unique)
  double precision, intent(out)  :: H(n_selected,n_selected, n_det_alpha_unique, n_det_beta_unique)
  double precision, intent(out)  :: H_diag(n_det_alpha_unique,n_det_beta_unique)

  integer(bit_kind)              :: det1(N_int,2), det2(N_int,2)
  integer                        :: i, j, k, l, degree
  integer                        :: ii0, jj0, ii, jj, n, m, np, mp
  integer                        :: nn0, mm0, na, nb, mm, ind_gs
  integer                        :: p,q,r,s
  double precision               :: h12, x

  double precision, allocatable  :: H0(:,:,:,:)
  double precision, allocatable  :: H1(:,:,:,:)
  double precision, allocatable  :: tmp3(:,:,:)
  double precision, allocatable  :: tmp1(:,:), tmp0(:,:)
  double precision :: c_tmp


  na = n_det_alpha_unique
  nb = n_det_beta_unique

  det1(:,1) = psi_det_alpha_unique(:,1)
  det2(:,1) = psi_det_alpha_unique(:,1)
  det1(:,2) = psi_det_beta_unique(:,1)
  det2(:,2) = psi_det_beta_unique(:,1)
  call i_H_j(det1, det2, N_int, h12)

  call wall_time(t0)
  tmp3 = 0.d0

  allocate( H0(na,nb,n_selected,n_selected) )
  allocate (tmp3(nb,nb,nb))
  H0 = 0.d0

  !$OMP PARALLEL DEFAULT(NONE)                                       &
      !$OMP          PRIVATE(i,j,k,l,m,n,det1,det2,degree,h12,c_tmp,tmp1,tmp0)&
      !$OMP          SHARED(na,nb,psi_det_alpha_unique,psi_det_beta_unique,&
      !$OMP                 N_int,tmp3,Uref,Vref,H_diag,H0,n_selected)

  allocate(tmp1(na,na), tmp0(na,na))

  do i=1,na
    do m=1,na
      tmp1(m,i) = Uref(i,m)
    enddo
  enddo

  !$OMP DO
  do l = 1, nb
    det2(:,2) = psi_det_beta_unique(:,l)

    do j = 1, nb
      det1(:,2) = psi_det_beta_unique(:,j)

      call get_excitation_degree_spin(det1(1,2),det2(1,2),degree,N_int)
      if (degree > 2) cycle

      do k = 1, na
        det2(:,1) = psi_det_alpha_unique(:,k)

        do i = 1, na
          det1(:,1) = psi_det_alpha_unique(:,i)

          call get_excitation_degree(det1,det2,degree,N_int)
          if ( degree > 2) cycle

          call i_H_j(det1, det2, N_int, h12)

          do m=1,nb
            tmp3(m,j,l) = tmp3(m,j,l) + h12 * tmp1(m,i) * tmp1(m,k)
          enddo

          do n=1,n_selected
            c_tmp = h12 * Vref(j,n)
            do m=1,n_selected
              H0(k,l,m,n) = H0(k,l,m,n) + c_tmp * tmp1(m,i)
            enddo
          enddo

        enddo
      enddo
    enddo
  enddo
  !$OMP END DO

  !$OMP DO
  do m=1,nb
    do l=1,nb
      do j=1,nb
        tmp1(j,l) = tmp3(m,j,l)
      enddo
    enddo
    call DGEMM('N','N',nb,nb,nb,1.d0,                                &
        tmp1, size(tmp1,1),                                          &
        Vref, size(Vref,1),                                          &
        0.d0, tmp0, size(tmp0,1))

    do n=1,nb
      H_diag(m,n) = 0.d0
      do j=1,nb
        H_diag(m,n) = H_diag(m,n) + tmp0(j,n) * Vref(j,n)
      enddo
    enddo
  enddo
  !$OMP END DO
  deallocate(tmp1, tmp0)
  !$OMP END PARALLEL

  call wall_time(t1)


  allocate( H1(nb,n_selected,n_selected,na) )
  call DGEMM('T','N', nb * n_selected * n_selected, na, na,          &
      1.d0, H0, size(H0,1), Uref, size(Uref,1), 0.d0, H1, size(H1,1)*size(H1,2)*size(H1,3))
  deallocate( H0 )

  ! (l,p,q,r) -> (p,q,r,s)
  call DGEMM('T','N',  n_selected * n_selected * na, nb, nb,         &
      1.d0, H1, size(H1,1), Vref, size(Vref,1), 0.d0, H, size(H,1)*size(H,2)*size(H,3))

!  do j=1,n_selected
!    do i=1,n_selected
!      print *, H_diag(i,j), H(i,j,i,j)
!    enddo
!  enddo
  deallocate(H1)

  call wall_time(t2)
!  print *, 't=', t1-t0, t2-t1
  double precision               :: t0, t1, t2
!  stop
end