4
1
mirror of https://github.com/pfloos/quack synced 2025-01-05 11:00:21 +01:00

optimized ppBSE kernels for Davidson

This commit is contained in:
Abdallah Ammar 2024-09-11 00:25:42 +02:00
parent 4f24625c9d
commit 13d9ce5eda
3 changed files with 719 additions and 298 deletions

View File

@ -139,9 +139,9 @@ subroutine RGW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nOrb,nC,nO,nV,nR,nS
call ppLR(TDA,nOO,nVV,Bpp,Cpp,Dpp,Om1,X1,Y1,Om2,X2,Y2,EcBSE(ispin)) call ppLR(TDA,nOO,nVV,Bpp,Cpp,Dpp,Om1,X1,Y1,Om2,X2,Y2,EcBSE(ispin))
deallocate(Bpp,Cpp,Dpp,KB_sta,KC_sta,KD_sta) deallocate(Bpp,Cpp,Dpp,KB_sta,KC_sta,KD_sta)
print*, 'LAPACK:' !print*, 'LAPACK:'
print*, Om2 !print*, Om2
print*, Om1 !print*, Om1
! --- ! ---
@ -151,46 +151,46 @@ subroutine RGW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nOrb,nC,nO,nV,nR,nS
! Davidson ! Davidson
! --- ! ---
n_states = nOO + 5 !n_states = nOO + 5
n_states_diag = n_states + 4 !n_states_diag = n_states + 4
allocate(Om(nOO+nVV), R(nOO+nVV,n_states_diag)) !allocate(Om(nOO+nVV), R(nOO+nVV,n_states_diag))
supp_data_int = 1 !supp_data_int = 1
allocate(supp_data_int(supp_data_int_size)) !allocate(supp_data_int(supp_data_int_size))
supp_data_int(1) = nS !supp_data_int(1) = nS
supp_data_dbl_size = nS + nOrb*nOrb*nS + 1 !supp_data_dbl_size = nS + nOrb*nOrb*nS + 1
allocate(supp_data_dbl(supp_data_dbl_size)) !allocate(supp_data_dbl(supp_data_dbl_size))
! scalars !! scalars
supp_data_dbl(1) = eta !supp_data_dbl(1) = eta
i_data = 1 !i_data = 1
! rho_RPA !! rho_RPA
do m = 1, nS !do q = 1, nOrb
do q = 1, nOrb ! do p = 1, nOrb
do p = 1, nOrb ! do m = 1, nS
i_data = i_data + 1 ! i_data = i_data + 1
supp_data_dbl(i_data) = rho_RPA(p,q,m) ! supp_data_dbl(i_data) = rho_RPA(p,q,m)
enddo ! enddo
enddo ! enddo
enddo !enddo
! OmRPA !! OmRPA
do m = 1, nS !do m = 1, nS
i_data = i_data + 1 ! i_data = i_data + 1
supp_data_dbl(i_data) = OmRPA(m) ! supp_data_dbl(i_data) = OmRPA(m)
enddo !enddo
call ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, & !call ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, &
1.d0, & ! lambda ! 1.d0, & ! lambda
eGW(1), & ! eGW(1), &
0.d0, & ! eF ! 0.d0, & ! eF
ERI(1,1,1,1), & ! ERI(1,1,1,1), &
supp_data_int(1), supp_data_int_size, & ! supp_data_int(1), supp_data_int_size, &
supp_data_dbl(1), supp_data_dbl_size, & ! supp_data_dbl(1), supp_data_dbl_size, &
Om(1), R(1,1), n_states, n_states_diag, "GW") ! Om(1), R(1,1), n_states, n_states_diag, "GW")
deallocate(Om, R) !deallocate(Om, R)
deallocate(supp_data_dbl, supp_data_int) !deallocate(supp_data_dbl, supp_data_int)
stop !stop
! --- ! ---
@ -274,9 +274,9 @@ subroutine RGW_ppBSE(TDA_W,TDA,dBSE,dTDA,singlet,triplet,eta,nOrb,nC,nO,nV,nR,nS
!supp_data_dbl(1) = eta !supp_data_dbl(1) = eta
!i_data = 1 !i_data = 1
!! rho_RPA !! rho_RPA
!do m = 1, nS
!do q = 1, nOrb !do q = 1, nOrb
! do p = 1, nOrb ! do p = 1, nOrb
! do m = 1, nS
! i_data = i_data + 1 ! i_data = i_data + 1
! supp_data_dbl(i_data) = rho_RPA(p,q,m) ! supp_data_dbl(i_data) = rho_RPA(p,q,m)
! enddo ! enddo

View File

@ -14,251 +14,108 @@ subroutine ppLR_GW_HR_calc(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
double precision, intent(in) :: lambda, eF, eta double precision, intent(in) :: lambda, eF, eta
double precision, intent(in) :: e(nOrb) double precision, intent(in) :: e(nOrb)
double precision, intent(in) :: ERI(nOrb,nOrb,nOrb,nOrb) double precision, intent(in) :: ERI(nOrb,nOrb,nOrb,nOrb)
double precision, intent(in) :: rho(nOrb,nOrb,nS), Om(nS) double precision, intent(in) :: rho(nS,nOrb,nOrb), Om(nS)
double precision, intent(in) :: U(nOO+nVV,n_states_diag) double precision, intent(in) :: U(nOO+nVV,n_states_diag)
double precision, intent(out) :: W(nOO+nVV,n_states_diag) double precision, intent(out) :: W(nOO+nVV,n_states_diag)
integer :: i, j, ij, k, l, kl integer :: i, j, ij
integer :: a, b, c, d, ab, cd integer :: ab
integer :: m integer :: m
integer :: state integer :: state
double precision :: mat_tmp, chi, eps double precision :: eta2
double precision :: t1, t2
double precision :: diff_tot, diff_loc
double precision, allocatable :: M_ref(:,:)
double precision, allocatable :: Bpp_ref(:,:), Cpp_ref(:,:), Dpp_ref(:,:)
double precision, allocatable :: KB_sta(:,:), KC_sta(:,:), KD_sta(:,:)
double precision, allocatable :: W_ref(:,:)
double precision, allocatable :: rho_t(:,:,:)
double precision, external :: Kronecker_delta ! call wall_time(t1)
if(ispin .eq. 1) then if((nOO+nVV) .le. 20000) then
ab = 0 call ppLR_GW_HR_calc_oneshot(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e(1), eF, n_states_diag, &
do a = nO+1, nOrb-nR ERI(1,1,1,1), eta, rho(1,1,1), Om(1), U(1,1), W(1,1))
do b = a, nOrb-nR
ab = ab + 1
do state = 1, n_states_diag
W(ab,state) = 0.d0
cd = 0
do c = nO+1, nOrb-nR
do d = c, nOrb-nR
cd = cd + 1
mat_tmp = (e(a) + e(b) - eF) * Kronecker_delta(a, c) * Kronecker_delta(b, d) &
+ lambda * (ERI(a,b,c,d) + ERI(a,b,d,c)) / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(c, d)))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(a,c,m) * rho(b,d,m) * Om(m) / eps &
- rho(a,d,m) * rho(b,c,m) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(c, d)))
W(ab,state) = W(ab,state) + mat_tmp * U(cd,state)
enddo
enddo
ij = nVV
do i = nC+1, nO
do j = i, nO
ij = ij + 1
mat_tmp = lambda * (ERI(a,b,i,j) + ERI(a,b,j,i)) / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(i, j)))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(i,a,m) * rho(j,b,m) * Om(m) / eps &
- rho(i,b,m) * rho(a,j,m) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(i, j)))
W(ab,state) = W(ab,state) - mat_tmp * U(ij,state)
enddo
enddo
enddo ! state
enddo ! b
enddo ! a
! ---
ij = nVV
do i = nC+1, nO
do j = i, nO
ij = ij + 1
do state = 1, n_states_diag
W(ij,state) = 0.d0
ab = 0
do a = nO+1, nOrb-nR
do b = a, nOrb-nR
ab = ab + 1
mat_tmp = lambda * (ERI(a,b,i,j) + ERI(a,b,j,i)) / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(i, j)))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(i,a,m) * rho(j,b,m) * Om(m) / eps &
- rho(i,b,m) * rho(a,j,m) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(i, j)))
W(ij,state) = W(ij,state) + mat_tmp * U(ab,state)
enddo
enddo
kl = nVV
do k = nC+1, nO
do l = k, nO
kl = kl + 1
mat_tmp = - (e(i) + e(j) - eF) * Kronecker_delta(i, k) * Kronecker_delta(j, l) &
+ lambda * (ERI(i,j,k,l) + ERI(i,j,l,k)) / dsqrt( (1.d0 + Kronecker_delta(i, j)) &
* (1.d0 + Kronecker_delta(k, l)))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(i,k,m) * rho(j,l,m) * Om(m) / eps &
- rho(i,l,m) * rho(j,k,m) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(i, j)) &
* (1.d0 + Kronecker_delta(k, l)))
W(ij,state) = W(ij,state) - mat_tmp * U(kl,state)
enddo
enddo
enddo ! state
enddo ! j
enddo ! i
elseif(ispin .eq. 2) then
ab = 0
do a = nO+1, nOrb-nR
do b = a+1, nOrb-nR
ab = ab + 1
do state = 1, n_states_diag
W(ab,state) = 0.d0
cd = 0
do c = nO+1, nOrb-nR
do d = c+1, nOrb-nR
cd = cd + 1
mat_tmp = (e(a) + e(b) - eF) * Kronecker_delta(a, c) * Kronecker_delta(b, d) &
+ lambda * (ERI(a,b,c,d) - ERI(a,b,d,c))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(a,c,m) * rho(b,d,m) * Om(m) / eps &
+ rho(a,d,m) * rho(b,c,m) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ab,state) = W(ab,state) + mat_tmp * U(cd,state)
enddo
enddo
ij = nVV
do i = nC+1, nO
do j = i+1, nO
ij = ij + 1
mat_tmp = lambda * (ERI(a,b,i,j) - ERI(a,b,j,i))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(i,a,m) * rho(j,b,m) * Om(m) / eps &
+ rho(i,b,m) * rho(a,j,m) * Om(m) / eps
end do
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ab,state) = W(ab,state) - mat_tmp * U(ij,state)
enddo
enddo
enddo ! state
enddo ! b
enddo ! a
! ---
ij = nVV
do i = nC+1, nO
do j = i+1, nO
ij = ij + 1
do state = 1, n_states_diag
W(ij,state) = 0.d0
ab = 0
do a = nO+1, nOrb-nR
do b = a+1, nOrb-nR
ab = ab + 1
mat_tmp = lambda * (ERI(a,b,i,j) - ERI(a,b,j,i))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(i,a,m) * rho(j,b,m) * Om(m) / eps &
+ rho(i,b,m) * rho(a,j,m) * Om(m) / eps
end do
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ij,state) = W(ij,state) + mat_tmp * U(ab,state)
enddo
enddo
kl = nVV
do k = nC+1, nO
do l = k+1, nO
kl = kl + 1
mat_tmp = - (e(i) + e(j) - eF) * Kronecker_delta(i, k) * Kronecker_delta(j, l) &
+ lambda * (ERI(i,j,k,l) - ERI(i,j,l,k))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta**2
chi = chi - rho(i,k,m) * rho(j,l,m) * Om(m) / eps &
+ rho(i,l,m) * rho(j,k,m) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ij,state) = W(ij,state) - mat_tmp * U(kl,state)
enddo
enddo
enddo ! state
enddo ! j
enddo ! i
else else
print*, ' Error in ppLR_GW_HR_calc' call ppLR_GW_HR_calc_batches(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e(1), eF, n_states_diag, &
print*, ' ispin is not supported' ERI(1,1,1,1), eta, rho(1,1,1), Om(1), U(1,1), W(1,1))
print*, ' ispin = ', ispin
stop
endif endif
! print*, ' debug ppLR_GW_H_diag:'
! allocate(M_ref(nOO+nVV,nOO+nVV))
! allocate(Bpp_ref(nVV,nOO), Cpp_ref(nVV,nVV), Dpp_ref(nOO,nOO))
! allocate(KB_sta(nVV,nOO), KC_sta(nVV,nVV), KD_sta(nOO,nOO))
! allocate(rho_t(nOrb,nOrb,nS))
! allocate(W_ref(nOO+nVV,n_states_diag))
!
! call ppLR_C(ispin, nOrb, nC, nO, nOrb-nO, nR, nVV, 1d0, e, ERI, Cpp_ref)
! call ppLR_D(ispin, nOrb, nC, nO, nOrb-nO, nR, nOO, 1d0, e, ERI, Dpp_ref)
! call ppLR_B(ispin, nOrb, nC, nO, nOrb-nO, nR, nOO, nVV, 1d0, ERI, Bpp_ref)
!
! do j = 1, nOrb
! do i = 1, nOrb
! do m = 1, nS
! rho_t(i,j,m) = rho(m,i,j)
! enddo
! enddo
! enddo
!
! call RGW_ppBSE_static_kernel_C(ispin, eta, nOrb, nC, nO, nOrb-nO, nR, nS, nVV, 1.d0, ERI, Om, rho_t, KC_sta)
! call RGW_ppBSE_static_kernel_D(ispin, eta, nOrb, nC, nO, nOrb-nO, nR, nS, nOO, 1.d0, ERI, Om, rho_t, KD_sta)
! call RGW_ppBSE_static_kernel_B(ispin, eta, nOrb, nC, nO, nOrb-nO, nR, nS, nOO, nVV, 1.d0, ERI, Om, rho_t, KB_sta)
!
! Cpp_ref = Cpp_ref + KC_sta
! Dpp_ref = Dpp_ref + KD_sta
! Bpp_ref = Bpp_ref + KB_sta
!
! M_ref = 0.d0
! M_ref( 1:nVV , 1:nVV) = + Cpp_ref(1:nVV,1:nVV)
! M_ref(nVV+1:nVV+nOO,nVV+1:nVV+nOO) = - Dpp_ref(1:nOO,1:nOO)
! M_ref( 1:nVV ,nVV+1:nOO+nVV) = - Bpp_ref(1:nVV,1:nOO)
! M_ref(nVV+1:nOO+nVV, 1:nVV) = + transpose(Bpp_ref(1:nVV,1:nOO))
!
! call dgemm('N', 'N', nOO+nVV, n_states_diag, nOO+nVV, 1.d0, &
! M_ref(1,1), size(M_ref, 1), U(1,1), size(U, 1), &
! 0.d0, W_ref(1,1), size(W_ref, 1))
!
! diff_tot = 0.d0
! do state = 1, n_states_diag
! do ab = 1, nOO
! diff_loc = dabs(W(ab,state) - W_ref(ab,state))
! if(diff_loc .gt. 1d-10) then
! print*, ' important diff on:', ab, state
! print*, W(ab,state), W_ref(ab,state)
! stop
! endif
! diff_tot = diff_tot + diff_loc
! enddo
! do ij = nVV+1, nVV+nOO
! diff_loc = dabs(W(ij,state) - W_ref(ij,state))
! if(diff_loc .gt. 1d-10) then
! print*, ' important diff on:', ij, state
! print*, W(ij,state), W_ref(ij,state)
! stop
! endif
! diff_tot = diff_tot + diff_loc
! enddo
! enddo
! print*, 'diff_tot = ', diff_tot
!
! deallocate(M_ref)
! deallocate(Bpp_ref, Cpp_ref, Dpp_ref)
! deallocate(KB_sta, KC_sta, KD_sta)
! deallocate(W_ref)
! deallocate(rho_t)
! call wall_time(t2)
! write(*,'(A50, F12.4)') 'total wall time for ppLR_GW_HR_calc (sec): ', t2-t1
return return
end end
@ -273,16 +130,18 @@ subroutine ppLR_GW_H_diag(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
double precision, intent(in) :: lambda, eF, eta double precision, intent(in) :: lambda, eF, eta
double precision, intent(in) :: e(nOrb) double precision, intent(in) :: e(nOrb)
double precision, intent(in) :: ERI(nOrb,nOrb,nOrb,nOrb) double precision, intent(in) :: ERI(nOrb,nOrb,nOrb,nOrb)
double precision, intent(in) :: rho(nOrb,nOrb,nS), Om(nS) double precision, intent(in) :: rho(nS,nOrb,nOrb), Om(nS)
double precision, intent(out) :: H_diag(nOO+nVV) double precision, intent(out) :: H_diag(nOO+nVV)
integer :: i, j, ij, k, l, kl integer :: i, j, ij, k, l, kl
integer :: a, b, c, d, ab, cd integer :: a, b, c, d, ab, cd
integer :: m integer :: m
double precision :: chi, eps double precision :: chi, eps
double precision :: t1, t2
double precision, external :: Kronecker_delta double precision, external :: Kronecker_delta
call wall_time(t1)
if(ispin .eq. 1) then if(ispin .eq. 1) then
@ -302,8 +161,8 @@ subroutine ppLR_GW_H_diag(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
chi = 0.d0 chi = 0.d0
do m = 1, nS do m = 1, nS
eps = Om(m)**2 + eta**2 eps = Om(m)**2 + eta**2
chi = chi - rho(a,c,m) * rho(b,d,m) * Om(m) / eps & chi = chi - rho(m,a,c) * rho(m,b,d) * Om(m) / eps &
- rho(a,d,m) * rho(b,c,m) * Om(m) / eps - rho(m,a,d) * rho(m,b,c) * Om(m) / eps
end do end do
H_diag(ab) = H_diag(ab) + 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(a, b)) & H_diag(ab) = H_diag(ab) + 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(a, b)) &
* (1.d0 + Kronecker_delta(c, d))) * (1.d0 + Kronecker_delta(c, d)))
@ -328,8 +187,8 @@ subroutine ppLR_GW_H_diag(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
chi = 0.d0 chi = 0.d0
do m = 1, nS do m = 1, nS
eps = Om(m)**2 + eta**2 eps = Om(m)**2 + eta**2
chi = chi - rho(i,k,m) * rho(j,l,m) * Om(m) / eps & chi = chi - rho(m,i,k) * rho(m,j,l) * Om(m) / eps &
- rho(i,l,m) * rho(j,k,m) * Om(m) / eps - rho(m,i,l) * rho(m,j,k) * Om(m) / eps
enddo enddo
H_diag(ij) = H_diag(ij) - 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(i, j)) & H_diag(ij) = H_diag(ij) - 4.d0 * lambda * chi / dsqrt( (1.d0 + Kronecker_delta(i, j)) &
* (1.d0 + Kronecker_delta(k, l))) * (1.d0 + Kronecker_delta(k, l)))
@ -354,8 +213,8 @@ subroutine ppLR_GW_H_diag(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
chi = 0.d0 chi = 0.d0
do m = 1, nS do m = 1, nS
eps = Om(m)**2 + eta**2 eps = Om(m)**2 + eta**2
chi = chi - rho(a,c,m) * rho(b,d,m) * Om(m) / eps & chi = chi - rho(m,a,c) * rho(m,b,d) * Om(m) / eps &
+ rho(a,d,m) * rho(b,c,m) * Om(m) / eps + rho(m,a,d) * rho(m,b,c) * Om(m) / eps
enddo enddo
H_diag(ab) = H_diag(ab) + 4.d0 * lambda * chi H_diag(ab) = H_diag(ab) + 4.d0 * lambda * chi
enddo enddo
@ -377,8 +236,8 @@ subroutine ppLR_GW_H_diag(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
chi = 0.d0 chi = 0.d0
do m = 1, nS do m = 1, nS
eps = Om(m)**2 + eta**2 eps = Om(m)**2 + eta**2
chi = chi - rho(i,k,m) * rho(j,l,m) * Om(m) / eps & chi = chi - rho(m,i,k) * rho(m,j,l) * Om(m) / eps &
+ rho(i,l,m) * rho(j,k,m) * Om(m) / eps + rho(m,i,l) * rho(m,j,k) * Om(m) / eps
end do end do
H_diag(ij) = H_diag(ij) - 4.d0 * lambda * chi H_diag(ij) = H_diag(ij) - 4.d0 * lambda * chi
enddo enddo
@ -395,6 +254,551 @@ subroutine ppLR_GW_H_diag(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF,
endif endif
call wall_time(t2)
write(*,'(A50, F12.4)') 'total wall time for ppLR_GW_H_diag (sec): ', t2-t1
return
end
! ---
subroutine ppLR_GW_HR_calc_oneshot(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF, n_states_diag, &
ERI, eta, rho, Om, U, W)
implicit none
integer, intent(in) :: ispin
integer, intent(in) :: n_states_diag
integer, intent(in) :: nOO, nVV, nOrb, nC, nO, nR, nS
double precision, intent(in) :: lambda, eF, eta
double precision, intent(in) :: e(nOrb)
double precision, intent(in) :: ERI(nOrb,nOrb,nOrb,nOrb)
double precision, intent(in) :: rho(nS,nOrb,nOrb), Om(nS)
double precision, intent(in) :: U(nOO+nVV,n_states_diag)
double precision, intent(out) :: W(nOO+nVV,n_states_diag)
integer :: i, j, ij, k, l, kl
integer :: a, b, c, d, ab, cd
integer :: a0, aa, i0, ii
integer :: m
integer :: state
double precision :: mat_tmp, chi, eps
double precision :: eta2
double precision :: tmp_e, tmp_ab, tmp_ij
double precision, allocatable :: Om_tmp(:), H_mat(:,:)
if(ispin .eq. 1) then
allocate(Om_tmp(nS))
a0 = nOrb - nR - nO
eta2 = eta * eta
do m = 1, nS
Om_tmp(m) = Om(m) / (Om(m) * Om(m) + eta2)
enddo
allocate(H_mat(nVV,nOO+nVV))
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(a, b, aa, ab, c, d, cd, i, j, ij, m, state, &
!$OMP tmp_e, tmp_ab, chi, mat_tmp) &
!$OMP SHARED(nC, nO, nOrb, nR, nS, n_states_diag, nVV, &
!$OMP nOO, a0, eF, lambda, e, Om_tmp, rho, ERI, U, &
!$OMP H_mat)
!$OMP DO SCHEDULE(GUIDED)
do a = nO+1, nOrb-nR
aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO
do b = a, nOrb-nR
ab = aa + b
tmp_e = e(a) + e(b) - eF
tmp_ab = lambda
if(a .eq. b) then
tmp_ab = 0.7071067811865475d0 * tmp_ab
endif
cd = 0
do c = nO+1, nOrb-nR
do d = c, nOrb-nR
cd = cd + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,a,c) * rho(m,b,d) + rho(m,a,d) * rho(m,b,c))
enddo
mat_tmp = tmp_ab
if(c .eq. d) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(c,d,a,b) + ERI(d,c,a,b))
if((a .eq. c) .and. (b .eq. d)) then
mat_tmp = mat_tmp + tmp_e
endif
H_mat(ab,cd) = mat_tmp
enddo ! d
enddo ! c
ij = nVV
do i = nC+1, nO
do j = i, nO
ij = ij + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,i,a) * rho(m,j,b) + rho(m,i,b) * rho(m,a,j))
enddo
mat_tmp = tmp_ab
if(i .eq. j) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(i,j,a,b) + ERI(j,i,a,b))
H_mat(ab,ij) = -mat_tmp
enddo ! j
enddo ! i
enddo ! b
enddo ! a
!$OMP END DO
!$OMP END PARALLEL
call dgemm("N", "N", nVV, n_states_diag, nOO+nVV, 1.d0, &
H_mat(1,1), size(H_mat, 1), U(1,1), size(U, 1), &
0.d0, W(1,1), size(W, 1))
deallocate(H_mat)
! ---
allocate(H_mat(nOO,nOO+nVV))
i0 = nO - nC
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, j, ii, ij, a, b, ab, k, l, kl, m, state, &
!$OMP tmp_e, tmp_ij, chi, mat_tmp) &
!$OMP SHARED(nC, nO, nOrb, nR, nS, nVV, i0, &
!$OMP eF, lambda, e, Om_tmp, rho, ERI, U, H_mat)
!$OMP DO SCHEDULE(GUIDED)
do i = nC+1, nO
ii = i0 * (i - nC - 1) - (i - nC - 1) * (i - nC) / 2 - nC
do j = i, nO
ij = ii + j
tmp_e = e(i) + e(j) - eF
tmp_ij = lambda
if(i .eq. j) then
tmp_ij = 0.7071067811865475d0 * tmp_ij
endif
ab = 0
do a = nO+1, nOrb-nR
do b = a, nOrb-nR
ab = ab + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,i,a) * rho(m,j,b) + rho(m,i,b) * rho(m,a,j))
enddo
mat_tmp = tmp_ij
if(a .eq. b) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(a,b,i,j) + ERI(a,b,j,i))
H_mat(ij,ab) = mat_tmp
enddo ! b
enddo ! a
kl = nVV
do k = nC+1, nO
do l = k, nO
kl = kl + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,i,k) * rho(m,j,l) + rho(m,i,l) * rho(m,j,k))
enddo
mat_tmp = tmp_ij
if(k .eq. l) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(i,j,k,l) + ERI(i,j,l,k))
if((i .eq. k) .and. (j .eq. l)) then
mat_tmp = mat_tmp - tmp_e
endif
H_mat(ij,kl) = -mat_tmp
enddo ! l
enddo ! k
enddo ! j
enddo ! i
!$OMP END DO
!$OMP END PARALLEL
call dgemm("N", "N", nOO, n_states_diag, nOO+nVV, 1.d0, &
H_mat(1,1), size(H_mat, 1), U(1,1), size(U, 1), &
0.d0, W(nVV+1,1), size(W, 1))
deallocate(H_mat)
deallocate(Om_tmp)
else
print*, ' Error in ppLR_GW_HR_calc_oneshot'
print*, ' ispin is not supported'
print*, ' ispin = ', ispin
stop
endif
return
end
! ---
subroutine ppLR_GW_HR_calc_batches(ispin, nOrb, nC, nO, nR, nOO, nVV, nS, lambda, e, eF, n_states_diag, &
ERI, eta, rho, Om, U, W)
use omp_lib
implicit none
integer, intent(in) :: ispin
integer, intent(in) :: n_states_diag
integer, intent(in) :: nOO, nVV, nOrb, nC, nO, nR, nS
double precision, intent(in) :: lambda, eF, eta
double precision, intent(in) :: e(nOrb)
double precision, intent(in) :: ERI(nOrb,nOrb,nOrb,nOrb)
double precision, intent(in) :: rho(nS,nOrb,nOrb), Om(nS)
double precision, intent(in) :: U(nOO+nVV,n_states_diag)
double precision, intent(out) :: W(nOO+nVV,n_states_diag)
integer :: i, j, ij, k, l, kl
integer :: a, b, c, d, ab, cd
integer :: a0, aa, i0, ii, bb
integer :: m
integer :: state
double precision :: mat_tmp, chi, eps
double precision :: eta2
double precision :: tmp_e, tmp_ab, tmp_ij
double precision, allocatable :: Om_tmp(:), H_mat(:,:)
double precision, external :: Kronecker_delta
if(ispin .eq. 1) then
call omp_set_max_active_levels(1)
allocate(Om_tmp(nS))
a0 = nOrb - nR - nO
eta2 = eta * eta
do m = 1, nS
Om_tmp(m) = Om(m) / (Om(m) * Om(m) + eta2)
enddo
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(a, b, aa, ab, c, d, cd, i, j, ij, m, state, &
!$OMP bb, tmp_e, tmp_ab, chi, mat_tmp, H_mat) &
!$OMP SHARED(nC, nO, nOrb, nR, nS, n_states_diag, nVV, &
!$OMP nOO, a0, eF, lambda, e, Om_tmp, rho, ERI, U, W)
allocate(H_mat(nOO+nVV,a0))
!$OMP DO SCHEDULE(GUIDED)
do a = nO+1, nOrb-nR
aa = a0 * (a - nO - 1) - (a - nO - 1) * (a - nO) / 2 - nO
do b = a, nOrb-nR
ab = aa + b
bb = b - a + 1
tmp_e = e(a) + e(b) - eF
tmp_ab = lambda
if(a .eq. b) then
tmp_ab = 0.7071067811865475d0 * tmp_ab
endif
cd = 0
do c = nO+1, nOrb-nR
do d = c, nOrb-nR
cd = cd + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,a,c) * rho(m,b,d) + rho(m,a,d) * rho(m,b,c))
enddo
mat_tmp = tmp_ab
if(c .eq. d) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(c,d,a,b) + ERI(d,c,a,b))
if((a .eq. c) .and. (b .eq. d)) then
mat_tmp = mat_tmp + tmp_e
endif
H_mat(cd,bb) = mat_tmp
enddo ! d
enddo ! c
ij = nVV
do i = nC+1, nO
do j = i, nO
ij = ij + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,i,a) * rho(m,j,b) + rho(m,i,b) * rho(m,a,j))
enddo
mat_tmp = tmp_ab
if(i .eq. j) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(i,j,a,b) + ERI(j,i,a,b))
H_mat(ij,bb) = -mat_tmp
enddo ! j
enddo ! i
enddo ! b
call dgemm("T", "N", nOrb-nR-a+1, n_states_diag, nOO+nVV, 1.d0, &
H_mat(1,1), size(H_mat, 1), U(1,1), size(U, 1), &
0.d0, W(aa+a,1), size(W, 1))
enddo ! a
!$OMP END DO
deallocate(H_mat)
!$OMP END PARALLEL
! ---
i0 = nO - nC
allocate(H_mat(nOO,nOO+nVV))
!$OMP PARALLEL &
!$OMP DEFAULT(NONE) &
!$OMP PRIVATE(i, j, ii, ij, a, b, ab, k, l, kl, m, state, &
!$OMP tmp_e, tmp_ij, chi, mat_tmp) &
!$OMP SHARED(nC, nO, nOrb, nR, nS, nVV, i0, &
!$OMP eF, lambda, e, Om_tmp, rho, ERI, U, H_mat)
!$OMP DO SCHEDULE(GUIDED)
do i = nC+1, nO
ii = i0 * (i - nC - 1) - (i - nC - 1) * (i - nC) / 2 - nC
do j = i, nO
ij = ii + j
tmp_e = e(i) + e(j) - eF
tmp_ij = lambda
if(i .eq. j) then
tmp_ij = 0.7071067811865475d0 * tmp_ij
endif
ab = 0
do a = nO+1, nOrb-nR
do b = a, nOrb-nR
ab = ab + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,i,a) * rho(m,j,b) + rho(m,i,b) * rho(m,a,j))
enddo
mat_tmp = tmp_ij
if(a .eq. b) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(a,b,i,j) + ERI(a,b,j,i))
H_mat(ij,ab) = mat_tmp
enddo ! b
enddo ! a
kl = nVV
do k = nC+1, nO
do l = k, nO
kl = kl + 1
chi = 0.d0
do m = 1, nS
chi = chi - Om_tmp(m) * (rho(m,i,k) * rho(m,j,l) + rho(m,i,l) * rho(m,j,k))
enddo
mat_tmp = tmp_ij
if(k .eq. l) then
mat_tmp = 0.7071067811865475d0 * mat_tmp
endif
mat_tmp = mat_tmp * (4.d0 * chi + ERI(i,j,k,l) + ERI(i,j,l,k))
if((i .eq. k) .and. (j .eq. l)) then
mat_tmp = mat_tmp - tmp_e
endif
H_mat(ij,kl) = -mat_tmp
enddo ! l
enddo ! k
enddo ! j
enddo ! i
!$OMP END DO
!$OMP END PARALLEL
call dgemm("N", "N", nOO, n_states_diag, nOO+nVV, 1.d0, &
H_mat(1,1), size(H_mat, 1), U(1,1), size(U, 1), &
0.d0, W(nVV+1,1), size(W, 1))
deallocate(H_mat)
deallocate(Om_tmp)
elseif(ispin .eq. 2) then
eta2 = eta * eta
ab = 0
do a = nO+1, nOrb-nR
do b = a+1, nOrb-nR
ab = ab + 1
do state = 1, n_states_diag
W(ab,state) = 0.d0
cd = 0
do c = nO+1, nOrb-nR
do d = c+1, nOrb-nR
cd = cd + 1
mat_tmp = (e(a) + e(b) - eF) * Kronecker_delta(a, c) * Kronecker_delta(b, d) &
+ lambda * (ERI(a,b,c,d) - ERI(a,b,d,c))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta2
chi = chi - rho(m,a,c) * rho(m,b,d) * Om(m) / eps &
+ rho(m,a,d) * rho(m,b,c) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ab,state) = W(ab,state) + mat_tmp * U(cd,state)
enddo
enddo
ij = nVV
do i = nC+1, nO
do j = i+1, nO
ij = ij + 1
mat_tmp = lambda * (ERI(a,b,i,j) - ERI(a,b,j,i))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta2
chi = chi - rho(m,i,a) * rho(m,j,b) * Om(m) / eps &
+ rho(m,i,b) * rho(m,a,j) * Om(m) / eps
end do
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ab,state) = W(ab,state) - mat_tmp * U(ij,state)
enddo
enddo
enddo ! state
enddo ! b
enddo ! a
! ---
ij = nVV
do i = nC+1, nO
do j = i+1, nO
ij = ij + 1
do state = 1, n_states_diag
W(ij,state) = 0.d0
ab = 0
do a = nO+1, nOrb-nR
do b = a+1, nOrb-nR
ab = ab + 1
mat_tmp = lambda * (ERI(a,b,i,j) - ERI(a,b,j,i))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta2
chi = chi - rho(m,i,a) * rho(m,j,b) * Om(m) / eps &
+ rho(m,i,b) * rho(m,a,j) * Om(m) / eps
end do
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ij,state) = W(ij,state) + mat_tmp * U(ab,state)
enddo
enddo
kl = nVV
do k = nC+1, nO
do l = k+1, nO
kl = kl + 1
mat_tmp = - (e(i) + e(j) - eF) * Kronecker_delta(i, k) * Kronecker_delta(j, l) &
+ lambda * (ERI(i,j,k,l) - ERI(i,j,l,k))
chi = 0.d0
do m = 1, nS
eps = Om(m)**2 + eta2
chi = chi - rho(m,i,k) * rho(m,j,l) * Om(m) / eps &
+ rho(m,i,l) * rho(m,j,k) * Om(m) / eps
enddo
mat_tmp = mat_tmp + 4.d0 * lambda * chi
W(ij,state) = W(ij,state) - mat_tmp * U(kl,state)
enddo
enddo
enddo ! state
enddo ! j
enddo ! i
else
print*, ' Error in ppLR_GW_HR_calc_batches'
print*, ' ispin is not supported'
print*, ' ispin = ', ispin
stop
endif
return return
end end

View File

@ -18,6 +18,8 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
! (-B.T -D) ! (-B.T -D)
! !
use omp_lib
implicit none implicit none
logical, intent(in) :: TDA logical, intent(in) :: TDA
@ -36,7 +38,7 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
double precision, intent(out) :: Om(n_states) double precision, intent(out) :: Om(n_states)
double precision, intent(out) :: R(nOO+nVV,n_states_diag) double precision, intent(out) :: R(nOO+nVV,n_states_diag)
integer :: N, M integer :: N, M, num_threads
integer :: iter, itermax, itertot integer :: iter, itermax, itertot
integer :: shift1, shift2 integer :: shift1, shift2
integer :: i, j, k, l, ab integer :: i, j, k, l, ab
@ -49,6 +51,7 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
double precision :: to_print(2,n_states) double precision :: to_print(2,n_states)
double precision :: mem double precision :: mem
double precision :: eta double precision :: eta
double precision :: t1, t2, tt1, tt2
character(len=len(kernel)) :: kernel_name character(len=len(kernel)) :: kernel_name
double precision, allocatable :: H_diag(:) double precision, allocatable :: H_diag(:)
double precision, allocatable :: W(:,:) double precision, allocatable :: W(:,:)
@ -61,6 +64,8 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
double precision, external :: u_dot_u double precision, external :: u_dot_u
call wall_time(t1)
dtwo_pi = 6.283185307179586d0 dtwo_pi = 6.283185307179586d0
N = nOO + nVV N = nOO + nVV
@ -98,11 +103,14 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
mem = 8.d0 * dble(nOrb + nOrb**4 + N*n_states) & mem = 8.d0 * dble(nOrb + nOrb**4 + N*n_states) &
+ 8.d0 * dble(supp_data_dbl_size) + 4.d0 * dble(supp_data_int_size) + 8.d0 * dble(supp_data_dbl_size) + 4.d0 * dble(supp_data_int_size)
write(*,'(A40, F12.4)') 'I/O mem (MB) = ', mem / (1024.d0*1024.d0) write(*,'(A40, F12.4)') 'I/O mem (GB) = ', mem / (1024.d0*1024.d0*1024.d0)
mem = 8.d0 * dble(N + N*M + N*M + M*M + M*M + M + n_states_diag + n_states_diag) mem = 8.d0 * dble(N + N*M + N*M + M*M + M*M + M + n_states_diag + n_states_diag)
write(*,'(A40, F12.4)') 'tmp mem (MB) = ', mem / (1024.d0*1024.d0) write(*,'(A40, F12.4)') 'tmp mem (GB) = ', mem / (1024.d0*1024.d0*1024.d0)
num_threads = omp_get_max_threads()
write(*,'(A40, I12)') 'Number of threads = ', num_threads
if(kernel_name .eq. "rpa") then if(kernel_name .eq. "rpa") then
@ -114,16 +122,16 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
nS = supp_data_int(1) nS = supp_data_int(1)
allocate(rho_tmp(nOrb,nOrb,nS)) allocate(rho_tmp(nS,nOrb,nOrb))
allocate(Om_tmp(nS)) allocate(Om_tmp(nS))
eta = supp_data_dbl(1) eta = supp_data_dbl(1)
i_data = 1 i_data = 1
do mm = 1, nS
do q = 1, nOrb do q = 1, nOrb
do p = 1, nOrb do p = 1, nOrb
do mm = 1, nS
i_data = i_data + 1 i_data = i_data + 1
rho_tmp(p,q,mm) = supp_data_dbl(i_data) rho_tmp(mm,p,q) = supp_data_dbl(i_data)
enddo enddo
enddo enddo
enddo enddo
@ -225,6 +233,8 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
if((iter > 1) .or. (itertot == 1)) then if((iter > 1) .or. (itertot == 1)) then
!call wall_time(tt1)
call ortho_qr(U(1,1), size(U, 1), N, shift2) call ortho_qr(U(1,1), size(U, 1), N, shift2)
!call ortho_qr(U(1,1), size(U, 1), N, shift2) !call ortho_qr(U(1,1), size(U, 1), N, shift2)
@ -340,6 +350,10 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
!write(*, '(1X, I3, 1X, 100(1X, F16.10, 1X, F16.10, 1X, F16.10))') iter-1, to_print(1:2,1:n_states) !write(*, '(1X, I3, 1X, 100(1X, F16.10, 1X, F16.10, 1X, F16.10))') iter-1, to_print(1:2,1:n_states)
endif endif
!call wall_time(tt2)
!write(*,'(A50, F12.4)') 'wall time for one Davidson iteration (sec): ', tt2-tt1
!stop
!print*, 'iter = ', iter !print*, 'iter = ', iter
if(iter > 1) then if(iter > 1) then
converged = dabs(maxval(residual_norm(1:n_states))) < 1d-15 converged = dabs(maxval(residual_norm(1:n_states))) < 1d-15
@ -426,6 +440,9 @@ subroutine ppLR_davidson(ispin, TDA, nC, nO, nR, nOrb, nOO, nVV, lambda, e, eF,
deallocate(Om_tmp) deallocate(Om_tmp)
endif endif
call wall_time(t2)
write(*,'(A50, F12.4)') 'total wall time for Davidson (sec): ', t2-t1
return return
end end