1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2024-06-02 11:25:23 +02:00
qp_plugins_scemama/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f

653 lines
15 KiB
Fortran

subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy)
implicit none
integer, intent(in) :: nO, nV
double precision, intent(in) :: tau(nO,nO,nV,nV)
double precision, intent(in) :: t1(nO,nV)
double precision, intent(out) :: energy
! internal
integer :: i,j,a,b
double precision :: e
energy = 0d0
!$omp parallel &
!$omp shared(nO,nV,energy,tau,t1,&
!$omp cc_space_f_vo,cc_space_w_oovv) &
!$omp private(i,j,a,b,e) &
!$omp default(none)
e = 0d0
!$omp do
do a = 1, nV
do i = 1, nO
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
enddo
enddo
!$omp end do nowait
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
energy = energy + e
!$omp end critical
!$omp end parallel
end
! Tau
subroutine update_tau_space_chol(nO,nV,t1,t2,tau)
implicit none
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV)
! out
double precision, intent(out) :: tau(nO,nO,nV,nV)
! internal
integer :: i,j,a,b
!$OMP PARALLEL &
!$OMP SHARED(nO,nV,tau,t2,t1) &
!$OMP PRIVATE(i,j,a,b) &
!$OMP DEFAULT(NONE)
!$OMP DO
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
enddo
enddo
enddo
enddo
!$OMP END DO
!$OMP END PARALLEL
end
! R1
subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
implicit none
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
! out
double precision, intent(out) :: r1(nO,nV), max_r1
! internal
integer :: u,i,j,beta,a,b
!$omp parallel &
!$omp shared(nO,nV,r1,cc_space_f_ov) &
!$omp private(u,beta) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
r1(u,beta) = cc_space_f_ov(u,beta)
enddo
enddo
!$omp end do
!$omp end parallel
double precision, allocatable :: X_oo(:,:)
allocate(X_oo(nO,nO))
call dgemm('N','N', nO, nO, nV, &
-2d0, t1 , size(t1,1), &
cc_space_f_vo, size(cc_space_f_vo,1), &
0d0, X_oo , size(X_oo,1))
call dgemm('T','N', nO, nV, nO, &
1d0, X_oo, size(X_oo,2), &
t1 , size(t1,1), &
1d0, r1 , size(r1,1))
deallocate(X_oo)
call dgemm('N','N', nO, nV, nV, &
1d0, t1 , size(t1,1), &
H_vv, size(H_vv,1), &
1d0, r1 , size(r1,1))
call dgemm('N','N', nO, nV, nO, &
-1d0, H_oo, size(H_oo,1), &
t1 , size(t1,1), &
1d0, r1, size(r1,1))
double precision, allocatable :: X_voov(:,:,:,:)
allocate(X_voov(nV, nO, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,X_voov,t2,t1) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do i = 1, nO
do a = 1, nV
X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nV*nO, nO*nV, &
1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
H_vo , 1, &
1d0, r1 , 1)
deallocate(X_voov)
double precision, allocatable :: X_ovov(:,:,:,:)
allocate(X_ovov(nO, nV, nO, nV))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) &
!$omp private(u,beta,i,a) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do a = 1, nv
do i = 1, nO
X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call dgemv('T', nO*nV, nO*nV, &
1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
t1 , 1, &
1d0, r1 , 1)
deallocate(X_ovov)
integer :: iblock, block_size, nVmax
double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:)
block_size = 16
allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO))
!$omp parallel &
!$omp private(u,i,b,a) &
!$omp default(shared)
!$omp do
do u = 1, nO
do i = 1, nO
do b = 1, nV
do a = 1, nV
T_vvoo(a,b,i,u) = tau(i,u,a,b)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
do iblock = 1, nV, block_size
nVmax = min(block_size,nV-iblock+1)
call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
cc_space_v_vo_chol , cholesky_mo_num, &
cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, &
0.d0, W_vvov_tmp, nV*nO)
!$omp parallel &
!$omp private(b,i,a,beta) &
!$omp default(shared)
do beta = 1, nVmax
do i = 1, nO
!$omp do
do b = 1, nV
do a = 1, nV
W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta)
enddo
enddo
!$omp end do nowait
enddo
enddo
!$omp barrier
!$omp end parallel
call dgemm('T','N',nO,nVmax,nO*nV*nV, &
1d0, T_vvoo, nV*nV*nO, &
W_vvov, nO*nV*nV, &
1d0, r1(1,iblock), nO)
enddo
deallocate(W_vvov,T_vvoo)
double precision, allocatable :: W_oovo(:,:,:,:)
allocate(W_oovo(nO,nO,nV,nO))
!$omp parallel &
!$omp shared(nO,nV,cc_space_v_vooo,W_oovo) &
!$omp private(u,a,i,j) &
!$omp default(none)
do u = 1, nO
!$omp do
do a = 1, nV
do j = 1, nO
do i = 1, nO
W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i)
enddo
enddo
enddo
!$omp end do nowait
enddo
!$omp end parallel
call dgemm('T','N', nO, nV, nO*nO*nV, &
-1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
tau , size(tau,1) * size(tau,2) * size(tau,3), &
1d0, r1 , size(r1,1))
deallocate(W_oovo)
max_r1 = 0d0
do a = 1, nV
do i = 1, nO
max_r1 = max(dabs(r1(i,a)), max_r1)
enddo
enddo
! Change the sign for consistency with the code in spin orbitals
!$omp parallel &
!$omp shared(nO,nV,r1) &
!$omp private(a,i) &
!$omp default(none)
!$omp do
do a = 1, nV
do i = 1, nO
r1(i,a) = -r1(i,a)
enddo
enddo
!$omp end do
!$omp end parallel
end
! H_oo
subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
implicit none
integer, intent(in) :: nO,nV
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
double precision, intent(out) :: H_oo(nO, nO)
integer :: a,b,i,j,u,k
double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
allocate(tau_kau(cholesky_mo_num,nV,nO))
!$omp parallel &
!$omp default(shared) &
!$omp private(i,u,j,k,a,b,tmp_vov)
allocate(tmp_vov(nV,nO,nV) )
!$omp do
do u = 1, nO
do b=1,nV
do j=1,nO
do a=1,nV
tmp_vov(a,j,b) = tau_x(u,j,a,b)
enddo
enddo
enddo
call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
0.d0, tau_kau(1,1,u), cholesky_mo_num)
enddo
!$omp end do nowait
deallocate(tmp_vov)
!$omp do
do i = 1, nO
do u = 1, nO
H_oo(u,i) = cc_space_f_oo(u,i)
enddo
enddo
!$omp end do nowait
!$omp barrier
!$omp end parallel
call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
1.d0, H_oo, nO)
end
! H_vv
subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
implicit none
integer, intent(in) :: nO,nV
double precision, intent(in) :: tau_x(nO, nO, nV, nV)
double precision, intent(out) :: H_vv(nV, nV)
integer :: a,b,i,j,u,k, beta
double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
allocate(tau_kia(cholesky_mo_num,nO,nV))
!$omp parallel &
!$omp default(shared) &
!$omp private(i,beta,j,k,a,b,tmp_oov)
allocate(tmp_oov(nO,nO,nV) )
!$omp do
do a = 1, nV
do b=1,nV
do j=1,nO
do i=1,nO
tmp_oov(i,j,b) = tau_x(i,j,a,b)
enddo
enddo
enddo
call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
0.d0, tau_kia(1,1,a), cholesky_mo_num)
enddo
!$omp end do nowait
deallocate(tmp_oov)
!$omp do
do beta = 1, nV
do a = 1, nV
H_vv(a,beta) = cc_space_f_vv(a,beta)
enddo
enddo
!$omp end do nowait
!$omp barrier
!$omp end parallel
call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, &
1.d0, H_vv, nV)
end
! H_vo
subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
implicit none
integer, intent(in) :: nO,nV
double precision, intent(in) :: t1(nO, nV)
double precision, intent(out) :: H_vo(nV, nO)
integer :: a,b,i,j,u,k
double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:)
do i=1,nO
do a=1,nV
H_vo(a,i) = cc_space_f_vo(a,i)
enddo
enddo
allocate(tmp_k(cholesky_mo_num))
call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
cc_space_v_ov_chol, cholesky_mo_num, &
t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num)
call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, &
cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, &
H_vo, nV*nO)
deallocate(tmp_k)
allocate(tmp(cholesky_mo_num,nO,nO))
allocate(tmp2(cholesky_mo_num,nO,nO))
call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO)
do i=1,nO
do j=1,nO
do k=1,cholesky_mo_num
tmp2(k,j,i) = tmp(k,i,j)
enddo
enddo
enddo
deallocate(tmp)
call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, &
1.d0, H_vo, nV)
end
! R2
subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data)
use gpu_module
implicit none
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
type(c_ptr), intent(in) :: gpu_data
! out
double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2
! internal
integer :: u,v,i,j,beta,gam,a,b
double precision :: max_r2_local
double precision, allocatable :: Y_oovv(:,:,:,:)
integer :: block_size, iblock, k
block_size = 16
call set_multiple_levels_omp(.False.)
double precision, allocatable :: g_occ(:,:)
allocate(g_occ(nO,nO))
!$omp parallel &
!$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) &
!$omp private(i,j,a,u) &
!$omp default(none)
!$omp do
do i = 1, nO
do u = 1, nO
g_occ(u,i) = H_oo(u,i)
enddo
enddo
!$omp end do
!$omp do
do i = 1, nO
do j = 1, nO
do a = 1, nV
do u = 1, nO
g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
double precision, allocatable :: K1(:,:,:,:)
allocate(K1(nO,nV,nO,nV))
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
H_vv, g_occ, K1, r2)
!---
double precision, allocatable :: X_oovv(:,:,:,:)
double precision, allocatable :: X_vovv(:,:,:,:)
double precision, allocatable :: X_ovvo(:,:,:,:)
double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:)
double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:)
double precision, allocatable :: Y_voov(:,:,:,:)
double precision, allocatable :: Z_ovov(:,:,:,:)
double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:)
allocate(X_ovov(nO,nV,nO,nV))
allocate(Y_ovov(nO,nV,nO,nV))
!$omp parallel &
!$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) &
!$omp private(u,a,i,beta,gam) &
!$omp default(none)
!$omp do
do beta = 1, nV
do u = 1, nO
do a = 1, nV
do i = 1, nO
X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp do
do gam = 1, nV
do v = 1, nO
do a = 1, nV
do i = 1, nO
Y_ovov(i,a,v,gam) = t2(i,v,gam,a)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
allocate(Z_ovov(nO,nV,nO,nV))
call dgemm('T','N',nO*nV,nO*nV,nO*nV, &
1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2))
deallocate(X_ovov, Y_ovov)
!$omp parallel &
!$omp shared(nO,nV,r2,Z_ovov) &
!$omp private(u,v,gam,beta) &
!$omp default(none)
!$omp do
do gam = 1, nV
do beta = 1, nV
do v = 1, nO
do u = 1, nO
r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
deallocate(Z_ovov)
allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV))
!$omp parallel &
!$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) &
!$omp private(u,v,gam,beta,i,a) &
!$omp default(none)
!$omp do
do a = 1, nV
do i = 1, nO
do gam = 1, nV
do u = 1, nO
X_ovov(u,gam,i,a) = K1(u,a,i,gam)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp do
do beta = 1, nV
do v = 1, nO
do a = 1, nV
do i = 1, nO
Y_ovov(i,a,v,beta) = t2(i,v,beta,a)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
deallocate(K1)
allocate(Z_ovov(nO,nV,nO,nV))
call dgemm('N','N',nO*nV,nO*nV,nO*nV, &
1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), &
Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), &
0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2))
deallocate(X_ovov,Y_ovov)
!$omp parallel &
!$omp shared(nO,nV,r2,Z_ovov) &
!$omp private(u,v,gam,beta) &
!$omp default(none)
!$omp do
do gam = 1, nV
do beta = 1, nV
do v = 1, nO
do u = 1, nO
r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
deallocate(Z_ovov)
! Change the sign for consistency with the code in spin orbitals
max_r2 = 0d0
!$omp parallel &
!$omp shared(nO,nV,r2,max_r2) &
!$omp private(i,j,a,b,max_r2_local) &
!$omp default(none)
max_r2_local = 0.d0
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
r2(i,j,a,b) = -r2(i,j,a,b)
max_r2_local = max(r2(i,j,a,b), max_r2_local)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
max_r2 = max(max_r2, max_r2_local)
!$omp end critical
!$omp end parallel
end