mirror of
https://github.com/QuantumPackage/qp2.git
synced 2025-05-01 20:55:13 +02:00
1269 lines
39 KiB
Fortran
1269 lines
39 KiB
Fortran
! H_oo
|
|
|
|
subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
|
|
d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
|
|
use gpu
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: d_cc_space_f_oo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
|
|
type(gpu_double4), intent(in) :: tau_x
|
|
type(gpu_double2), intent(out) :: H_oo
|
|
|
|
integer :: a,b,i,j,u,k
|
|
|
|
type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv
|
|
|
|
call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO)
|
|
|
|
type(gpu_blas) :: blas
|
|
|
|
|
|
!$OMP PARALLEL &
|
|
!$OMP DEFAULT(SHARED) &
|
|
!$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
|
|
|
|
!$OMP SINGLE
|
|
!$OMP TASK
|
|
call gpu_copy(d_cc_space_f_oo, H_oo)
|
|
!$OMP END TASK
|
|
!$OMP END SINGLE
|
|
|
|
call gpu_allocate(tmp_ovv, nO, nV, nV)
|
|
call gpu_allocate(tmp_vov, nV, nO, nV)
|
|
|
|
call gpu_blas_create(blas)
|
|
|
|
!$OMP DO
|
|
do u=1,nO
|
|
call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
|
|
tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1)
|
|
do b=1,nV
|
|
call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, &
|
|
tmp_ovv%f(1,1,b), nO, 0.d0, &
|
|
tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
|
|
enddo
|
|
call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, &
|
|
0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
|
|
enddo
|
|
!$OMP END DO
|
|
|
|
call gpu_blas_destroy(blas)
|
|
|
|
call gpu_deallocate(tmp_vov)
|
|
call gpu_deallocate(tmp_ovv)
|
|
|
|
!$OMP TASKWAIT
|
|
!$OMP END PARALLEL
|
|
|
|
call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
|
|
tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, &
|
|
1.d0, H_oo%f(1,1), nO)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(tau_kau)
|
|
end
|
|
|
|
! H_vv
|
|
|
|
subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
|
|
d_cc_space_v_ov_chol,H_vv)
|
|
use gpu
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: d_cc_space_f_vv
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol
|
|
type(gpu_double4), intent(in) :: tau_x
|
|
type(gpu_double2), intent(out) :: H_vv
|
|
|
|
integer :: a,b,i,j,u,k, beta
|
|
|
|
type(gpu_double3) :: tau_kia, tmp_oov
|
|
|
|
call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
|
|
|
|
type(gpu_blas) :: blas
|
|
|
|
!$OMP PARALLEL &
|
|
!$OMP DEFAULT(SHARED) &
|
|
!$OMP PRIVATE(a,b,tmp_oov,blas)
|
|
|
|
!$OMP SINGLE
|
|
!$OMP TASK
|
|
call gpu_copy(d_cc_space_f_vv, H_vv)
|
|
!$OMP END TASK
|
|
!$OMP END SINGLE
|
|
|
|
call gpu_blas_create(blas)
|
|
call gpu_allocate(tmp_oov, nO, nO, nV)
|
|
|
|
!$OMP DO
|
|
do a = 1, nV
|
|
do b=1,nV
|
|
call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, &
|
|
tau_x%f(1,1,a,b), nO, 0.d0, &
|
|
tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
|
|
enddo
|
|
call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, &
|
|
0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
|
|
enddo
|
|
!$OMP END DO
|
|
|
|
call gpu_blas_destroy(blas)
|
|
|
|
call gpu_deallocate(tmp_oov)
|
|
!$OMP TASKWAIT
|
|
!$OMP END PARALLEL
|
|
|
|
call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
|
|
tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, &
|
|
1.d0, H_vv%f(1,1), nV)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(tau_kia)
|
|
end
|
|
|
|
! H_vo
|
|
subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, &
|
|
d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
|
|
use gpu
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol
|
|
type(gpu_double2), intent(out) :: H_vo
|
|
|
|
integer :: a,b,i,j,u,k
|
|
|
|
type(gpu_double1) :: tmp_k
|
|
type(gpu_double3) :: tmp, tmp2
|
|
|
|
call gpu_copy(d_cc_space_f_vo, H_vo)
|
|
|
|
call gpu_allocate(tmp_k, cholesky_mo_num)
|
|
|
|
call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, &
|
|
t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num)
|
|
|
|
call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, &
|
|
d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, &
|
|
H_vo%f(1,1), nV*nO)
|
|
|
|
call gpu_deallocate(tmp_k)
|
|
|
|
|
|
call gpu_allocate(tmp, cholesky_mo_num, nO, nO)
|
|
|
|
call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO)
|
|
|
|
call gpu_allocate(tmp2, cholesky_mo_num, nO, nO)
|
|
|
|
type(gpu_stream) :: stream(nO)
|
|
do i=1,nO
|
|
call gpu_stream_create(stream(i))
|
|
enddo
|
|
|
|
!$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j)
|
|
do i=1,nO
|
|
do j=1,nO
|
|
call gpu_set_stream(blas_handle,stream(j))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, &
|
|
tmp%f(1,i,j), cholesky_mo_num, 0.d0, &
|
|
tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num)
|
|
enddo
|
|
enddo
|
|
!$OMP END PARALLEL DO
|
|
|
|
call gpu_set_stream(blas_handle,gpu_default_stream)
|
|
call gpu_synchronize()
|
|
|
|
do i=1,nO
|
|
call gpu_stream_destroy(stream(i))
|
|
enddo
|
|
call gpu_deallocate(tmp)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, &
|
|
1.d0, H_vo%f(1,1), nV)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(tmp2)
|
|
end
|
|
|
|
! R1
|
|
|
|
subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
|
|
d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
|
|
use gpu
|
|
implicit none
|
|
|
|
! in
|
|
integer, intent(in) :: nO, nV
|
|
type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol
|
|
type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo
|
|
|
|
! out
|
|
type(gpu_double2), intent(out) :: r1
|
|
double precision, intent(out) :: max_r1
|
|
|
|
! internal
|
|
integer :: u,i,j,beta,a,b
|
|
|
|
type(gpu_stream) :: stream(nV)
|
|
|
|
do a=1,nV
|
|
call gpu_stream_create(stream(a))
|
|
enddo
|
|
|
|
type(gpu_double2) :: X_oo
|
|
call gpu_allocate(X_oo,nO,nO)
|
|
|
|
call gpu_copy(d_cc_space_f_ov, r1)
|
|
|
|
call gpu_set_stream(blas_handle, stream(1))
|
|
call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, &
|
|
1d0, t1%f(1,1) , size(t1%f,1), &
|
|
H_vv%f(1,1), size(H_vv%f,1), &
|
|
1d0, r1%f(1,1) , size(r1%f,1))
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, &
|
|
-1d0, H_oo%f(1,1), size(H_oo%f,1), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
1d0, r1%f(1,1), size(r1%f,1))
|
|
|
|
call gpu_set_stream(blas_handle, stream(nV))
|
|
call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, &
|
|
-2d0, t1%f(1,1), size(t1%f,1), &
|
|
d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
|
|
0d0, X_oo%f(1,1), size(X_oo%f,1))
|
|
|
|
call gpu_synchronize()
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, &
|
|
1d0, X_oo%f(1,1), size(X_oo%f,2), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
1d0, r1%f(1,1) , size(r1%f,1))
|
|
|
|
|
|
|
|
type(gpu_double4) :: X_voov
|
|
call gpu_allocate(X_voov, nV, nO, nO, nV)
|
|
|
|
do i=1,nO
|
|
do beta=1,nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), &
|
|
nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO)
|
|
enddo
|
|
enddo
|
|
|
|
do beta=1,nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), &
|
|
nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV)
|
|
enddo
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(X_oo)
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, &
|
|
1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), &
|
|
H_vo%f(1,1) , 1, &
|
|
1d0, r1%f(1,1) , 1)
|
|
|
|
type(gpu_double4) :: X_ovov
|
|
call gpu_allocate(X_ovov, nO, nV, nO, nV)
|
|
|
|
do beta = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
do u=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), &
|
|
nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(X_voov)
|
|
|
|
call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, &
|
|
1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
|
|
t1%f(1,1), 1, &
|
|
1d0, r1%f(1,1), 1)
|
|
|
|
|
|
integer :: iblock, block_size, nVmax
|
|
type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo
|
|
|
|
block_size = 16
|
|
call gpu_allocate(T_vvoo, nV,nV,nO,nO)
|
|
|
|
call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), &
|
|
nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV)
|
|
|
|
call gpu_allocate(W_vvov,nV, nV,nO,block_size)
|
|
call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size)
|
|
|
|
do iblock = 1, nV, block_size
|
|
nVmax = min(block_size,nV-iblock+1)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, &
|
|
d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, &
|
|
d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, &
|
|
0.d0, W_vvov_tmp%f(1,1,1,1), nV*nO)
|
|
|
|
call gpu_synchronize()
|
|
do b=1,nV
|
|
call gpu_set_stream(blas_handle, stream(b))
|
|
do i=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax, 2.d0, W_vvov_tmp%f(1,i,b,1), &
|
|
nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_synchronize()
|
|
|
|
do beta = 1, nVmax
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO, 1.d0, W_vvov%f(1,1,1,beta), &
|
|
nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV)
|
|
enddo
|
|
call gpu_synchronize()
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, &
|
|
1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, &
|
|
W_vvov%f(1,1,1,1), nO*nV*nV, &
|
|
1d0, r1%f(1,iblock), nO)
|
|
enddo
|
|
|
|
call gpu_deallocate(X_ovov)
|
|
|
|
type(gpu_double4) :: W_oovo
|
|
call gpu_allocate(W_oovo, nO,nO,nV,nO)
|
|
|
|
do u = 1, nO
|
|
do a = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(a))
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 2.d0, d_cc_space_v_oovo%f(1,1,a,u), &
|
|
nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
call gpu_synchronize()
|
|
|
|
call gpu_deallocate(W_vvov)
|
|
call gpu_deallocate(T_vvoo)
|
|
|
|
! Change the sign for consistency with the code in spin orbitals
|
|
call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, &
|
|
1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), &
|
|
tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), &
|
|
-1d0, r1%f(1,1), size(r1%f,1))
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(W_oovo)
|
|
|
|
max_r1 = 0d0
|
|
do a = 1, nV
|
|
do i = 1, nO
|
|
max_r1 = max(dabs(r1%f(i,a)), max_r1)
|
|
enddo
|
|
enddo
|
|
|
|
do a=1,nV
|
|
call gpu_stream_destroy(stream(a))
|
|
enddo
|
|
|
|
end
|
|
|
|
|
|
! R2
|
|
|
|
subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
|
|
d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
|
|
d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
|
|
d_cc_space_f_vo, &
|
|
r2,max_r2)
|
|
use gpu
|
|
implicit none
|
|
|
|
! in
|
|
integer, intent(in) :: nO, nV
|
|
type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo
|
|
type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv
|
|
type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo
|
|
type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo
|
|
type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo
|
|
type(gpu_double4), intent(in) :: d_cc_space_v_ovov
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol
|
|
|
|
! out
|
|
double precision, intent(out) :: max_r2
|
|
type(gpu_double4), intent(out) :: r2
|
|
|
|
! internal
|
|
integer :: u,v,i,j,beta,gam,a,b
|
|
double precision :: max_r2_local
|
|
|
|
type(gpu_stream) :: stream(nV)
|
|
|
|
call set_multiple_levels_omp(.False.)
|
|
|
|
call gpu_copy(d_cc_space_v_oovv, r2)
|
|
|
|
type(gpu_double4) :: A1
|
|
call gpu_allocate(A1,nO,nO,nO,nO)
|
|
call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, &
|
|
d_cc_space_v_oooo, d_cc_space_v_vvoo, A1)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, &
|
|
1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), &
|
|
tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), &
|
|
1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2))
|
|
|
|
call gpu_deallocate(A1)
|
|
|
|
integer :: block_size, iblock, k
|
|
block_size = 16
|
|
type(gpu_double3) :: tmp_cc, B1, tmpB1
|
|
type(gpu_double2) :: tmp_cc2
|
|
|
|
call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV)
|
|
call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, &
|
|
d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV)
|
|
|
|
call set_multiple_levels_omp(.False.)
|
|
call gpu_synchronize()
|
|
|
|
type(gpu_blas) :: blas
|
|
|
|
!$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas)
|
|
call gpu_allocate(B1,nV,nV,block_size)
|
|
call gpu_allocate(tmpB1,nV,block_size,nV)
|
|
call gpu_allocate(tmp_cc2,cholesky_mo_num,nV)
|
|
|
|
call gpu_blas_create(blas)
|
|
|
|
!$OMP DO
|
|
do gam = 1, nV
|
|
|
|
call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), &
|
|
cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num)
|
|
|
|
do iblock = 1, nV, block_size
|
|
|
|
call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, &
|
|
-1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, &
|
|
d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, &
|
|
0.d0, tmpB1%f(1,1,1), nV*block_size)
|
|
|
|
call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, &
|
|
1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, &
|
|
tmp_cc2%f(1,1), cholesky_mo_num, &
|
|
1.d0, tmpB1%f(1,1,1), nV*block_size)
|
|
|
|
do beta = iblock, min(nV, iblock+block_size-1)
|
|
call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), &
|
|
nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV)
|
|
enddo
|
|
|
|
call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, &
|
|
1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), &
|
|
B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), &
|
|
1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2))
|
|
enddo
|
|
|
|
enddo
|
|
!$OMP ENDDO
|
|
|
|
call gpu_blas_destroy(blas)
|
|
|
|
call gpu_deallocate(B1)
|
|
call gpu_deallocate(tmpB1)
|
|
call gpu_deallocate(tmp_cc2)
|
|
!$OMP END PARALLEL
|
|
|
|
call gpu_deallocate(tmp_cc)
|
|
|
|
type(gpu_double4) :: X_oovv
|
|
call gpu_allocate(X_oovv,nO,nO,nV,nV)
|
|
call gpu_copy(t2,X_oovv)
|
|
|
|
type(gpu_double2) :: g_occ, g_vir
|
|
call gpu_allocate(g_vir,nV,nV)
|
|
call gpu_allocate(g_occ,nO,nO)
|
|
call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, &
|
|
d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir)
|
|
call compute_g_occ_chol(nO,nV,t1,t2,H_oo, &
|
|
d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ)
|
|
|
|
type(gpu_double4) :: Y_oovv
|
|
call gpu_allocate(Y_oovv,nO,nO,nV,nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, &
|
|
1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), &
|
|
g_vir%f(1,1), size(g_vir%f,1), &
|
|
0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, &
|
|
-1d0, g_occ%f(1,1), size(g_occ%f,1), &
|
|
t2%f(1,1,1,1) , size(t2%f,1), &
|
|
1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1))
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, &
|
|
-1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
|
|
|
|
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), &
|
|
nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(X_oovv)
|
|
|
|
call gpu_deallocate(g_vir)
|
|
call gpu_deallocate(g_occ)
|
|
|
|
type(gpu_double4) :: X_vovo, Y_oovo
|
|
call gpu_allocate(X_vovo,nV,nO,nV,nO)
|
|
|
|
do a=1,nV
|
|
call gpu_stream_create(stream(a))
|
|
enddo
|
|
|
|
do gam = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(gam))
|
|
do beta = 1, nV
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
|
|
nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO)
|
|
enddo
|
|
enddo
|
|
|
|
do i = 1, nO
|
|
do gam = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(gam))
|
|
call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), &
|
|
nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV)
|
|
enddo
|
|
enddo
|
|
|
|
do a=1,nV
|
|
call gpu_stream_destroy(stream(a))
|
|
enddo
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
|
|
|
|
call gpu_allocate(Y_oovo,nO,nO,nV,nO)
|
|
|
|
!$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv)
|
|
call gpu_blas_create(blas)
|
|
type(gpu_double4) :: X_vovv
|
|
call gpu_allocate(X_vovv,nV,nO,nV,block_size)
|
|
!$OMP DO
|
|
do iblock = 1, nV, block_size
|
|
do gam = iblock, min(nV, iblock+block_size-1)
|
|
call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, &
|
|
d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), &
|
|
cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV)
|
|
|
|
enddo
|
|
|
|
call gpu_dgemm(blas, 'N','N', nO, &
|
|
nO*nV*min(block_size, nV-iblock+1),nV, &
|
|
1.d0, t1%f(1,1) , size(t1%f,1), &
|
|
X_vovv%f(1,1,1,1), size(X_vovv%f,1), &
|
|
0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1))
|
|
enddo
|
|
!$OMP END DO
|
|
|
|
call gpu_blas_destroy(blas)
|
|
call gpu_deallocate(X_vovv)
|
|
!$OMP END PARALLEL
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, &
|
|
1d0, t1%f(1,1), size(t1%f,1), &
|
|
X_vovo%f(1,1,1,1), size(X_vovo%f,1), &
|
|
0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1))
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, &
|
|
-1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3))
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(X_vovo)
|
|
call gpu_deallocate(Y_oovo)
|
|
|
|
do a=1,nV
|
|
call gpu_stream_create(stream(a))
|
|
enddo
|
|
|
|
do gam = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(gam))
|
|
do beta = 1, nV
|
|
call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), &
|
|
nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
|
|
enddo
|
|
do j=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), &
|
|
nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(Y_oovv)
|
|
|
|
type(gpu_double4) :: X_ovvo
|
|
type(gpu_double3) :: tcc, tcc2
|
|
call gpu_allocate(tcc2,cholesky_mo_num,nV,nO)
|
|
call gpu_allocate(X_ovvo,nO,nV,nV,nO)
|
|
call gpu_allocate(tcc,cholesky_mo_num,nO,nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
|
|
d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, &
|
|
0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, &
|
|
d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, &
|
|
0.d0, tcc%f(1,1,1), cholesky_mo_num*nO)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, &
|
|
tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, &
|
|
X_ovvo%f(1,1,1,1), nO*nV)
|
|
|
|
call gpu_synchronize()
|
|
|
|
|
|
do gam = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(gam))
|
|
do j=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), &
|
|
nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO)
|
|
enddo
|
|
do beta = 1, nV
|
|
call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), &
|
|
nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_synchronize
|
|
call gpu_deallocate(tcc)
|
|
call gpu_deallocate(tcc2)
|
|
call gpu_deallocate(X_ovvo)
|
|
|
|
|
|
type(gpu_double4) :: J1, K1
|
|
type(gpu_double4) :: Y_voov, Z_ovov
|
|
|
|
|
|
call gpu_allocate(J1,nO,nV,nV,nO)
|
|
call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, &
|
|
d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1)
|
|
|
|
call gpu_allocate(K1,nO,nV,nO,nV)
|
|
call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, &
|
|
d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1)
|
|
|
|
|
|
call gpu_allocate(X_ovvo,nO,nV,nV,nO)
|
|
call gpu_allocate(Y_voov,nV,nO,nO,nV)
|
|
|
|
do a=1, nV
|
|
call gpu_set_stream(blas_handle, stream(a))
|
|
do i=1, nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), &
|
|
nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO)
|
|
call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), &
|
|
nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_allocate(Z_ovov,nO,nV,nO,nV)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(J1)
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, &
|
|
1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), &
|
|
Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), &
|
|
0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2))
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(Y_voov)
|
|
call gpu_deallocate(X_ovvo)
|
|
|
|
type(gpu_double4) :: Y_ovov, X_ovov
|
|
call gpu_allocate(X_ovov,nO,nV,nO,nV)
|
|
call gpu_allocate(Y_ovov,nO,nV,nO,nV)
|
|
|
|
do a=1, nV
|
|
call gpu_set_stream(blas_handle, stream(a))
|
|
do j=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), &
|
|
nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO)
|
|
enddo
|
|
do beta=1, nV
|
|
call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), &
|
|
nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV)
|
|
enddo
|
|
enddo
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_synchronize()
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, &
|
|
-1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
|
|
Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), &
|
|
1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2))
|
|
|
|
call gpu_synchronize()
|
|
|
|
do gam=1, nV
|
|
call gpu_set_stream(blas_handle, stream(gam))
|
|
do j=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), &
|
|
nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO)
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), &
|
|
nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO)
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), &
|
|
nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO)
|
|
enddo
|
|
do beta=1, nV
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), &
|
|
nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_deallocate(K1)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, &
|
|
1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), &
|
|
Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), &
|
|
0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2))
|
|
|
|
call gpu_synchronize()
|
|
|
|
call gpu_deallocate(X_ovov)
|
|
call gpu_deallocate(Y_ovov)
|
|
|
|
! Change the sign for consistency with the code in spin orbitals
|
|
do gam = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(gam))
|
|
do j=1,nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), &
|
|
nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO)
|
|
enddo
|
|
do beta = 1, nV
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), &
|
|
nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_deallocate(Z_ovov)
|
|
|
|
max_r2 = 0d0
|
|
!$omp parallel &
|
|
!$omp shared(nO,nV,r2,max_r2) &
|
|
!$omp private(i,j,a,b,max_r2_local) &
|
|
!$omp default(none)
|
|
max_r2_local = 0.d0
|
|
!$omp do
|
|
do b = 1, nV
|
|
do a = 1, nV
|
|
do j = 1, nO
|
|
do i = 1, nO
|
|
max_r2_local = max(r2%f(i,j,a,b), max_r2_local)
|
|
enddo
|
|
enddo
|
|
enddo
|
|
enddo
|
|
!$omp end do nowait
|
|
!$omp critical
|
|
max_r2 = max(max_r2, max_r2_local)
|
|
!$omp end critical
|
|
!$omp end parallel
|
|
|
|
end
|
|
|
|
! A1
|
|
|
|
subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, &
|
|
d_cc_space_v_oooo, d_cc_space_v_vvoo, A1)
|
|
use gpu
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: t1
|
|
type(gpu_double4), intent(in) :: t2, tau
|
|
type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo
|
|
type(gpu_double4), intent(out) :: A1
|
|
|
|
integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta
|
|
|
|
type(gpu_double4) :: Y_oooo
|
|
call gpu_allocate(Y_oooo,nO,nO,nO,nO)
|
|
|
|
! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j)
|
|
! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) &
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, &
|
|
1d0, t1%f(1,1) , size(t1%f,1), &
|
|
d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), &
|
|
0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1))
|
|
|
|
type(gpu_stream) :: stream(nO)
|
|
|
|
do i=1, nO
|
|
call gpu_stream_create(stream(i))
|
|
enddo
|
|
|
|
call gpu_synchronize()
|
|
|
|
do j = 1, nO
|
|
call gpu_set_stream(blas_handle, stream(j))
|
|
do i = 1, nO
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), &
|
|
nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO)
|
|
enddo
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), &
|
|
nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO)
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
do i=1, nO
|
|
call gpu_stream_destroy(stream(i))
|
|
enddo
|
|
|
|
call gpu_deallocate(Y_oooo)
|
|
|
|
! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b)
|
|
call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, &
|
|
1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), &
|
|
d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), &
|
|
1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2))
|
|
call gpu_synchronize()
|
|
|
|
end
|
|
|
|
! g_occ
|
|
|
|
subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, &
|
|
d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ)
|
|
use gpu
|
|
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol
|
|
type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo
|
|
type(gpu_double2), intent(out) :: g_occ
|
|
|
|
call gpu_copy(H_oo, g_occ)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, &
|
|
1d0, t1%f(1,1), size(t1%f,1), &
|
|
d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), &
|
|
1d0, g_occ%f(1,1), size(g_occ%f,1))
|
|
|
|
type(gpu_double1) :: X
|
|
call gpu_allocate(X,cholesky_mo_num)
|
|
|
|
call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, &
|
|
t1%f(1,1), 1, 0.d0, X%f(1), 1)
|
|
|
|
call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, &
|
|
d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, &
|
|
X%f(1), 1, 1.d0, g_occ%f(1,1), 1)
|
|
|
|
call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, &
|
|
d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, &
|
|
t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(X)
|
|
|
|
end
|
|
|
|
! g_vir
|
|
|
|
subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, &
|
|
d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir)
|
|
use gpu
|
|
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
|
|
type(gpu_double4), intent(in) :: t2
|
|
type(gpu_double2), intent(out) :: g_vir
|
|
|
|
integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
|
|
|
|
type(gpu_stream) :: stream(max(nO,4))
|
|
|
|
do i=1,max(nO,4)
|
|
call gpu_stream_create(stream(i))
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, stream(1))
|
|
call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, &
|
|
-1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
0d0, g_vir%f(1,1), size(g_vir%f,1))
|
|
|
|
type(gpu_double1) :: tmp_k
|
|
type(gpu_double3) :: tmp_vo, tmp_vo2
|
|
|
|
call gpu_allocate(tmp_k,cholesky_mo_num)
|
|
|
|
call gpu_set_stream(blas_handle, stream(2))
|
|
call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, &
|
|
d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, &
|
|
g_vir%f(1,1), nV*nV)
|
|
|
|
call gpu_set_stream(blas_handle, stream(3))
|
|
call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, &
|
|
d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV)
|
|
|
|
call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(tmp_k)
|
|
|
|
do i=1,nO
|
|
call gpu_set_stream(blas_handle, stream(i))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), &
|
|
cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO)
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
do i=1,max(nO,4)
|
|
call gpu_stream_destroy(stream(i))
|
|
enddo
|
|
call gpu_deallocate(tmp_vo)
|
|
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), &
|
|
nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, &
|
|
tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(tmp_vo2)
|
|
|
|
end
|
|
|
|
! J1
|
|
|
|
subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1)
|
|
use gpu
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: t1
|
|
type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol
|
|
type(gpu_double4), intent(out) :: J1
|
|
|
|
integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
|
|
|
|
type(gpu_double4) :: X_ovoo, Y_ovov
|
|
|
|
call gpu_allocate(X_ovoo,nO,nV,nO,nO)
|
|
|
|
type(gpu_stream) :: stream(nV)
|
|
|
|
|
|
do i=1,nO
|
|
call gpu_stream_create(stream(i))
|
|
enddo
|
|
|
|
do j = 1, nO
|
|
call gpu_set_stream(blas_handle, stream(j))
|
|
do i = 1, nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), &
|
|
nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
do i=1,nO
|
|
call gpu_stream_destroy(stream(i))
|
|
enddo
|
|
|
|
call gpu_allocate(Y_ovov,nO,nV,nO,nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, &
|
|
-1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3))
|
|
|
|
|
|
call gpu_copy(v_ovvo, J1)
|
|
|
|
call gpu_synchronize()
|
|
|
|
do a=1,nV
|
|
call gpu_stream_create(stream(a))
|
|
enddo
|
|
|
|
do i = 1, nO
|
|
do beta = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), &
|
|
nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO)
|
|
call gpu_allocate(J1_tmp,nV,nO,nV,nO)
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
type(gpu_double4) :: J1_tmp
|
|
type(gpu_double3) :: tmp_cc
|
|
|
|
call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, &
|
|
d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, &
|
|
t1%f(1,1), nO, &
|
|
0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, &
|
|
tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, &
|
|
0.d0, J1_tmp%f(1,1,1,1), nV*nO)
|
|
|
|
|
|
call gpu_deallocate(X_ovoo)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(tmp_cc)
|
|
|
|
do i = 1, nO
|
|
do a = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(a))
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), &
|
|
nO*nV, 1.d0, J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV)
|
|
enddo
|
|
enddo
|
|
|
|
type(gpu_double4) :: X_voov, Z_ovvo
|
|
|
|
call gpu_allocate(X_voov,nV,nO,nO,nV)
|
|
call gpu_allocate(Z_ovvo,nO,nV,nV,nO)
|
|
|
|
do j = 1, nO
|
|
do beta = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), &
|
|
nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO)
|
|
enddo
|
|
enddo
|
|
|
|
do b = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(b))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), &
|
|
nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV)
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(J1_tmp)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, &
|
|
-1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), &
|
|
X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), &
|
|
0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2))
|
|
|
|
call gpu_synchronize()
|
|
|
|
do i = 1, nO
|
|
do a = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(a))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), &
|
|
nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV)
|
|
enddo
|
|
enddo
|
|
|
|
type(gpu_double4) :: X_ovvo, Y_vovo
|
|
call gpu_allocate(Y_vovo,nV,nO,nV,nO)
|
|
|
|
do j = 1, nO
|
|
do i = 1, nO
|
|
call gpu_set_stream(blas_handle, stream(i))
|
|
call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), &
|
|
nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_allocate(X_ovvo,nO,nV,nV,nO)
|
|
|
|
do j = 1, nO
|
|
do b = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(b))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), &
|
|
nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(X_voov)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, &
|
|
1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), &
|
|
Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), &
|
|
0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2))
|
|
|
|
call gpu_synchronize()
|
|
|
|
do i = 1, nO
|
|
do beta = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), &
|
|
nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
call gpu_deallocate(Y_ovov)
|
|
call gpu_deallocate(X_ovvo)
|
|
|
|
do a = 1, nV
|
|
call gpu_stream_destroy(stream(a))
|
|
enddo
|
|
|
|
call gpu_deallocate(Z_ovvo)
|
|
|
|
end
|
|
|
|
! K1
|
|
|
|
subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, &
|
|
d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1)
|
|
use gpu
|
|
|
|
implicit none
|
|
|
|
integer, intent(in) :: nO,nV
|
|
type(gpu_double2), intent(in) :: t1
|
|
type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo
|
|
type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
|
|
type(gpu_double4), intent(out) :: K1
|
|
|
|
type(gpu_double4) :: X, Y, Z
|
|
|
|
integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam
|
|
|
|
|
|
call gpu_copy(v_ovov, K1)
|
|
|
|
type(gpu_stream) :: stream(nV)
|
|
do a = 1, nV
|
|
call gpu_stream_create(stream(a))
|
|
enddo
|
|
|
|
call gpu_allocate(X,nV,nO,nV,nO)
|
|
call gpu_allocate(Y,nO,nV,nV,nO)
|
|
|
|
do a = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(a))
|
|
do i = 1, nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), &
|
|
nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV)
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), &
|
|
nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_set_stream(blas_handle, gpu_default_stream)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, &
|
|
-1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), &
|
|
t1%f(1,1) , size(t1%f,1), &
|
|
1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3))
|
|
|
|
type(gpu_double4) :: K1tmp
|
|
type(gpu_double3) :: t1v
|
|
|
|
call gpu_allocate(t1v,cholesky_mo_num,nO,nO)
|
|
|
|
call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, &
|
|
d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, &
|
|
t1v%f(1,1,1), cholesky_mo_num*nO)
|
|
|
|
call gpu_allocate(K1tmp,nO,nO,nV,nV)
|
|
|
|
call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, &
|
|
t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, &
|
|
K1tmp%f(1,1,1,1), nO*nO)
|
|
|
|
call gpu_allocate(Z,nO,nV,nV,nO)
|
|
call gpu_synchronize()
|
|
|
|
! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i)
|
|
call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, &
|
|
1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), &
|
|
X%f(1,1,1,1), size(X%f,1) * size(X%f,2), &
|
|
0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2))
|
|
|
|
call gpu_synchronize()
|
|
call gpu_deallocate(t1v)
|
|
|
|
do beta = 1, nV
|
|
call gpu_set_stream(blas_handle, stream(beta))
|
|
do i = 1, nO
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), &
|
|
nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO)
|
|
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), &
|
|
nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO)
|
|
enddo
|
|
enddo
|
|
|
|
call gpu_deallocate(X)
|
|
call gpu_deallocate(Y)
|
|
|
|
do a = 1, nV
|
|
call gpu_stream_destroy(stream(a))
|
|
enddo
|
|
|
|
call gpu_deallocate(K1tmp)
|
|
call gpu_deallocate(Z)
|
|
|
|
end
|