mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2025-01-02 17:45:44 +01:00
r2 fully on GPU
This commit is contained in:
parent
a541abd04a
commit
d3c87d8181
@ -132,7 +132,8 @@ subroutine run_ccsd_space_orb
|
||||
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
||||
|
||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data)
|
||||
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
|
||||
H_oo, H_vv, r2, max_r2)
|
||||
else
|
||||
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
||||
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
||||
|
@ -439,84 +439,3 @@ subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
|
||||
end
|
||||
|
||||
|
||||
! R2
|
||||
|
||||
subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data)
|
||||
use gpu_module
|
||||
implicit none
|
||||
|
||||
! in
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
|
||||
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
|
||||
type(c_ptr), intent(in) :: gpu_data
|
||||
|
||||
! out
|
||||
double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2
|
||||
|
||||
! internal
|
||||
integer :: u,v,i,j,beta,gam,a,b
|
||||
double precision :: max_r2_local
|
||||
double precision, allocatable :: Y_oovv(:,:,:,:)
|
||||
|
||||
integer :: block_size, iblock, k
|
||||
block_size = 16
|
||||
call set_multiple_levels_omp(.False.)
|
||||
|
||||
double precision, allocatable :: g_occ(:,:)
|
||||
allocate(g_occ(nO,nO))
|
||||
!$omp parallel &
|
||||
!$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) &
|
||||
!$omp private(i,j,a,u) &
|
||||
!$omp default(none)
|
||||
!$omp do
|
||||
do i = 1, nO
|
||||
do u = 1, nO
|
||||
g_occ(u,i) = H_oo(u,i)
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do
|
||||
|
||||
!$omp do
|
||||
do i = 1, nO
|
||||
do j = 1, nO
|
||||
do a = 1, nV
|
||||
do u = 1, nO
|
||||
g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
|
||||
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
|
||||
H_vv, g_occ, r2)
|
||||
|
||||
! Change the sign for consistency with the code in spin orbitals
|
||||
|
||||
max_r2 = 0d0
|
||||
!$omp parallel &
|
||||
!$omp shared(nO,nV,r2,max_r2) &
|
||||
!$omp private(i,j,a,b,max_r2_local) &
|
||||
!$omp default(none)
|
||||
max_r2_local = 0.d0
|
||||
!$omp do
|
||||
do b = 1, nV
|
||||
do a = 1, nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
r2(i,j,a,b) = -r2(i,j,a,b)
|
||||
max_r2_local = max(r2(i,j,a,b), max_r2_local)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do nowait
|
||||
!$omp critical
|
||||
max_r2 = max(max_r2, max_r2_local)
|
||||
!$omp end critical
|
||||
!$omp end parallel
|
||||
|
||||
end
|
||||
|
||||
|
@ -14,9 +14,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
||||
double* t1,
|
||||
double* t2,
|
||||
double* tau,
|
||||
double* H_oo,
|
||||
double* H_vv,
|
||||
double* g_occ,
|
||||
double* r2)
|
||||
double* r2,
|
||||
double* max_r2)
|
||||
{
|
||||
|
||||
int ngpus = 1;
|
||||
@ -636,7 +637,35 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
||||
double* d_g_occ;
|
||||
lda = nO;
|
||||
cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double));
|
||||
cublasSetMatrix(lda, nO, sizeof(double), g_occ, lda, d_g_occ, lda);
|
||||
cublasSetMatrix(lda, nO, sizeof(double), H_oo, lda, d_g_occ, lda);
|
||||
|
||||
double* d_X;
|
||||
cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double));
|
||||
|
||||
alpha = 2.0;
|
||||
beta = 0.0;
|
||||
m=cholesky_mo_num; n=nO*nV;
|
||||
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
|
||||
B=d_t1; ldb=1;
|
||||
C=d_X; ldc=1;
|
||||
cublasDgemv(handle, CUBLAS_OP_N, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||
|
||||
alpha = 1.0;
|
||||
beta = 1.0;
|
||||
m=cholesky_mo_num; n=nO*nO;
|
||||
A=d_cc_space_v_oo_chol; lda=cholesky_mo_num;
|
||||
B=d_X; ldb=1;
|
||||
C=d_g_occ; ldc=1;
|
||||
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||
|
||||
cudaFree(d_X);
|
||||
alpha = -1.0;
|
||||
beta = 1.0;
|
||||
m=nO*nV; n=nO*nO;
|
||||
A=d_cc_space_v_ovoo; lda=nO*nV;
|
||||
B=d_t1; ldb=1;
|
||||
C=d_g_occ; ldc=1;
|
||||
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
|
||||
|
||||
alpha = 1.0;
|
||||
beta = 1.0;
|
||||
@ -1272,13 +1301,14 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
||||
cudaFree(d_tau);
|
||||
cudaFree(d_t1);
|
||||
cudaFree(d_tmp_cc);
|
||||
|
||||
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));
|
||||
lda=nO*nO;
|
||||
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda);
|
||||
#pragma omp critical
|
||||
{
|
||||
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) {
|
||||
r2[i] += r2_tmp[i];
|
||||
r2[i] -= r2_tmp[i];
|
||||
}
|
||||
}
|
||||
free(r2_tmp);
|
||||
@ -1290,6 +1320,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
|
||||
free(K1);
|
||||
free(J1);
|
||||
|
||||
*max_r2 = 0.;
|
||||
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) {
|
||||
const double x = r2[i] > 0. ? r2[i] : -r2[i];
|
||||
*max_r2 = *max_r2 > x ? *max_r2 : x;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -27,16 +27,17 @@ module gpu_module
|
||||
end function
|
||||
|
||||
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,&
|
||||
H_vv, g_occ, r2) bind(C)
|
||||
H_oo, H_vv, r2, max_r2) bind(C)
|
||||
import c_int, c_double, c_ptr
|
||||
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
|
||||
type(c_ptr), value :: gpu_data
|
||||
real(c_double), intent(in) :: t1(nO,nV)
|
||||
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
||||
real(c_double), intent(in) :: tau(nO,nO,nV,nV)
|
||||
real(c_double), intent(in) :: H_oo(nO,nO)
|
||||
real(c_double), intent(in) :: H_vv(nV,nV)
|
||||
real(c_double), intent(in) :: g_occ(nO,nO)
|
||||
real(c_double), intent(out) :: r2(nO,nO,nV,nV)
|
||||
real(c_double), intent(out) :: max_r2
|
||||
end subroutine
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user