1
0
mirror of https://gitlab.com/scemama/qp_plugins_scemama.git synced 2025-01-02 17:45:44 +01:00

r2 fully on GPU

This commit is contained in:
Anthony Scemama 2023-08-03 12:00:09 +02:00
parent a541abd04a
commit d3c87d8181
4 changed files with 45 additions and 88 deletions

View File

@ -132,7 +132,8 @@ subroutine run_ccsd_space_orb
call compute_H_vo_chol(nO,nV,t1,H_vo) call compute_H_vo_chol(nO,nV,t1,H_vo)
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data) call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
H_oo, H_vv, r2, max_r2)
else else
call compute_H_oo(nO,nV,t1,t2,tau,H_oo) call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
call compute_H_vv(nO,nV,t1,t2,tau,H_vv) call compute_H_vv(nO,nV,t1,t2,tau,H_vv)

View File

@ -439,84 +439,3 @@ subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
end end
! R2
subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data)
use gpu_module
implicit none
! in
integer, intent(in) :: nO, nV
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
type(c_ptr), intent(in) :: gpu_data
! out
double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2
! internal
integer :: u,v,i,j,beta,gam,a,b
double precision :: max_r2_local
double precision, allocatable :: Y_oovv(:,:,:,:)
integer :: block_size, iblock, k
block_size = 16
call set_multiple_levels_omp(.False.)
double precision, allocatable :: g_occ(:,:)
allocate(g_occ(nO,nO))
!$omp parallel &
!$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) &
!$omp private(i,j,a,u) &
!$omp default(none)
!$omp do
do i = 1, nO
do u = 1, nO
g_occ(u,i) = H_oo(u,i)
enddo
enddo
!$omp end do
!$omp do
do i = 1, nO
do j = 1, nO
do a = 1, nV
do u = 1, nO
g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, &
H_vv, g_occ, r2)
! Change the sign for consistency with the code in spin orbitals
max_r2 = 0d0
!$omp parallel &
!$omp shared(nO,nV,r2,max_r2) &
!$omp private(i,j,a,b,max_r2_local) &
!$omp default(none)
max_r2_local = 0.d0
!$omp do
do b = 1, nV
do a = 1, nV
do j = 1, nO
do i = 1, nO
r2(i,j,a,b) = -r2(i,j,a,b)
max_r2_local = max(r2(i,j,a,b), max_r2_local)
enddo
enddo
enddo
enddo
!$omp end do nowait
!$omp critical
max_r2 = max(max_r2, max_r2_local)
!$omp end critical
!$omp end parallel
end

View File

@ -14,9 +14,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* t1, double* t1,
double* t2, double* t2,
double* tau, double* tau,
double* H_oo,
double* H_vv, double* H_vv,
double* g_occ, double* r2,
double* r2) double* max_r2)
{ {
int ngpus = 1; int ngpus = 1;
@ -636,7 +637,35 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
double* d_g_occ; double* d_g_occ;
lda = nO; lda = nO;
cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double)); cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double));
cublasSetMatrix(lda, nO, sizeof(double), g_occ, lda, d_g_occ, lda); cublasSetMatrix(lda, nO, sizeof(double), H_oo, lda, d_g_occ, lda);
double* d_X;
cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double));
alpha = 2.0;
beta = 0.0;
m=cholesky_mo_num; n=nO*nV;
A=d_cc_space_v_ov_chol; lda=cholesky_mo_num;
B=d_t1; ldb=1;
C=d_X; ldc=1;
cublasDgemv(handle, CUBLAS_OP_N, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0;
beta = 1.0;
m=cholesky_mo_num; n=nO*nO;
A=d_cc_space_v_oo_chol; lda=cholesky_mo_num;
B=d_X; ldb=1;
C=d_g_occ; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
cudaFree(d_X);
alpha = -1.0;
beta = 1.0;
m=nO*nV; n=nO*nO;
A=d_cc_space_v_ovoo; lda=nO*nV;
B=d_t1; ldb=1;
C=d_g_occ; ldc=1;
cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc);
alpha = 1.0; alpha = 1.0;
beta = 1.0; beta = 1.0;
@ -1272,13 +1301,14 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
cudaFree(d_tau); cudaFree(d_tau);
cudaFree(d_t1); cudaFree(d_t1);
cudaFree(d_tmp_cc); cudaFree(d_tmp_cc);
double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double)); double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double));
lda=nO*nO; lda=nO*nO;
cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda); cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda);
#pragma omp critical #pragma omp critical
{ {
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) { for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) {
r2[i] += r2_tmp[i]; r2[i] -= r2_tmp[i];
} }
} }
free(r2_tmp); free(r2_tmp);
@ -1290,6 +1320,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo
free(K1); free(K1);
free(J1); free(J1);
*max_r2 = 0.;
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) {
const double x = r2[i] > 0. ? r2[i] : -r2[i];
*max_r2 = *max_r2 > x ? *max_r2 : x;
}
} }

View File

@ -27,16 +27,17 @@ module gpu_module
end function end function
subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,& subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,&
H_vv, g_occ, r2) bind(C) H_oo, H_vv, r2, max_r2) bind(C)
import c_int, c_double, c_ptr import c_int, c_double, c_ptr
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
type(c_ptr), value :: gpu_data type(c_ptr), value :: gpu_data
real(c_double), intent(in) :: t1(nO,nV) real(c_double), intent(in) :: t1(nO,nV)
real(c_double), intent(in) :: t2(nO,nO,nV,nV) real(c_double), intent(in) :: t2(nO,nO,nV,nV)
real(c_double), intent(in) :: tau(nO,nO,nV,nV) real(c_double), intent(in) :: tau(nO,nO,nV,nV)
real(c_double), intent(in) :: H_oo(nO,nO)
real(c_double), intent(in) :: H_vv(nV,nV) real(c_double), intent(in) :: H_vv(nV,nV)
real(c_double), intent(in) :: g_occ(nO,nO)
real(c_double), intent(out) :: r2(nO,nO,nV,nV) real(c_double), intent(out) :: r2(nO,nO,nV,nV)
real(c_double), intent(out) :: max_r2
end subroutine end subroutine