From a541abd04a4d0e9535506989ec0abb76237fd95a Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 3 Aug 2023 03:21:32 +0200 Subject: [PATCH] r2 on GPU --- devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f | 76 +-------------- devel/ccsd_gpu/gpu.c | 98 ++++++++++++++++++-- devel/ccsd_gpu/gpu_module.f90 | 3 +- 3 files changed, 91 insertions(+), 86 deletions(-) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index bbe46ea..3ae519a 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -490,82 +490,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_da !$omp end do !$omp end parallel - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, & - H_vv, g_occ, K1, r2) - -!--- - double precision, allocatable :: X_oovv(:,:,:,:) - double precision, allocatable :: X_vovv(:,:,:,:) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - - double precision, allocatable :: Y_voov(:,:,:,:) - double precision, allocatable :: Z_ovov(:,:,:,:) - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(K1) - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - - deallocate(X_ovov,Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) + H_vv, g_occ, r2) ! Change the sign for consistency with the code in spin orbitals diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 60c7503..180893f 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -16,7 +16,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* tau, double* H_vv, double* g_occ, - double* K1, double* r2) { @@ -24,6 +23,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaGetDeviceCount(&ngpus); double* J1 = malloc(nO*nV*nV*nO*sizeof(double)); + double* K1 = malloc(nO*nV*nV*nO*sizeof(double)); #pragma omp parallel num_threads(ngpus) { @@ -629,13 +629,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cublasSetStream(handle, NULL); cudaFree(d_Y_oovv); } - } // end sections - - lda = nO*nV; - cublasSetMatrix(lda, nO*nV, sizeof(double), K1, lda, d_K1, lda); - - #pragma omp sections - { // g_occ #pragma omp section @@ -693,6 +686,14 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_X_oovv); } + } // end sections + + lda = nO*nV; + cublasSetMatrix(lda, nO*nV, sizeof(double), K1, lda, d_K1, lda); + + #pragma omp sections + { + #pragma omp section { double* d_X_vovv; @@ -1112,10 +1113,87 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo #pragma omp section { + double* d_X_ovov; + cudaMalloc((void **)&d_X_ovov, nO*nV*nO*nV * sizeof(double)); + + double* d_Y_ovov; + cudaMalloc((void **)&d_Y_ovov, nO*nV*nO*nV * sizeof(double)); + + + for (int i=0 ; i