From d3c87d8181c78f378e49945f5c8b7be69e146564 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 3 Aug 2023 12:00:09 +0200 Subject: [PATCH] r2 fully on GPU --- devel/ccsd_gpu/ccsd_space_orb_sub.irp.f | 3 +- devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f | 81 -------------------- devel/ccsd_gpu/gpu.c | 44 ++++++++++- devel/ccsd_gpu/gpu_module.f90 | 5 +- 4 files changed, 45 insertions(+), 88 deletions(-) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f index 6baf904..fafd124 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f @@ -132,7 +132,8 @@ subroutine run_ccsd_space_orb call compute_H_vo_chol(nO,nV,t1,H_vo) call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data) + call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, & + H_oo, H_vv, r2, max_r2) else call compute_H_oo(nO,nV,t1,t2,tau,H_oo) call compute_H_vv(nO,nV,t1,t2,tau,H_vv) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index 3ae519a..b1c54bf 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -439,84 +439,3 @@ subroutine compute_H_vo_chol(nO,nV,t1,H_vo) end -! R2 - -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_data) - use gpu_module - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) - type(c_ptr), intent(in) :: gpu_data - - ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 - - ! internal - integer :: u,v,i,j,beta,gam,a,b - double precision :: max_r2_local - double precision, allocatable :: Y_oovv(:,:,:,:) - - integer :: block_size, iblock, k - block_size = 16 - call set_multiple_levels_omp(.False.) - - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - !$omp parallel & - !$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) & - !$omp private(i,j,a,u) & - !$omp default(none) - !$omp do - do i = 1, nO - do u = 1, nO - g_occ(u,i) = H_oo(u,i) - enddo - enddo - !$omp end do - - !$omp do - do i = 1, nO - do j = 1, nO - do a = 1, nV - do u = 1, nO - g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, & - H_vv, g_occ, r2) - - ! Change the sign for consistency with the code in spin orbitals - - max_r2 = 0d0 - !$omp parallel & - !$omp shared(nO,nV,r2,max_r2) & - !$omp private(i,j,a,b,max_r2_local) & - !$omp default(none) - max_r2_local = 0.d0 - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - max_r2 = max(max_r2, max_r2_local) - !$omp end critical - !$omp end parallel - -end - diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 180893f..554d233 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -14,9 +14,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* t1, double* t2, double* tau, + double* H_oo, double* H_vv, - double* g_occ, - double* r2) + double* r2, + double* max_r2) { int ngpus = 1; @@ -636,7 +637,35 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* d_g_occ; lda = nO; cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double)); - cublasSetMatrix(lda, nO, sizeof(double), g_occ, lda, d_g_occ, lda); + cublasSetMatrix(lda, nO, sizeof(double), H_oo, lda, d_g_occ, lda); + + double* d_X; + cudaMalloc((void **)&d_X, cholesky_mo_num*sizeof(double)); + + alpha = 2.0; + beta = 0.0; + m=cholesky_mo_num; n=nO*nV; + A=d_cc_space_v_ov_chol; lda=cholesky_mo_num; + B=d_t1; ldb=1; + C=d_X; ldc=1; + cublasDgemv(handle, CUBLAS_OP_N, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc); + + alpha = 1.0; + beta = 1.0; + m=cholesky_mo_num; n=nO*nO; + A=d_cc_space_v_oo_chol; lda=cholesky_mo_num; + B=d_X; ldb=1; + C=d_g_occ; ldc=1; + cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc); + + cudaFree(d_X); + alpha = -1.0; + beta = 1.0; + m=nO*nV; n=nO*nO; + A=d_cc_space_v_ovoo; lda=nO*nV; + B=d_t1; ldb=1; + C=d_g_occ; ldc=1; + cublasDgemv(handle, CUBLAS_OP_T, m, n, &alpha, A, lda, B, ldb, &beta, C, ldc); alpha = 1.0; beta = 1.0; @@ -1272,13 +1301,14 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_tau); cudaFree(d_t1); cudaFree(d_tmp_cc); + double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double)); lda=nO*nO; cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda); #pragma omp critical { for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) { - r2[i] += r2_tmp[i]; + r2[i] -= r2_tmp[i]; } } free(r2_tmp); @@ -1290,6 +1320,12 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo free(K1); free(J1); + *max_r2 = 0.; + for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) { + const double x = r2[i] > 0. ? r2[i] : -r2[i]; + *max_r2 = *max_r2 > x ? *max_r2 : x; + } + } diff --git a/devel/ccsd_gpu/gpu_module.f90 b/devel/ccsd_gpu/gpu_module.f90 index ed8d2a7..dbc75bc 100644 --- a/devel/ccsd_gpu/gpu_module.f90 +++ b/devel/ccsd_gpu/gpu_module.f90 @@ -27,16 +27,17 @@ module gpu_module end function subroutine compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num, gpu_data, t1, t2, tau,& - H_vv, g_occ, r2) bind(C) + H_oo, H_vv, r2, max_r2) bind(C) import c_int, c_double, c_ptr integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num type(c_ptr), value :: gpu_data real(c_double), intent(in) :: t1(nO,nV) real(c_double), intent(in) :: t2(nO,nO,nV,nV) real(c_double), intent(in) :: tau(nO,nO,nV,nV) + real(c_double), intent(in) :: H_oo(nO,nO) real(c_double), intent(in) :: H_vv(nV,nV) - real(c_double), intent(in) :: g_occ(nO,nO) real(c_double), intent(out) :: r2(nO,nO,nV,nV) + real(c_double), intent(out) :: max_r2 end subroutine