From eaa7160884a91afa1f3835ecb254140dd1ce64a2 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 17 Jul 2023 14:00:54 +0200 Subject: [PATCH] Added g_occ --- devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f | 148 +++---------------- devel/ccsd_gpu/gpu.c | 66 ++++++++- devel/ccsd_gpu/gpu_module.f90 | 23 +-- 3 files changed, 81 insertions(+), 156 deletions(-) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index 2c55310..c6b376e 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -462,40 +462,26 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) block_size = 16 call set_multiple_levels_omp(.False.) - call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, & - cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & - cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & - cc_space_f_vo, H_vv, r2) - -! call compute_g_vir_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,t2, & -! cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & -! cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & -! cc_space_f_vo, H_vv, r2) - - -!--- double precision, allocatable :: g_occ(:,:) allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & + !$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) & + !$omp private(i,j,a,u) & !$omp default(none) !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO + do i = 1, nO + do u = 1, nO + g_occ(u,i) = H_oo(u,i) + enddo + enddo + !$omp end do + + !$omp do + do i = 1, nO + do j = 1, nO + do a = 1, nV do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) + g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a) enddo enddo enddo @@ -503,8 +489,14 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel - deallocate(X_oovv) + call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, & + cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & + cc_space_f_vo, H_vv, g_occ, r2) + +!--- + double precision, allocatable :: X_oovv(:,:,:,:) double precision, allocatable :: X_vovv(:,:,:,:) allocate(X_vovv(nV,nO,nV,block_size)) @@ -894,104 +886,6 @@ end ! g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - - implicit none - - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) - - integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, g_occ, size(g_occ,1)) - - !$omp parallel & - !$omp shared(nO,nV,g_occ,H_oo, cc_space_v_ovoo,t1) & - !$omp private(i,j,a,u) & - !$omp default(none) - !$omp do - do i = 1, nO - do u = 1, nO - g_occ(u,i) = g_occ(u,i) + H_oo(u,i) - enddo - enddo - !$omp end do - - !$omp do - do i = 1, nO - do j = 1, nO - do a = 1, nV - do u = 1, nO - g_occ(u,i) = g_occ(u,i) + (2d0 * cc_space_v_ovoo(u,a,i,j) - cc_space_v_ovoo(u,a,j,i)) * t1(j,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - -end - -! g_vir - -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - use gpu_module - implicit none - - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) - - integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - -! do beta = 1, nV -! do a = 1, nV -! g_vir(a,beta) = H_vv(a,beta) -! enddo -! enddo - -! call dgemm('N','N',nV,nV,nO, & -! -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & -! t1 , size(t1,1), & -! 1d0, g_vir, size(g_vir,1)) - - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) -! allocate(tmp_k(cholesky_mo_num)) -! call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & -! cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) -! -! call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & -! cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & -! g_vir, nV*nV) -! deallocate(tmp_k) - -! allocate(tmp_vo(cholesky_mo_num,nV,nO)) -! call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & -! cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) -! -! allocate(tmp_vo2(cholesky_mo_num,nO,nV)) -! do beta=1,nV -! do i=1,nO -! do k=1,cholesky_mo_num -! tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) -! enddo -! enddo -! enddo -! deallocate(tmp_vo) -! -! call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & -! cc_space_v_ov_chol, cholesky_mo_num*nO, & -! tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) - -end - -! J1 subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) implicit none diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index b2a835b..f29b5e9 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -72,6 +72,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* cc_space_v_vvoo, double* cc_space_f_vo, double* H_vv, + double* g_occ, double* r2) { @@ -87,7 +88,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo int ithread = omp_get_thread_num(); int igpu = ithread ; - +//igpu=1; cudaSetDevice(igpu); cublasHandle_t handle; @@ -131,6 +132,10 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); + double* d_cc_space_f_vo; + cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); + cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); + #pragma omp sections { @@ -208,10 +213,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaMalloc((void**)&d_g_vir, nV*nV*sizeof(double)); cublasSetMatrix(nV, nV, sizeof(double), H_vv, nV, d_g_vir, nV); - double* d_cc_space_f_vo; - cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double)); - cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV); - alpha = -1.0; beta = 1.0; m=nV ; n=nV; k=nO; @@ -219,7 +220,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo B = d_t1 ; ldb = nO; C = d_g_vir; ldc = nV; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); - cudaFree(d_cc_space_f_vo); double* d_tmp_k; cudaMalloc((void**)&d_tmp_k, cholesky_mo_num*sizeof(double)); @@ -302,8 +302,55 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_Y_oovv); } - } + // g_occ + #pragma omp section + { + double* d_g_occ; + lda = nO; + cudaMalloc((void **)&d_g_occ, nO*nO * sizeof(double)); + cublasSetMatrix(lda, nO, sizeof(double), g_occ, lda, d_g_occ, lda); + alpha = 1.0; + beta = 1.0; + m=nO; n=nO; k=nV; + A=d_t1; lda=nO; + B=d_cc_space_f_vo; ldb=nV; + C=d_g_occ; ldc=nO; + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + + double* d_X_oovv; + lda = nO*nO; + cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double)); + + alpha = 1.0; + beta = 0.0; + m=nO; n=nO*nV*nV; k=nO; + A=d_g_occ; lda=nO; + B=d_t2; ldb=nO; + C=d_X_oovv; ldc=nO; + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + cudaFree(d_g_occ); + + alpha = 1.0; + beta = -1.0; + A = d_r2; lda = nO*nO; + B = d_X_oovv; ldb = nO*nO; + C = d_r2; ldc = nO*nO; + cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); + for (int j=0 ; j