diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index adf42b9..f22611b 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -492,6 +492,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, & cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & + cc_space_v_oovo, cc_space_v_ovvo, & cc_space_f_vo, H_vv, g_occ, r2) @@ -501,83 +502,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - deallocate(Y_oovo) - - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) - double precision, allocatable :: J1(:,:,:,:) allocate(J1(nO,nV,nV,nO)) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index b355c3b..041b84b 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -71,6 +71,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* cc_space_v_vooo, double* cc_space_v_oovv, double* cc_space_v_vvoo, + double* cc_space_v_oovo, + double* cc_space_v_ovvo, double* cc_space_f_vo, double* H_vv, double* g_occ, @@ -460,6 +462,102 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_X_ovvo); } + #pragma omp section + { + double* d_cc_space_v_oovo; + lda = nO*nO; + cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double)); + cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda); + + double* d_X_oovv; + lda = nO*nO; + cudaMalloc((void **)&d_X_oovv, nO*nO*nV*nV * sizeof(double)); + + alpha = 1.0; + beta = 0.0; + m=nO*nO*nV; n=nV; k=nO; + A=d_cc_space_v_oovo; lda=nO*nO*nV; + B=d_t1; ldb=nO; + C=d_X_oovv; ldc=nO*nO*nV; + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + + alpha = 1.0; + beta = -1.0; + A = d_r2; lda = nO*nO; + B = d_X_oovv; ldb = nO*nO; + C = d_r2; ldc = nO*nO; + cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); + for (int j=0 ; j