diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index f22611b..0de043a 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -489,11 +489,15 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) !$omp end do !$omp end parallel + double precision, allocatable :: J1(:,:,:,:) + allocate(J1(nO,nV,nV,nO)) + J1 = 0.d0 + call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,t2,tau, & cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, & cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & - cc_space_v_oovo, cc_space_v_ovvo, & - cc_space_f_vo, H_vv, g_occ, r2) + cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovoo, & + cc_space_f_vo, H_vv, g_occ, J1, r2) !--- @@ -504,8 +508,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & cc_space_v_vvoo,J1) @@ -735,90 +737,15 @@ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) double precision, intent(in) :: t2(nO, nO, nV, nV) double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + double precision, intent(inout) :: J1(nO, nV, nV, nO) integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) - - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel deallocate(X_ovoo) double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) - - do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 041b84b..346a6cd 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -73,9 +73,11 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* cc_space_v_vvoo, double* cc_space_v_oovo, double* cc_space_v_ovvo, + double* cc_space_v_ovoo, double* cc_space_f_vo, double* H_vv, double* g_occ, + double* J1, double* r2) { @@ -132,6 +134,11 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double)); cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda); + double* d_cc_space_v_ovvo; + lda = nO*nV; + cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); + cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); + lda = nO; cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); @@ -499,11 +506,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo } cudaFree(d_cc_space_v_oovo); - double* d_cc_space_v_ovvo; - lda = nO*nV; - cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double)); - cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda); - double* d_X_vovo; lda = nV*nO; cudaMalloc((void **)&d_X_vovo, nV*nO*nV*nO * sizeof(double)); @@ -518,7 +520,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nO, &alpha, A, lda, &beta, B, ldb, C, ldc); } } - cudaFree(d_cc_space_v_ovvo); double* d_Y_oovo; lda = nO*nO; @@ -558,6 +559,107 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo } cudaFree(d_X_oovv); } + + + #pragma omp section + { + + double* d_J1; + lda = nO*nV; + cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double)); + + alpha = 1.0; + beta = 0.0; + A = d_cc_space_v_ovvo; lda = nO*nV; + B = d_cc_space_v_ovvo; ldb = nO*nV; + C = d_J1; ldc = nO*nV; + cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); + + + double* d_cc_space_v_ovoo; + lda = nO*nV; + cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double)); + cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda); + + double* d_X_ovoo; + lda = nO*nV; + cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); + for (int j=0 ; j