diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index 40eb3af..bbe46ea 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -507,62 +507,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_da double precision, allocatable :: Y_voov(:,:,:,:) double precision, allocatable :: Z_ovov(:,:,:,:) double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) - - !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(Z_ovov) allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) !$omp parallel & diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 49a78f8..60c7503 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -62,6 +62,9 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* d_t2; double* d_tmp_cc; + double* d_K1; + cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double)); + lda = nO * nO; cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double)); cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); @@ -82,6 +85,377 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo #pragma omp sections { + #pragma omp section + { + double* d_J1; + cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double)); + + alpha = 1.0; + beta = 0.0; + A = d_cc_space_v_ovvo; lda = nO*nV; + B = d_cc_space_v_ovvo; ldb = nO*nV; + C = d_J1; ldc = nO*nV; + cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO, &alpha, A, lda, &beta, B, ldb, C, ldc); + + + double* d_X_ovoo; + cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double)); + alpha = 0.0; + beta = 1.0; + for (int i=0 ; i