diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index 691074c..40eb3af 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -490,14 +490,11 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_da !$omp end do !$omp end parallel - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - double precision, allocatable :: K1(:,:,:,:) allocate(K1(nO,nV,nO,nV)) call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,gpu_data,t1,t2,tau, & - H_vv, g_occ, J1, K1, r2) + H_vv, g_occ, K1, r2) !--- double precision, allocatable :: X_oovv(:,:,:,:) @@ -507,73 +504,8 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2,gpu_da double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(J1) - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) - - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) - - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) - - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) allocate(X_ovov(nO,nV,nO,nV)) allocate(Y_ovov(nO,nV,nO,nV)) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index a5827d2..49a78f8 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -16,7 +16,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* tau, double* H_vv, double* g_occ, - double* J1, double* K1, double* r2) { @@ -24,6 +23,8 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo int ngpus = 1; cudaGetDeviceCount(&ngpus); + double* J1 = malloc(nO*nV*nV*nO*sizeof(double)); + #pragma omp parallel num_threads(ngpus) { int m,n,k, lda, ldb, ldc; @@ -233,36 +234,6 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo C = d_r2; ldc = nO*nO; cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nO, nV*nV, &alpha, A, lda, &beta, B, ldb, C, ldc); -/* - double * Y_oovv = malloc(nO*nO*nV*nV*sizeof(double)); - lda=nO*nO; - cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_Y_oovv, lda, Y_oovv, lda); - cudaFree(d_Y_oovv); - - double * r2_tmp = malloc(nO*nO*nV*nV*sizeof(double)); - lda=nO*nO; - cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_r2, lda, r2_tmp, lda); - - for (int j=0 ; j