diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index c6b376e..b1bb4ac 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -499,41 +499,41 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) double precision, allocatable :: X_oovv(:,:,:,:) double precision, allocatable :: X_vovv(:,:,:,:) - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) - - do iblock = 1, nV, block_size - do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) - - enddo - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) - - enddo - deallocate(X_vovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) +! allocate(X_vovv(nV,nO,nV,block_size)) +! allocate(Y_oovv(nO,nO,nV,nV)) +! +! do iblock = 1, nV, block_size +! do gam = iblock, min(nV, iblock+block_size-1) +! call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & +! cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & +! cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) +! +! enddo +! call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & +! 1d0, t1 , size(t1,1), & +! X_vovv, size(X_vovv,1), & +! 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) +! +! enddo +! deallocate(X_vovv) +! +! !$omp parallel & +! !$omp shared(nO,nV,r2,Y_oovv) & +! !$omp private(u,v,gam,beta) & +! !$omp default(none) +! !$omp do +! do gam = 1, nV +! do beta = 1, nV +! do v = 1, nO +! do u = 1, nO +! r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) +! enddo +! enddo +! enddo +! enddo +! !$omp end do +! !$omp end parallel +! deallocate(Y_oovv) double precision, allocatable :: X_ovvo(:,:,:,:) double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) @@ -884,7 +884,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) end -! g_occ subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index f29b5e9..59067b8 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -350,8 +350,58 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaFree(d_X_oovv); } + #pragma omp section + { + double* d_X_vovv; + lda = nV*nO; + cudaMalloc((void **)&d_X_vovv, nV*nO*nV*BLOCK_SIZE * sizeof(double)); + + double* d_Y_oovv; + lda = nO*nO; + cudaMalloc((void **)&d_Y_oovv, nO*nO*nV*nV * sizeof(double)); + + for (size_t iblock=0 ; iblock