diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index c473d14..5f52709 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -470,46 +470,46 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & -! call dgemm('N','N', nO, nO*nO*nO, nV, & -! 1d0, t1 , size(t1,1), & -! cc_space_v_vooo, size(cc_space_v_vooo,1), & -! 0d0, Y_oooo, size(Y_oooo,1)) -! -! !$omp parallel & -! !$omp private(u,v,i,j) & -! !$omp default(shared) -! !$omp do collapse(2) -! do j = 1, nO -! do i = 1, nO -! do v = 1, nO -! do u = 1, nO -! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) -! enddo -! enddo -! enddo -! enddo -! !$omp end do -! !$omp end parallel -! -! deallocate(Y_oooo) -! -! ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) -! call dgemm('N','N', nO*nO, nO*nO, nV*nV, & -! 1d0, tau , size(tau,1) * size(tau,2), & -! cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & -! 1d0, A1 , size(A1,1) * size(A1,2)) -! -! call dgemm('N','N',nO*nO,nV*nV,nO*nO, & -! 1d0, A1, size(A1,1) * size(A1,2), & -! tau, size(tau,1) * size(tau,2), & -! 0d0, r2, size(r2,1) * size(r2,2)) -! -! deallocate(A1) + call dgemm('N','N', nO, nO*nO*nO, nV, & + 1d0, t1 , size(t1,1), & + cc_space_v_vooo, size(cc_space_v_vooo,1), & + 0d0, Y_oooo, size(Y_oooo,1)) + + !$omp parallel & + !$omp private(u,v,i,j) & + !$omp default(shared) + !$omp do collapse(2) + do j = 1, nO + do i = 1, nO + do v = 1, nO + do u = 1, nO + A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) + enddo + enddo + enddo + enddo + !$omp end do + !$omp end parallel + + deallocate(Y_oooo) + + ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) + call dgemm('N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau , size(tau,1) * size(tau,2), & + cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & + 1d0, A1 , size(A1,1) * size(A1,2)) + + call dgemm('N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1, size(A1,1) * size(A1,2), & + tau, size(tau,1) * size(tau,2), & + 0d0, r2, size(r2,1) * size(r2,2)) + + deallocate(A1) call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,tau, & cc_space_v_vo_chol, cc_space_v_vv_chol, & - cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, & r2) double precision, allocatable :: X_oovv(:,:,:,:) diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index d4a4fd8..8b88b8f 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -67,6 +67,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* cc_space_v_oooo, double* cc_space_v_vooo, double* cc_space_v_oovv, + double* cc_space_v_vvoo, double* r2) { double* d_tau; @@ -121,24 +122,7 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo cudaMalloc((void **)&d_t1, nO * nV * sizeof(double)); cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); - lda = cholesky_mo_num * nV; - cudaMalloc((void **)&d_tmp_cc, lda * nV * sizeof(double)); - - alpha=1.0; beta=0.0; - m=cholesky_mo_num*nV; n=nV; k=nO; - A = d_cc_space_v_vo_chol; B = d_t1; C = d_tmp_cc; - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m); - - double* d_tmp_cc2; - cudaMalloc((void **)&d_tmp_cc2, cholesky_mo_num*nV*sizeof(double)); - - double* d_B1; - cudaMalloc((void**)&d_B1, nV*nV*BLOCK_SIZE*sizeof(double)); - - double* d_tmpB1; - cudaMalloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double)); - - #pragma sections + #pragma omp sections { #pragma omp section for (size_t i=0 ; i