diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f index 4de8ef5..c473d14 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f @@ -457,102 +457,60 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + integer :: block_size, iblock, k + block_size = 16 call set_multiple_levels_omp(.False.) - !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: A1(:,:,:,:) allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) - deallocate(A1) + double precision, allocatable :: Y_oooo(:,:,:,:) + allocate(Y_oooo(nO,nO,nO,nO)) - integer :: block_size, iblock, k - block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,tau,cc_space_v_vo_chol, & - cc_space_v_vv_chol, r2) +! call dgemm('N','N', nO, nO*nO*nO, nV, & +! 1d0, t1 , size(t1,1), & +! cc_space_v_vooo, size(cc_space_v_vooo,1), & +! 0d0, Y_oooo, size(Y_oooo,1)) +! +! !$omp parallel & +! !$omp private(u,v,i,j) & +! !$omp default(shared) +! !$omp do collapse(2) +! do j = 1, nO +! do i = 1, nO +! do v = 1, nO +! do u = 1, nO +! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) +! enddo +! enddo +! enddo +! enddo +! !$omp end do +! !$omp end parallel +! +! deallocate(Y_oooo) +! +! ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) +! call dgemm('N','N', nO*nO, nO*nO, nV*nV, & +! 1d0, tau , size(tau,1) * size(tau,2), & +! cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & +! 1d0, A1 , size(A1,1) * size(A1,2)) +! +! call dgemm('N','N',nO*nO,nV*nV,nO*nO, & +! 1d0, A1, size(A1,1) * size(A1,2), & +! tau, size(tau,1) * size(tau,2), & +! 0d0, r2, size(r2,1) * size(r2,2)) +! +! deallocate(A1) -! allocate(tmp_cc(cholesky_mo_num,nV,nV)) -! call gemm0(nO, nV, cholesky_mo_num, cc_space_v_vo_chol, t1, tmp_cc) -! -!! call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & -!! cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) -! -! call set_multiple_levels_omp(.False.) -! -! !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) -! allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) -! !$OMP DO -! do gam = 1, nV -!! -! do a=1,nV -! do k=1,cholesky_mo_num -! tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) -! enddo -! enddo -! -! do iblock = 1, nV, block_size -! -! call gemm1(iblock-1, nV, cholesky_mo_num, tmp_cc, cc_space_v_vv_chol(1,1,gam), tmpB1) -! -!! call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & -!! -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & -!! cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & -!! 0.d0, tmpB1, nV*block_size) -! -! call gemm2(iblock-1, nV, cholesky_mo_num, tmp_cc2, cc_space_v_vv_chol, tmpB1) -! -!! call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, 1.d0, & -!! cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & -!! tmp_cc2, cholesky_mo_num, & -!! 1.d0, tmpB1, nV*block_size) -! -! do beta = iblock, min(nV, iblock+block_size-1) -! do b = 1, nV -! do a = 1, nV -! B1(a,b,beta-iblock+1) = tmpB1(a,beta-iblock+1,b) -! enddo -! enddo -! enddo -! -! call gemm3(iblock-1, nO, nV, gam-1, tau, B1, r2) -! -!! call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & -!! 1d0, tau, nO*nO, & -!! B1 , nV*nV, & -!! 1d0, r2(1,1,iblock,gam), nO*nO) -! enddo -! -! enddo -! !$OMP ENDDO -! -! deallocate(B1, tmpB1, tmp_cc2) -! !$OMP END PARALLEL -! -! deallocate(tmp_cc) + call compute_r2_space_chol_gpu(nO,nV,cholesky_mo_num,t1,tau, & + cc_space_v_vo_chol, cc_space_v_vv_chol, & + cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, & + r2) double precision, allocatable :: X_oovv(:,:,:,:) allocate(X_oovv(nO,nO,nV,nV)) @@ -1021,58 +979,6 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) end -! A1 - - -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - - implicit none - - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) - - integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) - - ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) - ! A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) - - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Y_oooo) - - ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) - -end - ! g_occ diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 2a4a0c3..d4a4fd8 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -63,6 +64,9 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* tau, double* cc_space_v_vo_chol, double* cc_space_v_vv_chol, + double* cc_space_v_oooo, + double* cc_space_v_vooo, + double* cc_space_v_oovv, double* r2) { double* d_tau; @@ -134,6 +138,83 @@ void compute_r2_space_chol_gpu(const int nO, const int nV, const int cholesky_mo double* d_tmpB1; cudaMalloc((void**)&d_tmpB1, nV*BLOCK_SIZE*nV*sizeof(double)); + #pragma sections + { + #pragma omp section + for (size_t i=0 ; i