diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
index a30b0ee..785a5ca 100644
--- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
@@ -127,10 +127,10 @@ subroutine run_ccsd_space_orb
 
     ! Residue
     if (do_ao_cholesky) then
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
       call compute_H_vo_chol(nO,nV,t1,H_vo)
-      call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_vv);
+      call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x);
       call compute_H_oo_chol_gpu(gpu_data,nO,nV,0,H_oo)
+      call compute_H_vv_chol_gpu(gpu_data,nO,nV,0,H_vv)
 
       call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
       call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
index 70612e7..1d28808 100644
--- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
@@ -290,105 +290,6 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
 
 end
 
-! H_oo
-
-subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_oo(nO, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:)
-
-  allocate(tau_kau(cholesky_mo_num,nV,nO))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,u,j,k,a,b,tmp_vov)
-  allocate(tmp_vov(nV,nO,nV) )
-  !$omp do
-  do u = 1, nO
-    do b=1,nV
-      do j=1,nO
-        do a=1,nV
-          tmp_vov(a,j,b) = tau_x(u,j,a,b)
-        enddo
-      enddo
-    enddo
-    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
-      0.d0, tau_kau(1,1,u), cholesky_mo_num)
-  enddo
-  !$omp end do nowait
-  deallocate(tmp_vov)
-  !$omp do
-  do i = 1, nO
-    do u = 1, nO
-      H_oo(u,i) = cc_space_f_oo(u,i)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end parallel
-  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
-
-end
-
-! H_vv
-
-subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_vv(nV, nV)
-
-  integer :: a,b,i,j,u,k, beta
-
-  double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
-
-  allocate(tau_kia(cholesky_mo_num,nO,nV))
-  !$omp parallel &
-  !$omp default(shared) &
-  !$omp private(i,beta,j,k,a,b,tmp_oov)
-  allocate(tmp_oov(nO,nO,nV) )
-  !$omp do
-  do a = 1, nV
-    do b=1,nV
-      do j=1,nO
-        do i=1,nO
-          tmp_oov(i,j,b) = tau_x(i,j,a,b)
-        enddo
-      enddo
-    enddo
-    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
-      0.d0, tau_kia(1,1,a), cholesky_mo_num)
-  enddo
-  !$omp end do nowait
-  deallocate(tmp_oov)
-
-  !$omp do
-  do beta = 1, nV
-    do a = 1, nV
-      H_vv(a,beta) = cc_space_f_vv(a,beta)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp barrier
-  !$omp end parallel
-  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
-    tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, &
-    1.d0, H_vv, nV)
-
-end
-
 ! H_vo
 
 subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
@@ -439,41 +340,3 @@ subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
 
 end
 
-subroutine compute_H_oo_chol2(nO,nV,tau_x,H_oo)
-
-  implicit none
-
-  integer, intent(in)           :: nO,nV
-  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
-  double precision, intent(out) :: H_oo(nO, nO)
-
-  integer :: a,b,i,j,u,k
-
-  double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:), tmp_ovv(:,:,:)
-
-  allocate(tau_kau(cholesky_mo_num,nV,nO))
-  allocate(tmp_vov(nV,nO,nV) )
-  allocate(tmp_ovv(nO,nV,nV) )
-  do u = 1, nO
-    call dcopy(nO*nV*nV, tau_x(u,1,1,1), nO, tmp_ovv, 1)
-    print *, u
-    print *, tmp_ovv
-    do b=1,nV
-      do j=1,nO
-        do a=1,nV
-          tmp_vov(a,j,b) = tmp_ovv(j,a,b)
-        enddo
-      enddo
-    enddo
-    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
-      0.d0, tau_kau(1,1,u), cholesky_mo_num)
-  enddo
-  deallocate(tmp_vov)
-  call dcopy(nO*nO, cc_space_f_oo, 1, H_oo, 1);
-  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
-    tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
-    1.d0, H_oo, nO)
-
-end
-
diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c
index 540c7d5..5d75928 100644
--- a/devel/ccsd_gpu/gpu.c
+++ b/devel/ccsd_gpu/gpu.c
@@ -11,8 +11,7 @@ void gpu_upload(gpu_data* data,
                 double* t1,
                 double* t2,
                 double* tau,
-                double* tau_x,
-                double* H_vv)
+                double* tau_x)
 {
   int lda;
   const int cholesky_mo_num = data->cholesky_mo_num;
@@ -40,10 +39,6 @@ void gpu_upload(gpu_data* data,
     double* d_t2 = data[igpu].t2;
     lda = nO*nO;
     cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
-
-    double* d_H_vv = data[igpu].H_vv;
-    lda = nV;
-    cublasSetMatrix(nV, nV, sizeof(double), H_vv, lda, d_H_vv, lda);
   }
 
 }
@@ -1353,6 +1348,7 @@ void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_o
 {
   int ngpus = 1;
   if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
+  igpu = igpu % ngpus;
 
   const int cholesky_mo_num = data[igpu].cholesky_mo_num;
   cudaSetDevice(igpu);
@@ -1437,3 +1433,88 @@ void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_o
 
   cublasDestroy(handle);
 }
+
+
+
+
+void compute_h_vv_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_vv)
+{
+  int ngpus = 1;
+  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
+  igpu = igpu % ngpus;
+
+  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
+  cudaSetDevice(igpu);
+
+  int m,n,k, lda, ldb, ldc;
+  double alpha, beta;
+  double* A;
+  double* B;
+  double* C;
+  cudaStream_t stream[nV];
+
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+
+  double* d_H_vv = data[igpu].H_vv;
+  double* d_tau_x = data[igpu].tau_x;
+  double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
+  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
+
+  double* d_tau_kia;
+  cudaMalloc((void **)&d_tau_kia, cholesky_mo_num*nO*nV * sizeof(double));
+
+  double* d_tmp_oov;
+  cudaMalloc((void **)&d_tmp_oov, nO*nO*nV * sizeof(double));
+
+  alpha = 1.0;
+  beta = 0.0;
+//  for (int i=0 ; i