diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
index 4f9f6a3..a30b0ee 100644
--- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
@@ -120,17 +120,18 @@ subroutine run_ccsd_space_orb
   gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
       cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
       cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
-      cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, cc_space_f_vo)
+      cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
+      cc_space_f_oo, cc_space_f_vo, cc_space_f_vv)
 
   do while (not_converged)
 
     ! Residue
     if (do_ao_cholesky) then
-      call compute_H_oo_chol(nO,nV,tau_x,H_oo)
       call compute_H_vv_chol(nO,nV,tau_x,H_vv)
       call compute_H_vo_chol(nO,nV,t1,H_vo)
+      call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_vv);
+      call compute_H_oo_chol_gpu(gpu_data,nO,nV,0,H_oo)
-      call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x, H_oo, H_vv);
 
       call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
       call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
     else
diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
index b1c54bf..70612e7 100644
--- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
@@ -439,3 +439,41 @@ subroutine compute_H_vo_chol(nO,nV,t1,H_vo)
 
 end
 
+subroutine compute_H_oo_chol2(nO,nV,tau_x,H_oo)
+
+  implicit none
+
+  integer, intent(in)           :: nO,nV
+  double precision, intent(in)  :: tau_x(nO, nO, nV, nV)
+  double precision, intent(out) :: H_oo(nO, nO)
+
+  integer :: a,b,i,j,u,k
+
+  double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:), tmp_ovv(:,:,:)
+
+  allocate(tau_kau(cholesky_mo_num,nV,nO))
+  allocate(tmp_vov(nV,nO,nV) )
+  allocate(tmp_ovv(nO,nV,nV) )
+  do u = 1, nO
+    call dcopy(nO*nV*nV, tau_x(u,1,1,1), nO, tmp_ovv, 1)
+    print *, u
+    print *, tmp_ovv
+    do b=1,nV
+      do j=1,nO
+        do a=1,nV
+          tmp_vov(a,j,b) = tmp_ovv(j,a,b)
+        enddo
+      enddo
+    enddo
+    call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+      cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, &
+      0.d0, tau_kau(1,1,u), cholesky_mo_num)
+  enddo
+  deallocate(tmp_vov)
+  call dcopy(nO*nO, cc_space_f_oo, 1, H_oo, 1);
+  call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
+    tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, &
+    1.d0, H_oo, nO)
+
+end
+
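A note on the CPU reference routine compute_H_oo_chol2 added above: reading the two dgemm calls off the code, the intermediate and the occupied-occupied block they assemble are (u, i, j occupied; a, b virtual; gamma the Cholesky index; L the Cholesky vectors cc_space_v_ov_chol / cc_space_v_vo_chol):

    tau_kau(gamma,a,u) = sum_{j,b} L(gamma,j,b) * tau_x(u,j,a,b)
    H_oo(u,i)          = f_oo(u,i) + sum_{gamma,a} tau_kau(gamma,a,u) * L(gamma,a,i)

i.e. the Fock oo block plus the Cholesky-factorised contraction of tau_x with two three-index vectors. The GPU routine compute_H_oo_chol_gpu called from run_ccsd_space_orb (its C implementation is not shown in this diff) is presumably expected to reproduce the same quantity on the device.
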
diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c
index 1c36b54..540c7d5 100644
--- a/devel/ccsd_gpu/gpu.c
+++ b/devel/ccsd_gpu/gpu.c
@@ -12,7 +12,6 @@ void gpu_upload(gpu_data* data,
                 double* t2,
                 double* tau,
                 double* tau_x,
-                double* H_oo,
                 double* H_vv)
 {
   int lda;
@@ -21,7 +20,6 @@ void gpu_upload(gpu_data* data,
 
   int ngpus = 1;
   if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
-
   #pragma omp parallel num_threads(ngpus)
   {
     int igpu = omp_get_thread_num();
@@ -43,10 +41,6 @@ void gpu_upload(gpu_data* data,
     lda = nO*nO;
     cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
 
-    double* d_H_oo = data[igpu].H_oo;
-    lda = nO;
-    cublasSetMatrix(nO, nO, sizeof(double), H_oo, lda, d_H_oo, lda);
-
     double* d_H_vv = data[igpu].H_vv;
     lda = nV;
     cublasSetMatrix(nV, nV, sizeof(double), H_vv, lda, d_H_vv, lda);
@@ -75,8 +69,8 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
 
     int igpu = omp_get_thread_num();
     cudaSetDevice(igpu);
-    cublasHandle_t handle;
 
+    cublasHandle_t handle;
     cublasCreate(&handle);
 
     double* d_r2;
@@ -91,13 +85,15 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
     double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
     double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo;
     double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo;
-    double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv; 
+    double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
     double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo;
     double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
     double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo;
     double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
     double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
+    double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
     double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
+    double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
     double* d_tau = data[igpu].tau;
     double* d_t1 = data[igpu].t1;
     double* d_t2 = data[igpu].t2;
@@ -127,9 +123,9 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
     cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
     alpha = 0.0;
     beta = 1.0;
-    for (int i=0 ; i
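The struct behind data[igpu] is declared in gpu.h, which this diff does not touch. Below is a minimal sketch of what the new members and their one-time upload presumably look like, following the cudaMalloc/cublasSetMatrix pattern already used above for t2 and H_vv; the names fock_blocks and upload_fock_blocks are illustrative only and are not the repository's API.

/* Sketch (assumption): judging from the fields dereferenced in gpu.c
 * (data[igpu].cc_space_f_oo, .cc_space_f_vo, .cc_space_f_vv), the per-GPU
 * gpu_data structure gains three device pointers for the Fock blocks, and
 * gpu_init presumably allocates and fills them once, since the Fock matrix
 * is constant across CCSD iterations. */
#include <cuda_runtime.h>
#include <cublas_v2.h>

typedef struct {
    double* cc_space_f_oo;   /* nO x nO block, copied from host cc_space_f_oo */
    double* cc_space_f_vo;   /* nV x nO block, copied from host cc_space_f_vo */
    double* cc_space_f_vv;   /* nV x nV block, copied from host cc_space_f_vv */
} fock_blocks;               /* subset of the full per-GPU gpu_data structure */

/* Allocate the device buffers and copy the host Fock blocks (column-major). */
static void upload_fock_blocks(fock_blocks* d, int nO, int nV,
                               const double* f_oo, const double* f_vo,
                               const double* f_vv)
{
    cudaMalloc((void**)&d->cc_space_f_oo, nO*nO*sizeof(double));
    cudaMalloc((void**)&d->cc_space_f_vo, nV*nO*sizeof(double));
    cudaMalloc((void**)&d->cc_space_f_vv, nV*nV*sizeof(double));
    cublasSetMatrix(nO, nO, sizeof(double), f_oo, nO, d->cc_space_f_oo, nO);
    cublasSetMatrix(nV, nO, sizeof(double), f_vo, nV, d->cc_space_f_vo, nV);
    cublasSetMatrix(nV, nV, sizeof(double), f_vv, nV, d->cc_space_f_vv, nV);
}

Uploading the Fock blocks once at initialisation, rather than every iteration as gpu_upload does for the amplitudes, matches the way the diff moves cc_space_f_oo and cc_space_f_vv into the gpu_init argument list.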