diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
index d3c6be9..b47e54f 100644
--- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
@@ -121,7 +121,7 @@ subroutine run_ccsd_space_orb
                      cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
                      cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
                      cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
-                     cc_space_f_oo, cc_space_f_vo, cc_space_f_vv)
+                     cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv)
 
   if (.not.do_ao_cholesky) then
     print *, 'ao_choleky is required'
@@ -132,11 +132,19 @@ subroutine run_ccsd_space_orb
 
     ! Residue
     call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x);
+    !$OMP PARALLEL SECTIONS
+    !$OMP SECTION
     call compute_H_oo_chol_gpu(gpu_data,nO,nV,0,H_oo)
+
+    !$OMP SECTION
     call compute_H_vo_chol_gpu(gpu_data,nO,nV,1,H_vo)
+
+    !$OMP SECTION
     call compute_H_vv_chol_gpu(gpu_data,nO,nV,2,H_vv)
-    call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+    !$OMP END PARALLEL SECTIONS
+
+    call compute_r1_space_chol(gpu_data, nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
     call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
 
     max_r = max(max_r1,max_r2)
 
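Note on the hunk above: the three H intermediates are independent, so they are now dispatched from !$OMP PARALLEL SECTIONS with an explicit per-call device hint (0, 1, 2). A minimal sketch of the dispatch pattern on the C side, assuming the CUDA runtime and cuBLAS v2 headers; the helper name is hypothetical:

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Hypothetical helper: fold a device hint into the range of visible GPUs and
 * return a cuBLAS handle bound to that device, as the compute_H_*_chol_gpu
 * routines do with their igpu argument. */
static cublasHandle_t handle_for_device_hint(int igpu)
{
  int ngpus = 1;
  cudaGetDeviceCount(&ngpus);    /* with MULTIGPU, use every visible device */
  cudaSetDevice(igpu % ngpus);   /* hints 0/1/2 still work on a single-GPU box */

  cublasHandle_t handle;         /* the handle is bound to the current device */
  cublasCreate(&handle);
  return handle;
}

With one GPU all three sections fold onto device 0 and simply overlap on the host side; with three or more, each H build gets its own device.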
diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
index 45b3ea7..f3e3ea8 100644
--- a/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub_chol.irp.f
@@ -80,11 +80,12 @@ end
 
 ! R1
 
-subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
-
+subroutine compute_r1_space_chol(gpu_data, nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
+  use gpu_module
   implicit none
 
   ! in
+  type(c_ptr), intent(in)       :: gpu_data
   integer, intent(in)           :: nO, nV
   double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV)
   double precision, intent(in)  :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)
@@ -95,68 +96,35 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   ! internal
   integer                       :: u,i,j,beta,a,b
 
-  !$omp parallel &
-  !$omp shared(nO,nV,r1,cc_space_f_ov) &
-  !$omp private(u,beta) &
-  !$omp default(none)
-  !$omp do
-  do beta = 1, nV
-    do u = 1, nO
-      r1(u,beta) = cc_space_f_ov(u,beta)
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
-
-  double precision, allocatable :: X_oo(:,:)
-  allocate(X_oo(nO,nO))
-
-  call dgemm('N','N', nO, nO, nV, &
-             -2d0, t1    , size(t1,1), &
-                   cc_space_f_vo, size(cc_space_f_vo,1), &
-              0d0, X_oo  , size(X_oo,1))
+  call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
 
-  call dgemm('T','N', nO, nV, nO, &
-             1d0, X_oo, size(X_oo,2), &
-                  t1  , size(t1,1), &
-             1d0, r1  , size(r1,1))
-  deallocate(X_oo)
-
-  call dgemm('N','N', nO, nV, nV, &
-             1d0, t1  , size(t1,1), &
-                  H_vv, size(H_vv,1), &
-             1d0, r1  , size(r1,1))
-
-  call dgemm('N','N', nO, nV, nO, &
-             -1d0, H_oo, size(H_oo,1), &
-                   t1  , size(t1,1), &
-              1d0, r1, size(r1,1))
-
-  double precision, allocatable :: X_voov(:,:,:,:)
-  allocate(X_voov(nV, nO, nO, nV))
-
-  !$omp parallel &
-  !$omp shared(nO,nV,X_voov,t2,t1) &
-  !$omp private(u,beta,i,a) &
-  !$omp default(none)
-  !$omp do
-  do beta = 1, nV
-    do u = 1, nO
-      do i = 1, nO
-        do a = 1, nV
-          X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do
-  !$omp end parallel
-
-  call dgemv('T', nV*nO, nO*nV, &
-             1d0, X_voov, size(X_voov,1) * size(X_voov,2), &
-             H_vo  , 1, &
-             1d0, r1    , 1)
-
-  deallocate(X_voov)
+!  double precision, allocatable :: X_voov(:,:,:,:)
+!  allocate(X_voov(nV, nO, nO, nV))
+!
+!  !$omp parallel &
+!  !$omp shared(nO,nV,X_voov,t2,t1) &
+!  !$omp private(u,beta,i,a) &
+!  !$omp default(none)
+!  !$omp do
+!  do beta = 1, nV
+!    do u = 1, nO
+!      do i = 1, nO
+!        do a = 1, nV
+!          X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta)
+!        enddo
+!      enddo
+!    enddo
+!  enddo
+!  !$omp end do
+!  !$omp end parallel
+!
+!  call dgemv('T', nV*nO, nO*nV, &
+!             1d0, X_voov, nV * nO, &
+!             H_vo  , 1, &
+!             1d0, r1    , 1)
+!
+!  deallocate(X_voov)
 
   double precision, allocatable :: X_ovov(:,:,:,:)
   allocate(X_ovov(nO, nV, nO, nV))
@@ -263,7 +231,7 @@ subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
   call dgemm('T','N', nO, nV, nO*nO*nV, &
              -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), &
                    tau   , size(tau,1) * size(tau,2) * size(tau,3), &
-              1d0, r1    , size(r1,1))
+              1d0, r1    , nO)
 
   deallocate(W_oovo)
 
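The wrapper now keeps only the later R1 terms on the CPU; the f_ov initialization, the X_oo contraction, the H_vv/H_oo terms, and the X_voov·H_vo contraction move behind compute_r1_space_chol_gpu. The commented-out dgemv maps one-to-one onto cublasDgemv; a hedged sketch of that mapping, assuming d_X_voov and d_H_vo are device buffers already filled (the function name is hypothetical):

#include <cublas_v2.h>

/* r1(u,beta) += sum_{a,i} X_voov(a,i,u,beta) * H_vo(a,i): the flattened
 * X_voov is an (nV*nO) x (nO*nV) column-major matrix, so the CPU
 * dgemv('T', nV*nO, nO*nV, ...) becomes cublasDgemv with CUBLAS_OP_T. */
void r1_x_voov_term(cublasHandle_t handle, int nO, int nV,
                    const double* d_X_voov,  /* (nV*nO) x (nO*nV) */
                    const double* d_H_vo,    /* length nV*nO */
                    double* d_r1)            /* length nO*nV, accumulated */
{
  const double alpha = 1.0, beta = 1.0;      /* beta = 1: accumulate into r1 */
  cublasDgemv(handle, CUBLAS_OP_T, nV*nO, nO*nV,
              &alpha, d_X_voov, nV*nO, d_H_vo, 1, &beta, d_r1, 1);
}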
diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c
index 173782a..47a47c2 100644
--- a/devel/ccsd_gpu/gpu.c
+++ b/devel/ccsd_gpu/gpu.c
@@ -43,6 +43,286 @@ void gpu_upload(gpu_data* data,
 }
 
 
+void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_oo)
+{
+  int ngpus = 1;
+  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
+  igpu = igpu % ngpus;
+
+  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
+  cudaSetDevice(igpu);
+
+  int m,n,k, lda, ldb, ldc;
+  double alpha, beta;
+  double* A;
+  double* B;
+  double* C;
+  cudaStream_t stream[nV];
+
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+
+  double* d_H_oo = data[igpu].H_oo;
+  double* d_tau_x = data[igpu].tau_x;
+  double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
+  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
+  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
+
+  double* d_tau_kau;
+  cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
+
+  double* d_tmp_ovv;
+  cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
+
+  double* d_tmp_vov;
+  cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
+
+  for (int i=0 ; i<nV ; ++i) {
[...]
@@ -1294,7 +1574,6 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
 
       for (size_t bet=iblock ; bet<(nV < iblock+BLOCK_SIZE ? nV : iblock+BLOCK_SIZE) ; ++bet)
       {
-        alpha = 1.0;
         beta = 0.0;
         A = &(d_tmpB1[nV*(bet-iblock)]); lda = nV*BLOCK_SIZE;
@@ -1344,281 +1623,183 @@ void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
 }
 
-void compute_h_oo_chol_gpu(gpu_data* data, int nO, int nV, int igpu, double* H_oo)
+
+
+
+
+void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, double* r1, double* max_r1)
 {
+  const int cholesky_mo_num = data->cholesky_mo_num;
+
   int ngpus = 1;
   if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
-  igpu = igpu % ngpus;
 
-  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
-  cudaSetDevice(igpu);
+  #pragma omp parallel num_threads(ngpus)
+  {
+    int m,n,k, lda, ldb, ldc;
+    double alpha, beta;
+    double* A;
+    double* B;
+    double* C;
+    cudaStream_t stream[nV];
 
-  int m,n,k, lda, ldb, ldc;
-  double alpha, beta;
-  double* A;
-  double* B;
-  double* C;
-  cudaStream_t stream[nV];
+    int igpu = omp_get_thread_num();
+    cudaSetDevice(igpu);
 
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+    cublasHandle_t handle;
+    cublasCreate(&handle);
 
-  double* d_H_oo = data[igpu].H_oo;
-  double* d_tau_x = data[igpu].tau_x;
-  double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
-  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
-  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
+    double* d_r1;
+    lda = nO ;
+    cudaMalloc((void **)&d_r1, lda * nV * sizeof(double));
+    cudaMemset(d_r1, 0, nO*nV*sizeof(double));
+    memset(r1, 0, nO*nV*sizeof(double));
 
-  double* d_tau_kau;
-  cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
+//  double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
+//  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
+//  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
+//  double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
+//  double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo;
+//  double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo;
+//  double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
+//  double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo;
+//  double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
+//  double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo;
+//  double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
+//  double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
+//  double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
+    double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
+    double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
+//  double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
+//  double* d_tau = data[igpu].tau;
+    double* d_t1 = data[igpu].t1;
+    double* d_t2 = data[igpu].t2;
+    double* d_H_oo = data[igpu].H_oo;
+    double* d_H_vo = data[igpu].H_vo;
+    double* d_H_vv = data[igpu].H_vv;
 
-  double* d_tmp_ovv;
-  cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));
+    #pragma omp sections
+    {
 
-  double* d_tmp_vov;
-  cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));
+    #pragma omp section
+    {
+      cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);
+
+      double* d_X_oo;
+      cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double));
+
+      alpha = -2.0;
+      beta = 0.0;
+      m=nO; n=nO; k=nV;
+      A=d_t1; lda=nO;
+      B=d_cc_space_f_vo; ldb=nV;
+      C=d_X_oo; ldc=nO;
+      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
+
+      alpha = 1.0;
+      beta = 1.0;
+      m=nO; n=nV; k=nO;
+      A=d_X_oo; lda=nO;
+      B=d_t1; ldb=nO;
+      C=d_r1; ldc=nO;
+      cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
+
+      cudaFree(d_X_oo);
+    }
+
+    #pragma omp section
+    {
+      alpha = 1.0;
+      beta = 1.0;
+      m=nO; n=nV; k=nV;
+      A=d_t1; lda=nO;
+      B=d_H_vv; ldb=nV;
+      C=d_r1; ldc=nO;
+      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
+    }
+
+    #pragma omp section
+    {
+      alpha = -1.0;
+      beta = 1.0;
+      m=nO; n=nV; k=nO;
+      A=d_H_oo; lda=nO;
+      B=d_t1; ldb=nO;
+      C=d_r1; ldc=nO;
+      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
+    }
+
+    #pragma omp section
+    {
+      double* d_X_voov;
+      cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));
+
+      for (int i=0 ; i<nV ; ++i) {
[...]
+    const double x = r1[i] > 0. ? r1[i] : -r1[i];
+    *max_r1 = *max_r1 > x ? *max_r1 : x;
+  }
+}
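In compute_r1_space_chol_gpu each OpenMP thread owns one GPU, its own cuBLAS handle, and a zero-initialized d_r1, and the four independent R1 terms are split across omp sections. Since each thread therefore accumulates only a partial residual, the elided tail ([...]) has to merge the per-GPU results into the host array r1 that was cleared with memset. A hedged sketch of such a merge step, written as a fragment for the end of the parallel region and using only names visible in the diff; the actual elided code may reduce differently:

/* Sketch only: download this thread's partial d_r1 and accumulate it into the
 * shared host buffer r1 (zeroed above with memset).  The critical section
 * stands in for whatever reduction the elided code actually performs. */
double* r1_part = (double*) malloc((size_t) nO*nV*sizeof(double));
cublasGetMatrix(nO, nV, sizeof(double), d_r1, nO, r1_part, nO);

#pragma omp critical
{
  for (size_t i=0 ; i<(size_t) nO*nV ; ++i) r1[i] += r1_part[i];
}
free(r1_part);

The surviving end of the function is consistent with this: once r1 holds the full residual on the host, a serial loop computes max_r1 as the largest absolute element.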
diff --git a/devel/ccsd_gpu/gpu.h b/devel/ccsd_gpu/gpu.h
index 48c6f0f..4fbf657 100644
--- a/devel/ccsd_gpu/gpu.h
+++ b/devel/ccsd_gpu/gpu.h
@@ -12,6 +12,7 @@ typedef struct {
   double* cc_space_v_ovov;
   double* cc_space_v_ovoo;
   double* cc_space_f_oo;
+  double* cc_space_f_ov;
   double* cc_space_f_vo;
   double* cc_space_f_vv;
   double* tau;
diff --git a/devel/ccsd_gpu/gpu_init.c b/devel/ccsd_gpu/gpu_init.c
index 0625bca..7fd776f 100644
--- a/devel/ccsd_gpu/gpu_init.c
+++ b/devel/ccsd_gpu/gpu_init.c
@@ -14,8 +14,8 @@ gpu_data* gpu_init(
            double* cc_space_v_oovv, double* cc_space_v_vvoo,
            double* cc_space_v_oovo, double* cc_space_v_ovvo,
            double* cc_space_v_ovov, double* cc_space_v_ovoo,
-           double* cc_space_f_oo, double* cc_space_f_vo,
-           double* cc_space_f_vv)
+           double* cc_space_f_oo, double* cc_space_f_ov,
+           double* cc_space_f_vo, double* cc_space_f_vv)
 {
   int ngpus = 1;
   cudaGetDeviceCount(&ngpus);
@@ -95,6 +95,10 @@ gpu_data* gpu_init(
   cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
   cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
 
+  double* d_cc_space_f_ov;
+  cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
+  cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
+
   double* d_cc_space_f_vv;
   cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
   cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
@@ -135,6 +139,7 @@ gpu_data* gpu_init(
   data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
   data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
   data[igpu].cc_space_f_oo = d_cc_space_f_oo;
+  data[igpu].cc_space_f_ov = d_cc_space_f_ov;
   data[igpu].cc_space_f_vo = d_cc_space_f_vo;
   data[igpu].cc_space_f_vv = d_cc_space_f_vv;
   data[igpu].tau = d_tau;
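The new f_ov upload uses cublasSetMatrix, which copies column-major data and therefore matches the Fortran (nO,nV) layout directly: rows=nO, cols=nV, leading dimension nO on both host and device, with no transposition. The same idiom in isolation, as a hedged sketch (the helper name is hypothetical):

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Hypothetical helper mirroring the f_ov upload above: allocate a device
 * buffer and copy a column-major Fortran matrix into it unchanged. */
static double* upload_matrix(const double* host, int rows, int cols)
{
  double* dev = NULL;
  cudaMalloc((void**)&dev, (size_t) rows*cols*sizeof(double));
  cublasSetMatrix(rows, cols, sizeof(double), host, rows, dev, rows);
  return dev;
}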
diff --git a/devel/ccsd_gpu/gpu_module.f90 b/devel/ccsd_gpu/gpu_module.f90
index 088caba..f1f04c0 100644
--- a/devel/ccsd_gpu/gpu_module.f90
+++ b/devel/ccsd_gpu/gpu_module.f90
@@ -8,7 +8,7 @@ module gpu_module
            cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
            cc_space_v_oooo, cc_space_v_vooo, cc_space_v_oovv, cc_space_v_vvoo, &
            cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
-           cc_space_f_oo, cc_space_f_vo, cc_space_f_vv) bind(C)
+           cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C)
       import c_int, c_double, c_ptr
       integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
       real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
@@ -24,6 +24,7 @@ module gpu_module
       real(c_double), intent(in) :: cc_space_v_ovov(nO,nV,nO,nV)
       real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO)
       real(c_double), intent(in) :: cc_space_f_oo(nO,nO)
+      real(c_double), intent(in) :: cc_space_f_ov(nO,nV)
       real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
       real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
     end function
@@ -59,6 +60,15 @@ module gpu_module
       real(c_double), intent(out) :: H_vv(nO,nO)
     end subroutine
 
+    subroutine compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1) bind(C)
+      import c_int, c_double, c_ptr
+      type(c_ptr), value :: gpu_data
+      integer(c_int), intent(in), value :: nO, nV
+      real(c_double), intent(in) :: t1(nO,nV)
+      real(c_double), intent(out) :: r1(nO,nV)
+      real(c_double), intent(out) :: max_r1
+    end subroutine
+
     subroutine compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2) bind(C)
       import c_int, c_double, c_ptr
       type(c_ptr), value :: gpu_data
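For reference, the C prototype this new bind(C) interface binds to, taken from gpu.c above. The c_ptr passed by value arrives as the gpu_data pointer itself, the value dummies map to plain ints, and the Fortran arrays arrive as pointers to their first elements:

/* gpu.c (see the @@ -1344 hunk above) */
void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV,
                               double* t1, double* r1, double* max_r1);

Note that r1 is an (nO,nV) matrix on both sides, matching the r1(nO,nV) declaration in the interface and the nO*nV-element buffer allocated in gpu.c.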