From 699c55563308ec6ac42395175788a5306f97aa40 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 5 Aug 2023 00:50:58 +0200 Subject: [PATCH] Compute energy on GPU --- devel/ccsd_gpu/ccsd_space_orb_sub.irp.f | 214 +++--------------------- devel/ccsd_gpu/gpu.c | 136 +++++++++++++-- devel/ccsd_gpu/gpu_module.f90 | 9 +- 3 files changed, 156 insertions(+), 203 deletions(-) diff --git a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f index f72fee5..33ab63b 100644 --- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f +++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f @@ -9,7 +9,7 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) + double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:) double precision, allocatable :: t1(:,:), r1(:,:) double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) @@ -50,8 +50,6 @@ subroutine run_ccsd_space_orb !print*,'vir',list_vir allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) allocate(t1(nO,nV), r1(nO,nV)) allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) @@ -95,26 +93,6 @@ subroutine run_ccsd_space_orb endif ! Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) - call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy - call det_energy(det,uncorr_energy) - print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) - print*,'Guess energy', uncorr_energy+energy, energy - - nb_iter = 0 - not_converged = .True. - max_r1 = 0d0 - max_r2 = 0d0 - - write(*,'(A77)') ' -----------------------------------------------------------------------------' - write(*,'(A77)') ' | It. | E(CCSD) (Ha) | Correlation (Ha) | Conv. T1 | Conv. T2 |' - write(*,'(A77)') ' -----------------------------------------------------------------------------' - call wall_time(ta) - type(c_ptr) :: gpu_data gpu_data = gpu_init(nO, nV, cholesky_mo_num, & @@ -128,10 +106,31 @@ subroutine run_ccsd_space_orb stop -1 endif + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) + + call gpu_upload(gpu_data, nO, nV, t1, t2); + + !print*,'hf_energy', hf_energy + call det_energy(det,uncorr_energy) + print*,'Det energy', uncorr_energy + energy = ccsd_energy_space_gpu(gpu_data) + print*,'Guess energy', uncorr_energy+energy, energy + + nb_iter = 0 + not_converged = .True. + max_r1 = 0d0 + max_r2 = 0d0 + + write(*,'(A77)') ' -----------------------------------------------------------------------------' + write(*,'(A77)') ' | It. | E(CCSD) (Ha) | Correlation (Ha) | Conv. T1 | Conv. T2 |' + write(*,'(A77)') ' -----------------------------------------------------------------------------' + call wall_time(ta) + + do while (not_converged) ! Residue - call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x); !$OMP PARALLEL SECTIONS !$OMP SECTION call compute_H_oo_chol_gpu(gpu_data,0) @@ -163,11 +162,10 @@ subroutine run_ccsd_space_orb print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) - call update_tau_x_space(nO,nV,tau,tau_x) + call gpu_upload(gpu_data, nO, nV, t1, t2); ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + energy = ccsd_energy_space_gpu(gpu_data) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -202,7 +200,7 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(r1,r2,tau) + deallocate(r1,r2) ! CCSD(T) double precision :: e_t @@ -248,163 +246,3 @@ subroutine run_ccsd_space_orb end -! Energy - -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -! Tau - -subroutine update_tau_space(nO,nV,t1,t2,tau) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) - - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) - - ! internal - integer :: i,j,a,b - - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -end - -subroutine update_tau_x_space(nO,nV,tau,tau_x) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - - ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) - - ! internal - integer :: i,j,a,b - - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -end - diff --git a/devel/ccsd_gpu/gpu.c b/devel/ccsd_gpu/gpu.c index 0177a60..fc407a6 100644 --- a/devel/ccsd_gpu/gpu.c +++ b/devel/ccsd_gpu/gpu.c @@ -9,9 +9,7 @@ void gpu_upload(gpu_data* data, int nO, int nV, double* t1, - double* t2, - double* tau, - double* tau_x) + double* t2) { int lda; const int cholesky_mo_num = data->cholesky_mo_num; @@ -19,19 +17,14 @@ void gpu_upload(gpu_data* data, int ngpus = 1; if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + double * tau = malloc(nO*nO*nV*nV * sizeof(double)); + double * tau_x = malloc(nO*nO*nV*nV * sizeof(double)); + #pragma omp parallel num_threads(ngpus) { int igpu = omp_get_thread_num(); cudaSetDevice(igpu); - double* d_tau = data[igpu].tau; - lda = nO * nO; - cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); - - double* d_tau_x = data[igpu].tau_x; - lda = nO * nO; - cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda); - double* d_t1 = data[igpu].t1; lda = nO; cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda); @@ -39,11 +32,76 @@ void gpu_upload(gpu_data* data, double* d_t2 = data[igpu].t2; lda = nO*nO; cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda); - } + int m,n,k, lda, ldb, ldc; + double alpha, beta; + double* A; + double* B; + double* C; + + cublasHandle_t handle; + cublasCreate(&handle); + cudaStream_t stream[nV]; + + double* d_tau = data[igpu].tau; + + double* d_tau_x = data[igpu].tau_x; + lda = nO * nO; + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda); + + if (igpu == 0) { + + for (int i=0 ; i 0) { + lda = nO * nO; + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda); + cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda); + } + cublasDestroy(handle); + } + free(tau); + free(tau_x); } + + void compute_h_oo_chol_gpu(gpu_data* data, int igpu) { int ngpus = 1; @@ -1932,3 +1990,57 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl } } + + +double ccsd_energy_space_gpu(gpu_data* data) +{ + double result = 0.0; + + const int nO = data->nO; + const int nV = data->nV; + + int ngpus = 1; + if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus); + + #pragma omp parallel num_threads(ngpus) + { + int m,n,k, lda, ldb, ldc; + double alpha, beta; + double* A; + double* B; + double* C; + + int igpu = omp_get_thread_num(); + cudaSetDevice(igpu); + + cublasHandle_t handle; + cublasCreate(&handle); + + double result_local = 0.0; + #pragma omp sections + { + #pragma omp section + { + double* d_cc_space_f_ov = data[igpu].cc_space_f_ov; + double* d_t1 = data[igpu].t1; + double x; + cublasDdot(handle, nO*nV, d_cc_space_f_ov, 1, d_t1, 1, &x); + result_local += 2.0*x; + } + #pragma omp section + { + double* d_tau_x = data[igpu].tau_x; + double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv; + double x; + cublasDdot(handle, nO*nO*nV*nV, d_tau_x, 1, d_cc_space_v_oovv, 1, &x); + result_local += x; + } + } + cublasDestroy(handle); + #pragma omp critical + { + result += result_local; + } + } + return result; +} diff --git a/devel/ccsd_gpu/gpu_module.f90 b/devel/ccsd_gpu/gpu_module.f90 index 37ce462..2dceffc 100644 --- a/devel/ccsd_gpu/gpu_module.f90 +++ b/devel/ccsd_gpu/gpu_module.f90 @@ -30,16 +30,15 @@ module gpu_module real(c_double), intent(in) :: cc_space_f_vv(nV,nV) end function - subroutine gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x) bind(C) + subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C) import c_int, c_double, c_ptr type(c_ptr), value :: gpu_data integer(c_int), intent(in), value :: nO, nV real(c_double), intent(in) :: t1(nO,nV) real(c_double), intent(in) :: t2(nO,nO,nV,nV) - real(c_double), intent(in) :: tau(nO,nO,nV,nV) - real(c_double), intent(in) :: tau_x(nO,nO,nV,nV) end subroutine + subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C) import c_int, c_double, c_ptr type(c_ptr), value :: gpu_data @@ -76,6 +75,10 @@ module gpu_module real(c_double), intent(out) :: max_r2 end subroutine + double precision function ccsd_energy_space_gpu(gpu_data) bind(C) + import c_ptr + type(c_ptr), value :: gpu_data + end function subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)