Compute energy on GPU

2024-12-22 12:23:37 +01:00 · 2023-08-05 00:50:58 +02:00 · 2023-08-05 00:50:58 +02:00 · 699c555633
commit 699c555633
parent 971a0ff160
3 changed files with 156 additions and 203 deletions
--- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
@ -9,7 +9,7 @@ subroutine run_ccsd_space_orb
  double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
  logical :: not_converged
-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
+  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:)
  double precision, allocatable :: t1(:,:), r1(:,:)
  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
@ -50,8 +50,6 @@ subroutine run_ccsd_space_orb
  !print*,'vir',list_vir
  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
  allocate(tau(nO,nO,nV,nV))
  allocate(tau_x(nO,nO,nV,nV))
  allocate(t1(nO,nV), r1(nO,nV))
  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
@ -95,26 +93,6 @@ subroutine run_ccsd_space_orb
  endif
  ! Init
  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
  call update_tau_space(nO,nV,t1,t2,tau)
  call update_tau_x_space(nO,nV,tau,tau_x)
  !print*,'hf_energy', hf_energy
  call det_energy(det,uncorr_energy)
  print*,'Det energy', uncorr_energy
  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
  print*,'Guess energy', uncorr_energy+energy, energy
  nb_iter = 0
  not_converged = .True.
  max_r1 = 0d0
  max_r2 = 0d0
  write(*,'(A77)') ' -----------------------------------------------------------------------------'
  write(*,'(A77)') ' |   It.  |       E(CCSD) (Ha) | Correlation (Ha) |  Conv. T1  |  Conv. T2  |'
  write(*,'(A77)') ' -----------------------------------------------------------------------------'
  call wall_time(ta)
  type(c_ptr) :: gpu_data
  gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
@ -128,10 +106,31 @@ subroutine run_ccsd_space_orb
    stop -1
  endif
  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
  call gpu_upload(gpu_data, nO, nV, t1, t2);
  !print*,'hf_energy', hf_energy
  call det_energy(det,uncorr_energy)
  print*,'Det energy', uncorr_energy
  energy = ccsd_energy_space_gpu(gpu_data)
  print*,'Guess energy', uncorr_energy+energy, energy
  nb_iter = 0
  not_converged = .True.
  max_r1 = 0d0
  max_r2 = 0d0
  write(*,'(A77)') ' -----------------------------------------------------------------------------'
  write(*,'(A77)') ' |   It.  |       E(CCSD) (Ha) | Correlation (Ha) |  Conv. T1  |  Conv. T2  |'
  write(*,'(A77)') ' -----------------------------------------------------------------------------'
  call wall_time(ta)
  do while (not_converged)
    ! Residue
    call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x);
    !$OMP PARALLEL SECTIONS
    !$OMP SECTION
    call compute_H_oo_chol_gpu(gpu_data,0)
@ -163,11 +162,10 @@ subroutine run_ccsd_space_orb
      print*,'Unkown cc_method_method: '//cc_update_method
    endif
-    call update_tau_space(nO,nV,t1,t2,tau)
+    call gpu_upload(gpu_data, nO, nV, t1, t2);
    call update_tau_x_space(nO,nV,tau,tau_x)
    ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    energy = ccsd_energy_space_gpu(gpu_data)
    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
    nb_iter = nb_iter + 1
@ -202,7 +200,7 @@ subroutine run_ccsd_space_orb
    deallocate(all_err,all_t)
  endif
-  deallocate(r1,r2,tau)
+  deallocate(r1,r2)
  ! CCSD(T)
  double precision :: e_t
@ -248,163 +246,3 @@ subroutine run_ccsd_space_orb
 end
 ! Energy
 subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
  ! Evaluates
  !   energy = sum_{a,i}     2 * cc_space_f_vo(a,i) * t1(i,a)
  !          + sum_{i,j,a,b} tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
  !
  ! nO, nV : extents of the first two / last two dimensions of tau
  ! tau    : (nO,nO,nV,nV) input array
  ! t1     : (nO,nV) input array
  ! energy : output; zeroed first, then each thread adds its partial sum
  implicit none
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: t1(nO,nV)
  double precision, intent(in)  :: tau(nO,nO,nV,nV)
  double precision, intent(out) :: energy
  ! loop counters and the per-thread accumulator
  integer :: i_o, j_o, a_v, b_v
  double precision :: e_thread
  energy = 0d0
  !$omp parallel default(none) &
  !$omp private(i_o,j_o,a_v,b_v,e_thread) &
  !$omp shared(nO,nV,energy,tau,t1,cc_space_f_vo,cc_space_w_oovv)
  e_thread = 0d0
  ! Contribution of 2 * f_vo * t1
  !$omp do
  do a_v = 1, nV
    do i_o = 1, nO
      e_thread = e_thread + 2d0 * cc_space_f_vo(a_v,i_o) * t1(i_o,a_v)
    enddo
  enddo
  !$omp end do nowait
  ! Contribution of tau * w_oovv; no barrier is needed between the two
  ! loops because each thread only touches its own accumulator.
  !$omp do
  do b_v = 1, nV
    do a_v = 1, nV
      do j_o = 1, nO
        do i_o = 1, nO
          e_thread = e_thread + tau(i_o,j_o,a_v,b_v) * cc_space_w_oovv(i_o,j_o,a_v,b_v)
        enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  ! Serialise the update of the shared result
  !$omp critical
  energy = energy + e_thread
  !$omp end critical
  !$omp end parallel
 end
 subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
  ! Evaluates
  !   energy = sum_{a,i}     2 * cc_space_f_vo(a,i) * t1(i,a)
  !          + sum_{i,j,a,b} tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
  !
  ! nO, nV : extents of the first two / last two dimensions of tau_x
  ! tau_x  : (nO,nO,nV,nV) input array
  ! t1     : (nO,nV) input array
  ! energy : output; zeroed first, then each thread adds its partial sum
  implicit none
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: t1(nO,nV)
  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
  double precision, intent(out) :: energy
  ! loop counters and the per-thread accumulator
  integer :: i_o, j_o, a_v, b_v
  double precision :: e_thread
  energy = 0d0
  !$omp parallel default(none) &
  !$omp private(i_o,j_o,a_v,b_v,e_thread) &
  !$omp shared(nO,nV,energy,tau_x,t1,cc_space_f_vo,cc_space_v_oovv)
  e_thread = 0d0
  ! Contribution of 2 * f_vo * t1
  !$omp do
  do a_v = 1, nV
    do i_o = 1, nO
      e_thread = e_thread + 2d0 * cc_space_f_vo(a_v,i_o) * t1(i_o,a_v)
    enddo
  enddo
  !$omp end do nowait
  ! Contribution of tau_x * v_oovv; the nowait above is safe because
  ! every thread accumulates into its own private variable.
  !$omp do
  do b_v = 1, nV
    do a_v = 1, nV
      do j_o = 1, nO
        do i_o = 1, nO
          e_thread = e_thread + tau_x(i_o,j_o,a_v,b_v) * cc_space_v_oovv(i_o,j_o,a_v,b_v)
        enddo
      enddo
    enddo
  enddo
  !$omp end do nowait
  ! Serialise the update of the shared result
  !$omp critical
  energy = energy + e_thread
  !$omp end critical
  !$omp end parallel
 end
 ! Tau
 subroutine update_tau_space(nO,nV,t1,t2,tau)
  ! Fills tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
  ! for i,j in 1..nO and a,b in 1..nV.
  !
  ! nO, nV : array extents
  ! t1     : (nO,nV) input
  ! t2     : (nO,nO,nV,nV) input
  ! tau    : (nO,nO,nV,nV) output, every element is overwritten
  implicit none
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
  double precision, intent(out) :: tau(nO,nO,nV,nV)
  integer                       :: j_o, a_v, b_v
  ! The outermost loop is shared among threads; the fastest index is
  ! handled by an array-section assignment.
  !$OMP PARALLEL DO DEFAULT(NONE) &
  !$OMP SHARED(nO,nV,tau,t2,t1) &
  !$OMP PRIVATE(j_o,a_v,b_v)
  do b_v = 1, nV
    do a_v = 1, nV
      do j_o = 1, nO
        tau(:,j_o,a_v,b_v) = t2(:,j_o,a_v,b_v) + t1(:,a_v) * t1(j_o,b_v)
      enddo
    enddo
  enddo
  !$OMP END PARALLEL DO
 end
 subroutine update_tau_x_space(nO,nV,tau,tau_x)
  ! Fills tau_x(i,j,a,b) = 2 * tau(i,j,a,b) - tau(i,j,b,a)
  ! for i,j in 1..nO and a,b in 1..nV.
  !
  ! nO, nV : array extents
  ! tau    : (nO,nO,nV,nV) input
  ! tau_x  : (nO,nO,nV,nV) output, every element is overwritten
  implicit none
  integer, intent(in)           :: nO, nV
  double precision, intent(in)  :: tau(nO,nO,nV,nV)
  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
  integer                       :: a_v, b_v
  ! For each (a,b) pair the whole (nO,nO) slab is combined with the slab
  ! whose last two indices are swapped.
  !$OMP PARALLEL DO DEFAULT(NONE) &
  !$OMP SHARED(nO,nV,tau,tau_x) &
  !$OMP PRIVATE(a_v,b_v)
  do b_v = 1, nV
    do a_v = 1, nV
      tau_x(:,:,a_v,b_v) = 2.d0*tau(:,:,a_v,b_v) - tau(:,:,b_v,a_v)
    enddo
  enddo
  !$OMP END PARALLEL DO
 end
--- a/devel/ccsd_gpu/gpu.c
+++ b/devel/ccsd_gpu/gpu.c
@ -9,9 +9,7 @@
 void gpu_upload(gpu_data* data,
                int nO, int nV,
                double* t1,
-                double* t2,
+                double* t2)
                double* tau,
                double* tau_x)
 {
    int lda;
    const int cholesky_mo_num = data->cholesky_mo_num;
@ -19,19 +17,14 @@ void gpu_upload(gpu_data* data,
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    double * tau = malloc(nO*nO*nV*nV * sizeof(double));
    double * tau_x = malloc(nO*nO*nV*nV * sizeof(double));
    #pragma omp parallel num_threads(ngpus)
    {
        int igpu = omp_get_thread_num();
        cudaSetDevice(igpu);
        double* d_tau = data[igpu].tau;
        lda = nO * nO;
        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
        double* d_tau_x = data[igpu].tau_x;
        lda = nO * nO;
        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
        double* d_t1 = data[igpu].t1;
        lda = nO;
        cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
@ -39,9 +32,74 @@ void gpu_upload(gpu_data* data,
        double* d_t2 = data[igpu].t2;
        lda = nO*nO;
        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
        int m,n,k, lda, ldb, ldc;
        double alpha, beta;
        double* A;
        double* B;
        double* C;
        cublasHandle_t handle;
        cublasCreate(&handle);
        cudaStream_t stream[nV];
        double* d_tau = data[igpu].tau;
        double* d_tau_x = data[igpu].tau_x;
        lda = nO * nO;
        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
        if (igpu == 0) {
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamCreate(&(stream[i]));
          }
          alpha = 1.0;
          for (int j=0 ; j<nO ; ++j) {
            for (int b=0 ; b<nV ; ++b) {
              cublasSetStream(handle, stream[b]);
              beta = t1[j+b*nO];
              A = &(d_t2[nO*(j + nO*nV*b)]); lda = nO*nO;
              B = d_t1; ldb = nO;
              C = &(d_tau[nO*(j + nO*nV*b)]); ldc = nO*nO;
              cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
            }
          }
          cudaDeviceSynchronize();
          alpha = 2.0;
          beta = -1.0;
          for (int b=0 ; b<nV ; ++b) {
            for (int a=0 ; a<nV ; ++a) {
              cublasSetStream(handle, stream[a]);
              A = &(d_tau[nO*nO*(a + nV*b)]); lda = nO;
              B = &(d_tau[nO*nO*(b + nV*a)]); ldb = nO;
              C = &(d_tau_x[nO*nO*(a + nV*b)]); ldc = nO;
              cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
            }
          }
          for (int i=0 ; i<nV ; ++i) {
            cudaStreamDestroy(stream[i]);
          }
          cublasSetStream(handle, NULL);
          lda = nO*nO;
          cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_tau, lda, tau, lda);
          cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_tau_x, lda, tau_x, lda);
        }
        #pragma omp barrier
        if (igpu > 0) {
          lda = nO * nO;
          cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
          cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
        }
        cublasDestroy(handle);
    }
    free(tau);
    free(tau_x);
 }
 void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
@ -1932,3 +1990,57 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
  }
 }
 /* Returns 2 * dot(cc_space_f_ov, t1) + dot(tau_x, cc_space_v_oovv),
  * both dot products being evaluated with cublasDdot on the device
  * buffers stored in data[igpu].
  *
  * data : array of per-GPU descriptors; nO and nV are read from data[0].
  *
  * The two dot products are the two sections of an OpenMP `sections`
  * construct, so each one is executed exactly once by some thread of the
  * team (on that thread's device). The per-thread partial results are
  * then summed under a critical section.
  *
  * NOTE(review): the vector lengths nO*nV and nO*nO*nV*nV are computed
  * as int, matching the cublasDdot signature; very large nO/nV would
  * overflow - confirm the expected problem sizes.
  */
 double ccsd_energy_space_gpu(gpu_data* data)
 {
    double result = 0.0;
    const int nO = data->nO;
    const int nV = data->nV;
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
    /* Never request an empty thread team if the device query yields < 1. */
    if (ngpus < 1) ngpus = 1;
    #pragma omp parallel num_threads(ngpus)
    {
      const int igpu = omp_get_thread_num();
      cudaSetDevice(igpu);
      cublasHandle_t handle;
      cublasCreate(&handle);
      double result_local = 0.0;
      #pragma omp sections
      {
        #pragma omp section
        {
          /* Initialised so that a failed call contributes 0, not garbage. */
          double x = 0.0;
          cublasDdot(handle, nO*nV, data[igpu].cc_space_f_ov, 1, data[igpu].t1, 1, &x);
          result_local += 2.0*x;
        }
        #pragma omp section
        {
          double x = 0.0;
          cublasDdot(handle, nO*nO*nV*nV, data[igpu].tau_x, 1, data[igpu].cc_space_v_oovv, 1, &x);
          result_local += x;
        }
      }
      cublasDestroy(handle);
      #pragma omp critical
      {
        result += result_local;
      }
    }
    return result;
 }
--- a/devel/ccsd_gpu/gpu_module.f90
+++ b/devel/ccsd_gpu/gpu_module.f90
@ -30,16 +30,15 @@ module gpu_module
        real(c_double), intent(in)  :: cc_space_f_vv(nV,nV)
    end function
-    subroutine gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x) bind(C)
+    subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
        integer(c_int), intent(in), value  :: nO, nV
        real(c_double), intent(in) :: t1(nO,nV)
        real(c_double), intent(in) :: t2(nO,nO,nV,nV)
        real(c_double), intent(in) :: tau(nO,nO,nV,nV)
        real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
    end subroutine
    subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
@ -76,6 +75,10 @@ module gpu_module
        real(c_double), intent(out) :: max_r2
    end subroutine
    ! Interface to the C function ccsd_energy_space_gpu, which takes the
    ! gpu_data handle by value and returns a C double.
    ! The result is declared real(c_double) so that its kind is the
    ! interoperable one whatever default-real promotion flags are used.
    real(c_double) function ccsd_energy_space_gpu(gpu_data) bind(C)
        import c_double, c_ptr
        type(c_ptr), value    :: gpu_data
    end function
    subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)