Compute energy on GPU

2024-12-22 04:13:40 +01:00 · 2023-08-05 00:50:58 +02:00 · 2023-08-05 00:50:58 +02:00 · 699c555633
commit 699c555633
parent 971a0ff160
3 changed files with 156 additions and 203 deletions
--- a/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
+++ b/devel/ccsd_gpu/ccsd_space_orb_sub.irp.f
@ -9,7 +9,7 @@ subroutine run_ccsd_space_orb
  double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
  logical :: not_converged

-  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
+  double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:)
  double precision, allocatable :: t1(:,:), r1(:,:)
  double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)

@ -50,8 +50,6 @@ subroutine run_ccsd_space_orb
  !print*,'vir',list_vir

  allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
-  allocate(tau(nO,nO,nV,nV))
-  allocate(tau_x(nO,nO,nV,nV))
  allocate(t1(nO,nV), r1(nO,nV))
  allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))

@ -95,26 +93,6 @@ subroutine run_ccsd_space_orb
  endif

  ! Init
-  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
-  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
-  call update_tau_space(nO,nV,t1,t2,tau)
-  call update_tau_x_space(nO,nV,tau,tau_x)
-  !print*,'hf_energy', hf_energy
-  call det_energy(det,uncorr_energy)
-  print*,'Det energy', uncorr_energy
-  call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
-  print*,'Guess energy', uncorr_energy+energy, energy
-
-  nb_iter = 0
-  not_converged = .True.
-  max_r1 = 0d0
-  max_r2 = 0d0
-
-  write(*,'(A77)') ' -----------------------------------------------------------------------------'
-  write(*,'(A77)') ' |   It.  |       E(CCSD) (Ha) | Correlation (Ha) |  Conv. T1  |  Conv. T2  |'
-  write(*,'(A77)') ' -----------------------------------------------------------------------------'
-  call wall_time(ta)
-
  type(c_ptr) :: gpu_data

  gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
@ -128,10 +106,31 @@ subroutine run_ccsd_space_orb
    stop -1
  endif

+  call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
+  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
+
+  call gpu_upload(gpu_data, nO, nV, t1, t2);
+
+  !print*,'hf_energy', hf_energy
+  call det_energy(det,uncorr_energy)
+  print*,'Det energy', uncorr_energy
+  energy = ccsd_energy_space_gpu(gpu_data)
+  print*,'Guess energy', uncorr_energy+energy, energy
+
+  nb_iter = 0
+  not_converged = .True.
+  max_r1 = 0d0
+  max_r2 = 0d0
+
+  write(*,'(A77)') ' -----------------------------------------------------------------------------'
+  write(*,'(A77)') ' |   It.  |       E(CCSD) (Ha) | Correlation (Ha) |  Conv. T1  |  Conv. T2  |'
+  write(*,'(A77)') ' -----------------------------------------------------------------------------'
+  call wall_time(ta)
+
+
  do while (not_converged)

    ! Residue
-    call gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x);
    !$OMP PARALLEL SECTIONS
    !$OMP SECTION
    call compute_H_oo_chol_gpu(gpu_data,0)
@ -163,11 +162,10 @@ subroutine run_ccsd_space_orb
      print*,'Unkown cc_method_method: '//cc_update_method
    endif

-    call update_tau_space(nO,nV,t1,t2,tau)
-    call update_tau_x_space(nO,nV,tau,tau_x)
+    call gpu_upload(gpu_data, nO, nV, t1, t2);

    ! Energy
-    call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
+    energy = ccsd_energy_space_gpu(gpu_data)
    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'

    nb_iter = nb_iter + 1
@ -202,7 +200,7 @@ subroutine run_ccsd_space_orb
    deallocate(all_err,all_t)
  endif

-  deallocate(r1,r2,tau)
+  deallocate(r1,r2)

  ! CCSD(T)
  double precision :: e_t
@ -248,163 +246,3 @@ subroutine run_ccsd_space_orb

 end

-! Energy
-
-subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
-
-  implicit none
-
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
-
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
-
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau,t1,&
-  !$omp cc_space_f_vo,cc_space_w_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
-
-end
-
-subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
-
-  implicit none
-
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau_x(nO,nO,nV,nV)
-  double precision, intent(in)  :: t1(nO,nV)
-  double precision, intent(out) :: energy
-
-  ! internal
-  integer :: i,j,a,b
-  double precision :: e
-
-  energy = 0d0
-  !$omp parallel &
-  !$omp shared(nO,nV,energy,tau_x,t1,&
-  !$omp cc_space_f_vo,cc_space_v_oovv) &
-  !$omp private(i,j,a,b,e) &
-  !$omp default(none)
-  e = 0d0
-  !$omp do
-  do a = 1, nV
-    do i = 1, nO
-      e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp do
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
-       enddo
-      enddo
-    enddo
-  enddo
-  !$omp end do nowait
-  !$omp critical
-  energy = energy + e
-  !$omp end critical
-  !$omp end parallel
-
-end
-
-! Tau
-
-subroutine update_tau_space(nO,nV,t1,t2,tau)
-
-  implicit none
-
-  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: t1(nO,nV), t2(nO,nO,nV,nV)
-
-  ! out
-  double precision, intent(out) :: tau(nO,nO,nV,nV)
-
-  ! internal
-  integer                       :: i,j,a,b
-
-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,t2,t1) &
-  !$OMP PRIVATE(i,j,a,b) &
-  !$OMP DEFAULT(NONE)
-  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-end
-
-subroutine update_tau_x_space(nO,nV,tau,tau_x)
-
-  implicit none
-
-  ! in
-  integer, intent(in)           :: nO, nV
-  double precision, intent(in)  :: tau(nO,nO,nV,nV)
-
-  ! out
-  double precision, intent(out) :: tau_x(nO,nO,nV,nV)
-
-  ! internal
-  integer                       :: i,j,a,b
-
-  !$OMP PARALLEL &
-  !$OMP SHARED(nO,nV,tau,tau_x) &
-  !$OMP PRIVATE(i,j,a,b) &
-  !$OMP DEFAULT(NONE)
-  !$OMP DO
-  do b = 1, nV
-    do a = 1, nV
-      do j = 1, nO
-        do i = 1, nO
-          tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
-        enddo
-      enddo
-    enddo
-  enddo
-  !$OMP END DO
-  !$OMP END PARALLEL
-
-end
-
--- a/devel/ccsd_gpu/gpu.c
+++ b/devel/ccsd_gpu/gpu.c
@ -9,9 +9,7 @@
 void gpu_upload(gpu_data* data,
                int nO, int nV,
                double* t1,
-                double* t2,
-                double* tau,
-                double* tau_x)
+                double* t2)
 {
    int lda;
    const int cholesky_mo_num = data->cholesky_mo_num;
@ -19,19 +17,14 @@ void gpu_upload(gpu_data* data,
    int ngpus = 1;
    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

+    double * tau = malloc(nO*nO*nV*nV * sizeof(double));
+    double * tau_x = malloc(nO*nO*nV*nV * sizeof(double));
+
    #pragma omp parallel num_threads(ngpus)
    {
        int igpu = omp_get_thread_num();
        cudaSetDevice(igpu);

-        double* d_tau = data[igpu].tau;
-        lda = nO * nO;
-        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
-
-        double* d_tau_x = data[igpu].tau_x;
-        lda = nO * nO;
-        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
-
        double* d_t1 = data[igpu].t1;
        lda = nO;
        cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
@ -39,11 +32,76 @@ void gpu_upload(gpu_data* data,
        double* d_t2 = data[igpu].t2;
        lda = nO*nO;
        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
-    }

+        int m,n,k, lda, ldb, ldc;
+        double alpha, beta;
+        double* A;
+        double* B;
+        double* C;
+
+        cublasHandle_t handle;
+        cublasCreate(&handle);
+        cudaStream_t stream[nV];
+
+        double* d_tau = data[igpu].tau;
+
+        double* d_tau_x = data[igpu].tau_x;
+        lda = nO * nO;
+        cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
+
+        if (igpu == 0) {
+
+          for (int i=0 ; i<nV ; ++i) {
+            cudaStreamCreate(&(stream[i]));
+          }
+          alpha = 1.0;
+          for (int j=0 ; j<nO ; ++j) {
+            for (int b=0 ; b<nV ; ++b) {
+              cublasSetStream(handle, stream[b]);
+              beta = t1[j+b*nO];
+              A = &(d_t2[nO*(j + nO*nV*b)]); lda = nO*nO;
+              B = d_t1; ldb = nO;
+              C = &(d_tau[nO*(j + nO*nV*b)]); ldc = nO*nO;
+              cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nV, &alpha, A, lda, &beta, B, ldb, C, ldc);
+            }
+          }
+          cudaDeviceSynchronize();
+          alpha = 2.0;
+          beta = -1.0;
+          for (int b=0 ; b<nV ; ++b) {
+            for (int a=0 ; a<nV ; ++a) {
+              cublasSetStream(handle, stream[a]);
+              A = &(d_tau[nO*nO*(a + nV*b)]); lda = nO;
+              B = &(d_tau[nO*nO*(b + nV*a)]); ldb = nO;
+              C = &(d_tau_x[nO*nO*(a + nV*b)]); ldc = nO;
+              cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO, nO, &alpha, A, lda, &beta, B, ldb, C, ldc);
+            }
+          }
+          for (int i=0 ; i<nV ; ++i) {
+            cudaStreamDestroy(stream[i]);
+          }
+          cublasSetStream(handle, NULL);
+          lda = nO*nO;
+          cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_tau, lda, tau, lda);
+          cublasGetMatrix(nO*nO, nV*nV, sizeof(double), d_tau_x, lda, tau_x, lda);
+        }
+
+        #pragma omp barrier
+
+        if (igpu > 0) {
+          lda = nO * nO;
+          cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);
+          cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
+        }
+        cublasDestroy(handle);
+    }
+    free(tau);
+    free(tau_x);
 }


+
+
 void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
 {
    int ngpus = 1;
@ -1932,3 +1990,57 @@ void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1, doubl
  }
 }

+
+
+double ccsd_energy_space_gpu(gpu_data* data)
+{
+    double result = 0.0;
+
+    const int nO = data->nO;
+    const int nV = data->nV;
+
+    int ngpus = 1;
+    if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
+
+    #pragma omp parallel num_threads(ngpus)
+    {
+      int m,n,k, lda, ldb, ldc;
+      double alpha, beta;
+      double* A;
+      double* B;
+      double* C;
+
+      int igpu = omp_get_thread_num();
+      cudaSetDevice(igpu);
+
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+
+      double result_local = 0.0;
+      #pragma omp sections
+      {
+        #pragma omp section
+        {
+          double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
+          double* d_t1 = data[igpu].t1;
+          double x;
+          cublasDdot(handle, nO*nV, d_cc_space_f_ov, 1, d_t1, 1, &x);
+          result_local += 2.0*x;
+        }
+        #pragma omp section
+        {
+          double* d_tau_x = data[igpu].tau_x;
+          double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
+          double x;
+          cublasDdot(handle, nO*nO*nV*nV, d_tau_x, 1, d_cc_space_v_oovv, 1, &x);
+          result_local += x;
+        }
+      }
+      cublasDestroy(handle);
+      #pragma omp critical
+      {
+        result += result_local;
+      }
+  }
+  return result;
+}
--- a/devel/ccsd_gpu/gpu_module.f90
+++ b/devel/ccsd_gpu/gpu_module.f90
@ -30,16 +30,15 @@ module gpu_module
        real(c_double), intent(in)  :: cc_space_f_vv(nV,nV)
    end function

-    subroutine gpu_upload(gpu_data, nO, nV, t1, t2, tau, tau_x) bind(C)
+    subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
        integer(c_int), intent(in), value  :: nO, nV
        real(c_double), intent(in) :: t1(nO,nV)
        real(c_double), intent(in) :: t2(nO,nO,nV,nV)
-        real(c_double), intent(in) :: tau(nO,nO,nV,nV)
-        real(c_double), intent(in) :: tau_x(nO,nO,nV,nV)
    end subroutine

+
    subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
        import c_int, c_double, c_ptr
        type(c_ptr), value    :: gpu_data
@ -76,6 +75,10 @@ module gpu_module
        real(c_double), intent(out) :: max_r2
    end subroutine

+    double precision function ccsd_energy_space_gpu(gpu_data) bind(C)
+        import c_ptr
+        type(c_ptr), value    :: gpu_data
+    end function


    subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)