H_vv

2024-12-23 03:53:29 +01:00 · 2024-07-01 18:04:48 +02:00 · 2024-07-01 18:04:48 +02:00 · d3c1994c64
commit d3c1994c64
parent 860121d404
5 changed files with 114 additions and 69 deletions
--- a/plugins/local/gpu_nvidia/gpu.c
+++ b/plugins/local/gpu_nvidia/gpu.c
@ -11,10 +11,6 @@
 /* Generic functions */
 bool no_gpu() {
 return false;
 }
 int gpu_ndevices() {
  int ngpus;
  cudaGetDeviceCount(&ngpus);
@ -35,13 +31,13 @@ void gpu_allocate(void** ptr, const int64_t size) {
      free = INT64_MAX;
    }
-    /* Use managed memory if it does not fit on the GPU */
+    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
-    if (size < free && size < total/2) {
+//    /* Use managed memory if it does not fit on the GPU */
 //    if (size < free && size < total/2) {
 //      rc= cudaMalloc(ptr, size);
-      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    } else {
-    } else {
+//      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
-      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//    }
    }
    assert (rc == cudaSuccess);
 }
--- a/plugins/local/gpu_x86/gpu.c
+++ b/plugins/local/gpu_x86/gpu.c
@ -5,14 +5,10 @@
 #include <stdbool.h>
 #include <assert.h>
 bool no_gpu() {
 return true;
 }
 /* Generic functions */
 int gpu_ndevices() {
-  return 1;
+  return 0;
 }
 void gpu_set_device(int32_t i) {
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -183,7 +183,8 @@ subroutine run_ccsd_space_orb
    if (do_mo_cholesky) then
      call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
              d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
-      call compute_H_vv_chol(nO,nV,tau_x,H_vv)
+      call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
              d_cc_space_v_ov_chol,H_vv)
      call compute_H_vo_chol(nO,nV,t1%f,H_vo%f)
      call compute_r1_space_chol(nO,nV,t1%f,t2%f,tau%f,H_oo%F,H_vv%F,H_vo%F,r1%f,max_r1)
@ -403,7 +404,7 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
  type(gpu_stream) :: stream(nV)
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL if (gpu_num == 0) &
  !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
@ -466,7 +467,7 @@ subroutine update_tau_x_space(nO,nV,tau,tau_x)
    call gpu_stream_create(stream(a))
  enddo
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL if (gpu_num == 0) &
  !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
  !$OMP PRIVATE(i,j,a,b) &
  !$OMP DEFAULT(NONE)
--- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f
@ -344,48 +344,47 @@ subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, &
 !    1.d0, H_oo%f(1,1), nO)
 !
-  type(gpu_stream) :: stream(nV)
+  type(gpu_blas) :: blas
  do b=1,nV
    call gpu_stream_create(stream(b))
  enddo
-  !$OMP PARALLEL if (no_gpu()) &
+  !$OMP PARALLEL  &
  !$OMP DEFAULT(SHARED) &
-  !$OMP PRIVATE(u,b,tmp_vov,tmp_ovv)
+  !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv)
  !$OMP SINGLE
  !$OMP TASK
  call gpu_copy(d_cc_space_f_oo, H_oo)
  !$OMP END TASK
  !$OMP END SINGLE
  call gpu_allocate(tmp_vov, nV, nO, nV)
  call gpu_allocate(tmp_ovv, nO, nV, nV)
  call gpu_allocate(tmp_vov, nV, nO, nV)
  call gpu_blas_create(blas)
  !$OMP DO
  do u=1,nO
-    call gpu_dgeam_f(blas_handle, 'N', 'N', 1, nO*nV*nV, 1.d0, &
+    call gpu_dgeam_f(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, &
           tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f, nO, tmp_ovv%f, 1)
    do b=1,nV
-      call gpu_set_stream(blas_handle,stream(b))
+      call gpu_dgeam_f(blas, 'T', 'T', nV, nO, 1.d0, &
      call gpu_dgeam_f(blas_handle, 'T', 'T', nV, nO, 1.d0, &
           tmp_ovv%f(1,1,b), nO, 0.d0, &
           tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV)
    enddo
-    call gpu_dgemm_f(blas_handle, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, &
      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_vov%f, nV, &
      0.d0, tau_kau%f(1,1,u), cholesky_mo_num)
    call gpu_synchronize()
  enddo
  !$OMP END DO
  call gpu_blas_destroy(blas)
  call gpu_deallocate(tmp_vov)
  call gpu_deallocate(tmp_ovv)
  !$OMP TASKWAIT
  !$OMP END PARALLEL
  do b=1,nV
    call gpu_stream_destroy(stream(b))
  enddo
  call gpu_set_stream(blas_handle,gpu_default_stream)
  call gpu_copy(d_cc_space_f_oo, H_oo)
  call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, &
    tau_kau, cholesky_mo_num*nV,  d_cc_space_v_vo_chol, cholesky_mo_num*nV, &
    1.d0, H_oo, nO)
@ -395,52 +394,97 @@ end
 ! H_vv
-subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv)
+subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, &
         d_cc_space_v_ov_chol,H_vv)
  use gpu
  implicit none
-  integer, intent(in)           :: nO,nV
+  integer, intent(in)              :: nO,nV
  type(gpu_double2), intent(in)    :: d_cc_space_f_vv
  type(gpu_double3), intent(in)    :: d_cc_space_v_ov_chol
  type(gpu_double4), intent(in)    :: tau_x
  type(gpu_double2), intent(out)   :: H_vv
  integer :: a,b,i,j,u,k, beta
-  double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:)
+  type(gpu_double3) :: tau_kia, tmp_oov
-  allocate(tau_kia(cholesky_mo_num,nO,nV))
+  call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV)
-  !$omp parallel &
+
-  !$omp default(shared) &
+!  !$omp parallel &
-  !$omp private(i,beta,j,k,a,b,tmp_oov)
+!  !$omp default(shared) &
-  allocate(tmp_oov(nO,nO,nV) )
+!  !$omp private(i,beta,j,k,a,b,tmp_oov)
-  !$omp do
+!  allocate(tmp_oov(nO,nO,nV) )
 !  !$omp do
 !  do a = 1, nV
 !    do b=1,nV
 !      do j=1,nO
 !        do i=1,nO
 !          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
 !        enddo
 !      enddo
 !    enddo
 !    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
 !      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov, nO, &
 !      0.d0, tau_kia(1,1,a), cholesky_mo_num)
 !  enddo
 !  !$omp end do nowait
 !  deallocate(tmp_oov)
 !  !$omp do
 !  do beta = 1, nV
 !    do a = 1, nV
 !      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
 !    enddo
 !  enddo
 !  !$omp end do nowait
 !  !$omp barrier
 !  !$omp end  parallel
 !  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
 !    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol%f, cholesky_mo_num*nO, &
 !    1.d0, H_vv%f, nV)
  type(gpu_blas) :: blas
  PROVIDE gpu_num
  !$OMP PARALLEL &
  !$OMP DEFAULT(SHARED) &
  !$OMP PRIVATE(a,b,tmp_oov,blas)
  !$OMP SINGLE
  !$OMP TASK
  call gpu_copy(d_cc_space_f_vv, H_vv)
  !$OMP END TASK
  !$OMP END SINGLE
  call gpu_blas_create(blas)
  call gpu_allocate(tmp_oov, nO, nO, nV)
  !$OMP DO
  do a = 1, nV
    do b=1,nV
-      do j=1,nO
+      call gpu_dgeam_f(blas, 'N', 'N', nO, nO, 1.d0, &
-        do i=1,nO
+        tau_x%f(1,1,a,b), nO, 0.d0, &
-          tmp_oov(i,j,b) = tau_x%f(i,j,a,b)
+        tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO)
        enddo
      enddo
    enddo
-    call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
+    call gpu_dgemm_f(blas, 'N','T',cholesky_mo_num,nO,nO*nV,1.d0, &
-      cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, &
+      d_cc_space_v_ov_chol%f, cholesky_mo_num, tmp_oov%f, nO, &
-      0.d0, tau_kia(1,1,a), cholesky_mo_num)
+      0.d0, tau_kia%f(1,1,a), cholesky_mo_num)
  enddo
-  !$omp end do nowait
+  !$OMP END DO
  deallocate(tmp_oov)
-  !$omp do
+  call gpu_blas_destroy(blas)
  do beta = 1, nV
    do a = 1, nV
      H_vv%f(a,beta) = cc_space_f_vv(a,beta)
    enddo
  enddo
  !$omp end do nowait
  !$omp barrier
  !$omp end  parallel
  call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
    tau_kia, cholesky_mo_num*nO,  cc_space_v_ov_chol, cholesky_mo_num*nO, &
    1.d0, H_vv%f, nV)
  call gpu_deallocate(tmp_oov)
  !$OMP TASKWAIT
  !$OMP END PARALLEL
  call gpu_dgemm(blas_handle,'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, &
    tau_kia, cholesky_mo_num*nO,  d_cc_space_v_ov_chol, cholesky_mo_num*nO, &
    1.d0, H_vv, nV)
  call gpu_deallocate(tau_kia)
 end
 ! H_vo
--- a/src/gpu/gpu.irp.f
+++ b/src/gpu/gpu.irp.f
@ -16,3 +16,11 @@ BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
 gpu_default_stream%c = C_NULL_PTR
 END_PROVIDER
 BEGIN_PROVIDER [ integer, gpu_num ]
 implicit none
 BEGIN_DOC
 ! Number of usable GPUs
 END_DOC
 gpu_num = gpu_ndevices()
 END_PROVIDER