From caec5183841840a7933e7fa4cf46bcb0ff3f14d9 Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Fri, 28 Jun 2024 11:00:58 +0200
Subject: [PATCH] Added Nvidia module

---
 src/ccsd/ccsd_space_orb_sub.irp.f |  10 +-
 src/gpu/gpu_module.F90            |   6 +-
 src/gpu_nvidia/LIB                |   1 +
 src/gpu_nvidia/NEED               |   1 +
 src/gpu_nvidia/README.rst         |   5 +
 src/gpu_nvidia/gpu.c              | 327 ++++++++++++++++++++++++++++++
 src/gpu_x86/gpu.c                 |  40 ++--
 7 files changed, 359 insertions(+), 31 deletions(-)
 create mode 100644 src/gpu_nvidia/LIB
 create mode 100644 src/gpu_nvidia/NEED
 create mode 100644 src/gpu_nvidia/README.rst
 create mode 100644 src/gpu_nvidia/gpu.c

diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f
index 455d62f7..e7c9b1ab 100644
--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@@ -384,17 +384,17 @@ subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
   !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas) &
   !$OMP PRIVATE(i,j,a,b) &
   !$OMP DEFAULT(NONE)
-  do j=1,nO
-    !$OMP DO
-    do b=1,nV
-      call gpu_set_stream(blas,stream(b))
+  !$OMP DO
+  do b=1,nV
+    call gpu_set_stream(blas,stream(b))
+    do j=1,nO
       call gpu_dgeam_c(blas%c, 'N', 'N', nO*1_8, nV*1_8, &
          1.d0, c_loc(t2%f(1,j,1,b)), nO*nO*1_8, &
          h_t1(j,b), t1%c, nO*1_8, &
          c_loc(tau%f(1,j,1,b)), nO*nO*1_8)
     enddo
-    !$OMP END DO
   enddo
+  !$OMP END DO
   !$OMP END PARALLEL
 
   call gpu_synchronize()
diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90
index 51f80ac0..d1ddad4c 100644
--- a/src/gpu/gpu_module.F90
+++ b/src/gpu/gpu_module.F90
@@ -120,7 +120,7 @@ module gpu
 
     subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_double), intent(out)    :: res
@@ -128,7 +128,7 @@ module gpu
 
     subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       integer(c_int64_t), value      :: n, incx, incy
       type(c_ptr), intent(in), value :: dx, dy
       real(c_float), intent(out)  :: res
@@ -137,7 +137,7 @@ module gpu
     subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
       b, ldb, c, ldc) bind(C, name='gpu_dgeam')
       import
-      type(c_ptr), intent(in), value :: handle
+      type(c_ptr), intent(in)        :: handle
       character(c_char), intent(in), value :: transa, transb
       integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
       real(c_double), intent(in), value :: alpha, beta
diff --git a/src/gpu_nvidia/LIB b/src/gpu_nvidia/LIB
new file mode 100644
index 00000000..91f54e91
--- /dev/null
+++ b/src/gpu_nvidia/LIB
@@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
diff --git a/src/gpu_nvidia/NEED b/src/gpu_nvidia/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/gpu_nvidia/NEED
@@ -0,0 +1 @@
+
diff --git a/src/gpu_nvidia/README.rst b/src/gpu_nvidia/README.rst
new file mode 100644
index 00000000..5dcfca92
--- /dev/null
+++ b/src/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
diff --git a/src/gpu_nvidia/gpu.c b/src/gpu_nvidia/gpu.c
new file mode 100644
index 00000000..f0bd247a
--- /dev/null
+++ b/src/gpu_nvidia/gpu.c
@@ -0,0 +1,327 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice(igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+    size_t free, total;
+    cudaError_t rc = cudaMemGetInfo( &free, &total );
+    if (rc != cudaSuccess) {
+      free = INT64_MAX;
+    }
+
+    /* Use managed memory if it does not fit on the GPU */
+    if (size < free && size < total/2) {
+//      rc= cudaMalloc(ptr, size);
+      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+    } else {
+      rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+    }
+    assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(void** ptr) {
+  cudaStream_t stream;
+  cudaError_t rc = cudaStreamCreate(&stream);
+  assert (rc == cudaSuccess);
+  *ptr = (void*) stream;
+}
+
+void gpu_stream_destroy(void** ptr) {
+  assert (*ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy( (cudaStream_t) *ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(void** handle, void** stream) {
+  cublasSetStream( (cublasHandle_t) *handle, (cudaStream_t) *stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(void** handle) {
+  cublasHandle_t cublas_handle;
+  cublasStatus_t rc = cublasCreate(&cublas_handle);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *handle = (void*) cublas_handle;
+}
+
+
+void gpu_blas_destroy(void** handle) {
+  assert (*handle != NULL);
+  cublasStatus_t rc = cublasDestroy( (cublasHandle_t) *handle);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *handle = NULL;
+}
+
+
+void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasDdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+}
+
+
+
+void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t n_, incx_, incy_;
+
+  n_    = (int32_t) n;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasSdot((cublasHandle_t) *handle, n_, x, incx_, y, incy_, result);
+}
+
+
+
+void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasDgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+}
+
+
+
+void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, incx_, incy_;
+
+  m_    = (int32_t) m;
+  n_    = (int32_t) n;
+  lda_  = (int32_t) lda;
+  incx_ = (int32_t) incx;
+  incy_ = (int32_t) incy;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)    m_ == m   );
+  assert ( (int64_t)    n_ == n   );
+  assert ( (int64_t)  lda_ == lda );
+  assert ( (int64_t) incx_ == incx);
+  assert ( (int64_t) incy_ == incy);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+
+  cublasSgemv((cublasHandle_t) *handle, transa_, m_, n_, &alpha, a, lda_, x, incx_, &beta, y, incy_);
+}
+
+
+void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+}
+
+
+
+void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+               const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
+
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, k_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  k_   = (int32_t) k;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t)   k_ == k  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgemm((cublasHandle_t) *handle, transa_, transb_, m_, n_, k_, &alpha, a, lda_, b, ldb_, &beta, c, ldc_);
+}
+
+
+void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+               const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasDgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+
+}
+
+
+void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+               const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
+  assert (*handle != NULL);
+
+  /* Convert to int32_t */
+  int32_t m_, n_, lda_, ldb_, ldc_;
+
+  m_   = (int32_t) m;
+  n_   = (int32_t) n;
+  lda_ = (int32_t) lda;
+  ldb_ = (int32_t) ldb;
+  ldc_ = (int32_t) ldc;
+
+  /* Check for integer overflows */
+  assert ( (int64_t)   m_ == m  );
+  assert ( (int64_t)   n_ == n  );
+  assert ( (int64_t) lda_ == lda);
+  assert ( (int64_t) ldb_ == ldb);
+  assert ( (int64_t) ldc_ == ldc);
+
+  cublasOperation_t transa_ = CUBLAS_OP_N;
+  cublasOperation_t transb_ = CUBLAS_OP_N;
+  if (transa == 'T' || transa == 't') transa_ = CUBLAS_OP_T;
+  if (transb == 'T' || transb == 't') transb_ = CUBLAS_OP_T;
+
+  cublasSgeam((cublasHandle_t) *handle, transa_, transb_, m_, n_, &alpha, a, lda_, &beta, b, ldb_, c, ldc_);
+
+}
diff --git a/src/gpu_x86/gpu.c b/src/gpu_x86/gpu.c
index 5f42cb0d..ac7c3620 100644
--- a/src/gpu_x86/gpu.c
+++ b/src/gpu_x86/gpu.c
@@ -56,7 +56,7 @@ void gpu_stream_destroy(void** ptr) {
   *ptr = NULL;
 }
 
-void gpu_set_stream(void* handle, void* stream) {
+void gpu_set_stream(void** handle, void** stream) {
   return;
 }
 
@@ -79,8 +79,8 @@ void gpu_blas_destroy(void** handle) {
 
 double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
 
-void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
-  assert (handle != NULL);
+void gpu_ddot(void** handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -100,8 +100,8 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
 
 float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
 
-void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
-  assert (handle != NULL);
+void gpu_sdot(void** handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t n_, incx_, incy_;
@@ -122,10 +122,10 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t
 void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
             const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
 
-void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgemv(void** handle, const char transa, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -150,10 +150,10 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int
 void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
                const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
 
-void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgemv(void** handle, const char transa, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, lda_, incx_, incy_;
@@ -178,10 +178,10 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int
 void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
             const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
 
-void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
+void gpu_dgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
                const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -209,10 +209,10 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
 void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
             const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
 
-void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
+void gpu_sgemm(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
                const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
 
-  assert (handle != NULL);
+  assert (*handle != NULL);
 
   /* Convert to int32_t */
   int32_t m_, n_, k_, lda_, ldb_, ldc_;
@@ -236,12 +236,9 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i
 }
 
 
-void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
+void gpu_dgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
                const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+  assert (*handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||
@@ -371,12 +368,9 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
 }
 
 
-void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
+void gpu_sgeam(void** handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
                const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
-  if (handle == NULL) {
-    perror("NULL handle");
-    exit(-1);
-  }
+  assert (*handle != NULL);
 
   if ( (transa == 'N' && transb == 'N') ||
        (transa == 'n' && transb == 'N') ||