
Merge branch 'gpu' into jastrow_hpc

Gianfranco Abrusci committed 2022-04-07 18:43:11 +02:00
commit 4ee83a48d0
2 changed files with 59 additions and 49 deletions


@@ -310,20 +310,21 @@ AS_IF([test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [
   case $CC in
     *gcc*)
-      CFLAGS="$CFLAGS -fopenacc"
+      CFLAGS="$CFLAGS -fopenmp"
+      LDFLAGS="-lcublas"
       ;;
     *nvc*)
-      CFLAGS="$CFLAGS -acc=gpu"
+      CFLAGS="$CFLAGS -mp=gpu -cudalib=cublas"
       ;;
   esac
   case $FC in
     *gfortran*)
-      FCFLAGS="$FCFLAGS -fopenacc"
+      FCFLAGS="$FCFLAGS -fopenmp"
       ;;
     *nvfortran*)
-      FCFLAGS="$FCFLAGS -acc=gpu"
+      FCFLAGS="$FCFLAGS -mp=gpu -cudalib=cublas"
       ;;
   esac
 ])
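
The hunk above swaps the OpenACC flags (-fopenacc, -acc=gpu) for OpenMP offload flags (-fopenmp, -mp=gpu) and links cuBLAS. Below is a minimal sketch of the kind of kernel these flags compile for the GPU; the names are assumptions, not qmckl code, and gcc additionally needs to have been built with offload support for the target region to actually run on the device.

#+begin_src c
#include <stdio.h>

int main(void) {
  double x[1000];
  /* Offloaded when built with "gcc -fopenmp" or "nvc -mp=gpu" as
     configured above; falls back to the host otherwise. */
#pragma omp target teams distribute parallel for map(from: x[0:1000])
  for (int i = 0; i < 1000; ++i)
    x[i] = 2.0 * i;            /* trivially parallel work */
  printf("x[999] = %f\n", x[999]);
  return 0;
}
#+end_src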


@@ -6024,14 +6024,15 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
-                                                 const int64_t cord_num,
-                                                 const int64_t elec_num,
-                                                 const int64_t nucl_num,
-                                                 const int64_t walk_num,
-                                                 const double* een_rescaled_e,
-                                                 const double* een_rescaled_n,
-                                                 double* const tmp_c )
+qmckl_exit_code
+qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
+                                 const int64_t cord_num,
+                                 const int64_t elec_num,
+                                 const int64_t nucl_num,
+                                 const int64_t walk_num,
+                                 const double* een_rescaled_e,
+                                 const double* een_rescaled_n,
+                                 double* const tmp_c )
 {
   if (context == QMCKL_NULL_CONTEXT) {
@@ -6071,22 +6072,22 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
 #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
   {
-#pragma acc loop independent gang worker vector collapse(5)
-    for (int nw=0; nw < walk_num; ++nw) {
-      for (int i=0; i<cord_num; ++i){
+#pragma acc loop independent gang worker vector collapse(5)
+    for (int nw=0; nw < walk_num; ++nw) {
+      for (int i=0; i<cord_num; ++i){
-        // Replacement for single DGEMM
-        for (int j=0; j<cord_num+1; j++) {
-          for (int k=0; k<nucl_num; k++) {
-            for (int l=0; l<elec_num; l++) {
+        // Replacement for single DGEMM
+        for (int j=0; j<cord_num+1; j++) {
+          for (int k=0; k<nucl_num; k++) {
+            for (int l=0; l<elec_num; l++) {
-              // Single reduction
-              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
-              for (int m=0; m<elec_num; m++) {
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
-                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
-                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
-                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
+              // Single reduction
+              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
+              for (int m=0; m<elec_num; m++) {
+                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
+                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
+                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
+                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
               }
             }
           }
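
For each (nw, i) pair, the five collapsed loops above compute the contraction tmp_c(l,k,j) = Σ_m een_rescaled_e(l,m) * een_rescaled_n(m,k,j), i.e. the "single DGEMM" the comment refers to. A standalone sketch of that per-slice product follows; the helper is hypothetical, with shapes assumed from the strides in the surrounding code (column-major, unit stride on the leading index).

#+begin_src c
#include <stddef.h>

/* Hypothetical helper, not the qmckl API: C = A * B for one (nw,i)
   slice. A is the elec_num x elec_num een_rescaled_e slice, B the
   elec_num x nucl_num*(cord_num+1) een_rescaled_n slice, and C the
   matching tmp_c slice. */
static void tmp_c_slice(size_t elec_num, size_t nucl_num, size_t cord_num,
                        const double *A, const double *B, double *C)
{
  const size_t ncols = nucl_num * (cord_num + 1);  /* runs over (k,j) */
  for (size_t col = 0; col < ncols; ++col) {
    for (size_t l = 0; l < elec_num; ++l) {
      double acc = 0.;
      for (size_t m = 0; m < elec_num; ++m)
        acc += A[l + m*elec_num] * B[m + col*elec_num];
      C[l + col*elec_num] = acc;
    }
  }
}
#+end_src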
@@ -6164,32 +6165,33 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
   const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
   const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
+  // WARNING This implementation seems unoptimized
 #pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
   {
 #pragma omp teams distribute parallel for collapse(5)
-    for (int nw=0; nw < walk_num; ++nw) {
-      for (int i=0; i<cord_num; ++i){
+    for (int nw=0; nw < walk_num; ++nw) {
+      for (int i=0; i<cord_num; ++i){
-        // Replacement for single DGEMM
-        for (int j=0; j<cord_num+1; j++) {
-          for (int k=0; k<nucl_num; k++) {
-            for (int l=0; l<elec_num; l++) {
+        // Replacement for single DGEMM
+        for (int j=0; j<cord_num+1; j++) {
+          for (int k=0; k<nucl_num; k++) {
+            for (int l=0; l<elec_num; l++) {
-              // Single reduction
-              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
-              for (int m=0; m<elec_num; m++) {
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
-                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
-                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
-                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
+              // Single reduction
+              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
+              for (int m=0; m<elec_num; m++) {
+                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
+                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
+                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
+                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
               }
             }
           }
         }
       }
     }
   }
 }
   return QMCKL_SUCCESS;
 }
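
The WARNING added above points at the accumulation pattern: tmp_c is read back and rewritten through device memory on every m iteration. A hedged sketch of the usual first fix, a scalar accumulator for the innermost reduction, follows; it is not part of the commit, and the helper and its stride arguments are assumptions meant to mirror the definitions in the surrounding code.

#+begin_src c
#include <stdint.h>

/* Hypothetical rewrite of the innermost reduction: accumulate in a
   register-resident scalar and store tmp_c once per (nw,i,j,k,l). */
static void tmp_c_reduce(double *tmp_c,
                         const double *een_rescaled_e,
                         const double *een_rescaled_n,
                         const int64_t elec_num,
                         const int64_t idx_c,    /* l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c */
                         const int64_t base_e,   /* l + i*stride_i_e + nw*stride_nw_e */
                         const int64_t stride_m_e,
                         const int64_t base_n)   /* k*stride_k_n + j*stride_j_n + nw*stride_nw_n */
{
  double acc = 0.;
  for (int64_t m = 0; m < elec_num; ++m)
    acc += een_rescaled_e[base_e + m*stride_m_e]
         * een_rescaled_n[base_n + m];           /* m has unit stride here */
  tmp_c[idx_c] = acc;
}
#+end_src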
@@ -6214,6 +6216,7 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_CUBLAS_OFFLOAD
+qmckl_exit_code
 qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
                                     const int64_t cord_num,
                                     const int64_t elec_num,
@@ -6268,16 +6271,19 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
   const int64_t bf = elec_num*nucl_num*(cord_num+1);
   const int64_t cf = bf;
   info = QMCKL_SUCCESS;
+#pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
+#pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
   {
 #pragma omp target teams distribute parallel for collapse(2)
     for (int nw=0; nw < walk_num; ++nw) {
       for (int i=0; i<cord_num; ++i){
         //CuBlas implementation
-        int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
+        cublasStatus_t cublasError =
+          cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                       &(een_rescaled_e[af*(i+nw*(cord_num+1))]), \
                       LDA, \
                       &(een_rescaled_n[bf*nw]), \
@@ -6286,6 +6292,7 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
                       &(tmp_c[cf*(i+nw*cord_num)]), \
                       LDC);
+        /*
         //Manage cublas ERROR
         if(cublasError != CUBLAS_STATUS_SUCCESS){
           printf("CUBLAS ERROR %d", cublasError);
@@ -6294,6 +6301,7 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
         }else{
           info = QMCKL_SUCCESS;
         }
+        */
       }
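
The pragmas above stage the arrays on the device and hand their device addresses to cuBLAS, while the status check is left commented out. A self-contained sketch of the same interop pattern for a square n x n product follows; the helper name and shapes are assumptions, not the qmckl code, and it includes the kind of status check the commented block performs.

#+begin_src c
#include <stdio.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

/* Hypothetical helper: map a, b, c to the device, then call
   cublasDgemm on their device addresses via use_device_ptr. */
int dgemm_on_device(cublasHandle_t handle, int n,
                    const double *a, const double *b, double *c)
{
  const double alpha = 1., beta = 0.;
  int ok = 0;
#pragma omp target enter data map(to: a[0:n*n], b[0:n*n]) map(alloc: c[0:n*n])
#pragma omp target data use_device_ptr(a, b, c)
  {
    cublasStatus_t st = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                    n, n, n, &alpha, a, n, b, n,
                                    &beta, c, n);
    if (st != CUBLAS_STATUS_SUCCESS) {  /* the check commented out above */
      fprintf(stderr, "cublasDgemm failed: status %d\n", (int) st);
      ok = -1;
    }
  }
  cudaDeviceSynchronize();  /* let the GEMM finish before copying c back */
#pragma omp target exit data map(from: c[0:n*n])
  return ok;
}
#+end_src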
@@ -6314,7 +6322,8 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
 #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code qmckl_compute_tmp_c_cublas_offload (
+qmckl_exit_code
+qmckl_compute_tmp_c_cublas_offload (
           const qmckl_context context,
           const int64_t cord_num,
           const int64_t elec_num,
@@ -6539,8 +6548,6 @@ qmckl_compute_dtmp_c_hpc (const qmckl_context context,
   const int64_t bf = elec_num*nucl_num*(cord_num+1);
   const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
-  printf("COUCOU\n");
 #ifdef HAVE_OPENMP
 #pragma omp parallel for collapse(2)
 #endif
@@ -6573,7 +6580,8 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
+qmckl_exit_code
+qmckl_compute_dtmp_c_acc_offload (
           const qmckl_context context,
           const int64_t cord_num,
           const int64_t elec_num,
@@ -6633,7 +6641,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
             for(int m=0; m<elec_num; m++) {
               // Single reduction
-              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
+              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
               for(int n=0; n<elec_num; n++){
                 dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
                   dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
@@ -6772,7 +6780,8 @@ qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
+qmckl_exit_code
+qmckl_compute_dtmp_c_cublas_offload (
           const qmckl_context context,
           const int64_t cord_num,
           const int64_t elec_num,