Removed GPU from Jastrow

2025-01-03 10:06:09 +01:00 · 2023-03-01 14:47:32 +01:00 · 2023-03-01 14:47:32 +01:00 · ea21ec2ef7
commit ea21ec2ef7
parent 9a779f2a94
1 changed files with 24 additions and 705 deletions
--- a/org/qmckl_jastrow.org
+++ b/org/qmckl_jastrow.org
@ -20,16 +20,16 @@
  \[
  J_{\text{eN}}(\mathbf{r},\mathbf{R}) =
  \sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}} 
-  \frac{a_{1,\alpha}\, g_\alpha(R_{i\alpha})}{1+a_{2,\alpha}\, g_\alpha(R_{i\alpha})} +
+  \frac{a_{1\,\alpha}\, f_\alpha(R_{i\,\alpha})}{1+a_{2\,\alpha}\, f_\alpha(R_{i\alpha})} +
-  \sum_{p=2}^{N_\text{ord}^a} a_{p+1,\alpha}\, [g_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
+  \sum_{p=2}^{N_\text{ord}^a} a_{p+1\,\alpha}\, [f_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
  \]
  $J_{\text{ee}}$ contains electron-electron terms:
  \[
  J_{\text{ee}}(\mathbf{r}) =
  \sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1}
-  \frac{b_1\, f(r_{ij})}{1+b_2\, f(r_{ij})} +
+  \frac{\frac{1}{2}(1+\delta^{\uparrow\downarrow}_{ij}) b_1\, f_{\text{ee}}(r_{ij})}{1+b_2\, f_{\text{ee}}(r_{ij})} +
-  \sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f(r_{ij})]^p  - J_{ee}^\infty
+  \sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f_{\text{ee}}(r_{ij})]^p  - J_{ee}^\infty
  \]
  and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms:
@ -42,7 +42,7 @@
       \sum_{p=2}^{N_{\text{ord}}}
        \sum_{k=0}^{p-1}
         \sum_{l=0}^{p-k-2\delta_{k,0}}
-           c_{lkp\alpha} \left[ f({r}_{ij}) \right]^k
+           c_{lkp\alpha} \left[ g_\text{ee}({r}_{ij}) \right]^k
             \left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right]
             \left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2}
   \]
@ -52,7 +52,7 @@
  $f$ and $g$ are scaling function defined as
  \[
-  f(r) = \frac{1-e^{-\kappa\, r}}{\kappa} \text{ and }
+  f_\alpha(r) = \frac{1-e^{-\kappa_\alpha\, r}}{\kappa_\alpha} \text{ and }
  g_\alpha(r) = e^{-\kappa_\alpha\, r}.
  \]
@ -118,11 +118,6 @@ int main() {
 #include "qmckl_jastrow_private_func.h"
 #include "qmckl_jastrow_private_type.h"
 #ifdef HAVE_CUBLAS_OFFLOAD
 #include "cublas_v2.h"
 #endif
  #+end_src
 * Context
@ -404,13 +399,9 @@ typedef struct qmckl_jastrow_struct{
  bool      provided;
  char *    type;
  #ifdef HAVE_HPC
  bool     gpu_offload;
  #endif
 } qmckl_jastrow_struct;
    #+end_src
   The ~uninitialized~ integer contains one bit set to one for each
   initialization function which has not been called. It becomes equal
   to zero after all initialization functions have been called. The
@ -458,9 +449,9 @@ qmckl_exit_code  qmckl_set_jastrow_bord_num          (qmckl_context context, con
 qmckl_exit_code  qmckl_set_jastrow_cord_num          (qmckl_context context, const int64_t cord_num);
 qmckl_exit_code  qmckl_set_jastrow_type_nucl_num     (qmckl_context context, const int64_t type_nucl_num);
 qmckl_exit_code  qmckl_set_jastrow_type_nucl_vector  (qmckl_context context, const int64_t* type_nucl_vector, const int64_t nucl_num);
-qmckl_exit_code  qmckl_set_jastrow_a_vector       (qmckl_context context, const double * a_vector, const int64_t size_max);
+qmckl_exit_code  qmckl_set_jastrow_a_vector          (qmckl_context context, const double * a_vector, const int64_t size_max);
-qmckl_exit_code  qmckl_set_jastrow_b_vector       (qmckl_context context, const double * b_vector, const int64_t size_max);
+qmckl_exit_code  qmckl_set_jastrow_b_vector          (qmckl_context context, const double * b_vector, const int64_t size_max);
-qmckl_exit_code  qmckl_set_jastrow_c_vector       (qmckl_context context, const double * c_vector, const int64_t size_max);
+qmckl_exit_code  qmckl_set_jastrow_c_vector          (qmckl_context context, const double * c_vector, const int64_t size_max);
   #+end_src
   #+NAME:pre2
@ -492,7 +483,6 @@ if (ctx->jastrow.provided) {
 return QMCKL_SUCCESS;
   #+end_src
   #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none
 qmckl_exit_code
 qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num)
@ -910,7 +900,7 @@ qmckl_set_jastrow_rescale_factor_en(qmckl_context context,
 When the required information is completely entered, other data structures are
 computed to accelerate the calculations. The intermediates factors
- are precontracted using BLAS LEVEL 3 operations for an optimal flop count.
+ are precontracted using BLAS LEVEL 3 operations.
   #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context);
@ -952,11 +942,6 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) {
                           NULL);
  }
 /* Decide if the Jastrow if offloaded on GPU or not */
 #if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD))
  ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100;
 #endif
  qmckl_exit_code rc;
  rc = qmckl_provide_jastrow_asymp_jasa(context);
@ -2016,16 +2001,16 @@ assert(fabs(asymp_jasb[1]-0.31567342786262853) < 1.e-12);
     #+end_src
-** Electron-electron component \(f_{ee}\)
+** Electron-electron component \(f_\text{ee}\)
  Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~
-  componenet and the electron-electron rescaled distances ~ee_distance_rescaled~.
+  component and the electron-electron rescaled distances ~ee_distance_rescaled~.
  \[
-f_{ee} = \sum_{i,j<i} \left[ \frac{ \eta B_0 C_{ij}}{1 - B_1 C_{ij}}  + \sum^{nord}_{k}B_k C_{ij}^k \right] - J_{\text{ee}}^{\infty}
+f_\text{ee} = \sum_{i,j<i} \left[ \frac{ \delta B_0\, C_{ij}}{1 - B_1\, C_{ij}}  + \sum^{n_\text{ord}}_{k}B_k\, C_{ij}^k \right] - J_{\text{ee}}^{\infty}
  \]
-  $\eta$ is the spin factor, $B$ is the vector of $b$ parameters,
+  $\delta$ is the spin factor, $B$ is the vector of $b$ parameters,
  $C$ is the array of scaled distances.
@ -7294,47 +7279,7 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
      ctx->jastrow.tmp_c = tmp_c;
    }
-
+    rc = qmckl_compute_tmp_c(context,
 /* Choose the correct compute function (depending on offload type) */
 #ifdef HAVE_HPC
    const bool gpu_offload = ctx->jastrow.gpu_offload;
 #else
    const bool gpu_offload = false;
 #endif
    if (gpu_offload) {
 #ifdef HAVE_CUBLAS_OFFLOAD
      rc = qmckl_compute_tmp_c_cublas_offload(context,
                                              ctx->jastrow.cord_num,
                                              ctx->electron.num,
                                              ctx->nucleus.num,
                                              ctx->electron.walker.num,
                                              ctx->jastrow.een_rescaled_e,
                                              ctx->jastrow.een_rescaled_n,
                                              ctx->jastrow.tmp_c);
 #elif HAVE_OPENACC_OFFLOAD
      rc = qmckl_compute_tmp_c_acc_offload(context,
                                           ctx->jastrow.cord_num,
                                           ctx->electron.num,
                                           ctx->nucleus.num,
                                           ctx->electron.walker.num,
                                           ctx->jastrow.een_rescaled_e,
                                           ctx->jastrow.een_rescaled_n,
                                           ctx->jastrow.tmp_c);
 #elif HAVE_OPENMP_OFFLOAD
      rc = qmckl_compute_tmp_c_omp_offload(context,
                                           ctx->jastrow.cord_num,
                                           ctx->electron.num,
                                           ctx->nucleus.num,
                                           ctx->electron.walker.num,
                                           ctx->jastrow.een_rescaled_e,
                                           ctx->jastrow.een_rescaled_n,
                                           ctx->jastrow.tmp_c);
 #else
      rc = QMCKL_FAILURE;
 #endif
    } else {
      rc = qmckl_compute_tmp_c(context,
                               ctx->jastrow.cord_num,
                               ctx->electron.num,
                               ctx->nucleus.num,
@ -7342,8 +7287,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
                               ctx->jastrow.een_rescaled_e,
                               ctx->jastrow.een_rescaled_n,
                               ctx->jastrow.tmp_c);
    }
    ctx->jastrow.tmp_c_date = ctx->date;
  }
@ -7394,54 +7337,15 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
    }
-#ifdef HAVE_HPC
+    rc = qmckl_compute_dtmp_c(context,
-    const bool gpu_offload = ctx->jastrow.gpu_offload;
+                              ctx->jastrow.cord_num,
-#else
+                              ctx->electron.num,
-    const bool gpu_offload = false;
+                              ctx->nucleus.num,
-#endif
+                              ctx->electron.walker.num,
-
+                              ctx->jastrow.een_rescaled_e_deriv_e,
-    if (gpu_offload) {
+                              ctx->jastrow.een_rescaled_n,
-#ifdef HAVE_CUBLAS_OFFLOAD
+                              ctx->jastrow.dtmp_c);
-      rc = qmckl_compute_dtmp_c_cublas_offload(context,
+    
                                            ctx->jastrow.cord_num,
                                            ctx->electron.num,
                                            ctx->nucleus.num,
                                            ctx->electron.walker.num,
                                            ctx->jastrow.een_rescaled_e_deriv_e,
                                            ctx->jastrow.een_rescaled_n,
                                            ctx->jastrow.dtmp_c);
 #elif HAVE_OPENACC_OFFLOAD
      rc = qmckl_compute_dtmp_c_acc_offload(context,
                                            ctx->jastrow.cord_num,
                                            ctx->electron.num,
                                            ctx->nucleus.num,
                                            ctx->electron.walker.num,
                                            ctx->jastrow.een_rescaled_e_deriv_e,
                                            ctx->jastrow.een_rescaled_n,
                                            ctx->jastrow.dtmp_c);
 #elif HAVE_OPENMP_OFFLOAD
      rc = qmckl_compute_dtmp_c_omp_offload(context,
                                            ctx->jastrow.cord_num,
                                            ctx->electron.num,
                                            ctx->nucleus.num,
                                            ctx->electron.walker.num,
                                            ctx->jastrow.een_rescaled_e_deriv_e,
                                            ctx->jastrow.een_rescaled_n,
                                            ctx->jastrow.dtmp_c);
 #else
      rc = QMCKL_FAILURE;
 #endif
    } else {
        rc = qmckl_compute_dtmp_c(context,
                                ctx->jastrow.cord_num,
                                ctx->electron.num,
                                ctx->nucleus.num,
                                ctx->electron.walker.num,
                                ctx->jastrow.een_rescaled_e_deriv_e,
                                ctx->jastrow.een_rescaled_n,
                                ctx->jastrow.dtmp_c);
    }
    if (rc != QMCKL_SUCCESS) {
      return rc;
    }
@ -8139,293 +8043,6 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
                                         double* const tmp_c );
    #+end_src
 **** OpenACC offload                                               :noexport:
     #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENACC_OFFLOAD
 qmckl_exit_code
 qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c )
 {
  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }
  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }
  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }
  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }
  // Compute array access strides:
  // For tmp_c...
  const int64_t stride_k_c  = elec_num;
  const int64_t stride_j_c  = stride_k_c * nucl_num;
  const int64_t stride_i_c  = stride_j_c * (cord_num+1);
  const int64_t stride_nw_c = stride_i_c * cord_num;
  // For een_rescaled_e...
  const int64_t stride_m_e  = elec_num;
  const int64_t stride_i_e  = stride_m_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
  // For een_rescaled_n...
  const int64_t stride_k_n  = elec_num;
  const int64_t stride_j_n  = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
  #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
  {
 #pragma acc loop independent gang worker vector collapse(5)
    for (int nw=0; nw < walk_num; ++nw) {
      for (int i=0; i<cord_num; ++i){
        // Replacement for single DGEMM
        for (int j=0; j<cord_num+1; j++) {
          for (int k=0; k<nucl_num; k++) {
            for (int l=0; l<elec_num; l++) {
              // Single reduction
              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
              for (int m=0; m<elec_num; m++) {
                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
                een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
                een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
            }
          }
        }
      }
    }
  }
  }
  return QMCKL_SUCCESS;
 }
 #endif
     #+end_src
    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_OPENACC_OFFLOAD
 qmckl_exit_code
 qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c );
 #endif
     #+end_src
 **** OpenMP offload                                                :noexport:
     #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENMP_OFFLOAD
 qmckl_exit_code
 qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c )
 {
  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }
  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }
  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }
  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }
  // Compute array access strides:
  // For tmp_c...
  const int64_t stride_k_c  = elec_num;
  const int64_t stride_j_c  = stride_k_c * nucl_num;
  const int64_t stride_i_c  = stride_j_c * (cord_num+1);
  const int64_t stride_nw_c = stride_i_c * cord_num;
  // For een_rescaled_e...
  const int64_t stride_m_e  = elec_num;
  const int64_t stride_i_e  = stride_m_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
  // For een_rescaled_n...
  const int64_t stride_k_n  = elec_num;
  const int64_t stride_j_n  = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
  // WARNING This implementation seems unomptimized
  #pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
  {
  #pragma omp teams distribute parallel for collapse(5)
    for (int nw=0; nw < walk_num; ++nw) {
      for (int i=0; i<cord_num; ++i){
        // Replacement for single DGEMM
        for (int j=0; j<cord_num+1; j++) {
          for (int k=0; k<nucl_num; k++) {
            for (int l=0; l<elec_num; l++) {
              // Single reduction
              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
              for (int m=0; m<elec_num; m++) {
                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
              }
            }
          }
        }
      }
    }
  }
  return QMCKL_SUCCESS;
 }
 #endif
     #+end_src
    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_OPENMP_OFFLOAD
 qmckl_exit_code
 qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c );
 #endif
     #+end_src
 **** cuBLAS offload                                                :noexport:
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_CUBLAS_OFFLOAD
 qmckl_exit_code
 qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
                                    const int64_t cord_num,
                                    const int64_t elec_num,
                                    const int64_t nucl_num,
                                    const int64_t walk_num,
                                    const double* een_rescaled_e,
                                    const double* een_rescaled_n,
                                    double* const tmp_c )
 {
  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }
  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }
  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }
  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }
  //cuBLAS initialization
  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stdout, "CUBLAS initialization failed!\n");
    exit(EXIT_FAILURE);
  }
  const double alpha = 1.0;
  const double beta  = 0.0;
  const int64_t M = elec_num;
  const int64_t N = nucl_num*(cord_num + 1);
  const int64_t K = elec_num;
  const int64_t LDA = elec_num;
  const int64_t LDB = elec_num;
  const int64_t LDC = elec_num;
  const int64_t af = elec_num*elec_num;
  const int64_t bf = elec_num*nucl_num*(cord_num+1);
  const int64_t cf = bf;
  #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
  #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
  {
  for (int nw=0; nw < walk_num; ++nw) {
    // /!\  cublasError needs to be checked and return QMCKL_FAILURE if it fails
    int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                                    &(een_rescaled_e[nw*(cord_num+1)]),
                                    LDA, af,
                                    &(een_rescaled_n[bf*nw]),
                                    LDB, 0,
                                    &beta,
                                    &(tmp_c[nw*cord_num]),
                                    LDC, cf, cord_num);
  }
  }
  #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
  cublasDestroy(handle);
  return QMCKL_SUCCESS;
  }
 #endif
 #+end_src
    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_CUBLAS_OFFLOAD
 qmckl_exit_code
 qmckl_compute_tmp_c_cublas_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e,
      const double* een_rescaled_n,
      double* const tmp_c );
 #endif
     #+end_src
 *** Compute dtmp_c
   :PROPERTIES:
   :Name:     qmckl_compute_dtmp_c
@ -8667,304 +8284,6 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
      double* const dtmp_c );
     #+end_src
 **** OpenACC offload                                               :noexport:
     #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENACC_OFFLOAD
 qmckl_exit_code
 qmckl_compute_dtmp_c_acc_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e_deriv_e,
      const double* een_rescaled_n,
      double* const dtmp_c ) {
  if (context == QMCKL_NULL_CONTEXT) {
     return QMCKL_INVALID_CONTEXT;
  }
  if (cord_num < 0) {
     return QMCKL_INVALID_ARG_2;
  }
  if (elec_num <= 0) {
     return QMCKL_INVALID_ARG_3;
  }
  if (nucl_num <= 0) {
     return QMCKL_INVALID_ARG_4;
  }
  // Compute strides...
  // For dtmp_c
  const int64_t stride_l_d  = elec_num;
  const int64_t stride_k_d  = stride_l_d * 4;
  const int64_t stride_j_d  = stride_k_d * nucl_num;
  const int64_t stride_i_d  = stride_j_d * (cord_num+1);
  const int64_t stride_nw_d = stride_i_d * cord_num;
  // For een_rescaled_e_deriv_e
  const int64_t stride_l_e  = elec_num;
  const int64_t stride_n_e  = stride_l_e * 4;
  const int64_t stride_i_e  = stride_n_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * cord_num;
  // For een_rescaled_n
  const int64_t stride_k_n  = elec_num;
  const int64_t stride_j_n  = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
  #pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
  {
  #pragma acc loop independent gang worker vector collapse(6)
  for (int nw=0; nw < walk_num; nw++) {
    for (int i=0; i < cord_num; i++) {
      // Single DGEMM
      for(int j=0; j<cord_num+1; j++) {
        for(int k=0; k<nucl_num; k++) {
          for(int l=0; l<4; l++) {
            for(int m=0; m<elec_num; m++) {
              // Single reduction
              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
              for(int n=0; n<elec_num; n++){
                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
                een_rescaled_e_deriv_e[m + l * stride_l_e +  n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
                een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
              }
            }
          }
        }
      }
    }
  }
  }
  return QMCKL_SUCCESS;
 }
 #endif
     #+end_src
    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_OPENACC_OFFLOAD
    qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e_deriv_e,
      const double* een_rescaled_n,
      double* const dtmp_c );
 #endif
     #+end_src
 **** OpenMP offload                                                :noexport:
     #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENMP_OFFLOAD
 qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e_deriv_e,
      const double* een_rescaled_n,
      double* const dtmp_c ) {
  if (context == QMCKL_NULL_CONTEXT) {
     return QMCKL_INVALID_CONTEXT;
  }
  if (cord_num < 0) {
     return QMCKL_INVALID_ARG_2;
  }
  if (elec_num <= 0) {
     return QMCKL_INVALID_ARG_3;
  }
  if (nucl_num <= 0) {
     return QMCKL_INVALID_ARG_4;
  }
  // Compute strides...
  // For dtmp_c
  const int64_t stride_l_d  = elec_num;
  const int64_t stride_k_d  = stride_l_d * 4;
  const int64_t stride_j_d  = stride_k_d * nucl_num;
  const int64_t stride_i_d  = stride_j_d * (cord_num+1);
  const int64_t stride_nw_d = stride_i_d * cord_num;
  // For een_rescaled_e_deriv_e
  const int64_t stride_l_e  = elec_num;
  const int64_t stride_n_e  = stride_l_e * 4;
  const int64_t stride_i_e  = stride_n_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * cord_num;
  // For een_rescaled_n
  const int64_t stride_k_n  = elec_num;
  const int64_t stride_j_n  = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
  // WARNING This implementation seems unomptimized
  #pragma omp target map(from:dtmp_c[0:size_dtmp_c]) map(to:een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
  {
  #pragma omp teams distribute parallel for collapse(6)
  for (int nw=0; nw < walk_num; nw++) {
    for (int i=0; i < cord_num; i++) {
      // Single DGEMM
      for(int j=0; j<cord_num+1; j++) {
        for(int k=0; k<nucl_num; k++) {
          for(int l=0; l<4; l++) {
            for(int m=0; m<elec_num; m++) {
              // Single reduction
              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
              for(int n=0; n<elec_num; n++){
                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
                een_rescaled_e_deriv_e[m + l * stride_l_e +  n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
                een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
              }
            }
          }
        }
      }
    }
  }
  }
  return QMCKL_SUCCESS;
 }
 #endif
     #+end_src
    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_OPENMP_OFFLOAD
    qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e_deriv_e,
      const double* een_rescaled_n,
      double* const dtmp_c );
 #endif
     #+end_src
 **** cuBLAS offload                                                :noexport:
     #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_CUBLAS_OFFLOAD
 qmckl_exit_code
 qmckl_compute_dtmp_c_cublas_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e_deriv_e,
      const double* een_rescaled_n,
      double* const dtmp_c ) {
  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }
  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }
  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }
  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }
  if (walk_num <= 0) {
    return QMCKL_INVALID_ARG_5;
  }
  qmckl_exit_code  info = QMCKL_SUCCESS;
  //cuBLAS initialization
  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stdout, "CUBLAS initialization failed!\n");
    exit(EXIT_FAILURE);
  }
  const double alpha = 1.0;
  const double beta  = 0.0;
  const int64_t M = 4*elec_num;
  const int64_t N = nucl_num*(cord_num + 1);
  const int64_t K = elec_num;
  const int64_t LDA = 4*elec_num;
  const int64_t LDB = elec_num;
  const int64_t LDC = 4*elec_num;
  const int64_t af = elec_num*elec_num*4;
  const int64_t bf = elec_num*nucl_num*(cord_num+1);
  const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
  #pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num])
  #pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
  {
  for (int64_t nw=0; nw < walk_num; ++nw) {
    int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                                      &(een_rescaled_e_deriv_e[(nw*(cord_num+1))]),
                                      LDA, af,
                                      &(een_rescaled_n[bf*nw]), LDB, 0,
                                      &beta,
                                      &(dtmp_c[(nw*cord_num)]),
                                      LDC, cf, cord_num);
  }
  }
  #pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num])
  cublasDestroy(handle);
  return info;
 }
 #endif
     #+end_src
     #+RESULTS:
    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_CUBLAS_OFFLOAD
    qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
      const qmckl_context context,
      const int64_t cord_num,
      const int64_t elec_num,
      const int64_t nucl_num,
      const int64_t walk_num,
      const double* een_rescaled_e_deriv_e,
      const double* een_rescaled_n,
      double* const dtmp_c );
 #endif
     #+end_src
 *** Test
    #+name: helper_funcs