Removed GPU from Jastrow

2025-04-29 11:54:48 +02:00 · 2023-03-01 14:47:32 +01:00 · 2023-03-01 14:47:32 +01:00 · ea21ec2ef7
commit ea21ec2ef7
parent 9a779f2a94
1 changed files with 24 additions and 705 deletions
--- a/org/qmckl_jastrow.org
+++ b/org/qmckl_jastrow.org
@ -20,16 +20,16 @@
  \[
  J_{\text{eN}}(\mathbf{r},\mathbf{R}) =
  \sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}} 
-  \frac{a_{1,\alpha}\, g_\alpha(R_{i\alpha})}{1+a_{2,\alpha}\, g_\alpha(R_{i\alpha})} +
-  \sum_{p=2}^{N_\text{ord}^a} a_{p+1,\alpha}\, [g_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
+  \frac{a_{1\,\alpha}\, f_\alpha(R_{i\,\alpha})}{1+a_{2\,\alpha}\, f_\alpha(R_{i\alpha})} +
+  \sum_{p=2}^{N_\text{ord}^a} a_{p+1\,\alpha}\, [f_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
  \]

  $J_{\text{ee}}$ contains electron-electron terms:
  \[
  J_{\text{ee}}(\mathbf{r}) =
  \sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1}
-  \frac{b_1\, f(r_{ij})}{1+b_2\, f(r_{ij})} +
-  \sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f(r_{ij})]^p  - J_{ee}^\infty
+  \frac{\frac{1}{2}(1+\delta^{\uparrow\downarrow}_{ij}) b_1\, f_{\text{ee}}(r_{ij})}{1+b_2\, f_{\text{ee}}(r_{ij})} +
+  \sum_{p=2}^{N_\text{ord}^b} b_{p+1}\, [f_{\text{ee}}(r_{ij})]^p  - J_{ee}^\infty
  \]

  and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms:
@ -42,7 +42,7 @@
       \sum_{p=2}^{N_{\text{ord}}}
        \sum_{k=0}^{p-1}
         \sum_{l=0}^{p-k-2\delta_{k,0}}
-           c_{lkp\alpha} \left[ f({r}_{ij}) \right]^k
+           c_{lkp\alpha} \left[ g_\text{ee}({r}_{ij}) \right]^k
             \left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right]
             \left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2}
   \]
@ -52,7 +52,7 @@
  $f$ and $g$ are scaling functions defined as

  \[
-  f(r) = \frac{1-e^{-\kappa\, r}}{\kappa} \text{ and }
+  f_\alpha(r) = \frac{1-e^{-\kappa_\alpha\, r}}{\kappa_\alpha} \text{ and }
  g_\alpha(r) = e^{-\kappa_\alpha\, r}.
  \]

@ -118,11 +118,6 @@ int main() {
 #include "qmckl_jastrow_private_func.h"
 #include "qmckl_jastrow_private_type.h"

-#ifdef HAVE_CUBLAS_OFFLOAD
-#include "cublas_v2.h"
-#endif
-
-
  #+end_src

 * Context
@ -404,13 +399,9 @@ typedef struct qmckl_jastrow_struct{
  bool      provided;
  char *    type;

-  #ifdef HAVE_HPC
-  bool     gpu_offload;
-  #endif
 } qmckl_jastrow_struct;
    #+end_src

-
   The ~uninitialized~ integer contains one bit set to one for each
   initialization function which has not been called. It becomes equal
   to zero after all initialization functions have been called. The
@ -492,7 +483,6 @@ if (ctx->jastrow.provided) {
 return QMCKL_SUCCESS;
   #+end_src

-
   #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none
 qmckl_exit_code
 qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num)
@ -910,7 +900,7 @@ qmckl_set_jastrow_rescale_factor_en(qmckl_context context,

 When the required information is completely entered, other data structures are
 computed to accelerate the calculations. The intermediate factors
- are precontracted using BLAS LEVEL 3 operations for an optimal flop count.
+ are precontracted using BLAS LEVEL 3 operations.

   #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context);
@ -952,11 +942,6 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) {
                           NULL);
  }

-  /* Decide if the Jastrow if offloaded on GPU or not */
-#if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD))
-  ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100;
-#endif
-
  qmckl_exit_code rc;

  rc = qmckl_provide_jastrow_asymp_jasa(context);
@ -2016,16 +2001,16 @@ assert(fabs(asymp_jasb[1]-0.31567342786262853) < 1.e-12);

     #+end_src

-** Electron-electron component \(f_{ee}\)
+** Electron-electron component \(f_\text{ee}\)

  Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~
-  componenet and the electron-electron rescaled distances ~ee_distance_rescaled~.
+  component and the electron-electron rescaled distances ~ee_distance_rescaled~.

  \[
-f_{ee} = \sum_{i,j<i} \left[ \frac{ \eta B_0 C_{ij}}{1 - B_1 C_{ij}}  + \sum^{nord}_{k}B_k C_{ij}^k \right] - J_{\text{ee}}^{\infty}
+f_\text{ee} = \sum_{i,j<i} \left[ \frac{ \delta B_0\, C_{ij}}{1 + B_1\, C_{ij}}  + \sum^{n_\text{ord}}_{k}B_k\, C_{ij}^k \right] - J_{\text{ee}}^{\infty}
  \]

-  $\eta$ is the spin factor, $B$ is the vector of $b$ parameters,
+  $\delta$ is the spin factor, $B$ is the vector of $b$ parameters,
  $C$ is the array of scaled distances.


@ -7294,46 +7279,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
      ctx->jastrow.tmp_c = tmp_c;
    }

-
-    /* Choose the correct compute function (depending on offload type) */
-#ifdef HAVE_HPC
-    const bool gpu_offload = ctx->jastrow.gpu_offload;
-#else
-    const bool gpu_offload = false;
-#endif
-
-    if (gpu_offload) {
-#ifdef HAVE_CUBLAS_OFFLOAD
-      rc = qmckl_compute_tmp_c_cublas_offload(context,
-                                              ctx->jastrow.cord_num,
-                                              ctx->electron.num,
-                                              ctx->nucleus.num,
-                                              ctx->electron.walker.num,
-                                              ctx->jastrow.een_rescaled_e,
-                                              ctx->jastrow.een_rescaled_n,
-                                              ctx->jastrow.tmp_c);
-#elif HAVE_OPENACC_OFFLOAD
-      rc = qmckl_compute_tmp_c_acc_offload(context,
-                                           ctx->jastrow.cord_num,
-                                           ctx->electron.num,
-                                           ctx->nucleus.num,
-                                           ctx->electron.walker.num,
-                                           ctx->jastrow.een_rescaled_e,
-                                           ctx->jastrow.een_rescaled_n,
-                                           ctx->jastrow.tmp_c);
-#elif HAVE_OPENMP_OFFLOAD
-      rc = qmckl_compute_tmp_c_omp_offload(context,
-                                           ctx->jastrow.cord_num,
-                                           ctx->electron.num,
-                                           ctx->nucleus.num,
-                                           ctx->electron.walker.num,
-                                           ctx->jastrow.een_rescaled_e,
-                                           ctx->jastrow.een_rescaled_n,
-                                           ctx->jastrow.tmp_c);
-#else
-      rc = QMCKL_FAILURE;
-#endif
-    } else {
    rc = qmckl_compute_tmp_c(context,
                               ctx->jastrow.cord_num,
                               ctx->electron.num,
@ -7342,8 +7287,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
                               ctx->jastrow.een_rescaled_e,
                               ctx->jastrow.een_rescaled_n,
                               ctx->jastrow.tmp_c);
-    }
-

    ctx->jastrow.tmp_c_date = ctx->date;
  }
@ -7394,44 +7337,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
    }


-#ifdef HAVE_HPC
-    const bool gpu_offload = ctx->jastrow.gpu_offload;
-#else
-    const bool gpu_offload = false;
-#endif
-
-    if (gpu_offload) {
-#ifdef HAVE_CUBLAS_OFFLOAD
-      rc = qmckl_compute_dtmp_c_cublas_offload(context,
-                                            ctx->jastrow.cord_num,
-                                            ctx->electron.num,
-                                            ctx->nucleus.num,
-                                            ctx->electron.walker.num,
-                                            ctx->jastrow.een_rescaled_e_deriv_e,
-                                            ctx->jastrow.een_rescaled_n,
-                                            ctx->jastrow.dtmp_c);
-#elif HAVE_OPENACC_OFFLOAD
-      rc = qmckl_compute_dtmp_c_acc_offload(context,
-                                            ctx->jastrow.cord_num,
-                                            ctx->electron.num,
-                                            ctx->nucleus.num,
-                                            ctx->electron.walker.num,
-                                            ctx->jastrow.een_rescaled_e_deriv_e,
-                                            ctx->jastrow.een_rescaled_n,
-                                            ctx->jastrow.dtmp_c);
-#elif HAVE_OPENMP_OFFLOAD
-      rc = qmckl_compute_dtmp_c_omp_offload(context,
-                                            ctx->jastrow.cord_num,
-                                            ctx->electron.num,
-                                            ctx->nucleus.num,
-                                            ctx->electron.walker.num,
-                                            ctx->jastrow.een_rescaled_e_deriv_e,
-                                            ctx->jastrow.een_rescaled_n,
-                                            ctx->jastrow.dtmp_c);
-#else
-      rc = QMCKL_FAILURE;
-#endif
-    } else {
    rc = qmckl_compute_dtmp_c(context,
                              ctx->jastrow.cord_num,
                              ctx->electron.num,
@ -7440,7 +7345,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
                              ctx->jastrow.een_rescaled_e_deriv_e,
                              ctx->jastrow.een_rescaled_n,
                              ctx->jastrow.dtmp_c);
-    }
    
    if (rc != QMCKL_SUCCESS) {
      return rc;
@ -8139,293 +8043,6 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
                                         double* const tmp_c );
    #+end_src

-**** OpenACC offload                                               :noexport:
-
-     #+begin_src c :comments org :tangle (eval c) :noweb yes
-#ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code
-qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
-                                 const int64_t cord_num,
-                                 const int64_t elec_num,
-                                 const int64_t nucl_num,
-                                 const int64_t walk_num,
-                                 const double* een_rescaled_e,
-                                 const double* een_rescaled_n,
-                                 double* const tmp_c )
-{
-
-  if (context == QMCKL_NULL_CONTEXT) {
-    return QMCKL_INVALID_CONTEXT;
-  }
-
-  if (cord_num < 0) {
-    return QMCKL_INVALID_ARG_2;
-  }
-
-  if (elec_num <= 0) {
-    return QMCKL_INVALID_ARG_3;
-  }
-
-  if (nucl_num <= 0) {
-    return QMCKL_INVALID_ARG_4;
-  }
-
-  // Compute array access strides:
-  // For tmp_c...
-  const int64_t stride_k_c  = elec_num;
-  const int64_t stride_j_c  = stride_k_c * nucl_num;
-  const int64_t stride_i_c  = stride_j_c * (cord_num+1);
-  const int64_t stride_nw_c = stride_i_c * cord_num;
-  // For een_rescaled_e...
-  const int64_t stride_m_e  = elec_num;
-  const int64_t stride_i_e  = stride_m_e * elec_num;
-  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
-  // For een_rescaled_n...
-  const int64_t stride_k_n  = elec_num;
-  const int64_t stride_j_n  = stride_k_n * nucl_num;
-  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
-
-  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
-  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
-  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
-
-  #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
-  {
-#pragma acc loop independent gang worker vector collapse(5)
-    for (int nw=0; nw < walk_num; ++nw) {
-      for (int i=0; i<cord_num; ++i){
-
-        // Replacement for single DGEMM
-        for (int j=0; j<cord_num+1; j++) {
-          for (int k=0; k<nucl_num; k++) {
-            for (int l=0; l<elec_num; l++) {
-
-              // Single reduction
-              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
-              for (int m=0; m<elec_num; m++) {
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
-                een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
-                een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
-            }
-          }
-        }
-      }
-    }
-  }
-  }
-
-  return QMCKL_SUCCESS;
-}
-#endif
-     #+end_src
-
-    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
-#ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code
-qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
-                                 const int64_t cord_num,
-                                 const int64_t elec_num,
-                                 const int64_t nucl_num,
-                                 const int64_t walk_num,
-                                 const double* een_rescaled_e,
-                                 const double* een_rescaled_n,
-                                 double* const tmp_c );
-#endif
-     #+end_src
-
-**** OpenMP offload                                                :noexport:
-
-     #+begin_src c :comments org :tangle (eval c) :noweb yes
-#ifdef HAVE_OPENMP_OFFLOAD
-qmckl_exit_code
-qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
-                                 const int64_t cord_num,
-                                 const int64_t elec_num,
-                                 const int64_t nucl_num,
-                                 const int64_t walk_num,
-                                 const double* een_rescaled_e,
-                                 const double* een_rescaled_n,
-                                 double* const tmp_c )
-{
-
-  if (context == QMCKL_NULL_CONTEXT) {
-    return QMCKL_INVALID_CONTEXT;
-  }
-
-  if (cord_num < 0) {
-    return QMCKL_INVALID_ARG_2;
-  }
-
-  if (elec_num <= 0) {
-    return QMCKL_INVALID_ARG_3;
-  }
-
-  if (nucl_num <= 0) {
-    return QMCKL_INVALID_ARG_4;
-  }
-
-  // Compute array access strides:
-  // For tmp_c...
-  const int64_t stride_k_c  = elec_num;
-  const int64_t stride_j_c  = stride_k_c * nucl_num;
-  const int64_t stride_i_c  = stride_j_c * (cord_num+1);
-  const int64_t stride_nw_c = stride_i_c * cord_num;
-  // For een_rescaled_e...
-  const int64_t stride_m_e  = elec_num;
-  const int64_t stride_i_e  = stride_m_e * elec_num;
-  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
-  // For een_rescaled_n...
-  const int64_t stride_k_n  = elec_num;
-  const int64_t stride_j_n  = stride_k_n * nucl_num;
-  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
-
-  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
-  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
-  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
-
-
-  // WARNING This implementation seems unomptimized
-  #pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
-  {
-  #pragma omp teams distribute parallel for collapse(5)
-    for (int nw=0; nw < walk_num; ++nw) {
-      for (int i=0; i<cord_num; ++i){
-
-        // Replacement for single DGEMM
-        for (int j=0; j<cord_num+1; j++) {
-          for (int k=0; k<nucl_num; k++) {
-            for (int l=0; l<elec_num; l++) {
-
-              // Single reduction
-              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
-              for (int m=0; m<elec_num; m++) {
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
-                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
-                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
-                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  return QMCKL_SUCCESS;
-}
-#endif
-     #+end_src
-
-    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
-#ifdef HAVE_OPENMP_OFFLOAD
-qmckl_exit_code
-qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
-                                 const int64_t cord_num,
-                                 const int64_t elec_num,
-                                 const int64_t nucl_num,
-                                 const int64_t walk_num,
-                                 const double* een_rescaled_e,
-                                 const double* een_rescaled_n,
-                                 double* const tmp_c );
-#endif
-     #+end_src
-
-**** cuBLAS offload                                                :noexport:
-
-#+begin_src c :comments org :tangle (eval c) :noweb yes
-#ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code
-qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
-                                    const int64_t cord_num,
-                                    const int64_t elec_num,
-                                    const int64_t nucl_num,
-                                    const int64_t walk_num,
-                                    const double* een_rescaled_e,
-                                    const double* een_rescaled_n,
-                                    double* const tmp_c )
-{
-
-  if (context == QMCKL_NULL_CONTEXT) {
-    return QMCKL_INVALID_CONTEXT;
-  }
-
-  if (cord_num < 0) {
-    return QMCKL_INVALID_ARG_2;
-  }
-
-  if (elec_num <= 0) {
-    return QMCKL_INVALID_ARG_3;
-  }
-
-  if (nucl_num <= 0) {
-    return QMCKL_INVALID_ARG_4;
-  }
-
-  //cuBLAS initialization
-  cublasHandle_t handle;
-  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
-  {
-    fprintf(stdout, "CUBLAS initialization failed!\n");
-    exit(EXIT_FAILURE);
-  }
-
-  const double alpha = 1.0;
-  const double beta  = 0.0;
-
-  const int64_t M = elec_num;
-  const int64_t N = nucl_num*(cord_num + 1);
-  const int64_t K = elec_num;
-
-  const int64_t LDA = elec_num;
-  const int64_t LDB = elec_num;
-  const int64_t LDC = elec_num;
-
-  const int64_t af = elec_num*elec_num;
-  const int64_t bf = elec_num*nucl_num*(cord_num+1);
-  const int64_t cf = bf;
-
-  #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
-  #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
-  {
-  for (int nw=0; nw < walk_num; ++nw) {
-
-    // /!\  cublasError needs to be checked and return QMCKL_FAILURE if it fails
-    int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
-                                    &(een_rescaled_e[nw*(cord_num+1)]),
-                                    LDA, af,
-                                    &(een_rescaled_n[bf*nw]),
-                                    LDB, 0,
-                                    &beta,
-                                    &(tmp_c[nw*cord_num]),
-                                    LDC, cf, cord_num);
-  }
-  }
-  #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
-
-  cublasDestroy(handle);
-  return QMCKL_SUCCESS;
-  }
-#endif
-
-#+end_src
-
-
-    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
-#ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code
-qmckl_compute_tmp_c_cublas_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e,
-      const double* een_rescaled_n,
-      double* const tmp_c );
-#endif
-     #+end_src
-
 *** Compute dtmp_c
   :PROPERTIES:
   :Name:     qmckl_compute_dtmp_c
@ -8667,304 +8284,6 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
      double* const dtmp_c );
     #+end_src

-**** OpenACC offload                                               :noexport:
-
-     #+begin_src c :comments org :tangle (eval c) :noweb yes
-#ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code
-qmckl_compute_dtmp_c_acc_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e_deriv_e,
-      const double* een_rescaled_n,
-      double* const dtmp_c ) {
-
-  if (context == QMCKL_NULL_CONTEXT) {
-     return QMCKL_INVALID_CONTEXT;
-  }
-
-  if (cord_num < 0) {
-     return QMCKL_INVALID_ARG_2;
-  }
-
-  if (elec_num <= 0) {
-     return QMCKL_INVALID_ARG_3;
-  }
-
-  if (nucl_num <= 0) {
-     return QMCKL_INVALID_ARG_4;
-  }
-
-  // Compute strides...
-  // For dtmp_c
-  const int64_t stride_l_d  = elec_num;
-  const int64_t stride_k_d  = stride_l_d * 4;
-  const int64_t stride_j_d  = stride_k_d * nucl_num;
-  const int64_t stride_i_d  = stride_j_d * (cord_num+1);
-  const int64_t stride_nw_d = stride_i_d * cord_num;
-  // For een_rescaled_e_deriv_e
-  const int64_t stride_l_e  = elec_num;
-  const int64_t stride_n_e  = stride_l_e * 4;
-  const int64_t stride_i_e  = stride_n_e * elec_num;
-  const int64_t stride_nw_e = stride_i_e * cord_num;
-  // For een_rescaled_n
-  const int64_t stride_k_n  = elec_num;
-  const int64_t stride_j_n  = stride_k_n * nucl_num;
-  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
-
-  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
-  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
-  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
-
-  #pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
-  {
-  #pragma acc loop independent gang worker vector collapse(6)
-  for (int nw=0; nw < walk_num; nw++) {
-    for (int i=0; i < cord_num; i++) {
-
-      // Single DGEMM
-      for(int j=0; j<cord_num+1; j++) {
-        for(int k=0; k<nucl_num; k++) {
-          for(int l=0; l<4; l++) {
-            for(int m=0; m<elec_num; m++) {
-
-              // Single reduction
-              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
-              for(int n=0; n<elec_num; n++){
-                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
-                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
-                een_rescaled_e_deriv_e[m + l * stride_l_e +  n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
-                een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  }
-
-  return QMCKL_SUCCESS;
-}
-#endif
-     #+end_src
-
-    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
-#ifdef HAVE_OPENACC_OFFLOAD
-    qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e_deriv_e,
-      const double* een_rescaled_n,
-      double* const dtmp_c );
-#endif
-     #+end_src
-
-**** OpenMP offload                                                :noexport:
-
-     #+begin_src c :comments org :tangle (eval c) :noweb yes
-#ifdef HAVE_OPENMP_OFFLOAD
-qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e_deriv_e,
-      const double* een_rescaled_n,
-      double* const dtmp_c ) {
-
-  if (context == QMCKL_NULL_CONTEXT) {
-     return QMCKL_INVALID_CONTEXT;
-  }
-
-  if (cord_num < 0) {
-     return QMCKL_INVALID_ARG_2;
-  }
-
-  if (elec_num <= 0) {
-     return QMCKL_INVALID_ARG_3;
-  }
-
-  if (nucl_num <= 0) {
-     return QMCKL_INVALID_ARG_4;
-  }
-
-  // Compute strides...
-  // For dtmp_c
-  const int64_t stride_l_d  = elec_num;
-  const int64_t stride_k_d  = stride_l_d * 4;
-  const int64_t stride_j_d  = stride_k_d * nucl_num;
-  const int64_t stride_i_d  = stride_j_d * (cord_num+1);
-  const int64_t stride_nw_d = stride_i_d * cord_num;
-  // For een_rescaled_e_deriv_e
-  const int64_t stride_l_e  = elec_num;
-  const int64_t stride_n_e  = stride_l_e * 4;
-  const int64_t stride_i_e  = stride_n_e * elec_num;
-  const int64_t stride_nw_e = stride_i_e * cord_num;
-  // For een_rescaled_n
-  const int64_t stride_k_n  = elec_num;
-  const int64_t stride_j_n  = stride_k_n * nucl_num;
-  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
-
-
-  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
-  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
-  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
-
-  // WARNING This implementation seems unomptimized
-  #pragma omp target map(from:dtmp_c[0:size_dtmp_c]) map(to:een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
-  {
-
-  #pragma omp teams distribute parallel for collapse(6)
-  for (int nw=0; nw < walk_num; nw++) {
-    for (int i=0; i < cord_num; i++) {
-
-      // Single DGEMM
-      for(int j=0; j<cord_num+1; j++) {
-        for(int k=0; k<nucl_num; k++) {
-          for(int l=0; l<4; l++) {
-            for(int m=0; m<elec_num; m++) {
-
-              // Single reduction
-              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
-              for(int n=0; n<elec_num; n++){
-                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
-                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
-                een_rescaled_e_deriv_e[m + l * stride_l_e +  n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
-                een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  }
-
-  return QMCKL_SUCCESS;
-}
-#endif
-     #+end_src
-
-    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
-#ifdef HAVE_OPENMP_OFFLOAD
-    qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e_deriv_e,
-      const double* een_rescaled_n,
-      double* const dtmp_c );
-#endif
-     #+end_src
-
-**** cuBLAS offload                                                :noexport:
-
-     #+begin_src c :comments org :tangle (eval c) :noweb yes
-#ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code
-qmckl_compute_dtmp_c_cublas_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e_deriv_e,
-      const double* een_rescaled_n,
-      double* const dtmp_c ) {
-
-  if (context == QMCKL_NULL_CONTEXT) {
-    return QMCKL_INVALID_CONTEXT;
-  }
-
-  if (cord_num < 0) {
-    return QMCKL_INVALID_ARG_2;
-  }
-
-  if (elec_num <= 0) {
-    return QMCKL_INVALID_ARG_3;
-  }
-
-  if (nucl_num <= 0) {
-    return QMCKL_INVALID_ARG_4;
-  }
-
-  if (walk_num <= 0) {
-    return QMCKL_INVALID_ARG_5;
-  }
-
-  qmckl_exit_code  info = QMCKL_SUCCESS;
-
-  //cuBLAS initialization
-  cublasHandle_t handle;
-  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
-  {
-    fprintf(stdout, "CUBLAS initialization failed!\n");
-    exit(EXIT_FAILURE);
-  }
-
-  const double alpha = 1.0;
-  const double beta  = 0.0;
-
-  const int64_t M = 4*elec_num;
-  const int64_t N = nucl_num*(cord_num + 1);
-  const int64_t K = elec_num;
-
-  const int64_t LDA = 4*elec_num;
-  const int64_t LDB = elec_num;
-  const int64_t LDC = 4*elec_num;
-
-  const int64_t af = elec_num*elec_num*4;
-  const int64_t bf = elec_num*nucl_num*(cord_num+1);
-  const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
-
-  #pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num])
-  #pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
-  {
-  for (int64_t nw=0; nw < walk_num; ++nw) {
-    int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
-                                      &(een_rescaled_e_deriv_e[(nw*(cord_num+1))]),
-                                      LDA, af,
-                                      &(een_rescaled_n[bf*nw]), LDB, 0,
-                                      &beta,
-                                      &(dtmp_c[(nw*cord_num)]),
-                                      LDC, cf, cord_num);
-
-  }
-  }
-
-  #pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num])
-
-  cublasDestroy(handle);
-  return info;
-}
-#endif
-     #+end_src
-
-     #+RESULTS:
-    #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
-#ifdef HAVE_CUBLAS_OFFLOAD
-    qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
-      const qmckl_context context,
-      const int64_t cord_num,
-      const int64_t elec_num,
-      const int64_t nucl_num,
-      const int64_t walk_num,
-      const double* een_rescaled_e_deriv_e,
-      const double* een_rescaled_n,
-      double* const dtmp_c );
-#endif
-     #+end_src
-
 *** Test

    #+name: helper_funcs