From 39bcc569e0d6d7f5c9fc35f26919f47a8886b83d Mon Sep 17 00:00:00 2001 From: hoffer Date: Wed, 6 Apr 2022 11:16:17 +0200 Subject: [PATCH 1/4] Start implementing cublas --- org/qmckl_jastrow.org | 137 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 3 deletions(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index ffbf713..1cbd030 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -108,6 +108,12 @@ int main() { #include #include + +#include +#include "cublas_v2.h" + + + #include #include "qmckl.h" @@ -4857,7 +4863,7 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) } ctx->jastrow.tmp_c = tmp_c; } - +/* qmckl_exit_code rc = qmckl_compute_tmp_c(context, ctx->jastrow.cord_num, @@ -4870,6 +4876,20 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) if (rc != QMCKL_SUCCESS) { return rc; } +,*/ + qmckl_exit_code rc = + qmckl_compute_tmp_c_cuBlas(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.tmp_c); + if (rc != QMCKL_SUCCESS) { + return rc; + } + ctx->jastrow.tmp_c_date = ctx->date; } @@ -4899,7 +4919,7 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) qmckl_memory_info_struct mem_info = qmckl_memory_info_struct_zero; mem_info.size = (ctx->jastrow.cord_num) * (ctx->jastrow.cord_num + 1) - * 4 * ctx->electron.num * ctx->nucleus.num * ctx->electron.walk_num * sizeof(double); + ,* 4 * ctx->electron.num * ctx->nucleus.num * ctx->electron.walk_num * sizeof(double); double* dtmp_c = (double*) qmckl_malloc(context, mem_info); if (dtmp_c == NULL) { @@ -4910,7 +4930,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) } ctx->jastrow.dtmp_c = dtmp_c; } - qmckl_exit_code rc = qmckl_compute_dtmp_c(context, ctx->jastrow.cord_num, @@ -4924,6 +4943,7 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) return rc; } + ctx->jastrow.dtmp_c_date = ctx->date; } @@ -5453,6 +5473,105 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc ( } #+end_src +#+begin_src c :comments org :tangle (eval c) :noweb yes +qmckl_exit_code qmckl_compute_tmp_c_cuBlas ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ) { + + qmckl_exit_code info; + + //Initialisation of cublas + + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) + { + fprintf(stdout, "CUBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + + + + if (context == QMCKL_NULL_CONTEXT) { + return QMCKL_INVALID_CONTEXT; + } + + if (cord_num <= 0) { + return QMCKL_INVALID_ARG_2; + } + + if (elec_num <= 0) { + return QMCKL_INVALID_ARG_3; + } + + if (nucl_num <= 0) { + return QMCKL_INVALID_ARG_4; + } + + const double alpha = 1.0; + const double beta = 0.0; + + const int64_t M = elec_num; + const int64_t N = nucl_num*(cord_num + 1); + const int64_t K = elec_num; + + const int64_t LDA = elec_num; + const int64_t LDB = elec_num; + const int64_t LDC = elec_num; + + const int64_t af = elec_num*elec_num; + const int64_t bf = elec_num*nucl_num*(cord_num+1); + const int64_t cf = bf; + + const double* tmp_c_gpu = malloc(sizeof(tmp_c)); + + #pragma omp target enter data map(alloc:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*K],tmp_c_gpu[0:sizeof(tmp_c_gpu)/sizeof(double)]) + #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) + { + for (int nw=0; nw < walk_num; ++nw) { + for (int i=0; i Date: Wed, 6 Apr 2022 16:20:29 +0200 Subject: [PATCH 2/4] Add openmp and cublas --- org/qmckl_jastrow.org | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index 1cbd030..2ac3438 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -5528,11 +5528,11 @@ qmckl_exit_code qmckl_compute_tmp_c_cuBlas ( const int64_t bf = elec_num*nucl_num*(cord_num+1); const int64_t cf = bf; - const double* tmp_c_gpu = malloc(sizeof(tmp_c)); - - #pragma omp target enter data map(alloc:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*K],tmp_c_gpu[0:sizeof(tmp_c_gpu)/sizeof(double)]) + #pragma omp target enter data map(alloc:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) { + + for (int nw=0; nw < walk_num; ++nw) { for (int i=0; i Date: Wed, 6 Apr 2022 17:04:00 +0200 Subject: [PATCH 3/4] Ok for openmp and Cublas --- org/qmckl_jastrow.org | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index 2ac3438..5d164ad 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -5528,7 +5528,7 @@ qmckl_exit_code qmckl_compute_tmp_c_cuBlas ( const int64_t bf = elec_num*nucl_num*(cord_num+1); const int64_t cf = bf; - #pragma omp target enter data map(alloc:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) + #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) { From 7dc02571e92cdece74e64d368eff4a55a4f752c4 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 7 Apr 2022 13:33:50 +0200 Subject: [PATCH 4/4] Fix build --- org/qmckl_jastrow.org | 742 +++++++++++++++++++----------------------- 1 file changed, 334 insertions(+), 408 deletions(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index 269d3fd..b9981b5 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -109,11 +109,6 @@ int main() { #include -#include -#include "cublas_v2.h" - - - #include #include "qmckl.h" @@ -122,6 +117,13 @@ int main() { #include "qmckl_memory_private_func.h" #include "qmckl_jastrow_private_func.h" #include "qmckl_jastrow_private_type.h" + +#ifdef HAVE_CUBLAS_OFFLOAD +#include +#include "cublas_v2.h" +#endif + + #+end_src * Context @@ -1123,7 +1125,7 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) { #if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD)) ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100; #endif - + qmckl_exit_code rc = QMCKL_SUCCESS; return rc; @@ -1517,7 +1519,7 @@ qmckl_exit_code qmckl_compute_asymp_jasb ( const int64_t bord_num, const double* bord_vector, const double rescale_factor_kappa_ee, - double* const asymp_jasb ); + double* const asymp_jasb ); #+end_src @@ -1808,21 +1810,21 @@ qmckl_exit_code qmckl_compute_factor_ee ( int ipar; // can we use a smaller integer? double x, x1, spin_fact, power_ser; - if (context == QMCKL_NULL_CONTEXT) { + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_2; } - + if (elec_num <= 0) { return QMCKL_INVALID_ARG_3; - } + } if (bord_num <= 0) { return QMCKL_INVALID_ARG_4; - } + } for (int nw = 0; nw < walk_num; ++nw) { factor_ee[nw] = 0.0; // put init array here. @@ -1833,9 +1835,9 @@ qmckl_exit_code qmckl_compute_factor_ee ( x1 = x; power_ser = 0.0; spin_fact = 1.0; - ipar = 0; // index of asymp_jasb + ipar = 0; // index of asymp_jasb - for (int p = 1; p < bord_num; ++p) { + for (int p = 1; p < bord_num; ++p) { x = x * x1; power_ser = power_ser + bord_vector[p + 1] * x; } @@ -1844,7 +1846,7 @@ qmckl_exit_code qmckl_compute_factor_ee ( spin_fact = 0.5; ipar = 1; } - + factor_ee[nw] = factor_ee[nw] + spin_fact * bord_vector[0] * \ x1 / \ (1.0 + bord_vector[1] * \ @@ -1860,7 +1862,7 @@ qmckl_exit_code qmckl_compute_factor_ee ( #+end_src # #+CALL: generate_c_header(table=qmckl_factor_ee_args,rettyp=get_value("CRetType"),fname=get_value("Name")) - + #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none qmckl_exit_code qmckl_compute_factor_ee ( const qmckl_context context, @@ -1871,7 +1873,7 @@ qmckl_exit_code qmckl_compute_factor_ee ( const double* bord_vector, const double* ee_distance_rescaled, const double* asymp_jasb, - double* const factor_ee ); + double* const factor_ee ); #+end_src @@ -2183,7 +2185,7 @@ end function qmckl_compute_factor_ee_deriv_e_f const double* ee_distance_rescaled, const double* ee_distance_rescaled_deriv_e, const double* asymp_jasb, - double* const factor_ee_deriv_e ); + double* const factor_ee_deriv_e ); #+end_src @@ -2457,7 +2459,7 @@ qmckl_exit_code qmckl_provide_factor_en(qmckl_context context) if (rc != QMCKL_SUCCESS) { return rc; } - + ctx->jastrow.factor_en_date = ctx->date; } @@ -2556,7 +2558,7 @@ integer function qmckl_compute_factor_en_f( & end function qmckl_compute_factor_en_f #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes qmckl_exit_code qmckl_compute_factor_en ( @@ -2625,7 +2627,7 @@ qmckl_exit_code qmckl_compute_factor_en ( x1 = x; power_ser = 0.0; - for (int p = 2; p < aord_num+1; ++p) { + for (int p = 2; p < aord_num+1; ++p) { x = x * x1; power_ser = power_ser + aord_vector[(p+1)-1 + (type_nucl_vector[a]-1) * aord_num] * x; } @@ -2656,7 +2658,7 @@ qmckl_exit_code qmckl_compute_factor_en ( const int64_t aord_num, const double* aord_vector, const double* en_distance_rescaled, - double* const factor_en ); + double* const factor_en ); #+end_src @@ -2950,7 +2952,7 @@ end function qmckl_compute_factor_en_deriv_e_f const double* aord_vector, const double* en_distance_rescaled, const double* en_distance_rescaled_deriv_e, - double* const factor_en_deriv_e ); + double* const factor_en_deriv_e ); #+end_src @@ -3343,7 +3345,7 @@ end function qmckl_compute_een_rescaled_e_doc_f const int64_t cord_num, const double rescale_factor_kappa_ee, const double* ee_distance, - double* const een_rescaled_e ); + double* const een_rescaled_e ); #+end_src #+CALL: generate_c_interface(table=qmckl_factor_een_rescaled_e_args,rettyp=get_value("CRetType"),fname="qmckl_compute_een_rescaled_e_doc") @@ -3382,13 +3384,13 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( const double rescale_factor_kappa_ee, const double* ee_distance, double* const een_rescaled_e ) { - - double *een_rescaled_e_ij; + + double *een_rescaled_e_ij; double x; const int64_t elec_pairs = (elec_num * (elec_num - 1)) / 2; const int64_t len_een_ij = elec_pairs * (cord_num + 1); - int64_t k; - + int64_t k; + // number of element for the een_rescaled_e_ij[N_e*(N_e-1)/2][cord+1] // probably in C is better [cord+1, Ne*(Ne-1)/2] //elec_pairs = (elec_num * (elec_num - 1)) / 2; @@ -3397,7 +3399,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_2; @@ -3412,8 +3414,8 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( } // Prepare table of exponentiated distances raised to appropriate power - // init - + // init + for (int kk = 0; kk < walk_num*(cord_num+1)*elec_num*elec_num; ++kk) { een_rescaled_e[kk]= 0.0; } @@ -3431,14 +3433,14 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( */ for (int nw = 0; nw < walk_num; ++nw) { - + for (int kk = 0; kk < len_een_ij; ++kk) { // this array initialized at 0 except een_rescaled_e_ij(:, 1) = 1.0d0 // and the arrangement of indices is [cord_num+1, ne*(ne-1)/2] een_rescaled_e_ij[kk]= ( kk < (elec_pairs) ? 1.0 : 0.0 ); } - k = 0; + k = 0; for (int i = 0; i < elec_num; ++i) { for (int j = 0; j < i; ++j) { // een_rescaled_e_ij(k, 2) = dexp(-rescale_factor_kappa_ee * ee_distance(i, j, nw)); @@ -3456,7 +3458,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( een_rescaled_e_ij[k + elec_pairs]; } } - + // prepare the actual een table for (int i = 0; i < elec_num; ++i){ @@ -3464,7 +3466,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( een_rescaled_e[j + i*elec_num + 0 + nw*(cord_num+1)*elec_num*elec_num] = 1.0; } } - + // Up to here it should work. for ( int l = 1; l < (cord_num+1); ++l) { k = 0; @@ -3487,7 +3489,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( } free(een_rescaled_e_ij); - + return QMCKL_SUCCESS; } #+end_src @@ -3526,7 +3528,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( const double* ee_distance, double* const een_rescaled_e ); #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes qmckl_exit_code qmckl_compute_een_rescaled_e ( const qmckl_context context, @@ -3854,7 +3856,7 @@ end function qmckl_compute_factor_een_rescaled_e_deriv_e_f const double* coord_new, const double* ee_distance, const double* een_rescaled_e, - double* const een_rescaled_e_deriv_e ); + double* const een_rescaled_e_deriv_e ); #+end_src @@ -4213,7 +4215,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_n ( if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_2; @@ -4274,7 +4276,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_n ( const int64_t cord_num, const double rescale_factor_kappa_en, const double* en_distance, - double* const een_rescaled_n ); + double* const een_rescaled_n ); #+end_src *** Test @@ -4583,7 +4585,7 @@ end function qmckl_compute_factor_een_rescaled_n_deriv_e_f const double* coord, const double* en_distance, const double* een_rescaled_n, - double* const een_rescaled_n_deriv_e ); + double* const een_rescaled_n_deriv_e ); #+end_src #+CALL: generate_c_interface(table=qmckl_compute_factor_een_rescaled_n_deriv_e_args,rettyp=get_value("CRetType"),fname=get_value("Name")) @@ -5032,8 +5034,8 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) #else const bool gpu_offload = false; #endif - - if (gpu_offload) { + + if (gpu_offload) { #ifdef HAVE_CUBLAS_OFFLOAD rc = qmckl_compute_tmp_c_cublas_offload(context, ctx->jastrow.cord_num, @@ -5074,7 +5076,7 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) ctx->jastrow.een_rescaled_n, ctx->jastrow.tmp_c); } - + ctx->jastrow.tmp_c_date = ctx->date; } @@ -5121,8 +5123,8 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) #else const bool gpu_offload = false; #endif - - if (gpu_offload) { + + if (gpu_offload) { #ifdef HAVE_CUBLAS_OFFLOAD rc = qmckl_compute_dtmp_c_cublas_offload(context, ctx->jastrow.cord_num, @@ -5238,10 +5240,10 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( const qmckl_context context, const int64_t cord_num, int64_t* const dim_cord_vect){ - + int lmax; - + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } @@ -5251,7 +5253,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( } *dim_cord_vect = 0; - + for (int p=2; p <= cord_num; ++p){ for (int k=p-1; k >= 0; --k) { if (k != 0) { @@ -5265,7 +5267,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( } } } - + return QMCKL_SUCCESS; } #+end_src @@ -5276,7 +5278,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( qmckl_exit_code qmckl_compute_dim_cord_vect ( const qmckl_context context, const int64_t cord_num, - int64_t* const dim_cord_vect ); + int64_t* const dim_cord_vect ); #+end_src @@ -5541,15 +5543,15 @@ qmckl_exit_code qmckl_compute_lkpm_combined_index ( int kk, lmax, m; - if (context == QMCKL_NULL_CONTEXT) { + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } - if (cord_num <= 0) { + if (cord_num <= 0) { return QMCKL_INVALID_ARG_2; } - if (dim_cord_vect <= 0) { + if (dim_cord_vect <= 0) { return QMCKL_INVALID_ARG_3; } @@ -5586,7 +5588,7 @@ qmckl_exit_code qmckl_compute_lkpm_combined_index ( const qmckl_context context, const int64_t cord_num, const int64_t dim_cord_vect, - int64_t* const lkpm_combined_index ); + int64_t* const lkpm_combined_index ); #+end_src @@ -5627,7 +5629,7 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context, #endif } #+end_src - + # #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c") #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none @@ -5639,7 +5641,7 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context, const int64_t walk_num, const double* een_rescaled_e, const double* een_rescaled_n, - double* const tmp_c ); + double* const tmp_c ); #+end_src #+begin_src f90 :comments org :tangle (eval f) :noweb yes @@ -5719,11 +5721,11 @@ qmckl_exit_code qmckl_compute_tmp_c_doc ( const int64_t walk_num, const double* een_rescaled_e, const double* een_rescaled_n, - double* const tmp_c ); + double* const tmp_c ); #+end_src #+CALL: generate_c_interface(table=qmckl_factor_tmp_c_args,rettyp=get_value("FRetType"),fname="qmckl_compute_tmp_c_doc") - + #+RESULTS: #+begin_src f90 :tangle (eval f) :comments org :exports none integer(c_int32_t) function qmckl_compute_tmp_c_doc & @@ -5768,19 +5770,19 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc ( if (cord_num <= 0) { return QMCKL_INVALID_ARG_2; - } + } if (elec_num <= 0) { return QMCKL_INVALID_ARG_3; - } + } if (nucl_num <= 0) { return QMCKL_INVALID_ARG_4; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_5; - } + } qmckl_exit_code info = QMCKL_SUCCESS; @@ -5818,16 +5820,264 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc ( #+end_src + + #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c") + + #+RESULTS: + #+begin_src c :tangle (eval h_func) :comments org +qmckl_exit_code qmckl_compute_tmp_c ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ); + #+end_src + +# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c_doc") + + #+RESULTS: + #+begin_src c :tangle (eval h_private_func) :comments org +qmckl_exit_code qmckl_compute_tmp_c_doc ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ); + #+end_src + +# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c_hpc") + + #+RESULTS: + + #+begin_src c :tangle (eval h_private_func) :comments org +qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ); + #+end_src + +**** OpenACC offload :noexport: + + #+begin_src c :comments org :tangle (eval c) :noweb yes +#ifdef HAVE_OPENACC_OFFLOAD +qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ) +{ + + if (context == QMCKL_NULL_CONTEXT) { + return QMCKL_INVALID_CONTEXT; + } + + if (cord_num <= 0) { + return QMCKL_INVALID_ARG_2; + } + + if (elec_num <= 0) { + return QMCKL_INVALID_ARG_3; + } + + if (nucl_num <= 0) { + return QMCKL_INVALID_ARG_4; + } + + // Compute array access strides: + // For tmp_c... + const int64_t stride_k_c = elec_num; + const int64_t stride_j_c = stride_k_c * nucl_num; + const int64_t stride_i_c = stride_j_c * (cord_num+1); + const int64_t stride_nw_c = stride_i_c * cord_num; + // For een_rescaled_e... + const int64_t stride_m_e = elec_num; + const int64_t stride_i_e = stride_m_e * elec_num; + const int64_t stride_nw_e = stride_i_e * (cord_num+1); + // For een_rescaled_n... + const int64_t stride_k_n = elec_num; + const int64_t stride_j_n = stride_k_n * nucl_num; + const int64_t stride_nw_n = stride_j_n * (cord_num+1); + + const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num; + const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num; + const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; + +#pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n]) + { +#pragma acc loop independent gang worker vector + for (int64_t i=0 ; i