From 69b9e0fb894b861b4cf9ae79013bac705f8bba65 Mon Sep 17 00:00:00 2001 From: hoffer Date: Thu, 7 Apr 2022 18:44:59 +0200 Subject: [PATCH 1/2] Add cublas batched --- org/qmckl_jastrow.org | 195 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 159 insertions(+), 36 deletions(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index 269d3fd..e13498e 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -110,7 +110,7 @@ int main() { #include -#include "cublas_v2.h" +#include @@ -5032,10 +5032,10 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) #else const bool gpu_offload = false; #endif - - if (gpu_offload) { + + if (gpu_offload) { #ifdef HAVE_CUBLAS_OFFLOAD - rc = qmckl_compute_tmp_c_cublas_offload(context, + rc = qmckl_compute_tmp_c_cuBlas(context, ctx->jastrow.cord_num, ctx->electron.num, ctx->nucleus.num, @@ -5074,7 +5074,7 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) ctx->jastrow.een_rescaled_n, ctx->jastrow.tmp_c); } - + ctx->jastrow.tmp_c_date = ctx->date; } @@ -5121,10 +5121,10 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) #else const bool gpu_offload = false; #endif - - if (gpu_offload) { + + if (gpu_offload) { #ifdef HAVE_CUBLAS_OFFLOAD - rc = qmckl_compute_dtmp_c_cublas_offload(context, + rc = qmckl_compute_dtmp_c_cuBlas(context, ctx->jastrow.cord_num, ctx->electron.num, ctx->nucleus.num, @@ -5829,6 +5829,93 @@ qmckl_exit_code qmckl_compute_tmp_c_cuBlas ( const double* een_rescaled_n, double* const tmp_c ) { + qmckl_exit_code info; + + //Initialisation of cublas + + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) + { + fprintf(stdout, "CUBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + + + + if (context == QMCKL_NULL_CONTEXT) { + return QMCKL_INVALID_CONTEXT; + } + + if (cord_num <= 0) { + return QMCKL_INVALID_ARG_2; + } + + if (elec_num <= 0) { + return QMCKL_INVALID_ARG_3; + } + + if (nucl_num <= 0) { + return QMCKL_INVALID_ARG_4; + } + + const double alpha = 1.0; + const double beta = 0.0; + + const int64_t M = elec_num; + const int64_t N = nucl_num*(cord_num + 1); + const int64_t K = elec_num; + + const int64_t LDA = elec_num; + const int64_t LDB = elec_num; + const int64_t LDC = elec_num; + + const int64_t af = elec_num*elec_num; + const int64_t bf = elec_num*nucl_num*(cord_num+1); + const int64_t cf = bf; + + #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) + #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) + { + for (int nw=0; nw < walk_num; ++nw) { + + int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, + &(een_rescaled_e[nw*(cord_num+1)]), \ + LDA, af, \ + &(een_rescaled_n[bf*nw]), \ + LDB, 0, \ + &beta, \ + &(tmp_c[nw*cord_num]), \ + LDC, cf, cord_num); + + //Manage cublas ERROR + if(cublasError != CUBLAS_STATUS_SUCCESS){ + printf("CUBLAS ERROR %d", cublasError); + info = QMCKL_FAILURE; + }else{ + info = QMCKL_SUCCESS; + } + } + } + cublasDestroy(handle); + #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) + + return info; + } +#+end_src + + + +#+begin_src c :comments org :tangle (eval c) :noweb yes +qmckl_exit_code qmckl_compute_tmp_c_cuBlas_batched ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ) { + qmckl_exit_code info; //Initialisation of cublas @@ -6708,40 +6795,47 @@ qmckl_exit_code qmckl_compute_dtmp_c_omp_offload ( #+begin_src c :comments org :tangle (eval c) :noweb yes #ifdef HAVE_CUBLAS_OFFLOAD -qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload ( - const qmckl_context context, - const int64_t cord_num, - const int64_t elec_num, - const int64_t nucl_num, - const int64_t walk_num, - const double* een_rescaled_e_deriv_e, - const double* een_rescaled_n, - double* const dtmp_c ) { +qmckl_exit_code qmckl_compute_dtmp_c_cuBlas (const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e_deriv_e, + const double* een_rescaled_n, + double* const dtmp_c ) +{ + + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) + { + fprintf(stdout, "CUBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + + if (context == QMCKL_NULL_CONTEXT) { - return QMCKL_INVALID_CONTEXT; + return QMCKL_INVALID_CONTEXT; } if (cord_num <= 0) { - return QMCKL_INVALID_ARG_2; + return QMCKL_INVALID_ARG_2; } if (elec_num <= 0) { - return QMCKL_INVALID_ARG_3; + return QMCKL_INVALID_ARG_3; } if (nucl_num <= 0) { - return QMCKL_INVALID_ARG_4; + return QMCKL_INVALID_ARG_4; } if (walk_num <= 0) { - return QMCKL_INVALID_ARG_5; + return QMCKL_INVALID_ARG_5; } qmckl_exit_code info = QMCKL_SUCCESS; - const char TransA = 'N'; - const char TransB = 'N'; const double alpha = 1.0; const double beta = 0.0; @@ -6757,19 +6851,48 @@ qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload ( const int64_t bf = elec_num*nucl_num*(cord_num+1); const int64_t cf = elec_num*4*nucl_num*(cord_num+1); - // TODO Replace with calls to cuBLAS - for (int64_t nw=0; nw < walk_num; ++nw) { - for (int64_t i=0; i < cord_num; ++i) { - info = qmckl_dgemm(context, TransA, TransB, M, N, K, alpha, \ - &(een_rescaled_e_deriv_e[af*(i+nw*(cord_num+1))]), \ - LDA, \ - &(een_rescaled_n[bf*nw]), \ - LDB, \ - beta, \ - &(dtmp_c[cf*(i+nw*cord_num)]), \ - LDC); +#pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num]) +#pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c) + { + for (int64_t nw=0; nw < walk_num; ++nw) { + /* + for (int64_t i=0; i < cord_num; ++i) { + int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, \ + &(een_rescaled_e_deriv_e[af*(i+nw*(cord_num+1))]), \ + LDA, \ + &(een_rescaled_n[bf*nw]), \ + LDB, \ + &beta, \ + &(dtmp_c[cf*(i+nw*cord_num)]), \ + LDC); + ,*/ + //Manage CUBLAS ERRORS + + int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, \ + &(een_rescaled_e_deriv_e[(nw*(cord_num+1))]), \ + LDA, af, \ + &(een_rescaled_n[bf*nw]), \ + LDB, 0, \ + &beta, \ + &(dtmp_c[(nw*cord_num)]), \ + LDC, cf, cord_num); + + + if(cublasError != CUBLAS_STATUS_SUCCESS){ + printf("CUBLAS ERROR %d", cublasError); + info = QMCKL_FAILURE; + return info; + }else{ + info = QMCKL_SUCCESS; + } + + //} } } + cudaDeviceSynchronize(); + cublasDestroy(handle); + +#pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num]) return info; } @@ -6779,7 +6902,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload ( #+RESULTS: #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none #ifdef HAVE_CUBLAS_OFFLOAD - qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload ( + qmckl_exit_code qmckl_compute_dtmp_c_cuBlas ( const qmckl_context context, const int64_t cord_num, const int64_t elec_num, From d4f0ccee3b28d30319afc22e74f923ce5e4a4930 Mon Sep 17 00:00:00 2001 From: hoffer Date: Fri, 8 Apr 2022 10:44:48 +0200 Subject: [PATCH 2/2] Add cublas batch Dgemm --- org/qmckl_jastrow.org | 1134 ++++++++++++++++++++--------------------- 1 file changed, 547 insertions(+), 587 deletions(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index e13498e..dc58d48 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -109,11 +109,6 @@ int main() { #include -#include -#include - - - #include #include "qmckl.h" @@ -122,6 +117,13 @@ int main() { #include "qmckl_memory_private_func.h" #include "qmckl_jastrow_private_func.h" #include "qmckl_jastrow_private_type.h" + +#ifdef HAVE_CUBLAS_OFFLOAD +#include +#include "cublas_v2.h" +#endif + + #+end_src * Context @@ -1123,7 +1125,7 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) { #if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD)) ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100; #endif - + qmckl_exit_code rc = QMCKL_SUCCESS; return rc; @@ -1517,7 +1519,7 @@ qmckl_exit_code qmckl_compute_asymp_jasb ( const int64_t bord_num, const double* bord_vector, const double rescale_factor_kappa_ee, - double* const asymp_jasb ); + double* const asymp_jasb ); #+end_src @@ -1808,21 +1810,21 @@ qmckl_exit_code qmckl_compute_factor_ee ( int ipar; // can we use a smaller integer? double x, x1, spin_fact, power_ser; - if (context == QMCKL_NULL_CONTEXT) { + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_2; } - + if (elec_num <= 0) { return QMCKL_INVALID_ARG_3; - } + } if (bord_num <= 0) { return QMCKL_INVALID_ARG_4; - } + } for (int nw = 0; nw < walk_num; ++nw) { factor_ee[nw] = 0.0; // put init array here. @@ -1833,9 +1835,9 @@ qmckl_exit_code qmckl_compute_factor_ee ( x1 = x; power_ser = 0.0; spin_fact = 1.0; - ipar = 0; // index of asymp_jasb + ipar = 0; // index of asymp_jasb - for (int p = 1; p < bord_num; ++p) { + for (int p = 1; p < bord_num; ++p) { x = x * x1; power_ser = power_ser + bord_vector[p + 1] * x; } @@ -1844,7 +1846,7 @@ qmckl_exit_code qmckl_compute_factor_ee ( spin_fact = 0.5; ipar = 1; } - + factor_ee[nw] = factor_ee[nw] + spin_fact * bord_vector[0] * \ x1 / \ (1.0 + bord_vector[1] * \ @@ -1860,7 +1862,7 @@ qmckl_exit_code qmckl_compute_factor_ee ( #+end_src # #+CALL: generate_c_header(table=qmckl_factor_ee_args,rettyp=get_value("CRetType"),fname=get_value("Name")) - + #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none qmckl_exit_code qmckl_compute_factor_ee ( const qmckl_context context, @@ -1871,7 +1873,7 @@ qmckl_exit_code qmckl_compute_factor_ee ( const double* bord_vector, const double* ee_distance_rescaled, const double* asymp_jasb, - double* const factor_ee ); + double* const factor_ee ); #+end_src @@ -2030,7 +2032,6 @@ qmckl_exit_code qmckl_provide_factor_ee_deriv_e(qmckl_context context) ctx->jastrow.bord_vector, ctx->electron.ee_distance_rescaled, ctx->electron.ee_distance_rescaled_deriv_e, - ctx->jastrow.asymp_jasb, ctx->jastrow.factor_ee_deriv_e); if (rc != QMCKL_SUCCESS) { return rc; @@ -2061,14 +2062,13 @@ qmckl_exit_code qmckl_provide_factor_ee_deriv_e(qmckl_context context) | ~bord_vector~ | ~double[bord_num+1]~ | in | List of coefficients | | ~ee_distance_rescaled~ | ~double[walk_num][elec_num][elec_num]~ | in | Electron-electron distances | | ~ee_distance_rescaled_deriv_e~ | ~double[walk_num][4][elec_num][elec_num]~ | in | Electron-electron distances | - | ~asymp_jasb~ | ~double[2]~ | in | Electron-electron distances | | ~factor_ee_deriv_e~ | ~double[walk_num][4][elec_num]~ | out | Electron-electron distances | #+begin_src f90 :comments org :tangle (eval f) :noweb yes -integer function qmckl_compute_factor_ee_deriv_e_f( & +integer function qmckl_compute_factor_ee_deriv_e_doc_f( & context, walk_num, elec_num, up_num, bord_num, & bord_vector, ee_distance_rescaled, ee_distance_rescaled_deriv_e, & - asymp_jasb, factor_ee_deriv_e) & + factor_ee_deriv_e) & result(info) use qmckl implicit none @@ -2077,10 +2077,9 @@ integer function qmckl_compute_factor_ee_deriv_e_f( & double precision , intent(in) :: bord_vector(bord_num + 1) double precision , intent(in) :: ee_distance_rescaled(elec_num, elec_num,walk_num) double precision , intent(in) :: ee_distance_rescaled_deriv_e(4,elec_num, elec_num,walk_num) !TODO - double precision , intent(in) :: asymp_jasb(2) double precision , intent(out) :: factor_ee_deriv_e(elec_num,4,walk_num) - integer*8 :: i, j, p, ipar, nw, ii + integer*8 :: i, j, p, nw, ii double precision :: x, spin_fact, y double precision :: den, invden, invden2, invden3, xinv double precision :: lap1, lap2, lap3, third @@ -2124,7 +2123,6 @@ integer function qmckl_compute_factor_ee_deriv_e_f( & invden2 = invden * invden invden3 = invden2 * invden xinv = 1.0d0 / (x + 1.0d-18) - ipar = 1 dx(1) = ee_distance_rescaled_deriv_e(1, i, j, nw) dx(2) = ee_distance_rescaled_deriv_e(2, i, j, nw) @@ -2166,7 +2164,120 @@ integer function qmckl_compute_factor_ee_deriv_e_f( & end do end do -end function qmckl_compute_factor_ee_deriv_e_f +end function qmckl_compute_factor_ee_deriv_e_doc_f + #+end_src + + #+begin_src c :comments org :tangle (eval c) :noweb yes +qmckl_exit_code qmckl_compute_factor_ee_deriv_e_hpc( + const qmckl_context context, + const int64_t walk_num, + const int64_t elec_num, + const int64_t up_num, + const int64_t bord_num, + const double* bord_vector, + const double* ee_distance_rescaled, + const double* ee_distance_rescaled_deriv_e, + double* const factor_ee_deriv_e ) { + + int64_t ii; + double pow_ser_g[3]; + double dx[4]; + double x, spin_fact, y; + double den, invden, invden2, invden3, xinv; + double lap1, lap2, lap3, third; + + if (context == QMCKL_NULL_CONTEXT) { + return QMCKL_INVALID_CONTEXT; + } + + if (walk_num <= 0) { + return QMCKL_INVALID_ARG_2; + } + + if (elec_num <= 0) { + return QMCKL_INVALID_ARG_3; + } + + if (bord_num <= 0) { + return QMCKL_INVALID_ARG_4; + } + + + for (int nw = 0; nw < walk_num; ++nw) { + for (int ii = 0; ii < 4; ++ii) { + for (int j = 0; j < elec_num; ++j) { + factor_ee_deriv_e[j + ii * elec_num + nw * elec_num * 4] = 0.0; + } + } + } + + third = 1.0 / 3.0; + + for (int nw = 0; nw < walk_num; ++nw) { + for (int i = 0; i < elec_num; ++i) { + for (int j = 0; j < elec_num; ++j) { + x = ee_distance_rescaled[j + i * elec_num + nw * elec_num * elec_num]; + if (fabs(x) < 1.0e-18) continue; + for (int ii = 0; ii < 3; ++ii){ + pow_ser_g[ii] = 0.0; + } + spin_fact = 1.0; + den = 1.0 + bord_vector[1] * x; + invden = 1.0 / den; + invden2 = invden * invden; + invden3 = invden2 * invden; + xinv = 1.0 / (x + 1.0e-18); + + dx[0] = ee_distance_rescaled_deriv_e[0 \ + + j * 4 + i * 4 * elec_num \ + + nw * 4 * elec_num * elec_num]; + dx[1] = ee_distance_rescaled_deriv_e[1 \ + + j * 4 + i * 4 * elec_num \ + + nw * 4 * elec_num * elec_num]; + dx[2] = ee_distance_rescaled_deriv_e[2 \ + + j * 4 + i * 4 * elec_num \ + + nw * 4 * elec_num * elec_num]; + dx[3] = ee_distance_rescaled_deriv_e[3 \ + + j * 4 + i * 4 * elec_num \ + + nw * 4 * elec_num * elec_num]; + + if((i <= (up_num-1) && j <= (up_num-1) ) || (i > (up_num-1) && j > (up_num-1))) { + spin_fact = 0.5; + } + + lap1 = 0.0; + lap2 = 0.0; + lap3 = 0.0; + for (int ii = 0; ii < 3; ++ii) { + x = ee_distance_rescaled[j + i * elec_num + nw * elec_num * elec_num]; + if (fabs(x) < 1.0e-18) continue; + for (int p = 2; p < bord_num+1; ++p) { + y = p * bord_vector[(p-1) + 1] * x; + pow_ser_g[ii] = pow_ser_g[ii] + y * dx[ii]; + lap1 = lap1 + (p - 1) * y * xinv * dx[ii] * dx[ii]; + lap2 = lap2 + y; + x = x * ee_distance_rescaled[j + i * elec_num + nw * elec_num * elec_num]; + } + + lap3 = lap3 - 2.0 * bord_vector[1] * dx[ii] * dx[ii]; + + factor_ee_deriv_e[i + ii * elec_num + nw * elec_num * 4 ] += \ + + spin_fact * bord_vector[0] * dx[ii] * invden2 \ + + pow_ser_g[ii] ; + } + + ii = 3; + lap2 = lap2 * dx[ii] * third; + lap3 = lap3 + den * dx[ii]; + lap3 = lap3 * (spin_fact * bord_vector[0] * invden3); + factor_ee_deriv_e[i + ii*elec_num + nw * elec_num * 4] += lap1 + lap2 + lap3; + + } + } + } + + return QMCKL_SUCCESS; +} #+end_src # #+CALL: generate_c_header(table=qmckl_factor_ee_deriv_e_args,rettyp=get_value("CRetType"),fname=get_value("Name")) @@ -2182,17 +2293,16 @@ end function qmckl_compute_factor_ee_deriv_e_f const double* bord_vector, const double* ee_distance_rescaled, const double* ee_distance_rescaled_deriv_e, - const double* asymp_jasb, - double* const factor_ee_deriv_e ); + double* const factor_ee_deriv_e ); #+end_src - #+CALL: generate_c_interface(table=qmckl_factor_ee_deriv_e_args,rettyp=get_value("CRetType"),fname=get_value("Name")) + #+CALL: generate_c_interface(table=qmckl_factor_ee_deriv_e_args,rettyp=get_value("CRetType"),fname="qmckl_compute_factor_ee_deriv_e_doc") #+RESULTS: #+begin_src f90 :tangle (eval f) :comments org :exports none - integer(c_int32_t) function qmckl_compute_factor_ee_deriv_e & - (context, & +integer(c_int32_t) function qmckl_compute_factor_ee_deriv_e_doc & + (context, & walk_num, & elec_num, & up_num, & @@ -2200,7 +2310,6 @@ end function qmckl_compute_factor_ee_deriv_e_f bord_vector, & ee_distance_rescaled, & ee_distance_rescaled_deriv_e, & - asymp_jasb, & factor_ee_deriv_e) & bind(C) result(info) @@ -2215,12 +2324,11 @@ end function qmckl_compute_factor_ee_deriv_e_f real (c_double ) , intent(in) :: bord_vector(bord_num+1) real (c_double ) , intent(in) :: ee_distance_rescaled(elec_num,elec_num,walk_num) real (c_double ) , intent(in) :: ee_distance_rescaled_deriv_e(elec_num,elec_num,4,walk_num) - real (c_double ) , intent(in) :: asymp_jasb(2) real (c_double ) , intent(out) :: factor_ee_deriv_e(elec_num,4,walk_num) - integer(c_int32_t), external :: qmckl_compute_factor_ee_deriv_e_f - info = qmckl_compute_factor_ee_deriv_e_f & - (context, & + integer(c_int32_t), external :: qmckl_compute_factor_ee_deriv_e_doc_f + info = qmckl_compute_factor_ee_deriv_e_doc_f & + (context, & walk_num, & elec_num, & up_num, & @@ -2228,11 +2336,60 @@ end function qmckl_compute_factor_ee_deriv_e_f bord_vector, & ee_distance_rescaled, & ee_distance_rescaled_deriv_e, & - asymp_jasb, & factor_ee_deriv_e) - end function qmckl_compute_factor_ee_deriv_e + end function qmckl_compute_factor_ee_deriv_e_doc #+end_src + + #+begin_src c :tangle (eval h_private_func) :comments org + qmckl_exit_code qmckl_compute_factor_ee_deriv_e_hpc ( + const qmckl_context context, + const int64_t walk_num, + const int64_t elec_num, + const int64_t up_num, + const int64_t bord_num, + const double* bord_vector, + const double* ee_distance_rescaled, + const double* ee_distance_rescaled_deriv_e, + double* const factor_ee_deriv_e ); + #+end_src + + #+begin_src c :tangle (eval h_private_func) :comments org + qmckl_exit_code qmckl_compute_factor_ee_deriv_e_doc ( + const qmckl_context context, + const int64_t walk_num, + const int64_t elec_num, + const int64_t up_num, + const int64_t bord_num, + const double* bord_vector, + const double* ee_distance_rescaled, + const double* ee_distance_rescaled_deriv_e, + double* const factor_ee_deriv_e ); + #+end_src + + #+begin_src c :comments org :tangle (eval c) :noweb yes + qmckl_exit_code qmckl_compute_factor_ee_deriv_e ( + const qmckl_context context, + const int64_t walk_num, + const int64_t elec_num, + const int64_t up_num, + const int64_t bord_num, + const double* bord_vector, + const double* ee_distance_rescaled, + const double* ee_distance_rescaled_deriv_e, + double* const factor_ee_deriv_e ) { + + #ifdef HAVE_HPC + return qmckl_compute_factor_ee_deriv_e_hpc(context, walk_num, elec_num, up_num, bord_num, bord_vector, ee_distance_rescaled, ee_distance_rescaled_deriv_e, factor_ee_deriv_e ); + #else + return qmckl_compute_factor_ee_deriv_e_doc(context, walk_num, elec_num, up_num, bord_num, bord_vector, ee_distance_rescaled, ee_distance_rescaled_deriv_e, factor_ee_deriv_e ); + #endif +} + #+end_src + + + + *** Test #+begin_src python :results output :exports none :noweb yes @@ -2351,7 +2508,6 @@ assert(fabs(factor_ee_deriv_e[0][0][0]-0.16364894652107934) < 1.e-12); assert(fabs(factor_ee_deriv_e[0][1][0]+0.6927548119830084 ) < 1.e-12); assert(fabs(factor_ee_deriv_e[0][2][0]-0.073267755223968 ) < 1.e-12); assert(fabs(factor_ee_deriv_e[0][3][0]-1.5111672803213185 ) < 1.e-12); - #+end_src ** Electron-nucleus component \(f_{en}\) @@ -2457,7 +2613,7 @@ qmckl_exit_code qmckl_provide_factor_en(qmckl_context context) if (rc != QMCKL_SUCCESS) { return rc; } - + ctx->jastrow.factor_en_date = ctx->date; } @@ -2556,7 +2712,7 @@ integer function qmckl_compute_factor_en_f( & end function qmckl_compute_factor_en_f #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes qmckl_exit_code qmckl_compute_factor_en ( @@ -2625,7 +2781,7 @@ qmckl_exit_code qmckl_compute_factor_en ( x1 = x; power_ser = 0.0; - for (int p = 2; p < aord_num+1; ++p) { + for (int p = 2; p < aord_num+1; ++p) { x = x * x1; power_ser = power_ser + aord_vector[(p+1)-1 + (type_nucl_vector[a]-1) * aord_num] * x; } @@ -2656,7 +2812,7 @@ qmckl_exit_code qmckl_compute_factor_en ( const int64_t aord_num, const double* aord_vector, const double* en_distance_rescaled, - double* const factor_en ); + double* const factor_en ); #+end_src @@ -2950,7 +3106,7 @@ end function qmckl_compute_factor_en_deriv_e_f const double* aord_vector, const double* en_distance_rescaled, const double* en_distance_rescaled_deriv_e, - double* const factor_en_deriv_e ); + double* const factor_en_deriv_e ); #+end_src @@ -3343,7 +3499,7 @@ end function qmckl_compute_een_rescaled_e_doc_f const int64_t cord_num, const double rescale_factor_kappa_ee, const double* ee_distance, - double* const een_rescaled_e ); + double* const een_rescaled_e ); #+end_src #+CALL: generate_c_interface(table=qmckl_factor_een_rescaled_e_args,rettyp=get_value("CRetType"),fname="qmckl_compute_een_rescaled_e_doc") @@ -3382,13 +3538,13 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( const double rescale_factor_kappa_ee, const double* ee_distance, double* const een_rescaled_e ) { - - double *een_rescaled_e_ij; + + double *een_rescaled_e_ij; double x; const int64_t elec_pairs = (elec_num * (elec_num - 1)) / 2; const int64_t len_een_ij = elec_pairs * (cord_num + 1); - int64_t k; - + int64_t k; + // number of element for the een_rescaled_e_ij[N_e*(N_e-1)/2][cord+1] // probably in C is better [cord+1, Ne*(Ne-1)/2] //elec_pairs = (elec_num * (elec_num - 1)) / 2; @@ -3397,7 +3553,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_2; @@ -3412,8 +3568,8 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( } // Prepare table of exponentiated distances raised to appropriate power - // init - + // init + for (int kk = 0; kk < walk_num*(cord_num+1)*elec_num*elec_num; ++kk) { een_rescaled_e[kk]= 0.0; } @@ -3431,14 +3587,14 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( */ for (int nw = 0; nw < walk_num; ++nw) { - + for (int kk = 0; kk < len_een_ij; ++kk) { // this array initialized at 0 except een_rescaled_e_ij(:, 1) = 1.0d0 // and the arrangement of indices is [cord_num+1, ne*(ne-1)/2] een_rescaled_e_ij[kk]= ( kk < (elec_pairs) ? 1.0 : 0.0 ); } - k = 0; + k = 0; for (int i = 0; i < elec_num; ++i) { for (int j = 0; j < i; ++j) { // een_rescaled_e_ij(k, 2) = dexp(-rescale_factor_kappa_ee * ee_distance(i, j, nw)); @@ -3456,7 +3612,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( een_rescaled_e_ij[k + elec_pairs]; } } - + // prepare the actual een table for (int i = 0; i < elec_num; ++i){ @@ -3464,7 +3620,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( een_rescaled_e[j + i*elec_num + 0 + nw*(cord_num+1)*elec_num*elec_num] = 1.0; } } - + // Up to here it should work. for ( int l = 1; l < (cord_num+1); ++l) { k = 0; @@ -3487,7 +3643,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( } free(een_rescaled_e_ij); - + return QMCKL_SUCCESS; } #+end_src @@ -3526,7 +3682,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( const double* ee_distance, double* const een_rescaled_e ); #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes qmckl_exit_code qmckl_compute_een_rescaled_e ( const qmckl_context context, @@ -3854,7 +4010,7 @@ end function qmckl_compute_factor_een_rescaled_e_deriv_e_f const double* coord_new, const double* ee_distance, const double* een_rescaled_e, - double* const een_rescaled_e_deriv_e ); + double* const een_rescaled_e_deriv_e ); #+end_src @@ -4213,7 +4369,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_n ( if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_2; @@ -4274,7 +4430,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_n ( const int64_t cord_num, const double rescale_factor_kappa_en, const double* en_distance, - double* const een_rescaled_n ); + double* const een_rescaled_n ); #+end_src *** Test @@ -4583,7 +4739,7 @@ end function qmckl_compute_factor_een_rescaled_n_deriv_e_f const double* coord, const double* en_distance, const double* een_rescaled_n, - double* const een_rescaled_n_deriv_e ); + double* const een_rescaled_n_deriv_e ); #+end_src #+CALL: generate_c_interface(table=qmckl_compute_factor_een_rescaled_n_deriv_e_args,rettyp=get_value("CRetType"),fname=get_value("Name")) @@ -5035,7 +5191,7 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) if (gpu_offload) { #ifdef HAVE_CUBLAS_OFFLOAD - rc = qmckl_compute_tmp_c_cuBlas(context, + rc = qmckl_compute_tmp_c_cublas_offload(context, ctx->jastrow.cord_num, ctx->electron.num, ctx->nucleus.num, @@ -5124,7 +5280,7 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) if (gpu_offload) { #ifdef HAVE_CUBLAS_OFFLOAD - rc = qmckl_compute_dtmp_c_cuBlas(context, + rc = qmckl_compute_dtmp_c_cublas_offload(context, ctx->jastrow.cord_num, ctx->electron.num, ctx->nucleus.num, @@ -5238,10 +5394,10 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( const qmckl_context context, const int64_t cord_num, int64_t* const dim_cord_vect){ - + int lmax; - + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } @@ -5251,7 +5407,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( } *dim_cord_vect = 0; - + for (int p=2; p <= cord_num; ++p){ for (int k=p-1; k >= 0; --k) { if (k != 0) { @@ -5265,7 +5421,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( } } } - + return QMCKL_SUCCESS; } #+end_src @@ -5276,7 +5432,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect ( qmckl_exit_code qmckl_compute_dim_cord_vect ( const qmckl_context context, const int64_t cord_num, - int64_t* const dim_cord_vect ); + int64_t* const dim_cord_vect ); #+end_src @@ -5541,15 +5697,15 @@ qmckl_exit_code qmckl_compute_lkpm_combined_index ( int kk, lmax, m; - if (context == QMCKL_NULL_CONTEXT) { + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } - if (cord_num <= 0) { + if (cord_num <= 0) { return QMCKL_INVALID_ARG_2; } - if (dim_cord_vect <= 0) { + if (dim_cord_vect <= 0) { return QMCKL_INVALID_ARG_3; } @@ -5586,7 +5742,7 @@ qmckl_exit_code qmckl_compute_lkpm_combined_index ( const qmckl_context context, const int64_t cord_num, const int64_t dim_cord_vect, - int64_t* const lkpm_combined_index ); + int64_t* const lkpm_combined_index ); #+end_src @@ -5627,7 +5783,7 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context, #endif } #+end_src - + # #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c") #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none @@ -5639,7 +5795,7 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context, const int64_t walk_num, const double* een_rescaled_e, const double* een_rescaled_n, - double* const tmp_c ); + double* const tmp_c ); #+end_src #+begin_src f90 :comments org :tangle (eval f) :noweb yes @@ -5719,11 +5875,11 @@ qmckl_exit_code qmckl_compute_tmp_c_doc ( const int64_t walk_num, const double* een_rescaled_e, const double* een_rescaled_n, - double* const tmp_c ); + double* const tmp_c ); #+end_src #+CALL: generate_c_interface(table=qmckl_factor_tmp_c_args,rettyp=get_value("FRetType"),fname="qmckl_compute_tmp_c_doc") - + #+RESULTS: #+begin_src f90 :tangle (eval f) :comments org :exports none integer(c_int32_t) function qmckl_compute_tmp_c_doc & @@ -5768,19 +5924,19 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc ( if (cord_num <= 0) { return QMCKL_INVALID_ARG_2; - } + } if (elec_num <= 0) { return QMCKL_INVALID_ARG_3; - } + } if (nucl_num <= 0) { return QMCKL_INVALID_ARG_4; - } + } if (walk_num <= 0) { return QMCKL_INVALID_ARG_5; - } + } qmckl_exit_code info = QMCKL_SUCCESS; @@ -5818,18 +5974,259 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc ( #+end_src -#+begin_src c :comments org :tangle (eval c) :noweb yes -qmckl_exit_code qmckl_compute_tmp_c_cuBlas ( - const qmckl_context context, - const int64_t cord_num, - const int64_t elec_num, - const int64_t nucl_num, - const int64_t walk_num, - const double* een_rescaled_e, - const double* een_rescaled_n, - double* const tmp_c ) { - qmckl_exit_code info; + #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c") + + #+RESULTS: + #+begin_src c :tangle (eval h_func) :comments org +qmckl_exit_code qmckl_compute_tmp_c ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ); + #+end_src + +# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c_doc") + + #+RESULTS: + #+begin_src c :tangle (eval h_private_func) :comments org +qmckl_exit_code qmckl_compute_tmp_c_doc ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ); + #+end_src + +# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c_hpc") + + #+RESULTS: + + #+begin_src c :tangle (eval h_private_func) :comments org +qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ); + #+end_src + +**** OpenACC offload :noexport: + + #+begin_src c :comments org :tangle (eval c) :noweb yes +#ifdef HAVE_OPENACC_OFFLOAD +qmckl_exit_code +qmckl_compute_tmp_c_acc_offload (const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ) +{ + + if (context == QMCKL_NULL_CONTEXT) { + return QMCKL_INVALID_CONTEXT; + } + + if (cord_num <= 0) { + return QMCKL_INVALID_ARG_2; + } + + if (elec_num <= 0) { + return QMCKL_INVALID_ARG_3; + } + + if (nucl_num <= 0) { + return QMCKL_INVALID_ARG_4; + } + + // Compute array access strides: + // For tmp_c... + const int64_t stride_k_c = elec_num; + const int64_t stride_j_c = stride_k_c * nucl_num; + const int64_t stride_i_c = stride_j_c * (cord_num+1); + const int64_t stride_nw_c = stride_i_c * cord_num; + // For een_rescaled_e... + const int64_t stride_m_e = elec_num; + const int64_t stride_i_e = stride_m_e * elec_num; + const int64_t stride_nw_e = stride_i_e * (cord_num+1); + // For een_rescaled_n... + const int64_t stride_k_n = elec_num; + const int64_t stride_j_n = stride_k_n * nucl_num; + const int64_t stride_nw_n = stride_j_n * (cord_num+1); + + const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num; + const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num; + const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; + + #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n]) + { +#pragma acc loop independent gang worker vector collapse(5) + for (int nw=0; nw < walk_num; ++nw) { + for (int i=0; i