From ea21ec2ef75611cd0c5f05bd5d37b8d5dd88c824 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Wed, 1 Mar 2023 14:47:32 +0100 Subject: [PATCH] Removed GPU from Jastrow --- org/qmckl_jastrow.org | 729 ++---------------------------------------- 1 file changed, 24 insertions(+), 705 deletions(-) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index 8e06979..91ec082 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -20,16 +20,16 @@ \[ J_{\text{eN}}(\mathbf{r},\mathbf{R}) = \sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}} - \frac{a_{1,\alpha}\, g_\alpha(R_{i\alpha})}{1+a_{2,\alpha}\, g_\alpha(R_{i\alpha})} + - \sum_{p=2}^{N_\text{ord}^a} a_{p+1,\alpha}\, [g_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha} + \frac{a_{1\,\alpha}\, f_\alpha(R_{i\,\alpha})}{1+a_{2\,\alpha}\, f_\alpha(R_{i\alpha})} + + \sum_{p=2}^{N_\text{ord}^a} a_{p+1\,\alpha}\, [f_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha} \] $J_{\text{ee}}$ contains electron-electron terms: \[ J_{\text{ee}}(\mathbf{r}) = \sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1} - \frac{b_1\, f(r_{ij})}{1+b_2\, f(r_{ij})} + - \sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f(r_{ij})]^p - J_{ee}^\infty + \frac{\frac{1}{2}(1+\delta^{\uparrow\downarrow}_{ij}) b_1\, f_{\text{ee}}(r_{ij})}{1+b_2\, f_{\text{ee}}(r_{ij})} + + \sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f_{\text{ee}}(r_{ij})]^p - J_{ee}^\infty \] and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms: @@ -42,7 +42,7 @@ \sum_{p=2}^{N_{\text{ord}}} \sum_{k=0}^{p-1} \sum_{l=0}^{p-k-2\delta_{k,0}} - c_{lkp\alpha} \left[ f({r}_{ij}) \right]^k + c_{lkp\alpha} \left[ g_\text{ee}({r}_{ij}) \right]^k \left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right] \left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2} \] @@ -52,7 +52,7 @@ $f$ and $g$ are scaling function defined as \[ - f(r) = \frac{1-e^{-\kappa\, r}}{\kappa} \text{ and } + f_\alpha(r) = \frac{1-e^{-\kappa_\alpha\, r}}{\kappa_\alpha} \text{ and } g_\alpha(r) = e^{-\kappa_\alpha\, r}. \] @@ -118,11 +118,6 @@ int main() { #include "qmckl_jastrow_private_func.h" #include "qmckl_jastrow_private_type.h" -#ifdef HAVE_CUBLAS_OFFLOAD -#include "cublas_v2.h" -#endif - - #+end_src * Context @@ -404,13 +399,9 @@ typedef struct qmckl_jastrow_struct{ bool provided; char * type; - #ifdef HAVE_HPC - bool gpu_offload; - #endif } qmckl_jastrow_struct; #+end_src - The ~uninitialized~ integer contains one bit set to one for each initialization function which has not been called. It becomes equal to zero after all initialization functions have been called. The @@ -458,9 +449,9 @@ qmckl_exit_code qmckl_set_jastrow_bord_num (qmckl_context context, con qmckl_exit_code qmckl_set_jastrow_cord_num (qmckl_context context, const int64_t cord_num); qmckl_exit_code qmckl_set_jastrow_type_nucl_num (qmckl_context context, const int64_t type_nucl_num); qmckl_exit_code qmckl_set_jastrow_type_nucl_vector (qmckl_context context, const int64_t* type_nucl_vector, const int64_t nucl_num); -qmckl_exit_code qmckl_set_jastrow_a_vector (qmckl_context context, const double * a_vector, const int64_t size_max); -qmckl_exit_code qmckl_set_jastrow_b_vector (qmckl_context context, const double * b_vector, const int64_t size_max); -qmckl_exit_code qmckl_set_jastrow_c_vector (qmckl_context context, const double * c_vector, const int64_t size_max); +qmckl_exit_code qmckl_set_jastrow_a_vector (qmckl_context context, const double * a_vector, const int64_t size_max); +qmckl_exit_code qmckl_set_jastrow_b_vector (qmckl_context context, const double * b_vector, const int64_t size_max); +qmckl_exit_code qmckl_set_jastrow_c_vector (qmckl_context context, const double * c_vector, const int64_t size_max); #+end_src #+NAME:pre2 @@ -492,7 +483,6 @@ if (ctx->jastrow.provided) { return QMCKL_SUCCESS; #+end_src - #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none qmckl_exit_code qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num) @@ -910,7 +900,7 @@ qmckl_set_jastrow_rescale_factor_en(qmckl_context context, When the required information is completely entered, other data structures are computed to accelerate the calculations. The intermediates factors - are precontracted using BLAS LEVEL 3 operations for an optimal flop count. + are precontracted using BLAS LEVEL 3 operations. #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context); @@ -952,11 +942,6 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) { NULL); } - /* Decide if the Jastrow if offloaded on GPU or not */ -#if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD)) - ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100; -#endif - qmckl_exit_code rc; rc = qmckl_provide_jastrow_asymp_jasa(context); @@ -2016,16 +2001,16 @@ assert(fabs(asymp_jasb[1]-0.31567342786262853) < 1.e-12); #+end_src -** Electron-electron component \(f_{ee}\) +** Electron-electron component \(f_\text{ee}\) Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~ - componenet and the electron-electron rescaled distances ~ee_distance_rescaled~. + component and the electron-electron rescaled distances ~ee_distance_rescaled~. \[ -f_{ee} = \sum_{i,jjastrow.tmp_c = tmp_c; } - - /* Choose the correct compute function (depending on offload type) */ -#ifdef HAVE_HPC - const bool gpu_offload = ctx->jastrow.gpu_offload; -#else - const bool gpu_offload = false; -#endif - - if (gpu_offload) { -#ifdef HAVE_CUBLAS_OFFLOAD - rc = qmckl_compute_tmp_c_cublas_offload(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.tmp_c); -#elif HAVE_OPENACC_OFFLOAD - rc = qmckl_compute_tmp_c_acc_offload(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.tmp_c); -#elif HAVE_OPENMP_OFFLOAD - rc = qmckl_compute_tmp_c_omp_offload(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.tmp_c); -#else - rc = QMCKL_FAILURE; -#endif - } else { - rc = qmckl_compute_tmp_c(context, + rc = qmckl_compute_tmp_c(context, ctx->jastrow.cord_num, ctx->electron.num, ctx->nucleus.num, @@ -7342,8 +7287,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) ctx->jastrow.een_rescaled_e, ctx->jastrow.een_rescaled_n, ctx->jastrow.tmp_c); - } - ctx->jastrow.tmp_c_date = ctx->date; } @@ -7394,54 +7337,15 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) } -#ifdef HAVE_HPC - const bool gpu_offload = ctx->jastrow.gpu_offload; -#else - const bool gpu_offload = false; -#endif - - if (gpu_offload) { -#ifdef HAVE_CUBLAS_OFFLOAD - rc = qmckl_compute_dtmp_c_cublas_offload(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e_deriv_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.dtmp_c); -#elif HAVE_OPENACC_OFFLOAD - rc = qmckl_compute_dtmp_c_acc_offload(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e_deriv_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.dtmp_c); -#elif HAVE_OPENMP_OFFLOAD - rc = qmckl_compute_dtmp_c_omp_offload(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e_deriv_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.dtmp_c); -#else - rc = QMCKL_FAILURE; -#endif - } else { - rc = qmckl_compute_dtmp_c(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walker.num, - ctx->jastrow.een_rescaled_e_deriv_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.dtmp_c); - } - + rc = qmckl_compute_dtmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walker.num, + ctx->jastrow.een_rescaled_e_deriv_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.dtmp_c); + if (rc != QMCKL_SUCCESS) { return rc; } @@ -8139,293 +8043,6 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context, double* const tmp_c ); #+end_src -**** OpenACC offload :noexport: - - #+begin_src c :comments org :tangle (eval c) :noweb yes -#ifdef HAVE_OPENACC_OFFLOAD -qmckl_exit_code -qmckl_compute_tmp_c_acc_offload (const qmckl_context context, - const int64_t cord_num, - const int64_t elec_num, - const int64_t nucl_num, - const int64_t walk_num, - const double* een_rescaled_e, - const double* een_rescaled_n, - double* const tmp_c ) -{ - - if (context == QMCKL_NULL_CONTEXT) { - return QMCKL_INVALID_CONTEXT; - } - - if (cord_num < 0) { - return QMCKL_INVALID_ARG_2; - } - - if (elec_num <= 0) { - return QMCKL_INVALID_ARG_3; - } - - if (nucl_num <= 0) { - return QMCKL_INVALID_ARG_4; - } - - // Compute array access strides: - // For tmp_c... - const int64_t stride_k_c = elec_num; - const int64_t stride_j_c = stride_k_c * nucl_num; - const int64_t stride_i_c = stride_j_c * (cord_num+1); - const int64_t stride_nw_c = stride_i_c * cord_num; - // For een_rescaled_e... - const int64_t stride_m_e = elec_num; - const int64_t stride_i_e = stride_m_e * elec_num; - const int64_t stride_nw_e = stride_i_e * (cord_num+1); - // For een_rescaled_n... - const int64_t stride_k_n = elec_num; - const int64_t stride_j_n = stride_k_n * nucl_num; - const int64_t stride_nw_n = stride_j_n * (cord_num+1); - - const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num; - const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num; - const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; - - #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n]) - { -#pragma acc loop independent gang worker vector collapse(5) - for (int nw=0; nw < walk_num; ++nw) { - for (int i=0; i