From 6ad40e0cf2d3f22f79187965239e4f5a1b49ffcc Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Sat, 14 Dec 2024 14:50:20 +0100 Subject: [PATCH] Fixed HPC --- org/qmckl_jastrow_champ.org | 137 +++++++++++++----------------------- 1 file changed, 48 insertions(+), 89 deletions(-) diff --git a/org/qmckl_jastrow_champ.org b/org/qmckl_jastrow_champ.org index 94bb814..dbad2ca 100644 --- a/org/qmckl_jastrow_champ.org +++ b/org/qmckl_jastrow_champ.org @@ -11298,6 +11298,7 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context, const double* restrict dtmp_c_2amlknw = dtmp_c_0amlknw + elec_num2; const double* restrict dtmp_c_3amknw = dtmp_c_0amknw + elec_num3; const double* restrict dtmp_c_3amlknw = dtmp_c_0amlknw + elec_num3; + const double* restrict een_rescaled_n_gl_1amnw = een_rescaled_n_gl_0amnw + elec_num; const double* restrict een_rescaled_n_gl_1amlnw = een_rescaled_n_gl_0amlnw + elec_num; const double* restrict een_rescaled_n_gl_2amnw = een_rescaled_n_gl_0amnw + elec_num2; @@ -11738,18 +11739,18 @@ qmckl_compute_jastrow_champ_factor_een_grad_hpc ( #+begin_src c :tangle (eval c) :comments org qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_grad_hpc(const qmckl_context context, - const int64_t walk_num, - const int64_t elec_num, - const int64_t nucl_num, - const int64_t cord_num, - const int64_t dim_c_vector, - const double *c_vector_full, - const int64_t *lkpm_combined_index, - const double *tmp_c, - const double *dtmp_c, - const double *een_rescaled_n, - const double *een_rescaled_n_gl, - double* const factor_een_grad) + const int64_t walk_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t cord_num, + const int64_t dim_c_vector, + const double *c_vector_full, + const int64_t *lkpm_combined_index, + const double *tmp_c, + const double *dtmp_c, + const double *een_rescaled_n, + const double *een_rescaled_n_gl, + double* const factor_een_grad) { int64_t info = QMCKL_SUCCESS; @@ -11771,13 +11772,12 @@ qmckl_compute_jastrow_champ_factor_een_grad_hpc(const qmckl_context context, return QMCKL_SUCCESS; } - const size_t elec_num2 = elec_num << 1; + const size_t elec_num2 = elec_num + elec_num; #ifdef HAVE_OPENMP #pragma omp parallel for #endif for (size_t nw = 0; nw < (size_t) walk_num; ++nw) { - bool touched = false; double* const restrict factor_een_grad_0nw = &(factor_een_grad[elec_num*3*nw]); for (size_t n = 0; n < (size_t) dim_c_vector; ++n) { const size_t l = lkpm_combined_index[n]; @@ -11786,11 +11786,11 @@ qmckl_compute_jastrow_champ_factor_een_grad_hpc(const qmckl_context context, const size_t en = elec_num*nucl_num; const size_t len = l*en; - const size_t len4 = len << 2; + const size_t len4 = len*4; const size_t cn = cord_num*nw; const size_t c1 = cord_num+1; const size_t addr0 = en*(m+c1*(k+cn)); - const size_t addr1 = en*(m+cn); + const size_t addr1 = en*(m+c1*nw); const double* restrict tmp_c_mkn = &(tmp_c[addr0]); const double* restrict tmp_c_mlkn = tmp_c_mkn + len; @@ -11800,12 +11800,13 @@ qmckl_compute_jastrow_champ_factor_een_grad_hpc(const qmckl_context context, const double* restrict dtmp_c_mlknw = dtmp_c_mknw + len4; const double* restrict een_rescaled_n_gl_mnw = &(een_rescaled_n_gl[addr1 << 2]); const double* restrict een_rescaled_n_gl_mlnw = een_rescaled_n_gl_mnw + len4; + for (size_t a = 0; a < (size_t) nucl_num; a++) { double cn = c_vector_full[a+n*nucl_num]; if (cn == 0.0) continue; const size_t ishift = elec_num*a; - const size_t ishift4 = ishift << 2; + const size_t ishift4 = ishift*4; const double* restrict tmp_c_amlkn = tmp_c_mlkn + ishift; const double* restrict tmp_c_amkn = tmp_c_mkn + ishift; @@ -11827,82 +11828,40 @@ qmckl_compute_jastrow_champ_factor_een_grad_hpc(const qmckl_context context, double* const restrict factor_een_grad_1nw = factor_een_grad_0nw + elec_num; double* const restrict factor_een_grad_2nw = factor_een_grad_0nw + elec_num2; - if (touched) { #ifdef HAVE_OPENMP #pragma omp simd #endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_grad_0nw[j] = factor_een_grad_0nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + - dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); - } - -#ifdef HAVE_OPENMP -#pragma omp simd -#endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_grad_1nw[j] = factor_een_grad_1nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + - dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); - } - -#ifdef HAVE_OPENMP -#pragma omp simd -#endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_grad_2nw[j] = factor_een_grad_2nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + - dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); - } - - } else { - - touched = true; - -#ifdef HAVE_OPENMP -#pragma omp simd -#endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_grad_0nw[j] = cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + - dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); - } - -#ifdef HAVE_OPENMP -#pragma omp simd -#endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_grad_1nw[j] = cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + - dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); - } - -#ifdef HAVE_OPENMP -#pragma omp simd -#endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_grad_2nw[j] = cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + - dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); - } - + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_grad_0nw[j] = factor_een_grad_0nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + + dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); } - } - } - if (!touched) { - memset(factor_een_grad_0nw, 0, elec_num*3*sizeof(double)); + +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_grad_1nw[j] = factor_een_grad_1nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + + dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); + } + +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_grad_2nw[j] = factor_een_grad_2nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + + dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); + } + + } } } return info;