From 41615ba14b5699387df3029fa953586fad43069f Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 20 Feb 2024 23:38:34 +0100 Subject: [PATCH] Avoid memset in Jastrow --- org/qmckl_jastrow_champ.org | 144 ++++++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 39 deletions(-) diff --git a/org/qmckl_jastrow_champ.org b/org/qmckl_jastrow_champ.org index 44b4a2b..2eddedf 100644 --- a/org/qmckl_jastrow_champ.org +++ b/org/qmckl_jastrow_champ.org @@ -6219,6 +6219,9 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( for (size_t l = 2; l < (size_t) (cord_num+1); ++l) { double* restrict ee1 = &een_rescaled_e_ij[l*elec_pairs]; const double* restrict ee2 = &een_rescaled_e_ij[(l-1)*elec_pairs]; +#ifdef HAVE_OPENMP +#pragma omp simd +#endif for (size_t k = 0; k < elec_pairs; ++k) { // een_rescaled_e_ij(k, l + 1) = een_rescaled_e_ij(k, l + 1 - 1) * een_rescaled_e_ij(k, 2) @@ -10337,10 +10340,10 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context, const size_t elec_num3 = elec_num * 3; #ifdef HAVE_OPENMP -#pragma omp parallel for schedule(guided) +#pragma omp parallel for #endif for (size_t nw = 0; nw < (size_t) walk_num; ++nw) { - memset(&factor_een_gl[elec_num*4*nw], 0, elec_num*4*sizeof(double)); + bool touched = false; double* const restrict factor_een_gl_0nw = &(factor_een_gl[elec_num*4*nw]); for (size_t n = 0; n < (size_t) dim_c_vector; ++n) { const size_t l = lkpm_combined_index[n]; @@ -10397,62 +10400,125 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context, double tmp3[elec_num]; + if (touched) { #ifdef HAVE_OPENMP #pragma omp simd #endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_gl_0nw[j] = factor_een_gl_0nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + - dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); - tmp3[j] = - dtmp_c_0amknw[j] * een_rescaled_n_gl_0amlnw[j] + - dtmp_c_0amlknw[j] * een_rescaled_n_gl_0amnw[j]; + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_0nw[j] = factor_een_gl_0nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + + dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); + tmp3[j] = + dtmp_c_0amknw[j] * een_rescaled_n_gl_0amlnw[j] + + dtmp_c_0amlknw[j] * een_rescaled_n_gl_0amnw[j]; + } + +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_1nw[j] = factor_een_gl_1nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + + dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); + tmp3[j] = tmp3[j] + + dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] + + dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j]; } #ifdef HAVE_OPENMP #pragma omp simd #endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_gl_1nw[j] = factor_een_gl_1nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + - dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); - tmp3[j] = tmp3[j] + - dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] + - dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j]; - } + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_2nw[j] = factor_een_gl_2nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + + dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); + tmp3[j] = tmp3[j] + + dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] + + dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j]; + } #ifdef HAVE_OPENMP #pragma omp simd #endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_gl_2nw[j] = factor_een_gl_2nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + - dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); - tmp3[j] = tmp3[j] + - dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] + - dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j]; - } + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_3nw[j] = factor_een_gl_3nw[j] + cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] + + dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_3amnw[j] + + tmp3[j]*2.0); + } + + } else { + + touched = true; #ifdef HAVE_OPENMP #pragma omp simd #endif - for (size_t j = 0; j < (size_t) elec_num; ++j) { - factor_een_gl_3nw[j] = factor_een_gl_3nw[j] + cn * - (tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] + - dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] + - dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] + - tmp_c_amlkn[j] * een_rescaled_n_gl_3amnw[j] + - tmp3[j]*2.0); - } + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_0nw[j] = cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + + dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); + tmp3[j] = + dtmp_c_0amknw[j] * een_rescaled_n_gl_0amlnw[j] + + dtmp_c_0amlknw[j] * een_rescaled_n_gl_0amnw[j]; + } +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_1nw[j] = cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + + dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); + tmp3[j] = tmp3[j] + + dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] + + dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j]; + } + +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_2nw[j] = cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + + dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); + tmp3[j] = tmp3[j] + + dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] + + dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j]; + } + +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (size_t j = 0; j < (size_t) elec_num; ++j) { + factor_een_gl_3nw[j] = cn * + (tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] + + dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] + + dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] + + tmp_c_amlkn[j] * een_rescaled_n_gl_3amnw[j] + + tmp3[j]*2.0); + } + + } } } + if (!touched) { + memset(factor_een_gl_0nw, 0, elec_num*4*sizeof(double)); + } } return info; }