1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2024-12-22 20:36:01 +01:00

Avoid memset in Jastrow

This commit is contained in:
Anthony Scemama 2024-02-20 23:38:34 +01:00
parent 2f0ca9f674
commit 41615ba14b

View File

@ -6219,6 +6219,9 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
for (size_t l = 2; l < (size_t) (cord_num+1); ++l) { for (size_t l = 2; l < (size_t) (cord_num+1); ++l) {
double* restrict ee1 = &een_rescaled_e_ij[l*elec_pairs]; double* restrict ee1 = &een_rescaled_e_ij[l*elec_pairs];
const double* restrict ee2 = &een_rescaled_e_ij[(l-1)*elec_pairs]; const double* restrict ee2 = &een_rescaled_e_ij[(l-1)*elec_pairs];
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (size_t k = 0; k < elec_pairs; ++k) { for (size_t k = 0; k < elec_pairs; ++k) {
// een_rescaled_e_ij(k, l + 1) = een_rescaled_e_ij(k, l + 1 - 1) * een_rescaled_e_ij(k, 2) // een_rescaled_e_ij(k, l + 1) = een_rescaled_e_ij(k, l + 1 - 1) * een_rescaled_e_ij(k, 2)
@ -10337,10 +10340,10 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context,
const size_t elec_num3 = elec_num * 3; const size_t elec_num3 = elec_num * 3;
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp parallel for schedule(guided) #pragma omp parallel for
#endif #endif
for (size_t nw = 0; nw < (size_t) walk_num; ++nw) { for (size_t nw = 0; nw < (size_t) walk_num; ++nw) {
memset(&factor_een_gl[elec_num*4*nw], 0, elec_num*4*sizeof(double)); bool touched = false;
double* const restrict factor_een_gl_0nw = &(factor_een_gl[elec_num*4*nw]); double* const restrict factor_een_gl_0nw = &(factor_een_gl[elec_num*4*nw]);
for (size_t n = 0; n < (size_t) dim_c_vector; ++n) { for (size_t n = 0; n < (size_t) dim_c_vector; ++n) {
const size_t l = lkpm_combined_index[n]; const size_t l = lkpm_combined_index[n];
@ -10397,62 +10400,125 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context,
double tmp3[elec_num]; double tmp3[elec_num];
if (touched) {
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
for (size_t j = 0; j < (size_t) elec_num; ++j) { for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_0nw[j] = factor_een_gl_0nw[j] + cn * factor_een_gl_0nw[j] = factor_een_gl_0nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] + (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] +
dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] + dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] + dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]); tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]);
tmp3[j] = tmp3[j] =
dtmp_c_0amknw[j] * een_rescaled_n_gl_0amlnw[j] + dtmp_c_0amknw[j] * een_rescaled_n_gl_0amlnw[j] +
dtmp_c_0amlknw[j] * een_rescaled_n_gl_0amnw[j]; dtmp_c_0amlknw[j] * een_rescaled_n_gl_0amnw[j];
}
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_1nw[j] = factor_een_gl_1nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] +
dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]);
tmp3[j] = tmp3[j] +
dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j];
} }
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
for (size_t j = 0; j < (size_t) elec_num; ++j) { for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_1nw[j] = factor_een_gl_1nw[j] + cn * factor_een_gl_2nw[j] = factor_een_gl_2nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] + (tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] +
dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] + dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] + dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]); tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]);
tmp3[j] = tmp3[j] + tmp3[j] = tmp3[j] +
dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] + dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j]; dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j];
} }
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
for (size_t j = 0; j < (size_t) elec_num; ++j) { for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_2nw[j] = factor_een_gl_2nw[j] + cn * factor_een_gl_3nw[j] = factor_een_gl_3nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] + (tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] +
dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] + dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] + dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]); tmp_c_amlkn[j] * een_rescaled_n_gl_3amnw[j] +
tmp3[j] = tmp3[j] + tmp3[j]*2.0);
dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] + }
dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j];
} } else {
touched = true;
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
for (size_t j = 0; j < (size_t) elec_num; ++j) { for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_3nw[j] = factor_een_gl_3nw[j] + cn * factor_een_gl_0nw[j] = cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] + (tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] +
dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] + dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] + dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_3amnw[j] + tmp_c_amlkn[j] * een_rescaled_n_gl_0amnw[j]);
tmp3[j]*2.0); tmp3[j] =
} dtmp_c_0amknw[j] * een_rescaled_n_gl_0amlnw[j] +
dtmp_c_0amlknw[j] * een_rescaled_n_gl_0amnw[j];
}
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_1nw[j] = cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] +
dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]);
tmp3[j] = tmp3[j] +
dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j];
}
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_2nw[j] = cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] +
dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]);
tmp3[j] = tmp3[j] +
dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] +
dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j];
}
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_3nw[j] = cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] +
dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_3amnw[j] +
tmp3[j]*2.0);
}
}
} }
} }
if (!touched) {
memset(factor_een_gl_0nw, 0, elec_num*4*sizeof(double));
}
} }
return info; return info;
} }