1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-05 11:00:36 +01:00

Vectorization

This commit is contained in:
Anthony Scemama 2024-02-14 10:42:55 +01:00
parent 48b80f68f1
commit 2228ab23c5

View File

@ -6709,27 +6709,46 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc (
if (elec_num <= 0) return QMCKL_INVALID_ARG_3; if (elec_num <= 0) return QMCKL_INVALID_ARG_3;
if (cord_num < 0) return QMCKL_INVALID_ARG_4; if (cord_num < 0) return QMCKL_INVALID_ARG_4;
double* restrict elec_dist_gl = (double*) calloc(elec_num * 4 * elec_num, sizeof(double)); double* restrict elec_dist_gl0 = (double*) calloc(elec_num * elec_num, sizeof(double));
assert (elec_dist_gl != NULL); double* restrict elec_dist_gl1 = (double*) calloc(elec_num * elec_num, sizeof(double));
double* restrict elec_dist_gl2 = (double*) calloc(elec_num * elec_num, sizeof(double));
double* restrict elec_dist_gl3 = (double*) calloc(elec_num * elec_num, sizeof(double));
assert (elec_dist_gl0 != NULL);
assert (elec_dist_gl1 != NULL);
assert (elec_dist_gl2 != NULL);
assert (elec_dist_gl3 != NULL);
#pragma omp parallel for #pragma omp parallel for
for (int64_t nw = 0; nw < walk_num; ++nw) { for (int64_t nw = 0; nw < walk_num; ++nw) {
double rij_inv[elec_num];
for (int64_t j = 0; j < elec_num; ++j) { for (int64_t j = 0; j < elec_num; ++j) {
for (int64_t i = 0; i < j ; ++i) { #ifdef HAVE_OPENMP
double rij_inv = 1.0 / ee_distance[i + j * elec_num + nw * elec_num * elec_num]; #pragma omp simd
for (int64_t ii = 0; ii < 3; ++ii) { #endif
elec_dist_gl[i + ii * elec_num + j * 4 * elec_num] = for (int64_t i = 0; i < elec_num ; ++i) {
(coord_ee[i + ii * elec_num + nw * elec_num * 3] - coord_ee[j + ii * elec_num + nw * elec_num * 3]) * rij_inv; rij_inv[i] = ee_distance[i + j * elec_num + nw * elec_num * elec_num] + 1.e-30;
}
elec_dist_gl[i + 3 * elec_num + j * 4 * elec_num] = 2.0 * rij_inv;
} }
for (int64_t i = j+1; i < elec_num; ++i) { #ifdef HAVE_OPENMP
double rij_inv = 1.0 / ee_distance[i + j * elec_num + nw * elec_num * elec_num]; #pragma omp simd
for (int64_t ii = 0; ii < 3; ++ii) { #endif
elec_dist_gl[i + ii * elec_num + j * 4 * elec_num] = for (int64_t i = 0; i < elec_num ; ++i) {
(coord_ee[i + ii * elec_num + nw * elec_num * 3] - coord_ee[j + ii * elec_num + nw * elec_num * 3]) * rij_inv; rij_inv[i] = 1.0/rij_inv[i];
} }
elec_dist_gl[i + 3 * elec_num + j * 4 * elec_num] = 2.0 * rij_inv; rij_inv[j] = 0.;
const double xj = coord_ee[j + nw * elec_num * 3];
const double yj = coord_ee[j + elec_num + nw * elec_num * 3];
const double zj = coord_ee[j + 2 * elec_num + nw * elec_num * 3];
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (int64_t i = 0; i < elec_num ; ++i) {
const double xi = coord_ee[i + nw * elec_num * 3];
const double yi = coord_ee[i + elec_num + nw * elec_num * 3];
const double zi = coord_ee[i + 2 * elec_num + nw * elec_num * 3];
elec_dist_gl0[i + j * elec_num] = rij_inv[i] * (xi-xj);
elec_dist_gl1[i + j * elec_num] = rij_inv[i] * (yi-yj);
elec_dist_gl2[i + j * elec_num] = rij_inv[i] * (zi-zj);
elec_dist_gl3[i + j * elec_num] = rij_inv[i] + rij_inv[i];
} }
} }
@ -6738,17 +6757,27 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc (
for (int64_t j = 0; j < elec_num; ++j) { for (int64_t j = 0; j < elec_num; ++j) {
double* restrict eegl = &een_rescaled_e_gl[ elec_num * 4 * (j + elec_num * (l + (cord_num + 1) * nw))]; double* restrict eegl = &een_rescaled_e_gl[ elec_num * 4 * (j + elec_num * (l + (cord_num + 1) * nw))];
const double* restrict ee = &een_rescaled_e [ elec_num * (j + elec_num * (l + (cord_num + 1) * nw))]; const double* restrict ee = &een_rescaled_e [ elec_num * (j + elec_num * (l + (cord_num + 1) * nw))];
for (int64_t k = 0; k < 4; ++k) { #ifdef HAVE_OPENMP
for (int64_t i = 0; i < elec_num; ++i) { #pragma omp simd
eegl[i + elec_num * k] = kappa_l * elec_dist_gl[i + k * elec_num + j * 4 * elec_num]; #endif
} for (int64_t i = 0; i < elec_num; ++i) {
eegl[i ] = kappa_l * elec_dist_gl0[i + j * elec_num];
eegl[i + elec_num ] = kappa_l * elec_dist_gl1[i + j * elec_num];
eegl[i + elec_num * 2] = kappa_l * elec_dist_gl2[i + j * elec_num];
eegl[i + elec_num * 3] = kappa_l * elec_dist_gl3[i + j * elec_num];
} }
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (int64_t i = 0; i < elec_num; ++i) { for (int64_t i = 0; i < elec_num; ++i) {
eegl[i + elec_num*3] = eegl[i + elec_num*3] + eegl[i + elec_num*3] = eegl[i + elec_num*3] +
eegl[i] * eegl[i] + eegl[i] * eegl[i] +
eegl[i + elec_num*1] * eegl[i + elec_num*1] + eegl[i + elec_num*1] * eegl[i + elec_num*1] +
eegl[i + elec_num*2] * eegl[i + elec_num*2]; eegl[i + elec_num*2] * eegl[i + elec_num*2];
} }
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (int64_t i = 0; i < elec_num; ++i) { for (int64_t i = 0; i < elec_num; ++i) {
eegl[i ] *= ee[i]; eegl[i ] *= ee[i];
eegl[i + elec_num * 1] *= ee[i]; eegl[i + elec_num * 1] *= ee[i];
@ -6759,7 +6788,10 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc (
} }
} }
free(elec_dist_gl); free(elec_dist_gl0);
free(elec_dist_gl1);
free(elec_dist_gl2);
free(elec_dist_gl3);
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
} }