diff --git a/org/qmckl_jastrow_champ.org b/org/qmckl_jastrow_champ.org index 40cabff..4af7881 100644 --- a/org/qmckl_jastrow_champ.org +++ b/org/qmckl_jastrow_champ.org @@ -2943,7 +2943,7 @@ qmckl_compute_jastrow_champ_factor_ee_gl_hpc(const qmckl_context context, const double* b_vector, const double* ee_distance_rescaled, const double* ee_distance_rescaled_gl, - const int32_t spin_independent, + const int32_t spin_independent, double* const factor_ee_gl ) { @@ -3041,7 +3041,7 @@ qmckl_compute_jastrow_champ_factor_ee_gl (const qmckl_context context, const double* b_vector, const double* ee_distance_rescaled, const double* ee_distance_rescaled_gl, - const int32_t spin_independent, + const int32_t spin_independent, double* const factor_ee_gl ); #+end_src @@ -3056,7 +3056,7 @@ qmckl_compute_jastrow_champ_factor_ee_gl_hpc (const qmckl_context context, const double* b_vector, const double* ee_distance_rescaled, const double* ee_distance_rescaled_gl, - const int32_t spin_independent, + const int32_t spin_independent, double* const factor_ee_gl ); #+end_src @@ -3071,7 +3071,7 @@ qmckl_compute_jastrow_champ_factor_ee_gl_doc (const qmckl_context context, const double* b_vector, const double* ee_distance_rescaled, const double* ee_distance_rescaled_gl, - const int32_t spin_independent, + const int32_t spin_independent, double* const factor_ee_gl ); #+end_src @@ -3086,7 +3086,7 @@ qmckl_compute_jastrow_champ_factor_ee_gl (const qmckl_context context, const double* b_vector, const double* ee_distance_rescaled, const double* ee_distance_rescaled_gl, - const int32_t spin_independent, + const int32_t spin_independent, double* const factor_ee_gl ) { #ifdef HAVE_HPC @@ -6219,7 +6219,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc ( for (size_t k = 0; k < elec_pairs; ++k) { // een_rescaled_e_ij(k, l + 1) = een_rescaled_e_ij(k, l + 1 - 1) * een_rescaled_e_ij(k, 2) - // een_rescaled_e_ij[k+l*elec_pairs] = een_rescaled_e_ij[k + (l - 1)*elec_pairs] * + // een_rescaled_e_ij[k+l*elec_pairs] = een_rescaled_e_ij[k + (l - 1)*elec_pairs] * // een_rescaled_e_ij[k + elec_pairs]; ee1[k] = ee2[k] * ee3[k]; } @@ -6709,46 +6709,75 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc ( if (elec_num <= 0) return QMCKL_INVALID_ARG_3; if (cord_num < 0) return QMCKL_INVALID_ARG_4; - double* restrict elec_dist_gl = (double*) calloc(elec_num * 4 * elec_num, sizeof(double)); - assert (elec_dist_gl != NULL); + double* restrict elec_dist_gl0 = (double*) calloc(elec_num * elec_num, sizeof(double)); + double* restrict elec_dist_gl1 = (double*) calloc(elec_num * elec_num, sizeof(double)); + double* restrict elec_dist_gl2 = (double*) calloc(elec_num * elec_num, sizeof(double)); + double* restrict elec_dist_gl3 = (double*) calloc(elec_num * elec_num, sizeof(double)); + assert (elec_dist_gl0 != NULL); + assert (elec_dist_gl1 != NULL); + assert (elec_dist_gl2 != NULL); + assert (elec_dist_gl3 != NULL); #pragma omp parallel for for (int64_t nw = 0; nw < walk_num; ++nw) { + double rij_inv[elec_num]; for (int64_t j = 0; j < elec_num; ++j) { - for (int64_t i = 0; i < j ; ++i) { - double rij_inv = 1.0 / ee_distance[i + j * elec_num + nw * elec_num * elec_num]; - for (int64_t ii = 0; ii < 3; ++ii) { - elec_dist_gl[i + ii * elec_num + j * 4 * elec_num] = - (coord_ee[i + ii * elec_num + nw * elec_num * 3] - coord_ee[j + ii * elec_num + nw * elec_num * 3]) * rij_inv; - } - elec_dist_gl[i + 3 * elec_num + j * 4 * elec_num] = 2.0 * rij_inv; +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (int64_t i = 0; i < elec_num ; ++i) { + rij_inv[i] = ee_distance[i + j * elec_num + nw * elec_num * elec_num] + 1.e-30; } - for (int64_t i = j+1; i < elec_num; ++i) { - double rij_inv = 1.0 / ee_distance[i + j * elec_num + nw * elec_num * elec_num]; - for (int64_t ii = 0; ii < 3; ++ii) { - elec_dist_gl[i + ii * elec_num + j * 4 * elec_num] = - (coord_ee[i + ii * elec_num + nw * elec_num * 3] - coord_ee[j + ii * elec_num + nw * elec_num * 3]) * rij_inv; - } - elec_dist_gl[i + 3 * elec_num + j * 4 * elec_num] = 2.0 * rij_inv; +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (int64_t i = 0; i < elec_num ; ++i) { + rij_inv[i] = 1.0/rij_inv[i]; + } + rij_inv[j] = 0.; + const double xj = coord_ee[j + nw * elec_num * 3]; + const double yj = coord_ee[j + elec_num + nw * elec_num * 3]; + const double zj = coord_ee[j + 2 * elec_num + nw * elec_num * 3]; +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (int64_t i = 0; i < elec_num ; ++i) { + const double xi = coord_ee[i + nw * elec_num * 3]; + const double yi = coord_ee[i + elec_num + nw * elec_num * 3]; + const double zi = coord_ee[i + 2 * elec_num + nw * elec_num * 3]; + elec_dist_gl0[i + j * elec_num] = rij_inv[i] * (xi-xj); + elec_dist_gl1[i + j * elec_num] = rij_inv[i] * (yi-yj); + elec_dist_gl2[i + j * elec_num] = rij_inv[i] * (zi-zj); + elec_dist_gl3[i + j * elec_num] = rij_inv[i] + rij_inv[i]; } } for (int64_t l = 1; l <= cord_num; ++l) { double kappa_l = - (double)l * rescale_factor_ee; for (int64_t j = 0; j < elec_num; ++j) { - double* restrict eegl = &een_rescaled_e_gl[ elec_num * 4 * (j + elec_num * (l + (cord_num + 1) * nw))]; - const double* restrict ee = &een_rescaled_e [ elec_num * (j + elec_num * (l + (cord_num + 1) * nw))]; - for (int64_t k = 0; k < 4; ++k) { - for (int64_t i = 0; i < elec_num; ++i) { - eegl[i + elec_num * k] = kappa_l * elec_dist_gl[i + k * elec_num + j * 4 * elec_num]; - } + double* restrict eegl = &een_rescaled_e_gl[ elec_num * 4 * (j + elec_num * (l + (cord_num + 1) * nw))]; + const double* restrict ee = &een_rescaled_e [ elec_num * (j + elec_num * (l + (cord_num + 1) * nw))]; +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (int64_t i = 0; i < elec_num; ++i) { + eegl[i ] = kappa_l * elec_dist_gl0[i + j * elec_num]; + eegl[i + elec_num ] = kappa_l * elec_dist_gl1[i + j * elec_num]; + eegl[i + elec_num * 2] = kappa_l * elec_dist_gl2[i + j * elec_num]; + eegl[i + elec_num * 3] = kappa_l * elec_dist_gl3[i + j * elec_num]; } +#ifdef HAVE_OPENMP +#pragma omp simd +#endif for (int64_t i = 0; i < elec_num; ++i) { eegl[i + elec_num*3] = eegl[i + elec_num*3] + eegl[i] * eegl[i] + eegl[i + elec_num*1] * eegl[i + elec_num*1] + eegl[i + elec_num*2] * eegl[i + elec_num*2]; } +#ifdef HAVE_OPENMP +#pragma omp simd +#endif for (int64_t i = 0; i < elec_num; ++i) { eegl[i ] *= ee[i]; eegl[i + elec_num * 1] *= ee[i]; @@ -6759,12 +6788,15 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc ( } } - free(elec_dist_gl); + free(elec_dist_gl0); + free(elec_dist_gl1); + free(elec_dist_gl2); + free(elec_dist_gl3); return QMCKL_SUCCESS; } #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl ( const qmckl_context context, @@ -6778,7 +6810,7 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl ( double* const een_rescaled_e_gl ) { #ifdef HAVE_HPC - return qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc + return qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_hpc #else return qmckl_compute_jastrow_champ_factor_een_rescaled_e_gl_doc #endif