diff --git a/org/qmckl_ao.org b/org/qmckl_ao.org index 5796127..8cab593 100644 --- a/org/qmckl_ao.org +++ b/org/qmckl_ao.org @@ -5671,7 +5671,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context, for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) { const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2; if (v <= cutoff) { - ar2[iprim] = -v; + ar2[iprim] = v; ++nidx; } else { break; @@ -5680,7 +5680,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context, IVDEP for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) { - exp_mat[iprim] = exp(ar2[iprim]); + exp_mat[iprim] = exp(-ar2[iprim]); } } else { @@ -5688,7 +5688,7 @@ IVDEP for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) { const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2; if (v <= cutoff) { - ar2_sp[iprim] = (float) -v; + ar2_sp[iprim] = (float) v; ++nidx; } else { break; @@ -5697,7 +5697,7 @@ IVDEP IVDEP for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) { - exp_mat_sp[iprim] = expf(ar2_sp[iprim]); + exp_mat_sp[iprim] = expf(-ar2_sp[iprim]); } IVDEP @@ -6494,6 +6494,7 @@ qmckl_compute_ao_vgl_hpc_gaussian ( for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) { const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2; if (v <= cutoff) { + ar2[iprim] = v; ar2_sp[iprim] = (float) v; ++nidx; } else { diff --git a/org/qmckl_mo.org b/org/qmckl_mo.org index 1382024..29cb961 100644 --- a/org/qmckl_mo.org +++ b/org/qmckl_mo.org @@ -1264,10 +1264,22 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context, const double* coefficient_t, const double* ao_value, double* const mo_value ); + +qmckl_exit_code +qmckl_compute_mo_basis_mo_value_hpc_sp (const qmckl_context context, + const int64_t ao_num, + const int64_t mo_num, + const int64_t point_num, + const double* coefficient_t, + const double* ao_value, + double* const mo_value ); #endif #+end_src - #+begin_src c :tangle (eval c) :comments org + +**** Single-precision + + #+begin_src c :tangle (eval c) :comments org #ifdef HAVE_HPC qmckl_exit_code qmckl_compute_mo_basis_mo_value_hpc_sp (const qmckl_context context, @@ -1280,14 +1292,14 @@ qmckl_compute_mo_basis_mo_value_hpc_sp (const qmckl_context context, { assert (context != QMCKL_NULL_CONTEXT); - float* coefficient_t_sp = calloc(mo_num, sizeof(float)); + float* __attribute__((aligned(64))) coefficient_t_sp = calloc(ao_num*mo_num, sizeof(float)); if (coefficient_t_sp == NULL) { return qmckl_failwith( context, QMCKL_ALLOCATION_FAILED, - "qmckl_compute_mo_basis_mo_vgl_hpc_sp", + "qmckl_compute_mo_basis_mo_value_hpc_sp", "coefficient_t_sp"); }; - + IVDEP #ifdef HAVE_OPENMP #pragma omp simd @@ -1297,25 +1309,20 @@ qmckl_compute_mo_basis_mo_value_hpc_sp (const qmckl_context context, } #ifdef HAVE_OPENMP -#pragma omp parallel -#endif - { - int64_t* idx = calloc(ao_num, sizeof(int64_t)); - float* av1 = calloc(ao_num, sizeof(float)); - float* vgl_sp = calloc(mo_num, sizeof(float)); - assert (idx != NULL); - assert (av1 != NULL); - assert (vgl_sp != NULL); - -#ifdef HAVE_OPENMP -#pragma omp for +#pragma omp parallel for #endif for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) { double* restrict const vgl1 = &(mo_value[ipoint*mo_num]); const double* restrict avgl1 = &(ao_value[ipoint*ao_num]); - - memset(vgl_sp, 0, mo_num*sizeof(float)); - + + // memset(vgl_sp, 0, mo_num*sizeof(float)); + int64_t* __attribute__((aligned(64))) idx = calloc((size_t) ao_num, sizeof(int64_t)); + float* __attribute__((aligned(64))) av1 = calloc((size_t) ao_num, sizeof(float)); + float* __attribute__((aligned(64))) vgl_sp = calloc((size_t) mo_num, sizeof(float)); + assert (idx != NULL); + assert (av1 != NULL); + assert (vgl_sp != NULL); + int64_t nidx=0; for (int64_t k=0 ; knumprec.precision; + const bool single_precision = precision <= 23; + + if (single_precision) { + return qmckl_compute_mo_basis_mo_vgl_hpc_sp (context, + ao_num, + mo_num, + point_num, + coefficient_t, + ao_vgl, + mo_vgl ); + } #ifdef HAVE_OPENMP - #pragma omp parallel for +#pragma omp parallel +#endif + { + int64_t* __attribute__((aligned(64))) idx = calloc(ao_num, sizeof(int64_t)); + double* __attribute__((aligned(64))) av1 = calloc(ao_num, sizeof(double)); + double* __attribute__((aligned(64))) av2 = calloc(ao_num, sizeof(double)); + double* __attribute__((aligned(64))) av3 = calloc(ao_num, sizeof(double)); + double* __attribute__((aligned(64))) av4 = calloc(ao_num, sizeof(double)); + double* __attribute__((aligned(64))) av5 = calloc(ao_num, sizeof(double)); + assert (idx != NULL); + assert (av1 != NULL); + assert (av2 != NULL); + assert (av3 != NULL); + assert (av4 != NULL); + assert (av5 != NULL); + +#ifdef HAVE_OPENMP +#pragma omp for #endif for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) { double* restrict const vgl1 = &(mo_vgl[ipoint*5*mo_num]); @@ -1909,22 +1963,9 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context, const double* restrict avgl4 = avgl1 + (ao_num << 1) + ao_num; const double* restrict avgl5 = avgl1 + (ao_num << 2); -IVDEP - for (int64_t i=0 ; i