1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-10 04:58:33 +01:00

Added control of precision in AOs

This commit is contained in:
Anthony Scemama 2023-11-28 17:00:39 +01:00
parent 063aada9e4
commit 6bf9388a4e
4 changed files with 239 additions and 97 deletions

View File

@ -66,7 +66,7 @@ AC_ARG_WITH([icx],
AS_IF([test "x$with_icx" = "xyes"], [ AS_IF([test "x$with_icx" = "xyes"], [
CC=icx CC=icx
CFLAGS="-march=native -ip -Ofast -ftz -finline -g -mkl=sequential" ]) CFLAGS="-march=native -Ofast -ftz -finline -g -qmkl=sequential" ])
AS_IF([test "x$with_icx.$with_ifort" = "xyes.yes"], [ AS_IF([test "x$with_icx.$with_ifort" = "xyes.yes"], [
ax_blas_ok="yes" ax_blas_ok="yes"

View File

@ -2684,7 +2684,7 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) {
} }
/* Find distance beyond which all AOs are zero. /* Find distance beyond which all AOs are zero.
The distance is obtained by sqrt(log(cutoff)*range) */ The distance is obtained by sqrt(-log(epsilon)*range) */
{ {
if (ctx->ao_basis.type == 'G') { if (ctx->ao_basis.type == 'G') {
qmckl_memory_info_struct mem_info = qmckl_memory_info_struct_zero; qmckl_memory_info_struct mem_info = qmckl_memory_info_struct_zero;
@ -5349,23 +5349,23 @@ for (int32_t ldl=3 ; ldl<=5 ; ++ldl) {
:END: :END:
*** Unoptimized version *** Unoptimized version
#+NAME: qmckl_ao_value_args_doc #+NAME: qmckl_ao_value_args_doc
| Variable | Type | In/Out | Description | | Variable | Type | In/Out | Description |
|-----------------------+-----------------------------------+--------+----------------------------------------------| |-----------------------+-----------------------------------+--------+----------------------------------------------------------|
| ~context~ | ~qmckl_context~ | in | Global state | | ~context~ | ~qmckl_context~ | in | Global state |
| ~ao_num~ | ~int64_t~ | in | Number of AOs | | ~ao_num~ | ~int64_t~ | in | Number of AOs |
| ~shell_num~ | ~int64_t~ | in | Number of shells | | ~shell_num~ | ~int64_t~ | in | Number of shells |
| ~point_num~ | ~int64_t~ | in | Number of points | | ~point_num~ | ~int64_t~ | in | Number of points |
| ~nucl_num~ | ~int64_t~ | in | Number of nuclei | | ~nucl_num~ | ~int64_t~ | in | Number of nuclei |
| ~coord~ | ~double[3][point_num]~ | in | Coordinates | | ~coord~ | ~double[3][point_num]~ | in | Coordinates |
| ~nucl_coord~ | ~double[3][nucl_num]~ | in | Nuclear coordinates | | ~nucl_coord~ | ~double[3][nucl_num]~ | in | Nuclear coordinates |
| ~nucleus_index~ | ~int64_t[nucl_num]~ | in | Index of the 1st shell of each nucleus | | ~nucleus_index~ | ~int64_t[nucl_num]~ | in | Index of the 1st shell of each nucleus |
| ~nucleus_shell_num~ | ~int64_t[nucl_num]~ | in | Number of shells per nucleus | | ~nucleus_shell_num~ | ~int64_t[nucl_num]~ | in | Number of shells per nucleus |
| ~nucleus_range~ | ~double[nucl_num]~ | in | Range beyond which all is zero | | ~nucleus_range~ | ~double[nucl_num]~ | in | Range beyond which all is zero |
| ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~ | in | Maximum angular momentum per nucleus | | ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~ | in | Maximum angular momentum per nucleus |
| ~shell_ang_mom~ | ~int32_t[shell_num]~ | in | Angular momentum of each shell | | ~shell_ang_mom~ | ~int32_t[shell_num]~ | in | Angular momentum of each shell |
| ~ao_factor~ | ~double[ao_num]~ | in | Normalization factor of the AOs | | ~ao_factor~ | ~double[ao_num]~ | in | Normalization factor of the AOs |
| ~shell_vgl~ | ~double[point_num][5][shell_num]~ | in | Value, gradients and Laplacian of the shells | | ~shell_vgl~ | ~double[point_num][5][shell_num]~ | in | Value, gradients and Laplacian of the shells |
| ~ao_value~ | ~double[point_num][ao_num]~ | out | Values of the AOs | | ~ao_value~ | ~double[point_num][ao_num]~ | out | Values of the AOs |
#+begin_src f90 :comments org :tangle (eval f) :noweb yes #+begin_src f90 :comments org :tangle (eval f) :noweb yes
function qmckl_compute_ao_value_doc(context, & function qmckl_compute_ao_value_doc(context, &
@ -5376,7 +5376,7 @@ function qmckl_compute_ao_value_doc(context, &
bind(C) result(info) bind(C) result(info)
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use qmckl_constants use qmckl_constants
use qmckl, only: qmckl_ao_polynomial_vgl use qmckl, only: qmckl_ao_polynomial_vgl, qmckl_get_numprec_epsilon
implicit none implicit none
integer (qmckl_context), intent(in) , value :: context integer (qmckl_context), intent(in) , value :: context
integer (c_int64_t) , intent(in) , value :: ao_num integer (c_int64_t) , intent(in) , value :: ao_num
@ -5428,7 +5428,7 @@ function qmckl_compute_ao_value_doc(context, &
info = QMCKL_SUCCESS info = QMCKL_SUCCESS
! Don't compute polynomials when the radial part is zero. ! Don't compute polynomials when the radial part is zero.
cutoff = 27.631021115928547d0 !-dlog(1.d-12) cutoff = -dlog(qmckl_get_numprec_epsilon(context))
do ipoint = 1, point_num do ipoint = 1, point_num
e_coord(1) = coord(ipoint,1) e_coord(1) = coord(ipoint,1)
@ -5478,27 +5478,27 @@ end function qmckl_compute_ao_value_doc
*** HPC version *** HPC version
#+NAME: qmckl_ao_value_args_hpc_gaussian #+NAME: qmckl_ao_value_args_hpc_gaussian
| Variable | Type | In/Out | Description | | Variable | Type | In/Out | Description |
|-----------------------+-----------------------------+--------+----------------------------------------------| |-----------------------+-----------------------------+--------+----------------------------------------------------------|
| ~context~ | ~qmckl_context~ | in | Global state | | ~context~ | ~qmckl_context~ | in | Global state |
| ~ao_num~ | ~int64_t~ | in | Number of AOs | | ~ao_num~ | ~int64_t~ | in | Number of AOs |
| ~shell_num~ | ~int64_t~ | in | Number of shells | | ~shell_num~ | ~int64_t~ | in | Number of shells |
| ~prim_num~ | ~int64_t~ | in | Number of primitives | | ~prim_num~ | ~int64_t~ | in | Number of primitives |
| ~point_num~ | ~int64_t~ | in | Number of points | | ~point_num~ | ~int64_t~ | in | Number of points |
| ~nucl_num~ | ~int64_t~ | in | Number of nuclei | | ~nucl_num~ | ~int64_t~ | in | Number of nuclei |
| ~coord~ | ~double[3][point_num]~ | in | Coordinates | | ~coord~ | ~double[3][point_num]~ | in | Coordinates |
| ~nucl_coord~ | ~double[3][nucl_num]~ | in | Nuclear coordinates | | ~nucl_coord~ | ~double[3][nucl_num]~ | in | Nuclear coordinates |
| ~nucleus_index~ | ~int64_t[nucl_num]~ | in | Index of the 1st shell of each nucleus | | ~nucleus_index~ | ~int64_t[nucl_num]~ | in | Index of the 1st shell of each nucleus |
| ~nucleus_shell_num~ | ~int64_t[nucl_num]~ | in | Number of shells per nucleus | | ~nucleus_shell_num~ | ~int64_t[nucl_num]~ | in | Number of shells per nucleus |
| ~nucleus_range~ | ~double[nucl_num]~ | in | Range beyond which all is zero | | ~nucleus_range~ | ~double[nucl_num]~ | in | Range beyond which all is zero |
| ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~ | in | Maximum angular momentum per nucleus | | ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~ | in | Maximum angular momentum per nucleus |
| ~shell_ang_mom~ | ~int32_t[shell_num]~ | in | Angular momentum of each shell | | ~shell_ang_mom~ | ~int32_t[shell_num]~ | in | Angular momentum of each shell |
| ~shell_prim_index~ | ~int64_t[shell_num]~ | in | Index of the 1st primitive of each shell | | ~shell_prim_index~ | ~int64_t[shell_num]~ | in | Index of the 1st primitive of each shell |
| ~shell_prim_num~ | ~int64_t[shell_num]~ | in | Number of primitives per shell | | ~shell_prim_num~ | ~int64_t[shell_num]~ | in | Number of primitives per shell |
| ~ao_factor~ | ~double[ao_num]~ | in | Normalization factor of the AOs | | ~ao_factor~ | ~double[ao_num]~ | in | Normalization factor of the AOs |
| ~ao_expo~ | ~double[prim_num]~ | in | Value, gradients and Laplacian of the shells | | ~ao_expo~ | ~double[prim_num]~ | in | Value, gradients and Laplacian of the shells |
| ~coef_normalized~ | ~double[prim_num]~ | in | Value, gradients and Laplacian of the shells | | ~coef_normalized~ | ~double[prim_num]~ | in | Value, gradients and Laplacian of the shells |
| ~ao_value~ | ~double[point_num][ao_num]~ | out | Values of the AOs | | ~ao_value~ | ~double[point_num][ao_num]~ | out | Values of the AOs |
#+NAME:ao_value_constants #+NAME:ao_value_constants
#+begin_src c :exports none #+begin_src c :exports none
@ -5530,8 +5530,13 @@ end function qmckl_compute_ao_value_doc
ao_index[shell_num] = ao_num+1; ao_index[shell_num] = ao_num+1;
} }
qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
assert (ctx != NULL);
/* Don't compute polynomials when the radial part is zero. */ /* Don't compute polynomials when the radial part is zero. */
const double cutoff = 27.631021115928547; // -log(1.e-12) const int precision = ctx->numprec.precision;
const double cutoff = log( (double) ( ((uint64_t) 1) << (precision-2) ) );
const bool double_precision = precision > 23;
#+end_src #+end_src
@ -5566,10 +5571,12 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
{ {
qmckl_exit_code rc; qmckl_exit_code rc;
double ar2[prim_max] __attribute__((aligned(64))); double ar2[prim_max] __attribute__((aligned(64)));
float ar2_sp[prim_max] __attribute__((aligned(64)));
int32_t powers[3*size_max] __attribute__((aligned(64))); int32_t powers[3*size_max] __attribute__((aligned(64)));
double poly_vgl[5*size_max] __attribute__((aligned(64))); double poly_vgl[5*size_max] __attribute__((aligned(64)));
double exp_mat[prim_max] __attribute__((aligned(64))); double exp_mat[prim_max] __attribute__((aligned(64)));
float exp_mat_sp[prim_max] __attribute__((aligned(64)));
double ce_mat[shell_max] __attribute__((aligned(64))); double ce_mat[shell_max] __attribute__((aligned(64)));
int32_t (*coef_mat_sparse_idx)[shell_max][prim_max+1] __attribute__((aligned(64))) = int32_t (*coef_mat_sparse_idx)[shell_max][prim_max+1] __attribute__((aligned(64))) =
@ -5613,7 +5620,10 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
const double y = e_coord[1] - n_coord[1]; const double y = e_coord[1] - n_coord[1];
const double z = e_coord[2] - n_coord[2]; const double z = e_coord[2] - n_coord[2];
const double r2 = x*x + y*y + z*z; const double x2 = x*x;
const double y2 = y*y;
const double z2 = z*z;
const double r2 = x2 + y2 + z2;
if (r2 > cutoff * nucleus_range[inucl]) { if (r2 > cutoff * nucleus_range[inucl]) {
continue; continue;
@ -5636,19 +5646,19 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
poly_vgl[1] = x; poly_vgl[1] = x;
poly_vgl[2] = y; poly_vgl[2] = y;
poly_vgl[3] = z; poly_vgl[3] = z;
poly_vgl[4] = x*x; poly_vgl[4] = x2;
poly_vgl[5] = x*y; poly_vgl[5] = x*y;
poly_vgl[6] = x*z; poly_vgl[6] = x*z;
poly_vgl[7] = y*y; poly_vgl[7] = y2;
poly_vgl[8] = y*z; poly_vgl[8] = y*z;
poly_vgl[9] = z*z; poly_vgl[9] = z2;
break; break;
default: default:
rc = qmckl_ao_polynomial_transp_vgl_hpc_inline(context, e_coord, n_coord, rc = qmckl_ao_polynomial_transp_vgl_hpc_inline(context, e_coord, n_coord,
nucleus_max_ang_mom[inucl], nucleus_max_ang_mom[inucl],
&n_poly, powers, (int64_t) 3, &n_poly, powers, (int64_t) 3,
poly_vgl, size_max); poly_vgl, size_max);
assert (rc == QMCKL_SUCCESS); assert (rc == QMCKL_SUCCESS);
break; break;
} }
@ -5656,18 +5666,45 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
/* Compute all exponents */ /* Compute all exponents */
int64_t nidx = 0; int64_t nidx = 0;
for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) { if (double_precision) {
const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
if (v <= cutoff) { for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
ar2[iprim] = v; const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
++nidx; if (v <= cutoff) {
} else { ar2[iprim] = -v;
break; ++nidx;
} } else {
} break;
}
}
IVDEP
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat[iprim] = exp(ar2[iprim]);
}
} else {
for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
if (v <= cutoff) {
ar2_sp[iprim] = (float) -v;
++nidx;
} else {
break;
}
}
IVDEP
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat_sp[iprim] = expf(ar2_sp[iprim]);
}
IVDEP
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat[iprim] = exp_mat_sp[iprim];
}
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat[iprim] = exp(-ar2[iprim]);
} }
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) { for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
@ -5684,20 +5721,27 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
const int64_t ishell_start = nucleus_index[inucl]; const int64_t ishell_start = nucleus_index[inucl];
const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl]; const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl];
const int64_t ipoint_ao_num = ipoint*ao_num;
for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) { for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
const double s1 = ce_mat[ishell-ishell_start]; const double s1 = ce_mat[ishell-ishell_start];
const int64_t k = ao_index[ishell]; const int64_t k = ao_index[ishell];
double* restrict const ao_value_1 = ao_value + ipoint*ao_num + k; double* restrict const ao_value_1 = ao_value + ipoint_ao_num + k;
const int32_t l = shell_ang_mom[ishell]; const int32_t l = shell_ang_mom[ishell];
const int32_t n = lstart[l+1]-lstart[l]; const int32_t n = lstart[l+1]-lstart[l];
if (s1 == 0.0) { if (s1 == 0.0) {
/*
IVDEP
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (int64_t il=0 ; il<n ; ++il) { for (int64_t il=0 ; il<n ; ++il) {
ao_value_1[il] = 0.; ao_value_1[il] = 0.;
} }
*/
continue; continue;
} }
@ -5713,6 +5757,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
ao_value_1[0] = s1 * f[0]; ao_value_1[0] = s1 * f[0];
break; break;
case 3: case 3:
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -5721,6 +5766,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
} }
break; break;
case(6): case(6):
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -5729,6 +5775,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
} }
break; break;
default: default:
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -5738,9 +5785,14 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
break; break;
} }
} else { } else {
/*
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (int64_t il=0 ; il<n ; ++il) { for (int64_t il=0 ; il<n ; ++il) {
ao_value_1[il] = 0.0; ao_value_1[il] = 0.0;
} }
*/
} }
} }
} }
@ -6121,7 +6173,7 @@ function qmckl_compute_ao_vgl_doc(context, &
bind(C) result(info) bind(C) result(info)
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use qmckl_constants use qmckl_constants
use qmckl, only : qmckl_ao_polynomial_vgl use qmckl, only : qmckl_ao_polynomial_vgl, qmckl_get_numprec_epsilon
implicit none implicit none
integer (qmckl_context), intent(in) , value :: context integer (qmckl_context), intent(in) , value :: context
integer (c_int64_t) , intent(in) , value :: ao_num integer (c_int64_t) , intent(in) , value :: ao_num
@ -6172,7 +6224,7 @@ function qmckl_compute_ao_vgl_doc(context, &
info = QMCKL_SUCCESS info = QMCKL_SUCCESS
! Don't compute polynomials when the radial part is zero. ! Don't compute polynomials when the radial part is zero.
cutoff = 27.631021115928547d0 ! -dlog(1.d-12) cutoff = -dlog(qmckl_get_numprec_epsilon(context))
do ipoint = 1, point_num do ipoint = 1, point_num
e_coord(1) = coord(ipoint,1) e_coord(1) = coord(ipoint,1)
@ -6306,9 +6358,11 @@ qmckl_compute_ao_vgl_hpc_gaussian (
{ {
qmckl_exit_code rc; qmckl_exit_code rc;
double ar2[prim_max] __attribute__((aligned(64))); double ar2[prim_max] __attribute__((aligned(64)));
float ar2_sp[prim_max] __attribute__((aligned(64)));
int32_t powers[3*size_max] __attribute__((aligned(64))); int32_t powers[3*size_max] __attribute__((aligned(64)));
double exp_mat[prim_max][8] __attribute__((aligned(64))) ; double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
float exp_mat_sp[prim_max] __attribute__((aligned(64)));
double ce_mat[shell_max][8] __attribute__((aligned(64))) ; double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
int32_t (*coef_mat_sparse_idx)[shell_max][prim_max+1] __attribute__((aligned(64))) = int32_t (*coef_mat_sparse_idx)[shell_max][prim_max+1] __attribute__((aligned(64))) =
@ -6365,7 +6419,10 @@ qmckl_compute_ao_vgl_hpc_gaussian (
const double y = e_coord[1] - n_coord[1]; const double y = e_coord[1] - n_coord[1];
const double z = e_coord[2] - n_coord[2]; const double z = e_coord[2] - n_coord[2];
const double r2 = x*x + y*y + z*z; const double x2 = x*x;
const double y2 = y*y;
const double z2 = z*z;
const double r2 = x2 + y2 + z2;
if (r2 > cutoff * nucleus_range[inucl]) { if (r2 > cutoff * nucleus_range[inucl]) {
continue; continue;
@ -6386,12 +6443,12 @@ qmckl_compute_ao_vgl_hpc_gaussian (
poly_vgl_l2[0][1] = x; poly_vgl_l2[0][1] = x;
poly_vgl_l2[0][2] = y; poly_vgl_l2[0][2] = y;
poly_vgl_l2[0][3] = z; poly_vgl_l2[0][3] = z;
poly_vgl_l2[0][4] = x*x; poly_vgl_l2[0][4] = x2;
poly_vgl_l2[0][5] = x*y; poly_vgl_l2[0][5] = x*y;
poly_vgl_l2[0][6] = x*z; poly_vgl_l2[0][6] = x*z;
poly_vgl_l2[0][7] = y*y; poly_vgl_l2[0][7] = y2;
poly_vgl_l2[0][8] = y*z; poly_vgl_l2[0][8] = y*z;
poly_vgl_l2[0][9] = z*z; poly_vgl_l2[0][9] = z2;
poly_vgl_l2[1][4] = x+x; poly_vgl_l2[1][4] = x+x;
poly_vgl_l2[1][5] = y; poly_vgl_l2[1][5] = y;
poly_vgl_l2[1][6] = z; poly_vgl_l2[1][6] = z;
@ -6405,9 +6462,9 @@ qmckl_compute_ao_vgl_hpc_gaussian (
default: default:
rc = qmckl_ao_polynomial_transp_vgl_hpc_inline(context, e_coord, n_coord, rc = qmckl_ao_polynomial_transp_vgl_hpc_inline(context, e_coord, n_coord,
nucleus_max_ang_mom[inucl], nucleus_max_ang_mom[inucl],
&n_poly, powers, (int64_t) 3, &n_poly, powers, (int64_t) 3,
&(poly_vgl[0][0]), size_max); &(poly_vgl[0][0]), size_max);
assert (rc == QMCKL_SUCCESS); assert (rc == QMCKL_SUCCESS);
break; break;
} }
@ -6415,18 +6472,43 @@ qmckl_compute_ao_vgl_hpc_gaussian (
/* Compute all exponents */ /* Compute all exponents */
int64_t nidx = 0; int64_t nidx = 0;
for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
if (v <= cutoff) {
ar2[iprim] = v;
++nidx;
} else {
break;
}
}
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) { if (double_precision) {
exp_mat[iprim][0] = exp(-ar2[iprim]);
for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
if (v <= cutoff) {
ar2[iprim] = v;
++nidx;
} else {
break;
}
}
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat[iprim][0] = exp(-ar2[iprim]);
}
} else {
for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
if (v <= cutoff) {
ar2_sp[iprim] = (float) v;
++nidx;
} else {
break;
}
}
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat_sp[iprim] = expf(-ar2_sp[iprim]);
}
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
exp_mat[iprim][0] = exp_mat_sp[iprim];
}
} }
for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) { for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
@ -6443,6 +6525,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
// if (do_sparse) { // if (do_sparse) {
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) { for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -6454,6 +6537,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
for (int l=1 ; l<idx[0]; ++l) { for (int l=1 ; l<idx[0]; ++l) {
const int k = idx[l]; const int k = idx[l];
if (k >= nidx) break; if (k >= nidx) break;
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -6467,12 +6551,13 @@ qmckl_compute_ao_vgl_hpc_gaussian (
const int64_t ishell_start = nucleus_index[inucl]; const int64_t ishell_start = nucleus_index[inucl];
const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl]; const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl];
const int64_t ipoint_5_ao_num = ipoint*5*ao_num;
for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) { for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
const double s1 = ce_mat[ishell-ishell_start][0]; const double s1 = ce_mat[ishell-ishell_start][0];
const int64_t k = ao_index[ishell]; const int64_t k = ao_index[ishell];
double* restrict const ao_vgl_1 = ao_vgl + ipoint*5*ao_num + k; double* restrict const ao_vgl_1 = ao_vgl + ipoint_5_ao_num + k;
const int32_t l = shell_ang_mom[ishell]; const int32_t l = shell_ang_mom[ishell];
const int32_t n = lstart[l+1]-lstart[l]; const int32_t n = lstart[l+1]-lstart[l];
@ -6483,6 +6568,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
double* restrict const ao_vgl_5 = ao_vgl_1 + (ao_num<<2); double* restrict const ao_vgl_5 = ao_vgl_1 + (ao_num<<2);
if (s1 == 0.0) { if (s1 == 0.0) {
/*
for (int64_t il=0 ; il<n ; ++il) { for (int64_t il=0 ; il<n ; ++il) {
ao_vgl_1[il] = 0.0; ao_vgl_1[il] = 0.0;
ao_vgl_2[il] = 0.0; ao_vgl_2[il] = 0.0;
@ -6490,6 +6576,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
ao_vgl_4[il] = 0.0; ao_vgl_4[il] = 0.0;
ao_vgl_5[il] = 0.0; ao_vgl_5[il] = 0.0;
} }
*/
continue; continue;
} }
@ -6539,6 +6626,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
ao_vgl_5[0] = s5; ao_vgl_5[0] = s5;
break; break;
case 3: case 3:
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -6554,6 +6642,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
} }
break; break;
case 6: case 6:
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -6569,6 +6658,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
} }
break; break;
default: default:
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -6585,6 +6675,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
break; break;
} }
} else { } else {
/*
for (int64_t il=0 ; il<n ; ++il) { for (int64_t il=0 ; il<n ; ++il) {
ao_vgl_1[il] = 0.0; ao_vgl_1[il] = 0.0;
ao_vgl_2[il] = 0.0; ao_vgl_2[il] = 0.0;
@ -6592,6 +6683,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
ao_vgl_4[il] = 0.0; ao_vgl_4[il] = 0.0;
ao_vgl_5[il] = 0.0; ao_vgl_5[il] = 0.0;
} }
*/
} }
} }
} }

View File

@ -1782,6 +1782,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
const double* restrict avgl4 = avgl1 + (ao_num << 1) + ao_num; const double* restrict avgl4 = avgl1 + (ao_num << 1) + ao_num;
const double* restrict avgl5 = avgl1 + (ao_num << 2); const double* restrict avgl5 = avgl1 + (ao_num << 2);
IVDEP
for (int64_t i=0 ; i<mo_num ; ++i) { for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] = 0.; vgl1[i] = 0.;
vgl2[i] = 0.; vgl2[i] = 0.;
@ -1842,15 +1843,16 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
const double a35 = av5[n+2]; const double a35 = av5[n+2];
const double a45 = av5[n+3]; const double a45 = av5[n+3];
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
for (int64_t i=0 ; i<mo_num ; ++i) { for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] = vgl1[i] + ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41; vgl1[i] += ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
vgl2[i] = vgl2[i] + ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42; vgl2[i] += ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
vgl3[i] = vgl3[i] + ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43; vgl3[i] += ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
vgl4[i] = vgl4[i] + ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44; vgl4[i] += ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
vgl5[i] = vgl5[i] + ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45; vgl5[i] += ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
} }
} }
@ -1862,6 +1864,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
const double a4 = av4[m]; const double a4 = av4[m];
const double a5 = av5[m]; const double a5 = av5[m];
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -2166,6 +2169,7 @@ qmckl_compute_mo_basis_mo_value_cusp_hpc (const qmckl_context context,
const double a31 = av1[n+2]; const double a31 = av1[n+2];
const double a41 = av1[n+3]; const double a41 = av1[n+3];
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -2178,6 +2182,7 @@ qmckl_compute_mo_basis_mo_value_cusp_hpc (const qmckl_context context,
const double* restrict ck = coefficient_t + idx[m]*mo_num; const double* restrict ck = coefficient_t + idx[m]*mo_num;
const double a1 = av1[m]; const double a1 = av1[m];
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -2189,9 +2194,7 @@ qmckl_compute_mo_basis_mo_value_cusp_hpc (const qmckl_context context,
for (int64_t inucl=0 ; inucl<nucl_num ; ++inucl) { for (int64_t inucl=0 ; inucl<nucl_num ; ++inucl) {
if (ria[inucl] < r_cusp[inucl]) { if (ria[inucl] < r_cusp[inucl]) {
const double r = ria[inucl]; const double r = ria[inucl];
#ifdef HAVE_OPENMP IVDEP
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) { for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*( vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*(
qmckl_ten3(cusp_param,i,1,inucl) + r*( qmckl_ten3(cusp_param,i,1,inucl) + r*(
@ -2591,6 +2594,7 @@ qmckl_compute_mo_basis_mo_vgl_cusp_hpc (const qmckl_context context,
const double a35 = av5[n+2]; const double a35 = av5[n+2];
const double a45 = av5[n+3]; const double a45 = av5[n+3];
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -2611,6 +2615,7 @@ qmckl_compute_mo_basis_mo_vgl_cusp_hpc (const qmckl_context context,
const double a4 = av4[m]; const double a4 = av4[m];
const double a5 = av5[m]; const double a5 = av5[m];
IVDEP
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd #pragma omp simd
#endif #endif
@ -2633,9 +2638,7 @@ qmckl_compute_mo_basis_mo_vgl_cusp_hpc (const qmckl_context context,
qmckl_mat(point_coord,ipoint,2) - qmckl_mat(nucl_coord,inucl,2) }; qmckl_mat(point_coord,ipoint,2) - qmckl_mat(nucl_coord,inucl,2) };
const double r_inv = 1./r; const double r_inv = 1./r;
#ifdef HAVE_OPENMP IVDEP
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) { for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*( vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*(
qmckl_ten3(cusp_param,i,1,inucl) + r*( qmckl_ten3(cusp_param,i,1,inucl) + r*(

View File

@ -86,8 +86,8 @@ trapfpe ()
default parameters determining the target numerical precision and default parameters determining the target numerical precision and
range are defined. Following the IEEE Standard for Floating-Point range are defined. Following the IEEE Standard for Floating-Point
Arithmetic (IEEE 754), Arithmetic (IEEE 754),
/precision/ refers to the number of significand bits and /range/ /precision/ refers to the number of significand bits (including the
refers to the number of exponent bits. sign bit) and /range/ refers to the number of exponent bits.
#+NAME: table-precision #+NAME: table-precision
| ~QMCKL_DEFAULT_PRECISION~ | 53 | | ~QMCKL_DEFAULT_PRECISION~ | 53 |
@ -340,7 +340,7 @@ double qmckl_get_numprec_epsilon(const qmckl_context context);
#+begin_src c :tangle (eval c) #+begin_src c :tangle (eval c)
double qmckl_get_numprec_epsilon(const qmckl_context context) { double qmckl_get_numprec_epsilon(const qmckl_context context) {
const int precision = qmckl_get_numprec_precision(context); const int precision = qmckl_get_numprec_precision(context);
return 1. / (double) (1L << (precision-2)); return 1. / (double) ( ((uint64_t) 1) << (precision-2));
} }
#+end_src #+end_src
@ -355,6 +355,53 @@ double qmckl_get_numprec_epsilon(const qmckl_context context) {
end interface end interface
#+end_src #+end_src
* Approximate functions
** Exponential
Fast exponential function, adapted from Johan Rade's implementation
(https://gist.github.com/jrade/293a73f89dfef51da6522428c857802d). It
is based on Schraudolph's paper:
N. Schraudolph, "A Fast, Compact Approximation of the Exponential Function",
/Neural Computation/ *11*, 853862 (1999).
(available at https://nic.schraudolph.org/pubs/Schraudolph99.pdf)
#+begin_src c :tangle (eval c)
float fastExpf(float x)
{
const float a = 12102203.0;
const float b = 1064986816.0;
x = a * x + b;
const float c = 8388608.0;
const float d = 2139095040.0;
if (x < c || x > d)
x = (x < c) ? 0.0f : d;
uint32_t n = (uint32_t) x;
memcpy(&x, &n, 4);
return x;
}
double fastExp(double x)
{
const double a = 6497320848556798.0;
const double b = 4606985713057410560.0;
x = a * x + b;
const double c = 4503599627370496.0;
const double d = 9218868437227405312.0;
if (x < c || x > d)
x = (x < c) ? 0.0 : d;
uint64_t n = (uint64_t) x;
memcpy(&x, &n, 8);
return x;
}
#+end_src
* End of files :noexport: * End of files :noexport:
#+begin_src c :comments link :tangle (eval h_private_type) #+begin_src c :comments link :tangle (eval h_private_type)