From 6bf9388a4edf3419f2ebccdfafaacecf3895d51a Mon Sep 17 00:00:00 2001
From: Anthony Scemama <scemama@irsamc.ups-tlse.fr>
Date: Tue, 28 Nov 2023 17:00:39 +0100
Subject: [PATCH] Added control of precision in AOs

---
 configure.ac          |   2 +-
 org/qmckl_ao.org      | 256 ++++++++++++++++++++++++++++--------------
 org/qmckl_mo.org      |  25 +++--
 org/qmckl_numprec.org |  53 ++++++++-
 4 files changed, 239 insertions(+), 97 deletions(-)

diff --git a/configure.ac b/configure.ac
index 60e2661..49e9fc3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,7 +66,7 @@ AC_ARG_WITH([icx],
 
 AS_IF([test "x$with_icx" = "xyes"], [
   CC=icx
-  CFLAGS="-march=native -ip -Ofast -ftz -finline -g -mkl=sequential" ])
+  CFLAGS="-march=native -Ofast -ftz -finline -g -qmkl=sequential" ])
 
 AS_IF([test "x$with_icx.$with_ifort" = "xyes.yes"], [
   ax_blas_ok="yes"
diff --git a/org/qmckl_ao.org b/org/qmckl_ao.org
index 10fefa7..5796127 100644
--- a/org/qmckl_ao.org
+++ b/org/qmckl_ao.org
@@ -2684,7 +2684,7 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) {
   }
 
   /* Find distance beyond which all AOs are zero.
-     The distance is obtained by sqrt(log(cutoff)*range) */
+     The distance is obtained by sqrt(-log(epsilon)*range) */
   {
     if (ctx->ao_basis.type == 'G') {
       qmckl_memory_info_struct mem_info = qmckl_memory_info_struct_zero;
@@ -5349,23 +5349,23 @@ for (int32_t ldl=3 ; ldl<=5 ; ++ldl) {
    :END:
 *** Unoptimized version
      #+NAME: qmckl_ao_value_args_doc
-    | Variable              | Type                              | In/Out | Description                                  |
-    |-----------------------+-----------------------------------+--------+----------------------------------------------|
-    | ~context~             | ~qmckl_context~                   | in     | Global state                                 |
-    | ~ao_num~              | ~int64_t~                         | in     | Number of AOs                                |
-    | ~shell_num~           | ~int64_t~                         | in     | Number of shells                             |
-    | ~point_num~           | ~int64_t~                         | in     | Number of points                             |
-    | ~nucl_num~            | ~int64_t~                         | in     | Number of nuclei                             |
-    | ~coord~               | ~double[3][point_num]~            | in     | Coordinates                                  |
-    | ~nucl_coord~          | ~double[3][nucl_num]~             | in     | Nuclear  coordinates                         |
-    | ~nucleus_index~       | ~int64_t[nucl_num]~               | in     | Index of the 1st shell of each nucleus       |
-    | ~nucleus_shell_num~   | ~int64_t[nucl_num]~               | in     | Number of shells per nucleus                 |
-    | ~nucleus_range~       | ~double[nucl_num]~                | in     | Range beyond which all is zero               |
-    | ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~               | in     | Maximum angular momentum per nucleus         |
-    | ~shell_ang_mom~       | ~int32_t[shell_num]~              | in     | Angular momentum of each shell               |
-    | ~ao_factor~           | ~double[ao_num]~                  | in     | Normalization factor of the AOs              |
-    | ~shell_vgl~           | ~double[point_num][5][shell_num]~ | in     | Value, gradients and Laplacian of the shells |
-    | ~ao_value~            | ~double[point_num][ao_num]~       | out    | Values of the AOs                            |
+    | Variable              | Type                              | In/Out | Description                                              |
+    |-----------------------+-----------------------------------+--------+----------------------------------------------------------|
+    | ~context~             | ~qmckl_context~                   | in     | Global state                                             |
+    | ~ao_num~              | ~int64_t~                         | in     | Number of AOs                                            |
+    | ~shell_num~           | ~int64_t~                         | in     | Number of shells                                         |
+    | ~point_num~           | ~int64_t~                         | in     | Number of points                                         |
+    | ~nucl_num~            | ~int64_t~                         | in     | Number of nuclei                                         |
+    | ~coord~               | ~double[3][point_num]~            | in     | Coordinates                                              |
+    | ~nucl_coord~          | ~double[3][nucl_num]~             | in     | Nuclear  coordinates                                     |
+    | ~nucleus_index~       | ~int64_t[nucl_num]~               | in     | Index of the 1st shell of each nucleus                   |
+    | ~nucleus_shell_num~   | ~int64_t[nucl_num]~               | in     | Number of shells per nucleus                             |
+    | ~nucleus_range~       | ~double[nucl_num]~                | in     | Range beyond which all is zero                           |
+    | ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~               | in     | Maximum angular momentum per nucleus                     |
+    | ~shell_ang_mom~       | ~int32_t[shell_num]~              | in     | Angular momentum of each shell                           |
+    | ~ao_factor~           | ~double[ao_num]~                  | in     | Normalization factor of the AOs                          |
+    | ~shell_vgl~           | ~double[point_num][5][shell_num]~ | in     | Value, gradients and Laplacian of the shells             |
+    | ~ao_value~            | ~double[point_num][ao_num]~       | out    | Values of the AOs                                        |
 
      #+begin_src f90 :comments org :tangle (eval f) :noweb yes
 function qmckl_compute_ao_value_doc(context, &
@@ -5376,7 +5376,7 @@ function qmckl_compute_ao_value_doc(context, &
      bind(C) result(info)
   use, intrinsic :: iso_c_binding
   use qmckl_constants
-  use qmckl, only: qmckl_ao_polynomial_vgl
+  use qmckl, only: qmckl_ao_polynomial_vgl, qmckl_get_numprec_epsilon
   implicit none
   integer (qmckl_context), intent(in)  , value :: context
   integer (c_int64_t) , intent(in)  , value :: ao_num
@@ -5428,7 +5428,7 @@ function qmckl_compute_ao_value_doc(context, &
   info = QMCKL_SUCCESS
 
   ! Don't compute polynomials when the radial part is zero.
-  cutoff = 27.631021115928547d0  !-dlog(1.d-12)
+  cutoff = -dlog(qmckl_get_numprec_epsilon(context))
 
   do ipoint = 1, point_num
      e_coord(1) = coord(ipoint,1)
@@ -5478,27 +5478,27 @@ end function qmckl_compute_ao_value_doc
 
 *** HPC version
      #+NAME: qmckl_ao_value_args_hpc_gaussian
-    | Variable              | Type                        | In/Out | Description                                  |
-    |-----------------------+-----------------------------+--------+----------------------------------------------|
-    | ~context~             | ~qmckl_context~             | in     | Global state                                 |
-    | ~ao_num~              | ~int64_t~                   | in     | Number of AOs                                |
-    | ~shell_num~           | ~int64_t~                   | in     | Number of shells                             |
-    | ~prim_num~            | ~int64_t~                   | in     | Number of primitives                         |
-    | ~point_num~           | ~int64_t~                   | in     | Number of points                             |
-    | ~nucl_num~            | ~int64_t~                   | in     | Number of nuclei                             |
-    | ~coord~               | ~double[3][point_num]~      | in     | Coordinates                                  |
-    | ~nucl_coord~          | ~double[3][nucl_num]~       | in     | Nuclear  coordinates                         |
-    | ~nucleus_index~       | ~int64_t[nucl_num]~         | in     | Index of the 1st shell of each nucleus       |
-    | ~nucleus_shell_num~   | ~int64_t[nucl_num]~         | in     | Number of shells per nucleus                 |
-    | ~nucleus_range~       | ~double[nucl_num]~          | in     | Range beyond which all is zero               |
-    | ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~         | in     | Maximum angular momentum per nucleus         |
-    | ~shell_ang_mom~       | ~int32_t[shell_num]~        | in     | Angular momentum of each shell               |
-    | ~shell_prim_index~    | ~int64_t[shell_num]~        | in     | Index of the 1st primitive of each shell     |
-    | ~shell_prim_num~      | ~int64_t[shell_num]~        | in     | Number of primitives per shell               |
-    | ~ao_factor~           | ~double[ao_num]~            | in     | Normalization factor of the AOs              |
-    | ~ao_expo~             | ~double[prim_num]~          | in     | Value, gradients and Laplacian of the shells |
-    | ~coef_normalized~     | ~double[prim_num]~          | in     | Value, gradients and Laplacian of the shells |
-    | ~ao_value~            | ~double[point_num][ao_num]~ | out    | Values of the AOs                            |
+    | Variable              | Type                        | In/Out | Description                                              |
+    |-----------------------+-----------------------------+--------+----------------------------------------------------------|
+    | ~context~             | ~qmckl_context~             | in     | Global state                                             |
+    | ~ao_num~              | ~int64_t~                   | in     | Number of AOs                                            |
+    | ~shell_num~           | ~int64_t~                   | in     | Number of shells                                         |
+    | ~prim_num~            | ~int64_t~                   | in     | Number of primitives                                     |
+    | ~point_num~           | ~int64_t~                   | in     | Number of points                                         |
+    | ~nucl_num~            | ~int64_t~                   | in     | Number of nuclei                                         |
+    | ~coord~               | ~double[3][point_num]~      | in     | Coordinates                                              |
+    | ~nucl_coord~          | ~double[3][nucl_num]~       | in     | Nuclear  coordinates                                     |
+    | ~nucleus_index~       | ~int64_t[nucl_num]~         | in     | Index of the 1st shell of each nucleus                   |
+    | ~nucleus_shell_num~   | ~int64_t[nucl_num]~         | in     | Number of shells per nucleus                             |
+    | ~nucleus_range~       | ~double[nucl_num]~          | in     | Range beyond which all is zero                           |
+    | ~nucleus_max_ang_mom~ | ~int32_t[nucl_num]~         | in     | Maximum angular momentum per nucleus                     |
+    | ~shell_ang_mom~       | ~int32_t[shell_num]~        | in     | Angular momentum of each shell                           |
+    | ~shell_prim_index~    | ~int64_t[shell_num]~        | in     | Index of the 1st primitive of each shell                 |
+    | ~shell_prim_num~      | ~int64_t[shell_num]~        | in     | Number of primitives per shell                           |
+    | ~ao_factor~           | ~double[ao_num]~            | in     | Normalization factor of the AOs                          |
+    | ~ao_expo~             | ~double[prim_num]~          | in     | Value, gradients and Laplacian of the shells             |
+    | ~coef_normalized~     | ~double[prim_num]~          | in     | Value, gradients and Laplacian of the shells             |
+    | ~ao_value~            | ~double[point_num][ao_num]~ | out    | Values of the AOs                                        |
 
      #+NAME:ao_value_constants
      #+begin_src c :exports none
@@ -5530,8 +5530,13 @@ end function qmckl_compute_ao_value_doc
     ao_index[shell_num] = ao_num+1;
   }
 
+  qmckl_context_struct* const ctx = (qmckl_context_struct*) context;
+  assert (ctx != NULL);
+
   /* Don't compute polynomials when the radial part is zero. */
-  const double cutoff = 27.631021115928547; // -log(1.e-12)
+  const int precision = ctx->numprec.precision;
+  const double cutoff = log( (double) ( ((uint64_t) 1) << (precision-2) ) );
+  const bool double_precision = precision > 23;
 
      #+end_src
 
@@ -5566,10 +5571,12 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
   {
     qmckl_exit_code rc;
     double ar2[prim_max] __attribute__((aligned(64)));
+    float  ar2_sp[prim_max] __attribute__((aligned(64)));
     int32_t powers[3*size_max] __attribute__((aligned(64)));
     double  poly_vgl[5*size_max] __attribute__((aligned(64)));
 
     double exp_mat[prim_max] __attribute__((aligned(64)));
+    float  exp_mat_sp[prim_max] __attribute__((aligned(64)));
     double ce_mat[shell_max] __attribute__((aligned(64)));
 
     int32_t (*coef_mat_sparse_idx)[shell_max][prim_max+1] __attribute__((aligned(64))) = 
@@ -5613,7 +5620,10 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
         const double y = e_coord[1] - n_coord[1];
         const double z = e_coord[2] - n_coord[2];
 
-        const double r2 = x*x + y*y + z*z;
+        const double x2 = x*x;
+        const double y2 = y*y;
+        const double z2 = z*z;
+        const double r2 = x2 + y2 + z2;
 
         if (r2 > cutoff * nucleus_range[inucl]) {
           continue;
@@ -5636,19 +5646,19 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
           poly_vgl[1] = x;
           poly_vgl[2] = y;
           poly_vgl[3] = z;
-          poly_vgl[4] = x*x;
+          poly_vgl[4] = x2;
           poly_vgl[5] = x*y;
           poly_vgl[6] = x*z;
-          poly_vgl[7] = y*y;
+          poly_vgl[7] = y2;
           poly_vgl[8] = y*z;
-          poly_vgl[9] = z*z;
+          poly_vgl[9] = z2;
           break;
 
         default:
           rc = qmckl_ao_polynomial_transp_vgl_hpc_inline(context, e_coord, n_coord,
-                                                  nucleus_max_ang_mom[inucl],
-                                                  &n_poly, powers, (int64_t) 3,
-                                                  poly_vgl, size_max);
+                                                         nucleus_max_ang_mom[inucl],
+                                                         &n_poly, powers, (int64_t) 3,
+                                                         poly_vgl, size_max);
           assert (rc == QMCKL_SUCCESS);
           break;
         }
@@ -5656,18 +5666,45 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
         /* Compute all exponents */
 
         int64_t nidx = 0;
-        for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
-          const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
-          if (v <= cutoff) {
-            ar2[iprim] = v;
-            ++nidx;
-          } else {
-            break;
-          }
-        }
+        if (double_precision) {
+
+          for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
+            const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
+            if (v <= cutoff) {
+              ar2[iprim] = -v;
+              ++nidx;
+            } else {
+              break;
+            }
+          }
+
+IVDEP
+          for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+            exp_mat[iprim] = exp(ar2[iprim]);
+          }
+
+        } else {
+
+          for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
+            const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
+            if (v <= cutoff) {
+              ar2_sp[iprim] = (float) -v;
+              ++nidx;
+            } else {
+              break;
+            }
+          }
+
+IVDEP
+          for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+            exp_mat_sp[iprim] = expf(ar2_sp[iprim]);
+          }
+
+IVDEP
+          for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+            exp_mat[iprim] = exp_mat_sp[iprim];
+          }
 
-        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
-          exp_mat[iprim] = exp(-ar2[iprim]);
         }
 
         for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
@@ -5684,20 +5721,27 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
         const int64_t ishell_start = nucleus_index[inucl];
         const int64_t ishell_end   = nucleus_index[inucl] + nucleus_shell_num[inucl];
 
+        const int64_t ipoint_ao_num = ipoint*ao_num;
         for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
 
           const double s1 = ce_mat[ishell-ishell_start];
 
           const int64_t k = ao_index[ishell];
-          double* restrict const ao_value_1 = ao_value + ipoint*ao_num + k;
+          double* restrict const ao_value_1 = ao_value + ipoint_ao_num + k;
 
           const int32_t l = shell_ang_mom[ishell];
           const int32_t n = lstart[l+1]-lstart[l];
 
           if (s1 == 0.0) {
+/*
+IVDEP
+#ifdef HAVE_OPENMP
+#pragma omp simd
+#endif
             for (int64_t il=0 ; il<n ; ++il) {
               ao_value_1[il] = 0.;
             }
+*/
             continue;
           }
 
@@ -5713,6 +5757,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
               ao_value_1[0] = s1 * f[0];
               break;
             case 3:
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -5721,6 +5766,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
               }
               break;
             case(6):
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -5729,6 +5775,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
               }
               break;
             default:
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -5738,9 +5785,14 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
               break;
             }
           } else {
+/*
+#ifdef HAVE_OPENMP
+#pragma omp simd
+#endif
             for (int64_t il=0 ; il<n ; ++il) {
               ao_value_1[il] = 0.0;
             }
+*/
           }
         }
       }
@@ -6121,7 +6173,7 @@ function qmckl_compute_ao_vgl_doc(context, &
      bind(C) result(info)
   use, intrinsic :: iso_c_binding
   use qmckl_constants
-  use qmckl, only : qmckl_ao_polynomial_vgl
+  use qmckl, only : qmckl_ao_polynomial_vgl, qmckl_get_numprec_epsilon
   implicit none
   integer (qmckl_context), intent(in)  , value :: context
   integer (c_int64_t) , intent(in)  , value :: ao_num
@@ -6172,7 +6224,7 @@ function qmckl_compute_ao_vgl_doc(context, &
   info = QMCKL_SUCCESS
 
   ! Don't compute polynomials when the radial part is zero.
-  cutoff = 27.631021115928547d0  ! -dlog(1.d-12)
+  cutoff = -dlog(qmckl_get_numprec_epsilon(context))
 
   do ipoint = 1, point_num
      e_coord(1) = coord(ipoint,1)
@@ -6306,9 +6358,11 @@ qmckl_compute_ao_vgl_hpc_gaussian (
   {
     qmckl_exit_code rc;
     double ar2[prim_max]  __attribute__((aligned(64)));
+    float  ar2_sp[prim_max] __attribute__((aligned(64)));
     int32_t powers[3*size_max]  __attribute__((aligned(64)));
 
     double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
+    float  exp_mat_sp[prim_max] __attribute__((aligned(64)));
     double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
 
     int32_t (*coef_mat_sparse_idx)[shell_max][prim_max+1] __attribute__((aligned(64))) = 
@@ -6365,7 +6419,10 @@ qmckl_compute_ao_vgl_hpc_gaussian (
         const double y = e_coord[1] - n_coord[1];
         const double z = e_coord[2] - n_coord[2];
 
-        const double r2 = x*x + y*y + z*z;
+        const double x2 = x*x;
+        const double y2 = y*y;
+        const double z2 = z*z;
+        const double r2 = x2 + y2 + z2;
 
         if (r2 > cutoff * nucleus_range[inucl]) {
           continue;
@@ -6386,12 +6443,12 @@ qmckl_compute_ao_vgl_hpc_gaussian (
           poly_vgl_l2[0][1] = x;
           poly_vgl_l2[0][2] = y;
           poly_vgl_l2[0][3] = z;
-          poly_vgl_l2[0][4] = x*x;
+          poly_vgl_l2[0][4] = x2;
           poly_vgl_l2[0][5] = x*y;
           poly_vgl_l2[0][6] = x*z;
-          poly_vgl_l2[0][7] = y*y;
+          poly_vgl_l2[0][7] = y2;
           poly_vgl_l2[0][8] = y*z;
-          poly_vgl_l2[0][9] = z*z;
+          poly_vgl_l2[0][9] = z2;
           poly_vgl_l2[1][4] = x+x;
           poly_vgl_l2[1][5] = y;
           poly_vgl_l2[1][6] = z;
@@ -6405,9 +6462,9 @@ qmckl_compute_ao_vgl_hpc_gaussian (
 
         default:
           rc = qmckl_ao_polynomial_transp_vgl_hpc_inline(context, e_coord, n_coord,
-                                                  nucleus_max_ang_mom[inucl],
-                                                  &n_poly, powers, (int64_t) 3,
-                                                  &(poly_vgl[0][0]), size_max);
+                                                         nucleus_max_ang_mom[inucl],
+                                                         &n_poly, powers, (int64_t) 3,
+                                                         &(poly_vgl[0][0]), size_max);
           assert (rc == QMCKL_SUCCESS);
           break;
         }
@@ -6415,18 +6472,43 @@ qmckl_compute_ao_vgl_hpc_gaussian (
         /* Compute all exponents */
 
         int64_t nidx = 0;
-        for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
-          const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
-          if (v <= cutoff) {
-            ar2[iprim] = v;
-            ++nidx;
-          } else {
-            break;
-          }
-        }
 
-        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
-          exp_mat[iprim][0] = exp(-ar2[iprim]);
+        if (double_precision) {
+
+          for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
+            const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
+            if (v <= cutoff) {
+              ar2[iprim] = v;
+              ++nidx;
+            } else {
+              break;
+            }
+          }
+
+          for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+            exp_mat[iprim][0] = exp(-ar2[iprim]);
+          }
+
+        } else {
+
+          for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
+            const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
+            if (v <= cutoff) {
+              ar2_sp[iprim] = (float) v;
+              ++nidx;
+            } else {
+              break;
+            }
+          }
+
+          for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+            exp_mat_sp[iprim] = expf(-ar2_sp[iprim]);
+          }
+
+          for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+            exp_mat[iprim][0] = exp_mat_sp[iprim];
+          }
+
         }
 
         for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
@@ -6443,6 +6525,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
 
         //        if (do_sparse) {
         for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -6454,6 +6537,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
           for (int l=1 ; l<idx[0]; ++l) {
             const int k = idx[l];
             if (k >= nidx) break;
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -6467,12 +6551,13 @@ qmckl_compute_ao_vgl_hpc_gaussian (
         const int64_t ishell_start = nucleus_index[inucl];
         const int64_t ishell_end   = nucleus_index[inucl] + nucleus_shell_num[inucl];
 
+        const int64_t ipoint_5_ao_num = ipoint*5*ao_num;
         for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
 
           const double s1 = ce_mat[ishell-ishell_start][0];
 
           const int64_t k = ao_index[ishell];
-          double* restrict const ao_vgl_1 = ao_vgl + ipoint*5*ao_num + k;
+          double* restrict const ao_vgl_1 = ao_vgl + ipoint_5_ao_num + k;
 
           const int32_t l = shell_ang_mom[ishell];
           const int32_t n = lstart[l+1]-lstart[l];
@@ -6483,6 +6568,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
           double* restrict const ao_vgl_5 = ao_vgl_1 + (ao_num<<2);
 
           if (s1 == 0.0) {
+/*
             for (int64_t il=0 ; il<n ; ++il) {
               ao_vgl_1[il] = 0.0;
               ao_vgl_2[il] = 0.0;
@@ -6490,6 +6576,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
               ao_vgl_4[il] = 0.0;
               ao_vgl_5[il] = 0.0;
             }
+*/
             continue;
           }
 
@@ -6539,6 +6626,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
               ao_vgl_5[0] = s5;
               break;
             case 3:
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -6554,6 +6642,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
               }
               break;
             case 6:
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -6569,6 +6658,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
               }
               break;
             default:
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -6585,6 +6675,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
               break;
             }
           } else {
+/*
             for (int64_t il=0 ; il<n ; ++il) {
               ao_vgl_1[il] = 0.0;
               ao_vgl_2[il] = 0.0;
@@ -6592,6 +6683,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
               ao_vgl_4[il] = 0.0;
               ao_vgl_5[il] = 0.0;
             }
+*/
           }
         }
       }
diff --git a/org/qmckl_mo.org b/org/qmckl_mo.org
index d8240cd..64bafa3 100644
--- a/org/qmckl_mo.org
+++ b/org/qmckl_mo.org
@@ -1782,6 +1782,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
     const double* restrict avgl4 = avgl1 + (ao_num << 1) + ao_num;
     const double* restrict avgl5 = avgl1 + (ao_num << 2);
 
+IVDEP
     for (int64_t i=0 ; i<mo_num ; ++i) {
       vgl1[i] = 0.;
       vgl2[i] = 0.;
@@ -1842,15 +1843,16 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
       const double a35 = av5[n+2];
       const double a45 = av5[n+3];
 
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
       for (int64_t i=0 ; i<mo_num ; ++i) {
-        vgl1[i] = vgl1[i] + ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
-        vgl2[i] = vgl2[i] + ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
-        vgl3[i] = vgl3[i] + ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
-        vgl4[i] = vgl4[i] + ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
-        vgl5[i] = vgl5[i] + ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
+        vgl1[i] += ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
+        vgl2[i] += ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
+        vgl3[i] += ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
+        vgl4[i] += ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
+        vgl5[i] += ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
       }
     }
 
@@ -1862,6 +1864,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
       const double a4 = av4[m];
       const double a5 = av5[m];
 
+IVDEP
 #ifdef HAVE_OPENMP
   #pragma omp simd
 #endif
@@ -2166,6 +2169,7 @@ qmckl_compute_mo_basis_mo_value_cusp_hpc (const qmckl_context context,
       const double a31 = av1[n+2];
       const double a41 = av1[n+3];
 
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -2178,6 +2182,7 @@ qmckl_compute_mo_basis_mo_value_cusp_hpc (const qmckl_context context,
       const double* restrict ck = coefficient_t + idx[m]*mo_num;
       const double a1 = av1[m];
 
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -2189,9 +2194,7 @@ qmckl_compute_mo_basis_mo_value_cusp_hpc (const qmckl_context context,
     for (int64_t inucl=0 ; inucl<nucl_num ; ++inucl) {
       if (ria[inucl] < r_cusp[inucl]) {
         const double r = ria[inucl];
-#ifdef HAVE_OPENMP
-#pragma omp simd
-#endif
+IVDEP
         for (int64_t i=0 ; i<mo_num ; ++i) {
           vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*(
                      qmckl_ten3(cusp_param,i,1,inucl) + r*(
@@ -2591,6 +2594,7 @@ qmckl_compute_mo_basis_mo_vgl_cusp_hpc (const qmckl_context context,
       const double a35 = av5[n+2];
       const double a45 = av5[n+3];
 
+IVDEP
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@@ -2611,6 +2615,7 @@ qmckl_compute_mo_basis_mo_vgl_cusp_hpc (const qmckl_context context,
       const double a4 = av4[m];
       const double a5 = av5[m];
 
+IVDEP
 #ifdef HAVE_OPENMP
   #pragma omp simd
 #endif
@@ -2633,9 +2638,7 @@ qmckl_compute_mo_basis_mo_vgl_cusp_hpc (const qmckl_context context,
           qmckl_mat(point_coord,ipoint,2) - qmckl_mat(nucl_coord,inucl,2) };
         const double r_inv = 1./r;
 
-#ifdef HAVE_OPENMP
-#pragma omp simd
-#endif
+IVDEP
         for (int64_t i=0 ; i<mo_num ; ++i) {
           vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*(
                      qmckl_ten3(cusp_param,i,1,inucl) + r*(
diff --git a/org/qmckl_numprec.org b/org/qmckl_numprec.org
index c3522be..decb073 100644
--- a/org/qmckl_numprec.org
+++ b/org/qmckl_numprec.org
@@ -86,8 +86,8 @@ trapfpe ()
  default parameters determining the target numerical precision and
  range are defined. Following the IEEE Standard for Floating-Point
  Arithmetic (IEEE 754),
- /precision/ refers to the number of significand bits and /range/
- refers to the number of exponent bits.
+ /precision/ refers to the number of significand bits (including the
+ sign bit) and /range/ refers to the number of exponent bits.
 
  #+NAME: table-precision
  | ~QMCKL_DEFAULT_PRECISION~ | 53 |
@@ -340,7 +340,7 @@ double qmckl_get_numprec_epsilon(const qmckl_context context);
   #+begin_src c :tangle (eval c)
 double qmckl_get_numprec_epsilon(const qmckl_context context) {
   const int precision = qmckl_get_numprec_precision(context);
-  return 1. /  (double) (1L << (precision-2));
+  return 1. /  (double) ( ((uint64_t) 1) << (precision-2));
 }
   #+end_src
 
@@ -355,6 +355,53 @@ double qmckl_get_numprec_epsilon(const qmckl_context context) {
   end interface
   #+end_src
 
+* Approximate functions
+  
+** Exponential
+
+   Fast exponential function, adapted from Johan Rade's implementation
+   (https://gist.github.com/jrade/293a73f89dfef51da6522428c857802d). It
+   is based on Schraudolph's paper:
+
+        N. Schraudolph, "A Fast, Compact Approximation of the Exponential Function",
+        /Neural Computation/ *11*, 853–862 (1999).
+        (available at https://nic.schraudolph.org/pubs/Schraudolph99.pdf)
+   
+   #+begin_src c :tangle (eval c)
+float fastExpf(float x)
+{
+  const float a = 12102203.0;
+  const float b = 1064986816.0;
+  x = a * x + b;
+  
+  const float c = 8388608.0;
+  const float d = 2139095040.0;
+  if (x < c || x > d)
+    x = (x < c) ? 0.0f : d;
+  
+  uint32_t n = (uint32_t) x;
+  memcpy(&x, &n, 4);
+  return x;
+}
+
+
+double fastExp(double x)
+{
+  const double a = 6497320848556798.0;
+  const double b = 4606985713057410560.0;
+  x = a * x + b;
+
+  const double c = 4503599627370496.0;
+  const double d = 9218868437227405312.0;
+  if (x < c || x > d)
+    x = (x < c) ? 0.0 : d;
+  
+  uint64_t n = (uint64_t) x;
+  memcpy(&x, &n, 8);
+  return x;
+}
+   #+end_src
+
 * End of files                                                     :noexport:
 
   #+begin_src c :comments link :tangle (eval h_private_type)