Faster AOs

2024-12-22 04:14:49 +01:00 · 2022-03-21 18:32:39 +01:00 · 2022-03-21 18:32:39 +01:00 · 5ecb1d6326
commit 5ecb1d6326
parent 9124c9209a
3 changed files with 463 additions and 411 deletions
--- a/org/qmckl_ao.org
+++ b/org/qmckl_ao.org
@ -48,6 +48,7 @@ gradients and Laplacian of the atomic basis functions.
  #+begin_src c :tangle (eval h_private_func)
 #ifndef QMCKL_AO_HPF
 #define QMCKL_AO_HPF
+#include "qmckl_blas_private_type.h"

  #+end_src

@ -57,6 +58,7 @@ gradients and Laplacian of the atomic basis functions.

 #include <stdbool.h>
 #include <stdio.h>
+#include "qmckl_blas_private_type.h"
  #+end_src

  #+begin_src f90 :tangle (eval f) :noweb yes
@ -105,6 +107,7 @@ int main() {
 #include "qmckl.h"
 #include "qmckl_context_private_type.h"
 #include "qmckl_memory_private_type.h"
+#include "qmckl_blas_private_type.h"
 #include "qmckl_memory_private_func.h"
 #include "qmckl_ao_private_type.h"
 #include "qmckl_ao_private_func.h"
@ -274,26 +277,26 @@ typedef struct qmckl_ao_basis_struct {
  int64_t            shell_num;
  int64_t            prim_num;
  int64_t            ao_num;
-  int64_t * nucleus_index;
-  int64_t * nucleus_shell_num;
-  int32_t * shell_ang_mom;
-  int64_t * shell_prim_num;
-  int64_t * shell_prim_index;
-  double  * shell_factor;
-  double  * exponent;
-  double  * coefficient;
-  double  * prim_factor;
-  double  * ao_factor;
+  int64_t * restrict nucleus_index;
+  int64_t * restrict nucleus_shell_num;
+  int32_t * restrict shell_ang_mom;
+  int64_t * restrict shell_prim_num;
+  int64_t * restrict shell_prim_index;
+  double  * restrict shell_factor;
+  double  * restrict exponent;
+  double  * restrict coefficient;
+  double  * restrict prim_factor;
+  double  * restrict ao_factor;

-  int64_t * nucleus_prim_index;
-  double  * coefficient_normalized;
-  int32_t * nucleus_max_ang_mom;
-  double  * nucleus_range;
-  double  * primitive_vgl;
+  int64_t * restrict nucleus_prim_index;
+  double  * restrict coefficient_normalized;
+  int32_t * restrict nucleus_max_ang_mom;
+  double  * restrict nucleus_range;
+  double  * restrict primitive_vgl;
  uint64_t           primitive_vgl_date;
-  double  * shell_vgl;
+  double  * restrict shell_vgl;
  uint64_t           shell_vgl_date;
-  double  * ao_vgl;
+  double  * restrict ao_vgl;
  uint64_t           ao_vgl_date;

  int32_t            uninitialized;
@ -2642,8 +2645,8 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) {


    For faster access, we provide extra arrays for the shell information as:
-    - $C_{psa}$ = =coef_per_nucleus (iprim, ishell, inucl)=
-    - $\gamma_{pa}$ =expo_per_nucleus (iprim, inucl)=
+    - $C_{psa}$ = =coef_per_nucleus[inucl][ishell][iprim]=
+    - $\gamma_{pa}$ =expo_per_nucleus[inucl][iprim]=

    such that the computation of the radial parts can be vectorized.

@ -2660,7 +2663,7 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) {
    #+NAME:HPC_struct
    #+begin_src c :comments org :exports none
 /* HPC specific data structures */
- int32_t*     prim_num_per_nucleus;
+ int32_t*  restrict   prim_num_per_nucleus;
 qmckl_tensor         coef_per_nucleus;
 qmckl_matrix         expo_per_nucleus;
    #+end_src
@ -2681,8 +2684,8 @@ struct combined {
 /* Comparison function */
 int compare_basis( const void * l, const void * r )
 {
-  const struct combined * _l= (const struct combined *)l;
-  const struct combined * _r= (const struct combined *)r;
+  const struct combined * restrict _l= (const struct combined *)l;
+  const struct combined * restrict _r= (const struct combined *)r;
  if( _l->expo > _r->expo ) return 1;
  if( _l->expo < _r->expo ) return -1;
  return 0;
@ -2756,11 +2759,10 @@ qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context)
      const int64_t iprim_start = ctx->ao_basis.shell_prim_index[ishell];
      const int64_t iprim_end   = ctx->ao_basis.shell_prim_index[ishell] + ctx->ao_basis.shell_prim_num[ishell];
      for (int64_t iprim = iprim_start ; iprim < iprim_end ; ++iprim) {
-//        newcoef[idx] = ctx->ao_basis.coefficient_normalized[iprim];
-        newcoef[idx] = ctx->ao_basis.coefficient[iprim];
+        newcoef[idx] = ctx->ao_basis.coefficient_normalized[iprim];
        idx += 1;
      }
-      for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+      for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
        idx2 = expo[i].index;
        coef[ishell - ishell_start][i] = newcoef[idx2];
      }
@ -2773,7 +2775,7 @@ qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context)
    int64_t idxmax = 0;
    idx = 0;
    newidx[0] = 0;
-    for (int64_t i=1 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+    for (int32_t i=1 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
      if (expo[i].expo != expo[i-1].expo) {
        idx += 1;
      }
@ -2781,33 +2783,33 @@ qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context)
    }
    idxmax = idx;

-    for (int64_t j=0 ; j<ishell_end-ishell_start ; ++j) {
+    for (int32_t j=0 ; j<ishell_end-ishell_start ; ++j) {
      memset(newcoef, 0, sizeof(newcoef));
-      for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+      for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
        newcoef[newidx[i]] += coef[j][i];
      }
-      for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+      for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
        coef[j][i] = newcoef[i];
      }
    }

-    for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+    for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
        expo[newidx[i]].expo = expo[i].expo;
    }
    ctx->ao_basis.prim_num_per_nucleus[inucl] = idxmax+1;

-    for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+    for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
      qmckl_mat( ctx->ao_basis.expo_per_nucleus, i, inucl ) = expo[i].expo;
    }

-    for (int64_t j=0 ; j<ishell_end-ishell_start ; ++j) {
-      for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+    for (int32_t j=0 ; j<ishell_end-ishell_start ; ++j) {
+      for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
        qmckl_ten3( ctx->ao_basis.coef_per_nucleus, i, j, inucl ) = coef[j][i];
      }
    }

 /*
-    for (int64_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
+    for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus[inucl] ; ++i) {
      printf("%4ld %4ld %15.5e | ", inucl, i, qmckl_mat( ctx->ao_basis.expo_per_nucleus, i, inucl ));
      for (int64_t j=0 ; j<ishell_end-ishell_start ; ++j) {
        printf("%8.5f ", qmckl_ten3( ctx->ao_basis.coef_per_nucleus, i, j, inucl ));
@ -4789,13 +4791,13 @@ end function qmckl_ao_polynomial_transp_vgl_doc_f
    #+begin_src c :tangle (eval c) :comments org
 qmckl_exit_code
 qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context,
-                                    const double* X,
-                                    const double* R,
+                                    const double* restrict X,
+                                    const double* restrict R,
                                    const int32_t lmax,
-                                    int64_t* n,
-                                    int32_t* const L,
+                                    int64_t* restrict n,
+                                    int32_t* restrict const L,
                                    const int64_t ldl,
-                                    double* const VGL,
+                                    double* restrict const VGL,
                                    const int64_t ldv )
 {
  const qmckl_context_struct* ctx = (qmckl_context_struct* const) context;
@ -4831,7 +4833,7 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context,

      } else {

-        int32_t* const l[4] = {L, L+ldl, L+(ldl<<1), L+ldl+(ldl<<1)};
+        int32_t* restrict const l[4] = {L, L+ldl, L+(ldl<<1), L+ldl+(ldl<<1)};
        l[0][0] = 0; l[0][1] = 0; l[0][2] = 0;
        l[1][0] = 1; l[1][1] = 0; l[1][2] = 0;
        l[2][0] = 0; l[2][1] = 1; l[2][2] = 0;
@ -4852,11 +4854,11 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context,

      } else {

-        double* const vgl1 = VGL;
-        double* const vgl2 = VGL + ldv;
-        double* const vgl3 = VGL + (ldv << 1);
-        double* const vgl4 = VGL + ldv + (ldv << 1);
-        double* const vgl5 = VGL + (ldv << 2);
+        double* restrict const vgl1 = VGL;
+        double* restrict const vgl2 = VGL + ldv;
+        double* restrict const vgl3 = VGL + (ldv << 1);
+        double* restrict const vgl4 = VGL + ldv + (ldv << 1);
+        double* restrict const vgl5 = VGL + (ldv << 2);

        for (int32_t k=0 ; k<4 ; ++k) {
          vgl2[k] = 0.0;
@ -4886,7 +4888,7 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context,
          L[i] = ltmp[i];

      } else {
-        int32_t* l[10];
+        int32_t* restrict l[10];
        for (int32_t i=0 ; i<10 ; ++i) {
          l[i] = L + i*ldl;
        }
@ -4922,11 +4924,11 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context,
         for (int i=0 ; i<50 ; ++i)
           VGL[i] = tmp[i];
      } else {
-        double* const vgl1 = VGL;
-        double* const vgl2 = VGL + ldv;
-        double* const vgl3 = VGL + (ldv << 1);
-        double* const vgl4 = VGL + 3*ldv;
-        double* const vgl5 = VGL + (ldv << 2);
+        double* restrict const vgl1 = VGL;
+        double* restrict const vgl2 = VGL + ldv;
+        double* restrict const vgl3 = VGL + (ldv << 1);
+        double* restrict const vgl4 = VGL + 3*ldv;
+        double* restrict const vgl5 = VGL + (ldv << 2);

        vgl1[0] = 1.0      ; vgl1[1] = Y[0]     ; vgl1[2] = Y[1];
        vgl1[3] = Y[2]     ; vgl1[4] = Y[0]*Y[0]; vgl1[5] = Y[0]*Y[1];
@ -4961,11 +4963,11 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context,
      const int32_t size_max = (lmax+1)*(lmax+2)*(lmax+3)/6;
      if (ldv < size_max) return QMCKL_INVALID_ARG_9;

-      double* const vgl1 = VGL;
-      double* const vgl2 = VGL + ldv;
-      double* const vgl3 = VGL + (ldv<<1);
-      double* const vgl4 = VGL + ldv + (ldv<<1);
-      double* const vgl5 = VGL + (ldv<<2);
+      double* restrict const vgl1 = VGL;
+      double* restrict const vgl2 = VGL + ldv;
+      double* restrict const vgl3 = VGL + (ldv<<1);
+      double* restrict const vgl4 = VGL + ldv + (ldv<<1);
+      double* restrict const vgl5 = VGL + (ldv<<2);

      const double Y[3] = { X[0]-R[0],
                            X[1]-R[1],
@ -5464,6 +5466,7 @@ end function qmckl_compute_ao_vgl_doc_f
   | ~coef_normalized~     | ~double[prim_num]~             | in     | Value, gradients and Laplacian of the shells |
   | ~ao_vgl~              | ~double[point_num][5][ao_num]~ | out    | Value, gradients and Laplacian of the AOs    |

+
    #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
 #ifdef HAVE_HPC
 qmckl_exit_code
@ -5471,22 +5474,20 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                   const qmckl_context context,
                                   const int64_t ao_num,
                                   const int64_t shell_num,
-          const int64_t prim_num,
+                                   const int32_t* restrict prim_num_per_nucleus,
                                   const int64_t point_num,
                                   const int64_t nucl_num,
-          const double* coord,
-          const double* nucl_coord,
-          const int64_t* nucleus_index,
-          const int64_t* nucleus_shell_num,
+                                   const double* restrict coord,
+                                   const double* restrict nucl_coord,
+                                   const int64_t* restrict nucleus_index,
+                                   const int64_t* restrict nucleus_shell_num,
                                   const double* nucleus_range,
-          const int32_t* nucleus_max_ang_mom,
-          const int32_t* shell_ang_mom,
-          const int64_t* shell_prim_index,
-          const int64_t* shell_prim_num,
-          const double* ao_factor,
-          const double* expo,
-          const double* coef_normalized,
-          double* const ao_vgl )
+                                   const int32_t* restrict nucleus_max_ang_mom,
+                                   const int32_t* restrict shell_ang_mom,
+                                   const double* restrict ao_factor,
+                                   const qmckl_matrix expo_per_nucleus,
+                                   const qmckl_tensor coef_per_nucleus,
+                                   double* restrict const ao_vgl )
 {
  int32_t lstart[32];
  for (int32_t l=0 ; l<32 ; ++l) {
@ -5495,9 +5496,15 @@ qmckl_compute_ao_vgl_hpc_gaussian (

  int64_t ao_index[shell_num+1];
  int64_t size_max = 0;
+  int64_t prim_max = 0;
+  int64_t shell_max = 0;
  {
    int64_t k=0;
    for (int inucl=0 ; inucl < nucl_num ; ++inucl) {
+      prim_max = prim_num_per_nucleus[inucl] > prim_max ?
+        prim_num_per_nucleus[inucl] : prim_max;
+      shell_max = nucleus_shell_num[inucl] > shell_max ?
+        nucleus_shell_num[inucl] : shell_max;
      const int64_t ishell_start = nucleus_index[inucl];
      const int64_t ishell_end   = nucleus_index[inucl] + nucleus_shell_num[inucl];
      for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
@ -5518,10 +5525,8 @@ qmckl_compute_ao_vgl_hpc_gaussian (
 #endif
  {
    qmckl_exit_code rc;
-    double c_[prim_num];
-    double expo_[prim_num];
-    double ar2[prim_num];
-    int32_t powers[prim_num];
+    double ar2[prim_max];
+    int32_t powers[prim_max];
    double  poly_vgl_l1[4][4] = {{1.0, 0.0, 0.0, 0.0},
                                 {0.0, 1.0, 0.0, 0.0},
                                 {0.0, 0.0, 1.0, 0.0},
@ -5533,6 +5538,18 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                  {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
    double  poly_vgl[5][size_max];

+    double exp_mat[prim_max][8];
+    double ce_mat[shell_max][8];
+
+    double coef_mat[nucl_num][shell_max][prim_max];
+    for (int i=0 ; i<nucl_num ; ++i) {
+      for (int j=0 ; j<shell_max; ++j) {
+        for (int k=0 ; k<prim_max; ++k) {
+          coef_mat[i][j][k] = qmckl_ten3(coef_per_nucleus,k, j, i);
+        }
+      }
+    }
+
 #ifdef HAVE_OPENMP
 #pragma omp for
 #endif
@ -5600,72 +5617,75 @@ qmckl_compute_ao_vgl_hpc_gaussian (
        }

 /* Compute all exponents */
-          /*
-          for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus ; ++i) {
-            const double v = expo[iprim]*r2;
-            if (v > cutoff) {
-              nidx = i;
+
+        int64_t nidx = 0;
+        for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) {
+          const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2;
+          if (v <= cutoff) {
+            ar2[iprim] = v;
+            ++nidx;
+          } else {
            break;
          }
-            ar2[i] = -v;
        }

-          for (int32_t i=0 ; i<ctx->ao_basis.prim_num_per_nucleus ; ++i) {
+        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+          exp_mat[iprim][0] = exp(-ar2[iprim]);
+        }
+        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
+          double f = qmckl_mat(expo_per_nucleus, iprim, inucl) * exp_mat[iprim][0];
+          f = -f-f;
+          exp_mat[iprim][1] = f * x;
+          exp_mat[iprim][2] = f * y;
+          exp_mat[iprim][3] = f * z;
+          exp_mat[iprim][4] = f * (3.0 - 2.0 * ar2[iprim]);
+        }
+
+        for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
+          for (int j=0 ; j<5 ; ++j) {
+            ce_mat[i][j] = 0.;
+          }
+        }
+        for (int k=0 ; k<nidx; ++k) {
+          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
+            if (coef_mat[inucl][i][k] != 0.) {
+              for (int j=0 ; j<5 ; ++j) {
+                ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j];
+              }
+            }
+          }
        }
-          */

        const int64_t ishell_start = nucleus_index[inucl];
        const int64_t ishell_end   = nucleus_index[inucl] + nucleus_shell_num[inucl];
+
        for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {

-             int64_t nidx = 0;
-             const int64_t iprim_start = shell_prim_index[ishell];
-             const int64_t iprim_end   = shell_prim_index[ishell] + shell_prim_num[ishell];
-             for (int64_t iprim = iprim_start ; iprim < iprim_end ; ++iprim) {
-                const double v = expo[iprim]*r2;
-                if (v <= cutoff) {
-                   ar2[nidx] = v;
-                   c_[nidx] = coef_normalized[iprim];
-                   expo_[nidx] = expo[iprim];
-                   ++nidx;
-                }
-             }
-
-             double s1_ = 0.;
-             double s5_ = 0.;
-             double s6_ = 0.;
-             for (int idx=0 ; idx<nidx ; ++idx) {
-                const double v = c_[idx] * exp(-ar2[idx]);
-                const double t = v*expo_[idx];
-                s1_ += v;
-                s6_ -= t;
-                s5_ += t*(4.0*ar2[idx]-6.0);
-             }
-             s6_ += s6_;
-             const double s1 = s1_;
-             const double s2 = s6_*x;
-             const double s3 = s6_*y;
-             const double s4 = s6_*z;
-             const double s5 = s5_;
+          const double s1 = ce_mat[ishell-ishell_start][0];
+          if (s1 == 0.0) continue;
+          const double s2 = ce_mat[ishell-ishell_start][1];
+          const double s3 = ce_mat[ishell-ishell_start][2];
+          const double s4 = ce_mat[ishell-ishell_start][3];
+          const double s5 = ce_mat[ishell-ishell_start][4];

          const int64_t k = ao_index[ishell];
-             double* __restrict__ const ao_vgl_1 = ao_vgl + ipoint*5*ao_num + k;
+          double* restrict const ao_vgl_1 = ao_vgl + ipoint*5*ao_num + k;

          const int32_t l = shell_ang_mom[ishell];
          const int32_t n = lstart[l+1]-lstart[l];

-             double* __restrict__ const ao_vgl_2 = ao_vgl_1 + ao_num;
-             double* __restrict__ const ao_vgl_3 = ao_vgl_1 + (ao_num<<1);
-             double* __restrict__ const ao_vgl_4 = ao_vgl_1 + (ao_num<<1) + ao_num;
-             double* __restrict__ const ao_vgl_5 = ao_vgl_1 + (ao_num<<2);
+          double* restrict const ao_vgl_2 = ao_vgl_1 + ao_num;
+          double* restrict const ao_vgl_3 = ao_vgl_1 + (ao_num<<1);
+          double* restrict const ao_vgl_4 = ao_vgl_1 + (ao_num<<1) + ao_num;
+          double* restrict const ao_vgl_5 = ao_vgl_1 + (ao_num<<2);

-             double* __restrict__ poly_vgl_1;
-             double* __restrict__ poly_vgl_2;
-             double* __restrict__ poly_vgl_3;
-             double* __restrict__ poly_vgl_4;
-             double* __restrict__ poly_vgl_5;
+          double* restrict poly_vgl_1 = NULL;
+          double* restrict poly_vgl_2 = NULL;
+          double* restrict poly_vgl_3 = NULL;
+          double* restrict poly_vgl_4 = NULL;
+          double* restrict poly_vgl_5 = NULL;
          if (nidx > 0) {
-                 const double* __restrict__ f = ao_factor + k;
+            const double* restrict f = ao_factor + k;
            const int64_t idx = lstart[l];

            switch (nucleus_max_ang_mom[inucl]) {
@ -5682,6 +5702,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
              poly_vgl_2 = &(poly_vgl_l2[1][idx]);
              poly_vgl_3 = &(poly_vgl_l2[2][idx]);
              poly_vgl_4 = &(poly_vgl_l2[3][idx]);
+              poly_vgl_5 = &(poly_vgl_l2[4][idx]);
              break;
            default:
              poly_vgl_1 = &(poly_vgl[0][idx]);
@ -5713,16 +5734,16 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                     poly_vgl_4[il] * s4 )) * f[il];
              }
              break;
-                 case(5):
+            case(6):
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
-                     for (int il=0 ; il<5 ; ++il) {
+              for (int il=0 ; il<6 ; ++il) {
                ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
                ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
                ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
                ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
-                       ao_vgl_5[il] = (poly_vgl_1[il] * s5 +
+                ao_vgl_5[il] = (poly_vgl_5[il] * s1 + poly_vgl_1[il] * s5 +
                                2.0*(poly_vgl_2[il] * s2 +
                                     poly_vgl_3[il] * s3 +
                                     poly_vgl_4[il] * s4 )) * f[il];
@ -5754,10 +5775,10 @@ qmckl_compute_ao_vgl_hpc_gaussian (
            }
          }
        }
+      }
+    }
+  }

-       }
-    }
-  }
  return QMCKL_SUCCESS;
 }
 #endif
@ -5792,7 +5813,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
          const qmckl_context context,
          const int64_t ao_num,
          const int64_t shell_num,
-          const int64_t prim_num,
+          const int32_t* prim_num_per_nucleus,
          const int64_t point_num,
          const int64_t nucl_num,
          const double* coord,
@ -5802,11 +5823,9 @@ qmckl_compute_ao_vgl_hpc_gaussian (
          const double* nucleus_range,
          const int32_t* nucleus_max_ang_mom,
          const int32_t* shell_ang_mom,
-          const int64_t* shell_prim_index,
-          const int64_t* shell_prim_num,
          const double* ao_factor,
-          const double* expo,
-          const double* coef_normalized,
+          const qmckl_matrix expo_per_nucleus,
+          const qmckl_tensor coef_per_nucleus,
          double* const ao_vgl );
 #endif
    #+end_src
@ -5934,7 +5953,7 @@ qmckl_exit_code qmckl_provide_ao_vgl(qmckl_context context)
      rc = qmckl_compute_ao_vgl_hpc_gaussian(context,
                                  ctx->ao_basis.ao_num,
                                  ctx->ao_basis.shell_num,
-                                  ctx->ao_basis.prim_num,
+                                  ctx->ao_basis.prim_num_per_nucleus,
                                  ctx->point.num,
                                  ctx->nucleus.num,
                                  ctx->point.coord.data,
@ -5944,11 +5963,9 @@ qmckl_exit_code qmckl_provide_ao_vgl(qmckl_context context)
                                  ctx->ao_basis.nucleus_range,
                                  ctx->ao_basis.nucleus_max_ang_mom,
                                  ctx->ao_basis.shell_ang_mom,
-                                  ctx->ao_basis.shell_prim_index,
-                                  ctx->ao_basis.shell_prim_num,
                                  ctx->ao_basis.ao_factor,
-                                  ctx->ao_basis.exponent,
-                                  ctx->ao_basis.coefficient_normalized,
+                                  ctx->ao_basis.expo_per_nucleus,
+                                  ctx->ao_basis.coef_per_nucleus,
                                  ctx->ao_basis.ao_vgl);
      /*
    } else if (ctx->ao_basis.type == 'S') {
@ -6023,77 +6040,74 @@ qmckl_exit_code qmckl_provide_ao_vgl(qmckl_context context)
 import numpy as np
 from math import sqrt

+h0 = 1.e-4
 def f(a,x,y):
    return np.sum( [c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] )

+def fx(a,x,y):
+    return f(a,x,y) * (x[0] - y[0])**2
+
+
 def df(a,x,y,n):
-    h0 = 1.e-6
     if   n == 1: h = np.array([h0,0.,0.])
     elif n == 2: h = np.array([0.,h0,0.])
     elif n == 3: h = np.array([0.,0.,h0])
-    return ( f(a,x+h,y) - f(a,x-h,y) ) / (2.*h0)
+     return ( fx(a,x+h,y) - fx(a,x-h,y) ) / (2.*h0)
+#   return np.sum( [-2.*b * c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) * (x-y)[n-1]

 def d2f(a,x,y,n):
-    h0 = 1.e-6
    if   n == 1: h = np.array([h0,0.,0.])
    elif n == 2: h = np.array([0.,h0,0.])
    elif n == 3: h = np.array([0.,0.,h0])
-    return ( f(a,x+h,y) - 2.*f(a,x,y) + f(a,x-h,y) ) / h0**2
+    return ( fx(a,x+h,y) - 2.*fx(a,x,y) + fx(a,x-h,y) ) / h0**2
+#   return np.sum( [( (2.*b*(x-y)[n-1])**2 -2.*b ) * c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) 

 def lf(a,x,y):
+#   return np.sum( [( (2.*b*np.linalg.norm(x-y))**2 -6.*b ) * c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) 
    return d2f(a,x,y,1) + d2f(a,x,y,2) + d2f(a,x,y,3)

+
 elec_26_w1 = np.array( [  1.49050402641, 2.90106987953, -1.05920815468  ] )
 elec_15_w2 = np.array( [  -2.20180344582,-1.9113150239,  2.2193744778600002 ] )
 nucl_1    = np.array( [ -2.302574592081335e+00, -3.542027060505035e-01, -5.334129934317614e-02] )

 #double ao_vgl[prim_num][5][elec_num];
 x = elec_26_w1 ; y = nucl_1
-a = [( 403.830000, 0.001473 * 5.9876577632594533e+04),
-     ( 121.170000, 0.012672 * 7.2836806319891484e+03),
-     (  46.345000, 0.058045 * 1.3549226646722386e+03),
-     (  19.721000, 0.170510 * 3.0376315094739988e+02),
-     (   8.862400, 0.318596 * 7.4924579607137730e+01),
-     (   3.996200, 0.384502 * 1.8590543353806009e+01),
-     (   1.763600, 0.273774 * 4.4423176930919421e+00),
-     (   0.706190, 0.074397 * 8.9541051939952665e-01)]
+a = [( 4.0382999999999998e+02, 1.4732000000000000e-03 * 5.9876577632594533e+04),
+     ( 1.2117000000000000e+02, 1.2672500000000000e-02 * 7.2836806319891484e+03),
+     ( 4.6344999999999999e+01, 5.8045100000000002e-02 * 1.3549226646722386e+03),
+     ( 1.9721000000000000e+01, 1.7051030000000000e-01 * 3.0376315094739988e+02),
+     ( 8.8623999999999992e+00, 3.1859579999999998e-01 * 7.4924579607137730e+01),
+     ( 3.9962000000000000e+00, 3.8450230000000002e-01 * 1.8590543353806009e+01),
+     ( 1.7636000000000001e+00, 2.7377370000000001e-01 * 4.4423176930919421e+00),
+     ( 7.0618999999999998e-01, 7.4396699999999996e-02 * 8.9541051939952665e-01)]

 norm = sqrt(3.)
-print ( "[0][26][219] : %25.15e"%(f(a,x,y) * (x[0] - y[0])**2) )
-print ( "[1][26][219] : %25.15e"%(df(a,x,y,1)* (x[0] - y[0]) * (x[1] - y[1]) + 2.*f(a,x,y) * (x[0] - y[0])) )
+# x^2 * g(r)
+print ( "[26][0][219] : %25.15e"%(fx(a,x,y)) )
+print ( "[26][1][219] : %25.15e"%(df(a,x,y,1)) )
+print ( "[26][2][219] : %25.15e"%(df(a,x,y,2)) )
+print ( "[26][3][219] : %25.15e"%(df(a,x,y,3)) )
+print ( "[26][4][219] : %25.15e"%(lf(a,x,y)) )

-print ( "[0][26][220] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[1] - y[1]) ))
-print ( "[1][26][220] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[1] - y[1]) + norm*f(a,x,y) * (x[1] - y[1])) )
+print ( "[26][0][220] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[1] - y[1]) ))
+print ( "[26][1][220] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[1] - y[1]) + norm*f(a,x,y) * (x[1] - y[1])) )

-print ( "[0][26][221] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[2] - y[2])) )
-print ( "[1][26][221] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[2] - y[2]) + norm*f(a,x,y) * (x[2] - y[2])) )
+print ( "[26][0][221] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[2] - y[2])) )
+print ( "[26][1][221] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[2] - y[2]) + norm*f(a,x,y) * (x[2] - y[2])) )

-print ( "[0][26][222] : %25.15e"%(f(a,x,y) * (x[1] - y[1]) * (x[1] - y[1])) )
-print ( "[1][26][222] : %25.15e"%(df(a,x,y,1)* (x[1] - y[1]) * (x[1] - y[1])) )
+print ( "[26][0][222] : %25.15e"%(f(a,x,y) * (x[1] - y[1]) * (x[1] - y[1])) )
+print ( "[26][1][222] : %25.15e"%(df(a,x,y,1)* (x[1] - y[1]) * (x[1] - y[1])) )

-print ( "[0][26][223] : %25.15e"%(norm*f(a,x,y) * (x[1] - y[1]) * (x[2] - y[2])) )
-print ( "[1][26][223] : %25.15e"%(norm*df(a,x,y,1)* (x[1] - y[1]) * (x[2] - y[2])) )
+print ( "[26][0][223] : %25.15e"%(norm*f(a,x,y) * (x[1] - y[1]) * (x[2] - y[2])) )
+print ( "[26][1][223] : %25.15e"%(norm*df(a,x,y,1)* (x[1] - y[1]) * (x[2] - y[2])) )

-print ( "[0][26][224] : %25.15e"%(f(a,x,y) * (x[2] - y[2]) * (x[2] - y[2])) )
-print ( "[1][26][224] : %25.15e"%(df(a,x,y,1)* (x[2] - y[2]) * (x[2] - y[2])) )
+print ( "[26][0][224] : %25.15e"%(f(a,x,y) * (x[2] - y[2]) * (x[2] - y[2])) )
+print ( "[26][1][224] : %25.15e"%(df(a,x,y,1)* (x[2] - y[2]) * (x[2] - y[2])) )

    #+end_src

    #+RESULTS:
-    #+begin_example
-    [0][26][219] :     1.020302912653649e-08
-    [1][26][219] :    -4.153046808203204e-08
-    [0][26][220] :     1.516649653540510e-08
-    [1][26][220] :    -7.725252615816528e-08
-    [0][26][221] :    -4.686389780112468e-09
-    [1][26][221] :     2.387073693851122e-08
-    [0][26][222] :     7.514847283937212e-09
-    [1][26][222] :    -4.025905373647693e-08
-    [0][26][223] :    -4.021924592380977e-09
-    [1][26][223] :     2.154652944642284e-08
-    [0][26][224] :     7.175074806631758e-10
-    [1][26][224] :    -3.843880138733679e-09
-    #+end_example

     #+begin_src c :tangle (eval c_test) :exports none
 {
@ -6125,32 +6139,69 @@ rc = qmckl_get_ao_basis_ao_vgl(context, &(ao_vgl[0][0][0]),
 assert (rc == QMCKL_SUCCESS);

 printf("\n");
-printf(" ao_vgl ao_vgl[0][26][219] %25.15e\n", ao_vgl[26][0][219]);
-printf(" ao_vgl ao_vgl[1][26][219] %25.15e\n", ao_vgl[26][1][219]);
-printf(" ao_vgl ao_vgl[0][26][220] %25.15e\n", ao_vgl[26][0][220]);
-printf(" ao_vgl ao_vgl[1][26][220] %25.15e\n", ao_vgl[26][1][220]);
-printf(" ao_vgl ao_vgl[0][26][221] %25.15e\n", ao_vgl[26][0][221]);
-printf(" ao_vgl ao_vgl[1][26][221] %25.15e\n", ao_vgl[26][1][221]);
-printf(" ao_vgl ao_vgl[0][26][222] %25.15e\n", ao_vgl[26][0][222]);
-printf(" ao_vgl ao_vgl[1][26][222] %25.15e\n", ao_vgl[26][1][222]);
-printf(" ao_vgl ao_vgl[0][26][223] %25.15e\n", ao_vgl[26][0][223]);
-printf(" ao_vgl ao_vgl[1][26][223] %25.15e\n", ao_vgl[26][1][223]);
-printf(" ao_vgl ao_vgl[0][26][224] %25.15e\n", ao_vgl[26][0][224]);
-printf(" ao_vgl ao_vgl[1][26][224] %25.15e\n", ao_vgl[26][1][224]);
+printf(" ao_vgl ao_vgl[26][0][219] %25.15e\n", ao_vgl[26][0][219]);
+printf(" ao_vgl ao_vgl[26][1][219] %25.15e\n", ao_vgl[26][1][219]);
+printf(" ao_vgl ao_vgl[26][2][219] %25.15e\n", ao_vgl[26][2][219]);
+printf(" ao_vgl ao_vgl[26][3][219] %25.15e\n", ao_vgl[26][3][219]);
+printf(" ao_vgl ao_vgl[26][4][219] %25.15e\n", ao_vgl[26][4][219]);
+printf(" ao_vgl ao_vgl[26][0][220] %25.15e\n", ao_vgl[26][0][220]);
+printf(" ao_vgl ao_vgl[26][1][220] %25.15e\n", ao_vgl[26][1][220]);
+printf(" ao_vgl ao_vgl[26][2][220] %25.15e\n", ao_vgl[26][2][220]);
+printf(" ao_vgl ao_vgl[26][3][220] %25.15e\n", ao_vgl[26][3][220]);
+printf(" ao_vgl ao_vgl[26][4][220] %25.15e\n", ao_vgl[26][4][220]);
+printf(" ao_vgl ao_vgl[26][0][221] %25.15e\n", ao_vgl[26][0][221]);
+printf(" ao_vgl ao_vgl[26][1][221] %25.15e\n", ao_vgl[26][1][221]);
+printf(" ao_vgl ao_vgl[26][2][221] %25.15e\n", ao_vgl[26][2][221]);
+printf(" ao_vgl ao_vgl[26][3][221] %25.15e\n", ao_vgl[26][3][221]);
+printf(" ao_vgl ao_vgl[26][4][221] %25.15e\n", ao_vgl[26][4][221]);
+printf(" ao_vgl ao_vgl[26][0][222] %25.15e\n", ao_vgl[26][0][222]);
+printf(" ao_vgl ao_vgl[26][1][222] %25.15e\n", ao_vgl[26][1][222]);
+printf(" ao_vgl ao_vgl[26][2][222] %25.15e\n", ao_vgl[26][2][222]);
+printf(" ao_vgl ao_vgl[26][3][222] %25.15e\n", ao_vgl[26][3][222]);
+printf(" ao_vgl ao_vgl[26][4][222] %25.15e\n", ao_vgl[26][4][222]);
+printf(" ao_vgl ao_vgl[26][0][223] %25.15e\n", ao_vgl[26][0][223]);
+printf(" ao_vgl ao_vgl[26][1][223] %25.15e\n", ao_vgl[26][1][223]);
+printf(" ao_vgl ao_vgl[26][2][223] %25.15e\n", ao_vgl[26][2][223]);
+printf(" ao_vgl ao_vgl[26][3][223] %25.15e\n", ao_vgl[26][3][223]);
+printf(" ao_vgl ao_vgl[26][4][223] %25.15e\n", ao_vgl[26][4][223]);
+printf(" ao_vgl ao_vgl[26][0][224] %25.15e\n", ao_vgl[26][0][224]);
+printf(" ao_vgl ao_vgl[26][1][224] %25.15e\n", ao_vgl[26][1][224]);
+printf(" ao_vgl ao_vgl[26][2][224] %25.15e\n", ao_vgl[26][2][224]);
+printf(" ao_vgl ao_vgl[26][3][224] %25.15e\n", ao_vgl[26][3][224]);
+printf(" ao_vgl ao_vgl[26][4][224] %25.15e\n", ao_vgl[26][4][224]);
 printf("\n");

 assert( fabs(ao_vgl[26][0][219] - (  1.020298798341620e-08)) < 1.e-14 );
 assert( fabs(ao_vgl[26][1][219] - ( -4.928035238010602e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][2][219] - ( -4.691009312035986e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][3][219] - (  1.449504046436699e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][4][219] - (  4.296442111843973e-07)) < 1.e-14 );
 assert( fabs(ao_vgl[26][0][220] - (  1.516643537739178e-08)) < 1.e-14 );
 assert( fabs(ao_vgl[26][1][220] - ( -7.725221462603871e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][2][220] - ( -6.507140835104833e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][3][220] - (  2.154644255710413e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][4][220] - (  6.365449359656352e-07)) < 1.e-14 );
 assert( fabs(ao_vgl[26][0][221] - ( -4.686370882518819e-09)) < 1.e-14 );
 assert( fabs(ao_vgl[26][1][221] - (  2.387064067626827e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][2][221] - (  2.154644255710412e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][3][221] - ( -1.998731863512374e-09)) < 1.e-14 );
+assert( fabs(ao_vgl[26][4][221] - ( -1.966899656441993e-07)) < 1.e-14 );
 assert( fabs(ao_vgl[26][0][222] - (  7.514816980753531e-09)) < 1.e-14 );
 assert( fabs(ao_vgl[26][1][222] - ( -4.025889138635182e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][2][222] - ( -2.993372555126361e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][3][222] - (  1.067604670272904e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][4][222] - (  3.168199650002648e-07)) < 1.e-14 );
 assert( fabs(ao_vgl[26][0][223] - ( -4.021908374204471e-09)) < 1.e-14 );
 assert( fabs(ao_vgl[26][1][223] - (  2.154644255710413e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][2][223] - (  1.725594944732276e-08)) < 1.e-14 );
+assert( fabs(ao_vgl[26][3][223] - ( -1.715339357718333e-09)) < 1.e-14 );
+assert( fabs(ao_vgl[26][4][223] - ( -1.688020516893476e-07)) < 1.e-14 );
 assert( fabs(ao_vgl[26][0][224] - (  7.175045873560788e-10)) < 1.e-14 );
 assert( fabs(ao_vgl[26][1][224] - ( -3.843864637762753e-09)) < 1.e-14 );
+assert( fabs(ao_vgl[26][2][224] - ( -3.298857850451910e-09)) < 1.e-14 );
+assert( fabs(ao_vgl[26][3][224] - ( -4.073047518790881e-10)) < 1.e-14 );
+assert( fabs(ao_vgl[26][4][224] - (  3.153244195820293e-08)) < 1.e-14 );
+
 }

     #+end_src
@ -6204,6 +6255,5 @@ assert( fabs(ao_vgl[26][1][224] - (-3.843864637762753e-09)) < 1.e-14 );
 # -*- mode: org -*-
 # vim: syntax=c

-
 * TODO [0/1] Missing features                                      :noexport:
  - [ ] Error messages to tell what is missing when initializing
--- a/org/qmckl_blas.org
+++ b/org/qmckl_blas.org
@ -85,7 +85,7 @@ are not intended to be passed to external codes.
   #+begin_src c :comments org :tangle (eval h_private_type) :exports none
 typedef struct qmckl_vector {
  int64_t size;
-  double* data;
+  double* restrict data;
 } qmckl_vector;
   #+end_src

@ -161,7 +161,7 @@ qmckl_vector_free( qmckl_context context,
   #+begin_src c :comments org :tangle (eval h_private_type) :exports none
 typedef struct qmckl_matrix {
  int64_t size[2];
-  double* data;
+  double* restrict data;
 } qmckl_matrix;
   #+end_src

@ -247,7 +247,7 @@ qmckl_matrix_free( qmckl_context context,
 typedef struct qmckl_tensor {
  int64_t order;
  int64_t size[QMCKL_TENSOR_ORDER_MAX];
-  double* data;
+  double* restrict data;
 } qmckl_tensor;
   #+end_src

--- a/tools/theme.setup
+++ b/tools/theme.setup
@ -6,7 +6,9 @@
 #+INFOJS_OPT: toc:t mouse:underline path:org-info.js
 #+HTML_HEAD: <link rel="stylesheet" title="Standard" href="qmckl.css" type="text/css" />

-#+STARTUP:    align fold nodlcheck hidestars oddeven lognotestate latexpreview
+# STARTUP: align fold nodlcheck hidestars oddeven lognotestate latexpreview
+#+STARTUP: showeverything
+
 #+AUTHOR:     TREX CoE
 #+LANGUAGE:   en