Fixed bug in ao_value

2024-12-22 20:36:01 +01:00 · 2022-08-14 16:09:58 +02:00 · 2022-08-14 16:09:58 +02:00 · 5512c1b41c
commit 5512c1b41c
parent 725e488199
2 changed files with 116 additions and 130 deletions
--- a/org/qmckl_ao.org
+++ b/org/qmckl_ao.org
@ -5648,33 +5648,14 @@ end function qmckl_compute_ao_value_doc_f
    | ~coef_normalized~     | ~double[prim_num]~          | in     | Value, gradients and Laplacian of the shells |
    | ~ao_value~            | ~double[point_num][ao_num]~ | out    | Values of the AOs                            |

-     #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
-#ifdef HAVE_HPC
-qmckl_exit_code
-qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
-                                     const int64_t ao_num,
-                                     const int64_t shell_num,
-                                     const int32_t* restrict prim_num_per_nucleus,
-                                     const int64_t point_num,
-                                     const int64_t nucl_num,
-                                     const double* restrict coord,
-                                     const double* restrict nucl_coord,
-                                     const int64_t* restrict nucleus_index,
-                                     const int64_t* restrict nucleus_shell_num,
-                                     const double* nucleus_range,
-                                     const int32_t* restrict nucleus_max_ang_mom,
-                                     const int32_t* restrict shell_ang_mom,
-                                     const double* restrict ao_factor,
-                                     const qmckl_matrix expo_per_nucleus,
-                                     const qmckl_tensor coef_per_nucleus,
-                                     double* restrict const ao_value )
-{
-  int32_t lstart[32];
+     #+NAME:ao_value_constants
+     #+begin_src c :exports none
+  int32_t lstart[32] __attribute__((aligned(64)));
  for (int32_t l=0 ; l<32 ; ++l) {
    lstart[l] = l*(l+1)*(l+2)/6;
  }

-  int64_t ao_index[shell_num+1];
+  int64_t ao_index[shell_num+1] __attribute__((aligned(64)));
  int64_t size_max = 0;
  int64_t prim_max = 0;
  int64_t shell_max = 0;
@ -5700,17 +5681,42 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
 /* Don't compute polynomials when the radial part is zero. */
  double cutoff = 27.631021115928547; // -log(1.e-12)

+     #+end_src
+
+     #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
+#ifdef HAVE_HPC
+qmckl_exit_code
+qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
+                                     const int64_t ao_num,
+                                     const int64_t shell_num,
+                                     const int32_t* restrict prim_num_per_nucleus,
+                                     const int64_t point_num,
+                                     const int64_t nucl_num,
+                                     const double* restrict coord,
+                                     const double* restrict nucl_coord,
+                                     const int64_t* restrict nucleus_index,
+                                     const int64_t* restrict nucleus_shell_num,
+                                     const double* nucleus_range,
+                                     const int32_t* restrict nucleus_max_ang_mom,
+                                     const int32_t* restrict shell_ang_mom,
+                                     const double* restrict ao_factor,
+                                     const qmckl_matrix expo_per_nucleus,
+                                     const qmckl_tensor coef_per_nucleus,
+                                     double* restrict const ao_value )
+{
+  <<ao_value_constants>>
+
 #ifdef HAVE_OPENMP
 #pragma omp parallel
 #endif
  {
    qmckl_exit_code rc;
-    double ar2[prim_max];
-    int32_t powers[3*size_max];
-    double  poly_vgl[5*size_max];
+    double ar2[prim_max] __attribute__((aligned(64)));
+    int32_t powers[3*size_max] __attribute__((aligned(64)));
+    double  poly_vgl[5*size_max] __attribute__((aligned(64)));

-    double exp_mat[prim_max];
-    double ce_mat[shell_max];
+    double exp_mat[prim_max] __attribute__((aligned(64)));
+    double ce_mat[shell_max] __attribute__((aligned(64)));

    double coef_mat[nucl_num][shell_max][prim_max];
    for (int i=0 ; i<nucl_num ; ++i) {
@ -5725,12 +5731,14 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
 #pragma omp for
 #endif
    for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) {
-      const double e_coord[3] = { coord[ipoint],
+      const double e_coord[3]  __attribute__((aligned(64))) =
+        { coord[ipoint],
          coord[ipoint + point_num],
          coord[ipoint + 2*point_num] };
      
      for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) {
-        const double n_coord[3] = { nucl_coord[inucl],
+        const double n_coord[3] __attribute__((aligned(64))) =
+          { nucl_coord[inucl],
            nucl_coord[inucl + nucl_num],
            nucl_coord[inucl + 2*nucl_num] };
        
@ -5751,14 +5759,14 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
          break;

        case 1:
-          poly_vgl[0] = 0.;
+          poly_vgl[0] = 1.;
          poly_vgl[1] = x;
          poly_vgl[2] = y;
          poly_vgl[3] = z;
          break;

        case 2:
-          poly_vgl[0] = 0.;
+          poly_vgl[0] = 1.;
          poly_vgl[1] = x;
          poly_vgl[2] = y;
          poly_vgl[3] = z;
@ -5798,9 +5806,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,

        for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
          ce_mat[i] = 0.;
-        }
          for (int k=0 ; k<nidx; ++k) {
-          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
            ce_mat[i] += coef_mat[inucl][i][k] * exp_mat[k];
          }
        }
@ -5811,7 +5817,6 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
        for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {

          const double s1 = ce_mat[ishell-ishell_start];
-          if (s1 == 0.0) continue;

          const int64_t k = ao_index[ishell];
          double* restrict const ao_value_1 = ao_value + ipoint*ao_num + k;
@ -5819,6 +5824,13 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
          const int32_t l = shell_ang_mom[ishell];
          const int32_t n = lstart[l+1]-lstart[l];

+          if (s1 == 0.0) {
+            for (int64_t il=0 ; il<n ; ++il) {
+              ao_value_1[il] = 0.;
+            }
+            continue;
+          }
+
          double* restrict poly_vgl_1 = NULL;
          if (nidx > 0) {
            const double* restrict f = ao_factor + k;
@ -5827,10 +5839,10 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
            poly_vgl_1 = &(poly_vgl[idx]);

            switch (n) {
-            case(1):
+            case 1:
              ao_value_1[0] = s1 * f[0];
              break;
-            case (3):
+            case 3:
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@ -5848,7 +5860,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
              break;
            default:
 #ifdef HAVE_OPENMP
-#pragma omp simd simdlen(8)
+#pragma omp simd
 #endif
              for (int il=0 ; il<n ; ++il) {
                ao_value_1[il] =  poly_vgl_1[il] * s1 * f[il];
@ -6470,36 +6482,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                   const qmckl_tensor coef_per_nucleus,
                                   double* restrict const ao_vgl )
 {
-  int32_t lstart[32];
-  for (int32_t l=0 ; l<32 ; ++l) {
-    lstart[l] = l*(l+1)*(l+2)/6;
-  }
-
-  int64_t ao_index[shell_num+1];
-  int64_t size_max = 0;
-  int64_t prim_max = 0;
-  int64_t shell_max = 0;
-  {
-    int64_t k=0;
-    for (int inucl=0 ; inucl < nucl_num ; ++inucl) {
-      prim_max = prim_num_per_nucleus[inucl] > prim_max ?
-        prim_num_per_nucleus[inucl] : prim_max;
-      shell_max = nucleus_shell_num[inucl] > shell_max ?
-        nucleus_shell_num[inucl] : shell_max;
-      const int64_t ishell_start = nucleus_index[inucl];
-      const int64_t ishell_end   = nucleus_index[inucl] + nucleus_shell_num[inucl];
-      for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
-        const int l = shell_ang_mom[ishell];
-        ao_index[ishell] = k;
-        k += lstart[l+1] - lstart[l];
-        size_max = size_max < lstart[l+1] ? lstart[l+1] : size_max;
-      }
-    }
-    ao_index[shell_num] = ao_num+1;
-  }
-
- /* Don't compute when the radial part is zero. */
-  double cutoff = 27.631021115928547; // -log(1.e-12)
+ <<ao_value_constants>>

 #ifdef HAVE_OPENMP
 #pragma omp parallel
@ -6508,18 +6491,6 @@ qmckl_compute_ao_vgl_hpc_gaussian (
    qmckl_exit_code rc;
    double ar2[prim_max]  __attribute__((aligned(64)));
    int32_t powers[3*size_max]  __attribute__((aligned(64)));
-    double  poly_vgl_l1[4][4] __attribute__((aligned(64))) =
-                                {{1.0, 0.0, 0.0, 0.0},
-                                 {0.0, 1.0, 0.0, 0.0},
-                                 {0.0, 0.0, 1.0, 0.0},
-                                 {0.0, 0.0, 0.0, 1.0}};
-    double  poly_vgl_l2[5][10]__attribute__((aligned(64))) =
-                                 {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
-    double  poly_vgl[5][size_max]  __attribute__((aligned(64)));

    double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
    double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
@ -6533,6 +6504,19 @@ qmckl_compute_ao_vgl_hpc_gaussian (
      }
    }

+    double  poly_vgl_l1[4][4] __attribute__((aligned(64))) =
+                                {{1.0, 0.0, 0.0, 0.0},
+                                 {0.0, 1.0, 0.0, 0.0},
+                                 {0.0, 0.0, 1.0, 0.0},
+                                 {0.0, 0.0, 0.0, 1.0}};
+    double  poly_vgl_l2[5][10]__attribute__((aligned(64))) =
+                                 {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
+                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
+                                  {0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
+                                  {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
+                                  {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
+    double  poly_vgl[5][size_max]  __attribute__((aligned(64)));
+
 #ifdef HAVE_OPENMP
 #pragma omp for
 #endif
@ -6616,6 +6600,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
          exp_mat[iprim][0] = exp(-ar2[iprim]);
        }
+
        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
          double f = qmckl_mat(expo_per_nucleus, iprim, inucl) * exp_mat[iprim][0];
          f = -f-f;
@ -6628,21 +6613,6 @@ qmckl_compute_ao_vgl_hpc_gaussian (

 /* --- */
        switch (8) {
-        case(5):
-
-          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
-            for (int j=0 ; j<5 ; ++j) {
-              ce_mat[i][j] = 0.;
-            }
-            for (int k=0 ; k<nidx; ++k) {
-              const double cm = coef_mat[inucl][i][k];
-              for (int j=0 ; j<5 ; ++j) {
-                ce_mat[i][j] += cm * exp_mat[k][j];
-              }
-            }
-          }
-          break;
-
        case(8):

          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
@ -6664,6 +6634,21 @@ qmckl_compute_ao_vgl_hpc_gaussian (
          }
          break;

+        case(5):
+
+          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
+            for (int j=0 ; j<5 ; ++j) {
+              ce_mat[i][j] = 0.;
+            }
+            for (int k=0 ; k<nidx; ++k) {
+              const double cm = coef_mat[inucl][i][k];
+              for (int j=0 ; j<5 ; ++j) {
+                ce_mat[i][j] += cm * exp_mat[k][j];
+              }
+            }
+          }
+          break;
+
        case(512):
          for(int i=0; i<nucleus_shell_num[inucl]; ++i){
            __m512d cemat_avx512;
@ -6778,14 +6763,14 @@ qmckl_compute_ao_vgl_hpc_gaussian (
              poly_vgl_5 = &(poly_vgl[4][idx]);
            }
            switch (n) {
-            case(1):
+            case 1:
              ao_vgl_1[0] = s1 * f[0];
              ao_vgl_2[0] = s2 * f[0];
              ao_vgl_3[0] = s3 * f[0];
              ao_vgl_4[0] = s4 * f[0];
              ao_vgl_5[0] = s5;
              break;
-            case (3):
+            case 3:
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@ -6800,7 +6785,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                     poly_vgl_4[il] * s4 )) * f[il];
              }
              break;
-            case(6):
+            case 6:
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
--- a/org/qmckl_mo.org
+++ b/org/qmckl_mo.org
@ -628,15 +628,8 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context)

 #+end_src

-    #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
-    rc = qmckl_provide_ao_basis_ao_value(context);
-    if (rc != QMCKL_SUCCESS) {
-      return qmckl_failwith( context,
-                             QMCKL_NOT_PROVIDED,
-                             "qmckl_ao_value",
-                             NULL);
-    }
    
+    #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
    if (ctx->mo_basis.mo_vgl_date == ctx->point.date) {

      // mo_vgl has been computed at this step: Just copy the data.
@ -653,6 +646,14 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context)

    } else {

+      rc = qmckl_provide_ao_basis_ao_value(context);
+      if (rc != QMCKL_SUCCESS) {
+        return qmckl_failwith( context,
+                               QMCKL_NOT_PROVIDED,
+                               "qmckl_ao_value",
+                               NULL);
+      }
+
      rc = qmckl_compute_mo_basis_mo_value(context,
                                           ctx->ao_basis.ao_num,
                                           ctx->mo_basis.mo_num,
@ -664,6 +665,8 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context)
    }
    #+end_src

+
+
 #+CALL: write_provider_post( group="mo_basis", data="mo_value" )

 #+RESULTS:
@ -727,8 +730,8 @@ integer function qmckl_compute_mo_basis_mo_value_doc_f(context, &
     do j=1,point_num
        mo_value(:,j) = 0.d0
        do k=1,ao_num
-           if (ao_value(k,j) /= 0.d0) then
           c1 = ao_value(k,j)
+           if (c1 /= 0.d0) then
              do i=1,mo_num
                 mo_value(i,j) = mo_value(i,j) + coefficient_t(i,k) * c1
              end do
@ -736,7 +739,7 @@ integer function qmckl_compute_mo_basis_mo_value_doc_f(context, &
        end do
     end do
     
-  else ! dgemm
+  else ! dgemm for checking

    LDA = size(coefficient_t,1)
    LDB = size(ao_value,1) 
@ -874,7 +877,7 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context,
      }
    }

-    int64_t n;
+    int64_t n=0;

    for (n=0 ; n < nidx-4 ; n+=4) {
      const double* restrict ck1 = coefficient_t + idx[n  ]*mo_num;
@ -895,8 +898,7 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context,
      }
    }

-    const int64_t n0 = n < 0 ? 0 : n;
-    for (int64_t m=n0 ; m < nidx ; m+=1) {
+    for (int64_t m=n ; m < nidx ; m+=1) {
      const double* restrict ck = coefficient_t + idx[m]*mo_num;
      const double a1 = av1[m];

@ -1355,7 +1357,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
      }
    }

-    int64_t n;
+    int64_t n=0;

    for (n=0 ; n < nidx-4 ; n+=4) {
      const double* restrict ck1 = coefficient_t + idx[n  ]*mo_num;
@ -1400,8 +1402,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
      }
    }

-    const int64_t n0 = n < 0 ? 0 : n;
-    for (int64_t m=n0 ; m < nidx ; m+=1) {
+    for (int64_t m=n ; m < nidx ; m+=1) {
      const double* restrict ck = coefficient_t + idx[m]*mo_num;
      const double a1 = av1[m];
      const double a2 = av2[m];