Fixed bug in ao_value

2024-07-17 16:33:59 +02:00 · 2022-08-14 16:09:58 +02:00 · 2022-08-14 16:09:58 +02:00 · 5512c1b41c
commit 5512c1b41c
parent 725e488199
2 changed files with 116 additions and 130 deletions
--- a/org/qmckl_ao.org
+++ b/org/qmckl_ao.org
@ -5648,33 +5648,14 @@ end function qmckl_compute_ao_value_doc_f
    | ~coef_normalized~     | ~double[prim_num]~          | in     | Value, gradients and Laplacian of the shells |
    | ~ao_value~            | ~double[point_num][ao_num]~ | out    | Values of the AOs                            |
-     #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
+     #+NAME:ao_value_constants
-#ifdef HAVE_HPC
+     #+begin_src c :exports none
-qmckl_exit_code
+  int32_t lstart[32] __attribute__((aligned(64)));
 qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
                                     const int64_t ao_num,
                                     const int64_t shell_num,
                                     const int32_t* restrict prim_num_per_nucleus,
                                     const int64_t point_num,
                                     const int64_t nucl_num,
                                     const double* restrict coord,
                                     const double* restrict nucl_coord,
                                     const int64_t* restrict nucleus_index,
                                     const int64_t* restrict nucleus_shell_num,
                                     const double* nucleus_range,
                                     const int32_t* restrict nucleus_max_ang_mom,
                                     const int32_t* restrict shell_ang_mom,
                                     const double* restrict ao_factor,
                                     const qmckl_matrix expo_per_nucleus,
                                     const qmckl_tensor coef_per_nucleus,
                                     double* restrict const ao_value )
 {
  int32_t lstart[32];
  for (int32_t l=0 ; l<32 ; ++l) {
    lstart[l] = l*(l+1)*(l+2)/6;
  }
-  int64_t ao_index[shell_num+1];
+  int64_t ao_index[shell_num+1] __attribute__((aligned(64)));
  int64_t size_max = 0;
  int64_t prim_max = 0;
  int64_t shell_max = 0;
@ -5700,17 +5681,42 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
 /* Don't compute polynomials when the radial part is zero. */
  double cutoff = 27.631021115928547; // -log(1.e-12)
     #+end_src
     #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
 #ifdef HAVE_HPC
 qmckl_exit_code
 qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
                                     const int64_t ao_num,
                                     const int64_t shell_num,
                                     const int32_t* restrict prim_num_per_nucleus,
                                     const int64_t point_num,
                                     const int64_t nucl_num,
                                     const double* restrict coord,
                                     const double* restrict nucl_coord,
                                     const int64_t* restrict nucleus_index,
                                     const int64_t* restrict nucleus_shell_num,
                                     const double* nucleus_range,
                                     const int32_t* restrict nucleus_max_ang_mom,
                                     const int32_t* restrict shell_ang_mom,
                                     const double* restrict ao_factor,
                                     const qmckl_matrix expo_per_nucleus,
                                     const qmckl_tensor coef_per_nucleus,
                                     double* restrict const ao_value )
 {
  <<ao_value_constants>>
 #ifdef HAVE_OPENMP
 #pragma omp parallel
 #endif
  {
    qmckl_exit_code rc;
-    double ar2[prim_max];
+    double ar2[prim_max] __attribute__((aligned(64)));
-    int32_t powers[3*size_max];
+    int32_t powers[3*size_max] __attribute__((aligned(64)));
-    double  poly_vgl[5*size_max];
+    double  poly_vgl[5*size_max] __attribute__((aligned(64)));
-    double exp_mat[prim_max];
+    double exp_mat[prim_max] __attribute__((aligned(64)));
-    double ce_mat[shell_max];
+    double ce_mat[shell_max] __attribute__((aligned(64)));
    double coef_mat[nucl_num][shell_max][prim_max];
    for (int i=0 ; i<nucl_num ; ++i) {
@ -5725,12 +5731,14 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
 #pragma omp for
 #endif
    for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) {
-      const double e_coord[3] = { coord[ipoint],
+      const double e_coord[3]  __attribute__((aligned(64))) =
        { coord[ipoint],
          coord[ipoint + point_num],
          coord[ipoint + 2*point_num] };
      for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) {
-        const double n_coord[3] = { nucl_coord[inucl],
+        const double n_coord[3] __attribute__((aligned(64))) =
          { nucl_coord[inucl],
            nucl_coord[inucl + nucl_num],
            nucl_coord[inucl + 2*nucl_num] };
@ -5751,14 +5759,14 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
          break;
        case 1:
-          poly_vgl[0] = 0.;
+          poly_vgl[0] = 1.;
          poly_vgl[1] = x;
          poly_vgl[2] = y;
          poly_vgl[3] = z;
          break;
        case 2:
-          poly_vgl[0] = 0.;
+          poly_vgl[0] = 1.;
          poly_vgl[1] = x;
          poly_vgl[2] = y;
          poly_vgl[3] = z;
@ -5798,9 +5806,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
        for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
          ce_mat[i] = 0.;
        }
          for (int k=0 ; k<nidx; ++k) {
          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
            ce_mat[i] += coef_mat[inucl][i][k] * exp_mat[k];
          }
        }
@ -5811,7 +5817,6 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
        for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
          const double s1 = ce_mat[ishell-ishell_start];
          if (s1 == 0.0) continue;
          const int64_t k = ao_index[ishell];
          double* restrict const ao_value_1 = ao_value + ipoint*ao_num + k;
@ -5819,6 +5824,13 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
          const int32_t l = shell_ang_mom[ishell];
          const int32_t n = lstart[l+1]-lstart[l];
          if (s1 == 0.0) {
            for (int64_t il=0 ; il<n ; ++il) {
              ao_value_1[il] = 0.;
            }
            continue;
          }
          double* restrict poly_vgl_1 = NULL;
          if (nidx > 0) {
            const double* restrict f = ao_factor + k;
@ -5827,10 +5839,10 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
            poly_vgl_1 = &(poly_vgl[idx]);
            switch (n) {
-            case(1):
+            case 1:
              ao_value_1[0] = s1 * f[0];
              break;
-            case (3):
+            case 3:
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@ -5848,7 +5860,7 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
              break;
            default:
 #ifdef HAVE_OPENMP
-#pragma omp simd simdlen(8)
+#pragma omp simd
 #endif
              for (int il=0 ; il<n ; ++il) {
                ao_value_1[il] =  poly_vgl_1[il] * s1 * f[il];
@ -6470,36 +6482,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                   const qmckl_tensor coef_per_nucleus,
                                   double* restrict const ao_vgl )
 {
-  int32_t lstart[32];
+ <<ao_value_constants>>
  for (int32_t l=0 ; l<32 ; ++l) {
    lstart[l] = l*(l+1)*(l+2)/6;
  }
  int64_t ao_index[shell_num+1];
  int64_t size_max = 0;
  int64_t prim_max = 0;
  int64_t shell_max = 0;
  {
    int64_t k=0;
    for (int inucl=0 ; inucl < nucl_num ; ++inucl) {
      prim_max = prim_num_per_nucleus[inucl] > prim_max ?
        prim_num_per_nucleus[inucl] : prim_max;
      shell_max = nucleus_shell_num[inucl] > shell_max ?
        nucleus_shell_num[inucl] : shell_max;
      const int64_t ishell_start = nucleus_index[inucl];
      const int64_t ishell_end   = nucleus_index[inucl] + nucleus_shell_num[inucl];
      for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) {
        const int l = shell_ang_mom[ishell];
        ao_index[ishell] = k;
        k += lstart[l+1] - lstart[l];
        size_max = size_max < lstart[l+1] ? lstart[l+1] : size_max;
      }
    }
    ao_index[shell_num] = ao_num+1;
  }
 /* Don't compute when the radial part is zero. */
  double cutoff = 27.631021115928547; // -log(1.e-12)
 #ifdef HAVE_OPENMP
 #pragma omp parallel
@ -6508,18 +6491,6 @@ qmckl_compute_ao_vgl_hpc_gaussian (
    qmckl_exit_code rc;
    double ar2[prim_max]  __attribute__((aligned(64)));
    int32_t powers[3*size_max]  __attribute__((aligned(64)));
    double  poly_vgl_l1[4][4] __attribute__((aligned(64))) =
                                {{1.0, 0.0, 0.0, 0.0},
                                 {0.0, 1.0, 0.0, 0.0},
                                 {0.0, 0.0, 1.0, 0.0},
                                 {0.0, 0.0, 0.0, 1.0}};
    double  poly_vgl_l2[5][10]__attribute__((aligned(64))) =
                                 {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
                                  {0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
                                  {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
                                  {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
    double  poly_vgl[5][size_max]  __attribute__((aligned(64)));
    double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
    double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
@ -6533,6 +6504,19 @@ qmckl_compute_ao_vgl_hpc_gaussian (
      }
    }
    double  poly_vgl_l1[4][4] __attribute__((aligned(64))) =
                                {{1.0, 0.0, 0.0, 0.0},
                                 {0.0, 1.0, 0.0, 0.0},
                                 {0.0, 0.0, 1.0, 0.0},
                                 {0.0, 0.0, 0.0, 1.0}};
    double  poly_vgl_l2[5][10]__attribute__((aligned(64))) =
                                 {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
                                  {0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
                                  {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
                                  {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
    double  poly_vgl[5][size_max]  __attribute__((aligned(64)));
 #ifdef HAVE_OPENMP
 #pragma omp for
 #endif
@ -6616,6 +6600,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
          exp_mat[iprim][0] = exp(-ar2[iprim]);
        }
        for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) {
          double f = qmckl_mat(expo_per_nucleus, iprim, inucl) * exp_mat[iprim][0];
          f = -f-f;
@ -6628,21 +6613,6 @@ qmckl_compute_ao_vgl_hpc_gaussian (
 /* --- */
        switch (8) {
        case(5):
          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
            for (int j=0 ; j<5 ; ++j) {
              ce_mat[i][j] = 0.;
            }
            for (int k=0 ; k<nidx; ++k) {
              const double cm = coef_mat[inucl][i][k];
              for (int j=0 ; j<5 ; ++j) {
                ce_mat[i][j] += cm * exp_mat[k][j];
              }
            }
          }
          break;
        case(8):
          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
@ -6664,6 +6634,21 @@ qmckl_compute_ao_vgl_hpc_gaussian (
          }
          break;
        case(5):
          for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
            for (int j=0 ; j<5 ; ++j) {
              ce_mat[i][j] = 0.;
            }
            for (int k=0 ; k<nidx; ++k) {
              const double cm = coef_mat[inucl][i][k];
              for (int j=0 ; j<5 ; ++j) {
                ce_mat[i][j] += cm * exp_mat[k][j];
              }
            }
          }
          break;
        case(512):
          for(int i=0; i<nucleus_shell_num[inucl]; ++i){
            __m512d cemat_avx512;
@ -6778,14 +6763,14 @@ qmckl_compute_ao_vgl_hpc_gaussian (
              poly_vgl_5 = &(poly_vgl[4][idx]);
            }
            switch (n) {
-            case(1):
+            case 1:
              ao_vgl_1[0] = s1 * f[0];
              ao_vgl_2[0] = s2 * f[0];
              ao_vgl_3[0] = s3 * f[0];
              ao_vgl_4[0] = s4 * f[0];
              ao_vgl_5[0] = s5;
              break;
-            case (3):
+            case 3:
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
@ -6800,7 +6785,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
                                     poly_vgl_4[il] * s4 )) * f[il];
              }
              break;
-            case(6):
+            case 6:
 #ifdef HAVE_OPENMP
 #pragma omp simd
 #endif
--- a/org/qmckl_mo.org
+++ b/org/qmckl_mo.org
@ -628,15 +628,8 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context)
 #+end_src
    #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
    rc = qmckl_provide_ao_basis_ao_value(context);
    if (rc != QMCKL_SUCCESS) {
      return qmckl_failwith( context,
                             QMCKL_NOT_PROVIDED,
                             "qmckl_ao_value",
                             NULL);
    }
    #+begin_src c :comments org :tangle (eval c) :noweb yes  :exports none
    if (ctx->mo_basis.mo_vgl_date == ctx->point.date) {
      // mo_vgl has been computed at this step: Just copy the data.
@ -653,6 +646,14 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context)
    } else {
      rc = qmckl_provide_ao_basis_ao_value(context);
      if (rc != QMCKL_SUCCESS) {
        return qmckl_failwith( context,
                               QMCKL_NOT_PROVIDED,
                               "qmckl_ao_value",
                               NULL);
      }
      rc = qmckl_compute_mo_basis_mo_value(context,
                                           ctx->ao_basis.ao_num,
                                           ctx->mo_basis.mo_num,
@ -664,6 +665,8 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context)
    }
    #+end_src
 #+CALL: write_provider_post( group="mo_basis", data="mo_value" )
 #+RESULTS:
@ -727,8 +730,8 @@ integer function qmckl_compute_mo_basis_mo_value_doc_f(context, &
     do j=1,point_num
        mo_value(:,j) = 0.d0
        do k=1,ao_num
           if (ao_value(k,j) /= 0.d0) then
           c1 = ao_value(k,j)
           if (c1 /= 0.d0) then
              do i=1,mo_num
                 mo_value(i,j) = mo_value(i,j) + coefficient_t(i,k) * c1
              end do
@ -736,7 +739,7 @@ integer function qmckl_compute_mo_basis_mo_value_doc_f(context, &
        end do
     end do
-  else ! dgemm
+  else ! dgemm for checking
    LDA = size(coefficient_t,1)
    LDB = size(ao_value,1) 
@ -874,7 +877,7 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context,
      }
    }
-    int64_t n;
+    int64_t n=0;
    for (n=0 ; n < nidx-4 ; n+=4) {
      const double* restrict ck1 = coefficient_t + idx[n  ]*mo_num;
@ -895,8 +898,7 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context,
      }
    }
-    const int64_t n0 = n < 0 ? 0 : n;
+    for (int64_t m=n ; m < nidx ; m+=1) {
    for (int64_t m=n0 ; m < nidx ; m+=1) {
      const double* restrict ck = coefficient_t + idx[m]*mo_num;
      const double a1 = av1[m];
@ -1355,7 +1357,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
      }
    }
-    int64_t n;
+    int64_t n=0;
    for (n=0 ; n < nidx-4 ; n+=4) {
      const double* restrict ck1 = coefficient_t + idx[n  ]*mo_num;
@ -1400,8 +1402,7 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
      }
    }
-    const int64_t n0 = n < 0 ? 0 : n;
+    for (int64_t m=n ; m < nidx ; m+=1) {
    for (int64_t m=n0 ; m < nidx ; m+=1) {
      const double* restrict ck = coefficient_t + idx[m]*mo_num;
      const double a1 = av1[m];
      const double a2 = av2[m];