Optimize AOs

2024-12-22 12:23:56 +01:00 · 2022-02-25 16:30:16 +01:00 · 2022-02-25 16:30:16 +01:00 · b6a31b8c58
commit b6a31b8c58
parent ff526a18cb
1 changed file with 92 additions and 65 deletions
--- a/org/qmckl_ao.org
+++ b/org/qmckl_ao.org
@ -3040,11 +3040,11 @@ integer(c_int32_t) function test_qmckl_ao_gaussian_vgl(context) bind(C)
  integer(c_int64_t), intent(in), value :: context

  integer*8                     :: n, ldv, j, i
-  double precision              :: X(3), R(3), Y(3), r2
+  double precision              :: X(3), R(3), Y(3), r2, z
  double precision, allocatable :: VGL(:,:), A(:)
  double precision              :: epsilon

-  epsilon = qmckl_get_numprec_epsilon(context)
+  epsilon = 3.d0 * qmckl_get_numprec_epsilon(context)

  X = (/ 1.1 , 2.2 ,  3.3 /)
  R = (/ 0.1 , 1.2 , -2.3 /)
@ -3068,29 +3068,43 @@ integer(c_int32_t) function test_qmckl_ao_gaussian_vgl(context) bind(C)

  do i=1,n
     test_qmckl_ao_gaussian_vgl = -11
-     if (dabs(1.d0 - VGL(i,1) / (&
-          dexp(-A(i) * r2) &
-          )) > epsilon ) return
+     z = dabs(1.d0 - VGL(i,1) / (dexp(-A(i) * r2)) )
+     if ( z > epsilon ) then
+        print *, z, epsilon
+        return
+     end if

     test_qmckl_ao_gaussian_vgl = -12
-     if (dabs(1.d0 - VGL(i,2) / (&
-          -2.d0 * A(i) * Y(1) * dexp(-A(i) * r2) &
-          )) > epsilon ) return
+     z = dabs(1.d0 - VGL(i,2) / (&
+          -2.d0 * A(i) * Y(1) * dexp(-A(i) * r2) ))
+     if ( z > epsilon ) then
+        print *, z, epsilon
+        return
+     end if

     test_qmckl_ao_gaussian_vgl = -13
-     if (dabs(1.d0 - VGL(i,3) / (&
-          -2.d0 * A(i) * Y(2) * dexp(-A(i) * r2) &
-          )) > epsilon ) return
+     z = dabs(1.d0 - VGL(i,3) / (&
+          -2.d0 * A(i) * Y(2) * dexp(-A(i) * r2) ))
+     if ( z > epsilon ) then
+        print *, z, epsilon
+        return
+     end if

     test_qmckl_ao_gaussian_vgl = -14
-     if (dabs(1.d0 - VGL(i,4) / (&
-          -2.d0 * A(i) * Y(3) * dexp(-A(i) * r2) &
-          )) > epsilon ) return
+     z = dabs(1.d0 - VGL(i,4) / (&
+          -2.d0 * A(i) * Y(3) * dexp(-A(i) * r2) ))
+     if ( z > epsilon ) then
+        print *, z, epsilon
+        return
+     end if

     test_qmckl_ao_gaussian_vgl = -15
-     if (dabs(1.d0 - VGL(i,5) / (&
-          A(i) * (4.d0*r2*A(i) - 6.d0) * dexp(-A(i) * r2) &
-          )) > epsilon ) return
+     z = dabs(1.d0 - VGL(i,5) / (&
+          A(i) * (4.d0*r2*A(i) - 6.d0) * dexp(-A(i) * r2) ))
+     if ( z > epsilon ) then
+        print *, z, epsilon
+        return
+     end if
  end do

  test_qmckl_ao_gaussian_vgl = 0
@ -5423,16 +5437,16 @@ qmckl_compute_ao_vgl_hpc_gaussian (
             const double s4 = s6_*z;
             const double s5 = s5_;

+             const int64_t k = ao_index[ishell];
+             double* __restrict__ const ao_vgl_1 = ao_vgl + ipoint*5*ao_num + k;

             const int32_t l = shell_ang_mom[ishell];
             const int32_t n = lstart[l+1]-lstart[l];
-             const int64_t k = ao_index[ishell];

-             double* __restrict__ const ao_vgl_1 = &(ao_vgl[ipoint*5*ao_num+k]);
-             double* __restrict__ const ao_vgl_2 = &(ao_vgl_1[  ao_num]);
-             double* __restrict__ const ao_vgl_3 = &(ao_vgl_1[2*ao_num]);
-             double* __restrict__ const ao_vgl_4 = &(ao_vgl_1[3*ao_num]);
-             double* __restrict__ const ao_vgl_5 = &(ao_vgl_1[4*ao_num]);
+             double* __restrict__ const ao_vgl_2 = ao_vgl_1 + ao_num;
+             double* __restrict__ const ao_vgl_3 = ao_vgl_1 + (ao_num<<1);
+             double* __restrict__ const ao_vgl_4 = ao_vgl_1 + (ao_num<<1) + ao_num;
+             double* __restrict__ const ao_vgl_5 = ao_vgl_1 + (ao_num<<2);

             double* __restrict__ poly_vgl_1;
             double* __restrict__ poly_vgl_2;
@ -5440,71 +5454,84 @@ qmckl_compute_ao_vgl_hpc_gaussian (
             double* __restrict__ poly_vgl_4;
             double* __restrict__ poly_vgl_5;
             if (nidx > 0) {
-                 const double* f = &(ao_factor[k]);
+                 const double* __restrict__ f = ao_factor + k;
                 const int64_t idx = lstart[l];

                 switch (nucleus_max_ang_mom[inucl]) {
                 case 0:
-                   ao_vgl_1[0] = s1 * f[0];
-                   ao_vgl_2[0] = s2 * f[0];
-                   ao_vgl_3[0] = s3 * f[0];
-                   ao_vgl_4[0] = s4 * f[0];
-                   ao_vgl_5[0] = s5;
                   break;
                 case 1:
                   poly_vgl_1 = &(poly_vgl_l1[0][idx]);
                   poly_vgl_2 = &(poly_vgl_l1[1][idx]);
                   poly_vgl_3 = &(poly_vgl_l1[2][idx]);
                   poly_vgl_4 = &(poly_vgl_l1[3][idx]);
-                   for (int64_t il=0 ; il<n ; ++il) {
-                     ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
-                     ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
-                     ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
-                     ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
-                     ao_vgl_5[il] = (poly_vgl_1[il] * s5 +
-                                     2.0*(poly_vgl_2[il] * s2 +
-                                          poly_vgl_3[il] * s3 +
-                                          poly_vgl_4[il] * s4 )) * f[il];
-                   }
                   break;
                 case 2:
                   poly_vgl_1 = &(poly_vgl_l2[0][idx]);
                   poly_vgl_2 = &(poly_vgl_l2[1][idx]);
                   poly_vgl_3 = &(poly_vgl_l2[2][idx]);
                   poly_vgl_4 = &(poly_vgl_l2[3][idx]);
-                   poly_vgl_5 = &(poly_vgl_l2[4][idx]);
-
-                   for (int64_t il=0 ; il<n ; ++il) {
-                     ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
-                     ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
-                     ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
-                     ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
-                     ao_vgl_5[il] = (poly_vgl_5[il] * s1 + poly_vgl_1[il] * s5 +
-                                     2.0*(poly_vgl_2[il] * s2 +
-                                          poly_vgl_3[il] * s3 +
-                                          poly_vgl_4[il] * s4 )) * f[il];
-                   }
                   break;
                 default:
                   poly_vgl_1 = &(poly_vgl[0][idx]);
                   poly_vgl_2 = &(poly_vgl[1][idx]);
                   poly_vgl_3 = &(poly_vgl[2][idx]);
                   poly_vgl_4 = &(poly_vgl[3][idx]);
-                   poly_vgl_5 = &(poly_vgl[4][idx]);
-
-                   for (int64_t il=0 ; il<n ; ++il) {
-                     ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
-                     ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
-                     ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
-                     ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
-                     ao_vgl_5[il] = (poly_vgl_5[il] * s1 + poly_vgl_1[il] * s5 +
-                                     2.0*(poly_vgl_2[il] * s2 +
-                                          poly_vgl_3[il] * s3 +
-                                          poly_vgl_4[il] * s4 )) * f[il];
-                   }
-                   break;
                 }
-
+                 switch (n) {
+                 case(1):
+                     ao_vgl_1[0] = s1 * f[0];
+                     ao_vgl_2[0] = s2 * f[0];
+                     ao_vgl_3[0] = s3 * f[0];
+                     ao_vgl_4[0] = s4 * f[0];
+                     ao_vgl_5[0] = s5;
+                     break;
+                 case (3):
+#ifdef HAVE_OPENMP
+                     #pragma omp simd
+#endif
+                     for (int il=0 ; il<3 ; ++il) {
+                       ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
+                       ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
+                       ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
+                       ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
+                       ao_vgl_5[il] = (poly_vgl_1[il] * s5 +
+                                       2.0*(poly_vgl_2[il] * s2 +
+                                             poly_vgl_3[il] * s3 +
+                                             poly_vgl_4[il] * s4 )) * f[il];
+                     }
+                     break;
+                 case(5):
+#ifdef HAVE_OPENMP
+                     #pragma omp simd
+#endif
+                     for (int il=0 ; il<5 ; ++il) {
+                       ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
+                       ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
+                       ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
+                       ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
+                       ao_vgl_5[il] = (poly_vgl_1[il] * s5 +
+                                       2.0*(poly_vgl_2[il] * s2 +
+                                             poly_vgl_3[il] * s3 +
+                                             poly_vgl_4[il] * s4 )) * f[il];
+                     }
+                     break;
+                 default:
+#ifdef HAVE_OPENMP
+                     #pragma omp simd simdlen(8)
+#endif
+                     for (int il=0 ; il<n ; ++il) {
+                       ao_vgl_1[il] =  poly_vgl_1[il] * s1 * f[il];
+                       ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il];
+                       ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il];
+                       ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il];
+                       ao_vgl_5[il] = (poly_vgl_1[il] * s5 +
+                                       2.0*(poly_vgl_2[il] * s2 +
+                                             poly_vgl_3[il] * s3 +
+                                             poly_vgl_4[il] * s4 )) * f[il];
+                     }
+                     break;
+                 }
             } else {
                 for (int64_t il=0 ; il<n ; ++il) {
                     ao_vgl_1[il] = 0.0;