1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2024-07-16 08:00:43 +02:00

Sparse AO contraction coefficients

This commit is contained in:
Anthony Scemama 2022-10-11 17:46:30 +02:00
parent be376d70d0
commit 1b02b49f51

View File

@ -5718,13 +5718,21 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
double exp_mat[prim_max] __attribute__((aligned(64)));
double ce_mat[shell_max] __attribute__((aligned(64)));
double coef_mat[nucl_num][shell_max][prim_max];
int coef_mat_sparse_idx[nucl_num][shell_max][prim_max+1];
double coef_mat_sparse [nucl_num][shell_max][prim_max+1];
for (int i=0 ; i<nucl_num ; ++i) {
for (int j=0 ; j<shell_max; ++j) {
int l=1;
for (int k=0 ; k<prim_max; ++k) {
coef_mat[i][j][k] = qmckl_ten3(coef_per_nucleus,k, j, i);
const double c = qmckl_ten3(coef_per_nucleus,k, j, i);
if (c != 0.) {
coef_mat_sparse_idx[i][j][l] = k;
coef_mat_sparse[i][j][l] = c;
l+=1;
}
}
coef_mat_sparse_idx[i][j][0] = l;
}
}
#ifdef HAVE_OPENMP
@ -5806,8 +5814,12 @@ qmckl_compute_ao_value_hpc_gaussian (const qmckl_context context,
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
ce_mat[i] = 0.;
for (int k=0 ; k<nidx; ++k) {
ce_mat[i] += coef_mat[inucl][i][k] * exp_mat[k];
const int* idx = &(coef_mat_sparse_idx[inucl][i][0]);
const double* v = &(coef_mat_sparse[inucl][i][0]);
for (int l=1 ; l<coef_mat_sparse_idx[inucl][i][0]; ++l) {
const int k = idx[l];
if (k >= nidx) break;
ce_mat[i] += v[l] * exp_mat[k];
}
}
@ -6264,6 +6276,7 @@ printf(" ao_value ao_value[26][223] %25.15e\n", ao_value[26][223]);
printf(" ao_value ao_value[26][224] %25.15e\n", ao_value[26][224]);
printf("\n");
printf("%e %e\n", ao_value[26][219], 1.020298798341620e-08);
assert( fabs(ao_value[26][219] - ( 1.020298798341620e-08)) < 1.e-14 );
assert( fabs(ao_value[26][220] - ( 1.516643537739178e-08)) < 1.e-14 );
assert( fabs(ao_value[26][221] - ( -4.686370882518819e-09)) < 1.e-14 );
@ -6495,13 +6508,21 @@ qmckl_compute_ao_vgl_hpc_gaussian (
double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
double coef_mat[nucl_num][shell_max][prim_max] __attribute__((aligned(64)));
double coef_mat_sparse[nucl_num][shell_max][prim_max+1];
int coef_mat_sparse_idx[nucl_num][shell_max][prim_max+1];
for (int i=0 ; i<nucl_num ; ++i) {
for (int j=0 ; j<shell_max; ++j) {
int l=1;
for (int k=0 ; k<prim_max; ++k) {
coef_mat[i][j][k] = qmckl_ten3(coef_per_nucleus,k, j, i);
const double c = qmckl_ten3(coef_per_nucleus,k, j, i);
if (c != 0.) {
coef_mat_sparse_idx[i][j][l] = k;
coef_mat_sparse[i][j][l] = c;
l+=1;
}
}
coef_mat_sparse_idx[i][j][0] = l;
}
}
double poly_vgl_l1[4][4] __attribute__((aligned(64))) =
@ -6612,9 +6633,8 @@ qmckl_compute_ao_vgl_hpc_gaussian (
/* --- */
switch (5) {
case(8):
// if (do_sparse) {
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
#ifdef HAVE_OPENMP
#pragma omp simd
@ -6622,79 +6642,20 @@ qmckl_compute_ao_vgl_hpc_gaussian (
for (int j=0 ; j<8 ; ++j) {
ce_mat[i][j] = 0.;
}
for (int k=0 ; k<nidx; ++k) {
const double cm = coef_mat[inucl][i][k];
const int* idx = &(coef_mat_sparse_idx[inucl][i][0]);
const double* v = &(coef_mat_sparse[inucl][i][0]);
for (int l=1 ; l<idx[0]; ++l) {
const int k = idx[l];
if (k >= nidx) break;
#ifdef HAVE_OPENMP
#pragma omp simd
#endif
for (int j=0 ; j<8 ; ++j) {
ce_mat[i][j] += cm * exp_mat[k][j];
ce_mat[i][j] += v[l] * exp_mat[k][j];
}
}
}
break;
case(5):
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
for (int j=0 ; j<5 ; ++j) {
ce_mat[i][j] = 0.;
}
for (int k=0 ; k<nidx; ++k) {
const double cm = coef_mat[inucl][i][k];
for (int j=0 ; j<5 ; ++j) {
ce_mat[i][j] += cm * exp_mat[k][j];
}
}
}
break;
case(512):
for(int i=0; i<nucleus_shell_num[inucl]; ++i){
__m512d cemat_avx512;
__m512d coefmat_avx512;
__m512d expmat_avx512;
// cemat_avx512 = _mm512_load_pd(&(ce_mat[i][0]));
cemat_avx512 = _mm512_xor_pd(cemat_avx512,cemat_avx512);
for(int k=0; k<nidx; ++k){
coefmat_avx512 = _mm512_set1_pd(coef_mat[inucl][i][k]);
expmat_avx512 = _mm512_load_pd(&(exp_mat[k][0]));
cemat_avx512 = _mm512_fmadd_pd(coefmat_avx512, expmat_avx512, cemat_avx512);
}
_mm512_store_pd(&(ce_mat[i][0]),cemat_avx512);
}
break;
case(256):
for(int i=0; i<nucleus_shell_num[inucl]; ++i){
__m256d cematlow_avx2;
__m256d cemathigh_avx2;
__m256d coefmat_avx2;
__m256d expmatlow_avx2;
__m256d expmathigh_avx2;
// cematlow_avx2 = _mm256_load_pd(&(ce_mat[i][0]));
// cemathigh_avx2 = _mm256_load_pd(&(ce_mat[i][4]));
cematlow_avx2 = _mm256_xor_pd(cematlow_avx2,cematlow_avx2);
cemathigh_avx2 = _mm256_xor_pd(cemathigh_avx2,cemathigh_avx2);
for(int k=0; k<nidx; ++k){
coefmat_avx2 = _mm256_set1_pd(coef_mat[inucl][i][k]);
expmatlow_avx2 = _mm256_load_pd(&(exp_mat[k][0]));
expmathigh_avx2 = _mm256_load_pd(&(exp_mat[k][4]));
cematlow_avx2 = _mm256_fmadd_pd(coefmat_avx2, expmatlow_avx2, cematlow_avx2);
cemathigh_avx2 = _mm256_fmadd_pd(coefmat_avx2, expmathigh_avx2, cemathigh_avx2);
}
_mm256_store_pd(&ce_mat[i][0],cematlow_avx2);
_mm256_store_pd(&ce_mat[i][4],cemathigh_avx2);
}
}
const int64_t ishell_start = nucleus_index[inucl];
const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl];