1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-05 11:00:36 +01:00

Intrinsics for AOs and alignment

This commit is contained in:
Anthony Scemama 2022-06-29 13:31:58 +02:00
parent 49e535feb9
commit d5fcd2e0fe

View File

@ -111,6 +111,10 @@ int main() {
#include "qmckl_memory_private_func.h" #include "qmckl_memory_private_func.h"
#include "qmckl_ao_private_type.h" #include "qmckl_ao_private_type.h"
#include "qmckl_ao_private_func.h" #include "qmckl_ao_private_func.h"
#ifdef HAVE_HPC
#include <immintrin.h>
#endif
#+end_src #+end_src
* Context * Context
@ -6411,23 +6415,25 @@ qmckl_compute_ao_vgl_hpc_gaussian (
#endif #endif
{ {
qmckl_exit_code rc; qmckl_exit_code rc;
double ar2[prim_max]; double ar2[prim_max] __attribute__((aligned(64)));
int32_t powers[3*size_max]; int32_t powers[3*size_max] __attribute__((aligned(64)));
double poly_vgl_l1[4][4] = {{1.0, 0.0, 0.0, 0.0}, double poly_vgl_l1[4][4] __attribute__((aligned(64))) =
{{1.0, 0.0, 0.0, 0.0},
{0.0, 1.0, 0.0, 0.0}, {0.0, 1.0, 0.0, 0.0},
{0.0, 0.0, 1.0, 0.0}, {0.0, 0.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 1.0}}; {0.0, 0.0, 0.0, 1.0}};
double poly_vgl_l2[5][10] = {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, double poly_vgl_l2[5][10]__attribute__((aligned(64))) =
{{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0.}, {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0.}, {0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0.}, {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}}; {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
double poly_vgl[5][size_max]; double poly_vgl[5][size_max] __attribute__((aligned(64)));
double exp_mat[prim_max][8]; double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
double ce_mat[shell_max][8]; double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
double coef_mat[nucl_num][shell_max][prim_max]; double coef_mat[nucl_num][shell_max][prim_max] __attribute__((aligned(64)));
for (int i=0 ; i<nucl_num ; ++i) { for (int i=0 ; i<nucl_num ; ++i) {
for (int j=0 ; j<shell_max; ++j) { for (int j=0 ; j<shell_max; ++j) {
for (int k=0 ; k<prim_max; ++k) { for (int k=0 ; k<prim_max; ++k) {
@ -6440,12 +6446,14 @@ qmckl_compute_ao_vgl_hpc_gaussian (
#pragma omp for #pragma omp for
#endif #endif
for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) { for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) {
const double e_coord[3] = { coord[ipoint], const double e_coord[3] __attribute__((aligned(64))) =
{ coord[ipoint],
coord[ipoint + point_num], coord[ipoint + point_num],
coord[ipoint + 2*point_num] }; coord[ipoint + 2*point_num] };
for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) { for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) {
const double n_coord[3] = { nucl_coord[inucl], const double n_coord[3] __attribute__((aligned(64))) =
{ nucl_coord[inucl],
nucl_coord[inucl + nucl_num], nucl_coord[inucl + nucl_num],
nucl_coord[inucl + 2*nucl_num] }; nucl_coord[inucl + 2*nucl_num] };
@ -6528,7 +6536,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
/* --- */ /* --- */
switch (5) { switch (512) {
case(5): case(5):
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) { for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
@ -6536,13 +6544,14 @@ qmckl_compute_ao_vgl_hpc_gaussian (
ce_mat[i][j] = 0.; ce_mat[i][j] = 0.;
} }
for (int k=0 ; k<nidx; ++k) { for (int k=0 ; k<nidx; ++k) {
const double cm = coef_mat[inucl][i][k];
for (int j=0 ; j<5 ; ++j) { for (int j=0 ; j<5 ; ++j) {
ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j]; ce_mat[i][j] += cm * exp_mat[k][j];
} }
} }
} }
break; break;
case(8): case(8):
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) { for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
@ -6553,87 +6562,62 @@ qmckl_compute_ao_vgl_hpc_gaussian (
ce_mat[i][j] = 0.; ce_mat[i][j] = 0.;
} }
for (int k=0 ; k<nidx; ++k) { for (int k=0 ; k<nidx; ++k) {
const double cm = coef_mat[inucl][i][k];
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp simd simdlen(8) #pragma omp simd simdlen(8)
#endif #endif
for (int j=0 ; j<8 ; ++j) { for (int j=0 ; j<8 ; ++j) {
ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j]; ce_mat[i][j] += cm * exp_mat[k][j];
} }
} }
} }
break; break;
/*
case(256):
// Following loop is the assembly version AVX2
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
__asm__ volatile (
"mov %[k],%%RSI" "\n\t" // &(nidx)
"mov %[a],%%RAX" "\n\t" // &(coef_mat[inucl][i][k])
"mov %[b],%%RBX" "\n\t" // &(exp_mat[k][0])
"mov %[c],%%RCX" "\n\t" // &(ce_mat[i][0])
"vxorpd %%YMM0,%%YMM0,%%YMM0" "\n\t" // ce_mat[i][:] = 0.
"vxorpd %%YMM2,%%YMM2,%%YMM2" "\n\t" // ce_mat[i][:] = 0.
"test %%RSI,%%RSI" "\n\t"
"je .L" "K_LOOP" "%=" "\n\t"
".L" "LOOP1" "%=" ":\n\t"
"vbroadcastsd 0*8(%%RAX),%%YMM1" "\n\t"
"vfmadd231pd 0*8(%%RBX),%%YMM1,%%YMM0" "\n\t"
"vfmadd231pd 4*8(%%RBX),%%YMM1,%%YMM2" "\n\t"
"lea 1*8(%%RAX),%%RAX" "\n\t"
"lea 8*8(%%RBX),%%RBX" "\n\t"
"dec %%RSI" "\n\t"
"jne .L" "LOOP1" "%=" "\n\t"
".L" "K_LOOP" "%=" ":\n\t"
"vmovupd %%YMM0,0*8(%%RCX)" "\n\t"
"vmovupd %%YMM2,4*8(%%RCX)" "\n\t"
: :
[k] "m"(nidx),
[c] "m"(&(ce_mat[i][0])),
[a] "m"(&(coef_mat[inucl][i][0])),
[b] "m"(&(exp_mat[0][0]))
: "rax", "rbx", "rcx", "rsi",
"ymm0", "ymm1", "ymm2", "memory" );
}
break;
case(512): case(512):
// Following loop is the assembly version AVX512
for(int i=0; i<nucleus_shell_num[inucl]; ++i){ for(int i=0; i<nucleus_shell_num[inucl]; ++i){
__asm__ volatile ( __m512d cemat_avx512;
"mov %[k],%%RSI" "\n\t" // &(nidx) __m512d coefmat_avx512;
"mov %[a],%%RAX" "\n\t" // &(coef_mat[inucl][i][k]) __m512d expmat_avx512;
"mov %[b],%%RBX" "\n\t" // &(exp_mat[k][0])
"mov %[c],%%RCX" "\n\t" // &(ce_mat[i][0])
"vxorpd %%ZMM0,%%ZMM0,%%ZMM0" "\n\t" // ce_mat[i][:] = 0. // cemat_avx512 = _mm512_load_pd(&(ce_mat[i][0]));
cemat_avx512 = _mm512_xor_pd(cemat_avx512,cemat_avx512);
"test %%RSI,%%RSI" "\n\t" for(int k=0; k<nidx; ++k){
"je .L" "K_LOOP" "%=" "\n\t" coefmat_avx512 = _mm512_set1_pd(coef_mat[inucl][i][k]);
".L" "LOOP1" "%=" ":\n\t" expmat_avx512 = _mm512_load_pd(&(exp_mat[k][0]));
cemat_avx512 = _mm512_fmadd_pd(coefmat_avx512, expmat_avx512, cemat_avx512);
}
_mm512_store_pd(&(ce_mat[i][0]),cemat_avx512);
}
break;
"vbroadcastsd 0*8(%%RAX),%%ZMM1" "\n\t" case(256):
"vfmadd231pd 0*8(%%RBX),%%ZMM1,%%ZMM0" "\n\t" for(int i=0; i<nucleus_shell_num[inucl]; ++i){
"lea 1*8(%%RAX),%%RAX" "\n\t" __m256d cematlow_avx2;
"lea 8*8(%%RBX),%%RBX" "\n\t" __m256d cemathigh_avx2;
__m256d coefmat_avx2;
__m256d expmatlow_avx2;
__m256d expmathigh_avx2;
"dec %%RSI" "\n\t" // cematlow_avx2 = _mm256_load_pd(&(ce_mat[i][0]));
"jne .L" "LOOP1" "%=" "\n\t" // cemathigh_avx2 = _mm256_load_pd(&(ce_mat[i][4]));
".L" "K_LOOP" "%=" ":\n\t"
"vmovupd %%ZMM0,0*8(%%RCX)" "\n\t"
: : [k] "m"(nidx), [c] "m"(&(ce_mat[i][0])), [a] "m"(&(coef_mat[inucl][i][0])), [b] "m"(&(exp_mat[0][0])) : "rax", "rbx", "rcx", "rsi", "zmm0", "zmm1", "memory" ); cematlow_avx2 = _mm256_xor_pd(cematlow_avx2,cematlow_avx2);
cemathigh_avx2 = _mm256_xor_pd(cemathigh_avx2,cemathigh_avx2);
for(int k=0; k<nidx; ++k){
coefmat_avx2 = _mm256_set1_pd(coef_mat[inucl][i][k]);
expmatlow_avx2 = _mm256_load_pd(&(exp_mat[k][0]));
expmathigh_avx2 = _mm256_load_pd(&(exp_mat[k][4]));
cematlow_avx2 = _mm256_fmadd_pd(coefmat_avx2, expmatlow_avx2, cematlow_avx2);
cemathigh_avx2 = _mm256_fmadd_pd(coefmat_avx2, expmathigh_avx2, cemathigh_avx2);
} }
*/ _mm256_store_pd(&ce_mat[i][0],cematlow_avx2);
_mm256_store_pd(&ce_mat[i][4],cemathigh_avx2);
}
} }
const int64_t ishell_start = nucleus_index[inucl]; const int64_t ishell_start = nucleus_index[inucl];