mirror of
https://github.com/TREX-CoE/qmckl.git
synced 2025-01-05 11:00:36 +01:00
Intrinsics for AOs and alignment
This commit is contained in:
parent
49e535feb9
commit
d5fcd2e0fe
136
org/qmckl_ao.org
136
org/qmckl_ao.org
@ -111,6 +111,10 @@ int main() {
|
|||||||
#include "qmckl_memory_private_func.h"
|
#include "qmckl_memory_private_func.h"
|
||||||
#include "qmckl_ao_private_type.h"
|
#include "qmckl_ao_private_type.h"
|
||||||
#include "qmckl_ao_private_func.h"
|
#include "qmckl_ao_private_func.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_HPC
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
* Context
|
* Context
|
||||||
@ -6411,23 +6415,25 @@ qmckl_compute_ao_vgl_hpc_gaussian (
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
qmckl_exit_code rc;
|
qmckl_exit_code rc;
|
||||||
double ar2[prim_max];
|
double ar2[prim_max] __attribute__((aligned(64)));
|
||||||
int32_t powers[3*size_max];
|
int32_t powers[3*size_max] __attribute__((aligned(64)));
|
||||||
double poly_vgl_l1[4][4] = {{1.0, 0.0, 0.0, 0.0},
|
double poly_vgl_l1[4][4] __attribute__((aligned(64))) =
|
||||||
|
{{1.0, 0.0, 0.0, 0.0},
|
||||||
{0.0, 1.0, 0.0, 0.0},
|
{0.0, 1.0, 0.0, 0.0},
|
||||||
{0.0, 0.0, 1.0, 0.0},
|
{0.0, 0.0, 1.0, 0.0},
|
||||||
{0.0, 0.0, 0.0, 1.0}};
|
{0.0, 0.0, 0.0, 1.0}};
|
||||||
double poly_vgl_l2[5][10] = {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
|
double poly_vgl_l2[5][10]__attribute__((aligned(64))) =
|
||||||
|
{{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
|
||||||
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
|
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
|
||||||
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
|
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
|
||||||
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
|
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
|
||||||
{0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
|
{0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}};
|
||||||
double poly_vgl[5][size_max];
|
double poly_vgl[5][size_max] __attribute__((aligned(64)));
|
||||||
|
|
||||||
double exp_mat[prim_max][8];
|
double exp_mat[prim_max][8] __attribute__((aligned(64))) ;
|
||||||
double ce_mat[shell_max][8];
|
double ce_mat[shell_max][8] __attribute__((aligned(64))) ;
|
||||||
|
|
||||||
double coef_mat[nucl_num][shell_max][prim_max];
|
double coef_mat[nucl_num][shell_max][prim_max] __attribute__((aligned(64)));
|
||||||
for (int i=0 ; i<nucl_num ; ++i) {
|
for (int i=0 ; i<nucl_num ; ++i) {
|
||||||
for (int j=0 ; j<shell_max; ++j) {
|
for (int j=0 ; j<shell_max; ++j) {
|
||||||
for (int k=0 ; k<prim_max; ++k) {
|
for (int k=0 ; k<prim_max; ++k) {
|
||||||
@ -6440,12 +6446,14 @@ qmckl_compute_ao_vgl_hpc_gaussian (
|
|||||||
#pragma omp for
|
#pragma omp for
|
||||||
#endif
|
#endif
|
||||||
for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) {
|
for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) {
|
||||||
const double e_coord[3] = { coord[ipoint],
|
const double e_coord[3] __attribute__((aligned(64))) =
|
||||||
|
{ coord[ipoint],
|
||||||
coord[ipoint + point_num],
|
coord[ipoint + point_num],
|
||||||
coord[ipoint + 2*point_num] };
|
coord[ipoint + 2*point_num] };
|
||||||
|
|
||||||
for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) {
|
for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) {
|
||||||
const double n_coord[3] = { nucl_coord[inucl],
|
const double n_coord[3] __attribute__((aligned(64))) =
|
||||||
|
{ nucl_coord[inucl],
|
||||||
nucl_coord[inucl + nucl_num],
|
nucl_coord[inucl + nucl_num],
|
||||||
nucl_coord[inucl + 2*nucl_num] };
|
nucl_coord[inucl + 2*nucl_num] };
|
||||||
|
|
||||||
@ -6528,7 +6536,7 @@ qmckl_compute_ao_vgl_hpc_gaussian (
|
|||||||
|
|
||||||
|
|
||||||
/* --- */
|
/* --- */
|
||||||
switch (5) {
|
switch (512) {
|
||||||
case(5):
|
case(5):
|
||||||
|
|
||||||
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
|
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
|
||||||
@ -6536,13 +6544,14 @@ qmckl_compute_ao_vgl_hpc_gaussian (
|
|||||||
ce_mat[i][j] = 0.;
|
ce_mat[i][j] = 0.;
|
||||||
}
|
}
|
||||||
for (int k=0 ; k<nidx; ++k) {
|
for (int k=0 ; k<nidx; ++k) {
|
||||||
|
const double cm = coef_mat[inucl][i][k];
|
||||||
for (int j=0 ; j<5 ; ++j) {
|
for (int j=0 ; j<5 ; ++j) {
|
||||||
ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j];
|
ce_mat[i][j] += cm * exp_mat[k][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case(8):
|
case(8):
|
||||||
|
|
||||||
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
|
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
|
||||||
@ -6553,87 +6562,62 @@ qmckl_compute_ao_vgl_hpc_gaussian (
|
|||||||
ce_mat[i][j] = 0.;
|
ce_mat[i][j] = 0.;
|
||||||
}
|
}
|
||||||
for (int k=0 ; k<nidx; ++k) {
|
for (int k=0 ; k<nidx; ++k) {
|
||||||
|
const double cm = coef_mat[inucl][i][k];
|
||||||
#ifdef HAVE_OPENMP
|
#ifdef HAVE_OPENMP
|
||||||
#pragma omp simd simdlen(8)
|
#pragma omp simd simdlen(8)
|
||||||
#endif
|
#endif
|
||||||
for (int j=0 ; j<8 ; ++j) {
|
for (int j=0 ; j<8 ; ++j) {
|
||||||
ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j];
|
ce_mat[i][j] += cm * exp_mat[k][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
/*
|
|
||||||
case(256):
|
|
||||||
|
|
||||||
// Following loop is the assembly version AVX2
|
|
||||||
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
|
|
||||||
__asm__ volatile (
|
|
||||||
"mov %[k],%%RSI" "\n\t" // &(nidx)
|
|
||||||
"mov %[a],%%RAX" "\n\t" // &(coef_mat[inucl][i][k])
|
|
||||||
"mov %[b],%%RBX" "\n\t" // &(exp_mat[k][0])
|
|
||||||
"mov %[c],%%RCX" "\n\t" // &(ce_mat[i][0])
|
|
||||||
|
|
||||||
"vxorpd %%YMM0,%%YMM0,%%YMM0" "\n\t" // ce_mat[i][:] = 0.
|
|
||||||
"vxorpd %%YMM2,%%YMM2,%%YMM2" "\n\t" // ce_mat[i][:] = 0.
|
|
||||||
|
|
||||||
"test %%RSI,%%RSI" "\n\t"
|
|
||||||
"je .L" "K_LOOP" "%=" "\n\t"
|
|
||||||
".L" "LOOP1" "%=" ":\n\t"
|
|
||||||
|
|
||||||
"vbroadcastsd 0*8(%%RAX),%%YMM1" "\n\t"
|
|
||||||
"vfmadd231pd 0*8(%%RBX),%%YMM1,%%YMM0" "\n\t"
|
|
||||||
"vfmadd231pd 4*8(%%RBX),%%YMM1,%%YMM2" "\n\t"
|
|
||||||
"lea 1*8(%%RAX),%%RAX" "\n\t"
|
|
||||||
"lea 8*8(%%RBX),%%RBX" "\n\t"
|
|
||||||
|
|
||||||
"dec %%RSI" "\n\t"
|
|
||||||
"jne .L" "LOOP1" "%=" "\n\t"
|
|
||||||
".L" "K_LOOP" "%=" ":\n\t"
|
|
||||||
"vmovupd %%YMM0,0*8(%%RCX)" "\n\t"
|
|
||||||
"vmovupd %%YMM2,4*8(%%RCX)" "\n\t"
|
|
||||||
|
|
||||||
: :
|
|
||||||
[k] "m"(nidx),
|
|
||||||
[c] "m"(&(ce_mat[i][0])),
|
|
||||||
[a] "m"(&(coef_mat[inucl][i][0])),
|
|
||||||
[b] "m"(&(exp_mat[0][0]))
|
|
||||||
: "rax", "rbx", "rcx", "rsi",
|
|
||||||
"ymm0", "ymm1", "ymm2", "memory" );
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
case(512):
|
case(512):
|
||||||
|
|
||||||
// Following loop is the assembly version AVX512
|
|
||||||
for(int i=0; i<nucleus_shell_num[inucl]; ++i){
|
for(int i=0; i<nucleus_shell_num[inucl]; ++i){
|
||||||
__asm__ volatile (
|
__m512d cemat_avx512;
|
||||||
"mov %[k],%%RSI" "\n\t" // &(nidx)
|
__m512d coefmat_avx512;
|
||||||
"mov %[a],%%RAX" "\n\t" // &(coef_mat[inucl][i][k])
|
__m512d expmat_avx512;
|
||||||
"mov %[b],%%RBX" "\n\t" // &(exp_mat[k][0])
|
|
||||||
"mov %[c],%%RCX" "\n\t" // &(ce_mat[i][0])
|
|
||||||
|
|
||||||
"vxorpd %%ZMM0,%%ZMM0,%%ZMM0" "\n\t" // ce_mat[i][:] = 0.
|
// cemat_avx512 = _mm512_load_pd(&(ce_mat[i][0]));
|
||||||
|
cemat_avx512 = _mm512_xor_pd(cemat_avx512,cemat_avx512);
|
||||||
|
|
||||||
"test %%RSI,%%RSI" "\n\t"
|
for(int k=0; k<nidx; ++k){
|
||||||
"je .L" "K_LOOP" "%=" "\n\t"
|
coefmat_avx512 = _mm512_set1_pd(coef_mat[inucl][i][k]);
|
||||||
".L" "LOOP1" "%=" ":\n\t"
|
expmat_avx512 = _mm512_load_pd(&(exp_mat[k][0]));
|
||||||
|
cemat_avx512 = _mm512_fmadd_pd(coefmat_avx512, expmat_avx512, cemat_avx512);
|
||||||
|
}
|
||||||
|
_mm512_store_pd(&(ce_mat[i][0]),cemat_avx512);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
"vbroadcastsd 0*8(%%RAX),%%ZMM1" "\n\t"
|
case(256):
|
||||||
"vfmadd231pd 0*8(%%RBX),%%ZMM1,%%ZMM0" "\n\t"
|
for(int i=0; i<nucleus_shell_num[inucl]; ++i){
|
||||||
"lea 1*8(%%RAX),%%RAX" "\n\t"
|
__m256d cematlow_avx2;
|
||||||
"lea 8*8(%%RBX),%%RBX" "\n\t"
|
__m256d cemathigh_avx2;
|
||||||
|
__m256d coefmat_avx2;
|
||||||
|
__m256d expmatlow_avx2;
|
||||||
|
__m256d expmathigh_avx2;
|
||||||
|
|
||||||
"dec %%RSI" "\n\t"
|
// cematlow_avx2 = _mm256_load_pd(&(ce_mat[i][0]));
|
||||||
"jne .L" "LOOP1" "%=" "\n\t"
|
// cemathigh_avx2 = _mm256_load_pd(&(ce_mat[i][4]));
|
||||||
".L" "K_LOOP" "%=" ":\n\t"
|
|
||||||
"vmovupd %%ZMM0,0*8(%%RCX)" "\n\t"
|
|
||||||
|
|
||||||
: : [k] "m"(nidx), [c] "m"(&(ce_mat[i][0])), [a] "m"(&(coef_mat[inucl][i][0])), [b] "m"(&(exp_mat[0][0])) : "rax", "rbx", "rcx", "rsi", "zmm0", "zmm1", "memory" );
|
cematlow_avx2 = _mm256_xor_pd(cematlow_avx2,cematlow_avx2);
|
||||||
|
cemathigh_avx2 = _mm256_xor_pd(cemathigh_avx2,cemathigh_avx2);
|
||||||
|
|
||||||
|
for(int k=0; k<nidx; ++k){
|
||||||
|
coefmat_avx2 = _mm256_set1_pd(coef_mat[inucl][i][k]);
|
||||||
|
|
||||||
|
expmatlow_avx2 = _mm256_load_pd(&(exp_mat[k][0]));
|
||||||
|
expmathigh_avx2 = _mm256_load_pd(&(exp_mat[k][4]));
|
||||||
|
|
||||||
|
cematlow_avx2 = _mm256_fmadd_pd(coefmat_avx2, expmatlow_avx2, cematlow_avx2);
|
||||||
|
cemathigh_avx2 = _mm256_fmadd_pd(coefmat_avx2, expmathigh_avx2, cemathigh_avx2);
|
||||||
|
|
||||||
}
|
}
|
||||||
*/
|
_mm256_store_pd(&ce_mat[i][0],cematlow_avx2);
|
||||||
|
_mm256_store_pd(&ce_mat[i][4],cemathigh_avx2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t ishell_start = nucleus_index[inucl];
|
const int64_t ishell_start = nucleus_index[inucl];
|
||||||
|
Loading…
Reference in New Issue
Block a user