1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-08 20:33:40 +01:00

Include assembly in qmckl_ao

This commit is contained in:
Anthony Scemama 2022-06-11 10:57:58 +02:00
parent 2784e894d4
commit 07e1e44f05

View File

@ -6521,20 +6521,94 @@ qmckl_compute_ao_vgl_hpc_gaussian (
exp_mat[iprim][4] = f * (3.0 - 2.0 * ar2[iprim]); exp_mat[iprim][4] = f * (3.0 - 2.0 * ar2[iprim]);
} }
/* --- */
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) { for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
for (int j=0 ; j<5 ; ++j) { #ifdef HAVE_OPENMP
#pragma omp simd simdlen(8)
#endif
for (int j=0 ; j<8 ; ++j) {
ce_mat[i][j] = 0.; ce_mat[i][j] = 0.;
} }
}
for (int k=0 ; k<nidx; ++k) { for (int k=0 ; k<nidx; ++k) {
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) { #ifdef HAVE_OPENMP
if (coef_mat[inucl][i][k] != 0.) { #pragma omp simd simdlen(8)
for (int j=0 ; j<5 ; ++j) { #endif
for (int j=0 ; j<8 ; ++j) {
ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j]; ce_mat[i][j] += coef_mat[inucl][i][k] * exp_mat[k][j];
} }
} }
} }
/*
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
// Following loop is the assembly version AVX2
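__asm__ volatile (
"mov %[k],%%RSI" "\n\t" // nidx (value; used as the loop counter)
"mov %[a],%%RAX" "\n\t" // &(coef_mat[inucl][i][0]), advanced by 8 bytes (one double) per iteration
"mov %[b],%%RBX" "\n\t" // &(exp_mat[0][0]), advanced by 64 bytes (one row of 8 doubles) per iteration
"mov %[c],%%RCX" "\n\t" // &(ce_mat[i][0])
"vxorpd %%YMM0,%%YMM0,%%YMM0" "\n\t" // accumulator for ce_mat[i][0..3] = 0.
"vxorpd %%YMM2,%%YMM2,%%YMM2" "\n\t" // accumulator for ce_mat[i][4..7] = 0.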
"test %%RSI,%%RSI" "\n\t"
"je .L" "K_LOOP" "%=" "\n\t"
".L" "LOOP1" "%=" ":\n\t"
"vbroadcastsd 0*8(%%RAX),%%YMM1" "\n\t"
"vfmadd231pd 0*8(%%RBX),%%YMM1,%%YMM0" "\n\t"
"vfmadd231pd 4*8(%%RBX),%%YMM1,%%YMM2" "\n\t"
"lea 1*8(%%RAX),%%RAX" "\n\t"
"lea 8*8(%%RBX),%%RBX" "\n\t"
"dec %%RSI" "\n\t"
"jne .L" "LOOP1" "%=" "\n\t"
".L" "K_LOOP" "%=" ":\n\t"
"vmovupd %%YMM0,0*8(%%RCX)" "\n\t"
"vmovupd %%YMM2,4*8(%%RCX)" "\n\t"
: :
[k] "m"(nidx),
[c] "m"(&(ce_mat[i][0])),
[a] "m"(&(coef_mat[inucl][i][0])),
[b] "m"(&(exp_mat[0][0]))
: "rax", "rbx", "rcx", "rsi",
"ymm0", "ymm1", "ymm2", "memory" );
} }
}
,*/
/*
// Following loop is the assembly version AVX512
for (int i=0 ; i<nucleus_shell_num[inucl] ; ++i) {
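__asm__ volatile (
"mov %[k],%%RSI" "\n\t" // nidx (value; used as the loop counter)
"mov %[a],%%RAX" "\n\t" // &(coef_mat[inucl][i][0]), advanced by 8 bytes (one double) per iteration
"mov %[b],%%RBX" "\n\t" // &(exp_mat[0][0]), advanced by 64 bytes (one row of 8 doubles) per iteration
"mov %[c],%%RCX" "\n\t" // &(ce_mat[i][0])
"vxorpd %%ZMM0,%%ZMM0,%%ZMM0" "\n\t" // accumulator for ce_mat[i][0..7] = 0.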
"test %%RSI,%%RSI" "\n\t"
"je .L" "K_LOOP" "%=" "\n\t"
".L" "LOOP1" "%=" ":\n\t"
"vbroadcastsd 0*8(%%RAX),%%ZMM1" "\n\t"
"vfmadd231pd 0*8(%%RBX),%%ZMM1,%%ZMM0" "\n\t"
"lea 1*8(%%RAX),%%RAX" "\n\t"
"lea 8*8(%%RBX),%%RBX" "\n\t"
"dec %%RSI" "\n\t"
"jne .L" "LOOP1" "%=" "\n\t"
".L" "K_LOOP" "%=" ":\n\t"
"vmovupd %%ZMM0,0*8(%%RCX)" "\n\t"
: : [k] "m"(nidx), [c] "m"(&(ce_mat[i][0])), [a] "m"(&(coef_mat[inucl][i][0])), [b] "m"(&(exp_mat[0][0])) : "rax", "rbx", "rcx", "rsi", "zmm0", "zmm1", "memory" );
}
,*/
const int64_t ishell_start = nucleus_index[inucl]; const int64_t ishell_start = nucleus_index[inucl];
const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl]; const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl];