1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2024-11-20 04:52:47 +01:00

Added kernel template generator with padding based on SIMD_LENGTH.

Tested with 21x21 square matrices with SIMD_LENGTH = 4.
This commit is contained in:
Francois Coppens 2023-01-18 18:45:59 +01:00
parent b288740292
commit fcf0907b82

View File

@ -103,31 +103,99 @@ int main() {
double* determinant); double* determinant);
#+end_src #+end_src
*** C source
#+begin_src c :tangle (eval c) :comments org #+begin_src c :tangle (eval c) :comments org
#include <stdbool.h> #include <stdbool.h>
#include <math.h> #include <math.h>
#include "qmckl.h" #include "qmckl.h"
#include "config.h"
#if defined(__AVX512__) qmckl_exit_code qmckl_sherman_morrison_hpc(
#define SIMD 8 const qmckl_context context,
#elif defined(__AVX__) const uint64_t LDS,
#define SIMD 4 const uint64_t Dim,
#else const uint64_t N_updates,
#define SIMD 2 const double* __restrict Updates,
#endif const uint64_t* __restrict Updates_index,
const double breakdown,
double* __restrict Slater_inv,
double* __restrict determinant) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return qmckl_failwith( context,
QMCKL_NULL_CONTEXT,
"qmckl_sherman_morrison_naive_hpc",
NULL);
}
double __attribute__((aligned(8))) C[Dim];
double __attribute__((aligned(8))) D[LDS];
uint32_t l = 0;
// For each update
while (l < N_updates) {
// C = S^{-1} x u_l
for (uint32_t i = 0; i < Dim; i++) {
C[i] = 0.0f;
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j];
}
}
// Denominator: v_l^T * C
const int cui = Updates_index[l] - 1;
double den = 1.0f + C[cui];
if (fabs(den) < breakdown) {
return QMCKL_FAILURE;
}
double iden = 1.0f / den;
// Update det(A)
if (determinant)
*determinant *= den;
// selecting column: v_l^T * S_inv
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
D[j] = Slater_inv[cui * LDS + j];
}
// A^{-1} = A^{-1} - C x D / den
for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
const double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update;
}
}
l += 1;
}
return QMCKL_SUCCESS;
}
#+end_src #+end_src
#+NAME:naive_template #+NAME:naive_template
#+begin_src c #+begin_src c
static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context context, static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(
const qmckl_context context,
const uint64_t N_updates, const uint64_t N_updates,
const double* Updates, const double* __restrict Updates,
const uint64_t* Updates_index, const uint64_t* __restrict Updates_index,
const double breakdown, const double breakdown,
double* Slater_inv, double* __restrict Slater_inv,
double* determinant) { double* __restrict determinant) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return qmckl_failwith( context, return qmckl_failwith( context,
@ -136,11 +204,10 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context c
NULL); NULL);
} }
// TODO: Specialize for padding #define D{Dim}_P ((1+({Dim}-1)/SIMD_LENGTH)*SIMD_LENGTH)
// const uint LDS=(1+({Dim}-1)/SIMD) * SIMD;
const uint LDS={Dim}; double __attribute__((aligned(8))) C[{Dim}];
double C[{Dim}]; double __attribute__((aligned(8))) D[D{Dim}_P];
double D[{Dim}];
uint64_t l = 0; uint64_t l = 0;
// For each update // For each update
@ -148,33 +215,40 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context c
// C = A^{-1} x U_l // C = A^{-1} x U_l
for (uint64_t i = 0; i < {Dim}; i++) { for (uint64_t i = 0; i < {Dim}; i++) {
C[i] = 0; C[i] = 0;
for (uint64_t j = 0; j < {Dim}; j++) { #pragma ivdep
C[i] += Slater_inv[i * LDS + j] * Updates[l * {Dim} + j]; #pragma vector aligned
for (uint64_t j = 0; j < D{Dim}_P; j++) {
C[i] += Slater_inv[i * D{Dim}_P + j] * Updates[l * D{Dim}_P + j];
} }
} }
// Denominator // Denominator
double den = 1 + C[Updates_index[l] - 1]; const int cui = Updates_index[l] - 1;
double den = 1.0f + C[cui];
if (fabs(den) < breakdown) { if (fabs(den) < breakdown) {
return QMCKL_FAILURE; return QMCKL_FAILURE;
} }
double iden = 1 / den; double iden = 1.0f / den;
// Update det(A) // Update det(A)
if (determinant != NULL) if (determinant)
*determinant *= den; *determinant *= den;
// D = v^T x A^{-1} // selecting column: D = v_l^T * S_inv
for (uint64_t j = 0; j < {Dim}; j++) { #pragma ivdep
D[j] = Slater_inv[(Updates_index[l] - 1) * LDS + j]; #pragma vector aligned
for (uint64_t j = 0; j < D{Dim}_P; j++) {
D[j] = Slater_inv[cui * D{Dim}_P + j];
} }
// A^{-1} = A^{-1} - C x D / den // A^{-1} = A^{-1} - C x D / den
for (uint64_t i = 0; i < {Dim}; i++) { for (uint64_t i = 0; i < {Dim}; i++) {
for (uint64_t j = 0; j < {Dim}; j++) { #pragma ivdep
#pragma vector aligned
for (uint64_t j = 0; j < D{Dim}_P; j++) {
double update = C[i] * D[j] * iden; double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update; Slater_inv[i * D{Dim}_P + j] -= update;
} }
} }
@ -185,44 +259,50 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context c
} }
#+end_src #+end_src
#+NAME:naive-python #+NAME:naive-python
#+begin_src python :noweb yes :exports none #+begin_src python :noweb yes :exports none
text=""" text="""
<<naive_template>> <<naive_template>>
""" """
result = [] result = []
for Dim in range(1,26): for Dim in range(2, 22):
Dim=str(Dim) Dim=str(Dim)
result.append(text.replace("{Dim}",Dim) ) result.append(text.replace("{Dim}",Dim) )
return '\n'.join(result) return '\n'.join(result)
#+end_src #+end_src
#+RESULTS: naive-python
#+NAME:naive-python-switch #+NAME:naive-python-switch
#+begin_src python :noweb yes :exports none #+begin_src python :noweb yes :exports none
text=""" text="""
case {Dim}: case {Dim}:
return qmckl_sherman_morrison_{Dim}(context, return qmckl_sherman_morrison_{Dim}(context,
N_updates, N_updates,
Updates, Updates,
Updates_index, Updates_index,
breakdown, breakdown,
Slater_inv, Slater_inv,
determinant); determinant);
""" """
result = [] result = []
for Dim in range(1,26): for Dim in range(2, 22):
Dim=str(Dim) Dim=str(Dim)
result.append(text.replace("{Dim}",Dim) ) result.append(text.replace("{Dim}",Dim) )
return '\n'.join(result) return '\n'.join(result)
#+end_src #+end_src
#+RESULTS: naive-python-switch
------
#+begin_src c :tangle (eval c) :comments org :noweb yes #+begin_src c :tangle (eval c) :comments org :noweb yes
<<naive-python()>> <<naive-python()>>
@ -238,125 +318,37 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context,
double* determinant) { double* determinant) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return qmckl_failwith( context, return qmckl_failwith(context,
QMCKL_NULL_CONTEXT, QMCKL_NULL_CONTEXT,
"qmckl_sherman_morrison", "qmckl_sherman_morrison",
NULL); NULL);
} }
if (Dim == LDS) { if (LDS == (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) { // Most cases
switch (Dim) { switch (Dim) {
<<naive-python-switch()>> <<naive-python-switch()>>
} }
}
else { // When SIMD_LENGTH > 1, called with LDS == Dim and Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH)
return qmckl_sherman_morrison_hpc(context,
LDS,
Dim,
N_updates,
Updates,
Updates_index,
breakdown,
Slater_inv,
determinant);
} else {
double C[Dim];
double D[Dim];
uint64_t l = 0;
// For each update
while (l < N_updates) {
// C = A^{-1} x U_l
for (uint64_t i = 0; i < Dim; i++) {
C[i] = 0;
for (uint64_t j = 0; j < Dim; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * Dim + j];
}
}
// Denominator
double den = 1 + C[Updates_index[l] - 1];
if (fabs(den) < breakdown) {
return QMCKL_FAILURE;
}
double iden = 1 / den;
// Update det(A)
if (determinant != NULL)
*determinant *= den;
// D = v^T x A^{-1}
for (uint64_t j = 0; j < Dim; j++) {
D[j] = Slater_inv[(Updates_index[l] - 1) * LDS + j];
}
// A^{-1} = A^{-1} - C x D / den
for (uint64_t i = 0; i < Dim; i++) {
for (uint64_t j = 0; j < Dim; j++) {
double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update;
}
}
l += 1;
} }
}
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
} }
#+end_src #+end_src
#+RESULTS: naive_python
qmckl_exit_code qmckl_sherman_morrison_{Dim}_{LDS}(const qmckl_context context,
const uint64_t N_updates,
const double* Updates,
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_NULL_CONTEXT;
}
double C[{Dim}];
double D[{Dim}];
uint64_t l = 0;
// For each update
while (l < N_updates) {
// C = A^{-1} x U_l
for (uint64_t i = 0; i < {Dim}; i++) {
C[i] = 0;
for (uint64_t j = 0; j < {Dim}; j++) {
C[i] += Slater_inv[i * {LDS} + j] * Updates[l * {Dim} + j];
}
}
// Denominator
double den = 1 + C[Updates_index[l] - 1];
if (fabs(den) < breakdown) {
return QMCKL_FAILURE;
}
double iden = 1 / den;
// Update det(A)
if (determinant != NULL)
*determinant *= den;
// D = v^T x A^{-1}
for (uint64_t j = 0; j < {Dim}; j++) {
D[j] = Slater_inv[(Updates_index[l] - 1) * {LDS} + j];
}
// A^{-1} = A^{-1} - C x D / den
for (uint64_t i = 0; i < {Dim}; i++) {
for (uint64_t j = 0; j < {Dim}; j++) {
double update = C[i] * D[j] * iden;
Slater_inv[i * {LDS} + j] -= update;
}
}
l += 1;
}
return QMCKL_SUCCESS;
}
None
*** Performance *** Performance