From fcf0907b8228d0b95438c9db8e32efdaf7bd39e3 Mon Sep 17 00:00:00 2001 From: Francois Coppens Date: Wed, 18 Jan 2023 18:45:59 +0100 Subject: [PATCH 1/2] Added kernel template generator with padding based on SIMD_LENGTH. Tested with 21x21 square matrices with SIMD_LENGTH = 4. --- org/qmckl_sherman_morrison_woodbury.org | 282 ++++++++++++------------ 1 file changed, 137 insertions(+), 145 deletions(-) diff --git a/org/qmckl_sherman_morrison_woodbury.org b/org/qmckl_sherman_morrison_woodbury.org index ad01879..4a4c763 100644 --- a/org/qmckl_sherman_morrison_woodbury.org +++ b/org/qmckl_sherman_morrison_woodbury.org @@ -103,31 +103,99 @@ int main() { double* determinant); #+end_src -*** C source + + + #+begin_src c :tangle (eval c) :comments org #include #include #include "qmckl.h" +#include "config.h" -#if defined(__AVX512__) -#define SIMD 8 -#elif defined(__AVX__) -#define SIMD 4 -#else -#define SIMD 2 -#endif +qmckl_exit_code qmckl_sherman_morrison_hpc( + const qmckl_context context, + const uint64_t LDS, + const uint64_t Dim, + const uint64_t N_updates, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, + const double breakdown, + double* __restrict Slater_inv, + double* __restrict determinant) { + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return qmckl_failwith( context, + QMCKL_NULL_CONTEXT, + "qmckl_sherman_morrison_naive_hpc", + NULL); + } + + double __attribute__((aligned(8))) C[Dim]; + double __attribute__((aligned(8))) D[LDS]; + + uint32_t l = 0; + // For each update + while (l < N_updates) { + // C = S^{-1} x u_l + for (uint32_t i = 0; i < Dim; i++) { + C[i] = 0.0f; + #pragma ivdep + #pragma vector aligned + for (uint32_t j = 0; j < LDS; j++) { + C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; + } + } + + // Denominator: v_l^T * C + const int cui = Updates_index[l] - 1; + double den = 1.0f + C[cui]; + + if (fabs(den) < breakdown) { + return QMCKL_FAILURE; + } + double iden = 1.0f / den; + + // Update det(A) + if (determinant) + *determinant *= den; + + // selecting column: v_l^T * S_inv + #pragma ivdep + #pragma vector aligned + for (uint32_t j = 0; j < LDS; j++) { + D[j] = Slater_inv[cui * LDS + j]; + } + + // A^{-1} = A^{-1} - C x D / den + for (uint32_t i = 0; i < Dim; i++) { + #pragma ivdep + #pragma vector aligned + for (uint32_t j = 0; j < LDS; j++) { + const double update = C[i] * D[j] * iden; + Slater_inv[i * LDS + j] -= update; + } + } + l += 1; + } + return QMCKL_SUCCESS; +} #+end_src + + + + #+NAME:naive_template #+begin_src c -static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context context, +static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}( + const qmckl_context context, const uint64_t N_updates, - const double* Updates, - const uint64_t* Updates_index, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, const double breakdown, - double* Slater_inv, - double* determinant) { + double* __restrict Slater_inv, + double* __restrict determinant) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return qmckl_failwith( context, @@ -136,11 +204,10 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context c NULL); } -// TODO: Specialize for padding -// const uint LDS=(1+({Dim}-1)/SIMD) * SIMD; - const uint LDS={Dim}; - double C[{Dim}]; - double D[{Dim}]; + #define D{Dim}_P ((1+({Dim}-1)/SIMD_LENGTH)*SIMD_LENGTH) + + double __attribute__((aligned(8))) C[{Dim}]; + double __attribute__((aligned(8))) D[D{Dim}_P]; uint64_t l = 0; // For each update @@ -148,33 +215,40 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context c // C = A^{-1} x U_l for (uint64_t i = 0; i < {Dim}; i++) { C[i] = 0; - for (uint64_t j = 0; j < {Dim}; j++) { - C[i] += Slater_inv[i * LDS + j] * Updates[l * {Dim} + j]; + #pragma ivdep + #pragma vector aligned + for (uint64_t j = 0; j < D{Dim}_P; j++) { + C[i] += Slater_inv[i * D{Dim}_P + j] * Updates[l * D{Dim}_P + j]; } } // Denominator - double den = 1 + C[Updates_index[l] - 1]; + const int cui = Updates_index[l] - 1; + double den = 1.0f + C[cui]; if (fabs(den) < breakdown) { return QMCKL_FAILURE; } - double iden = 1 / den; + double iden = 1.0f / den; // Update det(A) - if (determinant != NULL) + if (determinant) *determinant *= den; - // D = v^T x A^{-1} - for (uint64_t j = 0; j < {Dim}; j++) { - D[j] = Slater_inv[(Updates_index[l] - 1) * LDS + j]; + // selecting column: D = v_l^T * S_inv + #pragma ivdep + #pragma vector aligned + for (uint64_t j = 0; j < D{Dim}_P; j++) { + D[j] = Slater_inv[cui * D{Dim}_P + j]; } // A^{-1} = A^{-1} - C x D / den for (uint64_t i = 0; i < {Dim}; i++) { - for (uint64_t j = 0; j < {Dim}; j++) { + #pragma ivdep + #pragma vector aligned + for (uint64_t j = 0; j < D{Dim}_P; j++) { double update = C[i] * D[j] * iden; - Slater_inv[i * LDS + j] -= update; + Slater_inv[i * D{Dim}_P + j] -= update; } } @@ -185,44 +259,50 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}(const qmckl_context c } #+end_src + + + + #+NAME:naive-python #+begin_src python :noweb yes :exports none text=""" <> """ result = [] -for Dim in range(1,26): +for Dim in range(2, 22): Dim=str(Dim) result.append(text.replace("{Dim}",Dim) ) return '\n'.join(result) #+end_src - #+RESULTS: naive-python + + + #+NAME:naive-python-switch #+begin_src python :noweb yes :exports none text=""" case {Dim}: - return qmckl_sherman_morrison_{Dim}(context, - N_updates, - Updates, - Updates_index, - breakdown, - Slater_inv, - determinant); + return qmckl_sherman_morrison_{Dim}(context, + N_updates, + Updates, + Updates_index, + breakdown, + Slater_inv, + determinant); """ result = [] -for Dim in range(1,26): +for Dim in range(2, 22): Dim=str(Dim) result.append(text.replace("{Dim}",Dim) ) return '\n'.join(result) #+end_src - #+RESULTS: naive-python-switch - ------ + + #+begin_src c :tangle (eval c) :comments org :noweb yes <> @@ -238,125 +318,37 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, double* determinant) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { - return qmckl_failwith( context, - QMCKL_NULL_CONTEXT, - "qmckl_sherman_morrison", - NULL); + return qmckl_failwith(context, + QMCKL_NULL_CONTEXT, + "qmckl_sherman_morrison", + NULL); } - if (Dim == LDS) { + if (LDS == (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) { // Most cases switch (Dim) { - <> + <> } - - } else { - - double C[Dim]; - double D[Dim]; - - uint64_t l = 0; - // For each update - while (l < N_updates) { - // C = A^{-1} x U_l - for (uint64_t i = 0; i < Dim; i++) { - C[i] = 0; - for (uint64_t j = 0; j < Dim; j++) { - C[i] += Slater_inv[i * LDS + j] * Updates[l * Dim + j]; - } - } - - // Denominator - double den = 1 + C[Updates_index[l] - 1]; - - if (fabs(den) < breakdown) { - return QMCKL_FAILURE; - } - double iden = 1 / den; - - // Update det(A) - if (determinant != NULL) - *determinant *= den; - - // D = v^T x A^{-1} - for (uint64_t j = 0; j < Dim; j++) { - D[j] = Slater_inv[(Updates_index[l] - 1) * LDS + j]; - } - - // A^{-1} = A^{-1} - C x D / den - for (uint64_t i = 0; i < Dim; i++) { - for (uint64_t j = 0; j < Dim; j++) { - double update = C[i] * D[j] * iden; - Slater_inv[i * LDS + j] -= update; - } - } - - l += 1; } + else { // When SIMD_LENGTH > 1, called with LDS == Dim and Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) + return qmckl_sherman_morrison_hpc(context, + LDS, + Dim, + N_updates, + Updates, + Updates_index, + breakdown, + Slater_inv, + determinant); } + return QMCKL_SUCCESS; } #+end_src - #+RESULTS: naive_python - qmckl_exit_code qmckl_sherman_morrison_{Dim}_{LDS}(const qmckl_context context, - const uint64_t N_updates, - const double* Updates, - const uint64_t* Updates_index, - const double breakdown, - double* Slater_inv, - double* determinant) { - if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { - return QMCKL_NULL_CONTEXT; - } - double C[{Dim}]; - double D[{Dim}]; - - uint64_t l = 0; - // For each update - while (l < N_updates) { - // C = A^{-1} x U_l - for (uint64_t i = 0; i < {Dim}; i++) { - C[i] = 0; - for (uint64_t j = 0; j < {Dim}; j++) { - C[i] += Slater_inv[i * {LDS} + j] * Updates[l * {Dim} + j]; - } - } - - // Denominator - double den = 1 + C[Updates_index[l] - 1]; - - if (fabs(den) < breakdown) { - return QMCKL_FAILURE; - } - double iden = 1 / den; - - // Update det(A) - if (determinant != NULL) - *determinant *= den; - - // D = v^T x A^{-1} - for (uint64_t j = 0; j < {Dim}; j++) { - D[j] = Slater_inv[(Updates_index[l] - 1) * {LDS} + j]; - } - - // A^{-1} = A^{-1} - C x D / den - for (uint64_t i = 0; i < {Dim}; i++) { - for (uint64_t j = 0; j < {Dim}; j++) { - double update = C[i] * D[j] * iden; - Slater_inv[i * {LDS} + j] -= update; - } - } - - l += 1; - } - - return QMCKL_SUCCESS; - } - None *** Performance From 9c608166ec50793ea49a45f250ff48cca2103895 Mon Sep 17 00:00:00 2001 From: Francois Coppens Date: Thu, 19 Jan 2023 15:25:12 +0100 Subject: [PATCH 2/2] Added compiler dependent macros that define vectorization pragmas. --- org/qmckl_sherman_morrison_woodbury.org | 43 +++++++++++++++++-------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/org/qmckl_sherman_morrison_woodbury.org b/org/qmckl_sherman_morrison_woodbury.org index 4a4c763..5727ac8 100644 --- a/org/qmckl_sherman_morrison_woodbury.org +++ b/org/qmckl_sherman_morrison_woodbury.org @@ -113,6 +113,23 @@ int main() { #include "qmckl.h" #include "config.h" +// Order important because +// __GNUC__ also set in ICC, ICX and CLANG +// __clang__ also set in ICX +#if defined(__INTEL_COMPILER) + #define IVDEP _Pragma("ivdep") + #define ALIGNED _Pragma("vector aligned") +#elif defined(__INTEL_LLVM_COMPILER) + #define IVDEP _Pragma("ivdep") + #define ALIGNED _Pragma("vector aligned") +#elif defined(__clang__) + #define IVDEP _Pragma("clang loop vectorize(enable)") + #define ALIGNED +#elif defined(__GNUC__) + #define IVDEP _Pragma("GCC ivdep") + #define ALIGNED +#endif + qmckl_exit_code qmckl_sherman_morrison_hpc( const qmckl_context context, const uint64_t LDS, @@ -140,8 +157,8 @@ qmckl_exit_code qmckl_sherman_morrison_hpc( // C = S^{-1} x u_l for (uint32_t i = 0; i < Dim; i++) { C[i] = 0.0f; - #pragma ivdep - #pragma vector aligned + IVDEP + ALIGNED for (uint32_t j = 0; j < LDS; j++) { C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; } @@ -161,16 +178,16 @@ qmckl_exit_code qmckl_sherman_morrison_hpc( *determinant *= den; // selecting column: v_l^T * S_inv - #pragma ivdep - #pragma vector aligned + IVDEP + ALIGNED for (uint32_t j = 0; j < LDS; j++) { D[j] = Slater_inv[cui * LDS + j]; } // A^{-1} = A^{-1} - C x D / den for (uint32_t i = 0; i < Dim; i++) { - #pragma ivdep - #pragma vector aligned + IVDEP + ALIGNED for (uint32_t j = 0; j < LDS; j++) { const double update = C[i] * D[j] * iden; Slater_inv[i * LDS + j] -= update; @@ -215,8 +232,8 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}( // C = A^{-1} x U_l for (uint64_t i = 0; i < {Dim}; i++) { C[i] = 0; - #pragma ivdep - #pragma vector aligned + IVDEP + ALIGNED for (uint64_t j = 0; j < D{Dim}_P; j++) { C[i] += Slater_inv[i * D{Dim}_P + j] * Updates[l * D{Dim}_P + j]; } @@ -236,16 +253,16 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}( *determinant *= den; // selecting column: D = v_l^T * S_inv - #pragma ivdep - #pragma vector aligned + IVDEP + ALIGNED for (uint64_t j = 0; j < D{Dim}_P; j++) { D[j] = Slater_inv[cui * D{Dim}_P + j]; } // A^{-1} = A^{-1} - C x D / den for (uint64_t i = 0; i < {Dim}; i++) { - #pragma ivdep - #pragma vector aligned + IVDEP + ALIGNED for (uint64_t j = 0; j < D{Dim}_P; j++) { double update = C[i] * D[j] * iden; Slater_inv[i * D{Dim}_P + j] -= update; @@ -329,7 +346,7 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, <> } } - else { // When SIMD_LENGTH > 1, called with LDS == Dim and Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) + else { // When SIMD_LENGTH > 1, called with LDS == Dim AND Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) return qmckl_sherman_morrison_hpc(context, LDS, Dim,