1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-08 04:19:15 +01:00

Merge pull request #99 from fmgjcoppens/master

Various SIMD_LENGTH related issues.
This commit is contained in:
Anthony Scemama 2023-01-26 14:07:29 +01:00 committed by GitHub
commit 728a81f96b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 306 additions and 191 deletions

View File

@ -151,15 +151,15 @@ qmckl_exit_code qmckl_sherman_morrison_hpc(
double __attribute__((aligned(8))) C[Dim]; double __attribute__((aligned(8))) C[Dim];
double __attribute__((aligned(8))) D[LDS]; double __attribute__((aligned(8))) D[LDS];
uint32_t l = 0; uint64_t l = 0;
// For each update // For each update
while (l < N_updates) { while (l < N_updates) {
// C = S^{-1} x u_l // C = S^{-1} x u_l
for (uint32_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
C[i] = 0.0f; C[i] = 0.0f;
IVDEP IVDEP
ALIGNED ALIGNED
for (uint32_t j = 0; j < LDS; j++) { for (uint64_t j = 0; j < LDS; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j];
} }
} }
@ -180,15 +180,15 @@ qmckl_exit_code qmckl_sherman_morrison_hpc(
// selecting column: v_l^T * S_inv // selecting column: v_l^T * S_inv
IVDEP IVDEP
ALIGNED ALIGNED
for (uint32_t j = 0; j < LDS; j++) { for (uint64_t j = 0; j < LDS; j++) {
D[j] = Slater_inv[cui * LDS + j]; D[j] = Slater_inv[cui * LDS + j];
} }
// A^{-1} = A^{-1} - C x D / den // A^{-1} = A^{-1} - C x D / den
for (uint32_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
IVDEP IVDEP
ALIGNED ALIGNED
for (uint32_t j = 0; j < LDS; j++) { for (uint64_t j = 0; j < LDS; j++) {
const double update = C[i] * D[j] * iden; const double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update; Slater_inv[i * LDS + j] -= update;
} }
@ -413,6 +413,7 @@ The tests for the kernels are executed on datasets that are extracted from a run
#+begin_src c :tangle (eval c_test) #+begin_src c :tangle (eval c_test)
const uint64_t Dim = 21; const uint64_t Dim = 21;
const uint64_t LDS = (1 + (Dim) / SIMD_LENGTH) * SIMD_LENGTH;
const double breakdown = 1e-3; const double breakdown = 1e-3;
const double tolerance = 1e-3; const double tolerance = 1e-3;
double res[441]; double res[441];
@ -425,7 +426,15 @@ assert(Slater_inv1 != NULL);
// original determinant of Slater1 (before applying updates) // original determinant of Slater1 (before applying updates)
double det = 3.407025646103221e-10; double det = 3.407025646103221e-10;
rc = qmckl_sherman_morrison(context, Dim, Dim, N_updates1, Updates1, Updates_index1, breakdown, Slater_inv1, &det); rc = qmckl_sherman_morrison(context,
LDS,
Dim,
N_updates1,
Updates1,
Updates_index1,
breakdown,
Slater_inv1,
&det);
// Check that the determinant is updated properly // Check that the determinant is updated properly
assert(fabs(det + 4.120398385068217e-10) < 1e-15); assert(fabs(det + 4.120398385068217e-10) < 1e-15);
@ -434,7 +443,7 @@ for (unsigned int i = 0; i < Dim; i++) {
for (unsigned int j = 0; j < Dim; j++) { for (unsigned int j = 0; j < Dim; j++) {
res[i * Dim + j] = 0; res[i * Dim + j] = 0;
for (unsigned int k = 0; k < Dim; k++) { for (unsigned int k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater1[i * Dim + k] * Slater_inv1[k * Dim + j]; res[i * Dim + j] += Slater1[i * Dim + k] * Slater_inv1[k * LDS + j];
} }
} }
} }
@ -519,10 +528,6 @@ assert(rc == QMCKL_SUCCESS);
*** C source *** C source
#+begin_src c :tangle (eval c) :comments org #+begin_src c :tangle (eval c) :comments org
#include <stdbool.h>
#include <math.h>
#include "qmckl.h"
qmckl_exit_code qmckl_woodbury_2(const qmckl_context context, qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
const uint64_t LDS, const uint64_t LDS,
const uint64_t Dim, const uint64_t Dim,
@ -538,25 +543,29 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
*/ */
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_NULL_CONTEXT; return qmckl_failwith(context,
QMCKL_NULL_CONTEXT,
"qmckl_woodbury_2",
NULL);
} }
const uint64_t row1 = (Updates_index[0] - 1); const uint64_t row1 = (Updates_index[0] - 1);
const uint64_t row2 = (Updates_index[1] - 1); const uint64_t row2 = (Updates_index[1] - 1);
// Compute C = S_inv * U !! NON-STANDARD MATRIX MULTIPLICATION BECAUSE // Compute C = (S^T)^{-1}U : Dim x 2
// OF LAYOUT OF 'Updates' !! double __attribute__((aligned(8))) C[2 * Dim];
double C[2 * Dim];
for (uint64_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
for (uint64_t j = 0; j < 2; j++) { C[i * 2] = 0;
C[i * 2 + j] = 0; C[i * 2 + 1] = 0;
for (uint64_t k = 0; k < Dim; k++) { IVDEP
C[i * 2 + j] += Slater_inv[i * LDS + k] * Updates[Dim * j + k]; ALIGNED
for (uint64_t k = 0; k < LDS; k++) {
C[i * 2] += Slater_inv[i * LDS + k] * Updates[k];
C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
} }
}
} }
// Compute B = 1 + V * C // Compute B = 1 + VC : 2 x 2
const double B0 = C[row1 * 2] + 1; const double B0 = C[row1 * 2] + 1;
const double B1 = C[row1 * 2 + 1]; const double B1 = C[row1 * 2 + 1];
const double B2 = C[row2 * 2]; const double B2 = C[row2 * 2];
@ -565,35 +574,39 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
// Check if determinant of inverted matrix is not zero // Check if determinant of inverted matrix is not zero
double det = B0 * B3 - B1 * B2; double det = B0 * B3 - B1 * B2;
if (fabs(det) < breakdown) { if (fabs(det) < breakdown) {
return QMCKL_FAILURE; return QMCKL_FAILURE;
} }
// Update det(S) when passed // Update det(S) when passed
if (determinant != NULL) if (determinant)
*determinant *= det; *determinant *= det;
// Compute B^{-1} with explicit formula for 2x2 inversion // Compute B^{-1} with explicit formula for 2 x 2 inversion
double Binv[4], idet = 1.0 / det; double __attribute__((aligned(8))) Binv[4], idet = 1.0 / det;
Binv[0] = idet * B3; Binv[0] = idet * B3;
Binv[1] = -1.0 * idet * B1; Binv[1] = -1.0 * idet * B1;
Binv[2] = -1.0 * idet * B2; Binv[2] = -1.0 * idet * B2;
Binv[3] = idet * B0; Binv[3] = idet * B0;
// Compute tmp = B^{-1} x (V.S^{-1}) // tmp = B^{-1}D : 2 x LDS
double tmp[2 * Dim]; double __attribute__((aligned(8))) tmp[2 * LDS];
for (uint64_t i = 0; i < 2; i++) { double* r1dim = &(Slater_inv[row1 * LDS]);
for (uint64_t j = 0; j < Dim; j++) { double* r2dim = &(Slater_inv[row2 * LDS]);
tmp[i * Dim + j] = Binv[i * 2] * Slater_inv[row1 * LDS + j]; IVDEP
tmp[i * Dim + j] += Binv[i * 2 + 1] * Slater_inv[row2 * LDS + j]; ALIGNED
} for (uint64_t j = 0; j < LDS; j++) {
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
} }
// Compute (S + U V)^{-1} = S^{-1} - C x tmp // Compute (S^T)^{-1} - C * tmp : Dim x LDS
for (uint64_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
for (uint64_t j = 0; j < Dim; j++) { IVDEP
Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j]; ALIGNED
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[Dim + j]; for (uint64_t j = 0; j < LDS; j++) {
} Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j];
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j];
}
} }
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
@ -644,13 +657,13 @@ assert(Updates2 != NULL);
assert(Updates_index2 != NULL); assert(Updates_index2 != NULL);
assert(Slater_inv2 != NULL); assert(Slater_inv2 != NULL);
det = -1.4432116661319376e-11; det = -1.4432116661319376e-11;
rc = qmckl_woodbury_2(context, Dim, Dim, Updates2, Updates_index2, breakdown, Slater_inv2, &det); rc = qmckl_woodbury_2(context, LDS, Dim, Updates2, Updates_index2, breakdown, Slater_inv2, &det);
assert(fabs(det-2.367058141251457e-10) < 1e-15); assert(fabs(det-2.367058141251457e-10) < 1e-15);
for (unsigned int i = 0; i < Dim; i++) { for (unsigned int i = 0; i < Dim; i++) {
for (unsigned int j = 0; j < Dim; j++) { for (unsigned int j = 0; j < Dim; j++) {
res[i * Dim + j] = 0; res[i * Dim + j] = 0;
for (unsigned int k = 0; k < Dim; k++) { for (unsigned int k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater2[i * Dim + k] * Slater_inv2[k * Dim + j]; res[i * Dim + j] += Slater2[i * Dim + k] * Slater_inv2[k * LDS + j];
} }
} }
} }
@ -687,8 +700,8 @@ assert(rc == QMCKL_SUCCESS);
If the determinant of the Slater-matrix is passed, it will be updated to the determinant resulting If the determinant of the Slater-matrix is passed, it will be updated to the determinant resulting
from applying the updates to the original matrix. from applying the updates to the original matrix.
#pragma ivdep
#pragma vector aligned
#+NAME: qmckl_woodbury_3_args #+NAME: qmckl_woodbury_3_args
| qmckl_context | context | in | Global state | | qmckl_context | context | in | Global state |
@ -730,10 +743,6 @@ assert(rc == QMCKL_SUCCESS);
*** C source *** C source
#+begin_src c :tangle (eval c) :comments org #+begin_src c :tangle (eval c) :comments org
#include <stdbool.h>
#include <math.h>
#include "qmckl.h"
qmckl_exit_code qmckl_woodbury_3(const qmckl_context context, qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
const uint64_t LDS, const uint64_t LDS,
const uint64_t Dim, const uint64_t Dim,
@ -749,26 +758,32 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
,*/ ,*/
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_NULL_CONTEXT; return qmckl_failwith(context,
QMCKL_NULL_CONTEXT,
"qmckl_woodbury_3",
NULL);
} }
const uint64_t row1 = (Updates_index[0] - 1); const uint64_t row1 = (Updates_index[0] - 1);
const uint64_t row2 = (Updates_index[1] - 1); const uint64_t row2 = (Updates_index[1] - 1);
const uint64_t row3 = (Updates_index[2] - 1); const uint64_t row3 = (Updates_index[2] - 1);
// Compute C = S_inv * U !! NON-STANDARD MATRIX MULTIPLICATION BECAUSE // Compute C = (S^T)^{-1}U : Dim x 3
// OF LAYOUT OF 'Updates' !! double __attribute__((aligned(8))) C[3 * Dim];
double C[3 * Dim];
for (uint64_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
for (uint64_t j = 0; j < 3; j++) { C[i * 3] = 0;
C[i * 3 + j] = 0; C[i * 3 + 1] = 0;
for (uint64_t k = 0; k < Dim; k++) { C[i * 3 + 2] = 0;
C[i * 3 + j] += Slater_inv[i * LDS + k] * Updates[Dim * j + k]; IVDEP
ALIGNED
for (uint64_t k = 0; k < LDS; k++) {
C[i * 3] += Slater_inv[i * LDS + k] * Updates[k];
C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k];
} }
}
} }
// Compute B = 1 + V.C // Compute B = 1 + VC : 3 x 3
const double B0 = C[row1 * 3] + 1; const double B0 = C[row1 * 3] + 1;
const double B1 = C[row1 * 3 + 1]; const double B1 = C[row1 * 3 + 1];
const double B2 = C[row1 * 3 + 2]; const double B2 = C[row1 * 3 + 2];
@ -784,15 +799,15 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
det = B0 * (B4 * B8 - B5 * B7) - B1 * (B3 * B8 - B5 * B6) + det = B0 * (B4 * B8 - B5 * B7) - B1 * (B3 * B8 - B5 * B6) +
B2 * (B3 * B7 - B4 * B6); B2 * (B3 * B7 - B4 * B6);
if (fabs(det) < breakdown) { if (fabs(det) < breakdown) {
return QMCKL_FAILURE; return QMCKL_FAILURE;
} }
// Update det(Slater) if passed // Update det(Slater) if passed
if (determinant != NULL) if (determinant)
*determinant *= det; *determinant *= det;
// Compute B^{-1} with explicit formula for 3x3 inversion // Compute B^{-1} with explicit formula for 3 x 3 inversion
double Binv[9], idet = 1.0 / det; double __attribute__((aligned(8))) Binv[9], idet = 1.0 / det;
Binv[0] = (B4 * B8 - B7 * B5) * idet; Binv[0] = (B4 * B8 - B7 * B5) * idet;
Binv[1] = -(B1 * B8 - B7 * B2) * idet; Binv[1] = -(B1 * B8 - B7 * B2) * idet;
Binv[2] = (B1 * B5 - B4 * B2) * idet; Binv[2] = (B1 * B5 - B4 * B2) * idet;
@ -803,23 +818,30 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
Binv[7] = -(B0 * B7 - B6 * B1) * idet; Binv[7] = -(B0 * B7 - B6 * B1) * idet;
Binv[8] = (B0 * B4 - B3 * B1) * idet; Binv[8] = (B0 * B4 - B3 * B1) * idet;
// Compute tmp = B^{-1} x (V.S^{-1}) // tmp = B^{-1}D : 3 x LDS
double tmp[3 * Dim]; double __attribute__((aligned(8))) tmp[3 * LDS];
for (uint64_t i = 0; i < 3; i++) { double* r1dim = &(Slater_inv[row1 * LDS]);
for (uint64_t j = 0; j < Dim; j++) { double* r2dim = &(Slater_inv[row2 * LDS]);
tmp[i * Dim + j] = Binv[i * 3] * Slater_inv[row1 * LDS + j]; double* r3dim = &(Slater_inv[row3 * LDS]);
tmp[i * Dim + j] += Binv[i * 3 + 1] * Slater_inv[row2 * LDS + j]; IVDEP
tmp[i * Dim + j] += Binv[i * 3 + 2] * Slater_inv[row3 * LDS + j]; ALIGNED
} for (uint64_t j = 0; j < LDS; j++) {
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
tmp[LDS + j] =
Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
tmp[2 * LDS + j] =
Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
} }
// Compute (S + U V)^{-1} = S^{-1} - C x tmp // Compute (S^T)^{-1} - C * tmp : Dim x LDS
for (uint64_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
for (uint64_t j = 0; j < Dim; j++) { IVDEP
Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j]; ALIGNED
Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[Dim + j]; for (uint64_t j = 0; j < LDS; j++) {
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * Dim + j]; Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j];
} Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j];
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j];
}
} }
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
@ -870,13 +892,13 @@ assert(Updates3 != NULL);
assert(Updates_index3 != NULL); assert(Updates_index3 != NULL);
assert(Slater_inv3_1 != NULL); assert(Slater_inv3_1 != NULL);
det = -1.23743195512859e-09; det = -1.23743195512859e-09;
rc = qmckl_woodbury_3(context, Dim, Dim, Updates3, Updates_index3, breakdown, Slater_inv3_1, &det); rc = qmckl_woodbury_3(context, LDS, Dim, Updates3, Updates_index3, breakdown, Slater_inv3_1, &det);
assert(fabs(det - 1.602708950725074e-10) < 1e-15); assert(fabs(det - 1.602708950725074e-10) < 1e-15);
for (unsigned int i = 0; i < Dim; i++) { for (unsigned int i = 0; i < Dim; i++) {
for (unsigned int j = 0; j < Dim; j++) { for (unsigned int j = 0; j < Dim; j++) {
res[i * Dim + j] = 0; res[i * Dim + j] = 0;
for (unsigned int k = 0; k < Dim; k++) { for (unsigned int k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_1[k * Dim + j]; res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_1[k * LDS + j];
} }
} }
} }
@ -963,9 +985,6 @@ assert(rc == QMCKL_SUCCESS);
*** C source *** C source
#+begin_src c :tangle (eval c) :comments org #+begin_src c :tangle (eval c) :comments org
#include <stdbool.h>
#include "qmckl.h"
qmckl_exit_code qmckl_sherman_morrison_splitting(const qmckl_context context, qmckl_exit_code qmckl_sherman_morrison_splitting(const qmckl_context context,
const uint64_t LDS, const uint64_t LDS,
const uint64_t Dim, const uint64_t Dim,
@ -977,19 +996,24 @@ qmckl_exit_code qmckl_sherman_morrison_splitting(const qmckl_context context,
double* determinant) { double* determinant) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_NULL_CONTEXT; return qmckl_failwith(context,
QMCKL_NULL_CONTEXT,
"qmckl_sherman_morrison_splitting",
NULL);
} }
double later_updates[Dim * N_updates]; double __attribute__((aligned(8))) later_updates[LDS * N_updates];
uint64_t later_index[N_updates]; uint64_t later_index[N_updates];
uint64_t later = 0; uint64_t later = 0;
(void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index, (void) qmckl_slagel_splitting(
breakdown, Slater_inv, later_updates, later_index, &later, determinant); LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv,
later_updates, later_index, &later, determinant);
if (later > 0) { if (later > 0) {
(void) qmckl_sherman_morrison_splitting(context, LDS, Dim, later, (void) qmckl_sherman_morrison_splitting(
later_updates, later_index, breakdown, Slater_inv, determinant); context, LDS, Dim, later, later_updates, later_index, breakdown,
Slater_inv, determinant);
} }
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
@ -1041,13 +1065,13 @@ assert(Updates3 != NULL);
assert(Updates_index3 != NULL); assert(Updates_index3 != NULL);
assert(Slater_inv3_2 != NULL); assert(Slater_inv3_2 != NULL);
det = -1.23743195512859e-09; det = -1.23743195512859e-09;
rc = qmckl_sherman_morrison_splitting(context, Dim, Dim, N_updates3, Updates3, Updates_index3, breakdown, Slater_inv3_2, &det); rc = qmckl_sherman_morrison_splitting(context, LDS, Dim, N_updates3, Updates3, Updates_index3, breakdown, Slater_inv3_2, &det);
assert(fabs(det - 1.602708950725074e-10) < 1e-15); assert(fabs(det - 1.602708950725074e-10) < 1e-15);
for (unsigned int i = 0; i < Dim; i++) { for (unsigned int i = 0; i < Dim; i++) {
for (unsigned int j = 0; j < Dim; j++) { for (unsigned int j = 0; j < Dim; j++) {
res[i * Dim + j] = 0; res[i * Dim + j] = 0;
for (unsigned int k = 0; k < Dim; k++) { for (unsigned int k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_2[k * Dim + j]; res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_2[k * LDS + j];
} }
} }
} }
@ -1127,9 +1151,6 @@ assert(rc == QMCKL_SUCCESS);
*** C source *** C source
#+begin_src c :tangle (eval c) :comments org #+begin_src c :tangle (eval c) :comments org
#include <stdbool.h>
#include "qmckl.h"
qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context, qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context,
const uint64_t LDS, const uint64_t LDS,
const uint64_t Dim, const uint64_t Dim,
@ -1141,59 +1162,108 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context,
double* determinant) { double* determinant) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_NULL_CONTEXT; return qmckl_failwith(context,
QMCKL_NULL_CONTEXT,
"qmckl_sherman_morrison_smw32s",
NULL);
} }
qmckl_exit_code rc; double __attribute__((aligned(8))) later_updates[LDS * N_updates];
uint64_t n_of_3blocks = N_updates / 3;
uint64_t remainder = N_updates % 3;
uint64_t length_3block = 3 * Dim;
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
// Woodbury 3x3 kernel
double later_updates[Dim * N_updates];
uint64_t later_index[N_updates]; uint64_t later_index[N_updates];
uint64_t later = 0; uint64_t later = 0;
// Special case for 4 rank-1 updates: 2+2
if (N_updates == 4) {
qmckl_exit_code rc =
qmckl_woodbury_2(context, LDS, Dim, Updates, Updates_index,
breakdown, Slater_inv, determinant);
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
uint64_t l = 0;
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates, Updates_index,
breakdown, Slater_inv,
later_updates + (LDS * later),
later_index + later, &l, determinant);
later += l;
}
rc = qmckl_woodbury_2(context, LDS, Dim, &Updates[2 * LDS],
&Updates_index[2], breakdown, Slater_inv,
determinant);
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
uint64_t l = 0;
rc = qmckl_slagel_splitting(
LDS, Dim, 2, &Updates[2 * LDS], &Updates_index[2], breakdown,
Slater_inv, later_updates + (LDS * later), later_index + later,
&l, determinant);
later += l;
}
if (later > 0) {
rc = qmckl_sherman_morrison_splitting(
context, LDS, Dim, later, later_updates, later_index, breakdown,
Slater_inv, determinant);
}
return QMCKL_SUCCESS;
}
// And for the other cases != 4
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates
// with Woodbury 3x3 kernel
uint64_t n_of_3blocks = N_updates / 3;
uint64_t remainder = N_updates % 3;
uint64_t length_3block = 3 * LDS;
if (n_of_3blocks > 0) { if (n_of_3blocks > 0) {
for (uint64_t i = 0; i < n_of_3blocks; i++) { for (uint64_t i = 0; i < n_of_3blocks; i++) {
const double *Updates_3block = &Updates[i * length_3block]; const double* Updates_3block = &Updates[i * length_3block];
const uint64_t *Updates_index_3block = &Updates_index[i * 3]; const uint64_t* Updates_index_3block = &Updates_index[i * 3];
rc = qmckl_woodbury_3(context, LDS, Dim, Updates_3block, Updates_index_3block, breakdown, Slater_inv, determinant); qmckl_exit_code rc = qmckl_woodbury_3(
if (rc != 0) { // Send the entire block to slagel_splitting context, LDS, Dim, Updates_3block, Updates_index_3block,
breakdown, Slater_inv, determinant);
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
uint64_t l = 0; uint64_t l = 0;
(void) qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block, Updates_index_3block, rc = qmckl_slagel_splitting(
breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant); LDS, Dim, 3, Updates_3block, Updates_index_3block,
later = later + l; breakdown, Slater_inv, later_updates + (LDS * later),
later_index + later, &l, determinant);
later += l;
} }
} }
} }
// Apply last remaining block of 2 updates with Woodbury 2x2 kernel // Apply last remaining block of 2 updates with Woodbury 2x2 kernel
if (remainder == 2) { if (remainder == 2) {
const double *Updates_2block = &Updates[n_of_3blocks * length_3block]; const double* Updates_2block = &Updates[n_of_3blocks * length_3block];
const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks]; const uint64_t* Updates_index_2block = &Updates_index[3 * n_of_3blocks];
rc = qmckl_woodbury_2(context, LDS, Dim, Updates_2block, Updates_index_2block, breakdown, Slater_inv, determinant); qmckl_exit_code rc = qmckl_woodbury_2(
if (rc != 0) { // Send the entire block to slagel_splitting context, LDS, Dim, Updates_2block, Updates_index_2block,
breakdown, Slater_inv, determinant);
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
uint64_t l = 0; uint64_t l = 0;
(void) qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block, Updates_index_2block, rc = qmckl_slagel_splitting(
breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant); LDS, Dim, 2, Updates_2block, Updates_index_2block, breakdown,
later = later + l; Slater_inv, later_updates + (LDS * later), later_index + later,
&l, determinant);
later += l;
} }
} }
// Apply last remaining update with slagel_splitting // Apply last remaining update with slagel_splitting
else if (remainder == 1) { if (remainder == 1) {
const double *Updates_1block = &Updates[n_of_3blocks * length_3block]; const double* Updates_1block = &Updates[n_of_3blocks * length_3block];
const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks]; const uint64_t* Updates_index_1block = &Updates_index[3 * n_of_3blocks];
uint64_t l = 0; uint64_t l = 0;
(void) qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block, Updates_index_1block, (void) qmckl_slagel_splitting(
breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant); LDS, Dim, 1, Updates_1block, Updates_index_1block, breakdown,
later = later + l; Slater_inv, later_updates + (LDS * later), later_index + later, &l,
determinant);
later += l;
} }
if (later > 0) { if (later > 0) {
(void) qmckl_sherman_morrison_splitting(context, LDS, Dim, later, later_updates, later_index, breakdown, Slater_inv, determinant); (void) qmckl_sherman_morrison_splitting(
context, LDS, Dim, later, later_updates, later_index, breakdown,
Slater_inv, determinant);
} }
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
} }
@ -1243,14 +1313,14 @@ assert(Updates5 != NULL);
assert(Updates_index5 != NULL); assert(Updates_index5 != NULL);
assert(Slater_inv5 != NULL); assert(Slater_inv5 != NULL);
det = -3.186005284713128e-10; det = -3.186005284713128e-10;
rc = qmckl_sherman_morrison_smw32s(context, Dim, Dim, N_updates5, Updates5, Updates_index5, breakdown, Slater_inv5, &det); rc = qmckl_sherman_morrison_smw32s(context, LDS, Dim, N_updates5, Updates5, Updates_index5, breakdown, Slater_inv5, &det);
assert(fabs(det + 5.260200118412903e-10) < 1e-15); assert(fabs(det + 5.260200118412903e-10) < 1e-15);
for (unsigned int i = 0; i < Dim; i++) { for (unsigned int i = 0; i < Dim; i++) {
for (unsigned int j = 0; j < Dim; j++) { for (unsigned int j = 0; j < Dim; j++) {
res[i * Dim + j] = 0; res[i * Dim + j] = 0;
for (unsigned int k = 0; k < Dim; k++) { for (unsigned int k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater5[i * Dim + k] * Slater_inv5[k * Dim + j]; res[i * Dim + j] += Slater5[i * Dim + k] * Slater_inv5[k * LDS + j];
} }
} }
} }
@ -1342,67 +1412,70 @@ These functions can only be used internally by the kernels in this module.
*** C source *** C source
#+begin_src c :tangle (eval c) :comments org #+begin_src c :tangle (eval c) :comments org
#include <stdbool.h>
#include <math.h>
#include "qmckl.h"
qmckl_exit_code qmckl_slagel_splitting(uint64_t LDS, qmckl_exit_code qmckl_slagel_splitting(uint64_t LDS,
uint64_t Dim, uint64_t Dim,
uint64_t N_updates, uint64_t N_updates,
const double *Updates, const double* Updates,
const uint64_t *Updates_index, const uint64_t* Updates_index,
const double breakdown, const double breakdown,
double *Slater_inv, double* Slater_inv,
double *later_updates, double* later_updates,
uint64_t *later_index, uint64_t* later_index,
uint64_t *later, uint64_t* later,
double *determinant) { double* determinant) {
// #ifdef DEBUG // Leave commented out since debugging information is not yet implemented in QMCkl.
// std::cerr << "Called slagel_splitting with " << N_updates << " updates" << std::endl;
// #endif
double C[Dim]; double __attribute__((aligned(8))) C[LDS];
double D[Dim]; double __attribute__((aligned(8))) D[LDS];
uint64_t l = 0; uint64_t l = 0;
// For each update // For each update
while (l < N_updates) { while (l < N_updates) {
// C = S^{-1} x U_l // C = S^{-1} x U_l
for (uint64_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
C[i] = 0; C[i] = 0.0f;
for (uint64_t j = 0; j < Dim; j++) { IVDEP
C[i] += Slater_inv[i * LDS + j] * Updates[l * Dim + j]; ALIGNED
for (uint64_t j = 0; j < LDS; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j];
} }
} }
// Denominator // Denominator
double den = 1 + C[Updates_index[l] - 1]; const int cui = Updates_index[l] - 1;
if (fabs(den) < breakdown) { // Here is decided to split the update, or not. double den = 1.0f + C[cui];
if (fabs(den) < breakdown) {
// U_l = U_l / 2: split the update in 2 equal halves and save the second halve in later_updates // U_l = U_l / 2: split the update in 2 equal halves and save the
for (uint64_t i = 0; i < Dim; i++) { // second halve in later_updates
later_updates[*later * Dim + i] = Updates[l * Dim + i] / 2.0; IVDEP
C[i] /= 2.0; ALIGNED
for (uint64_t i = 0; i < LDS; i++) {
later_updates[*later * LDS + i] = Updates[l * LDS + i] * 0.5f;
C[i] *= 0.5f;
} }
later_index[*later] = Updates_index[l]; later_index[*later] = Updates_index[l];
(*later)++; (*later)++;
den = 1 + C[Updates_index[l] - 1]; den = 1.0f + C[cui];
} // From here onwards we continue with applying the first havel of the update to Slater_inv } // From here onwards we continue with applying the first halve of the
double iden = 1 / den; // update to Slater_inv
double iden = 1.0f / den;
if (determinant != NULL) if (determinant)
*determinant *= den; *determinant *= den;
// D = v^T x S^{-1} // D = v^T x S^{-1} : 1 x LDS
for (uint64_t j = 0; j < Dim; j++) { IVDEP
D[j] = Slater_inv[(Updates_index[l] - 1) * LDS + j]; ALIGNED
for (uint64_t j = 0; j < LDS; j++) {
D[j] = Slater_inv[cui * LDS + j];
} }
// S^{-1} = S^{-1} - C x D / den // S^{-1} = S^{-1} - C x D / den
for (uint64_t i = 0; i < Dim; i++) { for (uint64_t i = 0; i < Dim; i++) {
for (uint64_t j = 0; j < Dim; j++) { IVDEP
double update = C[i] * D[j] * iden; ALIGNED
for (uint64_t j = 0; j < LDS; j++) {
const double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update; Slater_inv[i * LDS + j] -= update;
} }
} }

File diff suppressed because one or more lines are too long