mirror of
https://github.com/TREX-CoE/qmckl.git
synced 2025-01-08 04:19:15 +01:00
Merge pull request #99 from fmgjcoppens/master
Various SIMD_LENGTH related issues.
This commit is contained in:
commit
728a81f96b
@ -151,15 +151,15 @@ qmckl_exit_code qmckl_sherman_morrison_hpc(
|
|||||||
double __attribute__((aligned(8))) C[Dim];
|
double __attribute__((aligned(8))) C[Dim];
|
||||||
double __attribute__((aligned(8))) D[LDS];
|
double __attribute__((aligned(8))) D[LDS];
|
||||||
|
|
||||||
uint32_t l = 0;
|
uint64_t l = 0;
|
||||||
// For each update
|
// For each update
|
||||||
while (l < N_updates) {
|
while (l < N_updates) {
|
||||||
// C = S^{-1} x u_l
|
// C = S^{-1} x u_l
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
C[i] = 0.0f;
|
C[i] = 0.0f;
|
||||||
IVDEP
|
IVDEP
|
||||||
ALIGNED
|
ALIGNED
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j];
|
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -180,15 +180,15 @@ qmckl_exit_code qmckl_sherman_morrison_hpc(
|
|||||||
// selecting column: v_l^T * S_inv
|
// selecting column: v_l^T * S_inv
|
||||||
IVDEP
|
IVDEP
|
||||||
ALIGNED
|
ALIGNED
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
D[j] = Slater_inv[cui * LDS + j];
|
D[j] = Slater_inv[cui * LDS + j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// A^{-1} = A^{-1} - C x D / den
|
// A^{-1} = A^{-1} - C x D / den
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
IVDEP
|
IVDEP
|
||||||
ALIGNED
|
ALIGNED
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
const double update = C[i] * D[j] * iden;
|
const double update = C[i] * D[j] * iden;
|
||||||
Slater_inv[i * LDS + j] -= update;
|
Slater_inv[i * LDS + j] -= update;
|
||||||
}
|
}
|
||||||
@ -413,6 +413,7 @@ The tests for the kernels are executed on datasets that are extracted from a run
|
|||||||
|
|
||||||
#+begin_src c :tangle (eval c_test)
|
#+begin_src c :tangle (eval c_test)
|
||||||
const uint64_t Dim = 21;
|
const uint64_t Dim = 21;
|
||||||
|
const uint64_t LDS = (1 + (Dim) / SIMD_LENGTH) * SIMD_LENGTH;
|
||||||
const double breakdown = 1e-3;
|
const double breakdown = 1e-3;
|
||||||
const double tolerance = 1e-3;
|
const double tolerance = 1e-3;
|
||||||
double res[441];
|
double res[441];
|
||||||
@ -425,7 +426,15 @@ assert(Slater_inv1 != NULL);
|
|||||||
|
|
||||||
// original determinant of Slater1 (before applying updates)
|
// original determinant of Slater1 (before applying updates)
|
||||||
double det = 3.407025646103221e-10;
|
double det = 3.407025646103221e-10;
|
||||||
rc = qmckl_sherman_morrison(context, Dim, Dim, N_updates1, Updates1, Updates_index1, breakdown, Slater_inv1, &det);
|
rc = qmckl_sherman_morrison(context,
|
||||||
|
LDS,
|
||||||
|
Dim,
|
||||||
|
N_updates1,
|
||||||
|
Updates1,
|
||||||
|
Updates_index1,
|
||||||
|
breakdown,
|
||||||
|
Slater_inv1,
|
||||||
|
&det);
|
||||||
|
|
||||||
// Check that the determinant is updated properly
|
// Check that the determinant is updated properly
|
||||||
assert(fabs(det + 4.120398385068217e-10) < 1e-15);
|
assert(fabs(det + 4.120398385068217e-10) < 1e-15);
|
||||||
@ -434,7 +443,7 @@ for (unsigned int i = 0; i < Dim; i++) {
|
|||||||
for (unsigned int j = 0; j < Dim; j++) {
|
for (unsigned int j = 0; j < Dim; j++) {
|
||||||
res[i * Dim + j] = 0;
|
res[i * Dim + j] = 0;
|
||||||
for (unsigned int k = 0; k < Dim; k++) {
|
for (unsigned int k = 0; k < Dim; k++) {
|
||||||
res[i * Dim + j] += Slater1[i * Dim + k] * Slater_inv1[k * Dim + j];
|
res[i * Dim + j] += Slater1[i * Dim + k] * Slater_inv1[k * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -519,10 +528,6 @@ assert(rc == QMCKL_SUCCESS);
|
|||||||
*** C source
|
*** C source
|
||||||
|
|
||||||
#+begin_src c :tangle (eval c) :comments org
|
#+begin_src c :tangle (eval c) :comments org
|
||||||
#include <stdbool.h>
|
|
||||||
#include <math.h>
|
|
||||||
#include "qmckl.h"
|
|
||||||
|
|
||||||
qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
|
qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
|
||||||
const uint64_t LDS,
|
const uint64_t LDS,
|
||||||
const uint64_t Dim,
|
const uint64_t Dim,
|
||||||
@ -538,25 +543,29 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
||||||
return QMCKL_NULL_CONTEXT;
|
return qmckl_failwith(context,
|
||||||
|
QMCKL_NULL_CONTEXT,
|
||||||
|
"qmckl_woodbury_2",
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64_t row1 = (Updates_index[0] - 1);
|
const uint64_t row1 = (Updates_index[0] - 1);
|
||||||
const uint64_t row2 = (Updates_index[1] - 1);
|
const uint64_t row2 = (Updates_index[1] - 1);
|
||||||
|
|
||||||
// Compute C = S_inv * U !! NON-STANDARD MATRIX MULTIPLICATION BECAUSE
|
// Compute C = (S^T)^{-1}U : Dim x 2
|
||||||
// OF LAYOUT OF 'Updates' !!
|
double __attribute__((aligned(8))) C[2 * Dim];
|
||||||
double C[2 * Dim];
|
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
for (uint64_t j = 0; j < 2; j++) {
|
C[i * 2] = 0;
|
||||||
C[i * 2 + j] = 0;
|
C[i * 2 + 1] = 0;
|
||||||
for (uint64_t k = 0; k < Dim; k++) {
|
IVDEP
|
||||||
C[i * 2 + j] += Slater_inv[i * LDS + k] * Updates[Dim * j + k];
|
ALIGNED
|
||||||
|
for (uint64_t k = 0; k < LDS; k++) {
|
||||||
|
C[i * 2] += Slater_inv[i * LDS + k] * Updates[k];
|
||||||
|
C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute B = 1 + V * C
|
// Compute B = 1 + VC : 2 x 2
|
||||||
const double B0 = C[row1 * 2] + 1;
|
const double B0 = C[row1 * 2] + 1;
|
||||||
const double B1 = C[row1 * 2 + 1];
|
const double B1 = C[row1 * 2 + 1];
|
||||||
const double B2 = C[row2 * 2];
|
const double B2 = C[row2 * 2];
|
||||||
@ -565,35 +574,39 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context,
|
|||||||
// Check if determinant of inverted matrix is not zero
|
// Check if determinant of inverted matrix is not zero
|
||||||
double det = B0 * B3 - B1 * B2;
|
double det = B0 * B3 - B1 * B2;
|
||||||
if (fabs(det) < breakdown) {
|
if (fabs(det) < breakdown) {
|
||||||
return QMCKL_FAILURE;
|
return QMCKL_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update det(S) when passed
|
// Update det(S) when passed
|
||||||
if (determinant != NULL)
|
if (determinant)
|
||||||
*determinant *= det;
|
*determinant *= det;
|
||||||
|
|
||||||
// Compute B^{-1} with explicit formula for 2x2 inversion
|
// Compute B^{-1} with explicit formula for 2 x 2 inversion
|
||||||
double Binv[4], idet = 1.0 / det;
|
double __attribute__((aligned(8))) Binv[4], idet = 1.0 / det;
|
||||||
Binv[0] = idet * B3;
|
Binv[0] = idet * B3;
|
||||||
Binv[1] = -1.0 * idet * B1;
|
Binv[1] = -1.0 * idet * B1;
|
||||||
Binv[2] = -1.0 * idet * B2;
|
Binv[2] = -1.0 * idet * B2;
|
||||||
Binv[3] = idet * B0;
|
Binv[3] = idet * B0;
|
||||||
|
|
||||||
// Compute tmp = B^{-1} x (V.S^{-1})
|
// tmp = B^{-1}D : 2 x LDS
|
||||||
double tmp[2 * Dim];
|
double __attribute__((aligned(8))) tmp[2 * LDS];
|
||||||
for (uint64_t i = 0; i < 2; i++) {
|
double* r1dim = &(Slater_inv[row1 * LDS]);
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
double* r2dim = &(Slater_inv[row2 * LDS]);
|
||||||
tmp[i * Dim + j] = Binv[i * 2] * Slater_inv[row1 * LDS + j];
|
IVDEP
|
||||||
tmp[i * Dim + j] += Binv[i * 2 + 1] * Slater_inv[row2 * LDS + j];
|
ALIGNED
|
||||||
}
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
|
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
|
||||||
|
tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute (S + U V)^{-1} = S^{-1} - C x tmp
|
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
IVDEP
|
||||||
Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j];
|
ALIGNED
|
||||||
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[Dim + j];
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
}
|
Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j];
|
||||||
|
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return QMCKL_SUCCESS;
|
return QMCKL_SUCCESS;
|
||||||
@ -644,13 +657,13 @@ assert(Updates2 != NULL);
|
|||||||
assert(Updates_index2 != NULL);
|
assert(Updates_index2 != NULL);
|
||||||
assert(Slater_inv2 != NULL);
|
assert(Slater_inv2 != NULL);
|
||||||
det = -1.4432116661319376e-11;
|
det = -1.4432116661319376e-11;
|
||||||
rc = qmckl_woodbury_2(context, Dim, Dim, Updates2, Updates_index2, breakdown, Slater_inv2, &det);
|
rc = qmckl_woodbury_2(context, LDS, Dim, Updates2, Updates_index2, breakdown, Slater_inv2, &det);
|
||||||
assert(fabs(det-2.367058141251457e-10) < 1e-15);
|
assert(fabs(det-2.367058141251457e-10) < 1e-15);
|
||||||
for (unsigned int i = 0; i < Dim; i++) {
|
for (unsigned int i = 0; i < Dim; i++) {
|
||||||
for (unsigned int j = 0; j < Dim; j++) {
|
for (unsigned int j = 0; j < Dim; j++) {
|
||||||
res[i * Dim + j] = 0;
|
res[i * Dim + j] = 0;
|
||||||
for (unsigned int k = 0; k < Dim; k++) {
|
for (unsigned int k = 0; k < Dim; k++) {
|
||||||
res[i * Dim + j] += Slater2[i * Dim + k] * Slater_inv2[k * Dim + j];
|
res[i * Dim + j] += Slater2[i * Dim + k] * Slater_inv2[k * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -687,8 +700,8 @@ assert(rc == QMCKL_SUCCESS);
|
|||||||
|
|
||||||
If the determinant of the Slater-matrix is passed, it will be updated to the determinant resulting
|
If the determinant of the Slater-matrix is passed, it will be updated to the determinant resulting
|
||||||
from applying the updates to the original matrix.
|
from applying the updates to the original matrix.
|
||||||
|
#pragma ivdep
|
||||||
|
#pragma vector aligned
|
||||||
|
|
||||||
#+NAME: qmckl_woodbury_3_args
|
#+NAME: qmckl_woodbury_3_args
|
||||||
| qmckl_context | context | in | Global state |
|
| qmckl_context | context | in | Global state |
|
||||||
@ -730,10 +743,6 @@ assert(rc == QMCKL_SUCCESS);
|
|||||||
*** C source
|
*** C source
|
||||||
|
|
||||||
#+begin_src c :tangle (eval c) :comments org
|
#+begin_src c :tangle (eval c) :comments org
|
||||||
#include <stdbool.h>
|
|
||||||
#include <math.h>
|
|
||||||
#include "qmckl.h"
|
|
||||||
|
|
||||||
qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
|
qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
|
||||||
const uint64_t LDS,
|
const uint64_t LDS,
|
||||||
const uint64_t Dim,
|
const uint64_t Dim,
|
||||||
@ -749,26 +758,32 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
|
|||||||
,*/
|
,*/
|
||||||
|
|
||||||
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
||||||
return QMCKL_NULL_CONTEXT;
|
return qmckl_failwith(context,
|
||||||
|
QMCKL_NULL_CONTEXT,
|
||||||
|
"qmckl_woodbury_3",
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64_t row1 = (Updates_index[0] - 1);
|
const uint64_t row1 = (Updates_index[0] - 1);
|
||||||
const uint64_t row2 = (Updates_index[1] - 1);
|
const uint64_t row2 = (Updates_index[1] - 1);
|
||||||
const uint64_t row3 = (Updates_index[2] - 1);
|
const uint64_t row3 = (Updates_index[2] - 1);
|
||||||
|
|
||||||
// Compute C = S_inv * U !! NON-STANDARD MATRIX MULTIPLICATION BECAUSE
|
// Compute C = (S^T)^{-1}U : Dim x 3
|
||||||
// OF LAYOUT OF 'Updates' !!
|
double __attribute__((aligned(8))) C[3 * Dim];
|
||||||
double C[3 * Dim];
|
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
for (uint64_t j = 0; j < 3; j++) {
|
C[i * 3] = 0;
|
||||||
C[i * 3 + j] = 0;
|
C[i * 3 + 1] = 0;
|
||||||
for (uint64_t k = 0; k < Dim; k++) {
|
C[i * 3 + 2] = 0;
|
||||||
C[i * 3 + j] += Slater_inv[i * LDS + k] * Updates[Dim * j + k];
|
IVDEP
|
||||||
|
ALIGNED
|
||||||
|
for (uint64_t k = 0; k < LDS; k++) {
|
||||||
|
C[i * 3] += Slater_inv[i * LDS + k] * Updates[k];
|
||||||
|
C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
|
||||||
|
C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k];
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute B = 1 + V.C
|
// Compute B = 1 + VC : 3 x 3
|
||||||
const double B0 = C[row1 * 3] + 1;
|
const double B0 = C[row1 * 3] + 1;
|
||||||
const double B1 = C[row1 * 3 + 1];
|
const double B1 = C[row1 * 3 + 1];
|
||||||
const double B2 = C[row1 * 3 + 2];
|
const double B2 = C[row1 * 3 + 2];
|
||||||
@ -784,15 +799,15 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
|
|||||||
det = B0 * (B4 * B8 - B5 * B7) - B1 * (B3 * B8 - B5 * B6) +
|
det = B0 * (B4 * B8 - B5 * B7) - B1 * (B3 * B8 - B5 * B6) +
|
||||||
B2 * (B3 * B7 - B4 * B6);
|
B2 * (B3 * B7 - B4 * B6);
|
||||||
if (fabs(det) < breakdown) {
|
if (fabs(det) < breakdown) {
|
||||||
return QMCKL_FAILURE;
|
return QMCKL_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update det(Slater) if passed
|
// Update det(Slater) if passed
|
||||||
if (determinant != NULL)
|
if (determinant)
|
||||||
*determinant *= det;
|
*determinant *= det;
|
||||||
|
|
||||||
// Compute B^{-1} with explicit formula for 3x3 inversion
|
// Compute B^{-1} with explicit formula for 3 x 3 inversion
|
||||||
double Binv[9], idet = 1.0 / det;
|
double __attribute__((aligned(8))) Binv[9], idet = 1.0 / det;
|
||||||
Binv[0] = (B4 * B8 - B7 * B5) * idet;
|
Binv[0] = (B4 * B8 - B7 * B5) * idet;
|
||||||
Binv[1] = -(B1 * B8 - B7 * B2) * idet;
|
Binv[1] = -(B1 * B8 - B7 * B2) * idet;
|
||||||
Binv[2] = (B1 * B5 - B4 * B2) * idet;
|
Binv[2] = (B1 * B5 - B4 * B2) * idet;
|
||||||
@ -803,23 +818,30 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context,
|
|||||||
Binv[7] = -(B0 * B7 - B6 * B1) * idet;
|
Binv[7] = -(B0 * B7 - B6 * B1) * idet;
|
||||||
Binv[8] = (B0 * B4 - B3 * B1) * idet;
|
Binv[8] = (B0 * B4 - B3 * B1) * idet;
|
||||||
|
|
||||||
// Compute tmp = B^{-1} x (V.S^{-1})
|
// tmp = B^{-1}D : 3 x LDS
|
||||||
double tmp[3 * Dim];
|
double __attribute__((aligned(8))) tmp[3 * LDS];
|
||||||
for (uint64_t i = 0; i < 3; i++) {
|
double* r1dim = &(Slater_inv[row1 * LDS]);
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
double* r2dim = &(Slater_inv[row2 * LDS]);
|
||||||
tmp[i * Dim + j] = Binv[i * 3] * Slater_inv[row1 * LDS + j];
|
double* r3dim = &(Slater_inv[row3 * LDS]);
|
||||||
tmp[i * Dim + j] += Binv[i * 3 + 1] * Slater_inv[row2 * LDS + j];
|
IVDEP
|
||||||
tmp[i * Dim + j] += Binv[i * 3 + 2] * Slater_inv[row3 * LDS + j];
|
ALIGNED
|
||||||
}
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
|
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
|
||||||
|
tmp[LDS + j] =
|
||||||
|
Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
|
||||||
|
tmp[2 * LDS + j] =
|
||||||
|
Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute (S + U V)^{-1} = S^{-1} - C x tmp
|
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
IVDEP
|
||||||
Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j];
|
ALIGNED
|
||||||
Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[Dim + j];
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * Dim + j];
|
Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j];
|
||||||
}
|
Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j];
|
||||||
|
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return QMCKL_SUCCESS;
|
return QMCKL_SUCCESS;
|
||||||
@ -870,13 +892,13 @@ assert(Updates3 != NULL);
|
|||||||
assert(Updates_index3 != NULL);
|
assert(Updates_index3 != NULL);
|
||||||
assert(Slater_inv3_1 != NULL);
|
assert(Slater_inv3_1 != NULL);
|
||||||
det = -1.23743195512859e-09;
|
det = -1.23743195512859e-09;
|
||||||
rc = qmckl_woodbury_3(context, Dim, Dim, Updates3, Updates_index3, breakdown, Slater_inv3_1, &det);
|
rc = qmckl_woodbury_3(context, LDS, Dim, Updates3, Updates_index3, breakdown, Slater_inv3_1, &det);
|
||||||
assert(fabs(det - 1.602708950725074e-10) < 1e-15);
|
assert(fabs(det - 1.602708950725074e-10) < 1e-15);
|
||||||
for (unsigned int i = 0; i < Dim; i++) {
|
for (unsigned int i = 0; i < Dim; i++) {
|
||||||
for (unsigned int j = 0; j < Dim; j++) {
|
for (unsigned int j = 0; j < Dim; j++) {
|
||||||
res[i * Dim + j] = 0;
|
res[i * Dim + j] = 0;
|
||||||
for (unsigned int k = 0; k < Dim; k++) {
|
for (unsigned int k = 0; k < Dim; k++) {
|
||||||
res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_1[k * Dim + j];
|
res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_1[k * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -963,9 +985,6 @@ assert(rc == QMCKL_SUCCESS);
|
|||||||
*** C source
|
*** C source
|
||||||
|
|
||||||
#+begin_src c :tangle (eval c) :comments org
|
#+begin_src c :tangle (eval c) :comments org
|
||||||
#include <stdbool.h>
|
|
||||||
#include "qmckl.h"
|
|
||||||
|
|
||||||
qmckl_exit_code qmckl_sherman_morrison_splitting(const qmckl_context context,
|
qmckl_exit_code qmckl_sherman_morrison_splitting(const qmckl_context context,
|
||||||
const uint64_t LDS,
|
const uint64_t LDS,
|
||||||
const uint64_t Dim,
|
const uint64_t Dim,
|
||||||
@ -977,19 +996,24 @@ qmckl_exit_code qmckl_sherman_morrison_splitting(const qmckl_context context,
|
|||||||
double* determinant) {
|
double* determinant) {
|
||||||
|
|
||||||
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
||||||
return QMCKL_NULL_CONTEXT;
|
return qmckl_failwith(context,
|
||||||
|
QMCKL_NULL_CONTEXT,
|
||||||
|
"qmckl_sherman_morrison_splitting",
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
double later_updates[Dim * N_updates];
|
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||||
uint64_t later_index[N_updates];
|
uint64_t later_index[N_updates];
|
||||||
uint64_t later = 0;
|
uint64_t later = 0;
|
||||||
|
|
||||||
(void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
|
(void) qmckl_slagel_splitting(
|
||||||
breakdown, Slater_inv, later_updates, later_index, &later, determinant);
|
LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv,
|
||||||
|
later_updates, later_index, &later, determinant);
|
||||||
|
|
||||||
if (later > 0) {
|
if (later > 0) {
|
||||||
(void) qmckl_sherman_morrison_splitting(context, LDS, Dim, later,
|
(void) qmckl_sherman_morrison_splitting(
|
||||||
later_updates, later_index, breakdown, Slater_inv, determinant);
|
context, LDS, Dim, later, later_updates, later_index, breakdown,
|
||||||
|
Slater_inv, determinant);
|
||||||
}
|
}
|
||||||
|
|
||||||
return QMCKL_SUCCESS;
|
return QMCKL_SUCCESS;
|
||||||
@ -1041,13 +1065,13 @@ assert(Updates3 != NULL);
|
|||||||
assert(Updates_index3 != NULL);
|
assert(Updates_index3 != NULL);
|
||||||
assert(Slater_inv3_2 != NULL);
|
assert(Slater_inv3_2 != NULL);
|
||||||
det = -1.23743195512859e-09;
|
det = -1.23743195512859e-09;
|
||||||
rc = qmckl_sherman_morrison_splitting(context, Dim, Dim, N_updates3, Updates3, Updates_index3, breakdown, Slater_inv3_2, &det);
|
rc = qmckl_sherman_morrison_splitting(context, LDS, Dim, N_updates3, Updates3, Updates_index3, breakdown, Slater_inv3_2, &det);
|
||||||
assert(fabs(det - 1.602708950725074e-10) < 1e-15);
|
assert(fabs(det - 1.602708950725074e-10) < 1e-15);
|
||||||
for (unsigned int i = 0; i < Dim; i++) {
|
for (unsigned int i = 0; i < Dim; i++) {
|
||||||
for (unsigned int j = 0; j < Dim; j++) {
|
for (unsigned int j = 0; j < Dim; j++) {
|
||||||
res[i * Dim + j] = 0;
|
res[i * Dim + j] = 0;
|
||||||
for (unsigned int k = 0; k < Dim; k++) {
|
for (unsigned int k = 0; k < Dim; k++) {
|
||||||
res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_2[k * Dim + j];
|
res[i * Dim + j] += Slater3[i * Dim + k] * Slater_inv3_2[k * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1127,9 +1151,6 @@ assert(rc == QMCKL_SUCCESS);
|
|||||||
*** C source
|
*** C source
|
||||||
|
|
||||||
#+begin_src c :tangle (eval c) :comments org
|
#+begin_src c :tangle (eval c) :comments org
|
||||||
#include <stdbool.h>
|
|
||||||
#include "qmckl.h"
|
|
||||||
|
|
||||||
qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context,
|
qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context,
|
||||||
const uint64_t LDS,
|
const uint64_t LDS,
|
||||||
const uint64_t Dim,
|
const uint64_t Dim,
|
||||||
@ -1141,59 +1162,108 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context,
|
|||||||
double* determinant) {
|
double* determinant) {
|
||||||
|
|
||||||
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
|
||||||
return QMCKL_NULL_CONTEXT;
|
return qmckl_failwith(context,
|
||||||
|
QMCKL_NULL_CONTEXT,
|
||||||
|
"qmckl_sherman_morrison_smw32s",
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
qmckl_exit_code rc;
|
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||||
|
|
||||||
uint64_t n_of_3blocks = N_updates / 3;
|
|
||||||
uint64_t remainder = N_updates % 3;
|
|
||||||
uint64_t length_3block = 3 * Dim;
|
|
||||||
|
|
||||||
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
|
|
||||||
// Woodbury 3x3 kernel
|
|
||||||
double later_updates[Dim * N_updates];
|
|
||||||
uint64_t later_index[N_updates];
|
uint64_t later_index[N_updates];
|
||||||
uint64_t later = 0;
|
uint64_t later = 0;
|
||||||
|
|
||||||
|
// Special case for 4 rank-1 updates: 2+2
|
||||||
|
if (N_updates == 4) {
|
||||||
|
qmckl_exit_code rc =
|
||||||
|
qmckl_woodbury_2(context, LDS, Dim, Updates, Updates_index,
|
||||||
|
breakdown, Slater_inv, determinant);
|
||||||
|
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
|
||||||
|
uint64_t l = 0;
|
||||||
|
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates, Updates_index,
|
||||||
|
breakdown, Slater_inv,
|
||||||
|
later_updates + (LDS * later),
|
||||||
|
later_index + later, &l, determinant);
|
||||||
|
later += l;
|
||||||
|
}
|
||||||
|
rc = qmckl_woodbury_2(context, LDS, Dim, &Updates[2 * LDS],
|
||||||
|
&Updates_index[2], breakdown, Slater_inv,
|
||||||
|
determinant);
|
||||||
|
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
|
||||||
|
uint64_t l = 0;
|
||||||
|
rc = qmckl_slagel_splitting(
|
||||||
|
LDS, Dim, 2, &Updates[2 * LDS], &Updates_index[2], breakdown,
|
||||||
|
Slater_inv, later_updates + (LDS * later), later_index + later,
|
||||||
|
&l, determinant);
|
||||||
|
later += l;
|
||||||
|
}
|
||||||
|
if (later > 0) {
|
||||||
|
rc = qmckl_sherman_morrison_splitting(
|
||||||
|
context, LDS, Dim, later, later_updates, later_index, breakdown,
|
||||||
|
Slater_inv, determinant);
|
||||||
|
}
|
||||||
|
return QMCKL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// And for the other cases != 4
|
||||||
|
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates
|
||||||
|
// with Woodbury 3x3 kernel
|
||||||
|
uint64_t n_of_3blocks = N_updates / 3;
|
||||||
|
uint64_t remainder = N_updates % 3;
|
||||||
|
uint64_t length_3block = 3 * LDS;
|
||||||
|
|
||||||
if (n_of_3blocks > 0) {
|
if (n_of_3blocks > 0) {
|
||||||
for (uint64_t i = 0; i < n_of_3blocks; i++) {
|
for (uint64_t i = 0; i < n_of_3blocks; i++) {
|
||||||
const double *Updates_3block = &Updates[i * length_3block];
|
const double* Updates_3block = &Updates[i * length_3block];
|
||||||
const uint64_t *Updates_index_3block = &Updates_index[i * 3];
|
const uint64_t* Updates_index_3block = &Updates_index[i * 3];
|
||||||
rc = qmckl_woodbury_3(context, LDS, Dim, Updates_3block, Updates_index_3block, breakdown, Slater_inv, determinant);
|
qmckl_exit_code rc = qmckl_woodbury_3(
|
||||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
context, LDS, Dim, Updates_3block, Updates_index_3block,
|
||||||
|
breakdown, Slater_inv, determinant);
|
||||||
|
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
(void) qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block, Updates_index_3block,
|
rc = qmckl_slagel_splitting(
|
||||||
breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant);
|
LDS, Dim, 3, Updates_3block, Updates_index_3block,
|
||||||
later = later + l;
|
breakdown, Slater_inv, later_updates + (LDS * later),
|
||||||
|
later_index + later, &l, determinant);
|
||||||
|
later += l;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply last remaining block of 2 updates with Woodbury 2x2 kernel
|
// Apply last remaining block of 2 updates with Woodbury 2x2 kernel
|
||||||
if (remainder == 2) {
|
if (remainder == 2) {
|
||||||
const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
|
const double* Updates_2block = &Updates[n_of_3blocks * length_3block];
|
||||||
const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
|
const uint64_t* Updates_index_2block = &Updates_index[3 * n_of_3blocks];
|
||||||
rc = qmckl_woodbury_2(context, LDS, Dim, Updates_2block, Updates_index_2block, breakdown, Slater_inv, determinant);
|
qmckl_exit_code rc = qmckl_woodbury_2(
|
||||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
context, LDS, Dim, Updates_2block, Updates_index_2block,
|
||||||
|
breakdown, Slater_inv, determinant);
|
||||||
|
if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
(void) qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block, Updates_index_2block,
|
rc = qmckl_slagel_splitting(
|
||||||
breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant);
|
LDS, Dim, 2, Updates_2block, Updates_index_2block, breakdown,
|
||||||
later = later + l;
|
Slater_inv, later_updates + (LDS * later), later_index + later,
|
||||||
|
&l, determinant);
|
||||||
|
later += l;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply last remaining update with slagel_splitting
|
// Apply last remaining update with slagel_splitting
|
||||||
else if (remainder == 1) {
|
if (remainder == 1) {
|
||||||
const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
|
const double* Updates_1block = &Updates[n_of_3blocks * length_3block];
|
||||||
const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
|
const uint64_t* Updates_index_1block = &Updates_index[3 * n_of_3blocks];
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
(void) qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block, Updates_index_1block,
|
(void) qmckl_slagel_splitting(
|
||||||
breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant);
|
LDS, Dim, 1, Updates_1block, Updates_index_1block, breakdown,
|
||||||
later = later + l;
|
Slater_inv, later_updates + (LDS * later), later_index + later, &l,
|
||||||
|
determinant);
|
||||||
|
later += l;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (later > 0) {
|
if (later > 0) {
|
||||||
(void) qmckl_sherman_morrison_splitting(context, LDS, Dim, later, later_updates, later_index, breakdown, Slater_inv, determinant);
|
(void) qmckl_sherman_morrison_splitting(
|
||||||
|
context, LDS, Dim, later, later_updates, later_index, breakdown,
|
||||||
|
Slater_inv, determinant);
|
||||||
}
|
}
|
||||||
|
|
||||||
return QMCKL_SUCCESS;
|
return QMCKL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1243,14 +1313,14 @@ assert(Updates5 != NULL);
|
|||||||
assert(Updates_index5 != NULL);
|
assert(Updates_index5 != NULL);
|
||||||
assert(Slater_inv5 != NULL);
|
assert(Slater_inv5 != NULL);
|
||||||
det = -3.186005284713128e-10;
|
det = -3.186005284713128e-10;
|
||||||
rc = qmckl_sherman_morrison_smw32s(context, Dim, Dim, N_updates5, Updates5, Updates_index5, breakdown, Slater_inv5, &det);
|
rc = qmckl_sherman_morrison_smw32s(context, LDS, Dim, N_updates5, Updates5, Updates_index5, breakdown, Slater_inv5, &det);
|
||||||
assert(fabs(det + 5.260200118412903e-10) < 1e-15);
|
assert(fabs(det + 5.260200118412903e-10) < 1e-15);
|
||||||
|
|
||||||
for (unsigned int i = 0; i < Dim; i++) {
|
for (unsigned int i = 0; i < Dim; i++) {
|
||||||
for (unsigned int j = 0; j < Dim; j++) {
|
for (unsigned int j = 0; j < Dim; j++) {
|
||||||
res[i * Dim + j] = 0;
|
res[i * Dim + j] = 0;
|
||||||
for (unsigned int k = 0; k < Dim; k++) {
|
for (unsigned int k = 0; k < Dim; k++) {
|
||||||
res[i * Dim + j] += Slater5[i * Dim + k] * Slater_inv5[k * Dim + j];
|
res[i * Dim + j] += Slater5[i * Dim + k] * Slater_inv5[k * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1342,67 +1412,70 @@ These functions can only be used internally by the kernels in this module.
|
|||||||
*** C source
|
*** C source
|
||||||
|
|
||||||
#+begin_src c :tangle (eval c) :comments org
|
#+begin_src c :tangle (eval c) :comments org
|
||||||
#include <stdbool.h>
|
|
||||||
#include <math.h>
|
|
||||||
#include "qmckl.h"
|
|
||||||
|
|
||||||
qmckl_exit_code qmckl_slagel_splitting(uint64_t LDS,
|
qmckl_exit_code qmckl_slagel_splitting(uint64_t LDS,
|
||||||
uint64_t Dim,
|
uint64_t Dim,
|
||||||
uint64_t N_updates,
|
uint64_t N_updates,
|
||||||
const double *Updates,
|
const double* Updates,
|
||||||
const uint64_t *Updates_index,
|
const uint64_t* Updates_index,
|
||||||
const double breakdown,
|
const double breakdown,
|
||||||
double *Slater_inv,
|
double* Slater_inv,
|
||||||
double *later_updates,
|
double* later_updates,
|
||||||
uint64_t *later_index,
|
uint64_t* later_index,
|
||||||
uint64_t *later,
|
uint64_t* later,
|
||||||
double *determinant) {
|
double* determinant) {
|
||||||
// #ifdef DEBUG // Leave commented out since debugging information is not yet implemented in QMCkl.
|
|
||||||
// std::cerr << "Called slagel_splitting with " << N_updates << " updates" << std::endl;
|
|
||||||
// #endif
|
|
||||||
|
|
||||||
double C[Dim];
|
double __attribute__((aligned(8))) C[LDS];
|
||||||
double D[Dim];
|
double __attribute__((aligned(8))) D[LDS];
|
||||||
|
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
// For each update
|
// For each update
|
||||||
while (l < N_updates) {
|
while (l < N_updates) {
|
||||||
// C = S^{-1} x U_l
|
// C = S^{-1} x U_l
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
C[i] = 0;
|
C[i] = 0.0f;
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
IVDEP
|
||||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * Dim + j];
|
ALIGNED
|
||||||
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
|
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Denominator
|
// Denominator
|
||||||
double den = 1 + C[Updates_index[l] - 1];
|
const int cui = Updates_index[l] - 1;
|
||||||
if (fabs(den) < breakdown) { // Here is decided to split the update, or not.
|
double den = 1.0f + C[cui];
|
||||||
|
if (fabs(den) < breakdown) {
|
||||||
// U_l = U_l / 2: split the update in 2 equal halves and save the second halve in later_updates
|
// U_l = U_l / 2: split the update in 2 equal halves and save the
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
// second half in later_updates
|
||||||
later_updates[*later * Dim + i] = Updates[l * Dim + i] / 2.0;
|
IVDEP
|
||||||
C[i] /= 2.0;
|
ALIGNED
|
||||||
|
for (uint64_t i = 0; i < LDS; i++) {
|
||||||
|
later_updates[*later * LDS + i] = Updates[l * LDS + i] * 0.5f;
|
||||||
|
C[i] *= 0.5f;
|
||||||
}
|
}
|
||||||
later_index[*later] = Updates_index[l];
|
later_index[*later] = Updates_index[l];
|
||||||
(*later)++;
|
(*later)++;
|
||||||
|
|
||||||
den = 1 + C[Updates_index[l] - 1];
|
den = 1.0f + C[cui];
|
||||||
} // From here onwards we continue with applying the first havel of the update to Slater_inv
|
} // From here onwards we continue with applying the first half of the
|
||||||
double iden = 1 / den;
|
// update to Slater_inv
|
||||||
|
double iden = 1.0f / den;
|
||||||
|
|
||||||
if (determinant != NULL)
|
if (determinant)
|
||||||
*determinant *= den;
|
*determinant *= den;
|
||||||
|
|
||||||
// D = v^T x S^{-1}
|
// D = v^T x S^{-1} : 1 x LDS
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
IVDEP
|
||||||
D[j] = Slater_inv[(Updates_index[l] - 1) * LDS + j];
|
ALIGNED
|
||||||
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
|
D[j] = Slater_inv[cui * LDS + j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// S^{-1} = S^{-1} - C x D / den
|
// S^{-1} = S^{-1} - C x D / den
|
||||||
for (uint64_t i = 0; i < Dim; i++) {
|
for (uint64_t i = 0; i < Dim; i++) {
|
||||||
for (uint64_t j = 0; j < Dim; j++) {
|
IVDEP
|
||||||
double update = C[i] * D[j] * iden;
|
ALIGNED
|
||||||
|
for (uint64_t j = 0; j < LDS; j++) {
|
||||||
|
const double update = C[i] * D[j] * iden;
|
||||||
Slater_inv[i * LDS + j] -= update;
|
Slater_inv[i * LDS + j] -= update;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user