mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2024-12-25 13:53:56 +01:00
Added first version of K x K Woodbury kernel using only CBLAS and LAPACK calls
This commit is contained in:
parent
732045284a
commit
0a083e2875
9
independent_test_harness/block
Normal file
9
independent_test_harness/block
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
9
independent_test_harness/kay
Normal file
9
independent_test_harness/kay
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
@ -1,7 +1,10 @@
|
|||||||
#include <mkl_lapacke.h>
|
#include <mkl_lapacke.h>
|
||||||
|
#include <mkl.h>
|
||||||
|
|
||||||
lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
|
lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
|
||||||
|
|
||||||
|
int min(int a, int b);
|
||||||
|
|
||||||
uint32_t qmckl_sherman_morrison(
|
uint32_t qmckl_sherman_morrison(
|
||||||
const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
|
const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
@ -32,6 +35,15 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
Slater_inv,
|
Slater_inv,
|
||||||
double *__restrict determinant);
|
double *__restrict determinant);
|
||||||
|
|
||||||
|
uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||||
|
const uint64_t vDim,
|
||||||
|
const uint64_t N_updates,
|
||||||
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
|
const uint64_t *__restrict Updates_index,
|
||||||
|
const double breakdown,
|
||||||
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
|
double *__restrict determinant);
|
||||||
|
|
||||||
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||||
const double *__restrict __attribute__((aligned(8)))
|
const double *__restrict __attribute__((aligned(8)))
|
||||||
Updates,
|
Updates,
|
||||||
|
9
independent_test_harness/mkl
Normal file
9
independent_test_harness/mkl
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
@ -1,12 +1,18 @@
|
|||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
#include "kernels.h"
|
#include "kernels.h"
|
||||||
|
|
||||||
|
|
||||||
extern uint64_t n_splits;
|
extern uint64_t n_splits;
|
||||||
extern uint64_t block_fail;
|
extern uint64_t block_fail;
|
||||||
extern uint64_t recursive_calls;
|
extern uint64_t recursive_calls;
|
||||||
|
|
||||||
|
int min(int a, int b) {
|
||||||
|
return (a > b) ? b : a;
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t qmckl_sherman_morrison(
|
uint32_t qmckl_sherman_morrison(
|
||||||
const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
|
const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
@ -265,6 +271,91 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
COMPUTE S^{-1} - C B^{-1} D : Dim x LDS,
|
||||||
|
where S^{-1} : Dim x LDS,
|
||||||
|
C := S^{-1} U : Dim x K, dgemm
|
||||||
|
B := 1 + V C : K x K, copy
|
||||||
|
D := V S^{-1} : K x LDS, copy
|
||||||
|
U : LDS x K,
|
||||||
|
V : K x Dim
|
||||||
|
tmp := B^{-1} D : K x LDS, dgemm
|
||||||
|
S = S - C tmp : Dim x LDS, dgemm
|
||||||
|
*/
|
||||||
|
uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||||
|
const uint64_t vDim,
|
||||||
|
const uint64_t N_updates,
|
||||||
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
|
const uint64_t *__restrict Updates_index,
|
||||||
|
const double breakdown,
|
||||||
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
|
double *__restrict determinant) {
|
||||||
|
|
||||||
|
const uint32_t Dim = 21;
|
||||||
|
const uint32_t LDS = 24;
|
||||||
|
|
||||||
|
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||||
|
double C[Dim * N_updates];
|
||||||
|
double alpha = 1.0, beta = 0.0;
|
||||||
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
|
Dim, N_updates, LDS,
|
||||||
|
alpha, Slater_inv, LDS, Updates, LDS,
|
||||||
|
beta, C, N_updates);
|
||||||
|
|
||||||
|
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
||||||
|
// Construct D = V S^{-1} : K x LDS
|
||||||
|
double B[N_updates * N_updates], D[N_updates * LDS];
|
||||||
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
|
const uint32_t row = Updates_index[i] - 1;
|
||||||
|
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||||
|
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute determinant by LU decomposition
|
||||||
|
int ipiv[N_updates];
|
||||||
|
lapack_int ret;
|
||||||
|
ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
||||||
|
if (ret != 0) return ret;
|
||||||
|
double det = 1.0;
|
||||||
|
int j = 0;
|
||||||
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
|
j += min(abs(ipiv[i] - i), 1);
|
||||||
|
det *= B[(N_updates + 1) * i];
|
||||||
|
}
|
||||||
|
if (j & 1 == 0) {
|
||||||
|
det = -det;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if determinant of B is not too close to zero
|
||||||
|
if (fabs(det) < breakdown) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update det(Slater) if passed
|
||||||
|
if (determinant) *determinant *= det;
|
||||||
|
|
||||||
|
// Compute B^{-1} with explicit formula for K x K inversion
|
||||||
|
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
||||||
|
if (ret != 0) return ret;
|
||||||
|
|
||||||
|
// tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||||
|
double tmp[N_updates * LDS];
|
||||||
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
|
N_updates, LDS, N_updates,
|
||||||
|
alpha, B, N_updates, D, LDS,
|
||||||
|
beta, tmp, LDS);
|
||||||
|
|
||||||
|
// Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm
|
||||||
|
alpha = -1.0, beta = 1.0;
|
||||||
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
|
Dim, LDS, N_updates,
|
||||||
|
alpha, C, N_updates, tmp, LDS,
|
||||||
|
beta, Slater_inv, LDS);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
uint32_t qmckl_slagel_splitting(
|
uint32_t qmckl_slagel_splitting(
|
||||||
const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates,
|
const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
|
@ -41,10 +41,8 @@ printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
|
|||||||
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
|
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
|
||||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||||
// FOR EACH UPDATE CYCLE DO:
|
// FOR EACH UPDATE CYCLE DO:
|
||||||
for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
|
// for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
|
||||||
// for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) {
|
for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) {
|
||||||
// for (uint32_t cycles_index = 65; cycles_index < 66; cycles_index++) {
|
|
||||||
// for (uint32_t cycles_index = 8055; cycles_index < 8056; cycles_index++) {
|
|
||||||
// 1. READ DATA FROM DATASET
|
// 1. READ DATA FROM DATASET
|
||||||
uint32_t cycle = cycles[cycles_index];
|
uint32_t cycle = cycles[cycles_index];
|
||||||
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
|
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
|
||||||
@ -202,6 +200,21 @@ printf("#-----------------------------------------------------------------------
|
|||||||
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||||
accumulator += (double)(after - before);
|
accumulator += (double)(after - before);
|
||||||
|
|
||||||
|
} else if (version[0] == 'k') { // Woodbury K
|
||||||
|
|
||||||
|
// 1. FETCH START TIME
|
||||||
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
|
err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates,
|
||||||
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
|
// 3. FETCH FINISH TIME
|
||||||
|
uint64_t after = rdtsc();
|
||||||
|
|
||||||
|
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||||
|
accumulator += (double)(after - before);
|
||||||
|
|
||||||
} else if (version[0] == 's') { // Splitting
|
} else if (version[0] == 's') { // Splitting
|
||||||
|
|
||||||
// 1. FETCH START TIME
|
// 1. FETCH START TIME
|
||||||
|
9
independent_test_harness/three
Normal file
9
independent_test_harness/three
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
|
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||||
|
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
Loading…
Reference in New Issue
Block a user