Added first version of K x K Woodbury kernel using only CBLAS and LAPACK calls

This commit is contained in:
Francois Coppens 2022-07-20 19:09:55 +02:00
parent 732045284a
commit 0a083e2875
7 changed files with 156 additions and 4 deletions

View File

@ -0,0 +1,9 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -0,0 +1,9 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -1,7 +1,10 @@
#include <mkl_lapacke.h> #include <mkl_lapacke.h>
#include <mkl.h>
lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS); lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
int min(int a, int b);
uint32_t qmckl_sherman_morrison( uint32_t qmckl_sherman_morrison(
const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates, const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
const double *__restrict __attribute__((aligned(8))) Updates, const double *__restrict __attribute__((aligned(8))) Updates,
@ -32,6 +35,15 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
Slater_inv, Slater_inv,
double *__restrict determinant); double *__restrict determinant);
uint32_t qmckl_woodbury_k(const uint64_t vLDS,
const uint64_t vDim,
const uint64_t N_updates,
const double *__restrict __attribute__((aligned(8))) Updates,
const uint64_t *__restrict Updates_index,
const double breakdown,
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant);
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
const double *__restrict __attribute__((aligned(8))) const double *__restrict __attribute__((aligned(8)))
Updates, Updates,

View File

@ -0,0 +1,9 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -1,12 +1,18 @@
#include <math.h> #include <math.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include "kernels.h" #include "kernels.h"
extern uint64_t n_splits; extern uint64_t n_splits;
extern uint64_t block_fail; extern uint64_t block_fail;
extern uint64_t recursive_calls; extern uint64_t recursive_calls;
int min(int a, int b) {
return (a > b) ? b : a;
}
uint32_t qmckl_sherman_morrison( uint32_t qmckl_sherman_morrison(
const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates, const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
const double *__restrict __attribute__((aligned(8))) Updates, const double *__restrict __attribute__((aligned(8))) Updates,
@ -265,6 +271,91 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
return 0; return 0;
} }
/*
COMPUTE S^{-1} - C B^{-1} D : Dim x LDS,
where S^{-1} : Dim x LDS,
C := S^{-1} U : Dim x K, dgemm
B := 1 + V C : K x K, copy
D := V S^{-1} : K x LDS, copy
U : LDS x K,
V : K x Dim
tmp := B^{-1} D : K x LDS, dgemm
S = S - C tmp : Dim x LDS, dgemm
*/
uint32_t qmckl_woodbury_k(const uint64_t vLDS,
const uint64_t vDim,
const uint64_t N_updates,
const double *__restrict __attribute__((aligned(8))) Updates,
const uint64_t *__restrict Updates_index,
const double breakdown,
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
// Compute C = S^{-1} U : Dim x K : standard dgemm
double C[Dim * N_updates];
double alpha = 1.0, beta = 0.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
Dim, N_updates, LDS,
alpha, Slater_inv, LDS, Updates, LDS,
beta, C, N_updates);
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
// Construct D = V S^{-1} : K x LDS
double B[N_updates * N_updates], D[N_updates * LDS];
for (uint32_t i = 0; i < N_updates; i++) {
const uint32_t row = Updates_index[i] - 1;
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
}
// Compute determinant by LU decomposition
int ipiv[N_updates];
lapack_int ret;
ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
if (ret != 0) return ret;
double det = 1.0;
int j = 0;
for (uint32_t i = 0; i < N_updates; i++) {
j += min(abs(ipiv[i] - i), 1);
det *= B[(N_updates + 1) * i];
}
if (j & 1 == 0) {
det = -det;
}
// Check if determinant of B is not too close to zero
if (fabs(det) < breakdown) {
return 1;
}
// Update det(Slater) if passed
if (determinant) *determinant *= det;
// Compute B^{-1} with explicit formula for K x K inversion
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
if (ret != 0) return ret;
// tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
double tmp[N_updates * LDS];
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
N_updates, LDS, N_updates,
alpha, B, N_updates, D, LDS,
beta, tmp, LDS);
// Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm
alpha = -1.0, beta = 1.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
Dim, LDS, N_updates,
alpha, C, N_updates, tmp, LDS,
beta, Slater_inv, LDS);
return 0;
}
uint32_t qmckl_slagel_splitting( uint32_t qmckl_slagel_splitting(
const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates, const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates,
const double *__restrict __attribute__((aligned(8))) Updates, const double *__restrict __attribute__((aligned(8))) Updates,

View File

@ -41,10 +41,8 @@ printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n"); printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
// FOR EACH UPDATE CYCLE DO: // FOR EACH UPDATE CYCLE DO:
for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) { // for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
// for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) { for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) {
// for (uint32_t cycles_index = 65; cycles_index < 66; cycles_index++) {
// for (uint32_t cycles_index = 8055; cycles_index < 8056; cycles_index++) {
// 1. READ DATA FROM DATASET // 1. READ DATA FROM DATASET
uint32_t cycle = cycles[cycles_index]; uint32_t cycle = cycles[cycles_index];
sprintf(nupds_key, "/cycle_%d/nupdates", cycle); sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
@ -202,6 +200,21 @@ printf("#-----------------------------------------------------------------------
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before); accumulator += (double)(after - before);
} else if (version[0] == 'k') { // Woodbury K
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 's') { // Splitting } else if (version[0] == 's') { // Splitting
// 1. FETCH START TIME // 1. FETCH START TIME

View File

@ -0,0 +1,9 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------