From 0a083e287583ba73d237814cb9d66ded1d36c0ec Mon Sep 17 00:00:00 2001 From: Francois Coppens Date: Wed, 20 Jul 2022 19:09:55 +0200 Subject: [PATCH] Added first version of K x K Woodbury kernel using only CBLAS and LAPACK calls --- independent_test_harness/block | 9 +++ independent_test_harness/kay | 9 +++ independent_test_harness/kernels.h | 12 ++++ independent_test_harness/mkl | 9 +++ independent_test_harness/sm.c | 91 ++++++++++++++++++++++++++++++ independent_test_harness/test.c | 21 +++++-- independent_test_harness/three | 9 +++ 7 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 independent_test_harness/block create mode 100644 independent_test_harness/kay create mode 100644 independent_test_harness/mkl create mode 100644 independent_test_harness/three diff --git a/independent_test_harness/block b/independent_test_harness/block new file mode 100644 index 0000000..690e572 --- /dev/null +++ b/independent_test_harness/block @@ -0,0 +1,9 @@ +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0 +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/independent_test_harness/kay b/independent_test_harness/kay new file mode 100644 index 0000000..697c51d --- /dev/null +++ b/independent_test_harness/kay @@ -0,0 +1,9 @@ +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0 +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h index 2a8a32e..ef5fed6 100644 --- a/independent_test_harness/kernels.h +++ b/independent_test_harness/kernels.h @@ -1,7 +1,10 @@ #include +#include lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS); +int min(int a, int b); + uint32_t qmckl_sherman_morrison( const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates, const double *__restrict __attribute__((aligned(8))) Updates, @@ -32,6 +35,15 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, Slater_inv, double *__restrict determinant); +uint32_t qmckl_woodbury_k(const uint64_t vLDS, + const uint64_t vDim, + const uint64_t N_updates, + const double *__restrict __attribute__((aligned(8))) Updates, + const uint64_t *__restrict Updates_index, + const double breakdown, + double *__restrict __attribute__((aligned(8))) Slater_inv, + double *__restrict determinant); + uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, const double *__restrict __attribute__((aligned(8))) Updates, diff --git a/independent_test_harness/mkl b/independent_test_harness/mkl new file mode 100644 index 0000000..c1d48d7 --- /dev/null +++ b/independent_test_harness/mkl @@ -0,0 +1,9 @@ +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0 +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/independent_test_harness/sm.c b/independent_test_harness/sm.c index 24854f6..ade2b3a 100644 --- a/independent_test_harness/sm.c +++ b/independent_test_harness/sm.c @@ -1,12 +1,18 @@ #include #include #include +#include #include "kernels.h" + extern uint64_t n_splits; extern uint64_t block_fail; extern uint64_t recursive_calls; +int min(int a, int b) { + return (a > b) ? b : a; +} + uint32_t qmckl_sherman_morrison( const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates, const double *__restrict __attribute__((aligned(8))) Updates, @@ -265,6 +271,91 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, return 0; } +/* +COMPUTE S^{-1} - C B^{-1} D : Dim x LDS, +where S^{-1} : Dim x LDS, + C := S^{-1} U : Dim x K, dgemm + B := 1 + V C : K x K, copy + D := V S^{-1} : K x LDS, copy + U : LDS x K, + V : K x Dim + tmp := B^{-1} D : K x LDS, dgemm + S = S - C tmp : Dim x LDS, dgemm +*/ +uint32_t qmckl_woodbury_k(const uint64_t vLDS, + const uint64_t vDim, + const uint64_t N_updates, + const double *__restrict __attribute__((aligned(8))) Updates, + const uint64_t *__restrict Updates_index, + const double breakdown, + double *__restrict __attribute__((aligned(8))) Slater_inv, + double *__restrict determinant) { + + const uint32_t Dim = 21; + const uint32_t LDS = 24; + + // Compute C = S^{-1} U : Dim x K : standard dgemm + double C[Dim * N_updates]; + double alpha = 1.0, beta = 0.0; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + Dim, N_updates, LDS, + alpha, Slater_inv, LDS, Updates, LDS, + beta, C, N_updates); + + // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD + // Construct D = V S^{-1} : K x LDS + double B[N_updates * N_updates], D[N_updates * LDS]; + for (uint32_t i = 0; i < N_updates; i++) { + const uint32_t row = Updates_index[i] - 1; + for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j); + for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j]; + } + + // Compute determinant by LU decomposition + int ipiv[N_updates]; + lapack_int ret; + ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv); + if (ret != 0) return ret; + double det = 1.0; + int j = 0; + for (uint32_t i = 0; i < N_updates; i++) { + j += min(abs(ipiv[i] - i), 1); + det *= B[(N_updates + 1) * i]; + } + if (j & 1 == 0) { + det = -det; + } + + // Check if determinant of B is not too close to zero + if (fabs(det) < breakdown) { + return 1; + } + + // Update det(Slater) if passed + if (determinant) *determinant *= det; + + // Compute B^{-1} with explicit formula for K x K inversion + ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); + if (ret != 0) return ret; + + // tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm + double tmp[N_updates * LDS]; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + N_updates, LDS, N_updates, + alpha, B, N_updates, D, LDS, + beta, tmp, LDS); + + // Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm + alpha = -1.0, beta = 1.0; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + Dim, LDS, N_updates, + alpha, C, N_updates, tmp, LDS, + beta, Slater_inv, LDS); + + return 0; +} + + uint32_t qmckl_slagel_splitting( const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates, const double *__restrict __attribute__((aligned(8))) Updates, diff --git a/independent_test_harness/test.c b/independent_test_harness/test.c index e464c9a..d6f05dc 100644 --- a/independent_test_harness/test.c +++ b/independent_test_harness/test.c @@ -41,10 +41,8 @@ printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n"); printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n"); printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); // FOR EACH UPDATE CYCLE DO: - for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) { - // for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) { - // for (uint32_t cycles_index = 65; cycles_index < 66; cycles_index++) { - // for (uint32_t cycles_index = 8055; cycles_index < 8056; cycles_index++) { + // for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) { + for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) { // 1. READ DATA FROM DATASET uint32_t cycle = cycles[cycles_index]; sprintf(nupds_key, "/cycle_%d/nupdates", cycle); @@ -202,6 +200,21 @@ printf("#----------------------------------------------------------------------- // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR accumulator += (double)(after - before); + } else if (version[0] == 'k') { // Woodbury K + + // 1. FETCH START TIME + uint64_t before = rdtsc(); + + // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS + err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates, + Updates_index, breakdown, Slater_invT_copy, &determinant); + + // 3. FETCH FINISH TIME + uint64_t after = rdtsc(); + + // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR + accumulator += (double)(after - before); + } else if (version[0] == 's') { // Splitting // 1. FETCH START TIME diff --git a/independent_test_harness/three b/independent_test_harness/three new file mode 100644 index 0000000..7e4b6be --- /dev/null +++ b/independent_test_harness/three @@ -0,0 +1,9 @@ +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0 +#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +#1 2 3 4 5 6 7 8 9 10 11 12 13 14 +#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC +#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------