From bba5cf5f2c12e8820e718ac272cc0133c8f20258 Mon Sep 17 00:00:00 2001 From: Francois Coppens Date: Sat, 1 Oct 2022 18:54:18 +0200 Subject: [PATCH] Improved version. - All static arrays replaced by dynamic ones - All overhead induced by checking before and after running of the kernels replaced as much as possible with calls to MKL/DGEMMs. - Solved bugs due to dimension mismatches. Overhead time is dramatically reduced because no more calls to naive 'matmul'. --- independent_test_harness/Makefile | 1 + .../bench_cycles/1_cycle.h | 2 + independent_test_harness/check_result.sh | 16 +- independent_test_harness/cycles.h | 2 +- independent_test_harness/kernels.h | 2 - independent_test_harness/meuk.c | 137 +++++++++---- independent_test_harness/meuk.h | 14 +- independent_test_harness/sm.c | 112 ++++++----- independent_test_harness/test.c | 183 ++++++------------ 9 files changed, 252 insertions(+), 217 deletions(-) create mode 100644 independent_test_harness/bench_cycles/1_cycle.h diff --git a/independent_test_harness/Makefile b/independent_test_harness/Makefile index e723fb0..bc810d3 100644 --- a/independent_test_harness/Makefile +++ b/independent_test_harness/Makefile @@ -1,6 +1,7 @@ #FC = ifx #CC = nvc +#CFLAGS=-std=c99 -O0 -Wall -g -mp -target=gpu CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu INCLUDE=-I$(NVHPC_ROOT)/math_libs/include diff --git a/independent_test_harness/bench_cycles/1_cycle.h b/independent_test_harness/bench_cycles/1_cycle.h new file mode 100644 index 0000000..7dc8559 --- /dev/null +++ b/independent_test_harness/bench_cycles/1_cycle.h @@ -0,0 +1,2 @@ +const uint32_t n_cycles = 1; +uint32_t cycles[n_cycles] = {1}; diff --git a/independent_test_harness/check_result.sh b/independent_test_harness/check_result.sh index 4414603..33daaa1 100755 --- a/independent_test_harness/check_result.sh +++ b/independent_test_harness/check_result.sh @@ -1,5 +1,15 @@ #!/bin/bash -make -rm -v blocked kay cu -./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu + +rm -v blocked kay cu em + +make && + ./test m > em && \ + ./test b > blocked && \ + ./test k > kay && \ + ./test c > cu && \ + head -n 4 em && \ + awk 'NR==5' em && \ + awk 'NR==5' blocked && \ + awk 'NR==5' kay && \ + awk 'NR==5' cu diff --git a/independent_test_harness/cycles.h b/independent_test_harness/cycles.h index 811fdf3..9b3c53f 120000 --- a/independent_test_harness/cycles.h +++ b/independent_test_harness/cycles.h @@ -1 +1 @@ -cycles_329_dets/all_cycles.h \ No newline at end of file +bench_cycles/1_cycle.h \ No newline at end of file diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h index e279250..8d671c6 100644 --- a/independent_test_harness/kernels.h +++ b/independent_test_harness/kernels.h @@ -1,8 +1,6 @@ #include #include -#define DIM 21 -#define LDS 24 #define HAVE_CUBLAS_OFFLOAD #ifdef HAVE_CUBLAS_OFFLOAD diff --git a/independent_test_harness/meuk.c b/independent_test_harness/meuk.c index e103604..85c7d86 100644 --- a/independent_test_harness/meuk.c +++ b/independent_test_harness/meuk.c @@ -2,6 +2,89 @@ #include #include +void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim) { + for (uint32_t i = 0; i < Dim; i++) { + for (uint32_t j = 0; j < Lds; j++) { + if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j]; + else Slater_invT_copy[i * Lds + j] = 0.0; + } + } +} + +void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds) { + for (int i = 0; i < nupds; i++) { + int col = ui[i] - 1; + for (int j = 0; j < Dim; j++) { + slaterT[col + j * Dim] += upds[i * Lds + j]; + } + } +} + +void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds) { + for (int i = 0; i < nupds; i++) { + int col = ui[i] - 1; + for (int j = 0; j < Lds; j++) { + upds[i * Lds + j] -= slaterT[col + j * Dim]; + } + } +} + +double get_determinant(uint32_t cycle, hid_t file_id) { + char det_key[32]; + sprintf(det_key, "/cycle_%d/determinant", cycle); + double determinant; + read_double(file_id, det_key, &determinant); + return determinant; +} + +double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) { + char slater_inv_key[32]; + sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle); + double *slater_inv = malloc(sizeof *slater_inv * Dim * Lds); + read_double(file_id, slater_inv_key, slater_inv); + return slater_inv; +} + +double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) { + char slater_key[32]; + sprintf(slater_key, "/cycle_%d/slater_matrix", cycle); + double *slater = malloc(sizeof *slater * Dim * Lds); + read_double(file_id, slater_key, slater); + return slater; +} + +double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds) { + char upds_key[32]; + sprintf(upds_key, "/cycle_%d/updates", cycle); + double *upds = malloc(sizeof *upds * Lds * nupds); + read_double(file_id, upds_key, upds); + return upds; +} + +uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds) { + char upd_idx_key[32]; + sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle); + uint64_t* uis = malloc(sizeof *uis * nupds); + read_uint(file_id, upd_idx_key, uis); + return uis; +} + +uint64_t get_dim(uint32_t cycle, hid_t file_id) { + char dim_key[32]; + sprintf(dim_key, "/cycle_%d/slater_matrix_dim", cycle); + uint64_t Dim; + read_uint(file_id, dim_key, &Dim); + return Dim; +} + +uint64_t get_nupdates(uint32_t cycle, hid_t file_id) { + char nupds_key[32]; + sprintf(nupds_key, "/cycle_%d/nupdates", cycle); + uint64_t N_updates; + read_uint(file_id, nupds_key, &N_updates); + return N_updates; +} + double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) { double sum2 = 0; for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i]; @@ -65,16 +148,12 @@ void update_slater_matrix(const uint64_t Lds, const uint64_t Dim, uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT, double *Slater, const double tolerance) { - double res[Dim*Dim]; - - for (uint32_t i = 0; i < Dim; i++) { - for (uint32_t j = 0; j < Dim; j++) { - res[i * Dim + j] = 0; - for (uint32_t k = 0; k < Dim; k++) { - res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j]; - } - } - } + double* res = malloc(sizeof *res * Dim * Dim); + double alpha = 1.0, beta = 0.0; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + Dim, Dim, Dim, + alpha, Slater, Dim, Slater_invT, Lds, + beta, res, Dim); for (uint32_t i = 0; i < Dim; i++) { for (uint32_t j = 0; j < Dim; j++) { @@ -84,20 +163,20 @@ uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT if (i != j && fabs(elm) > tolerance) return 1; } } - + free(res); return 0; } -void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) { - for (uint32_t i = 0; i < Dim; i++) { - for (uint32_t j = 0; j < Dim; j++) { - prod[i * Dim + j] = 0; - for (uint32_t k = 0; k < Dim; k++) { - prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j]; - } - } - } -} +//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) { +// for (uint32_t i = 0; i < Dim; i++) { +// for (uint32_t j = 0; j < Dim; j++) { +// prod[i * Dim + j] = 0; +// for (uint32_t k = 0; k < Dim; k++) { +// prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j]; +// } +// } +// } +//} int32_t check_error_better(const double max, const double tolerance) { if (max < 0) return -1; // When max was a NaN @@ -119,21 +198,7 @@ uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim, const uint64_t *Updates_index, const double breakdown, const double tolerance, double *Slater, double *Slater_inv, double *determinant) { uint32_t rc = 0; - // if (version[0] == 'a') { // Anthony - // const double *Upds; - // const uint64_t *Ui; - // for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant; - // for (int j = 0; j < N_updates; j++) { - // Upds = &Updates[j * Lds]; - // Ui = &Updates_index[j]; - // detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant); - // if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n"); - // } - // for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant; - // update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater); - // rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance); - // if (rc != 0) printf("TEST_KERNEL: check_error failed\n"); - // } else if (version[0] == 'n') { // Naive + if (version[0] == 'n') { // Naive rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant); diff --git a/independent_test_harness/meuk.h b/independent_test_harness/meuk.h index 5e9a9e0..1b7688f 100644 --- a/independent_test_harness/meuk.h +++ b/independent_test_harness/meuk.h @@ -5,6 +5,7 @@ #include #include #include + #include "kernels.h" typedef struct Error { @@ -12,7 +13,18 @@ typedef struct Error { uint64_t error; } Error; -void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim); +void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim); +void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds); +void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds); +double get_determinant(uint32_t cycle, hid_t file_id); +double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds); +double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds); +double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds); +uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds); +uint64_t get_dim(uint32_t cycle, hid_t file_id); +uint64_t get_nupdates(uint32_t cycle, hid_t file_id); + +//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim); void residual(double *a, double *res, const uint64_t Dim); double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim); double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim); diff --git a/independent_test_harness/sm.c b/independent_test_harness/sm.c index e6570e1..362eded 100644 --- a/independent_test_harness/sm.c +++ b/independent_test_harness/sm.c @@ -2,7 +2,7 @@ #include #include #include "kernels.h" -#include "debug.h" +//#include "debug.h" extern uint64_t n_splits; extern uint64_t block_fail; @@ -19,11 +19,11 @@ uint32_t qmckl_sherman_morrison( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; - double __attribute__((aligned(8))) C[DIM]; - double __attribute__((aligned(8))) D[LDS]; + double __attribute__((aligned(8))) C[Dim]; + double __attribute__((aligned(8))) D[Lds]; uint32_t l = 0; // For each update @@ -48,7 +48,7 @@ uint32_t qmckl_sherman_morrison( double iden = 1.0 / den; // Update det(A) - if (!determinant) + if (determinant) *determinant *= den; #pragma ivdep @@ -89,14 +89,14 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; const uint32_t row1 = (Updates_index[0] - 1); const uint32_t row2 = (Updates_index[1] - 1); // Compute C = (S^T)^{-1}U : Dim x 2 - double __attribute__((aligned(8))) C[2 * DIM]; + double __attribute__((aligned(8))) C[2 * Dim]; for (uint32_t i = 0; i < Dim; i++) { C[i * 2] = 0; C[i * 2 + 1] = 0; @@ -132,7 +132,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, Binv[3] = idet * B0; // tmp = B^{-1}D : 2 x LDS - double __attribute__((aligned(8))) tmp[2 * LDS]; + double __attribute__((aligned(8))) tmp[2 * Lds]; double *__restrict r1dim = &(Slater_inv[row1 * Lds]); double *__restrict r2dim = &(Slater_inv[row2 * Lds]); #pragma ivdep @@ -173,15 +173,15 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; const uint32_t row1 = (Updates_index[0] - 1); const uint32_t row2 = (Updates_index[1] - 1); const uint32_t row3 = (Updates_index[2] - 1); // Compute C = (S^T)^{-1}U : Dim x 3 - double __attribute__((aligned(8))) C[3 * DIM]; + double __attribute__((aligned(8))) C[3 * Dim]; for (uint32_t i = 0; i < Dim; i++) { C[i * 3] = 0; C[i * 3 + 1] = 0; @@ -231,10 +231,10 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, Binv[8] = (B0 * B4 - B3 * B1) * idet; // tmp = B^{-1}D : 3 x LDS - double __attribute__((aligned(8))) tmp[3 * LDS]; - double *__restrict r1dim = &(Slater_inv[row1 * LDS]); - double *__restrict r2dim = &(Slater_inv[row2 * LDS]); - double *__restrict r3dim = &(Slater_inv[row3 * LDS]); + double __attribute__((aligned(8))) tmp[3 * Lds]; + double *__restrict r1dim = &(Slater_inv[row1 * Lds]); + double *__restrict r2dim = &(Slater_inv[row2 * Lds]); + double *__restrict r3dim = &(Slater_inv[row3 * Lds]); #pragma ivdep #pragma vector aligned for (uint32_t j = 0; j < Lds; j++) { @@ -277,11 +277,11 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; // Compute C = S^{-1} U : Dim x K : standard dgemm - double *C = calloc(1, DIM * N_updates * sizeof(double)); + double *C = calloc(1, Dim * N_updates * sizeof(double)); double alpha = 1.0, beta = 0.0; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, Dim, N_updates, Lds, @@ -289,7 +289,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, beta, C, N_updates); // Construct B = 1 + V C : K x K, construct D = V S^{-1} : K x LDS - double B[N_updates * N_updates], D[N_updates * LDS]; + double* B = calloc(1, sizeof *B * N_updates * N_updates); + double* D = calloc(1, sizeof *D * N_updates * Lds); for (uint32_t i = 0; i < N_updates; i++) { const uint32_t row = Updates_index[i] - 1; for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j); @@ -316,19 +317,23 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, pivot); // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm - double tmp1[N_updates * LDS]; + double* tmp1 = calloc(1, sizeof *tmp1 * N_updates * Lds); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - N_updates, LDS, N_updates, - alpha, B, N_updates, D, LDS, - beta, tmp1, LDS); + N_updates, Lds, N_updates, + alpha, B, N_updates, D, Lds, + beta, tmp1, Lds); // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm alpha = -1.0, beta = 1.0; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - Dim, LDS, N_updates, - alpha, C, N_updates, tmp1, LDS, - beta, Slater_inv, LDS); + Dim, Lds, N_updates, + alpha, C, N_updates, tmp1, Lds, + beta, Slater_inv, Lds); + free(C); + free(B); + free(D); + free(tmp1); free(pivot); return 0; } @@ -344,10 +349,12 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand double* Slater_inv, double* determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; - double alpha, beta; + bool swap; + uint32_t j; + double alpha, beta, det; int* pivot = calloc(1, sizeof *pivot * N_updates); double* C = calloc(1, sizeof *C * Dim * N_updates); double* B = calloc(1, sizeof *B * N_updates * N_updates); @@ -396,19 +403,22 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand // Compute determinant by LU decomposition (void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info); - bool swap = false; uint32_t j = 0; double det = 1.0f; + swap = false; j = 0; det = 1.0f; #pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det) for (uint32_t i = 0; i < N_updates; i++) { swap = (bool)(pivot[i] - (i + 1)); // swap = {0->false: no swap, >0->true: swap} j += (uint32_t)swap; // count # of swaps det *= B[i * (N_updates + 1)]; // prod. of diag elm. of B } - if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early. + } + if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early. if (determinant) { // update det(Slater) if determinant != NULL if ((j & 1) != 0) det = -det; // multiply det with -1 if # of swaps is odd *determinant *= det; } - + + #pragma omp target data use_device_ptr(Slater_inv, Updates, C, B, workspace, pivot, Binv, D, T1, T2) + { // Compute B^{-1} : initialise as I for solving BX=I #pragma omp target teams distribute parallel for for (int i = 0; i < N_updates; ++i) { @@ -471,11 +481,11 @@ uint32_t qmckl_slagel_splitting( uint64_t *__restrict later_index, uint64_t *__restrict later, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; - double __attribute__((aligned(8))) C[LDS]; - double __attribute__((aligned(8))) D[LDS]; + double __attribute__((aligned(8))) C[Lds]; + double __attribute__((aligned(8))) D[Lds]; uint32_t l = 0; // For each update @@ -513,7 +523,7 @@ uint32_t qmckl_slagel_splitting( } // From here onwards we continue with applying the first halve of the update to Slater_inv double iden = 1.0 / den; - if (!determinant) *determinant *= den; + if (determinant) *determinant *= den; // D = v^T x S^{-1} : 1 x LDS #pragma ivdep @@ -544,10 +554,10 @@ uint32_t qmckl_sherman_morrison_splitting( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; - double __attribute__((aligned(8))) later_updates[LDS * N_updates]; + double __attribute__((aligned(8))) later_updates[Lds * N_updates]; uint64_t later_index[N_updates]; uint64_t later = 0; // uint32_t rc; @@ -583,10 +593,10 @@ uint32_t qmckl_sherman_morrison_smw32s( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; - double __attribute__((aligned(8))) later_updates[LDS * N_updates]; + double __attribute__((aligned(8))) later_updates[Lds * N_updates]; uint64_t later_index[N_updates]; uint64_t later = 0; uint32_t rc; @@ -702,13 +712,13 @@ uint32_t qmckl_sherman_morrison_later( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = DIM; - const uint32_t Lds = LDS; + const uint32_t Dim = vDim; + const uint32_t Lds = vLDS; - double __attribute__((aligned(8))) C[DIM]; - double __attribute__((aligned(8))) D[LDS]; + double __attribute__((aligned(8))) C[Dim]; + double __attribute__((aligned(8))) D[Lds]; - double __attribute__((aligned(8))) later_updates[LDS * N_updates]; + double __attribute__((aligned(8))) later_updates[Lds * N_updates]; uint64_t later_index[N_updates]; uint64_t later = 0; @@ -743,7 +753,7 @@ uint32_t qmckl_sherman_morrison_later( } double iden = 1.0 / den; - if (!determinant) *determinant *= den; + if (determinant) *determinant *= den; // D = v^T x A^{-1} #pragma ivdep diff --git a/independent_test_harness/test.c b/independent_test_harness/test.c index 37b8133..62847c7 100644 --- a/independent_test_harness/test.c +++ b/independent_test_harness/test.c @@ -1,9 +1,9 @@ - #include +#include #include "meuk.h" #include "cycles.h" +#include "debug.h" -#define DATASET "dataset_329d_zeropadded_cm.hdf5" -// #define DATASET "dataset_15784d_zeropadded_cm.hdf5" +#define DATASET "dataset" #define REPETITIONS 1 uint64_t n_splits; @@ -27,77 +27,46 @@ int main(int argc, char **argv) { } #endif - // SETUP STORAGE AND DATA ACCESS - hid_t file_id; - file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT); - char nupds_key[32]; - char upd_idx_key[32]; - char upds_key[32]; - char slater_key[32]; - char slater_inv_key[32]; - char det_key[32]; - const uint64_t Dim = DIM; - const uint64_t Lds = LDS; - uint64_t N_updates; - double Slater[LDS * DIM ], SlaterT[LDS * DIM]; - double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM]; - double determinant, determinant_copy; + hid_t file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT); - // SETUP TEST PARAMETERS - const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small - const double tolerance = 0.001; // default = 0.001 - double cumulative = 0; - -printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); -printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n"); -printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n"); -printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); + printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); + printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n"); + printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n"); + printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); // FOR EACH UPDATE CYCLE DO: // for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) { - for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) { + for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) { + + // SETUP TEST PARAMETERS + const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small + const double tolerance = 0.001; // default = 0.001 + double cumulative = 0; + // 1. READ DATA FROM DATASET uint32_t cycle = cycles[cycles_index]; - sprintf(nupds_key, "/cycle_%d/nupdates", cycle); - sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle); - sprintf(upds_key, "/cycle_%d/updates", cycle); - sprintf(slater_key, "/cycle_%d/slater_matrix", cycle); - sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle); - sprintf(det_key, "/cycle_%d/determinant", cycle); - read_uint(file_id, nupds_key, &N_updates); - uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t)); - double *Updates = malloc(Lds * N_updates * sizeof(double)); - read_uint(file_id, upd_idx_key, Updates_index); - read_double(file_id, upds_key, Updates); - read_double(file_id, slater_key, Slater); - read_double(file_id, slater_inv_key, Slater_invT); - read_double(file_id, det_key, &determinant); + uint64_t Dim = get_dim(cycle, file_id); + uint64_t Lds = Dim; + uint64_t N_updates = get_nupdates(cycle, file_id); + uint64_t* Updates_index = get_upd_idcs(cycle, file_id, N_updates); + double* Updates = get_upds(cycle, file_id, N_updates, Lds); + double* Slater = get_slater(cycle, file_id, Dim, Lds); + double* Slater_invT = get_slater_inv(cycle, file_id, Dim, Lds); + double *Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds); + double determinant = get_determinant(cycle, file_id), determinant_copy; // Compute transpose of S. ST: 24 x 21 - for (int i = 0; i < Lds; i++) { - for (int j = 0; j < Dim; j++) { - SlaterT[i * Dim + j] = Slater[j * Lds + i]; - } - } + double *SlaterT = calloc(1, sizeof *SlaterT * Dim * Lds); + transpose(Slater, Lds, SlaterT, Dim, Dim, Lds); // Convert repl. upds into additive upds. - for (int i = 0; i < N_updates; i++) { - int col = Updates_index[i] - 1; - for (int j = 0; j < Lds; j++) { - Updates[i * Lds + j] -= SlaterT[col + j * Dim]; - } - } + convert(Updates, N_updates, Updates_index, SlaterT, Dim, Lds); // 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance); // Update Slater matrix - for (int i = 0; i < N_updates; i++) { - int col = Updates_index[i] - 1; - for (int j = 0; j < Dim; j++) { - SlaterT[col + j * Dim] += Updates[i * Lds + j]; - } - } // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB + update(SlaterT, Updates, Updates_index, N_updates, Dim, Lds); int32_t err_break; @@ -116,53 +85,13 @@ printf("#----------------------------------------------------------------------- determinant_copy = determinant; // ### CHOOSE A KERNEL: - // if (version[0] == 'a') { // Anthony - // const double *Upds; - // const uint64_t *Ui; - // double determinant_previous; - - // err_break = 0; - - // for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat) - - // for (int i = 0; i < N_updates; i++) { - // Upds = &Updates[i * Lds]; - // Ui = &Updates_index[i]; - // determinant_previous = determinant_copy; - - // // 1. FETCH START TIME - // uint64_t before = rdtsc(); - - // // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - // detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy); - - // // 3. FETCH FINISH TIME - // uint64_t after = rdtsc(); - - // // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR - // accumulator += (double)(after - before); - - // // 5. STOP APPLYING UPDATES IF BREAKDOWN DETECTED - // double lambda = determinant_copy / determinant_previous; // should be id. to lambda in detupd - // if (fabs(lambda) < breakdown) { - // err_break = 1; - // break; - // } - // } - - // if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again - // for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous; - // } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again - // for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy; - // } - // } else if (version[0] == 'n') { // Naive if (version[0] == 'n') { // Naive // 1. FETCH START TIME uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, - Updates_index, breakdown, Slater_invT_copy, &determinant); + Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -176,7 +105,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates, - Updates_index, breakdown, Slater_invT_copy, &determinant); + Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -190,7 +119,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index, - breakdown, Slater_invT_copy, &determinant); + breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -205,7 +134,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index, - breakdown, Slater_invT_copy, &determinant); + breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -220,7 +149,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates, - Updates_index, breakdown, Slater_invT_copy, &determinant); + Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -236,7 +165,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_woodbury_k_cublas_offload(handle, s_handle, Lds, Dim, N_updates, Updates, - Updates_index, breakdown, Slater_invT_copy, &determinant); + Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -251,7 +180,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates, - Updates_index, breakdown, Slater_invT_copy, &determinant); + Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -265,7 +194,7 @@ printf("#----------------------------------------------------------------------- // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates, - Updates_index, breakdown, Slater_invT_copy, &determinant); + Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME uint64_t after = rdtsc(); @@ -275,8 +204,8 @@ printf("#----------------------------------------------------------------------- } else if (version[0] == 'm') { // LAPACK/MKL // Only send upper Dim x Dim part of matrix to lapack - double tmp[DIM *DIM]; - memcpy(tmp, SlaterT, Dim*Dim*sizeof(double)); + double *tmp = malloc(sizeof *tmp * Dim * Dim); + memcpy(tmp, SlaterT, sizeof *SlaterT * Dim * Dim); // 1. FETCH START TIME uint64_t before = rdtsc(); @@ -288,12 +217,8 @@ printf("#----------------------------------------------------------------------- uint64_t after = rdtsc(); // Copy elements of inverse back, adding 0-padding in "correct" place - for (uint32_t i = 0; i < Dim; i++) { - for (uint32_t j = 0; j < Lds; j++) { - if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j]; - else Slater_invT_copy[i * Lds + j] = 0.0; - } - } + copy(Slater_invT_copy, Lds, tmp, Dim); + free(tmp); // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR accumulator += (double)(after - before); @@ -302,9 +227,10 @@ printf("#----------------------------------------------------------------------- printf("Version '%c' not implemented.\n", version[0]); return 1; } + } // END OF REPETITIONS LOOP - // 4. COPY RESULT BACK TO ORIGINAL + // 4. COPY RESULT BACK TO ORIGINAL memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double)); determinant = determinant_copy; // At this point Slater_invT contains the correct inverse matrix @@ -321,11 +247,19 @@ printf("#----------------------------------------------------------------------- // CUMULATIVE RESULT FOR THE ENTIRE DATASET cumulative += accumulator; - double SSi[DIM * DIM]; - matmul(SlaterT, Slater_invT, SSi, Lds, Dim); - double Res[DIM * DIM]; + // Compute Slater x Slater_inv + double* SSi = malloc(sizeof *SSi * Dim * Dim); + double alpha = 1.0, beta = 0.0; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + Dim, Dim, Dim, + alpha, SlaterT, Dim, Slater_invT, Lds, + beta, SSi, Dim); + + // Compute residual matrix S * Sinv - Id + double* Res = calloc(1, sizeof *Res * Dim * Dim); residual(SSi, Res, Dim); const double max = max_norm(Res, Dim, Dim); + free(SSi); // 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance); @@ -336,13 +270,16 @@ printf("#----------------------------------------------------------------------- // 8. COMPUTE CONDITION NUMBER const double condnr = condition_number(Slater, Slater_invT, Lds, Dim); const double frob = frobenius_norm(Res, Dim, Dim); - + free(Res); // 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS - printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls); - + printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls); + free(Updates_index); free(Updates); + free(SlaterT); + free(Slater_invT); + free(Slater_invT_copy); } // END OF CYCLE LOOP printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); @@ -352,7 +289,7 @@ printf("#----------------------------------------------------------------------- (void) H5Fclose(file_id); -#ifdef HAVE_CUBLAS_OFFLOAD +#ifdef HAVE_CUBLAS_OFFLOAD cublasDestroy(handle); cusolverDnDestroy(s_handle); #endif