From 00bdcba2301c78f1b93a8ae4ce2891c1b241fea7 Mon Sep 17 00:00:00 2001 From: Francois Coppens Date: Thu, 22 Sep 2022 14:37:00 +0200 Subject: [PATCH] cuBLAS version of Woodbury KxK is working, but called to lapacke dgetrf/ri need to be replaced with cuSOLVER calls to eliminate intermediate results to be transfered to/from device. --- independent_test_harness/Makefile | 12 +- .../cblasdgemm_vs_cublasdgemm_test.c | 27 +- independent_test_harness/debug.h | 22 +- independent_test_harness/kernels.h | 3 +- independent_test_harness/meuk.h | 2 +- independent_test_harness/sm.c | 251 ++++++++---------- independent_test_harness/test.c.old | 92 ------- independent_test_harness/test.c.older | 62 ----- 8 files changed, 157 insertions(+), 314 deletions(-) delete mode 100644 independent_test_harness/test.c.old delete mode 100644 independent_test_harness/test.c.older diff --git a/independent_test_harness/Makefile b/independent_test_harness/Makefile index 97d2a17..fc0c88e 100644 --- a/independent_test_harness/Makefile +++ b/independent_test_harness/Makefile @@ -1,11 +1,15 @@ -FC = ifx -CC = nvc +#FC = ifx +#CC = nvc CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu -LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl +INCLUDE=-I$(NVHPC_ROOT)/math_libs/include + +LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -LDFLAGS+=-lcublas -mp -target=gpu +LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -mp -target=gpu + +all: test ## Link with icc # test: sm.o test.o detupdate21.o meuk.o diff --git a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c index b190518..bee638c 100644 --- a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c +++ b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c @@ -1,18 +1,21 @@ /* Compile with: -nvc -L${MKLROOT}/lib/intel64 \ - -lmkl_intel_lp64 \ - -lmkl_sequential \ - -lmkl_core \ - -lpthread \ - -lm \ - -ldl \ - -lcublas \ - -mp \ - -target=gpu \ - cblasdgemm_vs_cublasdgemm_test.c \ - -o cblasdgemm_vs_cublasdgemm_test +nvc \ +-I$NV_CUDA_MATH_PATH/11.7/include \ +-L$NV_CUDA_MATH_PATH/11.7/lib64 \ +-L${MKLROOT}/lib/intel64 \ +-lmkl_intel_lp64 \ +-lmkl_sequential \ +-lmkl_core \ +-lpthread \ +-lm \ +-ldl \ +-lcublas \ +-mp \ +-target=gpu \ +cblasdgemm_vs_cublasdgemm_test.c \ +-o cblasdgemm_vs_cublasdgemm_test */ diff --git a/independent_test_harness/debug.h b/independent_test_harness/debug.h index 378b8e4..4ec98fa 100644 --- a/independent_test_harness/debug.h +++ b/independent_test_harness/debug.h @@ -1,11 +1,29 @@ -void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name) +#include +#include +#include + +void print_dm(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name) { printf("%s = \n", name); for (uint16_t i = 0; i < m; ++i) { for (uint16_t j = 0; j < n; ++j) { - printf("%11.5f ", mat[i * ldm + j]); + printf("%9.3f ", mat[i * ldm + j]); + } + printf("\n"); + } + printf("\n"); +} + +void print_im(const int* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name) +{ + printf("%s = \n", name); + for (uint16_t i = 0; i < m; ++i) + { + for (uint16_t j = 0; j < n; ++j) + { + printf("%d ", mat[i * ldm + j]); } printf("\n"); } diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h index c1b090c..95fc5b1 100644 --- a/independent_test_harness/kernels.h +++ b/independent_test_harness/kernels.h @@ -7,8 +7,9 @@ #ifdef HAVE_CUBLAS_OFFLOAD #include -#include +#include #include +#include #endif lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds); diff --git a/independent_test_harness/meuk.h b/independent_test_harness/meuk.h index 02432c0..5e9a9e0 100644 --- a/independent_test_harness/meuk.h +++ b/independent_test_harness/meuk.h @@ -4,7 +4,7 @@ #include #include #include -#include "hdf5.h" +#include #include "kernels.h" typedef struct Error { diff --git a/independent_test_harness/sm.c b/independent_test_harness/sm.c index 682aa32..cad422b 100644 --- a/independent_test_harness/sm.c +++ b/independent_test_harness/sm.c @@ -299,9 +299,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, // Compute determinant by LU decomposition int ipiv[N_updates]; - lapack_int ret; - ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv); - if (ret != 0) return ret; + (void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv); + double det = 1.0; int j = 0; for (uint32_t i = 0; i < N_updates; i++) { @@ -319,8 +318,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, if (determinant) *determinant *= det; // Compute B^{-1} with explicit formula for K x K inversion - ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); - if (ret != 0) return ret; + (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm double tmp1[N_updates * LDS]; @@ -328,173 +326,146 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, N_updates, LDS, N_updates, alpha, B, N_updates, D, LDS, beta, tmp1, LDS); - print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas"); // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm - // alpha = -1.0, beta = 1.0; - // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - // Dim, LDS, N_updates, - // alpha, C, N_updates, tmp1, LDS, - // beta, Slater_inv, LDS); - - double *tmp2 = calloc(1, DIM * LDS * sizeof(double)); - cblas_dgemm(CblasRowMajor, - CblasNoTrans, CblasNoTrans, + alpha = -1.0, beta = 1.0; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Dim, LDS, N_updates, alpha, C, N_updates, tmp1, LDS, - beta, tmp2, LDS); - print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas"); - - double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); - for(int i = 0; i < DIM * LDS; ++i) - { - tmp3[i] = Slater_inv[i] - tmp2[i]; - } - print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas"); - - for(int i = 0; i < DIM * LDS; ++i) - { - Slater_inv[i] = tmp3[i]; - } - - free(tmp2); - free(tmp3); + beta, Slater_inv, LDS); return 0; } #ifdef HAVE_CUBLAS_OFFLOAD uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle, - const uint64_t vLDS, - const uint64_t vDim, - const uint64_t N_updates, - const double *__restrict __attribute__((aligned(8))) Updates, - const uint64_t *__restrict Updates_index, - const double breakdown, - double *__restrict __attribute__((aligned(8))) Slater_inv, - double *__restrict determinant) { - + const uint64_t vLDS, + const uint64_t vDim, + const uint64_t N_updates, + const double* Updates, + const uint64_t* Updates_index, + const double breakdown, + double* Slater_inv, + double* determinant) +{ const uint32_t Dim = DIM; const uint32_t Lds = LDS; - // Compute C = S^{-1} U : Dim x K : standard dgemm - double *C = calloc(1, DIM * N_updates * sizeof(double)); - double alpha = 1.0f, beta = 0.0f; - #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates]) - #pragma omp target data use_device_ptr(Slater_inv, Updates, C) - { - int cublasError = cublasDgemm(handle, - CUBLAS_OP_T, CUBLAS_OP_N, - 15, 21, 24, - &alpha, Updates, 24, Slater_inv, 24, - &beta, C, 15); - } - #pragma omp target exit data map(from:C[0:DIM*N_updates]) + double alpha, beta; + int* ipiv = calloc(1, sizeof *ipiv * N_updates); + double* C = calloc(1, sizeof *C * Dim * N_updates); + double* B = calloc(1, sizeof *B * N_updates * N_updates); + double* D = calloc(1, sizeof *D * N_updates * Lds); + double* T1 = calloc(1, sizeof *T1 * N_updates * Lds); + double* T2 = calloc(1, sizeof *T2 * Dim * Lds); - // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD - // Construct D = V S^{-1} : K x LDS - double *B = calloc(1, N_updates * N_updates * sizeof(double)); - double *D = calloc(1, N_updates * LDS * sizeof(double)); - for (uint32_t i = 0; i < N_updates; i++) { - const uint32_t row = Updates_index[i] - 1; - for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j); - for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j]; + #pragma omp target enter data map(to: Updates[0:Lds*N_updates], \ + Updates_index[0:N_updates], \ + Slater_inv[0:Dim*Lds]) \ + map(alloc: B[0:N_updates*N_updates], \ + C[0:Dim*N_updates], \ + D[0:N_updates*Lds], \ + T1[0:N_updates*Lds], \ + T2[0:Dim*Lds], \ + ipiv[0:N_updates]) + + #pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE + { + alpha = 1.0f, beta = 0.0f; + (void) cublasDgemm(handle, + CUBLAS_OP_T, CUBLAS_OP_N, + N_updates, Dim, Lds, + &alpha, Updates, Lds, Slater_inv, Lds, + &beta, C, N_updates); } + // Construct B = 1 + V C : K x K + // Construct D = V S^{-1} : K x LDS + #pragma omp target teams distribute parallel for // compute B, D ON DEVICE + for (uint32_t i = 0; i < N_updates; i++) + { + const uint32_t row = Updates_index[i] - 1; + for (uint32_t j = 0; j < N_updates ; j++) + { + B[i * N_updates + j] = C[row * N_updates + j] + (i == j); + } + for (uint32_t j = 0; j < Lds; j++) + { + D[i * Lds + j] = Slater_inv[row * Lds + j]; + } + } + #pragma omp target update from(B[0:N_updates*N_updates]) // Update B ON HOST + // Compute determinant by LU decomposition - int ipiv[N_updates]; - lapack_int ret; - ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv); - if (ret != 0) return ret; - double det = 1.0; - int j = 0; - for (uint32_t i = 0; i < N_updates; i++) { + (void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv); + #pragma omp target update to(B[0:N_updates*N_updates], ipiv[0:N_updates]) // Update B ON DEVICE + + double det = 1.0f; uint32_t j = 0; + #pragma omp target teams distribute parallel for // compute det ON DEVICE + for (uint32_t i = 0; i < N_updates; i++) + { j += min(ipiv[i] - i, 1); det *= B[(N_updates + 1) * i]; // update determinant } if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even // Check if determinant of B is not too close to zero - if (fabs(det) < breakdown) { - return 1; - } + if (fabs(det) < breakdown) return det; // Update det(Slater) if passed if (determinant) *determinant *= det; - // Compute B^{-1} with explicit formula for K x K inversion - ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); - if (ret != 0) return ret; + // Compute B^{-1} + (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST + #pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE - // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm - double *tmp1 = calloc(1, N_updates * LDS * sizeof(double)); - #pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS]) - #pragma omp target data use_device_ptr(D, B, tmp1) + // T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm + #pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE { - int cublasError = cublasDgemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - LDS, N_updates, N_updates, - &alpha, D, LDS, B, N_updates, - &beta, tmp1, LDS); - } - #pragma omp target exit data map(from:tmp1[0:N_updates*LDS]) - print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas"); - - // Compute tmp2 = C * tmp1 : Dim x LDS - double *tmp2 = calloc(1, DIM * LDS * sizeof(double)); - #pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS]) - #pragma omp target data use_device_ptr(tmp1, C, tmp2) - { - int cublasError = cublasDgemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - LDS, Dim, N_updates, - &alpha, tmp1, LDS, C, N_updates, - &beta, tmp2, LDS); - } - #pragma omp target exit data map(from:tmp2[0:DIM*LDS]) - print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas"); - - // Compute tmp3 = S^{-1} - tmp2 - double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); - beta = -1.0f; - #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS]) - #pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3) - { - int cublasError = cublasDgeam(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - DIM, LDS, - &alpha, Slater_inv, LDS, - &beta, tmp2, LDS, - tmp3, LDS); - } - #pragma omp target exit data map(from:tmp3[0:DIM*LDS]) - print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas"); - for(int i = 0; i < DIM * LDS; ++i) - { - Slater_inv[i] = tmp3[i]; + alpha = 1.0, beta = 0.0; + (void) cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + Lds, N_updates, N_updates, + &alpha, D, Lds, B, N_updates, + &beta, T1, Lds); } - // // Compute S^{-1} <- S^{-1} - tmp2 - // beta = -1.0f; - // #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS]) - // #pragma omp target data use_device_ptr(Slater_inv, tmp2) - // { - // int cublasError = cublasDgeam(handle, - // CUBLAS_OP_N, CUBLAS_OP_N, - // DIM, LDS, - // &alpha, Slater_inv, LDS, - // &beta, tmp2, LDS, - // Slater_inv, LDS); - // } - // #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS]) + // Compute T2 <- C * T1 : Dim x LDS : standard dgemm + #pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ONM DEVICE + { + alpha = 1.0f, beta = 0.0f; + (void) cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + Dim, Lds, N_updates, + &alpha, T1, Lds, C, N_updates, + &beta, T2, Lds); + } - + // Compute S^{-1} <- S^{-1} - T2 : Dim x LDS : standard dgemm + #pragma omp target teams distribute parallel for // compute S^-1 ON DEVICE + for (uint32_t i = 0; i < Dim * Lds; i++) + { + Slater_inv[i] = Slater_inv[i] - T2[i]; + } + #pragma omp target update from(Slater_inv[0:Dim*Lds]) // update S^-1 ON HOST + + #pragma omp target exit data map(delete: Updates[0:Lds*N_updates], \ + Updates_index[0:N_updates], \ + Slater_inv[0:Dim*Lds], \ + B[0:N_updates*N_updates], \ + C[0:Dim*N_updates], \ + D[0:N_updates*Lds], \ + T1[0:N_updates*Lds], \ + T2[0:Dim*Lds], \ + ipiv[0:N_updates]) + +// free(ipiv); free(B); free(C); free(D); - free(tmp1); - free(tmp2); - // free(tmp3); + free(T1); + free(T2); + return 0; } #endif @@ -529,7 +500,7 @@ uint32_t qmckl_slagel_splitting( // Denominator const int cui = Updates_index[l] - 1; - double den = 1.0 + C[cui]; + double den = 1.0 + C[cui]; // printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui); if (fabs(den) < breakdown) { // Here is decided to split the update, or not. // printf("Split! breakdown = %f\n", breakdown); @@ -662,7 +633,7 @@ uint32_t qmckl_sherman_morrison_smw32s( // And for the other cases != 4, 6 // Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with - // Woodbury 3x3 kernel + // Woodbury 3x3 kernel uint32_t n_of_3blocks = N_updates / 3; uint32_t remainder = N_updates % 3; uint32_t length_3block = 3 * Lds; @@ -799,7 +770,7 @@ uint32_t qmckl_sherman_morrison_later( } } l += 1; - } + } if (later == N_updates) { // If all the updates have failed, exit early with an error return 1; diff --git a/independent_test_harness/test.c.old b/independent_test_harness/test.c.old deleted file mode 100644 index a1c1cc2..0000000 --- a/independent_test_harness/test.c.old +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include "data_cm.h" -#include "meuk.h" - -#define REPETITIONS 10000000 -int main(int argc, char **argv) { - - assert(argc == 3); - char *version = argv[1]; - char *number_of_updates = argv[2]; - const uint64_t Dim = 21; - const uint64_t LDS = 24; - // const double breakdown = 1e-3; - const double breakdown = 1e-9; // this might be too small and cause NIs - uint32_t rc; - - const uint64_t *N_updates; - const double *Updates; - const uint64_t *Updates_index; - double *Slater, *Slater_invT; - double determinant; - if (number_of_updates[0] == '2') { // 2 Updates - N_updates = &N_updates2; - Updates = &Updates2[0]; - Updates_index = &Updates_index2[0]; - Slater = &Slater2[0]; - Slater_invT = &Slater_invT2[0]; // Slater_inv in QMC=Chem is actually its transpose - determinant = determinant2; - } else if (number_of_updates[0] == '3') { // 3 Updates - N_updates = &N_updates3; - Updates = &Updates3[0]; - Updates_index = &Updates_index3[0]; - Slater = &Slater3[0]; - Slater_invT = &Slater_invT3[0]; - determinant = determinant3; - } else if (number_of_updates[0] == '5') { // 5 Updates - N_updates = &N_updates5; - Updates = &Updates5[0]; - Updates_index = &Updates_index5[0]; - Slater = &Slater5[0]; - Slater_invT = &Slater_invT5[0]; - determinant = determinant5; - } else { // Exit - printf("Incorrect number of updates given\n"); - return 1; - } - - rc = check_residual(LDS, Dim, Slater_invT, Slater); - assert(rc == 0 && "check_residual()"); - rc = test_kernel(version, LDS, Dim, *N_updates, Updates, Updates_index, - breakdown, Slater, Slater_invT, &determinant); - assert(rc == 0 && "test_kernel()"); - - // EVERYTHING WORKS UP UNTILL HERE - - uint64_t before = rdtsc(); - if (version[0] == 'a') { // Anthony - for (int i = 0; i < REPETITIONS; i++) { - const double* Upds; - const uint64_t* Ui; - for (int j = 0; j < *N_updates; j++) { - Upds = &Updates[j*LDS]; - Ui = &Updates_index[j]; - detupd(Dim, LDS, Upds, Ui, Slater_invT, &determinant); - } - } - } else if (version[0] == 'n') { // Naive - for (int i = 0; i < REPETITIONS; i++) { - rc = qmckl_sherman_morrison(LDS, Dim, *N_updates, Updates, - Updates_index, breakdown, Slater_invT, &determinant); - if (rc != 0) printf("qmckl_sherman_morrison failed\n"); - } - } else if (version[0] == 's') { // Splitting - for (int i = 0; i < REPETITIONS; i++) { - rc = qmckl_sherman_morrison_splitting(LDS, Dim, *N_updates, Updates, - Updates_index, breakdown, Slater_invT, &determinant); - if (rc != 0) printf("qmckl_sherman_morrison_splitting failed\n"); - } - } else if (version[0] == 'b') { // Blocked - for (int i = 0; i < REPETITIONS; i++) { - // rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index, - // breakdown, Slater_inv, &determinant); - // rc = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index, - // breakdown, Slater_inv, &determinant); - rc = qmckl_sherman_morrison_smw32s(LDS, Dim, *N_updates, Updates, - Updates_index, breakdown, Slater_invT, &determinant); - if (rc != 0) printf("qmckl_sherman_morrison_smw32s failed\n"); - } - } - uint64_t after = rdtsc(); - printf("cycles = %f\n", ((double)(after - before) / (double) REPETITIONS)); -} diff --git a/independent_test_harness/test.c.older b/independent_test_harness/test.c.older deleted file mode 100644 index 2ae7c24..0000000 --- a/independent_test_harness/test.c.older +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include -#include -#include - -static __inline__ uint64_t rdtsc(void) { - unsigned hi, lo; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - return ((unsigned long long)lo) | (((unsigned long long)hi) << 32); -} - -int qmckl_sherman_morrison( - const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates, - const double *__restrict __attribute__((aligned(8))) Updates, - const uint64_t *__restrict Updates_index, const double breakdown, - double *__restrict __attribute__((aligned(8))) Slater_inv, - double *__restrict determinant); - -int detupd(const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates, - const double *__restrict __attribute__((aligned(8))) Updates, - const uint64_t *__restrict Updates_index, const double breakdown, - double *__restrict __attribute__((aligned(8))) Slater_inv, - double *__restrict determinant); - -#define REPETITIONS 100000000 -int main(int argc, char **argv) { - - assert(argc == 2); - char *version = argv[1]; - - const uint64_t Dim = 21; - const uint64_t LDS = 24; - const uint64_t N_updates = 1; - double Updates[LDS] __attribute__((aligned(8))); - uint64_t Updates_index[N_updates]; - Updates_index[0] = 1; - const double breakdown = 1e-3; - double Slater_inv[LDS * Dim] __attribute__((aligned(8))); - double determinant = 1.0; - - for (int i = 0; i < Dim; i++) { - Updates[i] = i; - for (int j = 0; j < Dim; j++) { - Slater_inv[LDS * i + j] = j; - } - } - - uint64_t before = rdtsc(); - if (version[0] == 'c') { - for (int i = 0; i < REPETITIONS; i++) { - detupd(LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, - &determinant); - } - } else { - for (int i = 0; i < REPETITIONS; i++) { - qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index, - breakdown, Slater_inv, &determinant); - } - } - uint64_t after = rdtsc(); - printf("cycles = %f\n", ((double)(after - before) / (double)REPETITIONS)); -}