From 87e319189e3e83a5ee60a7dd4c6db073bcf44ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Coppens?= Date: Fri, 22 Jul 2022 11:34:29 +0200 Subject: [PATCH] - Got rid of NVC compiler warnings - Included lib paths for MKL/HDF5 and cuBLAS - Cleaned Makefile - Added GPU node session request script --- independent_test_harness/Makefile | 32 +-- independent_test_harness/block | 9 - independent_test_harness/goto_gpu_node.sh | 4 + independent_test_harness/kay | 9 - independent_test_harness/kernels.h | 7 +- independent_test_harness/meuk.c | 66 +++--- independent_test_harness/meuk.h | 18 +- independent_test_harness/mkl | 9 - independent_test_harness/sm.c | 269 +++++++++++----------- independent_test_harness/test.c | 72 +++--- independent_test_harness/three | 9 - 11 files changed, 225 insertions(+), 279 deletions(-) delete mode 100644 independent_test_harness/block create mode 100755 independent_test_harness/goto_gpu_node.sh delete mode 100644 independent_test_harness/kay delete mode 100644 independent_test_harness/mkl delete mode 100644 independent_test_harness/three diff --git a/independent_test_harness/Makefile b/independent_test_harness/Makefile index 614be41..214433f 100644 --- a/independent_test_harness/Makefile +++ b/independent_test_harness/Makefile @@ -1,19 +1,11 @@ -# FC = gfortran -# CC = gcc -# FFLAGS=-O0 -finline -g -lm -Wall -pedantic -# CFLAGS=-std=c99 -O0 -finline -g -lm -Wall -pedantic FC = ifx -CC = icx -# FFLAGS=-O0 -warn all -g -pedantic -# CFLAGS=-std=c99 -O0 -Wall -g -pedantic -FFLAGS=-O3 -warn all -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64 -CFLAGS=-std=c99 -O3 -Wall -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64 -INCLUDE=-I/usr/include/hdf5/serial -I/usr/local/cuda/include -LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential -L/usr/local/cuda-11.7/targets/x86_64-linux/lib -lcublas -#FC = verificarlo-f -#CC = verificarlo-c -#FFLAGS=-O3 -finline -g -#CFLAGS=-O3 -finline -g +CC = nvc + +CFLAGS=-std=c99 -O3 -Wall -g + +LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl +LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl +LFLAGS+=-lcublas ## Link with icc # test: sm.o test.o detupdate21.o meuk.o @@ -21,16 +13,6 @@ LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential test: sm.o test.o meuk.o $(CC) $(LFLAGS) -o test sm.o test.o meuk.o - - -## Link with ifort -# test: sm.o test.o detupdate21.o meuk.o -# $(FC) $(LFLAGS) -nofor-main -o test sm.o detupdate21.o test.o meuk.o - -## Link with gfortran -# test: sm.o test.o detupdate21.o meuk.o -# $(FC) $(LFLAGS) -Wno-main -o test sm.o detupdate21.o test.o meuk.o - %.o: %.f90 $(FC) $(FFLAGS) -c -o $@ $< diff --git a/independent_test_harness/block b/independent_test_harness/block deleted file mode 100644 index 690e572..0000000 --- a/independent_test_harness/block +++ /dev/null @@ -1,9 +0,0 @@ -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0 -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/independent_test_harness/goto_gpu_node.sh b/independent_test_harness/goto_gpu_node.sh new file mode 100755 index 0000000..9553a6a --- /dev/null +++ b/independent_test_harness/goto_gpu_node.sh @@ -0,0 +1,4 @@ +salloc --nodes=1 --account=prcoe10 -p booster --gres=gpu:1 +wait +srun --pty /bin/bash + diff --git a/independent_test_harness/kay b/independent_test_harness/kay deleted file mode 100644 index 697c51d..0000000 --- a/independent_test_harness/kay +++ /dev/null @@ -1,9 +0,0 @@ -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0 -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h index c1b6a8a..08b1117 100644 --- a/independent_test_harness/kernels.h +++ b/independent_test_harness/kernels.h @@ -1,6 +1,8 @@ #include #include +#define DIM 21 +#define LDS 24 #define HAVE_CUBLAS_OFFLOAD #ifdef HAVE_CUBLAS_OFFLOAD @@ -9,7 +11,7 @@ #include #endif -lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS); +lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds); int min(int a, int b); @@ -72,7 +74,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, Slater_inv, double *__restrict determinant); -void detupd(const uint64_t Dim, const uint64_t LDS, +void detupd(const uint64_t Dim, const uint64_t Lds, const double *__restrict __attribute__((aligned(8))) Updates, const uint64_t *__restrict Updates_index, double *__restrict __attribute__((aligned(8))) Slater_inv, @@ -84,4 +86,3 @@ uint32_t qmckl_sherman_morrison_later( const uint64_t *__restrict Updates_index, const double breakdown, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant); - \ No newline at end of file diff --git a/independent_test_harness/meuk.c b/independent_test_harness/meuk.c index c0ca8d5..2560e57 100644 --- a/independent_test_harness/meuk.c +++ b/independent_test_harness/meuk.c @@ -2,27 +2,27 @@ #include #include -void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim) { - for (uint64_t i = 0; i < LDS * Dim; i++) { +void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) { + for (uint64_t i = 0; i < Lds * Dim; i++) { printf("%f\n", A[i]); } printf("\n"); } -double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim) { +double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) { double sum2 = 0; - for (uint64_t i = 0; i < LDS * Dim; i++) sum2 += A[i] * A[i]; + for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i]; return sum2; } -double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim) { - double sum2 = frobenius_norm2(A, LDS, Dim); +double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim) { + double sum2 = frobenius_norm2(A, Lds, Dim); return sqrt(sum2); } -double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) { +double max_norm(double *A, const uint64_t Lds, const uint64_t Dim) { double largest = 0; - for (uint64_t i = 0; i < LDS * Dim; i++) { + for (uint64_t i = 0; i < Lds * Dim; i++) { double elm = A[i]; double felm = fabs(elm); if (elm != elm) return -1.0; // Return a negative norm when NaN found @@ -31,9 +31,9 @@ double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) { return largest; } -double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim) { - double norm_A = frobenius_norm(A, LDS, Dim); - double norm_Ainv = frobenius_norm(Ainv, LDS, Dim); +double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim) { + double norm_A = frobenius_norm(A, Lds, Dim); + double norm_Ainv = frobenius_norm(Ainv, Lds, Dim); return fabs(norm_A) * fabs(norm_Ainv); } @@ -57,19 +57,19 @@ void read_double(hid_t file_id, const char *key, double *data) { assert(rc >= 0 && "H5Dclose"); } -void update_slater_matrix(const uint64_t LDS, const uint64_t Dim, +void update_slater_matrix(const uint64_t Lds, const uint64_t Dim, const uint64_t N_updates, const double *Updates, const uint64_t *Updates_index, double *Slater) { for (uint32_t i = 0; i < N_updates; i++) { uint32_t col = Updates_index[i] - 1; for (uint32_t j = 0; j < Dim; j++) { - Slater[col * Dim + j] += Updates[i * LDS + j]; + Slater[col * Dim + j] += Updates[i * Lds + j]; } } } -uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT, +uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT, double *Slater, const double tolerance) { double res[Dim*Dim]; @@ -78,7 +78,7 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT for (uint32_t j = 0; j < Dim; j++) { res[i * Dim + j] = 0; for (uint32_t k = 0; k < Dim; k++) { - res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * LDS + j]; + res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j]; } } } @@ -95,12 +95,12 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT return 0; } -void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim) { +void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) { for (uint32_t i = 0; i < Dim; i++) { for (uint32_t j = 0; j < Dim; j++) { prod[i * Dim + j] = 0; for (uint32_t k = 0; k < Dim; k++) { - prod[i * Dim + j] += a[i * Dim + k] * b[k * LDS + j]; + prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j]; } } } @@ -121,7 +121,7 @@ void residual(double *a, double *res, const uint64_t Dim) { } } -uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim, +uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim, const uint64_t N_updates, const double *Updates, const uint64_t *Updates_index, const double breakdown, const double tolerance, double *Slater, double *Slater_inv, double *determinant) { @@ -129,40 +129,40 @@ uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim, // if (version[0] == 'a') { // Anthony // const double *Upds; // const uint64_t *Ui; - // for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] *= *determinant; + // for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant; // for (int j = 0; j < N_updates; j++) { - // Upds = &Updates[j * LDS]; + // Upds = &Updates[j * Lds]; // Ui = &Updates_index[j]; - // detupd(Dim, LDS, Upds, Ui, Slater_inv, determinant); + // detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant); // if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n"); // } - // for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] /= *determinant; - // update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater); - // rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance); + // for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant; + // update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater); + // rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance); // if (rc != 0) printf("TEST_KERNEL: check_error failed\n"); // } else if (version[0] == 'n') { // Naive if (version[0] == 'n') { // Naive - rc = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index, + rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant); if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison failed\n"); - update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater); - rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance); + update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater); + rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance); if (rc != 0) printf("TEST_KERNEL: check_error failed\n"); } else if (version[0] == 's') { // Splitting - rc = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates, + rc = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant); if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_splitting failed\n"); - update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater); - rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance); + update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater); + rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance); if (rc != 0) printf("TEST_KERNEL: check_error failed\n"); } else if (version[0] == 'b') { // Blocked - rc = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates, + rc = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant); if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_smw32s failed\n"); - update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater); - rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance); + update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater); + rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance); if (rc != 0) printf("TEST_KERNEL: check_error failed\n"); } return rc; diff --git a/independent_test_harness/meuk.h b/independent_test_harness/meuk.h index 8b42fa7..65947dc 100644 --- a/independent_test_harness/meuk.h +++ b/independent_test_harness/meuk.h @@ -12,13 +12,13 @@ typedef struct Error { uint64_t error; } Error; -void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim); +void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim); void residual(double *a, double *res, const uint64_t Dim); -double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim); -void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim); -double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim); -double max_norm(double *A, const uint64_t LDS, const uint64_t Dim); -double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim); +double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim); +void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim); +double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim); +double max_norm(double *A, const uint64_t Lds, const uint64_t Dim); +double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim); void read_uint(hid_t file_id, const char *key, uint64_t *data); void read_double(hid_t file_id, const char *key, double *data); @@ -28,16 +28,16 @@ static __inline__ uint64_t rdtsc(void) { return ((unsigned long long)lo) | (((unsigned long long)hi) << 32); } -void update_slater_matrix(const uint64_t LDS, const uint64_t Dim, +void update_slater_matrix(const uint64_t Lds, const uint64_t Dim, const uint64_t N_updates, const double *Updates, const uint64_t *Updates_index, double *Slater); -uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT, +uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT, double *Slater, const double tolerance); int32_t check_error_better(const double max, const double tolerance); -uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim, +uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim, const uint64_t N_updates, const double *Updates, const uint64_t *Updates_index, const double breakdown, const double tolerance, double *Slater, double *Slater_inv, double *determinant); diff --git a/independent_test_harness/mkl b/independent_test_harness/mkl deleted file mode 100644 index c1d48d7..0000000 --- a/independent_test_harness/mkl +++ /dev/null @@ -1,9 +0,0 @@ -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0 -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/independent_test_harness/sm.c b/independent_test_harness/sm.c index b539dd3..650bda2 100644 --- a/independent_test_harness/sm.c +++ b/independent_test_harness/sm.c @@ -16,11 +16,11 @@ uint32_t qmckl_sherman_morrison( const uint64_t *__restrict Updates_index, const double breakdown, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - - const uint32_t Dim = 21; - const uint32_t LDS = 24; - double __attribute__((aligned(8))) C[Dim]; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; + + double __attribute__((aligned(8))) C[DIM]; double __attribute__((aligned(8))) D[LDS]; uint32_t l = 0; @@ -31,8 +31,8 @@ uint32_t qmckl_sherman_morrison( C[i] = 0.0; #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l. + for (uint32_t j = 0; j < Lds; j++) { + C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l. } } @@ -51,17 +51,17 @@ uint32_t qmckl_sherman_morrison( #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - D[j] = Slater_inv[cui * LDS + j]; // selecting proper column of v_l^T * S_inv + for (uint32_t j = 0; j < Lds; j++) { + D[j] = Slater_inv[cui * Lds + j]; // selecting proper column of v_l^T * S_inv } // A^{-1} = A^{-1} - C x D / den for (uint32_t i = 0; i < Dim; i++) { #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { + for (uint32_t j = 0; j < Lds; j++) { const double update = C[i] * D[j] * iden; - Slater_inv[i * LDS + j] -= update; + Slater_inv[i * Lds + j] -= update; } } l += 1; @@ -69,6 +69,15 @@ uint32_t qmckl_sherman_morrison( return 0; } +/* +COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS, +where S^{-1}P : Dim x LDS, + C := S^{-1}PP^TU : Dim x 2, + B := 1 + VC : 2 x 2, + D := VS^{-1}P : 2 x LDS, + P^TU : LDS x 2, + V : 2 x Dim +*/ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, const double *__restrict __attribute__((aligned(8))) Updates, @@ -77,31 +86,23 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; - /* - COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS, - where S^{-1}P : Dim x LDS, - C := S^{-1}PP^TU : Dim x 2, - B := 1 + VC : 2 x 2, - D := VS^{-1}P : 2 x LDS, - P^TU : LDS x 2, - V : 2 x Dim - */ + + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; const uint32_t row1 = (Updates_index[0] - 1); const uint32_t row2 = (Updates_index[1] - 1); // Compute C = (S^T)^{-1}U : Dim x 2 - double __attribute__((aligned(8))) C[2 * Dim]; + double __attribute__((aligned(8))) C[2 * DIM]; for (uint32_t i = 0; i < Dim; i++) { C[i * 2] = 0; C[i * 2 + 1] = 0; #pragma ivdep #pragma vector aligned - for (uint32_t k = 0; k < LDS; k++) { - C[i * 2] += Slater_inv[i * LDS + k] * Updates[k]; - C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k]; + for (uint32_t k = 0; k < Lds; k++) { + C[i * 2] += Slater_inv[i * Lds + k] * Updates[k]; + C[i * 2 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k]; } } @@ -130,28 +131,37 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim, // tmp = B^{-1}D : 2 x LDS double __attribute__((aligned(8))) tmp[2 * LDS]; - double *__restrict r1dim = &(Slater_inv[row1 * LDS]); - double *__restrict r2dim = &(Slater_inv[row2 * LDS]); + double *__restrict r1dim = &(Slater_inv[row1 * Lds]); + double *__restrict r2dim = &(Slater_inv[row2 * Lds]); #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { + for (uint32_t j = 0; j < Lds; j++) { tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j]; - tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j]; + tmp[Lds + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j]; } - // Compute (S^T)^{-1} - C * tmp : Dim x LDS + // Compute (S^T)^{-1} - C * tmp : Dim x Lds for (uint32_t i = 0; i < Dim; i++) { #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j]; - Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j]; + for (uint32_t j = 0; j < Lds; j++) { + Slater_inv[i * Lds + j] -= C[i * 2] * tmp[j]; + Slater_inv[i * Lds + j] -= C[i * 2 + 1] * tmp[Lds + j]; } } return 0; } +/* +COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS, +where S^T : Dim x LDS, + C := (S^T)^{-1}U : Dim x 3, + B := 1 + VC : 3 x 3, + D := V(S^T)^{-1} : 3 x LDS, + U : LDS x 3, + V : 3 x Dim +*/ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, const double *__restrict __attribute__((aligned(8))) Updates, @@ -160,34 +170,26 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; - /* - COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS, - where S^T : Dim x LDS, - C := (S^T)^{-1}U : Dim x 3, - B := 1 + VC : 3 x 3, - D := V(S^T)^{-1} : 3 x LDS, - U : LDS x 3, - V : 3 x Dim - */ + + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; const uint32_t row1 = (Updates_index[0] - 1); const uint32_t row2 = (Updates_index[1] - 1); const uint32_t row3 = (Updates_index[2] - 1); // Compute C = (S^T)^{-1}U : Dim x 3 - double __attribute__((aligned(8))) C[3 * Dim]; + double __attribute__((aligned(8))) C[3 * DIM]; for (uint32_t i = 0; i < Dim; i++) { C[i * 3] = 0; C[i * 3 + 1] = 0; C[i * 3 + 2] = 0; #pragma ivdep #pragma vector aligned - for (uint32_t k = 0; k < LDS; k++) { - C[i * 3] += Slater_inv[i * LDS + k] * Updates[k]; - C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k]; - C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k]; + for (uint32_t k = 0; k < Lds; k++) { + C[i * 3] += Slater_inv[i * Lds + k] * Updates[k]; + C[i * 3 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k]; + C[i * 3 + 2] += Slater_inv[i * Lds + k] * Updates[2 * Lds + k]; } } @@ -233,20 +235,20 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim, double *__restrict r3dim = &(Slater_inv[row3 * LDS]); #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { + for (uint32_t j = 0; j < Lds; j++) { tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j]; - tmp[LDS + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j]; - tmp[2 * LDS + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j]; + tmp[Lds + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j]; + tmp[2 * Lds + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j]; } - // Compute (S^T)^{-1} - C * tmp : Dim x LDS + // Compute (S^T)^{-1} - C * tmp : Dim x Lds for (uint32_t i = 0; i < Dim; i++) { #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j]; - Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j]; - Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j]; + for (uint32_t j = 0; j < Lds; j++) { + Slater_inv[i * Lds + j] -= C[i * 3] * tmp[j]; + Slater_inv[i * Lds + j] -= C[i * 3 + 1] * tmp[Lds + j]; + Slater_inv[i * Lds + j] -= C[i * 3 + 2] * tmp[2 * Lds + j]; } } @@ -273,15 +275,15 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; // Compute C = S^{-1} U : Dim x K : standard dgemm - double C[Dim * N_updates]; + double C[DIM * N_updates]; double alpha = 1.0, beta = 0.0; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - Dim, N_updates, LDS, - alpha, Slater_inv, LDS, Updates, LDS, + Dim, N_updates, Lds, + alpha, Slater_inv, Lds, Updates, Lds, beta, C, N_updates); // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD @@ -290,7 +292,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, for (uint32_t i = 0; i < N_updates; i++) { const uint32_t row = Updates_index[i] - 1; for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j); - for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j]; + for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j]; } // Compute determinant by LU decomposition @@ -345,41 +347,34 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; + // Compute C = S^{-1} U : Dim x K : standard dgemm + // double C[Dim * N_updates]; + double *C = malloc(DIM * N_updates * sizeof(double)); + double alpha = 1.0, beta = 0.0; + // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + // Dim, N_updates, Lds, + // alpha, Slater_inv, Lds, Updates, Lds, + // beta, C, N_updates); //cuBLAS initialization cublasHandle_t handle; if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { fprintf(stdout, "cuBLAS initialization failed!\n"); exit(EXIT_FAILURE); } - - // Compute C = S^{-1} U : Dim x K : standard dgemm - double C[Dim * N_updates]; - double alpha = 1.0, beta = 0.0; - - // #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:M*N*walk_num], tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) - // #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) - // { - // for (int nw=0; nw < walk_num; ++nw) { - // int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, - // &(een_rescaled_e[nw*(cord_num+1)]), - // LDA, af, - // &(een_rescaled_n[bf*nw]), - // LDB, 0, - // &beta, - // &(tmp_c[nw*cord_num]), - // LDC, cf, cord_num); - // } - // } - // #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) + #pragma omp target enter data map(to:Slater_inv, Updates, C) + #pragma omp target data use_device_ptr(Slater_inv, Updates, C) + { + int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, + Dim, N_updates, Lds, + &alpha, Slater_inv, Lds, Updates, Lds, + &beta, C, N_updates); + } + #pragma omp target exit data map(from:C) cublasDestroy(handle); - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - Dim, N_updates, LDS, - alpha, Slater_inv, LDS, Updates, LDS, - beta, C, N_updates); // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD // Construct D = V S^{-1} : K x LDS @@ -387,7 +382,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, for (uint32_t i = 0; i < N_updates; i++) { const uint32_t row = Updates_index[i] - 1; for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j); - for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j]; + for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j]; } // Compute determinant by LU decomposition @@ -443,8 +438,8 @@ uint32_t qmckl_slagel_splitting( uint64_t *__restrict later_index, uint64_t *__restrict later, double *__restrict determinant) { - const uint32_t LDS = 24; - const uint32_t Dim = 21; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; double __attribute__((aligned(8))) C[LDS]; double __attribute__((aligned(8))) D[LDS]; @@ -457,8 +452,8 @@ uint32_t qmckl_slagel_splitting( C[i] = 0.0; #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l. + for (uint32_t j = 0; j < Lds; j++) { + C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l. } } @@ -474,8 +469,8 @@ uint32_t qmckl_slagel_splitting( // in later_updates #pragma ivdep #pragma vector aligned - for (uint32_t i = 0; i < LDS; i++) { - later_updates[*later * LDS + i] = Updates[l * LDS + i] / 2.0; + for (uint32_t i = 0; i < Lds; i++) { + later_updates[*later * Lds + i] = Updates[l * Lds + i] / 2.0; C[i] /= 2.0; } later_index[*later] = Updates_index[l]; @@ -490,17 +485,17 @@ uint32_t qmckl_slagel_splitting( // D = v^T x S^{-1} : 1 x LDS #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - D[j] = Slater_inv[cui * LDS + j]; + for (uint32_t j = 0; j < Lds; j++) { + D[j] = Slater_inv[cui * Lds + j]; } // S^{-1} = S^{-1} - C x D / den for (uint32_t i = 0; i < Dim; i++) { #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { + for (uint32_t j = 0; j < Lds; j++) { const double update = C[i] * D[j] * iden; - Slater_inv[i * LDS + j] -= update; + Slater_inv[i * Lds + j] -= update; } } l += 1; @@ -516,18 +511,18 @@ uint32_t qmckl_sherman_morrison_splitting( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; double __attribute__((aligned(8))) later_updates[LDS * N_updates]; uint64_t later_index[N_updates]; uint64_t later = 0; // uint32_t rc; - (void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index, + (void) qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, later_updates, later_index, &later, determinant); - // rc = qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index, + // rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index, // breakdown, Slater_inv, later_updates, later_index, // &later, determinant); // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n"); @@ -535,11 +530,11 @@ uint32_t qmckl_sherman_morrison_splitting( if (later > 0) { recursive_calls++; // printf("Later > 0\n"); - (void) qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates, + (void) qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates, later_index, breakdown, Slater_inv, determinant); - // rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates, + // rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates, // later_index, breakdown, Slater_inv, // determinant); // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n"); @@ -555,8 +550,8 @@ uint32_t qmckl_sherman_morrison_smw32s( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; double __attribute__((aligned(8))) later_updates[LDS * N_updates]; uint64_t later_index[N_updates]; @@ -564,31 +559,31 @@ uint32_t qmckl_sherman_morrison_smw32s( uint32_t rc; if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2 - rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index, + rc = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index, breakdown, Slater_inv, determinant); if (rc != 0) { // Send the entire block to slagel_splitting block_fail += 1; uint64_t l = 0; - rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates, + rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates, Updates_index, breakdown, Slater_inv, - later_updates + (LDS * later), + later_updates + (Lds * later), later_index + later, &l, determinant); later += l; } - rc = qmckl_woodbury_2(LDS, Dim, &Updates[2*LDS], &Updates_index[2], + rc = qmckl_woodbury_2(Lds, Dim, &Updates[2*Lds], &Updates_index[2], breakdown, Slater_inv, determinant); if (rc != 0) { // Send the entire block to slagel_splitting block_fail += 1; uint64_t l = 0; - rc = qmckl_slagel_splitting(LDS, Dim, 2, &Updates[2*LDS], + rc = qmckl_slagel_splitting(Lds, Dim, 2, &Updates[2*Lds], &Updates_index[2], breakdown, Slater_inv, - later_updates + (LDS * later), + later_updates + (Lds * later), later_index + later, &l, determinant); later += l; } if (later > 0) { recursive_calls++; - rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates, + rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates, later_index, breakdown, Slater_inv, determinant); } @@ -600,21 +595,21 @@ uint32_t qmckl_sherman_morrison_smw32s( // Woodbury 3x3 kernel uint32_t n_of_3blocks = N_updates / 3; uint32_t remainder = N_updates % 3; - uint32_t length_3block = 3 * LDS; + uint32_t length_3block = 3 * Lds; if (n_of_3blocks > 0) { for (uint32_t i = 0; i < n_of_3blocks; i++) { const double *Updates_3block = &Updates[i * length_3block]; const uint64_t *Updates_index_3block = &Updates_index[i * 3]; - rc = qmckl_woodbury_3(LDS, Dim, Updates_3block, Updates_index_3block, + rc = qmckl_woodbury_3(Lds, Dim, Updates_3block, Updates_index_3block, breakdown, Slater_inv, determinant); if (rc != 0) { // Send the entire block to slagel_splitting // printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n"); block_fail += 1; uint64_t l = 0; - rc = qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block, + rc = qmckl_slagel_splitting(Lds, Dim, 3, Updates_3block, Updates_index_3block, breakdown, Slater_inv, - later_updates + (LDS * later), + later_updates + (Lds * later), later_index + later, &l, determinant); // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n"); later += l; @@ -626,15 +621,15 @@ uint32_t qmckl_sherman_morrison_smw32s( if (remainder == 2) { const double *Updates_2block = &Updates[n_of_3blocks * length_3block]; const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks]; - rc = qmckl_woodbury_2(LDS, Dim, Updates_2block, Updates_index_2block, + rc = qmckl_woodbury_2(Lds, Dim, Updates_2block, Updates_index_2block, breakdown, Slater_inv, determinant); if (rc != 0) { // Send the entire block to slagel_splitting // printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n"); block_fail += 1; uint64_t l = 0; - rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block, + rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates_2block, Updates_index_2block, breakdown, Slater_inv, - later_updates + (LDS * later), + later_updates + (Lds * later), later_index + later, &l, determinant); // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n"); later += l; @@ -647,9 +642,9 @@ uint32_t qmckl_sherman_morrison_smw32s( const double *Updates_1block = &Updates[n_of_3blocks * length_3block]; const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks]; uint64_t l = 0; - rc = qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block, + rc = qmckl_slagel_splitting(Lds, Dim, 1, Updates_1block, Updates_index_1block, breakdown, Slater_inv, - later_updates + (LDS * later), + later_updates + (Lds * later), later_index + later, &l, determinant); // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n"); later += l; @@ -658,7 +653,7 @@ uint32_t qmckl_sherman_morrison_smw32s( if (later > 0) { recursive_calls++; // printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n"); - rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates, + rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates, later_index, breakdown, Slater_inv, determinant); // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n"); @@ -674,10 +669,10 @@ uint32_t qmckl_sherman_morrison_later( double *__restrict __attribute__((aligned(8))) Slater_inv, double *__restrict determinant) { - const uint32_t Dim = 21; - const uint32_t LDS = 24; + const uint32_t Dim = DIM; + const uint32_t Lds = LDS; - double __attribute__((aligned(8))) C[Dim]; + double __attribute__((aligned(8))) C[DIM]; double __attribute__((aligned(8))) D[LDS]; double __attribute__((aligned(8))) later_updates[LDS * N_updates]; @@ -693,8 +688,8 @@ uint32_t qmckl_sherman_morrison_later( C[i] = 0.0; #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l. + for (uint32_t j = 0; j < Lds; j++) { + C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l. } } @@ -705,8 +700,8 @@ uint32_t qmckl_sherman_morrison_later( #pragma ivdep #pragma vector aligned // for (uint32_t i = 0; i < Dim; i++) { - for (uint32_t i = 0; i < LDS; i++) { - later_updates[later * LDS + i] = Updates[l * LDS + i]; + for (uint32_t i = 0; i < Lds; i++) { + later_updates[later * Lds + i] = Updates[l * Lds + i]; } later_index[later] = Updates_index[l]; later++; @@ -720,17 +715,17 @@ uint32_t qmckl_sherman_morrison_later( // D = v^T x A^{-1} #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { - D[j] = Slater_inv[cui * LDS + j]; + for (uint32_t j = 0; j < Lds; j++) { + D[j] = Slater_inv[cui * Lds + j]; } // S^{-1} = S^{-1} - C x D / den for (uint32_t i = 0; i < Dim; i++) { #pragma ivdep #pragma vector aligned - for (uint32_t j = 0; j < LDS; j++) { + for (uint32_t j = 0; j < Lds; j++) { const double update = C[i] * D[j] * iden; - Slater_inv[i * LDS + j] -= update; + Slater_inv[i * Lds + j] -= update; } } l += 1; @@ -741,7 +736,7 @@ uint32_t qmckl_sherman_morrison_later( } else if (later > 0) { // If some have failed, make a recursive call recursive_calls++; - (void) qmckl_sherman_morrison_later(LDS, Dim, later, later_updates, + (void) qmckl_sherman_morrison_later(Lds, Dim, later, later_updates, later_index, breakdown, Slater_inv, determinant); } diff --git a/independent_test_harness/test.c b/independent_test_harness/test.c index e10ebe9..fa57d31 100644 --- a/independent_test_harness/test.c +++ b/independent_test_harness/test.c @@ -1,6 +1,6 @@ +#include #include "meuk.h" #include "cycles.h" -#include #define DATASET "dataset_329d_zeropadded_cm.hdf5" // #define DATASET "dataset_15784d_zeropadded_cm.hdf5" @@ -23,11 +23,11 @@ int main(int argc, char **argv) { char slater_key[32]; char slater_inv_key[32]; char det_key[32]; - const uint64_t Dim = 21; - const uint64_t LDS = 24; + const uint64_t Dim = DIM; + const uint64_t Lds = LDS; uint64_t N_updates; - double Slater[LDS * Dim ], SlaterT[LDS * Dim]; - double Slater_invT[LDS * Dim], Slater_invT_copy[LDS * Dim]; + double Slater[LDS * DIM ], SlaterT[LDS * DIM]; + double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM]; double determinant, determinant_copy; // SETUP TEST PARAMETERS @@ -52,7 +52,7 @@ printf("#----------------------------------------------------------------------- sprintf(det_key, "/cycle_%d/determinant", cycle); read_uint(file_id, nupds_key, &N_updates); uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t)); - double *Updates = malloc(LDS * N_updates * sizeof(double)); + double *Updates = malloc(Lds * N_updates * sizeof(double)); read_uint(file_id, upd_idx_key, Updates_index); read_double(file_id, upds_key, Updates); read_double(file_id, slater_key, Slater); @@ -60,28 +60,28 @@ printf("#----------------------------------------------------------------------- read_double(file_id, det_key, &determinant); // Compute transpose of S. ST: 24 x 21 - for (int i = 0; i < LDS; i++) { + for (int i = 0; i < Lds; i++) { for (int j = 0; j < Dim; j++) { - SlaterT[i * Dim + j] = Slater[j * LDS + i]; + SlaterT[i * Dim + j] = Slater[j * Lds + i]; } } // Convert repl. upds into additive upds. for (int i = 0; i < N_updates; i++) { int col = Updates_index[i] - 1; - for (int j = 0; j < LDS; j++) { - Updates[i * LDS + j] -= SlaterT[col + j * Dim]; + for (int j = 0; j < Lds; j++) { + Updates[i * Lds + j] -= SlaterT[col + j * Dim]; } } // 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT - uint32_t err_inp = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance); + uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance); // Update Slater matrix for (int i = 0; i < N_updates; i++) { int col = Updates_index[i] - 1; for (int j = 0; j < Dim; j++) { - SlaterT[col + j * Dim] += Updates[i * LDS + j]; + SlaterT[col + j * Dim] += Updates[i * Lds + j]; } } // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB @@ -98,7 +98,7 @@ printf("#----------------------------------------------------------------------- for (int rep = 0; rep < REPETITIONS; rep++) { // 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY - memcpy(Slater_invT_copy, Slater_invT, LDS * Dim * sizeof(double)); + memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double)); determinant_copy = determinant; // ### CHOOSE A KERNEL: @@ -109,10 +109,10 @@ printf("#----------------------------------------------------------------------- // err_break = 0; - // for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat) + // for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat) // for (int i = 0; i < N_updates; i++) { - // Upds = &Updates[i * LDS]; + // Upds = &Updates[i * Lds]; // Ui = &Updates_index[i]; // determinant_previous = determinant_copy; @@ -120,7 +120,7 @@ printf("#----------------------------------------------------------------------- // uint64_t before = rdtsc(); // // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - // detupd(Dim, LDS, Upds, Ui, Slater_invT_copy, &determinant_copy); + // detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy); // // 3. FETCH FINISH TIME // uint64_t after = rdtsc(); @@ -137,9 +137,9 @@ printf("#----------------------------------------------------------------------- // } // if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again - // for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_previous; + // for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous; // } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again - // for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_copy; + // for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy; // } // } else if (version[0] == 'n') { // Naive if (version[0] == 'n') { // Naive @@ -147,7 +147,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, + err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -161,7 +161,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_sherman_morrison_later(LDS, Dim, N_updates, Updates, + err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -175,7 +175,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index, + err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -190,7 +190,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index, + err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -205,7 +205,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates, + err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -220,7 +220,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_woodbury_k_cublas_offload(LDS, Dim, N_updates, Updates, + err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -235,7 +235,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates, + err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -249,7 +249,7 @@ printf("#----------------------------------------------------------------------- uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates, + err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -260,7 +260,7 @@ printf("#----------------------------------------------------------------------- } else if (version[0] == 'm') { // LAPACK/MKL // Only send upper Dim x Dim part of matrix to lapack - double tmp[Dim*Dim]; + double tmp[DIM *DIM]; memcpy(tmp, SlaterT, Dim*Dim*sizeof(double)); // 1. FETCH START TIME @@ -274,9 +274,9 @@ printf("#----------------------------------------------------------------------- // Copy elements of inverse back, adding 0-padding in "correct" place for (uint32_t i = 0; i < Dim; i++) { - for (uint32_t j = 0; j < LDS; j++) { - if (j < Dim) Slater_invT_copy[i * LDS + j] = tmp[i * Dim + j]; - else Slater_invT_copy[i * LDS + j] = 0.0; + for (uint32_t j = 0; j < Lds; j++) { + if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j]; + else Slater_invT_copy[i * Lds + j] = 0.0; } } @@ -290,7 +290,7 @@ printf("#----------------------------------------------------------------------- } // END OF REPETITIONS LOOP // 4. COPY RESULT BACK TO ORIGINAL - memcpy(Slater_invT, Slater_invT_copy, LDS * Dim * sizeof(double)); + memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double)); determinant = determinant_copy; // At this point Slater_invT contains the correct inverse matrix @@ -306,20 +306,20 @@ printf("#----------------------------------------------------------------------- // CUMULATIVE RESULT FOR THE ENTIRE DATASET cumulative += accumulator; - double SSi[Dim * Dim]; - matmul(SlaterT, Slater_invT, SSi, LDS, Dim); - double Res[Dim * Dim]; + double SSi[DIM * DIM]; + matmul(SlaterT, Slater_invT, SSi, Lds, Dim); + double Res[DIM * DIM]; residual(SSi, Res, Dim); const double max = max_norm(Res, Dim, Dim); // 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT - uint32_t err_out = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance); + uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance); // int32_t err_out = check_error_better(max, tolerance); // if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates); // 8. COMPUTE CONDITION NUMBER - const double condnr = condition_number(Slater, Slater_invT, LDS, Dim); + const double condnr = condition_number(Slater, Slater_invT, Lds, Dim); const double frob = frobenius_norm(Res, Dim, Dim); diff --git a/independent_test_harness/three b/independent_test_harness/three deleted file mode 100644 index 7e4b6be..0000000 --- a/independent_test_harness/three +++ /dev/null @@ -1,9 +0,0 @@ -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0 -#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -#1 2 3 4 5 6 7 8 9 10 11 12 13 14 -#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC -#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------