- Got rid of NVC compiler warnings

- Included lib paths for MKL/HDF5 and cuBLAS - Cleaned Makefile - Added GPU node session request script
2024-12-25 13:53:56 +01:00 · 2022-07-22 11:34:29 +02:00 · 2022-07-22 11:34:29 +02:00 · 87e319189e
commit 87e319189e
parent fa03590f6f
11 changed files with 225 additions and 279 deletions
--- a/independent_test_harness/Makefile
+++ b/independent_test_harness/Makefile
@ -1,19 +1,11 @@
 # FC = gfortran
 # CC = gcc
 # FFLAGS=-O0 -finline -g -lm -Wall -pedantic
 # CFLAGS=-std=c99 -O0 -finline -g -lm -Wall -pedantic
 FC = ifx
-CC = icx
+CC = nvc
-# FFLAGS=-O0 -warn all -g -pedantic
+
-# CFLAGS=-std=c99 -O0 -Wall -g -pedantic
+CFLAGS=-std=c99 -O3 -Wall -g
-FFLAGS=-O3 -warn all -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
+
-CFLAGS=-std=c99 -O3 -Wall -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
+LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
-INCLUDE=-I/usr/include/hdf5/serial -I/usr/local/cuda/include
+LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
-LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential -L/usr/local/cuda-11.7/targets/x86_64-linux/lib -lcublas
+LFLAGS+=-lcublas
 #FC = verificarlo-f
 #CC = verificarlo-c
 #FFLAGS=-O3 -finline -g
 #CFLAGS=-O3 -finline -g
 ## Link with icc
 # test: sm.o test.o detupdate21.o meuk.o
@ -21,16 +13,6 @@ LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential
 test: sm.o test.o meuk.o
 	$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
 ## Link with ifort
 # test: sm.o test.o detupdate21.o meuk.o
 # 	$(FC) $(LFLAGS) -nofor-main -o test sm.o detupdate21.o test.o meuk.o
 ## Link with gfortran
 # test: sm.o test.o detupdate21.o meuk.o
 # 	$(FC) $(LFLAGS) -Wno-main -o test sm.o detupdate21.o test.o meuk.o
 %.o: %.f90
 	$(FC) $(FFLAGS) -c -o $@ $< 
--- a/independent_test_harness/block
+++ b/independent_test_harness/block
@ -1,9 +0,0 @@
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 41	15	0	0		0	0		0		1.266302e-05	2.506974e-05	7.553522e+04	3.249420e+03	2.166280e+02	3.249420e+03	0
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--- a/independent_test_harness/goto_gpu_node.sh
+++ b/independent_test_harness/goto_gpu_node.sh
@ -0,0 +1,4 @@
 salloc --nodes=1 --account=prcoe10 -p booster --gres=gpu:1
 wait
 srun --pty /bin/bash
--- a/independent_test_harness/kay
+++ b/independent_test_harness/kay
@ -1,9 +0,0 @@
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 41	15	0	0		0	0		0		1.266299e-05	2.506969e-05	7.553522e+04	1.161662e+04	7.744417e+02	1.161662e+04	0
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--- a/independent_test_harness/kernels.h
+++ b/independent_test_harness/kernels.h
@ -1,6 +1,8 @@
 #include <mkl_lapacke.h>
 #include <mkl.h>
 #define DIM 21 
 #define LDS 24   
 #define HAVE_CUBLAS_OFFLOAD
 #ifdef HAVE_CUBLAS_OFFLOAD
@ -9,7 +11,7 @@
 #include <cublas_v2.h>
 #endif
-lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
+lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
 int min(int a, int b);
@ -72,7 +74,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
                          Slater_inv,
                          double *__restrict determinant);
-void detupd(const uint64_t Dim, const uint64_t LDS,
+void detupd(const uint64_t Dim, const uint64_t Lds,
            const double *__restrict __attribute__((aligned(8))) Updates,
            const uint64_t *__restrict Updates_index, 
            double *__restrict __attribute__((aligned(8))) Slater_inv,
@ -84,4 +86,3 @@ uint32_t qmckl_sherman_morrison_later(
    const uint64_t *__restrict Updates_index, const double breakdown,
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant);
--- a/independent_test_harness/meuk.c
+++ b/independent_test_harness/meuk.c
@ -2,27 +2,27 @@
 #include <stdint.h>
 #include <assert.h>
-void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim) {
+void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) {
-  for (uint64_t i = 0; i < LDS * Dim; i++) {
+  for (uint64_t i = 0; i < Lds * Dim; i++) {
    printf("%f\n", A[i]);
  }
  printf("\n");
 }
-double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim) {
+double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
  double sum2 = 0;
-  for (uint64_t i = 0; i < LDS * Dim; i++) sum2 += A[i] * A[i];
+  for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
  return sum2;
 }
-double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
+double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
-  double sum2 = frobenius_norm2(A, LDS, Dim);
+  double sum2 = frobenius_norm2(A, Lds, Dim);
  return sqrt(sum2);
 }
-double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
+double max_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
  double largest = 0;
-  for (uint64_t i = 0; i < LDS * Dim; i++) {
+  for (uint64_t i = 0; i < Lds * Dim; i++) {
    double elm = A[i];
    double felm = fabs(elm);
    if (elm != elm) return -1.0; // Return a negative norm when NaN found
@ -31,9 +31,9 @@ double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
  return largest;
 }
-double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim) {
+double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim) {
-  double norm_A = frobenius_norm(A, LDS, Dim);
+  double norm_A = frobenius_norm(A, Lds, Dim);
-  double norm_Ainv = frobenius_norm(Ainv, LDS, Dim);
+  double norm_Ainv = frobenius_norm(Ainv, Lds, Dim);
  return fabs(norm_A) * fabs(norm_Ainv);
 }
@ -57,19 +57,19 @@ void read_double(hid_t file_id, const char *key, double *data) {
  assert(rc >= 0 && "H5Dclose");
 }
-void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
+void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
                          const uint64_t N_updates, const double *Updates,
                          const uint64_t *Updates_index, double *Slater) {
  for (uint32_t i = 0; i < N_updates; i++) {
    uint32_t col = Updates_index[i] - 1;
    for (uint32_t j = 0; j < Dim; j++) {
-      Slater[col * Dim + j]  += Updates[i * LDS + j];
+      Slater[col * Dim + j]  += Updates[i * Lds + j];
    }
  }
 }
-uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
+uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
                        double *Slater, const double tolerance) {
  double res[Dim*Dim];
@ -78,7 +78,7 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
    for (uint32_t j = 0; j < Dim; j++) {
      res[i * Dim + j] = 0;
      for (uint32_t k = 0; k < Dim; k++) {
-        res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * LDS + j];
+        res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
      }
    }
  }
@ -95,12 +95,12 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
  return 0;
 }
-void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim) {
+void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
  for (uint32_t i = 0; i < Dim; i++) {
    for (uint32_t j = 0; j < Dim; j++) {
      prod[i * Dim + j] = 0;
      for (uint32_t k = 0; k < Dim; k++) {
-        prod[i * Dim + j] += a[i * Dim + k] * b[k * LDS + j];
+        prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
      }
    }
  }
@ -121,7 +121,7 @@ void residual(double *a, double *res, const uint64_t Dim) {
  }
 }
-uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
+uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
                     const uint64_t N_updates, const double *Updates,
                     const uint64_t *Updates_index, const double breakdown, const double tolerance,
                     double *Slater, double *Slater_inv, double *determinant) {
@ -129,40 +129,40 @@ uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
  // if (version[0] == 'a') { // Anthony
  //   const double *Upds;
  //   const uint64_t *Ui;
-  //   for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] *= *determinant;
+  //   for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
  //   for (int j = 0; j < N_updates; j++) {
-  //     Upds = &Updates[j * LDS];
+  //     Upds = &Updates[j * Lds];
  //     Ui = &Updates_index[j];
-  //     detupd(Dim, LDS, Upds, Ui, Slater_inv, determinant);
+  //     detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
  //     if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
  //   }
-  //   for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] /= *determinant;
+  //   for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
-  //   update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
+  //   update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
-  //   rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
+  //   rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
  //   if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
  // } else if (version[0] == 'n') { // Naive
  if (version[0] == 'n') { // Naive
-    rc = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
+    rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
                                breakdown, Slater_inv, determinant);
    if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison failed\n");
-    update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
+    update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
-    rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
+    rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
    if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
  } else if (version[0] == 's') { // Splitting
-    rc = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
+    rc = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
                                          Updates_index, breakdown, Slater_inv,
                                          determinant);
    if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_splitting failed\n");
-    update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
+    update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
-    rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
+    rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
    if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
  } else if (version[0] == 'b') { // Blocked
-    rc = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
+    rc = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
                                       Updates_index, breakdown, Slater_inv,
                                       determinant);
    if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_smw32s failed\n");
-    update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
+    update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
-    rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
+    rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
    if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
  }
  return rc;
--- a/independent_test_harness/meuk.h
+++ b/independent_test_harness/meuk.h
@ -12,13 +12,13 @@ typedef struct Error {
  uint64_t error;
 } Error;
-void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim);
+void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
 void residual(double *a, double *res, const uint64_t Dim);
-double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim);
+double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
-void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim);
+void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim);
-double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim);
+double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
-double max_norm(double *A, const uint64_t LDS, const uint64_t Dim);
+double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
-double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim);
+double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
 void read_uint(hid_t file_id, const char *key, uint64_t *data);
 void read_double(hid_t file_id, const char *key, double *data);
@ -28,16 +28,16 @@ static __inline__ uint64_t rdtsc(void) {
  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
 }
-void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
+void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
                          const uint64_t N_updates, const double *Updates,
                          const uint64_t *Updates_index, double *Slater);
-uint32_t  check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
+uint32_t  check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
                        double *Slater, const double tolerance);
 int32_t check_error_better(const double max, const double tolerance);
-uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
+uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
                     const uint64_t N_updates, const double *Updates,
                     const uint64_t *Updates_index, const double breakdown, const double tolerance,
                     double *Slater, double *Slater_inv, double *determinant);
--- a/independent_test_harness/mkl
+++ b/independent_test_harness/mkl
@ -1,9 +0,0 @@
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 41	15	0	0		0	0		0		2.278445e-12	8.167505e-12	7.553484e+04	1.771759e+04	1.181172e+03	1.771759e+04	0
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--- a/independent_test_harness/sm.c
+++ b/independent_test_harness/sm.c
@ -17,10 +17,10 @@ uint32_t qmckl_sherman_morrison(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {
-  const uint32_t Dim = 21;
+  const uint32_t Dim = DIM;
-  const uint32_t LDS = 24;
+  const uint32_t Lds = LDS;
-  double __attribute__((aligned(8))) C[Dim];
+  double __attribute__((aligned(8))) C[DIM];
  double __attribute__((aligned(8))) D[LDS];
  uint32_t l = 0;
@ -31,8 +31,8 @@ uint32_t qmckl_sherman_morrison(
      C[i] = 0.0;
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t j = 0; j < LDS; j++) {
+      for (uint32_t j = 0; j < Lds; j++) {
-        C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
+        C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
      }
    }
@ -51,17 +51,17 @@ uint32_t qmckl_sherman_morrison(
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t j = 0; j < LDS; j++) {
+    for (uint32_t j = 0; j < Lds; j++) {
-      D[j] = Slater_inv[cui * LDS + j]; // selecting proper column of v_l^T * S_inv
+      D[j] = Slater_inv[cui * Lds + j]; // selecting proper column of v_l^T * S_inv
    }
    // A^{-1} = A^{-1} - C x D / den
    for (uint32_t i = 0; i < Dim; i++) {
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t j = 0; j < LDS; j++) {
+      for (uint32_t j = 0; j < Lds; j++) {
        const double update = C[i] * D[j] * iden;
-        Slater_inv[i * LDS + j] -= update;
+        Slater_inv[i * Lds + j] -= update;
      }
    }
    l += 1;
@ -69,16 +69,6 @@ uint32_t qmckl_sherman_morrison(
  return 0;
 }
 uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
                          const double *__restrict __attribute__((aligned(8)))
                          Updates,
                          const uint64_t *__restrict Updates_index,
                          const double breakdown,
                          double *__restrict __attribute__((aligned(8)))
                          Slater_inv,
                          double *__restrict determinant) {
  const uint32_t Dim = 21;
  const uint32_t LDS = 24;
 /*
 COMPUTE S^{-1}P - CB^{-1}D  : Dim x LDS,
 where S^{-1}P               : Dim x LDS,
@ -88,20 +78,31 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
      P^TU                  : LDS x 2,
      V                     : 2 x Dim
 */
 uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
                          const double *__restrict __attribute__((aligned(8)))
                          Updates,
                          const uint64_t *__restrict Updates_index,
                          const double breakdown,
                          double *__restrict __attribute__((aligned(8)))
                          Slater_inv,
                          double *__restrict determinant) {
  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;
  const uint32_t row1 = (Updates_index[0] - 1);
  const uint32_t row2 = (Updates_index[1] - 1);
  // Compute C = (S^T)^{-1}U : Dim x 2
-  double __attribute__((aligned(8))) C[2 * Dim];
+  double __attribute__((aligned(8))) C[2 * DIM];
  for (uint32_t i = 0; i < Dim; i++) {
    C[i * 2] = 0;
    C[i * 2 + 1] = 0;
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t k = 0; k < LDS; k++) {
+    for (uint32_t k = 0; k < Lds; k++) {
-      C[i * 2]     += Slater_inv[i * LDS + k] * Updates[k];
+      C[i * 2]     += Slater_inv[i * Lds + k] * Updates[k];
-      C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
+      C[i * 2 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
    }
  }
@ -130,38 +131,28 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
  // tmp = B^{-1}D : 2 x LDS
  double __attribute__((aligned(8))) tmp[2 * LDS];
-  double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
+  double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
-  double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
+  double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
  #pragma ivdep
  #pragma vector aligned
-  for (uint32_t j = 0; j < LDS; j++) {
+  for (uint32_t j = 0; j < Lds; j++) {
    tmp[j]       = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
-    tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
+    tmp[Lds + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
  }
-  // Compute (S^T)^{-1} - C * tmp : Dim x LDS
+  // Compute (S^T)^{-1} - C * tmp : Dim x Lds
  for (uint32_t i = 0; i < Dim; i++) {
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t j = 0; j < LDS; j++) {
+    for (uint32_t j = 0; j < Lds; j++) {
-      Slater_inv[i * LDS + j] -= C[i * 2]     * tmp[j];
+      Slater_inv[i * Lds + j] -= C[i * 2]     * tmp[j];
-      Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j];
+      Slater_inv[i * Lds + j] -= C[i * 2 + 1] * tmp[Lds + j];
    }
  }
  return 0;
 }
 uint32_t  qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
                          const double *__restrict __attribute__((aligned(8)))
                          Updates,
                          const uint64_t *__restrict Updates_index,
                          const double breakdown,
                          double *__restrict __attribute__((aligned(8)))
                          Slater_inv,
                          double *__restrict determinant) {
  const uint32_t Dim = 21;
  const uint32_t LDS = 24;
 /*
 COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS,
 where S^T                     : Dim x LDS,
@ -171,23 +162,34 @@ uint32_t  qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
      U                       : LDS x 3,
      V                       : 3 x Dim
 */
 uint32_t  qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
                          const double *__restrict __attribute__((aligned(8)))
                          Updates,
                          const uint64_t *__restrict Updates_index,
                          const double breakdown,
                          double *__restrict __attribute__((aligned(8)))
                          Slater_inv,
                          double *__restrict determinant) {
  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;
  const uint32_t row1 = (Updates_index[0] - 1);
  const uint32_t row2 = (Updates_index[1] - 1);
  const uint32_t row3 = (Updates_index[2] - 1);
  // Compute C = (S^T)^{-1}U : Dim x 3
-  double __attribute__((aligned(8))) C[3 * Dim];
+  double __attribute__((aligned(8))) C[3 * DIM];
  for (uint32_t i = 0; i < Dim; i++) {
    C[i * 3] = 0;
    C[i * 3 + 1] = 0;
    C[i * 3 + 2] = 0;
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t k = 0; k < LDS; k++) {
+    for (uint32_t k = 0; k < Lds; k++) {
-      C[i * 3] += Slater_inv[i * LDS + k] * Updates[k];
+      C[i * 3] += Slater_inv[i * Lds + k] * Updates[k];
-      C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
+      C[i * 3 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
-      C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k];
+      C[i * 3 + 2] += Slater_inv[i * Lds + k] * Updates[2 * Lds + k];
    }
  }
@ -233,20 +235,20 @@ uint32_t  qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
  double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
  #pragma ivdep
  #pragma vector aligned
-  for (uint32_t j = 0; j < LDS; j++) {
+  for (uint32_t j = 0; j < Lds; j++) {
    tmp[j]           = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
-    tmp[LDS + j]     = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
+    tmp[Lds + j]     = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
-    tmp[2 * LDS + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
+    tmp[2 * Lds + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
  }
-  // Compute (S^T)^{-1} - C * tmp : Dim x LDS
+  // Compute (S^T)^{-1} - C * tmp : Dim x Lds
  for (uint32_t i = 0; i < Dim; i++) {
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t j = 0; j < LDS; j++) {
+    for (uint32_t j = 0; j < Lds; j++) {
-      Slater_inv[i * LDS + j] -= C[i * 3]     * tmp[j];
+      Slater_inv[i * Lds + j] -= C[i * 3]     * tmp[j];
-      Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j];
+      Slater_inv[i * Lds + j] -= C[i * 3 + 1] * tmp[Lds + j];
-      Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j];
+      Slater_inv[i * Lds + j] -= C[i * 3 + 2] * tmp[2 * Lds + j];
    }
  }
@ -273,15 +275,15 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
                          double *__restrict __attribute__((aligned(8))) Slater_inv,
                          double *__restrict determinant) {
-  const uint32_t Dim = 21;
+  const uint32_t Dim = DIM;
-  const uint32_t LDS = 24;
+  const uint32_t Lds = LDS;
  // Compute C = S^{-1} U : Dim x K : standard dgemm
-  double C[Dim * N_updates];
+  double C[DIM * N_updates];
  double alpha = 1.0, beta = 0.0;
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-              Dim, N_updates, LDS,
+              Dim, N_updates, Lds,
-              alpha, Slater_inv, LDS, Updates, LDS,
+              alpha, Slater_inv, Lds, Updates, Lds,
              beta, C, N_updates);
  // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
@ -290,7 +292,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
  for (uint32_t i = 0; i < N_updates; i++) {
    const uint32_t row = Updates_index[i] - 1;
    for (uint32_t j = 0; j < N_updates  ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
-    for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
+    for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
  }
  // Compute determinant by LU decomposition
@ -345,41 +347,34 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
          double *__restrict __attribute__((aligned(8))) Slater_inv,
          double *__restrict determinant) {
-  const uint32_t Dim = 21;
+  const uint32_t Dim = DIM;
-  const uint32_t LDS = 24;
+  const uint32_t Lds = LDS;
  // Compute C = S^{-1} U : Dim x K : standard dgemm
  // double C[Dim * N_updates];
  double *C = malloc(DIM * N_updates * sizeof(double));
  double alpha = 1.0, beta = 0.0;
  // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
  //             Dim, N_updates, Lds,
  //             alpha, Slater_inv, Lds, Updates, Lds,
  //             beta, C, N_updates);
  //cuBLAS initialization
  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
    fprintf(stdout, "cuBLAS initialization failed!\n");
    exit(EXIT_FAILURE);
  }
-
+  #pragma omp target enter data map(to:Slater_inv, Updates, C)
-  // Compute C = S^{-1} U : Dim x K : standard dgemm
+  #pragma omp target data use_device_ptr(Slater_inv, Updates, C)
-  double C[Dim * N_updates];
+  {
-  double alpha = 1.0, beta = 0.0;
+    int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
-
+                          Dim, N_updates, Lds,
-  // #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:M*N*walk_num], tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
+                          &alpha, Slater_inv, Lds, Updates, Lds,
-  // #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
+                          &beta, C, N_updates);
-  // {
+  }
-  //   for (int nw=0; nw < walk_num; ++nw) {
+  #pragma omp target exit data map(from:C)
  //     int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
  //                                     &(een_rescaled_e[nw*(cord_num+1)]),
  //                                     LDA, af,
  //                                     &(een_rescaled_n[bf*nw]),
  //                                     LDB, 0,
  //                                     &beta,
  //                                     &(tmp_c[nw*cord_num]),
  //                                     LDC, cf, cord_num);
  //   }
  // }
  // #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
  cublasDestroy(handle);
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
              Dim, N_updates, LDS,
              alpha, Slater_inv, LDS, Updates, LDS,
              beta, C, N_updates);
  // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
  // Construct D = V S^{-1} : K x LDS
@ -387,7 +382,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
  for (uint32_t i = 0; i < N_updates; i++) {
    const uint32_t row = Updates_index[i] - 1;
    for (uint32_t j = 0; j < N_updates  ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
-    for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
+    for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
  }
  // Compute determinant by LU decomposition
@ -443,8 +438,8 @@ uint32_t qmckl_slagel_splitting(
    uint64_t *__restrict later_index, uint64_t *__restrict later,
    double *__restrict determinant) {
-  const uint32_t LDS = 24;
+  const uint32_t Dim = DIM;
-  const uint32_t Dim = 21;
+  const uint32_t Lds = LDS;
  double __attribute__((aligned(8))) C[LDS];
  double __attribute__((aligned(8))) D[LDS];
@ -457,8 +452,8 @@ uint32_t qmckl_slagel_splitting(
      C[i] = 0.0;
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t j = 0; j < LDS; j++) {
+      for (uint32_t j = 0; j < Lds; j++) {
-        C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
+        C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
      }
    }
@ -474,8 +469,8 @@ uint32_t qmckl_slagel_splitting(
      // in later_updates
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t i = 0; i < LDS; i++) {
+      for (uint32_t i = 0; i < Lds; i++) {
-        later_updates[*later * LDS + i] = Updates[l * LDS + i] / 2.0;
+        later_updates[*later * Lds + i] = Updates[l * Lds + i] / 2.0;
        C[i] /= 2.0;
      }
      later_index[*later] = Updates_index[l];
@ -490,17 +485,17 @@ uint32_t qmckl_slagel_splitting(
    // D = v^T x S^{-1} : 1 x LDS
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t j = 0; j < LDS; j++) {
+    for (uint32_t j = 0; j < Lds; j++) {
-      D[j] = Slater_inv[cui * LDS + j];
+      D[j] = Slater_inv[cui * Lds + j];
    }
    // S^{-1} = S^{-1} - C x D / den
    for (uint32_t i = 0; i < Dim; i++) {
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t j = 0; j < LDS; j++) {
+      for (uint32_t j = 0; j < Lds; j++) {
        const double update = C[i] * D[j] * iden;
-        Slater_inv[i * LDS + j] -= update;
+        Slater_inv[i * Lds + j] -= update;
      }
    }
    l += 1;
@ -516,18 +511,18 @@ uint32_t qmckl_sherman_morrison_splitting(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {
-  const uint32_t Dim = 21;
+  const uint32_t Dim = DIM;
-  const uint32_t LDS = 24;
+  const uint32_t Lds = LDS;
  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
  uint64_t later_index[N_updates];
  uint64_t later = 0;
  // uint32_t rc;
-  (void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
+  (void) qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
                              breakdown, Slater_inv, later_updates, later_index,
                              &later, determinant);
-  // rc = qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
+  // rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
  //                             breakdown, Slater_inv, later_updates, later_index,
  //                             &later, determinant);
  // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
@ -535,11 +530,11 @@ uint32_t qmckl_sherman_morrison_splitting(
  if (later > 0) {
    recursive_calls++;
    // printf("Later > 0\n");
-    (void) qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
+    (void) qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
                                          later_index, breakdown, Slater_inv,
                                          determinant);
-    // rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
+    // rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
    //                                       later_index, breakdown, Slater_inv,
    //                                       determinant);
    // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
@ -555,8 +550,8 @@ uint32_t qmckl_sherman_morrison_smw32s(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {
-  const uint32_t Dim = 21;
+  const uint32_t Dim = DIM;
-  const uint32_t LDS = 24;
+  const uint32_t Lds = LDS;
  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
  uint64_t later_index[N_updates];
@ -564,31 +559,31 @@ uint32_t qmckl_sherman_morrison_smw32s(
  uint32_t rc;
  if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2
-    rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
+    rc = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
                  breakdown, Slater_inv, determinant);
    if (rc != 0) { // Send the entire block to slagel_splitting
      block_fail += 1;
      uint64_t l = 0;
-      rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates,
+      rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates,
                    Updates_index, breakdown, Slater_inv,
-                    later_updates + (LDS * later),
+                    later_updates + (Lds * later),
                    later_index + later, &l, determinant);
      later += l;
    }
-    rc = qmckl_woodbury_2(LDS, Dim, &Updates[2*LDS], &Updates_index[2],
+    rc = qmckl_woodbury_2(Lds, Dim, &Updates[2*Lds], &Updates_index[2],
                  breakdown, Slater_inv, determinant);
    if (rc != 0) { // Send the entire block to slagel_splitting
      block_fail += 1;
      uint64_t l = 0;
-      rc = qmckl_slagel_splitting(LDS, Dim, 2, &Updates[2*LDS],
+      rc = qmckl_slagel_splitting(Lds, Dim, 2, &Updates[2*Lds],
                    &Updates_index[2], breakdown, Slater_inv,
-                    later_updates + (LDS * later),
+                    later_updates + (Lds * later),
                    later_index + later, &l, determinant);
      later += l;
    }
    if (later > 0) {
      recursive_calls++;
-      rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
+      rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
                                            later_index, breakdown, Slater_inv,
                                            determinant);
    }
@ -600,21 +595,21 @@ uint32_t qmckl_sherman_morrison_smw32s(
  // Woodbury 3x3 kernel  
  uint32_t n_of_3blocks = N_updates / 3;
  uint32_t remainder = N_updates % 3;
-  uint32_t length_3block = 3 * LDS;
+  uint32_t length_3block = 3 * Lds;
  if (n_of_3blocks > 0) {
    for (uint32_t i = 0; i < n_of_3blocks; i++) {
      const double *Updates_3block = &Updates[i * length_3block];
      const uint64_t *Updates_index_3block = &Updates_index[i * 3];
-      rc = qmckl_woodbury_3(LDS, Dim, Updates_3block, Updates_index_3block,
+      rc = qmckl_woodbury_3(Lds, Dim, Updates_3block, Updates_index_3block,
                            breakdown, Slater_inv, determinant);
      if (rc != 0) { // Send the entire block to slagel_splitting
        // printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
        block_fail += 1;
        uint64_t l = 0;
-        rc = qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block,
+        rc = qmckl_slagel_splitting(Lds, Dim, 3, Updates_3block,
                                    Updates_index_3block, breakdown, Slater_inv,
-                                    later_updates + (LDS * later),
+                                    later_updates + (Lds * later),
                                    later_index + later, &l, determinant);
        // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
        later += l;
@ -626,15 +621,15 @@ uint32_t qmckl_sherman_morrison_smw32s(
  if (remainder == 2) {
    const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
    const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
-    rc = qmckl_woodbury_2(LDS, Dim, Updates_2block, Updates_index_2block,
+    rc = qmckl_woodbury_2(Lds, Dim, Updates_2block, Updates_index_2block,
                          breakdown, Slater_inv, determinant);
    if (rc != 0) { // Send the entire block to slagel_splitting
      // printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
      block_fail += 1;
      uint64_t l = 0;
-      rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block,
+      rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates_2block,
                                  Updates_index_2block, breakdown, Slater_inv,
-                                  later_updates + (LDS * later),
+                                  later_updates + (Lds * later),
                                  later_index + later, &l, determinant);
      // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
      later += l;
@ -647,9 +642,9 @@ uint32_t qmckl_sherman_morrison_smw32s(
    const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
    const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
    uint64_t l = 0;
-    rc = qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block,
+    rc = qmckl_slagel_splitting(Lds, Dim, 1, Updates_1block,
                                Updates_index_1block, breakdown, Slater_inv,
-                                later_updates + (LDS * later),
+                                later_updates + (Lds * later),
                                later_index + later, &l, determinant);
    // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
    later += l;
@ -658,7 +653,7 @@ uint32_t qmckl_sherman_morrison_smw32s(
  if (later > 0) {
    recursive_calls++;
    // printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n");
-    rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
+    rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
                                          later_index, breakdown, Slater_inv,
                                          determinant);
    // if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
@ -674,10 +669,10 @@ uint32_t qmckl_sherman_morrison_later(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {
-  const uint32_t Dim = 21;
+  const uint32_t Dim = DIM;
-  const uint32_t LDS = 24;
+  const uint32_t Lds = LDS;
-  double __attribute__((aligned(8))) C[Dim];
+  double __attribute__((aligned(8))) C[DIM];
  double __attribute__((aligned(8))) D[LDS];
  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
@ -693,8 +688,8 @@ uint32_t qmckl_sherman_morrison_later(
      C[i] = 0.0;
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t j = 0; j < LDS; j++) {
+      for (uint32_t j = 0; j < Lds; j++) {
-        C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
+        C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
      }
    }
@ -705,8 +700,8 @@ uint32_t qmckl_sherman_morrison_later(
      #pragma ivdep
      #pragma vector aligned
      // for (uint32_t i = 0; i < Dim; i++) {
-      for (uint32_t i = 0; i < LDS; i++) {
+      for (uint32_t i = 0; i < Lds; i++) {
-        later_updates[later * LDS + i] = Updates[l * LDS + i];
+        later_updates[later * Lds + i] = Updates[l * Lds + i];
      }
      later_index[later] = Updates_index[l];
      later++;
@ -720,17 +715,17 @@ uint32_t qmckl_sherman_morrison_later(
    // D = v^T x A^{-1}
    #pragma ivdep
    #pragma vector aligned
-    for (uint32_t j = 0; j < LDS; j++) {
+    for (uint32_t j = 0; j < Lds; j++) {
-      D[j] = Slater_inv[cui * LDS + j];
+      D[j] = Slater_inv[cui * Lds + j];
    }
    // S^{-1} = S^{-1} - C x D / den
    for (uint32_t i = 0; i < Dim; i++) {
      #pragma ivdep
      #pragma vector aligned
-      for (uint32_t j = 0; j < LDS; j++) {
+      for (uint32_t j = 0; j < Lds; j++) {
        const double update = C[i] * D[j] * iden;
-        Slater_inv[i * LDS + j] -= update;
+        Slater_inv[i * Lds + j] -= update;
      }
    }
    l += 1;
@ -741,7 +736,7 @@ uint32_t qmckl_sherman_morrison_later(
  }
  else if (later > 0) { // If some have failed, make a recursive call
    recursive_calls++;
-    (void) qmckl_sherman_morrison_later(LDS, Dim, later, later_updates,
+    (void) qmckl_sherman_morrison_later(Lds, Dim, later, later_updates,
                      later_index, breakdown, Slater_inv, determinant);
  }
--- a/independent_test_harness/test.c
+++ b/independent_test_harness/test.c
@ -1,6 +1,6 @@
 #include <stdint.h>
 #include "meuk.h"
 #include "cycles.h"
 #include <stdint.h>
 #define DATASET "dataset_329d_zeropadded_cm.hdf5"
 // #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
@ -23,11 +23,11 @@ int main(int argc, char **argv) {
  char slater_key[32];
  char slater_inv_key[32];
  char det_key[32];
-  const uint64_t Dim = 21;
+  const uint64_t Dim = DIM;
-  const uint64_t LDS = 24;
+  const uint64_t Lds = LDS;
  uint64_t N_updates;
-  double Slater[LDS * Dim ], SlaterT[LDS * Dim];
+  double Slater[LDS * DIM ], SlaterT[LDS * DIM];
-  double Slater_invT[LDS * Dim], Slater_invT_copy[LDS * Dim];
+  double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
  double determinant, determinant_copy;
  // SETUP TEST PARAMETERS
@ -52,7 +52,7 @@ printf("#-----------------------------------------------------------------------
    sprintf(det_key, "/cycle_%d/determinant", cycle);
    read_uint(file_id, nupds_key, &N_updates);
    uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
-    double *Updates = malloc(LDS * N_updates * sizeof(double));
+    double *Updates = malloc(Lds * N_updates * sizeof(double));
    read_uint(file_id, upd_idx_key, Updates_index);
    read_double(file_id, upds_key, Updates);
    read_double(file_id, slater_key, Slater);
@ -60,28 +60,28 @@ printf("#-----------------------------------------------------------------------
    read_double(file_id, det_key, &determinant);
    // Compute transpose of S. ST: 24 x 21
-    for (int i = 0; i < LDS; i++) {
+    for (int i = 0; i < Lds; i++) {
      for (int j = 0; j < Dim; j++) {
-        SlaterT[i * Dim + j] = Slater[j * LDS + i];
+        SlaterT[i * Dim + j] = Slater[j * Lds + i];
      }
    }
    // Convert repl. upds into additive upds.
    for (int i = 0; i < N_updates; i++) {
      int col = Updates_index[i] - 1;
-      for (int j = 0; j < LDS; j++) {
+      for (int j = 0; j < Lds; j++) {
-        Updates[i * LDS + j] -= SlaterT[col + j * Dim];
+        Updates[i * Lds + j] -= SlaterT[col + j * Dim];
      }
    }
    // 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
-    uint32_t err_inp = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
+    uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
    // Update Slater matrix
    for (int i = 0; i < N_updates; i++) {
      int col = Updates_index[i] - 1;
      for (int j = 0; j < Dim; j++) {
-        SlaterT[col + j * Dim] += Updates[i * LDS + j];
+        SlaterT[col + j * Dim] += Updates[i * Lds + j];
      }
    } // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
@ -98,7 +98,7 @@ printf("#-----------------------------------------------------------------------
    for (int rep = 0; rep < REPETITIONS; rep++) {
      // 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
-      memcpy(Slater_invT_copy, Slater_invT, LDS * Dim * sizeof(double));
+      memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double));
      determinant_copy = determinant;
      // ### CHOOSE A KERNEL:
@ -109,10 +109,10 @@ printf("#-----------------------------------------------------------------------
      //   err_break = 0;
-      //   for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
+      //   for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
      //   for (int i = 0; i < N_updates; i++) {
-      //     Upds = &Updates[i * LDS];
+      //     Upds = &Updates[i * Lds];
      //     Ui = &Updates_index[i];
      //     determinant_previous = determinant_copy;
@ -120,7 +120,7 @@ printf("#-----------------------------------------------------------------------
      //     uint64_t before = rdtsc();
      //     // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-      //     detupd(Dim, LDS, Upds, Ui, Slater_invT_copy, &determinant_copy);
+      //     detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
      //     // 3. FETCH FINISH TIME
      //     uint64_t after = rdtsc();
@ -137,9 +137,9 @@ printf("#-----------------------------------------------------------------------
      //   }
      //   if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
-      //     for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
+      //     for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
      //   } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
-      //     for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
+      //     for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
      //   }
      // } else if (version[0] == 'n') { // Naive
      if (version[0] == 'n') { // Naive
@ -147,7 +147,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates,
+        err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -161,7 +161,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_sherman_morrison_later(LDS, Dim, N_updates, Updates,
+        err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -175,7 +175,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
+        err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
              breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -190,7 +190,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
+        err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
              breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -205,7 +205,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates,
+        err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -220,7 +220,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_woodbury_k_cublas_offload(LDS, Dim, N_updates, Updates,
+        err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -235,7 +235,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
+        err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -249,7 +249,7 @@ printf("#-----------------------------------------------------------------------
        uint64_t before = rdtsc();
        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
+        err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);
        // 3. FETCH FINISH TIME
@ -260,7 +260,7 @@ printf("#-----------------------------------------------------------------------
      } else if (version[0] == 'm') { // LAPACK/MKL
        // Only send upper Dim x Dim part of matrix to lapack
-        double tmp[Dim*Dim];
+        double tmp[DIM *DIM];
        memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
        // 1. FETCH START TIME
@ -274,9 +274,9 @@ printf("#-----------------------------------------------------------------------
        // Copy elements of inverse back, adding 0-padding in "correct" place
        for (uint32_t i = 0; i < Dim; i++) {
-          for (uint32_t j = 0; j < LDS; j++) {
+          for (uint32_t j = 0; j < Lds; j++) {
-            if (j < Dim) Slater_invT_copy[i * LDS + j] = tmp[i * Dim + j];
+            if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
-            else Slater_invT_copy[i * LDS + j] = 0.0;
+            else Slater_invT_copy[i * Lds + j] = 0.0;
          }
        }
@ -290,7 +290,7 @@ printf("#-----------------------------------------------------------------------
    } // END OF REPETITIONS LOOP
    // 4. COPY RESULT BACK TO ORIGINAL 
-    memcpy(Slater_invT, Slater_invT_copy, LDS * Dim * sizeof(double));
+    memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
    determinant = determinant_copy;
    // At this point Slater_invT contains the correct inverse matrix
@ -306,20 +306,20 @@ printf("#-----------------------------------------------------------------------
    // CUMULATIVE RESULT FOR THE ENTIRE DATASET
    cumulative += accumulator;
-    double SSi[Dim * Dim];
+    double SSi[DIM * DIM];
-    matmul(SlaterT, Slater_invT, SSi, LDS, Dim);
+    matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
-    double Res[Dim * Dim];
+    double Res[DIM * DIM];
    residual(SSi, Res, Dim);
    const double max = max_norm(Res, Dim, Dim);
    // 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
-    uint32_t err_out = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
+    uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
    // int32_t err_out = check_error_better(max, tolerance);
    // if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
    // 8. COMPUTE CONDITION NUMBER
-    const double condnr = condition_number(Slater, Slater_invT, LDS, Dim);
+    const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
    const double frob = frobenius_norm(Res, Dim, Dim);
--- a/independent_test_harness/three
+++ b/independent_test_harness/three
@ -1,9 +0,0 @@
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 41	15	0	0		1	0		0		3.438043e+01	9.578187e+01	2.792190e+04	6.417383e+02	4.278255e+01	6.417383e+02	0
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 #1	2	3	4		5	6		7		8		9		10		11		12		13		14
 #CYCLE	UPDS	ERR_IN	ERR_BREAK	ERR_OUT	SPLITS		BLK_FAILS	MAX		FROB		COND		CPU_CYC		CPU_CYC/UPD	CUMUL		REC
 #----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------