From 00bdcba2301c78f1b93a8ae4ce2891c1b241fea7 Mon Sep 17 00:00:00 2001
From: Francois Coppens <francois.coppens@uvsq.fr>
Date: Thu, 22 Sep 2022 14:37:00 +0200
Subject: [PATCH] cuBLAS version of Woodbury KxK is working, but called to
 lapacke dgetrf/ri need to be replaced with cuSOLVER calls to eliminate
 intermediate results to be transfered to/from device.

---
 independent_test_harness/Makefile             |  12 +-
 .../cblasdgemm_vs_cublasdgemm_test.c          |  27 +-
 independent_test_harness/debug.h              |  22 +-
 independent_test_harness/kernels.h            |   3 +-
 independent_test_harness/meuk.h               |   2 +-
 independent_test_harness/sm.c                 | 251 ++++++++----------
 independent_test_harness/test.c.old           |  92 -------
 independent_test_harness/test.c.older         |  62 -----
 8 files changed, 157 insertions(+), 314 deletions(-)
 delete mode 100644 independent_test_harness/test.c.old
 delete mode 100644 independent_test_harness/test.c.older

diff --git a/independent_test_harness/Makefile b/independent_test_harness/Makefile
index 97d2a17..fc0c88e 100644
--- a/independent_test_harness/Makefile
+++ b/independent_test_harness/Makefile
@@ -1,11 +1,15 @@
-FC = ifx
-CC = nvc
+#FC = ifx
+#CC = nvc
 
 CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu
 
-LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
+INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
+
+LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
 LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
-LDFLAGS+=-lcublas -mp -target=gpu
+LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -mp -target=gpu
+
+all: test
 
 ## Link with icc
 # test: sm.o test.o detupdate21.o meuk.o
diff --git a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
index b190518..bee638c 100644
--- a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
+++ b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
@@ -1,18 +1,21 @@
 /*
 Compile with:
 
-nvc -L${MKLROOT}/lib/intel64 \
-    -lmkl_intel_lp64 \
-    -lmkl_sequential \
-    -lmkl_core \
-    -lpthread \
-    -lm \
-    -ldl \
-    -lcublas \
-    -mp \
-    -target=gpu \
-    cblasdgemm_vs_cublasdgemm_test.c \
-    -o cblasdgemm_vs_cublasdgemm_test
+nvc \
+-I$NV_CUDA_MATH_PATH/11.7/include \
+-L$NV_CUDA_MATH_PATH/11.7/lib64 \
+-L${MKLROOT}/lib/intel64 \
+-lmkl_intel_lp64 \
+-lmkl_sequential \
+-lmkl_core \
+-lpthread \
+-lm \
+-ldl \
+-lcublas \
+-mp \
+-target=gpu \
+cblasdgemm_vs_cublasdgemm_test.c \
+-o cblasdgemm_vs_cublasdgemm_test
 
 */
 
diff --git a/independent_test_harness/debug.h b/independent_test_harness/debug.h
index 378b8e4..4ec98fa 100644
--- a/independent_test_harness/debug.h
+++ b/independent_test_harness/debug.h
@@ -1,11 +1,29 @@
-void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+void print_dm(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
 {
   printf("%s = \n", name);
   for (uint16_t i = 0; i < m; ++i)
   {
     for (uint16_t j = 0; j < n; ++j)
     {
-      printf("%11.5f ", mat[i * ldm + j]);
+      printf("%9.3f ", mat[i * ldm + j]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+void print_im(const int* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
+{
+  printf("%s = \n", name);
+  for (uint16_t i = 0; i < m; ++i)
+  {
+    for (uint16_t j = 0; j < n; ++j)
+    {
+      printf("%d ", mat[i * ldm + j]);
     }
     printf("\n");
   }
diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h
index c1b090c..95fc5b1 100644
--- a/independent_test_harness/kernels.h
+++ b/independent_test_harness/kernels.h
@@ -7,8 +7,9 @@
 
 #ifdef HAVE_CUBLAS_OFFLOAD
 #include <stdio.h>
-#include <cuda_runtime.h>
+#include <omp.h>
 #include <cublas_v2.h>
+#include <cuda_runtime_api.h>
 #endif
 
 lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
diff --git a/independent_test_harness/meuk.h b/independent_test_harness/meuk.h
index 02432c0..5e9a9e0 100644
--- a/independent_test_harness/meuk.h
+++ b/independent_test_harness/meuk.h
@@ -4,7 +4,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "hdf5.h"
+#include <hdf5/serial/hdf5.h>
 #include "kernels.h"
 
 typedef struct Error {
diff --git a/independent_test_harness/sm.c b/independent_test_harness/sm.c
index 682aa32..cad422b 100644
--- a/independent_test_harness/sm.c
+++ b/independent_test_harness/sm.c
@@ -299,9 +299,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
 
   // Compute determinant by LU decomposition
   int ipiv[N_updates];
-  lapack_int ret;
-  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
+  (void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
+
   double det = 1.0;
   int j = 0;
   for (uint32_t i = 0; i < N_updates; i++) {
@@ -319,8 +318,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
   if (determinant) *determinant *= det;
 
   // Compute B^{-1} with explicit formula for K x K inversion
-  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
+  (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
 
   // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
   double tmp1[N_updates * LDS];
@@ -328,173 +326,146 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
               N_updates, LDS, N_updates,
               alpha, B, N_updates, D, LDS,
               beta, tmp1, LDS);
-  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas");
 
   // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
-  // alpha = -1.0, beta = 1.0;
-  // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-  //             Dim, LDS, N_updates,
-  //             alpha, C, N_updates, tmp1, LDS,
-  //             beta, Slater_inv, LDS);
-
-  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
-  cblas_dgemm(CblasRowMajor,
-              CblasNoTrans, CblasNoTrans,
+  alpha = -1.0, beta = 1.0;
+  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
               Dim, LDS, N_updates,
               alpha, C, N_updates, tmp1, LDS,
-              beta, tmp2, LDS);
-  print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas");
-
-  double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
-  for(int i = 0; i < DIM * LDS; ++i)
-  {
-    tmp3[i] = Slater_inv[i] - tmp2[i];
-  }
-  print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas");
-
-  for(int i = 0; i < DIM * LDS; ++i)
-  {
-    Slater_inv[i] = tmp3[i];
-  }
-
-  free(tmp2);
-  free(tmp3);
+              beta, Slater_inv, LDS);
 
   return 0;
 }
 
 #ifdef HAVE_CUBLAS_OFFLOAD
 uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
-          const uint64_t vLDS,
-          const uint64_t vDim,
-          const uint64_t N_updates,
-          const double *__restrict __attribute__((aligned(8))) Updates,
-          const uint64_t *__restrict Updates_index,
-          const double breakdown,
-          double *__restrict __attribute__((aligned(8))) Slater_inv,
-          double *__restrict determinant) {
-
+                                         const uint64_t vLDS,
+                                         const uint64_t vDim,
+                                         const uint64_t N_updates,
+                                         const double* Updates,
+                                         const uint64_t* Updates_index,
+                                         const double breakdown,
+                                         double* Slater_inv,
+                                         double* determinant)
+{
   const uint32_t Dim = DIM;
   const uint32_t Lds = LDS;
 
-  // Compute C = S^{-1} U : Dim x K : standard dgemm
-  double *C = calloc(1, DIM * N_updates * sizeof(double));
-  double alpha = 1.0f, beta = 0.0f;
-  #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates])
-  #pragma omp target data use_device_ptr(Slater_inv, Updates, C)
-  {
-    int cublasError = cublasDgemm(handle,
-                                  CUBLAS_OP_T, CUBLAS_OP_N,
-                                  15, 21, 24,
-                                  &alpha, Updates, 24, Slater_inv, 24, 
-                                  &beta, C, 15);
-  }
-  #pragma omp target exit data map(from:C[0:DIM*N_updates])
+  double alpha, beta;
+  int* ipiv = calloc(1, sizeof *ipiv * N_updates);
+  double* C  = calloc(1, sizeof *C * Dim * N_updates);
+  double* B  = calloc(1, sizeof *B * N_updates * N_updates);
+  double* D  = calloc(1, sizeof *D * N_updates * Lds);
+  double* T1 = calloc(1, sizeof *T1 * N_updates * Lds);
+  double* T2 = calloc(1, sizeof *T2 * Dim * Lds);
 
-  // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
-  // Construct D = V S^{-1} : K x LDS
-  double *B = calloc(1, N_updates * N_updates * sizeof(double));
-  double *D = calloc(1, N_updates * LDS * sizeof(double));
-  for (uint32_t i = 0; i < N_updates; i++) {
-    const uint32_t row = Updates_index[i] - 1;
-    for (uint32_t j = 0; j < N_updates  ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
-    for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
+  #pragma omp target enter data map(to:    Updates[0:Lds*N_updates], \
+                                           Updates_index[0:N_updates], \
+                                           Slater_inv[0:Dim*Lds]) \
+                                map(alloc: B[0:N_updates*N_updates], \
+                                           C[0:Dim*N_updates], \
+                                           D[0:N_updates*Lds], \
+                                           T1[0:N_updates*Lds], \
+                                           T2[0:Dim*Lds], \
+                                           ipiv[0:N_updates])
+
+  #pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE
+  {
+    alpha = 1.0f, beta = 0.0f;
+    (void) cublasDgemm(handle,
+                      CUBLAS_OP_T, CUBLAS_OP_N,
+                      N_updates, Dim, Lds,
+                      &alpha, Updates, Lds, Slater_inv, Lds,
+                      &beta, C, N_updates);
   }
 
+  // Construct B = 1 + V C : K x K
+  // Construct D = V S^{-1} : K x LDS
+  #pragma omp target teams distribute parallel for // compute B, D ON DEVICE
+  for (uint32_t i = 0; i < N_updates; i++)
+  {
+    const uint32_t row = Updates_index[i] - 1;
+    for (uint32_t j = 0; j < N_updates  ; j++)
+    {
+      B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
+    }
+    for (uint32_t j = 0; j < Lds; j++)
+    {
+      D[i * Lds + j] = Slater_inv[row * Lds + j];
+    }
+  }
+  #pragma omp target update from(B[0:N_updates*N_updates]) // Update B ON HOST
+
   // Compute determinant by LU decomposition
-  int ipiv[N_updates];
-  lapack_int ret;
-  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
-  double det = 1.0;
-  int j = 0;
-  for (uint32_t i = 0; i < N_updates; i++) {
+  (void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
+  #pragma omp target update to(B[0:N_updates*N_updates], ipiv[0:N_updates]) // Update B ON DEVICE
+
+  double det = 1.0f; uint32_t j = 0;
+  #pragma omp target teams distribute parallel for // compute det ON DEVICE
+  for (uint32_t i = 0; i < N_updates; i++)
+  {
     j += min(ipiv[i] - i, 1);
     det *= B[(N_updates + 1) * i]; // update determinant
   }
   if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
 
   // Check if determinant of B is not too close to zero
-  if (fabs(det) < breakdown) {
-    return 1;
-  }
+  if (fabs(det) < breakdown) return det;
 
   // Update det(Slater) if passed
   if (determinant) *determinant *= det;
 
-  // Compute B^{-1} with explicit formula for K x K inversion
-  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
+  // Compute B^{-1}
+  (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST
+  #pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE
 
-  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
-  double *tmp1 = calloc(1, N_updates * LDS * sizeof(double));
-  #pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS])
-  #pragma omp target data use_device_ptr(D, B, tmp1)
+  // T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
+  #pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE
   {
-    int cublasError = cublasDgemm(handle,
-                                  CUBLAS_OP_N, CUBLAS_OP_N,
-                                  LDS, N_updates, N_updates,
-                                  &alpha, D, LDS, B, N_updates, 
-                                  &beta, tmp1, LDS);
-  }
-  #pragma omp target exit data map(from:tmp1[0:N_updates*LDS])
-  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas");
-
-  // Compute tmp2 = C * tmp1 : Dim x LDS
-  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
-  #pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS])
-  #pragma omp target data use_device_ptr(tmp1, C, tmp2)
-  {
-    int cublasError = cublasDgemm(handle,
-                                  CUBLAS_OP_N, CUBLAS_OP_N,
-                                  LDS, Dim, N_updates,
-                                  &alpha, tmp1, LDS, C, N_updates,
-                                  &beta, tmp2, LDS);
-  }
-  #pragma omp target exit data map(from:tmp2[0:DIM*LDS])
-  print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas");
-
-  // Compute tmp3 = S^{-1} - tmp2
-  double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); 
-  beta = -1.0f;
-  #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS])
-  #pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3)
-  {
-    int cublasError = cublasDgeam(handle,
-                                  CUBLAS_OP_N, CUBLAS_OP_N,
-                                  DIM, LDS,
-                                  &alpha, Slater_inv, LDS,
-                                  &beta, tmp2, LDS,
-                                  tmp3, LDS);
-  }
-  #pragma omp target exit data map(from:tmp3[0:DIM*LDS])
-  print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas");
-  for(int i = 0; i < DIM * LDS; ++i)
-  {
-    Slater_inv[i] = tmp3[i];
+    alpha = 1.0, beta = 0.0;
+    (void) cublasDgemm(handle,
+                      CUBLAS_OP_N, CUBLAS_OP_N,
+                      Lds, N_updates, N_updates,
+                      &alpha, D, Lds, B, N_updates,
+                      &beta, T1, Lds);
   }
 
-  // // Compute S^{-1} <- S^{-1} - tmp2
-  // beta = -1.0f;
-  // #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS])
-  // #pragma omp target data use_device_ptr(Slater_inv, tmp2)
-  // {
-  //   int cublasError = cublasDgeam(handle,
-  //                                 CUBLAS_OP_N, CUBLAS_OP_N,
-  //                                 DIM, LDS,
-  //                                 &alpha, Slater_inv, LDS,
-  //                                 &beta, tmp2, LDS,
-  //                                 Slater_inv, LDS);
-  // }
-  // #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS])
+  // Compute T2 <- C * T1 : Dim x LDS : standard dgemm
+  #pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ONM DEVICE
+  {
+    alpha = 1.0f, beta = 0.0f;
+    (void) cublasDgemm(handle,
+                      CUBLAS_OP_N, CUBLAS_OP_N,
+                      Dim, Lds, N_updates,
+                      &alpha, T1, Lds, C, N_updates,
+                      &beta, T2, Lds);
+  }
 
-  
+  // Compute S^{-1} <- S^{-1} - T2 : Dim x LDS : standard dgemm
+  #pragma omp target teams distribute parallel for // compute S^-1 ON DEVICE
+  for (uint32_t i = 0; i < Dim * Lds; i++)
+  {
+    Slater_inv[i] = Slater_inv[i] - T2[i];
+  }
+  #pragma omp target update from(Slater_inv[0:Dim*Lds]) // update S^-1 ON HOST
+
+  #pragma omp target exit data map(delete: Updates[0:Lds*N_updates], \
+                                           Updates_index[0:N_updates], \
+                                           Slater_inv[0:Dim*Lds], \
+                                           B[0:N_updates*N_updates], \
+                                           C[0:Dim*N_updates], \
+                                           D[0:N_updates*Lds], \
+                                           T1[0:N_updates*Lds], \
+                                           T2[0:Dim*Lds], \
+                                           ipiv[0:N_updates])
+
+//  free(ipiv);
   free(B);
   free(C);
   free(D);
-  free(tmp1);
-  free(tmp2);
-  // free(tmp3);
+  free(T1);
+  free(T2);
+
   return 0;
 }
 #endif
@@ -529,7 +500,7 @@ uint32_t qmckl_slagel_splitting(
 
     // Denominator
     const int cui = Updates_index[l] - 1;
-    double den = 1.0 + C[cui];    
+    double den = 1.0 + C[cui];
     // printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui);
     if (fabs(den) < breakdown) { // Here is decided to split the update, or not.
       // printf("Split! breakdown = %f\n", breakdown);
@@ -662,7 +633,7 @@ uint32_t qmckl_sherman_morrison_smw32s(
 
   // And for the other cases != 4, 6
   // Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
-  // Woodbury 3x3 kernel  
+  // Woodbury 3x3 kernel
   uint32_t n_of_3blocks = N_updates / 3;
   uint32_t remainder = N_updates % 3;
   uint32_t length_3block = 3 * Lds;
@@ -799,7 +770,7 @@ uint32_t qmckl_sherman_morrison_later(
       }
     }
     l += 1;
-  } 
+  }
 
   if (later == N_updates) { // If all the updates have failed, exit early with an error
     return 1;
diff --git a/independent_test_harness/test.c.old b/independent_test_harness/test.c.old
deleted file mode 100644
index a1c1cc2..0000000
--- a/independent_test_harness/test.c.old
+++ /dev/null
@@ -1,92 +0,0 @@
-#include <assert.h>
-#include "data_cm.h"
-#include "meuk.h"
-
-#define REPETITIONS 10000000
-int main(int argc, char **argv) {
-
-  assert(argc == 3);
-  char *version = argv[1];
-  char *number_of_updates = argv[2];
-  const uint64_t Dim = 21;
-  const uint64_t LDS = 24;
-  // const double breakdown = 1e-3;
-  const double breakdown = 1e-9; // this might be too small and cause NIs
-  uint32_t rc;
-
-  const uint64_t *N_updates;
-  const double *Updates;
-  const uint64_t *Updates_index;
-  double *Slater, *Slater_invT;
-  double determinant;
-  if (number_of_updates[0] == '2') { // 2 Updates
-    N_updates = &N_updates2;
-    Updates = &Updates2[0];
-    Updates_index = &Updates_index2[0];
-    Slater = &Slater2[0];
-    Slater_invT = &Slater_invT2[0]; // Slater_inv in QMC=Chem is actually its transpose
-    determinant = determinant2;
-  } else if (number_of_updates[0] == '3') { // 3 Updates
-    N_updates = &N_updates3;
-    Updates = &Updates3[0];
-    Updates_index = &Updates_index3[0];
-    Slater = &Slater3[0];
-    Slater_invT = &Slater_invT3[0];
-    determinant = determinant3;
-  } else if (number_of_updates[0] == '5') { // 5 Updates
-    N_updates = &N_updates5;
-    Updates = &Updates5[0];
-    Updates_index = &Updates_index5[0];
-    Slater = &Slater5[0];
-    Slater_invT = &Slater_invT5[0];
-    determinant = determinant5;
-  } else { // Exit
-    printf("Incorrect number of updates given\n");
-    return 1;
-  }
-  
-  rc = check_residual(LDS, Dim, Slater_invT, Slater);
-  assert(rc == 0 && "check_residual()");
-  rc = test_kernel(version, LDS, Dim, *N_updates, Updates, Updates_index,
-                   breakdown, Slater, Slater_invT, &determinant);
-  assert(rc == 0 && "test_kernel()");
-
-  // EVERYTHING WORKS UP UNTILL HERE
-
-  uint64_t before = rdtsc();
-  if (version[0] == 'a') { // Anthony
-    for (int i = 0; i < REPETITIONS; i++) {
-      const double* Upds;
-      const uint64_t* Ui;
-      for (int j = 0; j < *N_updates; j++) {
-        Upds = &Updates[j*LDS];
-        Ui = &Updates_index[j];
-        detupd(Dim, LDS, Upds, Ui, Slater_invT, &determinant);
-      }
-    }
-  } else if (version[0] == 'n') { // Naive
-    for (int i = 0; i < REPETITIONS; i++) {
-      rc = qmckl_sherman_morrison(LDS, Dim, *N_updates, Updates,
-            Updates_index, breakdown, Slater_invT, &determinant);
-      if (rc != 0) printf("qmckl_sherman_morrison failed\n");
-    }
-  } else if (version[0] == 's') { // Splitting
-    for (int i = 0; i < REPETITIONS; i++) {
-      rc = qmckl_sherman_morrison_splitting(LDS, Dim, *N_updates, Updates,
-            Updates_index, breakdown, Slater_invT, &determinant);
-      if (rc != 0) printf("qmckl_sherman_morrison_splitting failed\n");
-    }
-  } else if (version[0] == 'b') { // Blocked
-    for (int i = 0; i < REPETITIONS; i++) {
-      // rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
-      //                        breakdown, Slater_inv, &determinant);
-      // rc = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
-      //                        breakdown, Slater_inv, &determinant);
-      rc = qmckl_sherman_morrison_smw32s(LDS, Dim, *N_updates, Updates,
-            Updates_index, breakdown, Slater_invT, &determinant);
-      if (rc != 0) printf("qmckl_sherman_morrison_smw32s failed\n");
-    }
-  }
-  uint64_t after = rdtsc();
-  printf("cycles = %f\n", ((double)(after - before) / (double) REPETITIONS));
-}
diff --git a/independent_test_harness/test.c.older b/independent_test_harness/test.c.older
deleted file mode 100644
index 2ae7c24..0000000
--- a/independent_test_harness/test.c.older
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <assert.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-static __inline__ uint64_t rdtsc(void) {
-  unsigned hi, lo;
-  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
-}
-
-int qmckl_sherman_morrison(
-    const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates,
-    const double *__restrict __attribute__((aligned(8))) Updates,
-    const uint64_t *__restrict Updates_index, const double breakdown,
-    double *__restrict __attribute__((aligned(8))) Slater_inv,
-    double *__restrict determinant);
-
-int detupd(const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates,
-           const double *__restrict __attribute__((aligned(8))) Updates,
-           const uint64_t *__restrict Updates_index, const double breakdown,
-           double *__restrict __attribute__((aligned(8))) Slater_inv,
-           double *__restrict determinant);
-
-#define REPETITIONS 100000000
-int main(int argc, char **argv) {
-
-  assert(argc == 2);
-  char *version = argv[1];
-
-  const uint64_t Dim = 21;
-  const uint64_t LDS = 24;
-  const uint64_t N_updates = 1;
-  double Updates[LDS] __attribute__((aligned(8)));
-  uint64_t Updates_index[N_updates];
-  Updates_index[0] = 1;
-  const double breakdown = 1e-3;
-  double Slater_inv[LDS * Dim] __attribute__((aligned(8)));
-  double determinant = 1.0;
-
-  for (int i = 0; i < Dim; i++) {
-    Updates[i] = i;
-    for (int j = 0; j < Dim; j++) {
-      Slater_inv[LDS * i + j] = j;
-    }
-  }
-
-  uint64_t before = rdtsc();
-  if (version[0] == 'c') {
-    for (int i = 0; i < REPETITIONS; i++) {
-      detupd(LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv,
-             &determinant);
-    }
-  } else {
-    for (int i = 0; i < REPETITIONS; i++) {
-      qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
-                             breakdown, Slater_inv, &determinant);
-    }
-  }
-  uint64_t after = rdtsc();
-  printf("cycles = %f\n", ((double)(after - before) / (double)REPETITIONS));
-}