cuBLAS version of Woodbury KxK is working, but called to lapacke dgetrf/ri need to be replaced with cuSOLVER calls to eliminate intermediate results to be transfered to/from device.

2024-12-23 21:04:10 +01:00 · 2022-09-22 14:37:00 +02:00 · 2022-09-22 14:37:00 +02:00 · 00bdcba230
commit 00bdcba230
parent 892358d0d1
8 changed files with 157 additions and 314 deletions
--- a/independent_test_harness/Makefile
+++ b/independent_test_harness/Makefile
@ -1,11 +1,15 @@
-FC = ifx
-CC = nvc
+#FC = ifx
+#CC = nvc

 CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu

-LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
+INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
+
+LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
 LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
-LDFLAGS+=-lcublas -mp -target=gpu
+LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -mp -target=gpu
+
+all: test

 ## Link with icc
 # test: sm.o test.o detupdate21.o meuk.o
--- a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
+++ b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
@ -1,18 +1,21 @@
 /*
 Compile with:

-nvc -L${MKLROOT}/lib/intel64 \
-    -lmkl_intel_lp64 \
-    -lmkl_sequential \
-    -lmkl_core \
-    -lpthread \
-    -lm \
-    -ldl \
-    -lcublas \
-    -mp \
-    -target=gpu \
-    cblasdgemm_vs_cublasdgemm_test.c \
-    -o cblasdgemm_vs_cublasdgemm_test
+nvc \
+-I$NV_CUDA_MATH_PATH/11.7/include \
+-L$NV_CUDA_MATH_PATH/11.7/lib64 \
+-L${MKLROOT}/lib/intel64 \
+-lmkl_intel_lp64 \
+-lmkl_sequential \
+-lmkl_core \
+-lpthread \
+-lm \
+-ldl \
+-lcublas \
+-mp \
+-target=gpu \
+cblasdgemm_vs_cublasdgemm_test.c \
+-o cblasdgemm_vs_cublasdgemm_test

 */

--- a/independent_test_harness/debug.h
+++ b/independent_test_harness/debug.h
@ -1,11 +1,29 @@
-void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+void print_dm(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
 {
  printf("%s = \n", name);
  for (uint16_t i = 0; i < m; ++i)
  {
    for (uint16_t j = 0; j < n; ++j)
    {
-      printf("%11.5f ", mat[i * ldm + j]);
+      printf("%9.3f ", mat[i * ldm + j]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+void print_im(const int* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
+{
+  printf("%s = \n", name);
+  for (uint16_t i = 0; i < m; ++i)
+  {
+    for (uint16_t j = 0; j < n; ++j)
+    {
+      printf("%d ", mat[i * ldm + j]);
    }
    printf("\n");
  }
--- a/independent_test_harness/kernels.h
+++ b/independent_test_harness/kernels.h
@ -7,8 +7,9 @@

 #ifdef HAVE_CUBLAS_OFFLOAD
 #include <stdio.h>
-#include <cuda_runtime.h>
+#include <omp.h>
 #include <cublas_v2.h>
+#include <cuda_runtime_api.h>
 #endif

 lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
--- a/independent_test_harness/meuk.h
+++ b/independent_test_harness/meuk.h
@ -4,7 +4,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "hdf5.h"
+#include <hdf5/serial/hdf5.h>
 #include "kernels.h"

 typedef struct Error {
--- a/independent_test_harness/sm.c
+++ b/independent_test_harness/sm.c
@ -299,9 +299,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,

  // Compute determinant by LU decomposition
  int ipiv[N_updates];
-  lapack_int ret;
-  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
+  (void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
+
  double det = 1.0;
  int j = 0;
  for (uint32_t i = 0; i < N_updates; i++) {
@ -319,8 +318,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
  if (determinant) *determinant *= det;

  // Compute B^{-1} with explicit formula for K x K inversion
-  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
+  (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);

  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
  double tmp1[N_updates * LDS];
@ -328,173 +326,146 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
              N_updates, LDS, N_updates,
              alpha, B, N_updates, D, LDS,
              beta, tmp1, LDS);
-  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas");

  // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
-  // alpha = -1.0, beta = 1.0;
-  // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-  //             Dim, LDS, N_updates,
-  //             alpha, C, N_updates, tmp1, LDS,
-  //             beta, Slater_inv, LDS);
-
-  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
-  cblas_dgemm(CblasRowMajor,
-              CblasNoTrans, CblasNoTrans,
+  alpha = -1.0, beta = 1.0;
+  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              Dim, LDS, N_updates,
              alpha, C, N_updates, tmp1, LDS,
-              beta, tmp2, LDS);
-  print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas");
-
-  double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
-  for(int i = 0; i < DIM * LDS; ++i)
-  {
-    tmp3[i] = Slater_inv[i] - tmp2[i];
-  }
-  print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas");
-
-  for(int i = 0; i < DIM * LDS; ++i)
-  {
-    Slater_inv[i] = tmp3[i];
-  }
-
-  free(tmp2);
-  free(tmp3);
+              beta, Slater_inv, LDS);

  return 0;
 }

 #ifdef HAVE_CUBLAS_OFFLOAD
 uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
-          const uint64_t vLDS,
-          const uint64_t vDim,
-          const uint64_t N_updates,
-          const double *__restrict __attribute__((aligned(8))) Updates,
-          const uint64_t *__restrict Updates_index,
-          const double breakdown,
-          double *__restrict __attribute__((aligned(8))) Slater_inv,
-          double *__restrict determinant) {
-
+                                         const uint64_t vLDS,
+                                         const uint64_t vDim,
+                                         const uint64_t N_updates,
+                                         const double* Updates,
+                                         const uint64_t* Updates_index,
+                                         const double breakdown,
+                                         double* Slater_inv,
+                                         double* determinant)
+{
  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;

-  // Compute C = S^{-1} U : Dim x K : standard dgemm
-  double *C = calloc(1, DIM * N_updates * sizeof(double));
-  double alpha = 1.0f, beta = 0.0f;
-  #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates])
-  #pragma omp target data use_device_ptr(Slater_inv, Updates, C)
-  {
-    int cublasError = cublasDgemm(handle,
-                                  CUBLAS_OP_T, CUBLAS_OP_N,
-                                  15, 21, 24,
-                                  &alpha, Updates, 24, Slater_inv, 24, 
-                                  &beta, C, 15);
-  }
-  #pragma omp target exit data map(from:C[0:DIM*N_updates])
+  double alpha, beta;
+  int* ipiv = calloc(1, sizeof *ipiv * N_updates);
+  double* C  = calloc(1, sizeof *C * Dim * N_updates);
+  double* B  = calloc(1, sizeof *B * N_updates * N_updates);
+  double* D  = calloc(1, sizeof *D * N_updates * Lds);
+  double* T1 = calloc(1, sizeof *T1 * N_updates * Lds);
+  double* T2 = calloc(1, sizeof *T2 * Dim * Lds);

-  // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
-  // Construct D = V S^{-1} : K x LDS
-  double *B = calloc(1, N_updates * N_updates * sizeof(double));
-  double *D = calloc(1, N_updates * LDS * sizeof(double));
-  for (uint32_t i = 0; i < N_updates; i++) {
-    const uint32_t row = Updates_index[i] - 1;
-    for (uint32_t j = 0; j < N_updates  ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
-    for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
+  #pragma omp target enter data map(to:    Updates[0:Lds*N_updates], \
+                                           Updates_index[0:N_updates], \
+                                           Slater_inv[0:Dim*Lds]) \
+                                map(alloc: B[0:N_updates*N_updates], \
+                                           C[0:Dim*N_updates], \
+                                           D[0:N_updates*Lds], \
+                                           T1[0:N_updates*Lds], \
+                                           T2[0:Dim*Lds], \
+                                           ipiv[0:N_updates])
+
+  #pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE
+  {
+    alpha = 1.0f, beta = 0.0f;
+    (void) cublasDgemm(handle,
+                      CUBLAS_OP_T, CUBLAS_OP_N,
+                      N_updates, Dim, Lds,
+                      &alpha, Updates, Lds, Slater_inv, Lds,
+                      &beta, C, N_updates);
  }

+  // Construct B = 1 + V C : K x K
+  // Construct D = V S^{-1} : K x LDS
+  #pragma omp target teams distribute parallel for // compute B, D ON DEVICE
+  for (uint32_t i = 0; i < N_updates; i++)
+  {
+    const uint32_t row = Updates_index[i] - 1;
+    for (uint32_t j = 0; j < N_updates  ; j++)
+    {
+      B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
+    }
+    for (uint32_t j = 0; j < Lds; j++)
+    {
+      D[i * Lds + j] = Slater_inv[row * Lds + j];
+    }
+  }
+  #pragma omp target update from(B[0:N_updates*N_updates]) // Update B ON HOST
+
  // Compute determinant by LU decomposition
-  int ipiv[N_updates];
-  lapack_int ret;
-  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
-  double det = 1.0;
-  int j = 0;
-  for (uint32_t i = 0; i < N_updates; i++) {
+  (void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
+  #pragma omp target update to(B[0:N_updates*N_updates], ipiv[0:N_updates]) // Update B ON DEVICE
+
+  double det = 1.0f; uint32_t j = 0;
+  #pragma omp target teams distribute parallel for // compute det ON DEVICE
+  for (uint32_t i = 0; i < N_updates; i++)
+  {
    j += min(ipiv[i] - i, 1);
    det *= B[(N_updates + 1) * i]; // update determinant
  }
  if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even

  // Check if determinant of B is not too close to zero
-  if (fabs(det) < breakdown) {
-    return 1;
-  }
+  if (fabs(det) < breakdown) return det;

  // Update det(Slater) if passed
  if (determinant) *determinant *= det;

-  // Compute B^{-1} with explicit formula for K x K inversion
-  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
-  if (ret != 0) return ret;
+  // Compute B^{-1}
+  (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST
+  #pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE

-  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
-  double *tmp1 = calloc(1, N_updates * LDS * sizeof(double));
-  #pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS])
-  #pragma omp target data use_device_ptr(D, B, tmp1)
+  // T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
+  #pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE
  {
-    int cublasError = cublasDgemm(handle,
-                                  CUBLAS_OP_N, CUBLAS_OP_N,
-                                  LDS, N_updates, N_updates,
-                                  &alpha, D, LDS, B, N_updates, 
-                                  &beta, tmp1, LDS);
-  }
-  #pragma omp target exit data map(from:tmp1[0:N_updates*LDS])
-  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas");
-
-  // Compute tmp2 = C * tmp1 : Dim x LDS
-  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
-  #pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS])
-  #pragma omp target data use_device_ptr(tmp1, C, tmp2)
-  {
-    int cublasError = cublasDgemm(handle,
-                                  CUBLAS_OP_N, CUBLAS_OP_N,
-                                  LDS, Dim, N_updates,
-                                  &alpha, tmp1, LDS, C, N_updates,
-                                  &beta, tmp2, LDS);
-  }
-  #pragma omp target exit data map(from:tmp2[0:DIM*LDS])
-  print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas");
-
-  // Compute tmp3 = S^{-1} - tmp2
-  double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); 
-  beta = -1.0f;
-  #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS])
-  #pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3)
-  {
-    int cublasError = cublasDgeam(handle,
-                                  CUBLAS_OP_N, CUBLAS_OP_N,
-                                  DIM, LDS,
-                                  &alpha, Slater_inv, LDS,
-                                  &beta, tmp2, LDS,
-                                  tmp3, LDS);
-  }
-  #pragma omp target exit data map(from:tmp3[0:DIM*LDS])
-  print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas");
-  for(int i = 0; i < DIM * LDS; ++i)
-  {
-    Slater_inv[i] = tmp3[i];
+    alpha = 1.0, beta = 0.0;
+    (void) cublasDgemm(handle,
+                      CUBLAS_OP_N, CUBLAS_OP_N,
+                      Lds, N_updates, N_updates,
+                      &alpha, D, Lds, B, N_updates,
+                      &beta, T1, Lds);
  }

-  // // Compute S^{-1} <- S^{-1} - tmp2
-  // beta = -1.0f;
-  // #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS])
-  // #pragma omp target data use_device_ptr(Slater_inv, tmp2)
-  // {
-  //   int cublasError = cublasDgeam(handle,
-  //                                 CUBLAS_OP_N, CUBLAS_OP_N,
-  //                                 DIM, LDS,
-  //                                 &alpha, Slater_inv, LDS,
-  //                                 &beta, tmp2, LDS,
-  //                                 Slater_inv, LDS);
-  // }
-  // #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS])
+  // Compute T2 <- C * T1 : Dim x LDS : standard dgemm
+  #pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ONM DEVICE
+  {
+    alpha = 1.0f, beta = 0.0f;
+    (void) cublasDgemm(handle,
+                      CUBLAS_OP_N, CUBLAS_OP_N,
+                      Dim, Lds, N_updates,
+                      &alpha, T1, Lds, C, N_updates,
+                      &beta, T2, Lds);
+  }

-  
+  // Compute S^{-1} <- S^{-1} - T2 : Dim x LDS : standard dgemm
+  #pragma omp target teams distribute parallel for // compute S^-1 ON DEVICE
+  for (uint32_t i = 0; i < Dim * Lds; i++)
+  {
+    Slater_inv[i] = Slater_inv[i] - T2[i];
+  }
+  #pragma omp target update from(Slater_inv[0:Dim*Lds]) // update S^-1 ON HOST
+
+  #pragma omp target exit data map(delete: Updates[0:Lds*N_updates], \
+                                           Updates_index[0:N_updates], \
+                                           Slater_inv[0:Dim*Lds], \
+                                           B[0:N_updates*N_updates], \
+                                           C[0:Dim*N_updates], \
+                                           D[0:N_updates*Lds], \
+                                           T1[0:N_updates*Lds], \
+                                           T2[0:Dim*Lds], \
+                                           ipiv[0:N_updates])
+
+//  free(ipiv);
  free(B);
  free(C);
  free(D);
-  free(tmp1);
-  free(tmp2);
-  // free(tmp3);
+  free(T1);
+  free(T2);
+
  return 0;
 }
 #endif
@ -529,7 +500,7 @@ uint32_t qmckl_slagel_splitting(

    // Denominator
    const int cui = Updates_index[l] - 1;
-    double den = 1.0 + C[cui];    
+    double den = 1.0 + C[cui];
    // printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui);
    if (fabs(den) < breakdown) { // Here is decided to split the update, or not.
      // printf("Split! breakdown = %f\n", breakdown);
@ -662,7 +633,7 @@ uint32_t qmckl_sherman_morrison_smw32s(

  // And for the other cases != 4, 6
  // Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
-  // Woodbury 3x3 kernel  
+  // Woodbury 3x3 kernel
  uint32_t n_of_3blocks = N_updates / 3;
  uint32_t remainder = N_updates % 3;
  uint32_t length_3block = 3 * Lds;
@ -799,7 +770,7 @@ uint32_t qmckl_sherman_morrison_later(
      }
    }
    l += 1;
-  } 
+  }

  if (later == N_updates) { // If all the updates have failed, exit early with an error
    return 1;
--- a/independent_test_harness/test.c.old
+++ b/independent_test_harness/test.c.old
@ -1,92 +0,0 @@
-#include <assert.h>
-#include "data_cm.h"
-#include "meuk.h"
-
-#define REPETITIONS 10000000
-int main(int argc, char **argv) {
-
-  assert(argc == 3);
-  char *version = argv[1];
-  char *number_of_updates = argv[2];
-  const uint64_t Dim = 21;
-  const uint64_t LDS = 24;
-  // const double breakdown = 1e-3;
-  const double breakdown = 1e-9; // this might be too small and cause NIs
-  uint32_t rc;
-
-  const uint64_t *N_updates;
-  const double *Updates;
-  const uint64_t *Updates_index;
-  double *Slater, *Slater_invT;
-  double determinant;
-  if (number_of_updates[0] == '2') { // 2 Updates
-    N_updates = &N_updates2;
-    Updates = &Updates2[0];
-    Updates_index = &Updates_index2[0];
-    Slater = &Slater2[0];
-    Slater_invT = &Slater_invT2[0]; // Slater_inv in QMC=Chem is actually its transpose
-    determinant = determinant2;
-  } else if (number_of_updates[0] == '3') { // 3 Updates
-    N_updates = &N_updates3;
-    Updates = &Updates3[0];
-    Updates_index = &Updates_index3[0];
-    Slater = &Slater3[0];
-    Slater_invT = &Slater_invT3[0];
-    determinant = determinant3;
-  } else if (number_of_updates[0] == '5') { // 5 Updates
-    N_updates = &N_updates5;
-    Updates = &Updates5[0];
-    Updates_index = &Updates_index5[0];
-    Slater = &Slater5[0];
-    Slater_invT = &Slater_invT5[0];
-    determinant = determinant5;
-  } else { // Exit
-    printf("Incorrect number of updates given\n");
-    return 1;
-  }
-  
-  rc = check_residual(LDS, Dim, Slater_invT, Slater);
-  assert(rc == 0 && "check_residual()");
-  rc = test_kernel(version, LDS, Dim, *N_updates, Updates, Updates_index,
-                   breakdown, Slater, Slater_invT, &determinant);
-  assert(rc == 0 && "test_kernel()");
-
-  // EVERYTHING WORKS UP UNTILL HERE
-
-  uint64_t before = rdtsc();
-  if (version[0] == 'a') { // Anthony
-    for (int i = 0; i < REPETITIONS; i++) {
-      const double* Upds;
-      const uint64_t* Ui;
-      for (int j = 0; j < *N_updates; j++) {
-        Upds = &Updates[j*LDS];
-        Ui = &Updates_index[j];
-        detupd(Dim, LDS, Upds, Ui, Slater_invT, &determinant);
-      }
-    }
-  } else if (version[0] == 'n') { // Naive
-    for (int i = 0; i < REPETITIONS; i++) {
-      rc = qmckl_sherman_morrison(LDS, Dim, *N_updates, Updates,
-            Updates_index, breakdown, Slater_invT, &determinant);
-      if (rc != 0) printf("qmckl_sherman_morrison failed\n");
-    }
-  } else if (version[0] == 's') { // Splitting
-    for (int i = 0; i < REPETITIONS; i++) {
-      rc = qmckl_sherman_morrison_splitting(LDS, Dim, *N_updates, Updates,
-            Updates_index, breakdown, Slater_invT, &determinant);
-      if (rc != 0) printf("qmckl_sherman_morrison_splitting failed\n");
-    }
-  } else if (version[0] == 'b') { // Blocked
-    for (int i = 0; i < REPETITIONS; i++) {
-      // rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
-      //                        breakdown, Slater_inv, &determinant);
-      // rc = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
-      //                        breakdown, Slater_inv, &determinant);
-      rc = qmckl_sherman_morrison_smw32s(LDS, Dim, *N_updates, Updates,
-            Updates_index, breakdown, Slater_invT, &determinant);
-      if (rc != 0) printf("qmckl_sherman_morrison_smw32s failed\n");
-    }
-  }
-  uint64_t after = rdtsc();
-  printf("cycles = %f\n", ((double)(after - before) / (double) REPETITIONS));
-}
--- a/independent_test_harness/test.c.older
+++ b/independent_test_harness/test.c.older
@ -1,62 +0,0 @@
-#include <assert.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-static __inline__ uint64_t rdtsc(void) {
-  unsigned hi, lo;
-  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
-}
-
-int qmckl_sherman_morrison(
-    const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates,
-    const double *__restrict __attribute__((aligned(8))) Updates,
-    const uint64_t *__restrict Updates_index, const double breakdown,
-    double *__restrict __attribute__((aligned(8))) Slater_inv,
-    double *__restrict determinant);
-
-int detupd(const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates,
-           const double *__restrict __attribute__((aligned(8))) Updates,
-           const uint64_t *__restrict Updates_index, const double breakdown,
-           double *__restrict __attribute__((aligned(8))) Slater_inv,
-           double *__restrict determinant);
-
-#define REPETITIONS 100000000
-int main(int argc, char **argv) {
-
-  assert(argc == 2);
-  char *version = argv[1];
-
-  const uint64_t Dim = 21;
-  const uint64_t LDS = 24;
-  const uint64_t N_updates = 1;
-  double Updates[LDS] __attribute__((aligned(8)));
-  uint64_t Updates_index[N_updates];
-  Updates_index[0] = 1;
-  const double breakdown = 1e-3;
-  double Slater_inv[LDS * Dim] __attribute__((aligned(8)));
-  double determinant = 1.0;
-
-  for (int i = 0; i < Dim; i++) {
-    Updates[i] = i;
-    for (int j = 0; j < Dim; j++) {
-      Slater_inv[LDS * i + j] = j;
-    }
-  }
-
-  uint64_t before = rdtsc();
-  if (version[0] == 'c') {
-    for (int i = 0; i < REPETITIONS; i++) {
-      detupd(LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv,
-             &determinant);
-    }
-  } else {
-    for (int i = 0; i < REPETITIONS; i++) {
-      qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
-                             breakdown, Slater_inv, &determinant);
-    }
-  }
-  uint64_t after = rdtsc();
-  printf("cycles = %f\n", ((double)(after - before) / (double)REPETITIONS));
-}