Improved version.

- All static arrays replaced by dynamic ones - All overhead induced by checking before and after running of the kernels replaced as much as possible with calls to MKL/DGEMMs. - Solved bugs due to dimension mismatches. Overhead time is dramatically reduced because no more calls to naive 'matmul'.
2025-01-13 06:28:35 +01:00 · 2022-10-01 18:54:18 +02:00 · 2022-10-01 18:54:18 +02:00 · bba5cf5f2c
commit bba5cf5f2c
parent 15f959099d
9 changed files with 252 additions and 217 deletions
--- a/independent_test_harness/Makefile
+++ b/independent_test_harness/Makefile
@ -1,6 +1,7 @@
 #FC = ifx
 #CC = nvc

+#CFLAGS=-std=c99 -O0 -Wall -g -mp -target=gpu
 CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu

 INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
--- a/independent_test_harness/bench_cycles/1_cycle.h
+++ b/independent_test_harness/bench_cycles/1_cycle.h
@ -0,0 +1,2 @@
+const uint32_t n_cycles = 1;
+uint32_t cycles[n_cycles] = {1};
--- a/independent_test_harness/check_result.sh
+++ b/independent_test_harness/check_result.sh
@ -1,5 +1,15 @@
 #!/bin/bash
-make
-rm -v blocked kay cu
-./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu 
+
+rm -v blocked kay cu em
+
+make && 
+	./test m > em && \
+	./test b > blocked && \
+	./test k > kay && \
+	./test c > cu && \
+	head -n 4 em && \
+	awk 'NR==5' em && \
+	awk 'NR==5' blocked && \
+	awk 'NR==5' kay && \
+	awk 'NR==5' cu

--- a/independent_test_harness/cycles.h
+++ b/independent_test_harness/cycles.h
@ -1 +1 @@
-cycles_329_dets/all_cycles.h
+bench_cycles/1_cycle.h
--- a/independent_test_harness/kernels.h
+++ b/independent_test_harness/kernels.h
@ -1,8 +1,6 @@
 #include <mkl_lapacke.h>
 #include <mkl.h>

-#define DIM 21 
-#define LDS 24   
 #define HAVE_CUBLAS_OFFLOAD

 #ifdef HAVE_CUBLAS_OFFLOAD
--- a/independent_test_harness/meuk.c
+++ b/independent_test_harness/meuk.c
@ -2,6 +2,89 @@
 #include <stdint.h>
 #include <assert.h>

+void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim) {
+  for (uint32_t i = 0; i < Dim; i++) {
+    for (uint32_t j = 0; j < Lds; j++) {
+      if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
+      else Slater_invT_copy[i * Lds + j] = 0.0;
+    }
+  }
+}
+
+void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds) {
+  for (int i = 0; i < nupds; i++) {
+    int col = ui[i] - 1;
+    for (int j = 0; j < Dim; j++) {
+      slaterT[col + j * Dim] += upds[i * Lds + j];
+    }
+  }
+}
+
+void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds) {
+  for (int i = 0; i < nupds; i++) {
+    int col = ui[i] - 1;
+    for (int j = 0; j < Lds; j++) {
+      upds[i * Lds + j] -= slaterT[col + j * Dim];
+    }
+  }
+}
+
+double get_determinant(uint32_t cycle, hid_t file_id) {
+  char det_key[32];
+  sprintf(det_key, "/cycle_%d/determinant", cycle);
+  double determinant;
+  read_double(file_id, det_key, &determinant);
+  return determinant;
+}
+
+double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) {
+  char slater_inv_key[32];
+  sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
+  double *slater_inv = malloc(sizeof *slater_inv * Dim * Lds);
+  read_double(file_id, slater_inv_key, slater_inv);
+  return slater_inv;
+}
+
+double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) {
+  char slater_key[32];
+  sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
+  double *slater = malloc(sizeof *slater * Dim * Lds);
+  read_double(file_id, slater_key, slater);
+  return slater;
+}
+
+double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds) {
+  char upds_key[32];
+  sprintf(upds_key, "/cycle_%d/updates", cycle);
+  double *upds = malloc(sizeof *upds * Lds * nupds);
+  read_double(file_id, upds_key, upds);
+  return upds;
+}
+
+uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds) {
+  char upd_idx_key[32];
+  sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
+  uint64_t* uis = malloc(sizeof *uis * nupds);
+  read_uint(file_id, upd_idx_key, uis);
+  return uis;
+}
+
+uint64_t get_dim(uint32_t cycle, hid_t file_id) {
+  char dim_key[32];
+  sprintf(dim_key, "/cycle_%d/slater_matrix_dim", cycle);
+  uint64_t Dim;
+  read_uint(file_id, dim_key, &Dim);
+  return Dim;
+}
+
+uint64_t get_nupdates(uint32_t cycle, hid_t file_id) {
+  char nupds_key[32];
+  sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
+  uint64_t N_updates;
+  read_uint(file_id, nupds_key, &N_updates);
+  return N_updates;
+}
+
 double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
  double sum2 = 0;
  for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
@ -65,16 +148,12 @@ void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
 uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
                        double *Slater, const double tolerance) {

-  double res[Dim*Dim];
-
-  for (uint32_t i = 0; i < Dim; i++) {
-    for (uint32_t j = 0; j < Dim; j++) {
-      res[i * Dim + j] = 0;
-      for (uint32_t k = 0; k < Dim; k++) {
-        res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
-      }
-    }
-  }
+  double* res = malloc(sizeof *res * Dim * Dim);
+  double alpha = 1.0, beta = 0.0;
+  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+              Dim, Dim, Dim,
+              alpha, Slater, Dim, Slater_invT, Lds,
+              beta, res, Dim);

  for (uint32_t i = 0; i < Dim; i++) {
    for (uint32_t j = 0; j < Dim; j++) {
@ -84,20 +163,20 @@ uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT
      if (i != j && fabs(elm) > tolerance) return 1;
    }
  }
-
+  free(res);
  return 0;
 }

-void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
-  for (uint32_t i = 0; i < Dim; i++) {
-    for (uint32_t j = 0; j < Dim; j++) {
-      prod[i * Dim + j] = 0;
-      for (uint32_t k = 0; k < Dim; k++) {
-        prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
-      }
-    }
-  }
-}
+//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
+//  for (uint32_t i = 0; i < Dim; i++) {
+//    for (uint32_t j = 0; j < Dim; j++) {
+//      prod[i * Dim + j] = 0;
+//      for (uint32_t k = 0; k < Dim; k++) {
+//        prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
+//      }
+//    }
+//  }
+//}

 int32_t check_error_better(const double max, const double tolerance) {
  if (max < 0) return -1; // When max was a NaN
@ -119,21 +198,7 @@ uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
                     const uint64_t *Updates_index, const double breakdown, const double tolerance,
                     double *Slater, double *Slater_inv, double *determinant) {
  uint32_t rc = 0;
-  // if (version[0] == 'a') { // Anthony
-  //   const double *Upds;
-  //   const uint64_t *Ui;
-  //   for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
-  //   for (int j = 0; j < N_updates; j++) {
-  //     Upds = &Updates[j * Lds];
-  //     Ui = &Updates_index[j];
-  //     detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
-  //     if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
-  //   }
-  //   for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
-  //   update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
-  //   rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
-  //   if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
-  // } else if (version[0] == 'n') { // Naive
+
  if (version[0] == 'n') { // Naive
    rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
                                breakdown, Slater_inv, determinant);
--- a/independent_test_harness/meuk.h
+++ b/independent_test_harness/meuk.h
@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <hdf5/serial/hdf5.h>
+
 #include "kernels.h"

 typedef struct Error {
@ -12,7 +13,18 @@ typedef struct Error {
  uint64_t error;
 } Error;

-void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
+void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim);
+void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds);
+void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds);
+double get_determinant(uint32_t cycle, hid_t file_id);
+double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
+double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
+double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds);
+uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds);
+uint64_t get_dim(uint32_t cycle, hid_t file_id);
+uint64_t get_nupdates(uint32_t cycle, hid_t file_id);
+
+//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
 void residual(double *a, double *res, const uint64_t Dim);
 double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
 double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
--- a/independent_test_harness/sm.c
+++ b/independent_test_harness/sm.c
@ -2,7 +2,7 @@
 #include <stdint.h>
 #include <stdbool.h>
 #include "kernels.h"
-#include "debug.h"
+//#include "debug.h"

 extern uint64_t n_splits;
 extern uint64_t block_fail;
@ -19,11 +19,11 @@ uint32_t qmckl_sherman_morrison(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

-  double __attribute__((aligned(8))) C[DIM];
-  double __attribute__((aligned(8))) D[LDS];
+  double __attribute__((aligned(8))) C[Dim];
+  double __attribute__((aligned(8))) D[Lds];

  uint32_t l = 0;
  // For each update
@ -48,7 +48,7 @@ uint32_t qmckl_sherman_morrison(
    double iden = 1.0 / den;

    // Update det(A)
-    if (!determinant)
+    if (determinant)
      *determinant *= den;

 #pragma ivdep
@ -89,14 +89,14 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
                          Slater_inv,
                          double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

  const uint32_t row1 = (Updates_index[0] - 1);
  const uint32_t row2 = (Updates_index[1] - 1);

  // Compute C = (S^T)^{-1}U : Dim x 2
-  double __attribute__((aligned(8))) C[2 * DIM];
+  double __attribute__((aligned(8))) C[2 * Dim];
  for (uint32_t i = 0; i < Dim; i++) {
    C[i * 2] = 0;
    C[i * 2 + 1] = 0;
@ -132,7 +132,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
  Binv[3] = idet * B0;

  // tmp = B^{-1}D : 2 x LDS
-  double __attribute__((aligned(8))) tmp[2 * LDS];
+  double __attribute__((aligned(8))) tmp[2 * Lds];
  double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
  double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
 #pragma ivdep
@ -173,15 +173,15 @@ uint32_t  qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
                           Slater_inv,
                           double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

  const uint32_t row1 = (Updates_index[0] - 1);
  const uint32_t row2 = (Updates_index[1] - 1);
  const uint32_t row3 = (Updates_index[2] - 1);

  // Compute C = (S^T)^{-1}U : Dim x 3
-  double __attribute__((aligned(8))) C[3 * DIM];
+  double __attribute__((aligned(8))) C[3 * Dim];
  for (uint32_t i = 0; i < Dim; i++) {
    C[i * 3] = 0;
    C[i * 3 + 1] = 0;
@ -231,10 +231,10 @@ uint32_t  qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
  Binv[8] =  (B0 * B4 - B3 * B1) * idet;

  // tmp = B^{-1}D : 3 x LDS
-  double __attribute__((aligned(8))) tmp[3 * LDS];
-  double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
-  double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
-  double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
+  double __attribute__((aligned(8))) tmp[3 * Lds];
+  double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
+  double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
+  double *__restrict r3dim = &(Slater_inv[row3 * Lds]);
 #pragma ivdep
 #pragma vector aligned
  for (uint32_t j = 0; j < Lds; j++) {
@ -277,11 +277,11 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
                          double *__restrict __attribute__((aligned(8))) Slater_inv,
                          double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

  // Compute C = S^{-1} U : Dim x K : standard dgemm
-  double *C = calloc(1, DIM * N_updates * sizeof(double));
+  double *C = calloc(1, Dim * N_updates * sizeof(double));
  double alpha = 1.0, beta = 0.0;
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
              Dim, N_updates, Lds,
@ -289,7 +289,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
              beta, C, N_updates);

  // Construct B = 1 + V C : K x K, construct D = V S^{-1} : K x LDS
-  double B[N_updates * N_updates], D[N_updates * LDS];
+  double* B = calloc(1, sizeof *B * N_updates * N_updates);
+  double* D = calloc(1, sizeof *D * N_updates * Lds);
  for (uint32_t i = 0; i < N_updates; i++) {
    const uint32_t row = Updates_index[i] - 1;
    for (uint32_t j = 0; j < N_updates  ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
@ -316,19 +317,23 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
  (void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, pivot);

  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
-  double tmp1[N_updates * LDS];
+  double* tmp1 = calloc(1, sizeof *tmp1 * N_updates * Lds);
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-              N_updates, LDS, N_updates,
-              alpha, B, N_updates, D, LDS,
-              beta, tmp1, LDS);
+              N_updates, Lds, N_updates,
+              alpha, B, N_updates, D, Lds,
+              beta, tmp1, Lds);

  // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
  alpha = -1.0, beta = 1.0;
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-              Dim, LDS, N_updates,
-              alpha, C, N_updates, tmp1, LDS,
-              beta, Slater_inv, LDS);
+              Dim, Lds, N_updates,
+              alpha, C, N_updates, tmp1, Lds,
+              beta, Slater_inv, Lds);

+  free(C);
+  free(B);
+  free(D);
+  free(tmp1);
  free(pivot);
  return 0;
 }
@ -344,10 +349,12 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
                                         double* Slater_inv,
                                         double* determinant)
 {
-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

-  double alpha, beta;
+  bool swap;
+  uint32_t j;
+  double alpha, beta, det;
  int* pivot   = calloc(1, sizeof *pivot * N_updates);
  double* C    = calloc(1, sizeof *C * Dim * N_updates);
  double* B    = calloc(1, sizeof *B * N_updates * N_updates);
@ -396,19 +403,22 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand

    // Compute determinant by LU decomposition
    (void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info);
-    bool swap = false; uint32_t j = 0; double det = 1.0f;
+    swap = false; j = 0; det = 1.0f;
    #pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det)
    for (uint32_t i = 0; i < N_updates; i++) {
      swap = (bool)(pivot[i] - (i + 1));     // swap = {0->false: no swap, >0->true: swap}
      j += (uint32_t)swap;                   // count # of swaps
      det *= B[i * (N_updates + 1)];         // prod. of diag elm. of B
    }
-    if (fabs(det) < breakdown) return 1;  // check if determinant of B is too close to zero. If so, exit early.
+  }
+    if (fabs(det) < breakdown) return 1;     // check if determinant of B is too close to zero. If so, exit early.
    if (determinant) {                       // update det(Slater) if determinant != NULL
      if ((j & 1) != 0) det = -det;          // multiply det with -1 if # of swaps is odd
      *determinant *= det;
    }
-
+  
+  #pragma omp target data use_device_ptr(Slater_inv, Updates, C, B, workspace, pivot, Binv, D, T1, T2)
+  {
    // Compute B^{-1} : initialise as I for solving BX=I
    #pragma omp target teams distribute parallel for
    for (int i = 0; i < N_updates; ++i) {
@ -471,11 +481,11 @@ uint32_t qmckl_slagel_splitting(
    uint64_t *__restrict later_index, uint64_t *__restrict later,
    double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

-  double __attribute__((aligned(8))) C[LDS];
-  double __attribute__((aligned(8))) D[LDS];
+  double __attribute__((aligned(8))) C[Lds];
+  double __attribute__((aligned(8))) D[Lds];

  uint32_t l = 0;
  // For each update
@ -513,7 +523,7 @@ uint32_t qmckl_slagel_splitting(
    } // From here onwards we continue with applying the first halve of the update to Slater_inv
    double iden = 1.0 / den;

-    if (!determinant) *determinant *= den;
+    if (determinant) *determinant *= den;

    // D = v^T x S^{-1} : 1 x LDS
 #pragma ivdep
@ -544,10 +554,10 @@ uint32_t qmckl_sherman_morrison_splitting(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

-  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
+  double __attribute__((aligned(8))) later_updates[Lds * N_updates];
  uint64_t later_index[N_updates];
  uint64_t later = 0;
  // uint32_t rc;
@ -583,10 +593,10 @@ uint32_t qmckl_sherman_morrison_smw32s(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

-  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
+  double __attribute__((aligned(8))) later_updates[Lds * N_updates];
  uint64_t later_index[N_updates];
  uint64_t later = 0;
  uint32_t rc;
@ -702,13 +712,13 @@ uint32_t qmckl_sherman_morrison_later(
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {

-  const uint32_t Dim = DIM;
-  const uint32_t Lds = LDS;
+  const uint32_t Dim = vDim;
+  const uint32_t Lds = vLDS;

-  double __attribute__((aligned(8))) C[DIM];
-  double __attribute__((aligned(8))) D[LDS];
+  double __attribute__((aligned(8))) C[Dim];
+  double __attribute__((aligned(8))) D[Lds];

-  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
+  double __attribute__((aligned(8))) later_updates[Lds * N_updates];
  uint64_t later_index[N_updates];
  uint64_t later = 0;

@ -743,7 +753,7 @@ uint32_t qmckl_sherman_morrison_later(
    }
    double iden = 1.0 / den;

-    if (!determinant) *determinant *= den;
+    if (determinant) *determinant *= den;

    // D = v^T x A^{-1}
 #pragma ivdep
--- a/independent_test_harness/test.c
+++ b/independent_test_harness/test.c
@ -1,9 +1,9 @@
-  #include <stdint.h>
+#include <stdint.h>
 #include "meuk.h"
 #include "cycles.h"
+#include "debug.h"

-#define DATASET "dataset_329d_zeropadded_cm.hdf5"
-// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
+#define DATASET "dataset"
 #define REPETITIONS 1

 uint64_t n_splits;
@ -27,77 +27,46 @@ int main(int argc, char **argv) {
  }
 #endif

-
  // SETUP STORAGE AND DATA ACCESS
-  hid_t  file_id;
-  file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
-  char nupds_key[32];
-  char upd_idx_key[32];
-  char upds_key[32];
-  char slater_key[32];
-  char slater_inv_key[32];
-  char det_key[32];
-  const uint64_t Dim = DIM;
-  const uint64_t Lds = LDS;
-  uint64_t N_updates;
-  double Slater[LDS * DIM ], SlaterT[LDS * DIM];
-  double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
-  double determinant, determinant_copy;
+  hid_t  file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);

-  // SETUP TEST PARAMETERS
-  const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
-  const double tolerance = 0.001; // default = 0.001
-  double cumulative = 0;
-
-printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
-printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
-printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
-printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
+  printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
+  printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
+  printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
+  printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
  // FOR EACH UPDATE CYCLE DO:
  // for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
-  for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) {  
+  for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) {
+
+    // SETUP TEST PARAMETERS
+    const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
+    const double tolerance = 0.001; // default = 0.001
+    double cumulative = 0;
+
    // 1. READ DATA FROM DATASET
    uint32_t cycle = cycles[cycles_index];
-    sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
-    sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
-    sprintf(upds_key, "/cycle_%d/updates", cycle);
-    sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
-    sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
-    sprintf(det_key, "/cycle_%d/determinant", cycle);
-    read_uint(file_id, nupds_key, &N_updates);
-    uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
-    double *Updates = malloc(Lds * N_updates * sizeof(double));
-    read_uint(file_id, upd_idx_key, Updates_index);
-    read_double(file_id, upds_key, Updates);
-    read_double(file_id, slater_key, Slater);
-    read_double(file_id, slater_inv_key, Slater_invT);
-    read_double(file_id, det_key, &determinant);
+    uint64_t Dim = get_dim(cycle, file_id);
+    uint64_t Lds = Dim;
+    uint64_t N_updates = get_nupdates(cycle, file_id);
+    uint64_t* Updates_index = get_upd_idcs(cycle, file_id, N_updates);
+    double* Updates = get_upds(cycle, file_id, N_updates, Lds);
+    double* Slater = get_slater(cycle, file_id, Dim, Lds);
+    double* Slater_invT = get_slater_inv(cycle, file_id, Dim, Lds);
+    double *Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds);
+    double determinant = get_determinant(cycle, file_id), determinant_copy;

    // Compute transpose of S. ST: 24 x 21
-    for (int i = 0; i < Lds; i++) {
-      for (int j = 0; j < Dim; j++) {
-        SlaterT[i * Dim + j] = Slater[j * Lds + i];
-      }
-    }
+    double *SlaterT = calloc(1, sizeof *SlaterT * Dim * Lds);
+    transpose(Slater, Lds, SlaterT, Dim, Dim, Lds);

    // Convert repl. upds into additive upds.
-    for (int i = 0; i < N_updates; i++) {
-      int col = Updates_index[i] - 1;
-      for (int j = 0; j < Lds; j++) {
-        Updates[i * Lds + j] -= SlaterT[col + j * Dim];
-      }
-    }
+    convert(Updates, N_updates, Updates_index, SlaterT, Dim, Lds);

    // 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
    uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);

    // Update Slater matrix
-    for (int i = 0; i < N_updates; i++) {
-      int col = Updates_index[i] - 1;
-      for (int j = 0; j < Dim; j++) {
-        SlaterT[col + j * Dim] += Updates[i * Lds + j];
-      }
-    } // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
+    update(SlaterT, Updates, Updates_index, N_updates, Dim, Lds);

    int32_t err_break;

@ -116,53 +85,13 @@ printf("#-----------------------------------------------------------------------
      determinant_copy = determinant;

      // ### CHOOSE A KERNEL:
-      // if (version[0] == 'a') { // Anthony
-      //   const double *Upds;
-      //   const uint64_t *Ui;
-      //   double determinant_previous;
-
-      //   err_break = 0;
-
-      //   for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
-
-      //   for (int i = 0; i < N_updates; i++) {
-      //     Upds = &Updates[i * Lds];
-      //     Ui = &Updates_index[i];
-      //     determinant_previous = determinant_copy;
-
-      //     // 1. FETCH START TIME
-      //     uint64_t before = rdtsc();
-
-      //     // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-      //     detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
-
-      //     // 3. FETCH FINISH TIME
-      //     uint64_t after = rdtsc();
-
-      //     // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
-      //     accumulator += (double)(after - before);
-
-      //     // 5. STOP APPLYING UPDATES IF BREAKDOWN DETECTED
-      //     double lambda = determinant_copy / determinant_previous; // should be id. to lambda in detupd
-      //     if (fabs(lambda) < breakdown) {
-      //       err_break = 1;
-      //       break;
-      //     }
-      //   }
-        
-      //   if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
-      //     for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
-      //   } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
-      //     for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
-      //   }
-      // } else if (version[0] == 'n') { // Naive
      if (version[0] == 'n') { // Naive
        // 1. FETCH START TIME
        uint64_t before = rdtsc();

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
-              Updates_index, breakdown, Slater_invT_copy, &determinant);
+                                           Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -176,7 +105,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
-              Updates_index, breakdown, Slater_invT_copy, &determinant);
+                                                 Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -190,7 +119,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
-              breakdown, Slater_invT_copy, &determinant);
+                                     breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -205,7 +134,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
-              breakdown, Slater_invT_copy, &determinant);
+                                     breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -220,7 +149,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
-              Updates_index, breakdown, Slater_invT_copy, &determinant);
+                                     Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -236,7 +165,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_woodbury_k_cublas_offload(handle, s_handle, Lds, Dim, N_updates, Updates,
-              Updates_index, breakdown, Slater_invT_copy, &determinant);
+                                                    Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -251,7 +180,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
-              Updates_index, breakdown, Slater_invT_copy, &determinant);
+                                                     Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -265,7 +194,7 @@ printf("#-----------------------------------------------------------------------

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
        err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
-              Updates_index, breakdown, Slater_invT_copy, &determinant);
+                                                  Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
        uint64_t after = rdtsc();
@ -275,8 +204,8 @@ printf("#-----------------------------------------------------------------------
      } else if (version[0] == 'm') { // LAPACK/MKL

        // Only send upper Dim x Dim part of matrix to lapack
-        double tmp[DIM *DIM];
-        memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
+        double *tmp = malloc(sizeof *tmp * Dim * Dim);
+        memcpy(tmp, SlaterT, sizeof *SlaterT * Dim * Dim);

        // 1. FETCH START TIME
        uint64_t before = rdtsc();
@ -288,12 +217,8 @@ printf("#-----------------------------------------------------------------------
        uint64_t after = rdtsc();

        // Copy elements of inverse back, adding 0-padding in "correct" place
-        for (uint32_t i = 0; i < Dim; i++) {
-          for (uint32_t j = 0; j < Lds; j++) {
-            if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
-            else Slater_invT_copy[i * Lds + j] = 0.0;
-          }
-        }
+        copy(Slater_invT_copy, Lds, tmp, Dim);
+        free(tmp);

        // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
        accumulator += (double)(after - before);
@ -302,9 +227,10 @@ printf("#-----------------------------------------------------------------------
        printf("Version '%c' not implemented.\n", version[0]);
        return 1;
      }
+
    } // END OF REPETITIONS LOOP

-    // 4. COPY RESULT BACK TO ORIGINAL 
+    // 4. COPY RESULT BACK TO ORIGINAL
    memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
    determinant = determinant_copy;
    // At this point Slater_invT contains the correct inverse matrix
@ -321,11 +247,19 @@ printf("#-----------------------------------------------------------------------
    // CUMULATIVE RESULT FOR THE ENTIRE DATASET
    cumulative += accumulator;

-    double SSi[DIM * DIM];
-    matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
-    double Res[DIM * DIM];
+    // Compute Slater x Slater_inv
+    double* SSi = malloc(sizeof *SSi * Dim * Dim);
+    double alpha = 1.0, beta = 0.0;
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                Dim, Dim, Dim,
+                alpha, SlaterT, Dim, Slater_invT, Lds,
+                beta, SSi, Dim);
+
+    // Compute residual matrix S * Sinv - Id
+    double* Res = calloc(1, sizeof *Res * Dim * Dim);
    residual(SSi, Res, Dim);
    const double max = max_norm(Res, Dim, Dim);
+    free(SSi);

    // 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
    uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
@ -336,13 +270,16 @@ printf("#-----------------------------------------------------------------------
    // 8. COMPUTE CONDITION NUMBER
    const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
    const double frob = frobenius_norm(Res, Dim, Dim);
-
+    free(Res);

    // 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
-        printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
-        
+    printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
+
    free(Updates_index);
    free(Updates);
+    free(SlaterT);
+    free(Slater_invT);
+    free(Slater_invT_copy);

  } // END OF CYCLE LOOP
  printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
@ -352,7 +289,7 @@ printf("#-----------------------------------------------------------------------

  (void) H5Fclose(file_id);

-#ifdef HAVE_CUBLAS_OFFLOAD  
+#ifdef HAVE_CUBLAS_OFFLOAD
  cublasDestroy(handle);
  cusolverDnDestroy(s_handle);
 #endif