From 6bb95f068d2a47839676521b8e2d84fbd3c546d2 Mon Sep 17 00:00:00 2001
From: Francois Coppens <francois.coppens@uvsq.fr>
Date: Mon, 17 Oct 2022 14:56:32 +0200
Subject: [PATCH]  - Resrtructured tree  - Added NVTX annotations to GPU
 kernel.

---
 independent_test_harness/Makefile.icc_mkl.cpu |  7 +-
 .../Makefile.nvc_ompol.gpu                    | 12 +--
 independent_test_harness/debug.h              | 13 +--
 independent_test_harness/{meuk.c => helper.c} | 13 ++-
 independent_test_harness/{meuk.h => helper.h} |  3 +
 independent_test_harness/{sm.c => kernels.c}  | 88 +++++++++++++------
 independent_test_harness/kernels.h            |  4 +-
 independent_test_harness/nvtx.h               | 26 ++++++
 independent_test_harness/test.c               |  6 +-
 9 files changed, 121 insertions(+), 51 deletions(-)
 rename independent_test_harness/{meuk.c => helper.c} (97%)
 rename independent_test_harness/{meuk.h => helper.h} (96%)
 rename independent_test_harness/{sm.c => kernels.c} (93%)
 create mode 100644 independent_test_harness/nvtx.h

diff --git a/independent_test_harness/Makefile.icc_mkl.cpu b/independent_test_harness/Makefile.icc_mkl.cpu
index acdb9d5..c9faee1 100644
--- a/independent_test_harness/Makefile.icc_mkl.cpu
+++ b/independent_test_harness/Makefile.icc_mkl.cpu
@@ -6,12 +6,11 @@ LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -
 
 all: test_icc_mkl
 
-test_icc_mkl: sm.o test.o meuk.o
-	$(CC) $(LDFLAGS) -o test_icc_mkl sm.o test.o meuk.o
+test_icc_mkl: kernels.o test.o helper.o
+	$(CC) $(LDFLAGS) -o $@ $^
 
 %.o : %.c
 	$(CC) $(CFLAGS) $(INCLUDE) -c -o $@ $<
 
-clean:  
+clean:
 	rm -rf *.o *genmod* test_icc_mkl
-
diff --git a/independent_test_harness/Makefile.nvc_ompol.gpu b/independent_test_harness/Makefile.nvc_ompol.gpu
index 25b1539..12d82f9 100644
--- a/independent_test_harness/Makefile.nvc_ompol.gpu
+++ b/independent_test_harness/Makefile.nvc_ompol.gpu
@@ -1,10 +1,12 @@
 #FC = ifx
-#CC = nvc
+CC = nvc
 
-#CFLAGS=-std=c99 -O0 -Wall -g -DHAVE_CUBLAS_OFFLOAD -mp -target=gpu
-CFLAGS=-std=c99 -O3 -Wall -g -DHAVE_CUBLAS_OFFLOAD -mp -target=gpu
+#CFLAGS=-std=c99 -O0 -Wall -g -DHAVE_CUBLAS_OFFLOAD -DUSE_NVTX -mp -target=gpu
+CFLAGS=-std=c99 -O3 -Wall -g -DHAVE_CUBLAS_OFFLOAD -DUSE_NVTX -mp -target=gpu
 
 INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
+INCLUDE+=-I$(NVHPC_ROOT)/cuda/11.7/targets/x86_64-linux/include
+INCLUDE+=-I$(NVHPC_ROOT)/profilers/Nsight_Systems/target-linux-x64/nvtx/include
 
 LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
 LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
@@ -12,8 +14,8 @@ LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -lcusolver -mp -target=gpu
 
 all: test_nvc_ompol
 
-test_nvc_ompol: sm.o test.o meuk.o
-	$(CC) $(LDFLAGS) -o test_nvc_ompol sm.o test.o meuk.o
+test_nvc_ompol: kernels.o test.o helper.o
+	$(CC) $(LDFLAGS) -o $@ $^
 
 %.o : %.c
 	$(CC) $(CFLAGS) $(INCLUDE) -c -o $@ $<
diff --git a/independent_test_harness/debug.h b/independent_test_harness/debug.h
index f4ebe6f..a7da246 100644
--- a/independent_test_harness/debug.h
+++ b/independent_test_harness/debug.h
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -43,14 +45,3 @@ void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* na
   }
   printf("\n");
 }
-
-void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n)
-{
-  for(uint16_t i = 0; i < m; i++)
-  {
-    for( uint16_t j = 0; j < n; j++)
-    {
-      b[j * ldb + i] = a[i * lda + j];
-    }
-  }
-}
diff --git a/independent_test_harness/meuk.c b/independent_test_harness/helper.c
similarity index 97%
rename from independent_test_harness/meuk.c
rename to independent_test_harness/helper.c
index 9528cee..98b759b 100644
--- a/independent_test_harness/meuk.c
+++ b/independent_test_harness/helper.c
@@ -1,4 +1,4 @@
-#include "meuk.h"
+#include "helper.h"
 #include <stdint.h>
 #include <assert.h>
 
@@ -49,6 +49,17 @@ void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64
   }
 }
 
+void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n)
+{
+  for(uint16_t i = 0; i < m; i++)
+  {
+    for( uint16_t j = 0; j < n; j++)
+    {
+      b[j * ldb + i] = a[i * lda + j];
+    }
+  }
+}
+
 double get_determinant(uint32_t cycle, hid_t file_id) {
   char det_key[32];
   sprintf(det_key, "/cycle_%d/determinant", cycle);
diff --git a/independent_test_harness/meuk.h b/independent_test_harness/helper.h
similarity index 96%
rename from independent_test_harness/meuk.h
rename to independent_test_harness/helper.h
index b7b6815..8d36537 100644
--- a/independent_test_harness/meuk.h
+++ b/independent_test_harness/helper.h
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <math.h>
 #include <stdio.h>
 #include <stdint.h>
@@ -21,6 +23,7 @@ typedef struct Error {
 void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim);
 void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds);
 void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds);
+void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n);
 double get_determinant(uint32_t cycle, hid_t file_id);
 double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
 double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
diff --git a/independent_test_harness/sm.c b/independent_test_harness/kernels.c
similarity index 93%
rename from independent_test_harness/sm.c
rename to independent_test_harness/kernels.c
index 6878e25..28bf26c 100644
--- a/independent_test_harness/sm.c
+++ b/independent_test_harness/kernels.c
@@ -2,7 +2,7 @@
 #include <stdint.h>
 #include <stdbool.h>
 #include "kernels.h"
-//#include "debug.h"
+#include "nvtx.h"
 
 extern uint64_t n_splits;
 extern uint64_t block_fail;
@@ -366,16 +366,19 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
   int workspace_size = 0, *info = NULL;
   double* workspace = NULL;
   cusolverDnDgetrf_bufferSize(s_handle, N_updates, N_updates, B, N_updates, &workspace_size);
-  printf("SIZE OF CUSOLVER WORKSPACE: %d doubles of %lu byte = %lu byte\n", workspace_size, sizeof *workspace, sizeof *workspace * workspace_size);
+//  printf("SIZE OF CUSOLVER WORKSPACE: %d doubles of %lu byte = %lu byte\n", workspace_size, sizeof *workspace, sizeof *workspace * workspace_size);
   workspace = malloc(sizeof *workspace * workspace_size);
 
+  PUSH_RANGE("Transfer data TO GPU", 2)
   #pragma omp target enter data map(to: Updates[0:Lds*N_updates], \
                                         Updates_index[0:N_updates], \
                                         Slater_inv[0:Dim*Lds])
+  POP_RANGE
 
   // Compute C <- S^{-1} U : Dim x K : standard dgemm
   alpha = 1.0f, beta = 0.0f;
   #pragma omp target enter data map(alloc: C[0:Dim*N_updates])
+  PUSH_RANGE("Compute C = S^{-1} U", 3)
   #pragma omp target data use_device_ptr(Slater_inv, Updates, C)
   {
     (void) cublasDgemm_v2(b_handle,
@@ -385,31 +388,38 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
                           &beta, C, N_updates);
   }
   #pragma omp target exit data map(delete: Updates[0:Lds*N_updates])
+  POP_RANGE
 
   // Construct B <- 1 + V C : K x K
   #pragma omp target enter data map(alloc: B[0:N_updates*N_updates])
-  #pragma omp target teams distribute parallel for
-  for (uint32_t i = 0; i < N_updates; i++) {
-    const uint32_t row = Updates_index[i] - 1;
-    for (uint32_t j = 0; j < N_updates  ; j++) {
-      B[j * N_updates + i] = C[row * N_updates + j] + (i == j); // B NEEDS TO BE IN COL-MAJ FOR cusolverDnDgetrf !
+  PUSH_RANGE("Construct B = 1 + V C", 4)
+  #pragma omp target teams distribute parallel for collapse(2)
+  for (int i = 0; i < N_updates; ++i) {
+    for (int j = 0; j < N_updates; ++j) {
+      const uint32_t row = Updates_index[i] - 1;
+      B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
     }
   }
+  POP_RANGE
 
   // Compute det(B) via LU(B)
   #pragma omp target enter data map(alloc: workspace[0:workspace_size], pivot[0:N_updates])
+  PUSH_RANGE("Compute LU(B)", 3)
   #pragma omp target data use_device_ptr(B, workspace, pivot)
   {
-    (void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info);
+    (void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info); // col-maj enforced, so res. is LU(B)^T
   }
+  POP_RANGE
   #pragma omp target exit data map(delete: workspace[0:workspace_size])
   swap = false; j = 0; det = 1.0f;
+  PUSH_RANGE("Test det(B) and count LU swaps", 4)
   #pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det)
   for (uint32_t i = 0; i < N_updates; i++) {
     swap = (bool)(pivot[i] - (i + 1));     // swap = {0->false: no swap, >0->true: swap}
     j += (uint32_t)swap;                   // count # of swaps
     det *= B[i * (N_updates + 1)];         // prod. of diag elm. of B
   }
+  POP_RANGE
   if (fabs(det) < breakdown) return 1;  // check if determinant of B is too close to zero. If so, exit early.
   if (determinant) {                       // update det(Slater) if determinant != NULL
     if ((j & 1) != 0) det = -det;          // multiply det with -1 if # of swaps is odd
@@ -417,63 +427,89 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
   }
   
   // Compute B^{-1} : initialise as I for solving BX=I
+  PUSH_RANGE("Allocate Binv ON GPU", 2)
   #pragma omp target enter data map(alloc: Binv[0:N_updates*N_updates])
+  POP_RANGE
+  PUSH_RANGE("Construct B^{-1}=I", 4)
   #pragma omp target teams distribute parallel for collapse(2)
   for (int i = 0; i < N_updates; ++i) {
     for (int j = 0; j < N_updates; ++j) {
       Binv[i * N_updates + j] = (i == j);
     }
   }
+  POP_RANGE
+
   #pragma omp target data use_device_ptr(B, pivot, Binv)
   {
-    (void) cusolverDnDgetrs(s_handle, CUBLAS_OP_N, N_updates, N_updates, B, N_updates, pivot, Binv, N_updates, info);
+    PUSH_RANGE("Compute B^{-1}", 3)
+    (void) cusolverDnDgetrs(s_handle, CUBLAS_OP_T, N_updates, N_updates, B, N_updates, pivot, Binv, N_updates, info); // Needs op(B) = B^T because of line 403
+    POP_RANGE
   }
+  PUSH_RANGE("Deallocate B, pivot ON GPU", 2)
   #pragma omp target exit data map(delete: B[0:N_updates*N_updates], pivot[0:N_updates])
+  POP_RANGE
 
   // Construct D = V S^{-1} : K x LDS
+  PUSH_RANGE("Allocate D ON GPU", 2)
   #pragma omp target enter data map(alloc: D[0:N_updates*Lds])
-  #pragma omp target teams distribute parallel for
-  for (uint32_t i = 0; i < N_updates; i++) {
-    const uint32_t row = Updates_index[i] - 1;
-    for (uint32_t j = 0; j < Lds; j++) {
+  POP_RANGE
+  PUSH_RANGE("Construct D = V S^{-1}", 4)
+  #pragma omp target teams distribute parallel for collapse(2)
+  for (uint32_t i = 0; i < N_updates; ++i) {
+    for (uint32_t j = 0; j < Lds; ++j) {
+      const uint32_t row = Updates_index[i] - 1;
       D[i * Lds + j] = Slater_inv[row * Lds + j];
     }
   }
+  POP_RANGE
+  PUSH_RANGE("Deallocate Updates_index ON GPU", 2)
   #pragma omp target exit data map(delete: Updates_index[0:N_updates])
+  POP_RANGE
 
   // T1 <- B^{-1} D : KxLDS : standard dgemm
+  PUSH_RANGE("Allocate T1 ON GPU", 2)
   #pragma omp target enter data map(alloc: T1[0:N_updates*Lds])
+  POP_RANGE
   #pragma omp target data use_device_ptr(D, Binv, T1)
   {
+    PUSH_RANGE("Compute T1 <- B^{-1} D", 3)
     (void) cublasDgemm_v2(b_handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_T, // REMEMBER THIS IS Binv TRANSPOSED  because of cusolverDnDgetrs CALL ON l.434 !!!
                           Lds, N_updates, N_updates,
                           &alpha, D, Lds, Binv, N_updates,
                           &beta, T1, Lds);
+    POP_RANGE
   }
-  #pragma omp target exit data map(delete: D[0:N_updates*Lds], Binv[0:N_updates*N_updates])
 
-  // Compute T2 <- C * T1 : Dim x LDS : standard dgemm
-  #pragma omp target enter data map(alloc: T2[0:Dim*Lds])
-  #pragma omp target data use_device_ptr(T1, C, T2)
+  PUSH_RANGE("Deallocate D, Binv ON GPU", 2)
+  #pragma omp target exit data map(delete: D[0:N_updates*Lds], Binv[0:N_updates*N_updates])
+  POP_RANGE
+
+  // Compute S^{-1} <- S^{-1} - C * T1 : Dim x LDS : standard dgemm
+  alpha = -1.0f, beta = 1.0f;
+  #pragma omp target data use_device_ptr(T1, C, Slater_inv)
   {
+    PUSH_RANGE("Compute S^{-1} <- S^{-1} - C * T1", 3)
     (void) cublasDgemm_v2(b_handle,
                           CUBLAS_OP_N, CUBLAS_OP_N,
                           Dim, Lds, N_updates,
                           &alpha, T1, Lds, C, N_updates,
-                          &beta, T2, Lds);
+                          &beta, Slater_inv, Lds);
+    POP_RANGE
   }
+
+  PUSH_RANGE("Deallocate T1, C ON GPU", 2)
   #pragma omp target exit data map(delete: T1[0:N_updates*Lds], C[0:Dim*N_updates])
+  POP_RANGE
 
-  // Compute S^{-1} <- S^{-1} - T2 : Dim x LDS
-  #pragma omp target teams distribute parallel for
-  for (uint32_t i = 0; i < Dim * Lds; i++) {
-    Slater_inv[i] = Slater_inv[i] - T2[i];
-  }
-
+  PUSH_RANGE("Update Slater_inv FROM GPU", 2)
   #pragma omp target update from(Slater_inv[0:Dim*Lds])
-  #pragma omp target exit data map(delete: Slater_inv[0:Dim*Lds], T2[0:Dim*Lds])
+  POP_RANGE
+
+  PUSH_RANGE("Deallocate Slater_inv ON GPU", 2)
+  #pragma omp target exit data map(delete: Slater_inv[0:Dim*Lds])
+  POP_RANGE
 
   free(pivot);
   free(B);
@@ -481,7 +517,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
   free(C);
   free(D);
   free(T1);
-  free(T2);
+
   return 0;
 }
 #endif
diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h
index 6f85249..9ee9e49 100644
--- a/independent_test_harness/kernels.h
+++ b/independent_test_harness/kernels.h
@@ -1,7 +1,9 @@
+#pragma once
+
 #include <mkl_lapacke.h>
 #include <mkl.h>
 
-//#define HAVE_CUBLAS_OFFLOAD
+#define HAVE_CUBLAS_OFFLOAD
 
 #ifdef HAVE_CUBLAS_OFFLOAD
   #include <stdio.h>
diff --git a/independent_test_harness/nvtx.h b/independent_test_harness/nvtx.h
new file mode 100644
index 0000000..cadb422
--- /dev/null
+++ b/independent_test_harness/nvtx.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#define USE_NVTX
+
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+#include "nvtxDetail/nvtxLinkOnce.h"
+const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff };
+const int num_colors = sizeof(colors)/sizeof(uint32_t);
+#define PUSH_RANGE(name,cid) { \
+        int color_id = cid; \
+        color_id = color_id%num_colors;\
+        nvtxEventAttributes_t eventAttrib = {0}; \
+        eventAttrib.version = NVTX_VERSION; \
+        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+        eventAttrib.colorType = NVTX_COLOR_ARGB; \
+        eventAttrib.color = colors[color_id]; \
+        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+        eventAttrib.message.ascii = name; \
+        nvtxRangePushEx(&eventAttrib); \
+    }
+#define POP_RANGE nvtxRangePop();
+#else
+#define PUSH_RANGE(name,cid)
+#define POP_RANGE
+#endif
diff --git a/independent_test_harness/test.c b/independent_test_harness/test.c
index 2dfd39c..8c19836 100644
--- a/independent_test_harness/test.c
+++ b/independent_test_harness/test.c
@@ -1,7 +1,6 @@
 #include <stdint.h>
-#include "meuk.h"
+#include "helper.h"
 #include "cycles.h"
-#include "debug.h"
 
 #define DATASET "dataset"
 #define REPETITIONS 1
@@ -31,6 +30,7 @@ int main(int argc, char **argv) {
   for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) {
 
     // SETUP TEST PARAMETERS
+    const uint32_t GHz = 2800000000;
     const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
     const double tolerance = 0.001; // default = 0.001
     double cumulative = 0;
@@ -265,7 +265,7 @@ int main(int argc, char **argv) {
     free(Res);
 
     // 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
-    printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
+    printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%9.6f\t%9.6f\t%9.6f\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, (double)accumulator/GHz, (double)cycles_per_update/GHz, (double)cumulative/GHz, recursive_calls);
 
     free(Updates_index);
     free(Updates);