From 6bb95f068d2a47839676521b8e2d84fbd3c546d2 Mon Sep 17 00:00:00 2001 From: Francois Coppens Date: Mon, 17 Oct 2022 14:56:32 +0200 Subject: [PATCH] - Resrtructured tree - Added NVTX annotations to GPU kernel. --- independent_test_harness/Makefile.icc_mkl.cpu | 7 +- .../Makefile.nvc_ompol.gpu | 12 +-- independent_test_harness/debug.h | 13 +-- independent_test_harness/{meuk.c => helper.c} | 13 ++- independent_test_harness/{meuk.h => helper.h} | 3 + independent_test_harness/{sm.c => kernels.c} | 88 +++++++++++++------ independent_test_harness/kernels.h | 4 +- independent_test_harness/nvtx.h | 26 ++++++ independent_test_harness/test.c | 6 +- 9 files changed, 121 insertions(+), 51 deletions(-) rename independent_test_harness/{meuk.c => helper.c} (97%) rename independent_test_harness/{meuk.h => helper.h} (96%) rename independent_test_harness/{sm.c => kernels.c} (93%) create mode 100644 independent_test_harness/nvtx.h diff --git a/independent_test_harness/Makefile.icc_mkl.cpu b/independent_test_harness/Makefile.icc_mkl.cpu index acdb9d5..c9faee1 100644 --- a/independent_test_harness/Makefile.icc_mkl.cpu +++ b/independent_test_harness/Makefile.icc_mkl.cpu @@ -6,12 +6,11 @@ LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core - all: test_icc_mkl -test_icc_mkl: sm.o test.o meuk.o - $(CC) $(LDFLAGS) -o test_icc_mkl sm.o test.o meuk.o +test_icc_mkl: kernels.o test.o helper.o + $(CC) $(LDFLAGS) -o $@ $^ %.o : %.c $(CC) $(CFLAGS) $(INCLUDE) -c -o $@ $< -clean: +clean: rm -rf *.o *genmod* test_icc_mkl - diff --git a/independent_test_harness/Makefile.nvc_ompol.gpu b/independent_test_harness/Makefile.nvc_ompol.gpu index 25b1539..12d82f9 100644 --- a/independent_test_harness/Makefile.nvc_ompol.gpu +++ b/independent_test_harness/Makefile.nvc_ompol.gpu @@ -1,10 +1,12 @@ #FC = ifx -#CC = nvc +CC = nvc -#CFLAGS=-std=c99 -O0 -Wall -g -DHAVE_CUBLAS_OFFLOAD -mp -target=gpu -CFLAGS=-std=c99 -O3 -Wall -g -DHAVE_CUBLAS_OFFLOAD -mp -target=gpu +#CFLAGS=-std=c99 -O0 -Wall -g -DHAVE_CUBLAS_OFFLOAD -DUSE_NVTX -mp -target=gpu +CFLAGS=-std=c99 -O3 -Wall -g -DHAVE_CUBLAS_OFFLOAD -DUSE_NVTX -mp -target=gpu INCLUDE=-I$(NVHPC_ROOT)/math_libs/include +INCLUDE+=-I$(NVHPC_ROOT)/cuda/11.7/targets/x86_64-linux/include +INCLUDE+=-I$(NVHPC_ROOT)/profilers/Nsight_Systems/target-linux-x64/nvtx/include LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl @@ -12,8 +14,8 @@ LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -lcusolver -mp -target=gpu all: test_nvc_ompol -test_nvc_ompol: sm.o test.o meuk.o - $(CC) $(LDFLAGS) -o test_nvc_ompol sm.o test.o meuk.o +test_nvc_ompol: kernels.o test.o helper.o + $(CC) $(LDFLAGS) -o $@ $^ %.o : %.c $(CC) $(CFLAGS) $(INCLUDE) -c -o $@ $< diff --git a/independent_test_harness/debug.h b/independent_test_harness/debug.h index f4ebe6f..a7da246 100644 --- a/independent_test_harness/debug.h +++ b/independent_test_harness/debug.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -43,14 +45,3 @@ void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* na } printf("\n"); } - -void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n) -{ - for(uint16_t i = 0; i < m; i++) - { - for( uint16_t j = 0; j < n; j++) - { - b[j * ldb + i] = a[i * lda + j]; - } - } -} diff --git a/independent_test_harness/meuk.c b/independent_test_harness/helper.c similarity index 97% rename from independent_test_harness/meuk.c rename to independent_test_harness/helper.c index 9528cee..98b759b 100644 --- a/independent_test_harness/meuk.c +++ b/independent_test_harness/helper.c @@ -1,4 +1,4 @@ -#include "meuk.h" +#include "helper.h" #include #include @@ -49,6 +49,17 @@ void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64 } } +void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n) +{ + for(uint16_t i = 0; i < m; i++) + { + for( uint16_t j = 0; j < n; j++) + { + b[j * ldb + i] = a[i * lda + j]; + } + } +} + double get_determinant(uint32_t cycle, hid_t file_id) { char det_key[32]; sprintf(det_key, "/cycle_%d/determinant", cycle); diff --git a/independent_test_harness/meuk.h b/independent_test_harness/helper.h similarity index 96% rename from independent_test_harness/meuk.h rename to independent_test_harness/helper.h index b7b6815..8d36537 100644 --- a/independent_test_harness/meuk.h +++ b/independent_test_harness/helper.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -21,6 +23,7 @@ typedef struct Error { void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim); void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds); void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds); +void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n); double get_determinant(uint32_t cycle, hid_t file_id); double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds); double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds); diff --git a/independent_test_harness/sm.c b/independent_test_harness/kernels.c similarity index 93% rename from independent_test_harness/sm.c rename to independent_test_harness/kernels.c index 6878e25..28bf26c 100644 --- a/independent_test_harness/sm.c +++ b/independent_test_harness/kernels.c @@ -2,7 +2,7 @@ #include #include #include "kernels.h" -//#include "debug.h" +#include "nvtx.h" extern uint64_t n_splits; extern uint64_t block_fail; @@ -366,16 +366,19 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand int workspace_size = 0, *info = NULL; double* workspace = NULL; cusolverDnDgetrf_bufferSize(s_handle, N_updates, N_updates, B, N_updates, &workspace_size); - printf("SIZE OF CUSOLVER WORKSPACE: %d doubles of %lu byte = %lu byte\n", workspace_size, sizeof *workspace, sizeof *workspace * workspace_size); +// printf("SIZE OF CUSOLVER WORKSPACE: %d doubles of %lu byte = %lu byte\n", workspace_size, sizeof *workspace, sizeof *workspace * workspace_size); workspace = malloc(sizeof *workspace * workspace_size); + PUSH_RANGE("Transfer data TO GPU", 2) #pragma omp target enter data map(to: Updates[0:Lds*N_updates], \ Updates_index[0:N_updates], \ Slater_inv[0:Dim*Lds]) + POP_RANGE // Compute C <- S^{-1} U : Dim x K : standard dgemm alpha = 1.0f, beta = 0.0f; #pragma omp target enter data map(alloc: C[0:Dim*N_updates]) + PUSH_RANGE("Compute C = S^{-1} U", 3) #pragma omp target data use_device_ptr(Slater_inv, Updates, C) { (void) cublasDgemm_v2(b_handle, @@ -385,31 +388,38 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand &beta, C, N_updates); } #pragma omp target exit data map(delete: Updates[0:Lds*N_updates]) + POP_RANGE // Construct B <- 1 + V C : K x K #pragma omp target enter data map(alloc: B[0:N_updates*N_updates]) - #pragma omp target teams distribute parallel for - for (uint32_t i = 0; i < N_updates; i++) { - const uint32_t row = Updates_index[i] - 1; - for (uint32_t j = 0; j < N_updates ; j++) { - B[j * N_updates + i] = C[row * N_updates + j] + (i == j); // B NEEDS TO BE IN COL-MAJ FOR cusolverDnDgetrf ! + PUSH_RANGE("Construct B = 1 + V C", 4) + #pragma omp target teams distribute parallel for collapse(2) + for (int i = 0; i < N_updates; ++i) { + for (int j = 0; j < N_updates; ++j) { + const uint32_t row = Updates_index[i] - 1; + B[i * N_updates + j] = C[row * N_updates + j] + (i == j); } } + POP_RANGE // Compute det(B) via LU(B) #pragma omp target enter data map(alloc: workspace[0:workspace_size], pivot[0:N_updates]) + PUSH_RANGE("Compute LU(B)", 3) #pragma omp target data use_device_ptr(B, workspace, pivot) { - (void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info); + (void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info); // col-maj enforced, so res. is LU(B)^T } + POP_RANGE #pragma omp target exit data map(delete: workspace[0:workspace_size]) swap = false; j = 0; det = 1.0f; + PUSH_RANGE("Test det(B) and count LU swaps", 4) #pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det) for (uint32_t i = 0; i < N_updates; i++) { swap = (bool)(pivot[i] - (i + 1)); // swap = {0->false: no swap, >0->true: swap} j += (uint32_t)swap; // count # of swaps det *= B[i * (N_updates + 1)]; // prod. of diag elm. of B } + POP_RANGE if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early. if (determinant) { // update det(Slater) if determinant != NULL if ((j & 1) != 0) det = -det; // multiply det with -1 if # of swaps is odd @@ -417,63 +427,89 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand } // Compute B^{-1} : initialise as I for solving BX=I + PUSH_RANGE("Allocate Binv ON GPU", 2) #pragma omp target enter data map(alloc: Binv[0:N_updates*N_updates]) + POP_RANGE + PUSH_RANGE("Construct B^{-1}=I", 4) #pragma omp target teams distribute parallel for collapse(2) for (int i = 0; i < N_updates; ++i) { for (int j = 0; j < N_updates; ++j) { Binv[i * N_updates + j] = (i == j); } } + POP_RANGE + #pragma omp target data use_device_ptr(B, pivot, Binv) { - (void) cusolverDnDgetrs(s_handle, CUBLAS_OP_N, N_updates, N_updates, B, N_updates, pivot, Binv, N_updates, info); + PUSH_RANGE("Compute B^{-1}", 3) + (void) cusolverDnDgetrs(s_handle, CUBLAS_OP_T, N_updates, N_updates, B, N_updates, pivot, Binv, N_updates, info); // Needs op(B) = B^T because of line 403 + POP_RANGE } + PUSH_RANGE("Deallocate B, pivot ON GPU", 2) #pragma omp target exit data map(delete: B[0:N_updates*N_updates], pivot[0:N_updates]) + POP_RANGE // Construct D = V S^{-1} : K x LDS + PUSH_RANGE("Allocate D ON GPU", 2) #pragma omp target enter data map(alloc: D[0:N_updates*Lds]) - #pragma omp target teams distribute parallel for - for (uint32_t i = 0; i < N_updates; i++) { - const uint32_t row = Updates_index[i] - 1; - for (uint32_t j = 0; j < Lds; j++) { + POP_RANGE + PUSH_RANGE("Construct D = V S^{-1}", 4) + #pragma omp target teams distribute parallel for collapse(2) + for (uint32_t i = 0; i < N_updates; ++i) { + for (uint32_t j = 0; j < Lds; ++j) { + const uint32_t row = Updates_index[i] - 1; D[i * Lds + j] = Slater_inv[row * Lds + j]; } } + POP_RANGE + PUSH_RANGE("Deallocate Updates_index ON GPU", 2) #pragma omp target exit data map(delete: Updates_index[0:N_updates]) + POP_RANGE // T1 <- B^{-1} D : KxLDS : standard dgemm + PUSH_RANGE("Allocate T1 ON GPU", 2) #pragma omp target enter data map(alloc: T1[0:N_updates*Lds]) + POP_RANGE #pragma omp target data use_device_ptr(D, Binv, T1) { + PUSH_RANGE("Compute T1 <- B^{-1} D", 3) (void) cublasDgemm_v2(b_handle, CUBLAS_OP_N, CUBLAS_OP_T, // REMEMBER THIS IS Binv TRANSPOSED because of cusolverDnDgetrs CALL ON l.434 !!! Lds, N_updates, N_updates, &alpha, D, Lds, Binv, N_updates, &beta, T1, Lds); + POP_RANGE } - #pragma omp target exit data map(delete: D[0:N_updates*Lds], Binv[0:N_updates*N_updates]) - // Compute T2 <- C * T1 : Dim x LDS : standard dgemm - #pragma omp target enter data map(alloc: T2[0:Dim*Lds]) - #pragma omp target data use_device_ptr(T1, C, T2) + PUSH_RANGE("Deallocate D, Binv ON GPU", 2) + #pragma omp target exit data map(delete: D[0:N_updates*Lds], Binv[0:N_updates*N_updates]) + POP_RANGE + + // Compute S^{-1} <- S^{-1} - C * T1 : Dim x LDS : standard dgemm + alpha = -1.0f, beta = 1.0f; + #pragma omp target data use_device_ptr(T1, C, Slater_inv) { + PUSH_RANGE("Compute S^{-1} <- S^{-1} - C * T1", 3) (void) cublasDgemm_v2(b_handle, CUBLAS_OP_N, CUBLAS_OP_N, Dim, Lds, N_updates, &alpha, T1, Lds, C, N_updates, - &beta, T2, Lds); + &beta, Slater_inv, Lds); + POP_RANGE } + + PUSH_RANGE("Deallocate T1, C ON GPU", 2) #pragma omp target exit data map(delete: T1[0:N_updates*Lds], C[0:Dim*N_updates]) + POP_RANGE - // Compute S^{-1} <- S^{-1} - T2 : Dim x LDS - #pragma omp target teams distribute parallel for - for (uint32_t i = 0; i < Dim * Lds; i++) { - Slater_inv[i] = Slater_inv[i] - T2[i]; - } - + PUSH_RANGE("Update Slater_inv FROM GPU", 2) #pragma omp target update from(Slater_inv[0:Dim*Lds]) - #pragma omp target exit data map(delete: Slater_inv[0:Dim*Lds], T2[0:Dim*Lds]) + POP_RANGE + + PUSH_RANGE("Deallocate Slater_inv ON GPU", 2) + #pragma omp target exit data map(delete: Slater_inv[0:Dim*Lds]) + POP_RANGE free(pivot); free(B); @@ -481,7 +517,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand free(C); free(D); free(T1); - free(T2); + return 0; } #endif diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h index 6f85249..9ee9e49 100644 --- a/independent_test_harness/kernels.h +++ b/independent_test_harness/kernels.h @@ -1,7 +1,9 @@ +#pragma once + #include #include -//#define HAVE_CUBLAS_OFFLOAD +#define HAVE_CUBLAS_OFFLOAD #ifdef HAVE_CUBLAS_OFFLOAD #include diff --git a/independent_test_harness/nvtx.h b/independent_test_harness/nvtx.h new file mode 100644 index 0000000..cadb422 --- /dev/null +++ b/independent_test_harness/nvtx.h @@ -0,0 +1,26 @@ +#pragma once + +#define USE_NVTX + +#ifdef USE_NVTX +#include "nvToolsExt.h" +#include "nvtxDetail/nvtxLinkOnce.h" +const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; +const int num_colors = sizeof(colors)/sizeof(uint32_t); +#define PUSH_RANGE(name,cid) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name,cid) +#define POP_RANGE +#endif diff --git a/independent_test_harness/test.c b/independent_test_harness/test.c index 2dfd39c..8c19836 100644 --- a/independent_test_harness/test.c +++ b/independent_test_harness/test.c @@ -1,7 +1,6 @@ #include -#include "meuk.h" +#include "helper.h" #include "cycles.h" -#include "debug.h" #define DATASET "dataset" #define REPETITIONS 1 @@ -31,6 +30,7 @@ int main(int argc, char **argv) { for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) { // SETUP TEST PARAMETERS + const uint32_t GHz = 2800000000; const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small const double tolerance = 0.001; // default = 0.001 double cumulative = 0; @@ -265,7 +265,7 @@ int main(int argc, char **argv) { free(Res); // 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS - printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls); + printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%9.6f\t%9.6f\t%9.6f\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, (double)accumulator/GHz, (double)cycles_per_update/GHz, (double)cumulative/GHz, recursive_calls); free(Updates_index); free(Updates);