Compare commits

...

3 Commits

Author SHA1 Message Date
Francois Coppens
2d5a34faed Trivial change 2022-10-17 15:26:30 +02:00
Francois Coppens
90bc5090c2 Trivial change 2022-10-17 15:15:16 +02:00
Francois Coppens
6bb95f068d - Resrtructured tree
- Added NVTX annotations to GPU kernel.
2022-10-17 14:56:32 +02:00
9 changed files with 159 additions and 79 deletions

View File

@ -6,12 +6,11 @@ LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -
all: test_icc_mkl
test_icc_mkl: sm.o test.o meuk.o
$(CC) $(LDFLAGS) -o test_icc_mkl sm.o test.o meuk.o
test_icc_mkl: kernels.o test.o helper.o
$(CC) $(LDFLAGS) -o $@ $^
%.o : %.c
$(CC) $(CFLAGS) $(INCLUDE) -c -o $@ $<
clean:
clean:
rm -rf *.o *genmod* test_icc_mkl

View File

@ -1,10 +1,12 @@
#FC = ifx
#CC = nvc
CC = nvc
#CFLAGS=-std=c99 -O0 -Wall -g -DHAVE_CUBLAS_OFFLOAD -mp -target=gpu
CFLAGS=-std=c99 -O3 -Wall -g -DHAVE_CUBLAS_OFFLOAD -mp -target=gpu
#CFLAGS=-std=c99 -O0 -Wall -g -DHAVE_CUBLAS_OFFLOAD -DUSE_NVTX -mp -target=gpu
CFLAGS=-std=c99 -O3 -Wall -g -DHAVE_CUBLAS_OFFLOAD -DUSE_NVTX -mp -target=gpu
INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
INCLUDE+=-I$(NVHPC_ROOT)/cuda/11.7/targets/x86_64-linux/include
INCLUDE+=-I$(NVHPC_ROOT)/profilers/Nsight_Systems/target-linux-x64/nvtx/include
LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
@ -12,8 +14,8 @@ LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -lcusolver -mp -target=gpu
all: test_nvc_ompol
test_nvc_ompol: sm.o test.o meuk.o
$(CC) $(LDFLAGS) -o test_nvc_ompol sm.o test.o meuk.o
test_nvc_ompol: kernels.o test.o helper.o
$(CC) $(LDFLAGS) -o $@ $^
%.o : %.c
$(CC) $(CFLAGS) $(INCLUDE) -c -o $@ $<

View File

@ -1,3 +1,5 @@
#pragma once
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
@ -43,14 +45,3 @@ void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* na
}
printf("\n");
}
void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n)
{
for(uint16_t i = 0; i < m; i++)
{
for( uint16_t j = 0; j < n; j++)
{
b[j * ldb + i] = a[i * lda + j];
}
}
}

View File

@ -1,4 +1,4 @@
#include "meuk.h"
#include "helper.h"
#include <stdint.h>
#include <assert.h>
@ -49,6 +49,17 @@ void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64
}
}
void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n)
{
for(uint16_t i = 0; i < m; i++)
{
for( uint16_t j = 0; j < n; j++)
{
b[j * ldb + i] = a[i * lda + j];
}
}
}
double get_determinant(uint32_t cycle, hid_t file_id) {
char det_key[32];
sprintf(det_key, "/cycle_%d/determinant", cycle);

View File

@ -1,3 +1,5 @@
#pragma once
#include <math.h>
#include <stdio.h>
#include <stdint.h>
@ -21,6 +23,7 @@ typedef struct Error {
void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim);
void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds);
void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds);
void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n);
double get_determinant(uint32_t cycle, hid_t file_id);
double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);

View File

@ -2,7 +2,7 @@
#include <stdint.h>
#include <stdbool.h>
#include "kernels.h"
//#include "debug.h"
#include "nvtx.h"
extern uint64_t n_splits;
extern uint64_t block_fail;
@ -366,16 +366,19 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
int workspace_size = 0, *info = NULL;
double* workspace = NULL;
cusolverDnDgetrf_bufferSize(s_handle, N_updates, N_updates, B, N_updates, &workspace_size);
printf("SIZE OF CUSOLVER WORKSPACE: %d doubles of %lu byte = %lu byte\n", workspace_size, sizeof *workspace, sizeof *workspace * workspace_size);
// printf("SIZE OF CUSOLVER WORKSPACE: %d doubles of %lu byte = %lu byte\n", workspace_size, sizeof *workspace, sizeof *workspace * workspace_size);
workspace = malloc(sizeof *workspace * workspace_size);
PUSH_RANGE("Transfer data TO GPU", 2)
#pragma omp target enter data map(to: Updates[0:Lds*N_updates], \
Updates_index[0:N_updates], \
Slater_inv[0:Dim*Lds])
POP_RANGE
// Compute C <- S^{-1} U : Dim x K : standard dgemm
alpha = 1.0f, beta = 0.0f;
#pragma omp target enter data map(alloc: C[0:Dim*N_updates])
PUSH_RANGE("Compute C = S^{-1} U", 3)
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
{
(void) cublasDgemm_v2(b_handle,
@ -385,31 +388,38 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
&beta, C, N_updates);
}
#pragma omp target exit data map(delete: Updates[0:Lds*N_updates])
POP_RANGE
// Construct B <- 1 + V C : K x K
#pragma omp target enter data map(alloc: B[0:N_updates*N_updates])
#pragma omp target teams distribute parallel for
for (uint32_t i = 0; i < N_updates; i++) {
const uint32_t row = Updates_index[i] - 1;
for (uint32_t j = 0; j < N_updates ; j++) {
B[j * N_updates + i] = C[row * N_updates + j] + (i == j); // B NEEDS TO BE IN COL-MAJ FOR cusolverDnDgetrf !
PUSH_RANGE("Construct B = 1 + V C", 4)
#pragma omp target teams distribute parallel for collapse(2)
for (int i = 0; i < N_updates; ++i) {
for (int j = 0; j < N_updates; ++j) {
const uint32_t row = Updates_index[i] - 1;
B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
}
}
POP_RANGE
// Compute det(B) via LU(B)
#pragma omp target enter data map(alloc: workspace[0:workspace_size], pivot[0:N_updates])
PUSH_RANGE("Compute LU(B)", 3)
#pragma omp target data use_device_ptr(B, workspace, pivot)
{
(void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info);
(void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info); // col-maj enforced, so res. is LU(B)^T
}
POP_RANGE
#pragma omp target exit data map(delete: workspace[0:workspace_size])
swap = false; j = 0; det = 1.0f;
PUSH_RANGE("Test det(B) and count LU swaps", 4)
#pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det)
for (uint32_t i = 0; i < N_updates; i++) {
swap = (bool)(pivot[i] - (i + 1)); // swap = {0->false: no swap, >0->true: swap}
j += (uint32_t)swap; // count # of swaps
det *= B[i * (N_updates + 1)]; // prod. of diag elm. of B
}
POP_RANGE
if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early.
if (determinant) { // update det(Slater) if determinant != NULL
if ((j & 1) != 0) det = -det; // multiply det with -1 if # of swaps is odd
@ -417,63 +427,89 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
}
// Compute B^{-1} : initialise as I for solving BX=I
PUSH_RANGE("Allocate Binv ON GPU", 2)
#pragma omp target enter data map(alloc: Binv[0:N_updates*N_updates])
POP_RANGE
PUSH_RANGE("Construct B^{-1}=I", 4)
#pragma omp target teams distribute parallel for collapse(2)
for (int i = 0; i < N_updates; ++i) {
for (int j = 0; j < N_updates; ++j) {
Binv[i * N_updates + j] = (i == j);
}
}
POP_RANGE
#pragma omp target data use_device_ptr(B, pivot, Binv)
{
(void) cusolverDnDgetrs(s_handle, CUBLAS_OP_N, N_updates, N_updates, B, N_updates, pivot, Binv, N_updates, info);
PUSH_RANGE("Compute B^{-1}", 3)
(void) cusolverDnDgetrs(s_handle, CUBLAS_OP_T, N_updates, N_updates, B, N_updates, pivot, Binv, N_updates, info); // Needs op(B) = B^T because of line 403
POP_RANGE
}
PUSH_RANGE("Deallocate B, pivot ON GPU", 2)
#pragma omp target exit data map(delete: B[0:N_updates*N_updates], pivot[0:N_updates])
POP_RANGE
// Construct D = V S^{-1} : K x LDS
PUSH_RANGE("Allocate D ON GPU", 2)
#pragma omp target enter data map(alloc: D[0:N_updates*Lds])
#pragma omp target teams distribute parallel for
for (uint32_t i = 0; i < N_updates; i++) {
const uint32_t row = Updates_index[i] - 1;
for (uint32_t j = 0; j < Lds; j++) {
POP_RANGE
PUSH_RANGE("Construct D = V S^{-1}", 4)
#pragma omp target teams distribute parallel for collapse(2)
for (uint32_t i = 0; i < N_updates; ++i) {
for (uint32_t j = 0; j < Lds; ++j) {
const uint32_t row = Updates_index[i] - 1;
D[i * Lds + j] = Slater_inv[row * Lds + j];
}
}
POP_RANGE
PUSH_RANGE("Deallocate Updates_index ON GPU", 2)
#pragma omp target exit data map(delete: Updates_index[0:N_updates])
POP_RANGE
// T1 <- B^{-1} D : KxLDS : standard dgemm
PUSH_RANGE("Allocate T1 ON GPU", 2)
#pragma omp target enter data map(alloc: T1[0:N_updates*Lds])
POP_RANGE
#pragma omp target data use_device_ptr(D, Binv, T1)
{
PUSH_RANGE("Compute T1 <- B^{-1} D", 3)
(void) cublasDgemm_v2(b_handle,
CUBLAS_OP_N,
CUBLAS_OP_T, // REMEMBER THIS IS Binv TRANSPOSED because of cusolverDnDgetrs CALL ON l.434 !!!
Lds, N_updates, N_updates,
&alpha, D, Lds, Binv, N_updates,
&beta, T1, Lds);
POP_RANGE
}
#pragma omp target exit data map(delete: D[0:N_updates*Lds], Binv[0:N_updates*N_updates])
// Compute T2 <- C * T1 : Dim x LDS : standard dgemm
#pragma omp target enter data map(alloc: T2[0:Dim*Lds])
#pragma omp target data use_device_ptr(T1, C, T2)
PUSH_RANGE("Deallocate D, Binv ON GPU", 2)
#pragma omp target exit data map(delete: D[0:N_updates*Lds], Binv[0:N_updates*N_updates])
POP_RANGE
// Compute S^{-1} <- S^{-1} - C * T1 : Dim x LDS : standard dgemm
alpha = -1.0f, beta = 1.0f;
#pragma omp target data use_device_ptr(T1, C, Slater_inv)
{
PUSH_RANGE("Compute S^{-1} <- S^{-1} - C * T1", 3)
(void) cublasDgemm_v2(b_handle,
CUBLAS_OP_N, CUBLAS_OP_N,
Dim, Lds, N_updates,
&alpha, T1, Lds, C, N_updates,
&beta, T2, Lds);
&beta, Slater_inv, Lds);
POP_RANGE
}
PUSH_RANGE("Deallocate T1, C ON GPU", 2)
#pragma omp target exit data map(delete: T1[0:N_updates*Lds], C[0:Dim*N_updates])
POP_RANGE
// Compute S^{-1} <- S^{-1} - T2 : Dim x LDS
#pragma omp target teams distribute parallel for
for (uint32_t i = 0; i < Dim * Lds; i++) {
Slater_inv[i] = Slater_inv[i] - T2[i];
}
PUSH_RANGE("Update Slater_inv FROM GPU", 2)
#pragma omp target update from(Slater_inv[0:Dim*Lds])
#pragma omp target exit data map(delete: Slater_inv[0:Dim*Lds], T2[0:Dim*Lds])
POP_RANGE
PUSH_RANGE("Deallocate Slater_inv ON GPU", 2)
#pragma omp target exit data map(delete: Slater_inv[0:Dim*Lds])
POP_RANGE
free(pivot);
free(B);
@ -481,7 +517,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
free(C);
free(D);
free(T1);
free(T2);
return 0;
}
#endif

View File

@ -1,7 +1,9 @@
#pragma once
#include <mkl_lapacke.h>
#include <mkl.h>
//#define HAVE_CUBLAS_OFFLOAD
#define HAVE_CUBLAS_OFFLOAD
#ifdef HAVE_CUBLAS_OFFLOAD
#include <stdio.h>

View File

@ -0,0 +1,26 @@
#pragma once
#define USE_NVTX
#ifdef USE_NVTX
#include "nvToolsExt.h"
#include "nvtxDetail/nvtxLinkOnce.h"
const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff };
const int num_colors = sizeof(colors)/sizeof(uint32_t);
#define PUSH_RANGE(name,cid) { \
int color_id = cid; \
color_id = color_id%num_colors;\
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define POP_RANGE nvtxRangePop();
#else
#define PUSH_RANGE(name,cid)
#define POP_RANGE
#endif

View File

@ -1,7 +1,6 @@
#include <stdint.h>
#include "meuk.h"
#include "helper.h"
#include "cycles.h"
#include "debug.h"
#define DATASET "dataset"
#define REPETITIONS 1
@ -14,23 +13,24 @@ int main(int argc, char **argv) {
assert(argc == 2);
char *version = argv[1];
#ifdef HAVE_CUBLAS_OFFLOAD
cublasHandle_t handle = init_cublas();
cusolverDnHandle_t s_handle = init_cusolver();
#endif
#ifdef HAVE_CUBLAS_OFFLOAD
cublasHandle_t handle = init_cublas();
cusolverDnHandle_t s_handle = init_cusolver();
#endif
// SETUP STORAGE AND DATA ACCESS
// SETUP DATA ACCESS
hid_t file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
// FOR EACH UPDATE CYCLE DO:
// for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) {
// SETUP TEST PARAMETERS
const uint32_t GHz = 2800000000;
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
const double tolerance = 0.001; // default = 0.001
double cumulative = 0;
@ -44,7 +44,7 @@ int main(int argc, char **argv) {
double* Updates = get_upds(cycle, file_id, N_updates, Lds);
double* Slater = get_slater(cycle, file_id, Dim, Lds);
double* Slater_invT = get_slater_inv(cycle, file_id, Dim, Lds);
double *Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds);
double* Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds);
double determinant = get_determinant(cycle, file_id), determinant_copy;
// Compute transpose of S. ST: 24 x 21
@ -69,7 +69,7 @@ int main(int argc, char **argv) {
block_fail = 0;
recursive_calls = 0;
// ## FOR A SET NUMBER OF REPETITIONS DO:
// ## FOR A SET NUMBER OF REPETITIONS EXECUTE A CHOSEN KERNEL
for (int rep = 0; rep < REPETITIONS; rep++) {
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
@ -78,6 +78,7 @@ int main(int argc, char **argv) {
// ### CHOOSE A KERNEL:
if (version[0] == 'n') { // Naive
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -90,7 +91,8 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'l') { // Later
}
else if (version[0] == 'l') { // Later
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -104,7 +106,8 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == '2') { // by twos
}
else if (version[0] == '2') { // by twos
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -119,7 +122,8 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == '3') { // by threes
}
else if (version[0] == '3') { // by threes
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -134,7 +138,8 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'k') { // Woodbury K
}
else if (version[0] == 'k') { // Woodbury K
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -147,10 +152,10 @@ int main(int argc, char **argv) {
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
#ifdef HAVE_CUBLAS_OFFLOAD
} else if (version[0] == 'c') { // Woodbury K cuBLAS
accumulator += (double) (after - before);
}
#ifdef HAVE_CUBLAS_OFFLOAD
else if (version[0] == 'c') { // Woodbury K cuBLAS
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -163,9 +168,10 @@ int main(int argc, char **argv) {
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
#endif
} else if (version[0] == 's') { // Splitting
accumulator += (double) (after - before);
}
#endif
else if (version[0] == 's') { // Splitting
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -179,7 +185,8 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'b') { // Blocked
}
else if (version[0] == 'b') { // Blocked
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -193,7 +200,8 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'm') { // LAPACK/MKL
}
else if (version[0] == 'm') { // LAPACK/MKL
// Only send upper Dim x Dim part of matrix to lapack
double *tmp = malloc(sizeof *tmp * Dim * Dim);
@ -215,11 +223,11 @@ int main(int argc, char **argv) {
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else { // Exit
}
else { // Exit
printf("Version '%c' not implemented.\n", version[0]);
return 1;
}
} // END OF REPETITIONS LOOP
// 4. COPY RESULT BACK TO ORIGINAL
@ -255,7 +263,6 @@ int main(int argc, char **argv) {
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// int32_t err_out = check_error_better(max, tolerance);
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
@ -265,7 +272,7 @@ int main(int argc, char **argv) {
free(Res);
// 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%9.6f\t%9.6f\t%9.6f\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, (double)accumulator/GHz, (double)cycles_per_update/GHz, (double)cumulative/GHz, recursive_calls);
free(Updates_index);
free(Updates);
@ -274,6 +281,7 @@ int main(int argc, char **argv) {
free(Slater_invT_copy);
} // END OF CYCLE LOOP
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
@ -281,8 +289,10 @@ int main(int argc, char **argv) {
(void) H5Fclose(file_id);
#ifdef HAVE_CUBLAS_OFFLOAD
cublasDestroy(handle);
cusolverDnDestroy(s_handle);
#endif
#ifdef HAVE_CUBLAS_OFFLOAD
cublasDestroy_v2(handle);
cusolverDnDestroy(s_handle);
#endif
return 0;
}