diff --git a/independent_test_harness/Makefile b/independent_test_harness/Makefile index 214433f..97d2a17 100644 --- a/independent_test_harness/Makefile +++ b/independent_test_harness/Makefile @@ -1,17 +1,17 @@ FC = ifx CC = nvc -CFLAGS=-std=c99 -O3 -Wall -g +CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu -LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl -LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -LFLAGS+=-lcublas +LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl +LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl +LDFLAGS+=-lcublas -mp -target=gpu ## Link with icc # test: sm.o test.o detupdate21.o meuk.o -# $(CC) $(LFLAGS) -o test sm.o detupdate21.o test.o meuk.o +# $(CC) $(LDFLAGS) -o test sm.o detupdate21.o test.o meuk.o test: sm.o test.o meuk.o - $(CC) $(LFLAGS) -o test sm.o test.o meuk.o + $(CC) $(LDFLAGS) -o test sm.o test.o meuk.o %.o: %.f90 $(FC) $(FFLAGS) -c -o $@ $< diff --git a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c new file mode 100644 index 0000000..b190518 --- /dev/null +++ b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c @@ -0,0 +1,107 @@ +/* +Compile with: + +nvc -L${MKLROOT}/lib/intel64 \ + -lmkl_intel_lp64 \ + -lmkl_sequential \ + -lmkl_core \ + -lpthread \ + -lm \ + -ldl \ + -lcublas \ + -mp \ + -target=gpu \ + cblasdgemm_vs_cublasdgemm_test.c \ + -o cblasdgemm_vs_cublasdgemm_test + +*/ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "debug.h" + +int main() { + + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + fprintf(stdout, "cuBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + + uint16_t M = 3; + uint16_t N = 2; + uint16_t K = 4; + + double *a = malloc(M * K * sizeof(double)); // M x K = 3 x 4 + double *acm = malloc(M * K * sizeof(double)); // col-major stored + double *b = malloc(K * N * sizeof(double)); // K x N = 4 x 2 + double *bcm = malloc(K * N * sizeof(double)); // col-major stored + double *c = malloc(M * N * sizeof(double)); // M x N = 3 x 2 + + a[0] = 1, a[1] = 2, a[2] = 3, a[3] = 4, a[4] = 5, a[5] = 6, a[6] = 7, a[7] = 8; a[8] = 9, a[9] = 10, a[10] = 11, a[11] = 12; + acm[0] = 1, acm[1] = 5, acm[2] = 9, acm[3] = 2, acm[4] = 6, acm[5] = 10, acm[6] = 3, acm[7] = 7; acm[8] = 11, acm[9] = 4, acm[10] = 8, acm[11] = 12; + b[0] = 13, b[1] = 14, b[2] = 15, b[3] = 16, b[4] = 17, b[5] = 18, b[6] = 19, b[7] = 20; + bcm[0] = 13, bcm[1] = 15, bcm[2] = 17, bcm[3] = 19, bcm[4] = 14, bcm[5] = 16, bcm[6] = 18, bcm[7] = 20; + + uint16_t lda = K; + uint16_t ldacm = M; + uint16_t ldb = N; + uint16_t ldbcm = K; + uint16_t ldc = N; + + double alpha = 1.0, beta = 0.0; + + + cblas_dgemm(CblasRowMajor, + CblasNoTrans, CblasNoTrans, + M, N, K, + alpha, a, lda, b, ldb, + beta, c, ldc); + print_m(c, M, N, ldc, "c_cblas_dgemm"); + + + memset(c, 0, M*N*sizeof(double)); + #pragma omp target enter data map(to:a[0:M*K], b[0:K*N], c[0:M*N]) + #pragma omp target data use_device_ptr(a, b, c) + { + int cublasError = cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + N, M, K, + &alpha, b, ldb, a, lda, + &beta, c, ldc); + } + #pragma omp target exit data map(from:c[0:M*N]) + print_m(c, M, N, ldc, "c_cublasDgemm"); + + + memset(c, 0, M*N*sizeof(double)); + ldc = M; // ldc : N -> M, because cublasDgemm stores result in col-maj + #pragma omp target enter data map(to:acm[0:M*K], bcm[0:K*N], c[0:M*N]) + #pragma omp target data use_device_ptr(acm, bcm, c) + { + int cublasError = cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + M, N, K, + &alpha, acm, ldacm, bcm, ldbcm, + &beta, c, ldc); + } + #pragma omp target exit data map(from:c[0:M*N]) + print_m_t(c, M, N, ldc, "c_col-maj_cublasDgemm"); + + + free(a); + free(acm); + free(b); + free(bcm); + free(c); + return 0; +} diff --git a/independent_test_harness/check_result.sh b/independent_test_harness/check_result.sh new file mode 100755 index 0000000..4414603 --- /dev/null +++ b/independent_test_harness/check_result.sh @@ -0,0 +1,5 @@ +#!/bin/bash +make +rm -v blocked kay cu +./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu + diff --git a/independent_test_harness/debug.h b/independent_test_harness/debug.h new file mode 100644 index 0000000..378b8e4 --- /dev/null +++ b/independent_test_harness/debug.h @@ -0,0 +1,38 @@ +void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name) +{ + printf("%s = \n", name); + for (uint16_t i = 0; i < m; ++i) + { + for (uint16_t j = 0; j < n; ++j) + { + printf("%11.5f ", mat[i * ldm + j]); + } + printf("\n"); + } + printf("\n"); +} + +void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name) +{ + printf("%s = \n", name); + for (uint16_t i = 0; i < m; ++i) + { + for (uint16_t j = 0; j < n; ++j) + { + printf("%11.5f ", mat[j * ldm + i]); + } + printf("\n"); + } + printf("\n"); +} + +void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n) +{ + for(uint16_t i = 0; i < m; i++) + { + for( uint16_t j = 0; j < n; j++) + { + b[j * ldb + i] = a[i * lda + j]; + } + } +} diff --git a/independent_test_harness/kernels.h b/independent_test_harness/kernels.h index 08b1117..c1b090c 100644 --- a/independent_test_harness/kernels.h +++ b/independent_test_harness/kernels.h @@ -55,7 +55,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, double *__restrict determinant); #ifdef HAVE_CUBLAS_OFFLOAD -uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, +uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle, + const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates, const double *__restrict __attribute__((aligned(8))) Updates, diff --git a/independent_test_harness/meuk.c b/independent_test_harness/meuk.c index 2560e57..e103604 100644 --- a/independent_test_harness/meuk.c +++ b/independent_test_harness/meuk.c @@ -2,13 +2,6 @@ #include #include -void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) { - for (uint64_t i = 0; i < Lds * Dim; i++) { - printf("%f\n", A[i]); - } - printf("\n"); -} - double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) { double sum2 = 0; for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i]; diff --git a/independent_test_harness/meuk.h b/independent_test_harness/meuk.h index 65947dc..02432c0 100644 --- a/independent_test_harness/meuk.h +++ b/independent_test_harness/meuk.h @@ -15,7 +15,6 @@ typedef struct Error { void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim); void residual(double *a, double *res, const uint64_t Dim); double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim); -void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim); double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim); double max_norm(double *A, const uint64_t Lds, const uint64_t Dim); double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim); diff --git a/independent_test_harness/sm.c b/independent_test_harness/sm.c index 650bda2..682aa32 100644 --- a/independent_test_harness/sm.c +++ b/independent_test_harness/sm.c @@ -1,6 +1,7 @@ #include #include #include "kernels.h" +#include "debug.h" extern uint64_t n_splits; extern uint64_t block_fail; @@ -279,8 +280,9 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, const uint32_t Lds = LDS; // Compute C = S^{-1} U : Dim x K : standard dgemm - double C[DIM * N_updates]; + double *C = calloc(1, DIM * N_updates * sizeof(double)); double alpha = 1.0, beta = 0.0; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, Dim, N_updates, Lds, alpha, Slater_inv, Lds, Updates, Lds, @@ -320,25 +322,50 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS, ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); if (ret != 0) return ret; - // tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm - double tmp[N_updates * LDS]; + // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm + double tmp1[N_updates * LDS]; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N_updates, LDS, N_updates, alpha, B, N_updates, D, LDS, - beta, tmp, LDS); + beta, tmp1, LDS); + print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas"); - // Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm - alpha = -1.0, beta = 1.0; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm + // alpha = -1.0, beta = 1.0; + // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + // Dim, LDS, N_updates, + // alpha, C, N_updates, tmp1, LDS, + // beta, Slater_inv, LDS); + + double *tmp2 = calloc(1, DIM * LDS * sizeof(double)); + cblas_dgemm(CblasRowMajor, + CblasNoTrans, CblasNoTrans, Dim, LDS, N_updates, - alpha, C, N_updates, tmp, LDS, - beta, Slater_inv, LDS); + alpha, C, N_updates, tmp1, LDS, + beta, tmp2, LDS); + print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas"); + + double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); + for(int i = 0; i < DIM * LDS; ++i) + { + tmp3[i] = Slater_inv[i] - tmp2[i]; + } + print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas"); + + for(int i = 0; i < DIM * LDS; ++i) + { + Slater_inv[i] = tmp3[i]; + } + + free(tmp2); + free(tmp3); return 0; } #ifdef HAVE_CUBLAS_OFFLOAD -uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, +uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle, + const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates, const double *__restrict __attribute__((aligned(8))) Updates, @@ -351,34 +378,23 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, const uint32_t Lds = LDS; // Compute C = S^{-1} U : Dim x K : standard dgemm - // double C[Dim * N_updates]; - double *C = malloc(DIM * N_updates * sizeof(double)); - double alpha = 1.0, beta = 0.0; - // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - // Dim, N_updates, Lds, - // alpha, Slater_inv, Lds, Updates, Lds, - // beta, C, N_updates); - //cuBLAS initialization - cublasHandle_t handle; - if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { - fprintf(stdout, "cuBLAS initialization failed!\n"); - exit(EXIT_FAILURE); - } - #pragma omp target enter data map(to:Slater_inv, Updates, C) + double *C = calloc(1, DIM * N_updates * sizeof(double)); + double alpha = 1.0f, beta = 0.0f; + #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates]) #pragma omp target data use_device_ptr(Slater_inv, Updates, C) { - int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, - Dim, N_updates, Lds, - &alpha, Slater_inv, Lds, Updates, Lds, - &beta, C, N_updates); + int cublasError = cublasDgemm(handle, + CUBLAS_OP_T, CUBLAS_OP_N, + 15, 21, 24, + &alpha, Updates, 24, Slater_inv, 24, + &beta, C, 15); } - #pragma omp target exit data map(from:C) - cublasDestroy(handle); - + #pragma omp target exit data map(from:C[0:DIM*N_updates]) // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD // Construct D = V S^{-1} : K x LDS - double B[N_updates * N_updates], D[N_updates * LDS]; + double *B = calloc(1, N_updates * N_updates * sizeof(double)); + double *D = calloc(1, N_updates * LDS * sizeof(double)); for (uint32_t i = 0; i < N_updates; i++) { const uint32_t row = Updates_index[i] - 1; for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j); @@ -394,7 +410,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, int j = 0; for (uint32_t i = 0; i < N_updates; i++) { j += min(ipiv[i] - i, 1); - det *= B[(N_updates + 1) * i]; + det *= B[(N_updates + 1) * i]; // update determinant } if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even @@ -410,25 +426,79 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS, ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); if (ret != 0) return ret; - // tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm - double tmp[N_updates * LDS]; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - N_updates, LDS, N_updates, - alpha, B, N_updates, D, LDS, - beta, tmp, LDS); + // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm + double *tmp1 = calloc(1, N_updates * LDS * sizeof(double)); + #pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS]) + #pragma omp target data use_device_ptr(D, B, tmp1) + { + int cublasError = cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + LDS, N_updates, N_updates, + &alpha, D, LDS, B, N_updates, + &beta, tmp1, LDS); + } + #pragma omp target exit data map(from:tmp1[0:N_updates*LDS]) + print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas"); - // Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm - alpha = -1.0, beta = 1.0; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - Dim, LDS, N_updates, - alpha, C, N_updates, tmp, LDS, - beta, Slater_inv, LDS); + // Compute tmp2 = C * tmp1 : Dim x LDS + double *tmp2 = calloc(1, DIM * LDS * sizeof(double)); + #pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS]) + #pragma omp target data use_device_ptr(tmp1, C, tmp2) + { + int cublasError = cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + LDS, Dim, N_updates, + &alpha, tmp1, LDS, C, N_updates, + &beta, tmp2, LDS); + } + #pragma omp target exit data map(from:tmp2[0:DIM*LDS]) + print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas"); + // Compute tmp3 = S^{-1} - tmp2 + double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); + beta = -1.0f; + #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS]) + #pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3) + { + int cublasError = cublasDgeam(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + DIM, LDS, + &alpha, Slater_inv, LDS, + &beta, tmp2, LDS, + tmp3, LDS); + } + #pragma omp target exit data map(from:tmp3[0:DIM*LDS]) + print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas"); + for(int i = 0; i < DIM * LDS; ++i) + { + Slater_inv[i] = tmp3[i]; + } + + // // Compute S^{-1} <- S^{-1} - tmp2 + // beta = -1.0f; + // #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS]) + // #pragma omp target data use_device_ptr(Slater_inv, tmp2) + // { + // int cublasError = cublasDgeam(handle, + // CUBLAS_OP_N, CUBLAS_OP_N, + // DIM, LDS, + // &alpha, Slater_inv, LDS, + // &beta, tmp2, LDS, + // Slater_inv, LDS); + // } + // #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS]) + + + free(B); + free(C); + free(D); + free(tmp1); + free(tmp2); + // free(tmp3); return 0; } #endif - uint32_t qmckl_slagel_splitting( const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates, const double *__restrict __attribute__((aligned(8))) Updates, diff --git a/independent_test_harness/test.c b/independent_test_harness/test.c index fa57d31..e4e1f9f 100644 --- a/independent_test_harness/test.c +++ b/independent_test_harness/test.c @@ -4,7 +4,7 @@ #define DATASET "dataset_329d_zeropadded_cm.hdf5" // #define DATASET "dataset_15784d_zeropadded_cm.hdf5" -#define REPETITIONS 100000 +#define REPETITIONS 1 uint64_t n_splits; uint64_t block_fail; @@ -14,6 +14,15 @@ int main(int argc, char **argv) { assert(argc == 2); char *version = argv[1]; +#ifdef HAVE_CUBLAS_OFFLOAD + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + fprintf(stdout, "cuBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } +#endif + + // SETUP STORAGE AND DATA ACCESS hid_t file_id; file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT); @@ -214,13 +223,14 @@ printf("#----------------------------------------------------------------------- // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR accumulator += (double)(after - before); +#ifdef HAVE_CUBLAS_OFFLOAD } else if (version[0] == 'c') { // Woodbury K cuBLAS // 1. FETCH START TIME uint64_t before = rdtsc(); // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS - err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates, + err_break = qmckl_woodbury_k_cublas_offload(handle, Lds, Dim, N_updates, Updates, Updates_index, breakdown, Slater_invT_copy, &determinant); // 3. FETCH FINISH TIME @@ -228,7 +238,7 @@ printf("#----------------------------------------------------------------------- // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR accumulator += (double)(after - before); - +#endif } else if (version[0] == 's') { // Splitting // 1. FETCH START TIME @@ -283,7 +293,7 @@ printf("#----------------------------------------------------------------------- // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR accumulator += (double)(after - before); - } else { // Exit + } else { // Exit printf("Version '%c' not implemented.\n", version[0]); return 1; } @@ -336,4 +346,8 @@ printf("#----------------------------------------------------------------------- printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); (void) H5Fclose(file_id); + +#ifdef HAVE_CUBLAS_OFFLOAD + cublasDestroy(handle); +#endif }