Replaced all CBLAS dgemms with cuBLAS dgemms and dgeams. Works but not ideal.

2024-12-25 05:43:54 +01:00 · 2022-09-09 17:15:12 +02:00 · 2022-09-09 17:15:12 +02:00 · 892358d0d1
commit 892358d0d1
parent 87e319189e
9 changed files with 292 additions and 65 deletions
--- a/independent_test_harness/Makefile
+++ b/independent_test_harness/Makefile
@ -1,17 +1,17 @@
 FC = ifx
 CC = nvc

-CFLAGS=-std=c99 -O3 -Wall -g
+CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu

-LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
-LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
-LFLAGS+=-lcublas
+LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
+LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
+LDFLAGS+=-lcublas -mp -target=gpu

 ## Link with icc
 # test: sm.o test.o detupdate21.o meuk.o
-# 	$(CC) $(LFLAGS) -o test sm.o detupdate21.o test.o meuk.o
+# 	$(CC) $(LDFLAGS) -o test sm.o detupdate21.o test.o meuk.o
 test: sm.o test.o meuk.o
-	$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
+	$(CC) $(LDFLAGS) -o test sm.o test.o meuk.o

 %.o: %.f90
 	$(FC) $(FFLAGS) -c -o $@ $< 
--- a/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
+++ b/independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
@ -0,0 +1,107 @@
+/*
+Compile with:
+
+nvc -L${MKLROOT}/lib/intel64 \
+    -lmkl_intel_lp64 \
+    -lmkl_sequential \
+    -lmkl_core \
+    -lpthread \
+    -lm \
+    -ldl \
+    -lcublas \
+    -mp \
+    -target=gpu \
+    cblasdgemm_vs_cublasdgemm_test.c \
+    -o cblasdgemm_vs_cublasdgemm_test
+
+*/
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <mkl_lapacke.h>
+#include <mkl.h>
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+#include "debug.h"
+
+int main() {
+
+  cublasHandle_t handle;
+  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
+    fprintf(stdout, "cuBLAS initialization failed!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  uint16_t M = 3;
+  uint16_t N = 2;
+  uint16_t K = 4;
+
+  double *a = malloc(M * K * sizeof(double));  // M x K = 3 x 4 
+  double *acm = malloc(M * K * sizeof(double)); // col-major stored
+  double *b = malloc(K * N * sizeof(double));  // K x N = 4 x 2
+  double *bcm = malloc(K * N * sizeof(double)); // col-major stored
+  double *c = malloc(M * N * sizeof(double));  // M x N = 3 x 2
+
+  a[0] = 1, a[1] = 2, a[2] = 3, a[3] = 4, a[4] = 5, a[5] = 6, a[6] = 7, a[7] = 8; a[8] = 9, a[9] = 10, a[10] = 11, a[11] = 12;
+  acm[0] = 1, acm[1] = 5, acm[2] = 9, acm[3] = 2, acm[4] = 6, acm[5] = 10, acm[6] = 3, acm[7] = 7; acm[8] = 11, acm[9] = 4, acm[10] = 8, acm[11] = 12;
+  b[0] = 13, b[1] = 14, b[2] = 15, b[3] = 16, b[4] = 17, b[5] = 18, b[6] = 19, b[7] = 20;
+  bcm[0] = 13, bcm[1] = 15, bcm[2] = 17, bcm[3] = 19, bcm[4] = 14, bcm[5] = 16, bcm[6] = 18, bcm[7] = 20;
+
+  uint16_t lda   = K;
+  uint16_t ldacm = M;
+  uint16_t ldb   = N;
+  uint16_t ldbcm = K;
+  uint16_t ldc   = N;
+
+  double alpha = 1.0, beta = 0.0;
+  
+
+  cblas_dgemm(CblasRowMajor,
+              CblasNoTrans, CblasNoTrans,
+              M, N, K,
+              alpha, a, lda, b, ldb,
+              beta, c, ldc);
+  print_m(c, M, N, ldc, "c_cblas_dgemm");
+  
+
+  memset(c, 0, M*N*sizeof(double));
+  #pragma omp target enter data map(to:a[0:M*K], b[0:K*N], c[0:M*N])
+  #pragma omp target data use_device_ptr(a, b, c)
+  {
+    int cublasError = cublasDgemm(handle,
+                                  CUBLAS_OP_N, CUBLAS_OP_N,
+                                  N, M, K,
+                                  &alpha, b, ldb, a, lda,
+                                  &beta, c, ldc);
+  }
+  #pragma omp target exit data map(from:c[0:M*N])
+  print_m(c, M, N, ldc, "c_cublasDgemm");
+  
+
+  memset(c, 0, M*N*sizeof(double));
+  ldc = M; // ldc : N -> M, because cublasDgemm stores result in col-maj
+  #pragma omp target enter data map(to:acm[0:M*K], bcm[0:K*N], c[0:M*N])
+  #pragma omp target data use_device_ptr(acm, bcm, c)
+  {
+    int cublasError = cublasDgemm(handle,
+                                  CUBLAS_OP_N, CUBLAS_OP_N,
+                                  M, N, K,
+                                  &alpha, acm, ldacm, bcm, ldbcm,
+                                  &beta, c, ldc);
+  }
+  #pragma omp target exit data map(from:c[0:M*N])
+  print_m_t(c, M, N, ldc, "c_col-maj_cublasDgemm");
+
+
+  free(a);
+  free(acm);
+  free(b);
+  free(bcm);
+  free(c);
+  return 0;
+}
--- a/independent_test_harness/check_result.sh
+++ b/independent_test_harness/check_result.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+make
+rm -v blocked kay cu
+./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu 
+
--- a/independent_test_harness/debug.h
+++ b/independent_test_harness/debug.h
@ -0,0 +1,38 @@
+void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
+{
+  printf("%s = \n", name);
+  for (uint16_t i = 0; i < m; ++i)
+  {
+    for (uint16_t j = 0; j < n; ++j)
+    {
+      printf("%11.5f ", mat[i * ldm + j]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
+{
+  printf("%s = \n", name);
+  for (uint16_t i = 0; i < m; ++i)
+  {
+    for (uint16_t j = 0; j < n; ++j)
+    {
+      printf("%11.5f ", mat[j * ldm + i]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n)
+{
+  for(uint16_t i = 0; i < m; i++)
+  {
+    for( uint16_t j = 0; j < n; j++)
+    {
+      b[j * ldb + i] = a[i * lda + j];
+    }
+  }
+}
--- a/independent_test_harness/kernels.h
+++ b/independent_test_harness/kernels.h
@ -55,7 +55,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
                          double *__restrict determinant);

 #ifdef HAVE_CUBLAS_OFFLOAD
-uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
+uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
+          const uint64_t vLDS,
          const uint64_t vDim,
          const uint64_t N_updates,
          const double *__restrict __attribute__((aligned(8))) Updates,
--- a/independent_test_harness/meuk.c
+++ b/independent_test_harness/meuk.c
@ -2,13 +2,6 @@
 #include <stdint.h>
 #include <assert.h>

-void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) {
-  for (uint64_t i = 0; i < Lds * Dim; i++) {
-    printf("%f\n", A[i]);
-  }
-  printf("\n");
-}
-
 double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
  double sum2 = 0;
  for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
--- a/independent_test_harness/meuk.h
+++ b/independent_test_harness/meuk.h
@ -15,7 +15,6 @@ typedef struct Error {
 void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
 void residual(double *a, double *res, const uint64_t Dim);
 double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
-void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim);
 double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
 double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
 double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
--- a/independent_test_harness/sm.c
+++ b/independent_test_harness/sm.c
@ -1,6 +1,7 @@
 #include <math.h>
 #include <stdint.h>
 #include "kernels.h"
+#include "debug.h"

 extern uint64_t n_splits;
 extern uint64_t block_fail;
@ -279,8 +280,9 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
  const uint32_t Lds = LDS;

  // Compute C = S^{-1} U : Dim x K : standard dgemm
-  double C[DIM * N_updates];
+  double *C = calloc(1, DIM * N_updates * sizeof(double));
  double alpha = 1.0, beta = 0.0;
+
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
              Dim, N_updates, Lds,
              alpha, Slater_inv, Lds, Updates, Lds,
@ -320,25 +322,50 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
  if (ret != 0) return ret;

-  // tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
-  double tmp[N_updates * LDS];
+  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
+  double tmp1[N_updates * LDS];
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              N_updates, LDS, N_updates,
              alpha, B, N_updates, D, LDS,
-              beta, tmp, LDS);
+              beta, tmp1, LDS);
+  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas");

-  // Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm
-  alpha = -1.0, beta = 1.0;
-  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+  // Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
+  // alpha = -1.0, beta = 1.0;
+  // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+  //             Dim, LDS, N_updates,
+  //             alpha, C, N_updates, tmp1, LDS,
+  //             beta, Slater_inv, LDS);
+
+  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
+  cblas_dgemm(CblasRowMajor,
+              CblasNoTrans, CblasNoTrans,
              Dim, LDS, N_updates,
-              alpha, C, N_updates, tmp, LDS,
-              beta, Slater_inv, LDS);
+              alpha, C, N_updates, tmp1, LDS,
+              beta, tmp2, LDS);
+  print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas");
+
+  double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
+  for(int i = 0; i < DIM * LDS; ++i)
+  {
+    tmp3[i] = Slater_inv[i] - tmp2[i];
+  }
+  print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas");
+
+  for(int i = 0; i < DIM * LDS; ++i)
+  {
+    Slater_inv[i] = tmp3[i];
+  }
+
+  free(tmp2);
+  free(tmp3);

  return 0;
 }

 #ifdef HAVE_CUBLAS_OFFLOAD
-uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
+uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
+          const uint64_t vLDS,
          const uint64_t vDim,
          const uint64_t N_updates,
          const double *__restrict __attribute__((aligned(8))) Updates,
@ -351,34 +378,23 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
  const uint32_t Lds = LDS;

  // Compute C = S^{-1} U : Dim x K : standard dgemm
-  // double C[Dim * N_updates];
-  double *C = malloc(DIM * N_updates * sizeof(double));
-  double alpha = 1.0, beta = 0.0;
-  // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-  //             Dim, N_updates, Lds,
-  //             alpha, Slater_inv, Lds, Updates, Lds,
-  //             beta, C, N_updates);
-  //cuBLAS initialization
-  cublasHandle_t handle;
-  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stdout, "cuBLAS initialization failed!\n");
-    exit(EXIT_FAILURE);
-  }
-  #pragma omp target enter data map(to:Slater_inv, Updates, C)
+  double *C = calloc(1, DIM * N_updates * sizeof(double));
+  double alpha = 1.0f, beta = 0.0f;
+  #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates])
  #pragma omp target data use_device_ptr(Slater_inv, Updates, C)
  {
-    int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
-                          Dim, N_updates, Lds,
-                          &alpha, Slater_inv, Lds, Updates, Lds,
-                          &beta, C, N_updates);
+    int cublasError = cublasDgemm(handle,
+                                  CUBLAS_OP_T, CUBLAS_OP_N,
+                                  15, 21, 24,
+                                  &alpha, Updates, 24, Slater_inv, 24, 
+                                  &beta, C, 15);
  }
-  #pragma omp target exit data map(from:C)
-  cublasDestroy(handle);
-
+  #pragma omp target exit data map(from:C[0:DIM*N_updates])

  // Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
  // Construct D = V S^{-1} : K x LDS
-  double B[N_updates * N_updates], D[N_updates * LDS];
+  double *B = calloc(1, N_updates * N_updates * sizeof(double));
+  double *D = calloc(1, N_updates * LDS * sizeof(double));
  for (uint32_t i = 0; i < N_updates; i++) {
    const uint32_t row = Updates_index[i] - 1;
    for (uint32_t j = 0; j < N_updates  ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
@ -394,7 +410,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
  int j = 0;
  for (uint32_t i = 0; i < N_updates; i++) {
    j += min(ipiv[i] - i, 1);
-    det *= B[(N_updates + 1) * i];
+    det *= B[(N_updates + 1) * i]; // update determinant
  }
  if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even

@ -410,25 +426,79 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
  if (ret != 0) return ret;

-  // tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
-  double tmp[N_updates * LDS];
-  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-              N_updates, LDS, N_updates,
-              alpha, B, N_updates, D, LDS,
-              beta, tmp, LDS);
+  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
+  double *tmp1 = calloc(1, N_updates * LDS * sizeof(double));
+  #pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS])
+  #pragma omp target data use_device_ptr(D, B, tmp1)
+  {
+    int cublasError = cublasDgemm(handle,
+                                  CUBLAS_OP_N, CUBLAS_OP_N,
+                                  LDS, N_updates, N_updates,
+                                  &alpha, D, LDS, B, N_updates, 
+                                  &beta, tmp1, LDS);
+  }
+  #pragma omp target exit data map(from:tmp1[0:N_updates*LDS])
+  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas");

-  // Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm
-  alpha = -1.0, beta = 1.0;
-  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-              Dim, LDS, N_updates,
-              alpha, C, N_updates, tmp, LDS,
-              beta, Slater_inv, LDS);
+  // Compute tmp2 = C * tmp1 : Dim x LDS
+  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
+  #pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS])
+  #pragma omp target data use_device_ptr(tmp1, C, tmp2)
+  {
+    int cublasError = cublasDgemm(handle,
+                                  CUBLAS_OP_N, CUBLAS_OP_N,
+                                  LDS, Dim, N_updates,
+                                  &alpha, tmp1, LDS, C, N_updates,
+                                  &beta, tmp2, LDS);
+  }
+  #pragma omp target exit data map(from:tmp2[0:DIM*LDS])
+  print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas");

+  // Compute tmp3 = S^{-1} - tmp2
+  double *tmp3 = calloc(1, DIM * LDS * sizeof(double)); 
+  beta = -1.0f;
+  #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS])
+  #pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3)
+  {
+    int cublasError = cublasDgeam(handle,
+                                  CUBLAS_OP_N, CUBLAS_OP_N,
+                                  DIM, LDS,
+                                  &alpha, Slater_inv, LDS,
+                                  &beta, tmp2, LDS,
+                                  tmp3, LDS);
+  }
+  #pragma omp target exit data map(from:tmp3[0:DIM*LDS])
+  print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas");
+  for(int i = 0; i < DIM * LDS; ++i)
+  {
+    Slater_inv[i] = tmp3[i];
+  }
+
+  // // Compute S^{-1} <- S^{-1} - tmp2
+  // beta = -1.0f;
+  // #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS])
+  // #pragma omp target data use_device_ptr(Slater_inv, tmp2)
+  // {
+  //   int cublasError = cublasDgeam(handle,
+  //                                 CUBLAS_OP_N, CUBLAS_OP_N,
+  //                                 DIM, LDS,
+  //                                 &alpha, Slater_inv, LDS,
+  //                                 &beta, tmp2, LDS,
+  //                                 Slater_inv, LDS);
+  // }
+  // #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS])
+
+  
+  free(B);
+  free(C);
+  free(D);
+  free(tmp1);
+  free(tmp2);
+  // free(tmp3);
  return 0;
 }
 #endif

-
 uint32_t qmckl_slagel_splitting(
    const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates,
    const double *__restrict __attribute__((aligned(8))) Updates,
--- a/independent_test_harness/test.c
+++ b/independent_test_harness/test.c
@ -4,7 +4,7 @@

 #define DATASET "dataset_329d_zeropadded_cm.hdf5"
 // #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
-#define REPETITIONS 100000
+#define REPETITIONS 1

 uint64_t n_splits;
 uint64_t block_fail;
@ -14,6 +14,15 @@ int main(int argc, char **argv) {
  assert(argc == 2);
  char *version = argv[1];

+#ifdef HAVE_CUBLAS_OFFLOAD
+  cublasHandle_t handle;
+  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
+    fprintf(stdout, "cuBLAS initialization failed!\n");
+    exit(EXIT_FAILURE);
+  }
+#endif
+
+
  // SETUP STORAGE AND DATA ACCESS
  hid_t  file_id;
  file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
@ -214,13 +223,14 @@ printf("#-----------------------------------------------------------------------
        // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
        accumulator += (double)(after - before);

+#ifdef HAVE_CUBLAS_OFFLOAD
      } else if (version[0] == 'c') { // Woodbury K cuBLAS

        // 1. FETCH START TIME
        uint64_t before = rdtsc();

        // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
-        err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates,
+        err_break = qmckl_woodbury_k_cublas_offload(handle, Lds, Dim, N_updates, Updates,
              Updates_index, breakdown, Slater_invT_copy, &determinant);

        // 3. FETCH FINISH TIME
@ -228,7 +238,7 @@ printf("#-----------------------------------------------------------------------

        // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
        accumulator += (double)(after - before);
-
+#endif
      } else if (version[0] == 's') { // Splitting

        // 1. FETCH START TIME
@ -283,7 +293,7 @@ printf("#-----------------------------------------------------------------------
        // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
        accumulator += (double)(after - before);

-      } else {                        // Exit
+      } else { // Exit
        printf("Version '%c' not implemented.\n", version[0]);
        return 1;
      }
@ -336,4 +346,8 @@ printf("#-----------------------------------------------------------------------
  printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");

  (void) H5Fclose(file_id);
+
+#ifdef HAVE_CUBLAS_OFFLOAD  
+  cublasDestroy(handle);
+#endif
 }