mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2024-12-25 13:53:56 +01:00
Replaced all CBLAS dgemms with cuBLAS dgemms and dgeams. Works but not ideal.
This commit is contained in:
parent
87e319189e
commit
892358d0d1
@ -1,17 +1,17 @@
|
|||||||
FC = ifx
|
FC = ifx
|
||||||
CC = nvc
|
CC = nvc
|
||||||
|
|
||||||
CFLAGS=-std=c99 -O3 -Wall -g
|
CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu
|
||||||
|
|
||||||
LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
|
LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
|
||||||
LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
||||||
LFLAGS+=-lcublas
|
LDFLAGS+=-lcublas -mp -target=gpu
|
||||||
|
|
||||||
## Link with icc
|
## Link with icc
|
||||||
# test: sm.o test.o detupdate21.o meuk.o
|
# test: sm.o test.o detupdate21.o meuk.o
|
||||||
# $(CC) $(LFLAGS) -o test sm.o detupdate21.o test.o meuk.o
|
# $(CC) $(LDFLAGS) -o test sm.o detupdate21.o test.o meuk.o
|
||||||
test: sm.o test.o meuk.o
|
test: sm.o test.o meuk.o
|
||||||
$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
|
$(CC) $(LDFLAGS) -o test sm.o test.o meuk.o
|
||||||
|
|
||||||
%.o: %.f90
|
%.o: %.f90
|
||||||
$(FC) $(FFLAGS) -c -o $@ $<
|
$(FC) $(FFLAGS) -c -o $@ $<
|
||||||
|
107
independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
Normal file
107
independent_test_harness/cblasdgemm_vs_cublasdgemm_test.c
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
/*
|
||||||
|
Compile with:
|
||||||
|
|
||||||
|
nvc -L${MKLROOT}/lib/intel64 \
|
||||||
|
-lmkl_intel_lp64 \
|
||||||
|
-lmkl_sequential \
|
||||||
|
-lmkl_core \
|
||||||
|
-lpthread \
|
||||||
|
-lm \
|
||||||
|
-ldl \
|
||||||
|
-lcublas \
|
||||||
|
-mp \
|
||||||
|
-target=gpu \
|
||||||
|
cblasdgemm_vs_cublasdgemm_test.c \
|
||||||
|
-o cblasdgemm_vs_cublasdgemm_test
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <mkl_lapacke.h>
|
||||||
|
#include <mkl.h>
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
|
||||||
|
#include "debug.h"
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
cublasHandle_t handle;
|
||||||
|
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
||||||
|
fprintf(stdout, "cuBLAS initialization failed!\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint16_t M = 3;
|
||||||
|
uint16_t N = 2;
|
||||||
|
uint16_t K = 4;
|
||||||
|
|
||||||
|
double *a = malloc(M * K * sizeof(double)); // M x K = 3 x 4
|
||||||
|
double *acm = malloc(M * K * sizeof(double)); // col-major stored
|
||||||
|
double *b = malloc(K * N * sizeof(double)); // K x N = 4 x 2
|
||||||
|
double *bcm = malloc(K * N * sizeof(double)); // col-major stored
|
||||||
|
double *c = malloc(M * N * sizeof(double)); // M x N = 3 x 2
|
||||||
|
|
||||||
|
a[0] = 1, a[1] = 2, a[2] = 3, a[3] = 4, a[4] = 5, a[5] = 6, a[6] = 7, a[7] = 8; a[8] = 9, a[9] = 10, a[10] = 11, a[11] = 12;
|
||||||
|
acm[0] = 1, acm[1] = 5, acm[2] = 9, acm[3] = 2, acm[4] = 6, acm[5] = 10, acm[6] = 3, acm[7] = 7; acm[8] = 11, acm[9] = 4, acm[10] = 8, acm[11] = 12;
|
||||||
|
b[0] = 13, b[1] = 14, b[2] = 15, b[3] = 16, b[4] = 17, b[5] = 18, b[6] = 19, b[7] = 20;
|
||||||
|
bcm[0] = 13, bcm[1] = 15, bcm[2] = 17, bcm[3] = 19, bcm[4] = 14, bcm[5] = 16, bcm[6] = 18, bcm[7] = 20;
|
||||||
|
|
||||||
|
uint16_t lda = K;
|
||||||
|
uint16_t ldacm = M;
|
||||||
|
uint16_t ldb = N;
|
||||||
|
uint16_t ldbcm = K;
|
||||||
|
uint16_t ldc = N;
|
||||||
|
|
||||||
|
double alpha = 1.0, beta = 0.0;
|
||||||
|
|
||||||
|
|
||||||
|
cblas_dgemm(CblasRowMajor,
|
||||||
|
CblasNoTrans, CblasNoTrans,
|
||||||
|
M, N, K,
|
||||||
|
alpha, a, lda, b, ldb,
|
||||||
|
beta, c, ldc);
|
||||||
|
print_m(c, M, N, ldc, "c_cblas_dgemm");
|
||||||
|
|
||||||
|
|
||||||
|
memset(c, 0, M*N*sizeof(double));
|
||||||
|
#pragma omp target enter data map(to:a[0:M*K], b[0:K*N], c[0:M*N])
|
||||||
|
#pragma omp target data use_device_ptr(a, b, c)
|
||||||
|
{
|
||||||
|
int cublasError = cublasDgemm(handle,
|
||||||
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
N, M, K,
|
||||||
|
&alpha, b, ldb, a, lda,
|
||||||
|
&beta, c, ldc);
|
||||||
|
}
|
||||||
|
#pragma omp target exit data map(from:c[0:M*N])
|
||||||
|
print_m(c, M, N, ldc, "c_cublasDgemm");
|
||||||
|
|
||||||
|
|
||||||
|
memset(c, 0, M*N*sizeof(double));
|
||||||
|
ldc = M; // ldc : N -> M, because cublasDgemm stores result in col-maj
|
||||||
|
#pragma omp target enter data map(to:acm[0:M*K], bcm[0:K*N], c[0:M*N])
|
||||||
|
#pragma omp target data use_device_ptr(acm, bcm, c)
|
||||||
|
{
|
||||||
|
int cublasError = cublasDgemm(handle,
|
||||||
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
M, N, K,
|
||||||
|
&alpha, acm, ldacm, bcm, ldbcm,
|
||||||
|
&beta, c, ldc);
|
||||||
|
}
|
||||||
|
#pragma omp target exit data map(from:c[0:M*N])
|
||||||
|
print_m_t(c, M, N, ldc, "c_col-maj_cublasDgemm");
|
||||||
|
|
||||||
|
|
||||||
|
free(a);
|
||||||
|
free(acm);
|
||||||
|
free(b);
|
||||||
|
free(bcm);
|
||||||
|
free(c);
|
||||||
|
return 0;
|
||||||
|
}
|
5
independent_test_harness/check_result.sh
Executable file
5
independent_test_harness/check_result.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
make
|
||||||
|
rm -v blocked kay cu
|
||||||
|
./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu
|
||||||
|
|
38
independent_test_harness/debug.h
Normal file
38
independent_test_harness/debug.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
|
||||||
|
{
|
||||||
|
printf("%s = \n", name);
|
||||||
|
for (uint16_t i = 0; i < m; ++i)
|
||||||
|
{
|
||||||
|
for (uint16_t j = 0; j < n; ++j)
|
||||||
|
{
|
||||||
|
printf("%11.5f ", mat[i * ldm + j]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
|
||||||
|
{
|
||||||
|
printf("%s = \n", name);
|
||||||
|
for (uint16_t i = 0; i < m; ++i)
|
||||||
|
{
|
||||||
|
for (uint16_t j = 0; j < n; ++j)
|
||||||
|
{
|
||||||
|
printf("%11.5f ", mat[j * ldm + i]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void transpose(double* a, uint16_t lda, double *b, uint16_t ldb, uint16_t m, uint16_t n)
|
||||||
|
{
|
||||||
|
for(uint16_t i = 0; i < m; i++)
|
||||||
|
{
|
||||||
|
for( uint16_t j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
b[j * ldb + i] = a[i * lda + j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -55,7 +55,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
double *__restrict determinant);
|
double *__restrict determinant);
|
||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
||||||
|
const uint64_t vLDS,
|
||||||
const uint64_t vDim,
|
const uint64_t vDim,
|
||||||
const uint64_t N_updates,
|
const uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
|
@ -2,13 +2,6 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) {
|
|
||||||
for (uint64_t i = 0; i < Lds * Dim; i++) {
|
|
||||||
printf("%f\n", A[i]);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
|
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||||
double sum2 = 0;
|
double sum2 = 0;
|
||||||
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
|
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
|
||||||
|
@ -15,7 +15,6 @@ typedef struct Error {
|
|||||||
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
|
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
|
||||||
void residual(double *a, double *res, const uint64_t Dim);
|
void residual(double *a, double *res, const uint64_t Dim);
|
||||||
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
|
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim);
|
|
||||||
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
|
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "kernels.h"
|
#include "kernels.h"
|
||||||
|
#include "debug.h"
|
||||||
|
|
||||||
extern uint64_t n_splits;
|
extern uint64_t n_splits;
|
||||||
extern uint64_t block_fail;
|
extern uint64_t block_fail;
|
||||||
@ -279,8 +280,9 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
const uint32_t Lds = LDS;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||||
double C[DIM * N_updates];
|
double *C = calloc(1, DIM * N_updates * sizeof(double));
|
||||||
double alpha = 1.0, beta = 0.0;
|
double alpha = 1.0, beta = 0.0;
|
||||||
|
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
Dim, N_updates, Lds,
|
Dim, N_updates, Lds,
|
||||||
alpha, Slater_inv, Lds, Updates, Lds,
|
alpha, Slater_inv, Lds, Updates, Lds,
|
||||||
@ -320,25 +322,50 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
||||||
if (ret != 0) return ret;
|
if (ret != 0) return ret;
|
||||||
|
|
||||||
// tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||||
double tmp[N_updates * LDS];
|
double tmp1[N_updates * LDS];
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
N_updates, LDS, N_updates,
|
N_updates, LDS, N_updates,
|
||||||
alpha, B, N_updates, D, LDS,
|
alpha, B, N_updates, D, LDS,
|
||||||
beta, tmp, LDS);
|
beta, tmp1, LDS);
|
||||||
|
print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas");
|
||||||
|
|
||||||
// Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm
|
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
|
||||||
alpha = -1.0, beta = 1.0;
|
// alpha = -1.0, beta = 1.0;
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
|
// Dim, LDS, N_updates,
|
||||||
|
// alpha, C, N_updates, tmp1, LDS,
|
||||||
|
// beta, Slater_inv, LDS);
|
||||||
|
|
||||||
|
double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
|
||||||
|
cblas_dgemm(CblasRowMajor,
|
||||||
|
CblasNoTrans, CblasNoTrans,
|
||||||
Dim, LDS, N_updates,
|
Dim, LDS, N_updates,
|
||||||
alpha, C, N_updates, tmp, LDS,
|
alpha, C, N_updates, tmp1, LDS,
|
||||||
beta, Slater_inv, LDS);
|
beta, tmp2, LDS);
|
||||||
|
print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas");
|
||||||
|
|
||||||
|
double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
|
||||||
|
for(int i = 0; i < DIM * LDS; ++i)
|
||||||
|
{
|
||||||
|
tmp3[i] = Slater_inv[i] - tmp2[i];
|
||||||
|
}
|
||||||
|
print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas");
|
||||||
|
|
||||||
|
for(int i = 0; i < DIM * LDS; ++i)
|
||||||
|
{
|
||||||
|
Slater_inv[i] = tmp3[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
free(tmp2);
|
||||||
|
free(tmp3);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
||||||
|
const uint64_t vLDS,
|
||||||
const uint64_t vDim,
|
const uint64_t vDim,
|
||||||
const uint64_t N_updates,
|
const uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
@ -351,34 +378,23 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
|||||||
const uint32_t Lds = LDS;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||||
// double C[Dim * N_updates];
|
double *C = calloc(1, DIM * N_updates * sizeof(double));
|
||||||
double *C = malloc(DIM * N_updates * sizeof(double));
|
double alpha = 1.0f, beta = 0.0f;
|
||||||
double alpha = 1.0, beta = 0.0;
|
#pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates])
|
||||||
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
|
||||||
// Dim, N_updates, Lds,
|
|
||||||
// alpha, Slater_inv, Lds, Updates, Lds,
|
|
||||||
// beta, C, N_updates);
|
|
||||||
//cuBLAS initialization
|
|
||||||
cublasHandle_t handle;
|
|
||||||
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
|
||||||
fprintf(stdout, "cuBLAS initialization failed!\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
#pragma omp target enter data map(to:Slater_inv, Updates, C)
|
|
||||||
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
|
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
|
||||||
{
|
{
|
||||||
int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
|
int cublasError = cublasDgemm(handle,
|
||||||
Dim, N_updates, Lds,
|
CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
&alpha, Slater_inv, Lds, Updates, Lds,
|
15, 21, 24,
|
||||||
&beta, C, N_updates);
|
&alpha, Updates, 24, Slater_inv, 24,
|
||||||
|
&beta, C, 15);
|
||||||
}
|
}
|
||||||
#pragma omp target exit data map(from:C)
|
#pragma omp target exit data map(from:C[0:DIM*N_updates])
|
||||||
cublasDestroy(handle);
|
|
||||||
|
|
||||||
|
|
||||||
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
||||||
// Construct D = V S^{-1} : K x LDS
|
// Construct D = V S^{-1} : K x LDS
|
||||||
double B[N_updates * N_updates], D[N_updates * LDS];
|
double *B = calloc(1, N_updates * N_updates * sizeof(double));
|
||||||
|
double *D = calloc(1, N_updates * LDS * sizeof(double));
|
||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
const uint32_t row = Updates_index[i] - 1;
|
const uint32_t row = Updates_index[i] - 1;
|
||||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||||
@ -394,7 +410,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
|||||||
int j = 0;
|
int j = 0;
|
||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
j += min(ipiv[i] - i, 1);
|
j += min(ipiv[i] - i, 1);
|
||||||
det *= B[(N_updates + 1) * i];
|
det *= B[(N_updates + 1) * i]; // update determinant
|
||||||
}
|
}
|
||||||
if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
|
if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
|
||||||
|
|
||||||
@ -410,25 +426,79 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
|||||||
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
||||||
if (ret != 0) return ret;
|
if (ret != 0) return ret;
|
||||||
|
|
||||||
// tmp = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||||
double tmp[N_updates * LDS];
|
double *tmp1 = calloc(1, N_updates * LDS * sizeof(double));
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
#pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS])
|
||||||
N_updates, LDS, N_updates,
|
#pragma omp target data use_device_ptr(D, B, tmp1)
|
||||||
alpha, B, N_updates, D, LDS,
|
{
|
||||||
beta, tmp, LDS);
|
int cublasError = cublasDgemm(handle,
|
||||||
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
LDS, N_updates, N_updates,
|
||||||
|
&alpha, D, LDS, B, N_updates,
|
||||||
|
&beta, tmp1, LDS);
|
||||||
|
}
|
||||||
|
#pragma omp target exit data map(from:tmp1[0:N_updates*LDS])
|
||||||
|
print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas");
|
||||||
|
|
||||||
// Compute S^{-1} - C * tmp : Dim x LDS : standard dgemm
|
// Compute tmp2 = C * tmp1 : Dim x LDS
|
||||||
alpha = -1.0, beta = 1.0;
|
double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
#pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS])
|
||||||
Dim, LDS, N_updates,
|
#pragma omp target data use_device_ptr(tmp1, C, tmp2)
|
||||||
alpha, C, N_updates, tmp, LDS,
|
{
|
||||||
beta, Slater_inv, LDS);
|
int cublasError = cublasDgemm(handle,
|
||||||
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
LDS, Dim, N_updates,
|
||||||
|
&alpha, tmp1, LDS, C, N_updates,
|
||||||
|
&beta, tmp2, LDS);
|
||||||
|
}
|
||||||
|
#pragma omp target exit data map(from:tmp2[0:DIM*LDS])
|
||||||
|
print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas");
|
||||||
|
|
||||||
|
// Compute tmp3 = S^{-1} - tmp2
|
||||||
|
double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
|
||||||
|
beta = -1.0f;
|
||||||
|
#pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS])
|
||||||
|
#pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3)
|
||||||
|
{
|
||||||
|
int cublasError = cublasDgeam(handle,
|
||||||
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
DIM, LDS,
|
||||||
|
&alpha, Slater_inv, LDS,
|
||||||
|
&beta, tmp2, LDS,
|
||||||
|
tmp3, LDS);
|
||||||
|
}
|
||||||
|
#pragma omp target exit data map(from:tmp3[0:DIM*LDS])
|
||||||
|
print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas");
|
||||||
|
for(int i = 0; i < DIM * LDS; ++i)
|
||||||
|
{
|
||||||
|
Slater_inv[i] = tmp3[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// // Compute S^{-1} <- S^{-1} - tmp2
|
||||||
|
// beta = -1.0f;
|
||||||
|
// #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS])
|
||||||
|
// #pragma omp target data use_device_ptr(Slater_inv, tmp2)
|
||||||
|
// {
|
||||||
|
// int cublasError = cublasDgeam(handle,
|
||||||
|
// CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
// DIM, LDS,
|
||||||
|
// &alpha, Slater_inv, LDS,
|
||||||
|
// &beta, tmp2, LDS,
|
||||||
|
// Slater_inv, LDS);
|
||||||
|
// }
|
||||||
|
// #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS])
|
||||||
|
|
||||||
|
|
||||||
|
free(B);
|
||||||
|
free(C);
|
||||||
|
free(D);
|
||||||
|
free(tmp1);
|
||||||
|
free(tmp2);
|
||||||
|
// free(tmp3);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
uint32_t qmckl_slagel_splitting(
|
uint32_t qmckl_slagel_splitting(
|
||||||
const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates,
|
const uint64_t vLDS, const uint64_t vDim, uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
|
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
|
||||||
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
|
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
|
||||||
#define REPETITIONS 100000
|
#define REPETITIONS 1
|
||||||
|
|
||||||
uint64_t n_splits;
|
uint64_t n_splits;
|
||||||
uint64_t block_fail;
|
uint64_t block_fail;
|
||||||
@ -14,6 +14,15 @@ int main(int argc, char **argv) {
|
|||||||
assert(argc == 2);
|
assert(argc == 2);
|
||||||
char *version = argv[1];
|
char *version = argv[1];
|
||||||
|
|
||||||
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
|
cublasHandle_t handle;
|
||||||
|
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
||||||
|
fprintf(stdout, "cuBLAS initialization failed!\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// SETUP STORAGE AND DATA ACCESS
|
// SETUP STORAGE AND DATA ACCESS
|
||||||
hid_t file_id;
|
hid_t file_id;
|
||||||
file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
|
file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
|
||||||
@ -214,13 +223,14 @@ printf("#-----------------------------------------------------------------------
|
|||||||
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||||
accumulator += (double)(after - before);
|
accumulator += (double)(after - before);
|
||||||
|
|
||||||
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
} else if (version[0] == 'c') { // Woodbury K cuBLAS
|
} else if (version[0] == 'c') { // Woodbury K cuBLAS
|
||||||
|
|
||||||
// 1. FETCH START TIME
|
// 1. FETCH START TIME
|
||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates,
|
err_break = qmckl_woodbury_k_cublas_offload(handle, Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -228,7 +238,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
|
|
||||||
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||||
accumulator += (double)(after - before);
|
accumulator += (double)(after - before);
|
||||||
|
#endif
|
||||||
} else if (version[0] == 's') { // Splitting
|
} else if (version[0] == 's') { // Splitting
|
||||||
|
|
||||||
// 1. FETCH START TIME
|
// 1. FETCH START TIME
|
||||||
@ -283,7 +293,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||||
accumulator += (double)(after - before);
|
accumulator += (double)(after - before);
|
||||||
|
|
||||||
} else { // Exit
|
} else { // Exit
|
||||||
printf("Version '%c' not implemented.\n", version[0]);
|
printf("Version '%c' not implemented.\n", version[0]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -336,4 +346,8 @@ printf("#-----------------------------------------------------------------------
|
|||||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||||
|
|
||||||
(void) H5Fclose(file_id);
|
(void) H5Fclose(file_id);
|
||||||
|
|
||||||
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
|
cublasDestroy(handle);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user