mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2024-12-25 05:43:54 +01:00
Added cuSOLVER and replaced LAPACKE_dgetrf with cusolverDnDgetrf.
This commit is contained in:
parent
00bdcba230
commit
5a61ccc6b1
@ -7,7 +7,7 @@ INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
|
|||||||
|
|
||||||
LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
|
LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
|
||||||
LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
||||||
LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -mp -target=gpu
|
LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -lcusolver -mp -target=gpu
|
||||||
|
|
||||||
all: test
|
all: test
|
||||||
|
|
||||||
|
@ -37,7 +37,7 @@ void print_m_t(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* na
|
|||||||
{
|
{
|
||||||
for (uint16_t j = 0; j < n; ++j)
|
for (uint16_t j = 0; j < n; ++j)
|
||||||
{
|
{
|
||||||
printf("%11.5f ", mat[j * ldm + i]);
|
printf("%9.3f ", mat[j * ldm + i]);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
|
#include <cusolverDn.h>
|
||||||
|
#include <cusolver_common.h>
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -56,7 +58,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
double *__restrict determinant);
|
double *__restrict determinant);
|
||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHandle_t s_handle,
|
||||||
const uint64_t vLDS,
|
const uint64_t vLDS,
|
||||||
const uint64_t vDim,
|
const uint64_t vDim,
|
||||||
const uint64_t N_updates,
|
const uint64_t N_updates,
|
||||||
|
@ -298,7 +298,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Compute determinant by LU decomposition
|
// Compute determinant by LU decomposition
|
||||||
int ipiv[N_updates];
|
int* ipiv = calloc(1, sizeof *ipiv * N_updates);
|
||||||
(void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
(void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
||||||
|
|
||||||
double det = 1.0;
|
double det = 1.0;
|
||||||
@ -334,11 +334,12 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
alpha, C, N_updates, tmp1, LDS,
|
alpha, C, N_updates, tmp1, LDS,
|
||||||
beta, Slater_inv, LDS);
|
beta, Slater_inv, LDS);
|
||||||
|
|
||||||
|
free(ipiv);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHandle_t s_handle,
|
||||||
const uint64_t vLDS,
|
const uint64_t vLDS,
|
||||||
const uint64_t vDim,
|
const uint64_t vDim,
|
||||||
const uint64_t N_updates,
|
const uint64_t N_updates,
|
||||||
@ -352,13 +353,18 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
|||||||
const uint32_t Lds = LDS;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
double alpha, beta;
|
double alpha, beta;
|
||||||
int* ipiv = calloc(1, sizeof *ipiv * N_updates);
|
int* ipiv = calloc(1, sizeof *ipiv * N_updates);
|
||||||
double* C = calloc(1, sizeof *C * Dim * N_updates);
|
double* C = calloc(1, sizeof *C * Dim * N_updates);
|
||||||
double* B = calloc(1, sizeof *B * N_updates * N_updates);
|
double* B = calloc(1, sizeof *B * N_updates * N_updates);
|
||||||
double* D = calloc(1, sizeof *D * N_updates * Lds);
|
double* D = calloc(1, sizeof *D * N_updates * Lds);
|
||||||
double* T1 = calloc(1, sizeof *T1 * N_updates * Lds);
|
double* T1 = calloc(1, sizeof *T1 * N_updates * Lds);
|
||||||
double* T2 = calloc(1, sizeof *T2 * Dim * Lds);
|
double* T2 = calloc(1, sizeof *T2 * Dim * Lds);
|
||||||
|
|
||||||
|
int lwork = 0, *info = NULL; double* d_work = NULL;
|
||||||
|
cusolverDnDgetrf_bufferSize(s_handle, N_updates, N_updates, B, N_updates, &lwork);
|
||||||
|
printf("Size of lwork = %d\n", lwork);
|
||||||
|
d_work = calloc(1, sizeof *d_work * lwork);
|
||||||
|
|
||||||
#pragma omp target enter data map(to: Updates[0:Lds*N_updates], \
|
#pragma omp target enter data map(to: Updates[0:Lds*N_updates], \
|
||||||
Updates_index[0:N_updates], \
|
Updates_index[0:N_updates], \
|
||||||
Slater_inv[0:Dim*Lds]) \
|
Slater_inv[0:Dim*Lds]) \
|
||||||
@ -367,12 +373,13 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
|||||||
D[0:N_updates*Lds], \
|
D[0:N_updates*Lds], \
|
||||||
T1[0:N_updates*Lds], \
|
T1[0:N_updates*Lds], \
|
||||||
T2[0:Dim*Lds], \
|
T2[0:Dim*Lds], \
|
||||||
ipiv[0:N_updates])
|
ipiv[0:N_updates], \
|
||||||
|
d_work[0:lwork])
|
||||||
|
|
||||||
#pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE
|
#pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE
|
||||||
{
|
{
|
||||||
alpha = 1.0f, beta = 0.0f;
|
alpha = 1.0f, beta = 0.0f;
|
||||||
(void) cublasDgemm(handle,
|
(void) cublasDgemm(b_handle,
|
||||||
CUBLAS_OP_T, CUBLAS_OP_N,
|
CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
N_updates, Dim, Lds,
|
N_updates, Dim, Lds,
|
||||||
&alpha, Updates, Lds, Slater_inv, Lds,
|
&alpha, Updates, Lds, Slater_inv, Lds,
|
||||||
@ -387,24 +394,26 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
|||||||
const uint32_t row = Updates_index[i] - 1;
|
const uint32_t row = Updates_index[i] - 1;
|
||||||
for (uint32_t j = 0; j < N_updates ; j++)
|
for (uint32_t j = 0; j < N_updates ; j++)
|
||||||
{
|
{
|
||||||
B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
B[j * N_updates + i] = C[row * N_updates + j] + (i == j); // B NEEDS TO BE IN COL-MAJ FOR cusolverDnDgetrf !
|
||||||
}
|
}
|
||||||
for (uint32_t j = 0; j < Lds; j++)
|
for (uint32_t j = 0; j < Lds; j++)
|
||||||
{
|
{
|
||||||
D[i * Lds + j] = Slater_inv[row * Lds + j];
|
D[i * Lds + j] = Slater_inv[row * Lds + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#pragma omp target update from(B[0:N_updates*N_updates]) // Update B ON HOST
|
|
||||||
|
|
||||||
// Compute determinant by LU decomposition
|
// Compute determinant by LU decomposition
|
||||||
(void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
#pragma omp target data use_device_ptr(B, d_work, ipiv) // compute C ON DEVICE
|
||||||
#pragma omp target update to(B[0:N_updates*N_updates], ipiv[0:N_updates]) // Update B ON DEVICE
|
{
|
||||||
|
(void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, d_work, ipiv, info);
|
||||||
|
}
|
||||||
|
|
||||||
double det = 1.0f; uint32_t j = 0;
|
double det = 1.0f; uint32_t j = 0;
|
||||||
#pragma omp target teams distribute parallel for // compute det ON DEVICE
|
#pragma omp target teams distribute parallel for // compute det ON DEVICE
|
||||||
for (uint32_t i = 0; i < N_updates; i++)
|
for (uint32_t i = 0; i < N_updates; i++)
|
||||||
{
|
{
|
||||||
j += min(ipiv[i] - i, 1);
|
int row = ipiv[i] - i;
|
||||||
|
j += min(row, 1);
|
||||||
det *= B[(N_updates + 1) * i]; // update determinant
|
det *= B[(N_updates + 1) * i]; // update determinant
|
||||||
}
|
}
|
||||||
if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
|
if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
|
||||||
@ -416,15 +425,16 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
|||||||
if (determinant) *determinant *= det;
|
if (determinant) *determinant *= det;
|
||||||
|
|
||||||
// Compute B^{-1}
|
// Compute B^{-1}
|
||||||
(void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST
|
#pragma omp target update from(B[0:N_updates*N_updates], ipiv[0:N_updates])
|
||||||
|
(void) LAPACKE_dgetri(LAPACK_COL_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST
|
||||||
#pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE
|
#pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE
|
||||||
|
|
||||||
// T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
// T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||||
#pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE
|
#pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE
|
||||||
{
|
{
|
||||||
alpha = 1.0, beta = 0.0;
|
alpha = 1.0, beta = 0.0;
|
||||||
(void) cublasDgemm(handle,
|
(void) cublasDgemm(b_handle,
|
||||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
CUBLAS_OP_N, CUBLAS_OP_T, // REMEMBER THIS IS TMP TRANSPOSED BECAUSE OF LAPACKE CALL ON l429 !!!
|
||||||
Lds, N_updates, N_updates,
|
Lds, N_updates, N_updates,
|
||||||
&alpha, D, Lds, B, N_updates,
|
&alpha, D, Lds, B, N_updates,
|
||||||
&beta, T1, Lds);
|
&beta, T1, Lds);
|
||||||
@ -434,7 +444,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
|||||||
#pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ONM DEVICE
|
#pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ONM DEVICE
|
||||||
{
|
{
|
||||||
alpha = 1.0f, beta = 0.0f;
|
alpha = 1.0f, beta = 0.0f;
|
||||||
(void) cublasDgemm(handle,
|
(void) cublasDgemm(b_handle,
|
||||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
Dim, Lds, N_updates,
|
Dim, Lds, N_updates,
|
||||||
&alpha, T1, Lds, C, N_updates,
|
&alpha, T1, Lds, C, N_updates,
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "meuk.h"
|
#include "meuk.h"
|
||||||
#include "cycles.h"
|
#include "cycles.h"
|
||||||
|
|
||||||
@ -16,10 +16,15 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
cublasHandle_t handle;
|
cublasHandle_t handle;
|
||||||
|
cusolverDnHandle_t s_handle;
|
||||||
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
||||||
fprintf(stdout, "cuBLAS initialization failed!\n");
|
fprintf(stdout, "cuBLAS initialization failed!\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
if (cusolverDnCreate(&s_handle) != CUSOLVER_STATUS_SUCCESS) {
|
||||||
|
fprintf(stdout, "cuSOLVER initialization failed!\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
@ -230,7 +235,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_woodbury_k_cublas_offload(handle, Lds, Dim, N_updates, Updates,
|
err_break = qmckl_woodbury_k_cublas_offload(handle, s_handle, Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -349,5 +354,6 @@ printf("#-----------------------------------------------------------------------
|
|||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
cublasDestroy(handle);
|
cublasDestroy(handle);
|
||||||
|
cusolverDnDestroy(s_handle);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user