mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2024-12-24 13:23:45 +01:00
cuBLAS version of Woodbury KxK is working, but called to lapacke dgetrf/ri need to be replaced with cuSOLVER calls to eliminate intermediate results to be transfered to/from device.
This commit is contained in:
parent
892358d0d1
commit
00bdcba230
@ -1,11 +1,15 @@
|
|||||||
FC = ifx
|
#FC = ifx
|
||||||
CC = nvc
|
#CC = nvc
|
||||||
|
|
||||||
CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu
|
CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu
|
||||||
|
|
||||||
LDFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
|
INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
|
||||||
|
|
||||||
|
LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl
|
||||||
LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
LDFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
||||||
LDFLAGS+=-lcublas -mp -target=gpu
|
LDFLAGS+=-L$(NVHPC_ROOT)/math_libs/lib64 -lcublas -mp -target=gpu
|
||||||
|
|
||||||
|
all: test
|
||||||
|
|
||||||
## Link with icc
|
## Link with icc
|
||||||
# test: sm.o test.o detupdate21.o meuk.o
|
# test: sm.o test.o detupdate21.o meuk.o
|
||||||
|
@ -1,18 +1,21 @@
|
|||||||
/*
|
/*
|
||||||
Compile with:
|
Compile with:
|
||||||
|
|
||||||
nvc -L${MKLROOT}/lib/intel64 \
|
nvc \
|
||||||
-lmkl_intel_lp64 \
|
-I$NV_CUDA_MATH_PATH/11.7/include \
|
||||||
-lmkl_sequential \
|
-L$NV_CUDA_MATH_PATH/11.7/lib64 \
|
||||||
-lmkl_core \
|
-L${MKLROOT}/lib/intel64 \
|
||||||
-lpthread \
|
-lmkl_intel_lp64 \
|
||||||
-lm \
|
-lmkl_sequential \
|
||||||
-ldl \
|
-lmkl_core \
|
||||||
-lcublas \
|
-lpthread \
|
||||||
-mp \
|
-lm \
|
||||||
-target=gpu \
|
-ldl \
|
||||||
cblasdgemm_vs_cublasdgemm_test.c \
|
-lcublas \
|
||||||
-o cblasdgemm_vs_cublasdgemm_test
|
-mp \
|
||||||
|
-target=gpu \
|
||||||
|
cblasdgemm_vs_cublasdgemm_test.c \
|
||||||
|
-o cblasdgemm_vs_cublasdgemm_test
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -1,11 +1,29 @@
|
|||||||
void print_m(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
void print_dm(const double* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
|
||||||
{
|
{
|
||||||
printf("%s = \n", name);
|
printf("%s = \n", name);
|
||||||
for (uint16_t i = 0; i < m; ++i)
|
for (uint16_t i = 0; i < m; ++i)
|
||||||
{
|
{
|
||||||
for (uint16_t j = 0; j < n; ++j)
|
for (uint16_t j = 0; j < n; ++j)
|
||||||
{
|
{
|
||||||
printf("%11.5f ", mat[i * ldm + j]);
|
printf("%9.3f ", mat[i * ldm + j]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_im(const int* mat, uint16_t m, uint16_t n, uint16_t ldm, char* name)
|
||||||
|
{
|
||||||
|
printf("%s = \n", name);
|
||||||
|
for (uint16_t i = 0; i < m; ++i)
|
||||||
|
{
|
||||||
|
for (uint16_t j = 0; j < n; ++j)
|
||||||
|
{
|
||||||
|
printf("%d ", mat[i * ldm + j]);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
@ -7,8 +7,9 @@
|
|||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <cuda_runtime.h>
|
#include <omp.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
|
lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "hdf5.h"
|
#include <hdf5/serial/hdf5.h>
|
||||||
#include "kernels.h"
|
#include "kernels.h"
|
||||||
|
|
||||||
typedef struct Error {
|
typedef struct Error {
|
||||||
|
@ -299,9 +299,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
|
|
||||||
// Compute determinant by LU decomposition
|
// Compute determinant by LU decomposition
|
||||||
int ipiv[N_updates];
|
int ipiv[N_updates];
|
||||||
lapack_int ret;
|
(void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
||||||
ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
|
||||||
if (ret != 0) return ret;
|
|
||||||
double det = 1.0;
|
double det = 1.0;
|
||||||
int j = 0;
|
int j = 0;
|
||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
@ -319,8 +318,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
if (determinant) *determinant *= det;
|
if (determinant) *determinant *= det;
|
||||||
|
|
||||||
// Compute B^{-1} with explicit formula for K x K inversion
|
// Compute B^{-1} with explicit formula for K x K inversion
|
||||||
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
(void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
||||||
if (ret != 0) return ret;
|
|
||||||
|
|
||||||
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||||
double tmp1[N_updates * LDS];
|
double tmp1[N_updates * LDS];
|
||||||
@ -328,173 +326,146 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
N_updates, LDS, N_updates,
|
N_updates, LDS, N_updates,
|
||||||
alpha, B, N_updates, D, LDS,
|
alpha, B, N_updates, D, LDS,
|
||||||
beta, tmp1, LDS);
|
beta, tmp1, LDS);
|
||||||
print_m(tmp1, N_updates, LDS, LDS, "tmp1_cblas");
|
|
||||||
|
|
||||||
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
|
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
|
||||||
// alpha = -1.0, beta = 1.0;
|
alpha = -1.0, beta = 1.0;
|
||||||
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
// Dim, LDS, N_updates,
|
|
||||||
// alpha, C, N_updates, tmp1, LDS,
|
|
||||||
// beta, Slater_inv, LDS);
|
|
||||||
|
|
||||||
double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
|
|
||||||
cblas_dgemm(CblasRowMajor,
|
|
||||||
CblasNoTrans, CblasNoTrans,
|
|
||||||
Dim, LDS, N_updates,
|
Dim, LDS, N_updates,
|
||||||
alpha, C, N_updates, tmp1, LDS,
|
alpha, C, N_updates, tmp1, LDS,
|
||||||
beta, tmp2, LDS);
|
beta, Slater_inv, LDS);
|
||||||
print_m(tmp2, DIM, LDS, LDS, "tmp2_cblas");
|
|
||||||
|
|
||||||
double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
|
|
||||||
for(int i = 0; i < DIM * LDS; ++i)
|
|
||||||
{
|
|
||||||
tmp3[i] = Slater_inv[i] - tmp2[i];
|
|
||||||
}
|
|
||||||
print_m(tmp3, DIM, LDS, LDS, "tmp3_cblas");
|
|
||||||
|
|
||||||
for(int i = 0; i < DIM * LDS; ++i)
|
|
||||||
{
|
|
||||||
Slater_inv[i] = tmp3[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
free(tmp2);
|
|
||||||
free(tmp3);
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
|
||||||
const uint64_t vLDS,
|
const uint64_t vLDS,
|
||||||
const uint64_t vDim,
|
const uint64_t vDim,
|
||||||
const uint64_t N_updates,
|
const uint64_t N_updates,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double* Updates,
|
||||||
const uint64_t *__restrict Updates_index,
|
const uint64_t* Updates_index,
|
||||||
const double breakdown,
|
const double breakdown,
|
||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double* Slater_inv,
|
||||||
double *__restrict determinant) {
|
double* determinant)
|
||||||
|
{
|
||||||
const uint32_t Dim = DIM;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t Lds = LDS;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
double alpha, beta;
|
||||||
double *C = calloc(1, DIM * N_updates * sizeof(double));
|
int* ipiv = calloc(1, sizeof *ipiv * N_updates);
|
||||||
double alpha = 1.0f, beta = 0.0f;
|
double* C = calloc(1, sizeof *C * Dim * N_updates);
|
||||||
#pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates])
|
double* B = calloc(1, sizeof *B * N_updates * N_updates);
|
||||||
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
|
double* D = calloc(1, sizeof *D * N_updates * Lds);
|
||||||
{
|
double* T1 = calloc(1, sizeof *T1 * N_updates * Lds);
|
||||||
int cublasError = cublasDgemm(handle,
|
double* T2 = calloc(1, sizeof *T2 * Dim * Lds);
|
||||||
CUBLAS_OP_T, CUBLAS_OP_N,
|
|
||||||
15, 21, 24,
|
|
||||||
&alpha, Updates, 24, Slater_inv, 24,
|
|
||||||
&beta, C, 15);
|
|
||||||
}
|
|
||||||
#pragma omp target exit data map(from:C[0:DIM*N_updates])
|
|
||||||
|
|
||||||
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
#pragma omp target enter data map(to: Updates[0:Lds*N_updates], \
|
||||||
// Construct D = V S^{-1} : K x LDS
|
Updates_index[0:N_updates], \
|
||||||
double *B = calloc(1, N_updates * N_updates * sizeof(double));
|
Slater_inv[0:Dim*Lds]) \
|
||||||
double *D = calloc(1, N_updates * LDS * sizeof(double));
|
map(alloc: B[0:N_updates*N_updates], \
|
||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
C[0:Dim*N_updates], \
|
||||||
const uint32_t row = Updates_index[i] - 1;
|
D[0:N_updates*Lds], \
|
||||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
T1[0:N_updates*Lds], \
|
||||||
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
|
T2[0:Dim*Lds], \
|
||||||
|
ipiv[0:N_updates])
|
||||||
|
|
||||||
|
#pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE
|
||||||
|
{
|
||||||
|
alpha = 1.0f, beta = 0.0f;
|
||||||
|
(void) cublasDgemm(handle,
|
||||||
|
CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
|
N_updates, Dim, Lds,
|
||||||
|
&alpha, Updates, Lds, Slater_inv, Lds,
|
||||||
|
&beta, C, N_updates);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Construct B = 1 + V C : K x K
|
||||||
|
// Construct D = V S^{-1} : K x LDS
|
||||||
|
#pragma omp target teams distribute parallel for // compute B, D ON DEVICE
|
||||||
|
for (uint32_t i = 0; i < N_updates; i++)
|
||||||
|
{
|
||||||
|
const uint32_t row = Updates_index[i] - 1;
|
||||||
|
for (uint32_t j = 0; j < N_updates ; j++)
|
||||||
|
{
|
||||||
|
B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||||
|
}
|
||||||
|
for (uint32_t j = 0; j < Lds; j++)
|
||||||
|
{
|
||||||
|
D[i * Lds + j] = Slater_inv[row * Lds + j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#pragma omp target update from(B[0:N_updates*N_updates]) // Update B ON HOST
|
||||||
|
|
||||||
// Compute determinant by LU decomposition
|
// Compute determinant by LU decomposition
|
||||||
int ipiv[N_updates];
|
(void) LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
||||||
lapack_int ret;
|
#pragma omp target update to(B[0:N_updates*N_updates], ipiv[0:N_updates]) // Update B ON DEVICE
|
||||||
ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
|
|
||||||
if (ret != 0) return ret;
|
double det = 1.0f; uint32_t j = 0;
|
||||||
double det = 1.0;
|
#pragma omp target teams distribute parallel for // compute det ON DEVICE
|
||||||
int j = 0;
|
for (uint32_t i = 0; i < N_updates; i++)
|
||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
{
|
||||||
j += min(ipiv[i] - i, 1);
|
j += min(ipiv[i] - i, 1);
|
||||||
det *= B[(N_updates + 1) * i]; // update determinant
|
det *= B[(N_updates + 1) * i]; // update determinant
|
||||||
}
|
}
|
||||||
if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
|
if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even
|
||||||
|
|
||||||
// Check if determinant of B is not too close to zero
|
// Check if determinant of B is not too close to zero
|
||||||
if (fabs(det) < breakdown) {
|
if (fabs(det) < breakdown) return det;
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update det(Slater) if passed
|
// Update det(Slater) if passed
|
||||||
if (determinant) *determinant *= det;
|
if (determinant) *determinant *= det;
|
||||||
|
|
||||||
// Compute B^{-1} with explicit formula for K x K inversion
|
// Compute B^{-1}
|
||||||
ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
|
(void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST
|
||||||
if (ret != 0) return ret;
|
#pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE
|
||||||
|
|
||||||
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
// T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||||
double *tmp1 = calloc(1, N_updates * LDS * sizeof(double));
|
#pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE
|
||||||
#pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS])
|
|
||||||
#pragma omp target data use_device_ptr(D, B, tmp1)
|
|
||||||
{
|
{
|
||||||
int cublasError = cublasDgemm(handle,
|
alpha = 1.0, beta = 0.0;
|
||||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
(void) cublasDgemm(handle,
|
||||||
LDS, N_updates, N_updates,
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
&alpha, D, LDS, B, N_updates,
|
Lds, N_updates, N_updates,
|
||||||
&beta, tmp1, LDS);
|
&alpha, D, Lds, B, N_updates,
|
||||||
}
|
&beta, T1, Lds);
|
||||||
#pragma omp target exit data map(from:tmp1[0:N_updates*LDS])
|
|
||||||
print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas");
|
|
||||||
|
|
||||||
// Compute tmp2 = C * tmp1 : Dim x LDS
|
|
||||||
double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
|
|
||||||
#pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS])
|
|
||||||
#pragma omp target data use_device_ptr(tmp1, C, tmp2)
|
|
||||||
{
|
|
||||||
int cublasError = cublasDgemm(handle,
|
|
||||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
|
||||||
LDS, Dim, N_updates,
|
|
||||||
&alpha, tmp1, LDS, C, N_updates,
|
|
||||||
&beta, tmp2, LDS);
|
|
||||||
}
|
|
||||||
#pragma omp target exit data map(from:tmp2[0:DIM*LDS])
|
|
||||||
print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas");
|
|
||||||
|
|
||||||
// Compute tmp3 = S^{-1} - tmp2
|
|
||||||
double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
|
|
||||||
beta = -1.0f;
|
|
||||||
#pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS])
|
|
||||||
#pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3)
|
|
||||||
{
|
|
||||||
int cublasError = cublasDgeam(handle,
|
|
||||||
CUBLAS_OP_N, CUBLAS_OP_N,
|
|
||||||
DIM, LDS,
|
|
||||||
&alpha, Slater_inv, LDS,
|
|
||||||
&beta, tmp2, LDS,
|
|
||||||
tmp3, LDS);
|
|
||||||
}
|
|
||||||
#pragma omp target exit data map(from:tmp3[0:DIM*LDS])
|
|
||||||
print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas");
|
|
||||||
for(int i = 0; i < DIM * LDS; ++i)
|
|
||||||
{
|
|
||||||
Slater_inv[i] = tmp3[i];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// // Compute S^{-1} <- S^{-1} - tmp2
|
// Compute T2 <- C * T1 : Dim x LDS : standard dgemm
|
||||||
// beta = -1.0f;
|
#pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ONM DEVICE
|
||||||
// #pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS])
|
{
|
||||||
// #pragma omp target data use_device_ptr(Slater_inv, tmp2)
|
alpha = 1.0f, beta = 0.0f;
|
||||||
// {
|
(void) cublasDgemm(handle,
|
||||||
// int cublasError = cublasDgeam(handle,
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
// CUBLAS_OP_N, CUBLAS_OP_N,
|
Dim, Lds, N_updates,
|
||||||
// DIM, LDS,
|
&alpha, T1, Lds, C, N_updates,
|
||||||
// &alpha, Slater_inv, LDS,
|
&beta, T2, Lds);
|
||||||
// &beta, tmp2, LDS,
|
}
|
||||||
// Slater_inv, LDS);
|
|
||||||
// }
|
|
||||||
// #pragma omp target exit data map(from:Slater_inv[0:DIM*LDS])
|
|
||||||
|
|
||||||
|
// Compute S^{-1} <- S^{-1} - T2 : Dim x LDS : standard dgemm
|
||||||
|
#pragma omp target teams distribute parallel for // compute S^-1 ON DEVICE
|
||||||
|
for (uint32_t i = 0; i < Dim * Lds; i++)
|
||||||
|
{
|
||||||
|
Slater_inv[i] = Slater_inv[i] - T2[i];
|
||||||
|
}
|
||||||
|
#pragma omp target update from(Slater_inv[0:Dim*Lds]) // update S^-1 ON HOST
|
||||||
|
|
||||||
|
#pragma omp target exit data map(delete: Updates[0:Lds*N_updates], \
|
||||||
|
Updates_index[0:N_updates], \
|
||||||
|
Slater_inv[0:Dim*Lds], \
|
||||||
|
B[0:N_updates*N_updates], \
|
||||||
|
C[0:Dim*N_updates], \
|
||||||
|
D[0:N_updates*Lds], \
|
||||||
|
T1[0:N_updates*Lds], \
|
||||||
|
T2[0:Dim*Lds], \
|
||||||
|
ipiv[0:N_updates])
|
||||||
|
|
||||||
|
// free(ipiv);
|
||||||
free(B);
|
free(B);
|
||||||
free(C);
|
free(C);
|
||||||
free(D);
|
free(D);
|
||||||
free(tmp1);
|
free(T1);
|
||||||
free(tmp2);
|
free(T2);
|
||||||
// free(tmp3);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -529,7 +500,7 @@ uint32_t qmckl_slagel_splitting(
|
|||||||
|
|
||||||
// Denominator
|
// Denominator
|
||||||
const int cui = Updates_index[l] - 1;
|
const int cui = Updates_index[l] - 1;
|
||||||
double den = 1.0 + C[cui];
|
double den = 1.0 + C[cui];
|
||||||
// printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui);
|
// printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui);
|
||||||
if (fabs(den) < breakdown) { // Here is decided to split the update, or not.
|
if (fabs(den) < breakdown) { // Here is decided to split the update, or not.
|
||||||
// printf("Split! breakdown = %f\n", breakdown);
|
// printf("Split! breakdown = %f\n", breakdown);
|
||||||
@ -662,7 +633,7 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
|
|
||||||
// And for the other cases != 4, 6
|
// And for the other cases != 4, 6
|
||||||
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
|
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
|
||||||
// Woodbury 3x3 kernel
|
// Woodbury 3x3 kernel
|
||||||
uint32_t n_of_3blocks = N_updates / 3;
|
uint32_t n_of_3blocks = N_updates / 3;
|
||||||
uint32_t remainder = N_updates % 3;
|
uint32_t remainder = N_updates % 3;
|
||||||
uint32_t length_3block = 3 * Lds;
|
uint32_t length_3block = 3 * Lds;
|
||||||
@ -799,7 +770,7 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
l += 1;
|
l += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (later == N_updates) { // If all the updates have failed, exit early with an error
|
if (later == N_updates) { // If all the updates have failed, exit early with an error
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -1,92 +0,0 @@
|
|||||||
#include <assert.h>
|
|
||||||
#include "data_cm.h"
|
|
||||||
#include "meuk.h"
|
|
||||||
|
|
||||||
#define REPETITIONS 10000000
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
|
|
||||||
assert(argc == 3);
|
|
||||||
char *version = argv[1];
|
|
||||||
char *number_of_updates = argv[2];
|
|
||||||
const uint64_t Dim = 21;
|
|
||||||
const uint64_t LDS = 24;
|
|
||||||
// const double breakdown = 1e-3;
|
|
||||||
const double breakdown = 1e-9; // this might be too small and cause NIs
|
|
||||||
uint32_t rc;
|
|
||||||
|
|
||||||
const uint64_t *N_updates;
|
|
||||||
const double *Updates;
|
|
||||||
const uint64_t *Updates_index;
|
|
||||||
double *Slater, *Slater_invT;
|
|
||||||
double determinant;
|
|
||||||
if (number_of_updates[0] == '2') { // 2 Updates
|
|
||||||
N_updates = &N_updates2;
|
|
||||||
Updates = &Updates2[0];
|
|
||||||
Updates_index = &Updates_index2[0];
|
|
||||||
Slater = &Slater2[0];
|
|
||||||
Slater_invT = &Slater_invT2[0]; // Slater_inv in QMC=Chem is actually its transpose
|
|
||||||
determinant = determinant2;
|
|
||||||
} else if (number_of_updates[0] == '3') { // 3 Updates
|
|
||||||
N_updates = &N_updates3;
|
|
||||||
Updates = &Updates3[0];
|
|
||||||
Updates_index = &Updates_index3[0];
|
|
||||||
Slater = &Slater3[0];
|
|
||||||
Slater_invT = &Slater_invT3[0];
|
|
||||||
determinant = determinant3;
|
|
||||||
} else if (number_of_updates[0] == '5') { // 5 Updates
|
|
||||||
N_updates = &N_updates5;
|
|
||||||
Updates = &Updates5[0];
|
|
||||||
Updates_index = &Updates_index5[0];
|
|
||||||
Slater = &Slater5[0];
|
|
||||||
Slater_invT = &Slater_invT5[0];
|
|
||||||
determinant = determinant5;
|
|
||||||
} else { // Exit
|
|
||||||
printf("Incorrect number of updates given\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
rc = check_residual(LDS, Dim, Slater_invT, Slater);
|
|
||||||
assert(rc == 0 && "check_residual()");
|
|
||||||
rc = test_kernel(version, LDS, Dim, *N_updates, Updates, Updates_index,
|
|
||||||
breakdown, Slater, Slater_invT, &determinant);
|
|
||||||
assert(rc == 0 && "test_kernel()");
|
|
||||||
|
|
||||||
// EVERYTHING WORKS UP UNTILL HERE
|
|
||||||
|
|
||||||
uint64_t before = rdtsc();
|
|
||||||
if (version[0] == 'a') { // Anthony
|
|
||||||
for (int i = 0; i < REPETITIONS; i++) {
|
|
||||||
const double* Upds;
|
|
||||||
const uint64_t* Ui;
|
|
||||||
for (int j = 0; j < *N_updates; j++) {
|
|
||||||
Upds = &Updates[j*LDS];
|
|
||||||
Ui = &Updates_index[j];
|
|
||||||
detupd(Dim, LDS, Upds, Ui, Slater_invT, &determinant);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (version[0] == 'n') { // Naive
|
|
||||||
for (int i = 0; i < REPETITIONS; i++) {
|
|
||||||
rc = qmckl_sherman_morrison(LDS, Dim, *N_updates, Updates,
|
|
||||||
Updates_index, breakdown, Slater_invT, &determinant);
|
|
||||||
if (rc != 0) printf("qmckl_sherman_morrison failed\n");
|
|
||||||
}
|
|
||||||
} else if (version[0] == 's') { // Splitting
|
|
||||||
for (int i = 0; i < REPETITIONS; i++) {
|
|
||||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, *N_updates, Updates,
|
|
||||||
Updates_index, breakdown, Slater_invT, &determinant);
|
|
||||||
if (rc != 0) printf("qmckl_sherman_morrison_splitting failed\n");
|
|
||||||
}
|
|
||||||
} else if (version[0] == 'b') { // Blocked
|
|
||||||
for (int i = 0; i < REPETITIONS; i++) {
|
|
||||||
// rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
|
|
||||||
// breakdown, Slater_inv, &determinant);
|
|
||||||
// rc = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
|
|
||||||
// breakdown, Slater_inv, &determinant);
|
|
||||||
rc = qmckl_sherman_morrison_smw32s(LDS, Dim, *N_updates, Updates,
|
|
||||||
Updates_index, breakdown, Slater_invT, &determinant);
|
|
||||||
if (rc != 0) printf("qmckl_sherman_morrison_smw32s failed\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
uint64_t after = rdtsc();
|
|
||||||
printf("cycles = %f\n", ((double)(after - before) / (double) REPETITIONS));
|
|
||||||
}
|
|
@ -1,62 +0,0 @@
|
|||||||
#include <assert.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
static __inline__ uint64_t rdtsc(void) {
|
|
||||||
unsigned hi, lo;
|
|
||||||
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
|
|
||||||
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
|
|
||||||
}
|
|
||||||
|
|
||||||
int qmckl_sherman_morrison(
|
|
||||||
const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates,
|
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
|
||||||
const uint64_t *__restrict Updates_index, const double breakdown,
|
|
||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
|
||||||
double *__restrict determinant);
|
|
||||||
|
|
||||||
int detupd(const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates,
|
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
|
||||||
const uint64_t *__restrict Updates_index, const double breakdown,
|
|
||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
|
||||||
double *__restrict determinant);
|
|
||||||
|
|
||||||
#define REPETITIONS 100000000
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
|
|
||||||
assert(argc == 2);
|
|
||||||
char *version = argv[1];
|
|
||||||
|
|
||||||
const uint64_t Dim = 21;
|
|
||||||
const uint64_t LDS = 24;
|
|
||||||
const uint64_t N_updates = 1;
|
|
||||||
double Updates[LDS] __attribute__((aligned(8)));
|
|
||||||
uint64_t Updates_index[N_updates];
|
|
||||||
Updates_index[0] = 1;
|
|
||||||
const double breakdown = 1e-3;
|
|
||||||
double Slater_inv[LDS * Dim] __attribute__((aligned(8)));
|
|
||||||
double determinant = 1.0;
|
|
||||||
|
|
||||||
for (int i = 0; i < Dim; i++) {
|
|
||||||
Updates[i] = i;
|
|
||||||
for (int j = 0; j < Dim; j++) {
|
|
||||||
Slater_inv[LDS * i + j] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t before = rdtsc();
|
|
||||||
if (version[0] == 'c') {
|
|
||||||
for (int i = 0; i < REPETITIONS; i++) {
|
|
||||||
detupd(LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv,
|
|
||||||
&determinant);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int i = 0; i < REPETITIONS; i++) {
|
|
||||||
qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
|
|
||||||
breakdown, Slater_inv, &determinant);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
uint64_t after = rdtsc();
|
|
||||||
printf("cycles = %f\n", ((double)(after - before) / (double)REPETITIONS));
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user