- Got rid of NVC compiler warnings

- Included lib paths for MKL/HDF5 and cuBLAS
- Cleaned Makefile
- Added GPU node session request script
This commit is contained in:
François Coppens 2022-07-22 11:34:29 +02:00
parent fa03590f6f
commit 87e319189e
11 changed files with 225 additions and 279 deletions

View File

@ -1,19 +1,11 @@
# FC = gfortran
# CC = gcc
# FFLAGS=-O0 -finline -g -lm -Wall -pedantic
# CFLAGS=-std=c99 -O0 -finline -g -lm -Wall -pedantic
FC = ifx
CC = icx
# FFLAGS=-O0 -warn all -g -pedantic
# CFLAGS=-std=c99 -O0 -Wall -g -pedantic
FFLAGS=-O3 -warn all -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
CFLAGS=-std=c99 -O3 -Wall -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
INCLUDE=-I/usr/include/hdf5/serial -I/usr/local/cuda/include
LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential -L/usr/local/cuda-11.7/targets/x86_64-linux/lib -lcublas
#FC = verificarlo-f
#CC = verificarlo-c
#FFLAGS=-O3 -finline -g
#CFLAGS=-O3 -finline -g
CC = nvc
CFLAGS=-std=c99 -O3 -Wall -g
LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
LFLAGS+=-lcublas
## Link with the C compiler ($(CC))
# test: sm.o test.o detupdate21.o meuk.o
@ -21,16 +13,6 @@ LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential
test: sm.o test.o meuk.o
$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
## Link with ifort
# test: sm.o test.o detupdate21.o meuk.o
# $(FC) $(LFLAGS) -nofor-main -o test sm.o detupdate21.o test.o meuk.o
## Link with gfortran
# test: sm.o test.o detupdate21.o meuk.o
# $(FC) $(LFLAGS) -Wno-main -o test sm.o detupdate21.o test.o meuk.o
%.o: %.f90
$(FC) $(FFLAGS) -c -o $@ $<

View File

@ -1,9 +0,0 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -0,0 +1,4 @@
salloc --nodes=1 --account=prcoe10 -p booster --gres=gpu:1
wait
srun --pty /bin/bash

View File

@ -1,9 +0,0 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -1,6 +1,8 @@
#include <mkl_lapacke.h>
#include <mkl.h>
#define DIM 21
#define LDS 24
#define HAVE_CUBLAS_OFFLOAD
#ifdef HAVE_CUBLAS_OFFLOAD
@ -9,7 +11,7 @@
#include <cublas_v2.h>
#endif
lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
int min(int a, int b);
@ -72,7 +74,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
Slater_inv,
double *__restrict determinant);
void detupd(const uint64_t Dim, const uint64_t LDS,
void detupd(const uint64_t Dim, const uint64_t Lds,
const double *__restrict __attribute__((aligned(8))) Updates,
const uint64_t *__restrict Updates_index,
double *__restrict __attribute__((aligned(8))) Slater_inv,
@ -84,4 +86,3 @@ uint32_t qmckl_sherman_morrison_later(
const uint64_t *__restrict Updates_index, const double breakdown,
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant);

View File

@ -2,27 +2,27 @@
#include <stdint.h>
#include <assert.h>
void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim) {
for (uint64_t i = 0; i < LDS * Dim; i++) {
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) {
for (uint64_t i = 0; i < Lds * Dim; i++) {
printf("%f\n", A[i]);
}
printf("\n");
}
double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim) {
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
double sum2 = 0;
for (uint64_t i = 0; i < LDS * Dim; i++) sum2 += A[i] * A[i];
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
return sum2;
}
double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
double sum2 = frobenius_norm2(A, LDS, Dim);
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
double sum2 = frobenius_norm2(A, Lds, Dim);
return sqrt(sum2);
}
double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
double largest = 0;
for (uint64_t i = 0; i < LDS * Dim; i++) {
for (uint64_t i = 0; i < Lds * Dim; i++) {
double elm = A[i];
double felm = fabs(elm);
if (elm != elm) return -1.0; // Return a negative norm when NaN found
@ -31,9 +31,9 @@ double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
return largest;
}
double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim) {
double norm_A = frobenius_norm(A, LDS, Dim);
double norm_Ainv = frobenius_norm(Ainv, LDS, Dim);
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim) {
double norm_A = frobenius_norm(A, Lds, Dim);
double norm_Ainv = frobenius_norm(Ainv, Lds, Dim);
return fabs(norm_A) * fabs(norm_Ainv);
}
@ -57,19 +57,19 @@ void read_double(hid_t file_id, const char *key, double *data) {
assert(rc >= 0 && "H5Dclose");
}
void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
const uint64_t N_updates, const double *Updates,
const uint64_t *Updates_index, double *Slater) {
for (uint32_t i = 0; i < N_updates; i++) {
uint32_t col = Updates_index[i] - 1;
for (uint32_t j = 0; j < Dim; j++) {
Slater[col * Dim + j] += Updates[i * LDS + j];
Slater[col * Dim + j] += Updates[i * Lds + j];
}
}
}
uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
double *Slater, const double tolerance) {
double res[Dim*Dim];
@ -78,7 +78,7 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
for (uint32_t j = 0; j < Dim; j++) {
res[i * Dim + j] = 0;
for (uint32_t k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * LDS + j];
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
}
}
}
@ -95,12 +95,12 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
return 0;
}
void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim) {
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Dim; j++) {
prod[i * Dim + j] = 0;
for (uint32_t k = 0; k < Dim; k++) {
prod[i * Dim + j] += a[i * Dim + k] * b[k * LDS + j];
prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
}
}
}
@ -121,7 +121,7 @@ void residual(double *a, double *res, const uint64_t Dim) {
}
}
uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
const uint64_t N_updates, const double *Updates,
const uint64_t *Updates_index, const double breakdown, const double tolerance,
double *Slater, double *Slater_inv, double *determinant) {
@ -129,40 +129,40 @@ uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
// if (version[0] == 'a') { // Anthony
// const double *Upds;
// const uint64_t *Ui;
// for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] *= *determinant;
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
// for (int j = 0; j < N_updates; j++) {
// Upds = &Updates[j * LDS];
// Upds = &Updates[j * Lds];
// Ui = &Updates_index[j];
// detupd(Dim, LDS, Upds, Ui, Slater_inv, determinant);
// detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
// if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
// }
// for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] /= *determinant;
// update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
// rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
// update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
// rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
// if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
// } else if (version[0] == 'n') { // Naive
if (version[0] == 'n') { // Naive
rc = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
breakdown, Slater_inv, determinant);
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison failed\n");
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
} else if (version[0] == 's') { // Splitting
rc = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
rc = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_inv,
determinant);
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_splitting failed\n");
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
} else if (version[0] == 'b') { // Blocked
rc = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
rc = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_inv,
determinant);
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_smw32s failed\n");
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
}
return rc;

View File

@ -12,13 +12,13 @@ typedef struct Error {
uint64_t error;
} Error;
void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim);
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
void residual(double *a, double *res, const uint64_t Dim);
double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim);
void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim);
double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim);
double max_norm(double *A, const uint64_t LDS, const uint64_t Dim);
double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim);
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim);
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
void read_uint(hid_t file_id, const char *key, uint64_t *data);
void read_double(hid_t file_id, const char *key, double *data);
@ -28,16 +28,16 @@ static __inline__ uint64_t rdtsc(void) {
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
const uint64_t N_updates, const double *Updates,
const uint64_t *Updates_index, double *Slater);
uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
double *Slater, const double tolerance);
int32_t check_error_better(const double max, const double tolerance);
uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
const uint64_t N_updates, const double *Updates,
const uint64_t *Updates_index, const double breakdown, const double tolerance,
double *Slater, double *Slater_inv, double *determinant);

View File

@ -1,9 +0,0 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -17,10 +17,10 @@ uint32_t qmckl_sherman_morrison(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
double __attribute__((aligned(8))) C[Dim];
double __attribute__((aligned(8))) C[DIM];
double __attribute__((aligned(8))) D[LDS];
uint32_t l = 0;
@ -31,8 +31,8 @@ uint32_t qmckl_sherman_morrison(
C[i] = 0.0;
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
for (uint32_t j = 0; j < Lds; j++) {
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
}
}
@ -51,17 +51,17 @@ uint32_t qmckl_sherman_morrison(
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
D[j] = Slater_inv[cui * LDS + j]; // selecting proper column of v_l^T * S_inv
for (uint32_t j = 0; j < Lds; j++) {
D[j] = Slater_inv[cui * Lds + j]; // selecting proper column of v_l^T * S_inv
}
// A^{-1} = A^{-1} - C x D / den
for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
for (uint32_t j = 0; j < Lds; j++) {
const double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update;
Slater_inv[i * Lds + j] -= update;
}
}
l += 1;
@ -69,16 +69,6 @@ uint32_t qmckl_sherman_morrison(
return 0;
}
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
const double *__restrict __attribute__((aligned(8)))
Updates,
const uint64_t *__restrict Updates_index,
const double breakdown,
double *__restrict __attribute__((aligned(8)))
Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
/*
COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS,
where S^{-1}P : Dim x LDS,
@ -88,20 +78,31 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
P^TU : LDS x 2,
V : 2 x Dim
*/
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
const double *__restrict __attribute__((aligned(8)))
Updates,
const uint64_t *__restrict Updates_index,
const double breakdown,
double *__restrict __attribute__((aligned(8)))
Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t row1 = (Updates_index[0] - 1);
const uint32_t row2 = (Updates_index[1] - 1);
// Compute C = (S^T)^{-1}U : Dim x 2
double __attribute__((aligned(8))) C[2 * Dim];
double __attribute__((aligned(8))) C[2 * DIM];
for (uint32_t i = 0; i < Dim; i++) {
C[i * 2] = 0;
C[i * 2 + 1] = 0;
#pragma ivdep
#pragma vector aligned
for (uint32_t k = 0; k < LDS; k++) {
C[i * 2] += Slater_inv[i * LDS + k] * Updates[k];
C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
for (uint32_t k = 0; k < Lds; k++) {
C[i * 2] += Slater_inv[i * Lds + k] * Updates[k];
C[i * 2 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
}
}
@ -130,38 +131,28 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
// tmp = B^{-1}D : 2 x LDS
double __attribute__((aligned(8))) tmp[2 * LDS];
double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
for (uint32_t j = 0; j < Lds; j++) {
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
tmp[Lds + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
}
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j];
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j];
for (uint32_t j = 0; j < Lds; j++) {
Slater_inv[i * Lds + j] -= C[i * 2] * tmp[j];
Slater_inv[i * Lds + j] -= C[i * 2 + 1] * tmp[Lds + j];
}
}
return 0;
}
uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
const double *__restrict __attribute__((aligned(8)))
Updates,
const uint64_t *__restrict Updates_index,
const double breakdown,
double *__restrict __attribute__((aligned(8)))
Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
/*
COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS,
where S^T : Dim x LDS,
@ -171,23 +162,34 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
U : LDS x 3,
V : 3 x Dim
*/
uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
const double *__restrict __attribute__((aligned(8)))
Updates,
const uint64_t *__restrict Updates_index,
const double breakdown,
double *__restrict __attribute__((aligned(8)))
Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t row1 = (Updates_index[0] - 1);
const uint32_t row2 = (Updates_index[1] - 1);
const uint32_t row3 = (Updates_index[2] - 1);
// Compute C = (S^T)^{-1}U : Dim x 3
double __attribute__((aligned(8))) C[3 * Dim];
double __attribute__((aligned(8))) C[3 * DIM];
for (uint32_t i = 0; i < Dim; i++) {
C[i * 3] = 0;
C[i * 3 + 1] = 0;
C[i * 3 + 2] = 0;
#pragma ivdep
#pragma vector aligned
for (uint32_t k = 0; k < LDS; k++) {
C[i * 3] += Slater_inv[i * LDS + k] * Updates[k];
C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k];
for (uint32_t k = 0; k < Lds; k++) {
C[i * 3] += Slater_inv[i * Lds + k] * Updates[k];
C[i * 3 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
C[i * 3 + 2] += Slater_inv[i * Lds + k] * Updates[2 * Lds + k];
}
}
@ -233,20 +235,20 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
for (uint32_t j = 0; j < Lds; j++) {
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
tmp[LDS + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
tmp[2 * LDS + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
tmp[Lds + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
tmp[2 * Lds + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
}
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j];
Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j];
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j];
for (uint32_t j = 0; j < Lds; j++) {
Slater_inv[i * Lds + j] -= C[i * 3] * tmp[j];
Slater_inv[i * Lds + j] -= C[i * 3 + 1] * tmp[Lds + j];
Slater_inv[i * Lds + j] -= C[i * 3 + 2] * tmp[2 * Lds + j];
}
}
@ -273,15 +275,15 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
// Compute C = S^{-1} U : Dim x K : standard dgemm
double C[Dim * N_updates];
double C[DIM * N_updates];
double alpha = 1.0, beta = 0.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
Dim, N_updates, LDS,
alpha, Slater_inv, LDS, Updates, LDS,
Dim, N_updates, Lds,
alpha, Slater_inv, Lds, Updates, Lds,
beta, C, N_updates);
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
@ -290,7 +292,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
for (uint32_t i = 0; i < N_updates; i++) {
const uint32_t row = Updates_index[i] - 1;
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
}
// Compute determinant by LU decomposition
@ -345,41 +347,34 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
// Compute C = S^{-1} U : Dim x K : standard dgemm
// double C[Dim * N_updates];
double *C = malloc(DIM * N_updates * sizeof(double));
double alpha = 1.0, beta = 0.0;
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
// Dim, N_updates, Lds,
// alpha, Slater_inv, Lds, Updates, Lds,
// beta, C, N_updates);
//cuBLAS initialization
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
fprintf(stdout, "cuBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
// Compute C = S^{-1} U : Dim x K : standard dgemm
double C[Dim * N_updates];
double alpha = 1.0, beta = 0.0;
// #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:M*N*walk_num], tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
// #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
// {
// for (int nw=0; nw < walk_num; ++nw) {
// int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
// &(een_rescaled_e[nw*(cord_num+1)]),
// LDA, af,
// &(een_rescaled_n[bf*nw]),
// LDB, 0,
// &beta,
// &(tmp_c[nw*cord_num]),
// LDC, cf, cord_num);
// }
// }
// #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
#pragma omp target enter data map(to:Slater_inv, Updates, C)
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
{
int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
Dim, N_updates, Lds,
&alpha, Slater_inv, Lds, Updates, Lds,
&beta, C, N_updates);
}
#pragma omp target exit data map(from:C)
cublasDestroy(handle);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
Dim, N_updates, LDS,
alpha, Slater_inv, LDS, Updates, LDS,
beta, C, N_updates);
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
// Construct D = V S^{-1} : K x LDS
@ -387,7 +382,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
for (uint32_t i = 0; i < N_updates; i++) {
const uint32_t row = Updates_index[i] - 1;
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
}
// Compute determinant by LU decomposition
@ -443,8 +438,8 @@ uint32_t qmckl_slagel_splitting(
uint64_t *__restrict later_index, uint64_t *__restrict later,
double *__restrict determinant) {
const uint32_t LDS = 24;
const uint32_t Dim = 21;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
double __attribute__((aligned(8))) C[LDS];
double __attribute__((aligned(8))) D[LDS];
@ -457,8 +452,8 @@ uint32_t qmckl_slagel_splitting(
C[i] = 0.0;
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
for (uint32_t j = 0; j < Lds; j++) {
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
}
}
@ -474,8 +469,8 @@ uint32_t qmckl_slagel_splitting(
// in later_updates
#pragma ivdep
#pragma vector aligned
for (uint32_t i = 0; i < LDS; i++) {
later_updates[*later * LDS + i] = Updates[l * LDS + i] / 2.0;
for (uint32_t i = 0; i < Lds; i++) {
later_updates[*later * Lds + i] = Updates[l * Lds + i] / 2.0;
C[i] /= 2.0;
}
later_index[*later] = Updates_index[l];
@ -490,17 +485,17 @@ uint32_t qmckl_slagel_splitting(
// D = v^T x S^{-1} : 1 x LDS
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
D[j] = Slater_inv[cui * LDS + j];
for (uint32_t j = 0; j < Lds; j++) {
D[j] = Slater_inv[cui * Lds + j];
}
// S^{-1} = S^{-1} - C x D / den
for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
for (uint32_t j = 0; j < Lds; j++) {
const double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update;
Slater_inv[i * Lds + j] -= update;
}
}
l += 1;
@ -516,18 +511,18 @@ uint32_t qmckl_sherman_morrison_splitting(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
uint64_t later_index[N_updates];
uint64_t later = 0;
// uint32_t rc;
(void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
(void) qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
breakdown, Slater_inv, later_updates, later_index,
&later, determinant);
// rc = qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
// rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
// breakdown, Slater_inv, later_updates, later_index,
// &later, determinant);
// if (rc != 0) printf("Something went catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
@ -535,11 +530,11 @@ uint32_t qmckl_sherman_morrison_splitting(
if (later > 0) {
recursive_calls++;
// printf("Later > 0\n");
(void) qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
(void) qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
later_index, breakdown, Slater_inv,
determinant);
// rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
// rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
// later_index, breakdown, Slater_inv,
// determinant);
// if (rc != 0) printf("Something went catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
@ -555,8 +550,8 @@ uint32_t qmckl_sherman_morrison_smw32s(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
uint64_t later_index[N_updates];
@ -564,31 +559,31 @@ uint32_t qmckl_sherman_morrison_smw32s(
uint32_t rc;
if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2
rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
rc = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
breakdown, Slater_inv, determinant);
if (rc != 0) { // Send the entire block to slagel_splitting
block_fail += 1;
uint64_t l = 0;
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates,
rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates,
Updates_index, breakdown, Slater_inv,
later_updates + (LDS * later),
later_updates + (Lds * later),
later_index + later, &l, determinant);
later += l;
}
rc = qmckl_woodbury_2(LDS, Dim, &Updates[2*LDS], &Updates_index[2],
rc = qmckl_woodbury_2(Lds, Dim, &Updates[2*Lds], &Updates_index[2],
breakdown, Slater_inv, determinant);
if (rc != 0) { // Send the entire block to slagel_splitting
block_fail += 1;
uint64_t l = 0;
rc = qmckl_slagel_splitting(LDS, Dim, 2, &Updates[2*LDS],
rc = qmckl_slagel_splitting(Lds, Dim, 2, &Updates[2*Lds],
&Updates_index[2], breakdown, Slater_inv,
later_updates + (LDS * later),
later_updates + (Lds * later),
later_index + later, &l, determinant);
later += l;
}
if (later > 0) {
recursive_calls++;
rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
later_index, breakdown, Slater_inv,
determinant);
}
@ -600,21 +595,21 @@ uint32_t qmckl_sherman_morrison_smw32s(
// Woodbury 3x3 kernel
uint32_t n_of_3blocks = N_updates / 3;
uint32_t remainder = N_updates % 3;
uint32_t length_3block = 3 * LDS;
uint32_t length_3block = 3 * Lds;
if (n_of_3blocks > 0) {
for (uint32_t i = 0; i < n_of_3blocks; i++) {
const double *Updates_3block = &Updates[i * length_3block];
const uint64_t *Updates_index_3block = &Updates_index[i * 3];
rc = qmckl_woodbury_3(LDS, Dim, Updates_3block, Updates_index_3block,
rc = qmckl_woodbury_3(Lds, Dim, Updates_3block, Updates_index_3block,
breakdown, Slater_inv, determinant);
if (rc != 0) { // Send the entire block to slagel_splitting
// printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
block_fail += 1;
uint64_t l = 0;
rc = qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block,
rc = qmckl_slagel_splitting(Lds, Dim, 3, Updates_3block,
Updates_index_3block, breakdown, Slater_inv,
later_updates + (LDS * later),
later_updates + (Lds * later),
later_index + later, &l, determinant);
// if (rc != 0) printf("Something went catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
later += l;
@ -626,15 +621,15 @@ uint32_t qmckl_sherman_morrison_smw32s(
if (remainder == 2) {
const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
rc = qmckl_woodbury_2(LDS, Dim, Updates_2block, Updates_index_2block,
rc = qmckl_woodbury_2(Lds, Dim, Updates_2block, Updates_index_2block,
breakdown, Slater_inv, determinant);
if (rc != 0) { // Send the entire block to slagel_splitting
// printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
block_fail += 1;
uint64_t l = 0;
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block,
rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates_2block,
Updates_index_2block, breakdown, Slater_inv,
later_updates + (LDS * later),
later_updates + (Lds * later),
later_index + later, &l, determinant);
// if (rc != 0) printf("Something went catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
later += l;
@ -647,9 +642,9 @@ uint32_t qmckl_sherman_morrison_smw32s(
const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
uint64_t l = 0;
rc = qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block,
rc = qmckl_slagel_splitting(Lds, Dim, 1, Updates_1block,
Updates_index_1block, breakdown, Slater_inv,
later_updates + (LDS * later),
later_updates + (Lds * later),
later_index + later, &l, determinant);
// if (rc != 0) printf("Something went catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
later += l;
@ -658,7 +653,7 @@ uint32_t qmckl_sherman_morrison_smw32s(
if (later > 0) {
recursive_calls++;
// printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n");
rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
later_index, breakdown, Slater_inv,
determinant);
// if (rc != 0) printf("Something went catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
@ -674,10 +669,10 @@ uint32_t qmckl_sherman_morrison_later(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = 21;
const uint32_t LDS = 24;
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
double __attribute__((aligned(8))) C[Dim];
double __attribute__((aligned(8))) C[DIM];
double __attribute__((aligned(8))) D[LDS];
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
@ -693,8 +688,8 @@ uint32_t qmckl_sherman_morrison_later(
C[i] = 0.0;
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
for (uint32_t j = 0; j < Lds; j++) {
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
}
}
@ -705,8 +700,8 @@ uint32_t qmckl_sherman_morrison_later(
#pragma ivdep
#pragma vector aligned
// for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t i = 0; i < LDS; i++) {
later_updates[later * LDS + i] = Updates[l * LDS + i];
for (uint32_t i = 0; i < Lds; i++) {
later_updates[later * Lds + i] = Updates[l * Lds + i];
}
later_index[later] = Updates_index[l];
later++;
@ -720,17 +715,17 @@ uint32_t qmckl_sherman_morrison_later(
// D = v^T x A^{-1}
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
D[j] = Slater_inv[cui * LDS + j];
for (uint32_t j = 0; j < Lds; j++) {
D[j] = Slater_inv[cui * Lds + j];
}
// S^{-1} = S^{-1} - C x D / den
for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < LDS; j++) {
for (uint32_t j = 0; j < Lds; j++) {
const double update = C[i] * D[j] * iden;
Slater_inv[i * LDS + j] -= update;
Slater_inv[i * Lds + j] -= update;
}
}
l += 1;
@ -741,7 +736,7 @@ uint32_t qmckl_sherman_morrison_later(
}
else if (later > 0) { // If some have failed, make a recursive call
recursive_calls++;
(void) qmckl_sherman_morrison_later(LDS, Dim, later, later_updates,
(void) qmckl_sherman_morrison_later(Lds, Dim, later, later_updates,
later_index, breakdown, Slater_inv, determinant);
}

View File

@ -1,6 +1,6 @@
#include <stdint.h>
#include "meuk.h"
#include "cycles.h"
#include <stdint.h>
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
@ -23,11 +23,11 @@ int main(int argc, char **argv) {
char slater_key[32];
char slater_inv_key[32];
char det_key[32];
const uint64_t Dim = 21;
const uint64_t LDS = 24;
const uint64_t Dim = DIM;
const uint64_t Lds = LDS;
uint64_t N_updates;
double Slater[LDS * Dim ], SlaterT[LDS * Dim];
double Slater_invT[LDS * Dim], Slater_invT_copy[LDS * Dim];
double Slater[LDS * DIM ], SlaterT[LDS * DIM];
double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
double determinant, determinant_copy;
// SETUP TEST PARAMETERS
@ -52,7 +52,7 @@ printf("#-----------------------------------------------------------------------
sprintf(det_key, "/cycle_%d/determinant", cycle);
read_uint(file_id, nupds_key, &N_updates);
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
double *Updates = malloc(LDS * N_updates * sizeof(double));
double *Updates = malloc(Lds * N_updates * sizeof(double));
read_uint(file_id, upd_idx_key, Updates_index);
read_double(file_id, upds_key, Updates);
read_double(file_id, slater_key, Slater);
@ -60,28 +60,28 @@ printf("#-----------------------------------------------------------------------
read_double(file_id, det_key, &determinant);
// Compute transpose of S. ST: 24 x 21
for (int i = 0; i < LDS; i++) {
for (int i = 0; i < Lds; i++) {
for (int j = 0; j < Dim; j++) {
SlaterT[i * Dim + j] = Slater[j * LDS + i];
SlaterT[i * Dim + j] = Slater[j * Lds + i];
}
}
// Convert repl. upds into additive upds.
for (int i = 0; i < N_updates; i++) {
int col = Updates_index[i] - 1;
for (int j = 0; j < LDS; j++) {
Updates[i * LDS + j] -= SlaterT[col + j * Dim];
for (int j = 0; j < Lds; j++) {
Updates[i * Lds + j] -= SlaterT[col + j * Dim];
}
}
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
uint32_t err_inp = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// Update Slater matrix
for (int i = 0; i < N_updates; i++) {
int col = Updates_index[i] - 1;
for (int j = 0; j < Dim; j++) {
SlaterT[col + j * Dim] += Updates[i * LDS + j];
SlaterT[col + j * Dim] += Updates[i * Lds + j];
}
} // At this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
@ -98,7 +98,7 @@ printf("#-----------------------------------------------------------------------
for (int rep = 0; rep < REPETITIONS; rep++) {
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
memcpy(Slater_invT_copy, Slater_invT, LDS * Dim * sizeof(double));
memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double));
determinant_copy = determinant;
// ### CHOOSE A KERNEL:
@ -109,10 +109,10 @@ printf("#-----------------------------------------------------------------------
// err_break = 0;
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
// for (int i = 0; i < N_updates; i++) {
// Upds = &Updates[i * LDS];
// Upds = &Updates[i * Lds];
// Ui = &Updates_index[i];
// determinant_previous = determinant_copy;
@ -120,7 +120,7 @@ printf("#-----------------------------------------------------------------------
// uint64_t before = rdtsc();
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
// detupd(Dim, LDS, Upds, Ui, Slater_invT_copy, &determinant_copy);
// detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
// // 3. FETCH FINISH TIME
// uint64_t after = rdtsc();
@ -137,9 +137,9 @@ printf("#-----------------------------------------------------------------------
// }
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
// }
// } else if (version[0] == 'n') { // Naive
if (version[0] == 'n') { // Naive
@ -147,7 +147,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates,
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -161,7 +161,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_later(LDS, Dim, N_updates, Updates,
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -175,7 +175,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -190,7 +190,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -205,7 +205,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates,
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -220,7 +220,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k_cublas_offload(LDS, Dim, N_updates, Updates,
err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -235,7 +235,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -249,7 +249,7 @@ printf("#-----------------------------------------------------------------------
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
@ -260,7 +260,7 @@ printf("#-----------------------------------------------------------------------
} else if (version[0] == 'm') { // LAPACK/MKL
// Only send upper Dim x Dim part of matrix to lapack
double tmp[Dim*Dim];
double tmp[DIM *DIM];
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
// 1. FETCH START TIME
@ -274,9 +274,9 @@ printf("#-----------------------------------------------------------------------
// Copy elements of inverse back, adding 0-padding in "correct" place
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < LDS; j++) {
if (j < Dim) Slater_invT_copy[i * LDS + j] = tmp[i * Dim + j];
else Slater_invT_copy[i * LDS + j] = 0.0;
for (uint32_t j = 0; j < Lds; j++) {
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
else Slater_invT_copy[i * Lds + j] = 0.0;
}
}
@ -290,7 +290,7 @@ printf("#-----------------------------------------------------------------------
} // END OF REPETITIONS LOOP
// 4. COPY RESULT BACK TO ORIGINAL
memcpy(Slater_invT, Slater_invT_copy, LDS * Dim * sizeof(double));
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
determinant = determinant_copy;
// At this point Slater_invT contains the correct inverse matrix
@ -306,20 +306,20 @@ printf("#-----------------------------------------------------------------------
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
cumulative += accumulator;
double SSi[Dim * Dim];
matmul(SlaterT, Slater_invT, SSi, LDS, Dim);
double Res[Dim * Dim];
double SSi[DIM * DIM];
matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
double Res[DIM * DIM];
residual(SSi, Res, Dim);
const double max = max_norm(Res, Dim, Dim);
// 7. CHECK ERROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
uint32_t err_out = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// int32_t err_out = check_error_better(max, tolerance);
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
// 8. COMPUTE CONDITION NUMBER
const double condnr = condition_number(Slater, Slater_invT, LDS, Dim);
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
const double frob = frobenius_norm(Res, Dim, Dim);

View File

@ -1,9 +0,0 @@
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------