mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2025-01-12 05:58:28 +01:00
- Got rid of NVC compiler warnings
- Included lib paths for MKL/HDF5 and cuBLAS - Cleaned Makefile - Added GPU node session request script
This commit is contained in:
parent
fa03590f6f
commit
87e319189e
@ -1,19 +1,11 @@
|
||||
# FC = gfortran
|
||||
# CC = gcc
|
||||
# FFLAGS=-O0 -finline -g -lm -Wall -pedantic
|
||||
# CFLAGS=-std=c99 -O0 -finline -g -lm -Wall -pedantic
|
||||
FC = ifx
|
||||
CC = icx
|
||||
# FFLAGS=-O0 -warn all -g -pedantic
|
||||
# CFLAGS=-std=c99 -O0 -Wall -g -pedantic
|
||||
FFLAGS=-O3 -warn all -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
|
||||
CFLAGS=-std=c99 -O3 -Wall -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
|
||||
INCLUDE=-I/usr/include/hdf5/serial -I/usr/local/cuda/include
|
||||
LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential -L/usr/local/cuda-11.7/targets/x86_64-linux/lib -lcublas
|
||||
#FC = verificarlo-f
|
||||
#CC = verificarlo-c
|
||||
#FFLAGS=-O3 -finline -g
|
||||
#CFLAGS=-O3 -finline -g
|
||||
CC = nvc
|
||||
|
||||
CFLAGS=-std=c99 -O3 -Wall -g
|
||||
|
||||
LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
|
||||
LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
||||
LFLAGS+=-lcublas
|
||||
|
||||
## Link with icc
|
||||
# test: sm.o test.o detupdate21.o meuk.o
|
||||
@ -21,16 +13,6 @@ LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential
|
||||
test: sm.o test.o meuk.o
|
||||
$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
|
||||
|
||||
|
||||
|
||||
## Link with ifort
|
||||
# test: sm.o test.o detupdate21.o meuk.o
|
||||
# $(FC) $(LFLAGS) -nofor-main -o test sm.o detupdate21.o test.o meuk.o
|
||||
|
||||
## Link with gfortran
|
||||
# test: sm.o test.o detupdate21.o meuk.o
|
||||
# $(FC) $(LFLAGS) -Wno-main -o test sm.o detupdate21.o test.o meuk.o
|
||||
|
||||
%.o: %.f90
|
||||
$(FC) $(FFLAGS) -c -o $@ $<
|
||||
|
||||
|
@ -1,9 +0,0 @@
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
4
independent_test_harness/goto_gpu_node.sh
Executable file
4
independent_test_harness/goto_gpu_node.sh
Executable file
@ -0,0 +1,4 @@
|
||||
salloc --nodes=1 --account=prcoe10 -p booster --gres=gpu:1
|
||||
wait
|
||||
srun --pty /bin/bash
|
||||
|
@ -1,9 +0,0 @@
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
@ -1,6 +1,8 @@
|
||||
#include <mkl_lapacke.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#define DIM 21
|
||||
#define LDS 24
|
||||
#define HAVE_CUBLAS_OFFLOAD
|
||||
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
@ -9,7 +11,7 @@
|
||||
#include <cublas_v2.h>
|
||||
#endif
|
||||
|
||||
lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
|
||||
lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
|
||||
|
||||
int min(int a, int b);
|
||||
|
||||
@ -72,7 +74,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||
Slater_inv,
|
||||
double *__restrict determinant);
|
||||
|
||||
void detupd(const uint64_t Dim, const uint64_t LDS,
|
||||
void detupd(const uint64_t Dim, const uint64_t Lds,
|
||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||
const uint64_t *__restrict Updates_index,
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
@ -84,4 +86,3 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
const uint64_t *__restrict Updates_index, const double breakdown,
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant);
|
||||
|
@ -2,27 +2,27 @@
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim) {
|
||||
for (uint64_t i = 0; i < LDS * Dim; i++) {
|
||||
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||
for (uint64_t i = 0; i < Lds * Dim; i++) {
|
||||
printf("%f\n", A[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim) {
|
||||
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||
double sum2 = 0;
|
||||
for (uint64_t i = 0; i < LDS * Dim; i++) sum2 += A[i] * A[i];
|
||||
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
|
||||
return sum2;
|
||||
}
|
||||
|
||||
double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
|
||||
double sum2 = frobenius_norm2(A, LDS, Dim);
|
||||
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||
double sum2 = frobenius_norm2(A, Lds, Dim);
|
||||
return sqrt(sum2);
|
||||
}
|
||||
|
||||
double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
|
||||
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||
double largest = 0;
|
||||
for (uint64_t i = 0; i < LDS * Dim; i++) {
|
||||
for (uint64_t i = 0; i < Lds * Dim; i++) {
|
||||
double elm = A[i];
|
||||
double felm = fabs(elm);
|
||||
if (elm != elm) return -1.0; // Return a negative norm when NaN found
|
||||
@ -31,9 +31,9 @@ double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
|
||||
return largest;
|
||||
}
|
||||
|
||||
double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim) {
|
||||
double norm_A = frobenius_norm(A, LDS, Dim);
|
||||
double norm_Ainv = frobenius_norm(Ainv, LDS, Dim);
|
||||
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim) {
|
||||
double norm_A = frobenius_norm(A, Lds, Dim);
|
||||
double norm_Ainv = frobenius_norm(Ainv, Lds, Dim);
|
||||
return fabs(norm_A) * fabs(norm_Ainv);
|
||||
}
|
||||
|
||||
@ -57,19 +57,19 @@ void read_double(hid_t file_id, const char *key, double *data) {
|
||||
assert(rc >= 0 && "H5Dclose");
|
||||
}
|
||||
|
||||
void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
|
||||
void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
|
||||
const uint64_t N_updates, const double *Updates,
|
||||
const uint64_t *Updates_index, double *Slater) {
|
||||
|
||||
for (uint32_t i = 0; i < N_updates; i++) {
|
||||
uint32_t col = Updates_index[i] - 1;
|
||||
for (uint32_t j = 0; j < Dim; j++) {
|
||||
Slater[col * Dim + j] += Updates[i * LDS + j];
|
||||
Slater[col * Dim + j] += Updates[i * Lds + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
|
||||
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
|
||||
double *Slater, const double tolerance) {
|
||||
|
||||
double res[Dim*Dim];
|
||||
@ -78,7 +78,7 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
|
||||
for (uint32_t j = 0; j < Dim; j++) {
|
||||
res[i * Dim + j] = 0;
|
||||
for (uint32_t k = 0; k < Dim; k++) {
|
||||
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * LDS + j];
|
||||
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -95,12 +95,12 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
|
||||
return 0;
|
||||
}
|
||||
|
||||
void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim) {
|
||||
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < Dim; j++) {
|
||||
prod[i * Dim + j] = 0;
|
||||
for (uint32_t k = 0; k < Dim; k++) {
|
||||
prod[i * Dim + j] += a[i * Dim + k] * b[k * LDS + j];
|
||||
prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -121,7 +121,7 @@ void residual(double *a, double *res, const uint64_t Dim) {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
|
||||
uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
|
||||
const uint64_t N_updates, const double *Updates,
|
||||
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
||||
double *Slater, double *Slater_inv, double *determinant) {
|
||||
@ -129,40 +129,40 @@ uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
|
||||
// if (version[0] == 'a') { // Anthony
|
||||
// const double *Upds;
|
||||
// const uint64_t *Ui;
|
||||
// for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] *= *determinant;
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
|
||||
// for (int j = 0; j < N_updates; j++) {
|
||||
// Upds = &Updates[j * LDS];
|
||||
// Upds = &Updates[j * Lds];
|
||||
// Ui = &Updates_index[j];
|
||||
// detupd(Dim, LDS, Upds, Ui, Slater_inv, determinant);
|
||||
// detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
|
||||
// if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
|
||||
// }
|
||||
// for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] /= *determinant;
|
||||
// update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
// rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
|
||||
// update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
// rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
|
||||
// if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||
// } else if (version[0] == 'n') { // Naive
|
||||
if (version[0] == 'n') { // Naive
|
||||
rc = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
|
||||
rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
|
||||
breakdown, Slater_inv, determinant);
|
||||
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison failed\n");
|
||||
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
|
||||
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
|
||||
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||
} else if (version[0] == 's') { // Splitting
|
||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
|
||||
rc = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_inv,
|
||||
determinant);
|
||||
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_splitting failed\n");
|
||||
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
|
||||
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
|
||||
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||
} else if (version[0] == 'b') { // Blocked
|
||||
rc = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
|
||||
rc = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_inv,
|
||||
determinant);
|
||||
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_smw32s failed\n");
|
||||
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
|
||||
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
|
||||
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||
}
|
||||
return rc;
|
||||
|
@ -12,13 +12,13 @@ typedef struct Error {
|
||||
uint64_t error;
|
||||
} Error;
|
||||
|
||||
void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim);
|
||||
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
|
||||
void residual(double *a, double *res, const uint64_t Dim);
|
||||
double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim);
|
||||
void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim);
|
||||
double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim);
|
||||
double max_norm(double *A, const uint64_t LDS, const uint64_t Dim);
|
||||
double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim);
|
||||
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
|
||||
void read_uint(hid_t file_id, const char *key, uint64_t *data);
|
||||
void read_double(hid_t file_id, const char *key, double *data);
|
||||
|
||||
@ -28,16 +28,16 @@ static __inline__ uint64_t rdtsc(void) {
|
||||
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
|
||||
}
|
||||
|
||||
void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
|
||||
void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
|
||||
const uint64_t N_updates, const double *Updates,
|
||||
const uint64_t *Updates_index, double *Slater);
|
||||
|
||||
uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
|
||||
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
|
||||
double *Slater, const double tolerance);
|
||||
|
||||
int32_t check_error_better(const double max, const double tolerance);
|
||||
|
||||
uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
|
||||
uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
|
||||
const uint64_t N_updates, const double *Updates,
|
||||
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
||||
double *Slater, double *Slater_inv, double *determinant);
|
||||
|
@ -1,9 +0,0 @@
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
@ -16,11 +16,11 @@ uint32_t qmckl_sherman_morrison(
|
||||
const uint64_t *__restrict Updates_index, const double breakdown,
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
|
||||
double __attribute__((aligned(8))) C[Dim];
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
double __attribute__((aligned(8))) C[DIM];
|
||||
double __attribute__((aligned(8))) D[LDS];
|
||||
|
||||
uint32_t l = 0;
|
||||
@ -31,8 +31,8 @@ uint32_t qmckl_sherman_morrison(
|
||||
C[i] = 0.0;
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||
}
|
||||
}
|
||||
|
||||
@ -51,17 +51,17 @@ uint32_t qmckl_sherman_morrison(
|
||||
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
D[j] = Slater_inv[cui * LDS + j]; // selecting proper column of v_l^T * S_inv
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
D[j] = Slater_inv[cui * Lds + j]; // selecting proper column of v_l^T * S_inv
|
||||
}
|
||||
|
||||
// A^{-1} = A^{-1} - C x D / den
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
const double update = C[i] * D[j] * iden;
|
||||
Slater_inv[i * LDS + j] -= update;
|
||||
Slater_inv[i * Lds + j] -= update;
|
||||
}
|
||||
}
|
||||
l += 1;
|
||||
@ -69,6 +69,15 @@ uint32_t qmckl_sherman_morrison(
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS,
|
||||
where S^{-1}P : Dim x LDS,
|
||||
C := S^{-1}PP^TU : Dim x 2,
|
||||
B := 1 + VC : 2 x 2,
|
||||
D := VS^{-1}P : 2 x LDS,
|
||||
P^TU : LDS x 2,
|
||||
V : 2 x Dim
|
||||
*/
|
||||
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||
const double *__restrict __attribute__((aligned(8)))
|
||||
Updates,
|
||||
@ -77,31 +86,23 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||
double *__restrict __attribute__((aligned(8)))
|
||||
Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
/*
|
||||
COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS,
|
||||
where S^{-1}P : Dim x LDS,
|
||||
C := S^{-1}PP^TU : Dim x 2,
|
||||
B := 1 + VC : 2 x 2,
|
||||
D := VS^{-1}P : 2 x LDS,
|
||||
P^TU : LDS x 2,
|
||||
V : 2 x Dim
|
||||
*/
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
const uint32_t row1 = (Updates_index[0] - 1);
|
||||
const uint32_t row2 = (Updates_index[1] - 1);
|
||||
|
||||
// Compute C = (S^T)^{-1}U : Dim x 2
|
||||
double __attribute__((aligned(8))) C[2 * Dim];
|
||||
double __attribute__((aligned(8))) C[2 * DIM];
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
C[i * 2] = 0;
|
||||
C[i * 2 + 1] = 0;
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t k = 0; k < LDS; k++) {
|
||||
C[i * 2] += Slater_inv[i * LDS + k] * Updates[k];
|
||||
C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
|
||||
for (uint32_t k = 0; k < Lds; k++) {
|
||||
C[i * 2] += Slater_inv[i * Lds + k] * Updates[k];
|
||||
C[i * 2 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
|
||||
}
|
||||
}
|
||||
|
||||
@ -130,28 +131,37 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||
|
||||
// tmp = B^{-1}D : 2 x LDS
|
||||
double __attribute__((aligned(8))) tmp[2 * LDS];
|
||||
double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
|
||||
double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
|
||||
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
|
||||
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
|
||||
tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
|
||||
tmp[Lds + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
|
||||
}
|
||||
|
||||
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
|
||||
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j];
|
||||
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j];
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
Slater_inv[i * Lds + j] -= C[i * 2] * tmp[j];
|
||||
Slater_inv[i * Lds + j] -= C[i * 2 + 1] * tmp[Lds + j];
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS,
|
||||
where S^T : Dim x LDS,
|
||||
C := (S^T)^{-1}U : Dim x 3,
|
||||
B := 1 + VC : 3 x 3,
|
||||
D := V(S^T)^{-1} : 3 x LDS,
|
||||
U : LDS x 3,
|
||||
V : 3 x Dim
|
||||
*/
|
||||
uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
||||
const double *__restrict __attribute__((aligned(8)))
|
||||
Updates,
|
||||
@ -160,34 +170,26 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
||||
double *__restrict __attribute__((aligned(8)))
|
||||
Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
/*
|
||||
COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS,
|
||||
where S^T : Dim x LDS,
|
||||
C := (S^T)^{-1}U : Dim x 3,
|
||||
B := 1 + VC : 3 x 3,
|
||||
D := V(S^T)^{-1} : 3 x LDS,
|
||||
U : LDS x 3,
|
||||
V : 3 x Dim
|
||||
*/
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
const uint32_t row1 = (Updates_index[0] - 1);
|
||||
const uint32_t row2 = (Updates_index[1] - 1);
|
||||
const uint32_t row3 = (Updates_index[2] - 1);
|
||||
|
||||
// Compute C = (S^T)^{-1}U : Dim x 3
|
||||
double __attribute__((aligned(8))) C[3 * Dim];
|
||||
double __attribute__((aligned(8))) C[3 * DIM];
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
C[i * 3] = 0;
|
||||
C[i * 3 + 1] = 0;
|
||||
C[i * 3 + 2] = 0;
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t k = 0; k < LDS; k++) {
|
||||
C[i * 3] += Slater_inv[i * LDS + k] * Updates[k];
|
||||
C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
|
||||
C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k];
|
||||
for (uint32_t k = 0; k < Lds; k++) {
|
||||
C[i * 3] += Slater_inv[i * Lds + k] * Updates[k];
|
||||
C[i * 3 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
|
||||
C[i * 3 + 2] += Slater_inv[i * Lds + k] * Updates[2 * Lds + k];
|
||||
}
|
||||
}
|
||||
|
||||
@ -233,20 +235,20 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
||||
double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
|
||||
tmp[LDS + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
|
||||
tmp[2 * LDS + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
|
||||
tmp[Lds + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
|
||||
tmp[2 * Lds + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
|
||||
}
|
||||
|
||||
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
|
||||
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j];
|
||||
Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j];
|
||||
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j];
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
Slater_inv[i * Lds + j] -= C[i * 3] * tmp[j];
|
||||
Slater_inv[i * Lds + j] -= C[i * 3 + 1] * tmp[Lds + j];
|
||||
Slater_inv[i * Lds + j] -= C[i * 3 + 2] * tmp[2 * Lds + j];
|
||||
}
|
||||
}
|
||||
|
||||
@ -273,15 +275,15 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||
double C[Dim * N_updates];
|
||||
double C[DIM * N_updates];
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
Dim, N_updates, LDS,
|
||||
alpha, Slater_inv, LDS, Updates, LDS,
|
||||
Dim, N_updates, Lds,
|
||||
alpha, Slater_inv, Lds, Updates, Lds,
|
||||
beta, C, N_updates);
|
||||
|
||||
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
||||
@ -290,7 +292,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||
for (uint32_t i = 0; i < N_updates; i++) {
|
||||
const uint32_t row = Updates_index[i] - 1;
|
||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
|
||||
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
|
||||
}
|
||||
|
||||
// Compute determinant by LU decomposition
|
||||
@ -345,41 +347,34 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||
// double C[Dim * N_updates];
|
||||
double *C = malloc(DIM * N_updates * sizeof(double));
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
// Dim, N_updates, Lds,
|
||||
// alpha, Slater_inv, Lds, Updates, Lds,
|
||||
// beta, C, N_updates);
|
||||
//cuBLAS initialization
|
||||
cublasHandle_t handle;
|
||||
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
||||
fprintf(stdout, "cuBLAS initialization failed!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||
double C[Dim * N_updates];
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
|
||||
// #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:M*N*walk_num], tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
|
||||
// #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
|
||||
// {
|
||||
// for (int nw=0; nw < walk_num; ++nw) {
|
||||
// int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
|
||||
// &(een_rescaled_e[nw*(cord_num+1)]),
|
||||
// LDA, af,
|
||||
// &(een_rescaled_n[bf*nw]),
|
||||
// LDB, 0,
|
||||
// &beta,
|
||||
// &(tmp_c[nw*cord_num]),
|
||||
// LDC, cf, cord_num);
|
||||
// }
|
||||
// }
|
||||
// #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
|
||||
#pragma omp target enter data map(to:Slater_inv, Updates, C)
|
||||
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
|
||||
{
|
||||
int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
|
||||
Dim, N_updates, Lds,
|
||||
&alpha, Slater_inv, Lds, Updates, Lds,
|
||||
&beta, C, N_updates);
|
||||
}
|
||||
#pragma omp target exit data map(from:C)
|
||||
cublasDestroy(handle);
|
||||
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
Dim, N_updates, LDS,
|
||||
alpha, Slater_inv, LDS, Updates, LDS,
|
||||
beta, C, N_updates);
|
||||
|
||||
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
||||
// Construct D = V S^{-1} : K x LDS
|
||||
@ -387,7 +382,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
||||
for (uint32_t i = 0; i < N_updates; i++) {
|
||||
const uint32_t row = Updates_index[i] - 1;
|
||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
|
||||
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
|
||||
}
|
||||
|
||||
// Compute determinant by LU decomposition
|
||||
@ -443,8 +438,8 @@ uint32_t qmckl_slagel_splitting(
|
||||
uint64_t *__restrict later_index, uint64_t *__restrict later,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t LDS = 24;
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
double __attribute__((aligned(8))) C[LDS];
|
||||
double __attribute__((aligned(8))) D[LDS];
|
||||
@ -457,8 +452,8 @@ uint32_t qmckl_slagel_splitting(
|
||||
C[i] = 0.0;
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||
}
|
||||
}
|
||||
|
||||
@ -474,8 +469,8 @@ uint32_t qmckl_slagel_splitting(
|
||||
// in later_updates
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t i = 0; i < LDS; i++) {
|
||||
later_updates[*later * LDS + i] = Updates[l * LDS + i] / 2.0;
|
||||
for (uint32_t i = 0; i < Lds; i++) {
|
||||
later_updates[*later * Lds + i] = Updates[l * Lds + i] / 2.0;
|
||||
C[i] /= 2.0;
|
||||
}
|
||||
later_index[*later] = Updates_index[l];
|
||||
@ -490,17 +485,17 @@ uint32_t qmckl_slagel_splitting(
|
||||
// D = v^T x S^{-1} : 1 x LDS
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
D[j] = Slater_inv[cui * LDS + j];
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
D[j] = Slater_inv[cui * Lds + j];
|
||||
}
|
||||
|
||||
// S^{-1} = S^{-1} - C x D / den
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
const double update = C[i] * D[j] * iden;
|
||||
Slater_inv[i * LDS + j] -= update;
|
||||
Slater_inv[i * Lds + j] -= update;
|
||||
}
|
||||
}
|
||||
l += 1;
|
||||
@ -516,18 +511,18 @@ uint32_t qmckl_sherman_morrison_splitting(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||
uint64_t later_index[N_updates];
|
||||
uint64_t later = 0;
|
||||
// uint32_t rc;
|
||||
|
||||
(void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
|
||||
(void) qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
|
||||
breakdown, Slater_inv, later_updates, later_index,
|
||||
&later, determinant);
|
||||
// rc = qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
|
||||
// rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
|
||||
// breakdown, Slater_inv, later_updates, later_index,
|
||||
// &later, determinant);
|
||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||
@ -535,11 +530,11 @@ uint32_t qmckl_sherman_morrison_splitting(
|
||||
if (later > 0) {
|
||||
recursive_calls++;
|
||||
// printf("Later > 0\n");
|
||||
(void) qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
||||
(void) qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||
later_index, breakdown, Slater_inv,
|
||||
determinant);
|
||||
|
||||
// rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
||||
// rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||
// later_index, breakdown, Slater_inv,
|
||||
// determinant);
|
||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
||||
@ -555,8 +550,8 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||
uint64_t later_index[N_updates];
|
||||
@ -564,31 +559,31 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
uint32_t rc;
|
||||
|
||||
if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2
|
||||
rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
|
||||
rc = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
|
||||
breakdown, Slater_inv, determinant);
|
||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||
block_fail += 1;
|
||||
uint64_t l = 0;
|
||||
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates,
|
||||
rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates,
|
||||
Updates_index, breakdown, Slater_inv,
|
||||
later_updates + (LDS * later),
|
||||
later_updates + (Lds * later),
|
||||
later_index + later, &l, determinant);
|
||||
later += l;
|
||||
}
|
||||
rc = qmckl_woodbury_2(LDS, Dim, &Updates[2*LDS], &Updates_index[2],
|
||||
rc = qmckl_woodbury_2(Lds, Dim, &Updates[2*Lds], &Updates_index[2],
|
||||
breakdown, Slater_inv, determinant);
|
||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||
block_fail += 1;
|
||||
uint64_t l = 0;
|
||||
rc = qmckl_slagel_splitting(LDS, Dim, 2, &Updates[2*LDS],
|
||||
rc = qmckl_slagel_splitting(Lds, Dim, 2, &Updates[2*Lds],
|
||||
&Updates_index[2], breakdown, Slater_inv,
|
||||
later_updates + (LDS * later),
|
||||
later_updates + (Lds * later),
|
||||
later_index + later, &l, determinant);
|
||||
later += l;
|
||||
}
|
||||
if (later > 0) {
|
||||
recursive_calls++;
|
||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
||||
rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||
later_index, breakdown, Slater_inv,
|
||||
determinant);
|
||||
}
|
||||
@ -600,21 +595,21 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
// Woodbury 3x3 kernel
|
||||
uint32_t n_of_3blocks = N_updates / 3;
|
||||
uint32_t remainder = N_updates % 3;
|
||||
uint32_t length_3block = 3 * LDS;
|
||||
uint32_t length_3block = 3 * Lds;
|
||||
|
||||
if (n_of_3blocks > 0) {
|
||||
for (uint32_t i = 0; i < n_of_3blocks; i++) {
|
||||
const double *Updates_3block = &Updates[i * length_3block];
|
||||
const uint64_t *Updates_index_3block = &Updates_index[i * 3];
|
||||
rc = qmckl_woodbury_3(LDS, Dim, Updates_3block, Updates_index_3block,
|
||||
rc = qmckl_woodbury_3(Lds, Dim, Updates_3block, Updates_index_3block,
|
||||
breakdown, Slater_inv, determinant);
|
||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||
// printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
|
||||
block_fail += 1;
|
||||
uint64_t l = 0;
|
||||
rc = qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block,
|
||||
rc = qmckl_slagel_splitting(Lds, Dim, 3, Updates_3block,
|
||||
Updates_index_3block, breakdown, Slater_inv,
|
||||
later_updates + (LDS * later),
|
||||
later_updates + (Lds * later),
|
||||
later_index + later, &l, determinant);
|
||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||
later += l;
|
||||
@ -626,15 +621,15 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
if (remainder == 2) {
|
||||
const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
|
||||
const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
|
||||
rc = qmckl_woodbury_2(LDS, Dim, Updates_2block, Updates_index_2block,
|
||||
rc = qmckl_woodbury_2(Lds, Dim, Updates_2block, Updates_index_2block,
|
||||
breakdown, Slater_inv, determinant);
|
||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||
// printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
|
||||
block_fail += 1;
|
||||
uint64_t l = 0;
|
||||
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block,
|
||||
rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates_2block,
|
||||
Updates_index_2block, breakdown, Slater_inv,
|
||||
later_updates + (LDS * later),
|
||||
later_updates + (Lds * later),
|
||||
later_index + later, &l, determinant);
|
||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||
later += l;
|
||||
@ -647,9 +642,9 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
|
||||
const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
|
||||
uint64_t l = 0;
|
||||
rc = qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block,
|
||||
rc = qmckl_slagel_splitting(Lds, Dim, 1, Updates_1block,
|
||||
Updates_index_1block, breakdown, Slater_inv,
|
||||
later_updates + (LDS * later),
|
||||
later_updates + (Lds * later),
|
||||
later_index + later, &l, determinant);
|
||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||
later += l;
|
||||
@ -658,7 +653,7 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
if (later > 0) {
|
||||
recursive_calls++;
|
||||
// printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
||||
rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||
later_index, breakdown, Slater_inv,
|
||||
determinant);
|
||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
||||
@ -674,10 +669,10 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = 21;
|
||||
const uint32_t LDS = 24;
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
|
||||
double __attribute__((aligned(8))) C[Dim];
|
||||
double __attribute__((aligned(8))) C[DIM];
|
||||
double __attribute__((aligned(8))) D[LDS];
|
||||
|
||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||
@ -693,8 +688,8 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
C[i] = 0.0;
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||
}
|
||||
}
|
||||
|
||||
@ -705,8 +700,8 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
// for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t i = 0; i < LDS; i++) {
|
||||
later_updates[later * LDS + i] = Updates[l * LDS + i];
|
||||
for (uint32_t i = 0; i < Lds; i++) {
|
||||
later_updates[later * Lds + i] = Updates[l * Lds + i];
|
||||
}
|
||||
later_index[later] = Updates_index[l];
|
||||
later++;
|
||||
@ -720,17 +715,17 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
// D = v^T x A^{-1}
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
D[j] = Slater_inv[cui * LDS + j];
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
D[j] = Slater_inv[cui * Lds + j];
|
||||
}
|
||||
|
||||
// S^{-1} = S^{-1} - C x D / den
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
const double update = C[i] * D[j] * iden;
|
||||
Slater_inv[i * LDS + j] -= update;
|
||||
Slater_inv[i * Lds + j] -= update;
|
||||
}
|
||||
}
|
||||
l += 1;
|
||||
@ -741,7 +736,7 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
}
|
||||
else if (later > 0) { // If some have failed, make a recursive call
|
||||
recursive_calls++;
|
||||
(void) qmckl_sherman_morrison_later(LDS, Dim, later, later_updates,
|
||||
(void) qmckl_sherman_morrison_later(Lds, Dim, later, later_updates,
|
||||
later_index, breakdown, Slater_inv, determinant);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include <stdint.h>
|
||||
#include "meuk.h"
|
||||
#include "cycles.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
|
||||
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
|
||||
@ -23,11 +23,11 @@ int main(int argc, char **argv) {
|
||||
char slater_key[32];
|
||||
char slater_inv_key[32];
|
||||
char det_key[32];
|
||||
const uint64_t Dim = 21;
|
||||
const uint64_t LDS = 24;
|
||||
const uint64_t Dim = DIM;
|
||||
const uint64_t Lds = LDS;
|
||||
uint64_t N_updates;
|
||||
double Slater[LDS * Dim ], SlaterT[LDS * Dim];
|
||||
double Slater_invT[LDS * Dim], Slater_invT_copy[LDS * Dim];
|
||||
double Slater[LDS * DIM ], SlaterT[LDS * DIM];
|
||||
double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
|
||||
double determinant, determinant_copy;
|
||||
|
||||
// SETUP TEST PARAMETERS
|
||||
@ -52,7 +52,7 @@ printf("#-----------------------------------------------------------------------
|
||||
sprintf(det_key, "/cycle_%d/determinant", cycle);
|
||||
read_uint(file_id, nupds_key, &N_updates);
|
||||
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
|
||||
double *Updates = malloc(LDS * N_updates * sizeof(double));
|
||||
double *Updates = malloc(Lds * N_updates * sizeof(double));
|
||||
read_uint(file_id, upd_idx_key, Updates_index);
|
||||
read_double(file_id, upds_key, Updates);
|
||||
read_double(file_id, slater_key, Slater);
|
||||
@ -60,28 +60,28 @@ printf("#-----------------------------------------------------------------------
|
||||
read_double(file_id, det_key, &determinant);
|
||||
|
||||
// Compute transpose of S. ST: 24 x 21
|
||||
for (int i = 0; i < LDS; i++) {
|
||||
for (int i = 0; i < Lds; i++) {
|
||||
for (int j = 0; j < Dim; j++) {
|
||||
SlaterT[i * Dim + j] = Slater[j * LDS + i];
|
||||
SlaterT[i * Dim + j] = Slater[j * Lds + i];
|
||||
}
|
||||
}
|
||||
|
||||
// Convert repl. upds into additive upds.
|
||||
for (int i = 0; i < N_updates; i++) {
|
||||
int col = Updates_index[i] - 1;
|
||||
for (int j = 0; j < LDS; j++) {
|
||||
Updates[i * LDS + j] -= SlaterT[col + j * Dim];
|
||||
for (int j = 0; j < Lds; j++) {
|
||||
Updates[i * Lds + j] -= SlaterT[col + j * Dim];
|
||||
}
|
||||
}
|
||||
|
||||
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
|
||||
uint32_t err_inp = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
|
||||
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
|
||||
|
||||
// Update Slater matrix
|
||||
for (int i = 0; i < N_updates; i++) {
|
||||
int col = Updates_index[i] - 1;
|
||||
for (int j = 0; j < Dim; j++) {
|
||||
SlaterT[col + j * Dim] += Updates[i * LDS + j];
|
||||
SlaterT[col + j * Dim] += Updates[i * Lds + j];
|
||||
}
|
||||
} // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
|
||||
|
||||
@ -98,7 +98,7 @@ printf("#-----------------------------------------------------------------------
|
||||
for (int rep = 0; rep < REPETITIONS; rep++) {
|
||||
|
||||
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
|
||||
memcpy(Slater_invT_copy, Slater_invT, LDS * Dim * sizeof(double));
|
||||
memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double));
|
||||
determinant_copy = determinant;
|
||||
|
||||
// ### CHOOSE A KERNEL:
|
||||
@ -109,10 +109,10 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// err_break = 0;
|
||||
|
||||
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
|
||||
|
||||
// for (int i = 0; i < N_updates; i++) {
|
||||
// Upds = &Updates[i * LDS];
|
||||
// Upds = &Updates[i * Lds];
|
||||
// Ui = &Updates_index[i];
|
||||
// determinant_previous = determinant_copy;
|
||||
|
||||
@ -120,7 +120,7 @@ printf("#-----------------------------------------------------------------------
|
||||
// uint64_t before = rdtsc();
|
||||
|
||||
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
// detupd(Dim, LDS, Upds, Ui, Slater_invT_copy, &determinant_copy);
|
||||
// detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
|
||||
|
||||
// // 3. FETCH FINISH TIME
|
||||
// uint64_t after = rdtsc();
|
||||
@ -137,9 +137,9 @@ printf("#-----------------------------------------------------------------------
|
||||
// }
|
||||
|
||||
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
|
||||
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
|
||||
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
|
||||
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
|
||||
// }
|
||||
// } else if (version[0] == 'n') { // Naive
|
||||
if (version[0] == 'n') { // Naive
|
||||
@ -147,7 +147,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates,
|
||||
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -161,7 +161,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison_later(LDS, Dim, N_updates, Updates,
|
||||
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -175,7 +175,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
|
||||
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
|
||||
breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -190,7 +190,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
|
||||
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
|
||||
breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -205,7 +205,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates,
|
||||
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -220,7 +220,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_k_cublas_offload(LDS, Dim, N_updates, Updates,
|
||||
err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -235,7 +235,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
|
||||
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -249,7 +249,7 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
|
||||
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
@ -260,7 +260,7 @@ printf("#-----------------------------------------------------------------------
|
||||
} else if (version[0] == 'm') { // LAPACK/MKL
|
||||
|
||||
// Only send upper Dim x Dim part of matrix to lapack
|
||||
double tmp[Dim*Dim];
|
||||
double tmp[DIM *DIM];
|
||||
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
|
||||
|
||||
// 1. FETCH START TIME
|
||||
@ -274,9 +274,9 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// Copy elements of inverse back, adding 0-padding in "correct" place
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < LDS; j++) {
|
||||
if (j < Dim) Slater_invT_copy[i * LDS + j] = tmp[i * Dim + j];
|
||||
else Slater_invT_copy[i * LDS + j] = 0.0;
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
|
||||
else Slater_invT_copy[i * Lds + j] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -290,7 +290,7 @@ printf("#-----------------------------------------------------------------------
|
||||
} // END OF REPETITIONS LOOP
|
||||
|
||||
// 4. COPY RESULT BACK TO ORIGINAL
|
||||
memcpy(Slater_invT, Slater_invT_copy, LDS * Dim * sizeof(double));
|
||||
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
|
||||
determinant = determinant_copy;
|
||||
// At this point Slater_invT contains the correct inverse matrix
|
||||
|
||||
@ -306,20 +306,20 @@ printf("#-----------------------------------------------------------------------
|
||||
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
|
||||
cumulative += accumulator;
|
||||
|
||||
double SSi[Dim * Dim];
|
||||
matmul(SlaterT, Slater_invT, SSi, LDS, Dim);
|
||||
double Res[Dim * Dim];
|
||||
double SSi[DIM * DIM];
|
||||
matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
|
||||
double Res[DIM * DIM];
|
||||
residual(SSi, Res, Dim);
|
||||
const double max = max_norm(Res, Dim, Dim);
|
||||
|
||||
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
|
||||
uint32_t err_out = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
|
||||
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
|
||||
// int32_t err_out = check_error_better(max, tolerance);
|
||||
|
||||
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
|
||||
|
||||
// 8. COMPUTE CONDITION NUMBER
|
||||
const double condnr = condition_number(Slater, Slater_invT, LDS, Dim);
|
||||
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
|
||||
const double frob = frobenius_norm(Res, Dim, Dim);
|
||||
|
||||
|
||||
|
@ -1,9 +0,0 @@
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
Loading…
Reference in New Issue
Block a user