mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2025-01-12 22:18:36 +01:00
- Got rid of NVC compiler warnings
- Included lib paths for MKL/HDF5 and cuBLAS - Cleaned Makefile - Added GPU node session request script
This commit is contained in:
parent
fa03590f6f
commit
87e319189e
@ -1,19 +1,11 @@
|
|||||||
# FC = gfortran
|
|
||||||
# CC = gcc
|
|
||||||
# FFLAGS=-O0 -finline -g -lm -Wall -pedantic
|
|
||||||
# CFLAGS=-std=c99 -O0 -finline -g -lm -Wall -pedantic
|
|
||||||
FC = ifx
|
FC = ifx
|
||||||
CC = icx
|
CC = nvc
|
||||||
# FFLAGS=-O0 -warn all -g -pedantic
|
|
||||||
# CFLAGS=-std=c99 -O0 -Wall -g -pedantic
|
CFLAGS=-std=c99 -O3 -Wall -g
|
||||||
FFLAGS=-O3 -warn all -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
|
|
||||||
CFLAGS=-std=c99 -O3 -Wall -finline -xCORE-AVX2 -g -qopenmp -fopenmp-targets=spir64
|
LFLAGS=-L$(HDF5_DIR)/lib -lhdf5 -lhdf5_hl
|
||||||
INCLUDE=-I/usr/include/hdf5/serial -I/usr/local/cuda/include
|
LFLAGS+=-L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
|
||||||
LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential -L/usr/local/cuda-11.7/targets/x86_64-linux/lib -lcublas
|
LFLAGS+=-lcublas
|
||||||
#FC = verificarlo-f
|
|
||||||
#CC = verificarlo-c
|
|
||||||
#FFLAGS=-O3 -finline -g
|
|
||||||
#CFLAGS=-O3 -finline -g
|
|
||||||
|
|
||||||
## Link with icc
|
## Link with icc
|
||||||
# test: sm.o test.o detupdate21.o meuk.o
|
# test: sm.o test.o detupdate21.o meuk.o
|
||||||
@ -21,16 +13,6 @@ LFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial -lhdf5 -lhdf5_hl -qmkl=sequential
|
|||||||
test: sm.o test.o meuk.o
|
test: sm.o test.o meuk.o
|
||||||
$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
|
$(CC) $(LFLAGS) -o test sm.o test.o meuk.o
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Link with ifort
|
|
||||||
# test: sm.o test.o detupdate21.o meuk.o
|
|
||||||
# $(FC) $(LFLAGS) -nofor-main -o test sm.o detupdate21.o test.o meuk.o
|
|
||||||
|
|
||||||
## Link with gfortran
|
|
||||||
# test: sm.o test.o detupdate21.o meuk.o
|
|
||||||
# $(FC) $(LFLAGS) -Wno-main -o test sm.o detupdate21.o test.o meuk.o
|
|
||||||
|
|
||||||
%.o: %.f90
|
%.o: %.f90
|
||||||
$(FC) $(FFLAGS) -c -o $@ $<
|
$(FC) $(FFLAGS) -c -o $@ $<
|
||||||
|
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
41 15 0 0 0 0 0 1.266302e-05 2.506974e-05 7.553522e+04 3.249420e+03 2.166280e+02 3.249420e+03 0
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
4
independent_test_harness/goto_gpu_node.sh
Executable file
4
independent_test_harness/goto_gpu_node.sh
Executable file
@ -0,0 +1,4 @@
|
|||||||
|
salloc --nodes=1 --account=prcoe10 -p booster --gres=gpu:1
|
||||||
|
wait
|
||||||
|
srun --pty /bin/bash
|
||||||
|
|
@ -1,9 +0,0 @@
|
|||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
41 15 0 0 0 0 0 1.266299e-05 2.506969e-05 7.553522e+04 1.161662e+04 7.744417e+02 1.161662e+04 0
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
@ -1,6 +1,8 @@
|
|||||||
#include <mkl_lapacke.h>
|
#include <mkl_lapacke.h>
|
||||||
#include <mkl.h>
|
#include <mkl.h>
|
||||||
|
|
||||||
|
#define DIM 21
|
||||||
|
#define LDS 24
|
||||||
#define HAVE_CUBLAS_OFFLOAD
|
#define HAVE_CUBLAS_OFFLOAD
|
||||||
|
|
||||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||||
@ -9,7 +11,7 @@
|
|||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
lapack_int inverse(double *A, uint64_t Dim, uint64_t LDS);
|
lapack_int inverse(double *A, uint64_t Dim, uint64_t Lds);
|
||||||
|
|
||||||
int min(int a, int b);
|
int min(int a, int b);
|
||||||
|
|
||||||
@ -72,7 +74,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
Slater_inv,
|
Slater_inv,
|
||||||
double *__restrict determinant);
|
double *__restrict determinant);
|
||||||
|
|
||||||
void detupd(const uint64_t Dim, const uint64_t LDS,
|
void detupd(const uint64_t Dim, const uint64_t Lds,
|
||||||
const double *__restrict __attribute__((aligned(8))) Updates,
|
const double *__restrict __attribute__((aligned(8))) Updates,
|
||||||
const uint64_t *__restrict Updates_index,
|
const uint64_t *__restrict Updates_index,
|
||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
@ -84,4 +86,3 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
const uint64_t *__restrict Updates_index, const double breakdown,
|
const uint64_t *__restrict Updates_index, const double breakdown,
|
||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant);
|
double *__restrict determinant);
|
||||||
|
|
@ -2,27 +2,27 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim) {
|
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||||
for (uint64_t i = 0; i < LDS * Dim; i++) {
|
for (uint64_t i = 0; i < Lds * Dim; i++) {
|
||||||
printf("%f\n", A[i]);
|
printf("%f\n", A[i]);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim) {
|
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||||
double sum2 = 0;
|
double sum2 = 0;
|
||||||
for (uint64_t i = 0; i < LDS * Dim; i++) sum2 += A[i] * A[i];
|
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
|
||||||
return sum2;
|
return sum2;
|
||||||
}
|
}
|
||||||
|
|
||||||
double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
|
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||||
double sum2 = frobenius_norm2(A, LDS, Dim);
|
double sum2 = frobenius_norm2(A, Lds, Dim);
|
||||||
return sqrt(sum2);
|
return sqrt(sum2);
|
||||||
}
|
}
|
||||||
|
|
||||||
double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
|
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||||
double largest = 0;
|
double largest = 0;
|
||||||
for (uint64_t i = 0; i < LDS * Dim; i++) {
|
for (uint64_t i = 0; i < Lds * Dim; i++) {
|
||||||
double elm = A[i];
|
double elm = A[i];
|
||||||
double felm = fabs(elm);
|
double felm = fabs(elm);
|
||||||
if (elm != elm) return -1.0; // Return a negative norm when NaN found
|
if (elm != elm) return -1.0; // Return a negative norm when NaN found
|
||||||
@ -31,9 +31,9 @@ double max_norm(double *A, const uint64_t LDS, const uint64_t Dim) {
|
|||||||
return largest;
|
return largest;
|
||||||
}
|
}
|
||||||
|
|
||||||
double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim) {
|
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim) {
|
||||||
double norm_A = frobenius_norm(A, LDS, Dim);
|
double norm_A = frobenius_norm(A, Lds, Dim);
|
||||||
double norm_Ainv = frobenius_norm(Ainv, LDS, Dim);
|
double norm_Ainv = frobenius_norm(Ainv, Lds, Dim);
|
||||||
return fabs(norm_A) * fabs(norm_Ainv);
|
return fabs(norm_A) * fabs(norm_Ainv);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,19 +57,19 @@ void read_double(hid_t file_id, const char *key, double *data) {
|
|||||||
assert(rc >= 0 && "H5Dclose");
|
assert(rc >= 0 && "H5Dclose");
|
||||||
}
|
}
|
||||||
|
|
||||||
void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
|
void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
|
||||||
const uint64_t N_updates, const double *Updates,
|
const uint64_t N_updates, const double *Updates,
|
||||||
const uint64_t *Updates_index, double *Slater) {
|
const uint64_t *Updates_index, double *Slater) {
|
||||||
|
|
||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
uint32_t col = Updates_index[i] - 1;
|
uint32_t col = Updates_index[i] - 1;
|
||||||
for (uint32_t j = 0; j < Dim; j++) {
|
for (uint32_t j = 0; j < Dim; j++) {
|
||||||
Slater[col * Dim + j] += Updates[i * LDS + j];
|
Slater[col * Dim + j] += Updates[i * Lds + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
|
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
|
||||||
double *Slater, const double tolerance) {
|
double *Slater, const double tolerance) {
|
||||||
|
|
||||||
double res[Dim*Dim];
|
double res[Dim*Dim];
|
||||||
@ -78,7 +78,7 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
|
|||||||
for (uint32_t j = 0; j < Dim; j++) {
|
for (uint32_t j = 0; j < Dim; j++) {
|
||||||
res[i * Dim + j] = 0;
|
res[i * Dim + j] = 0;
|
||||||
for (uint32_t k = 0; k < Dim; k++) {
|
for (uint32_t k = 0; k < Dim; k++) {
|
||||||
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * LDS + j];
|
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -95,12 +95,12 @@ uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim) {
|
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
for (uint32_t j = 0; j < Dim; j++) {
|
for (uint32_t j = 0; j < Dim; j++) {
|
||||||
prod[i * Dim + j] = 0;
|
prod[i * Dim + j] = 0;
|
||||||
for (uint32_t k = 0; k < Dim; k++) {
|
for (uint32_t k = 0; k < Dim; k++) {
|
||||||
prod[i * Dim + j] += a[i * Dim + k] * b[k * LDS + j];
|
prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -121,7 +121,7 @@ void residual(double *a, double *res, const uint64_t Dim) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
|
uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
|
||||||
const uint64_t N_updates, const double *Updates,
|
const uint64_t N_updates, const double *Updates,
|
||||||
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
||||||
double *Slater, double *Slater_inv, double *determinant) {
|
double *Slater, double *Slater_inv, double *determinant) {
|
||||||
@ -129,40 +129,40 @@ uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
|
|||||||
// if (version[0] == 'a') { // Anthony
|
// if (version[0] == 'a') { // Anthony
|
||||||
// const double *Upds;
|
// const double *Upds;
|
||||||
// const uint64_t *Ui;
|
// const uint64_t *Ui;
|
||||||
// for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] *= *determinant;
|
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
|
||||||
// for (int j = 0; j < N_updates; j++) {
|
// for (int j = 0; j < N_updates; j++) {
|
||||||
// Upds = &Updates[j * LDS];
|
// Upds = &Updates[j * Lds];
|
||||||
// Ui = &Updates_index[j];
|
// Ui = &Updates_index[j];
|
||||||
// detupd(Dim, LDS, Upds, Ui, Slater_inv, determinant);
|
// detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
|
||||||
// if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
|
// if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
|
||||||
// }
|
// }
|
||||||
// for (int i = 0; i < LDS * Dim; i++) Slater_inv[i] /= *determinant;
|
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
|
||||||
// update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
// update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||||
// rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
|
// rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
|
||||||
// if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
// if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||||
// } else if (version[0] == 'n') { // Naive
|
// } else if (version[0] == 'n') { // Naive
|
||||||
if (version[0] == 'n') { // Naive
|
if (version[0] == 'n') { // Naive
|
||||||
rc = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates, Updates_index,
|
rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
|
||||||
breakdown, Slater_inv, determinant);
|
breakdown, Slater_inv, determinant);
|
||||||
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison failed\n");
|
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison failed\n");
|
||||||
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||||
rc = check_error(LDS, Dim, Slater_inv, Slater, tolerance);
|
rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
|
||||||
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||||
} else if (version[0] == 's') { // Splitting
|
} else if (version[0] == 's') { // Splitting
|
||||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
|
rc = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_inv,
|
Updates_index, breakdown, Slater_inv,
|
||||||
determinant);
|
determinant);
|
||||||
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_splitting failed\n");
|
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_splitting failed\n");
|
||||||
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||||
rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
|
rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
|
||||||
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||||
} else if (version[0] == 'b') { // Blocked
|
} else if (version[0] == 'b') { // Blocked
|
||||||
rc = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
|
rc = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_inv,
|
Updates_index, breakdown, Slater_inv,
|
||||||
determinant);
|
determinant);
|
||||||
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_smw32s failed\n");
|
if (rc != 0) printf("TEST_KERNEL: qmckl_sherman_morrison_smw32s failed\n");
|
||||||
update_slater_matrix(LDS, Dim, N_updates, Updates, Updates_index, Slater);
|
update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||||
rc = check_error(LDS, Dim, Slater, Slater_inv, tolerance);
|
rc = check_error(Lds, Dim, Slater, Slater_inv, tolerance);
|
||||||
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -12,13 +12,13 @@ typedef struct Error {
|
|||||||
uint64_t error;
|
uint64_t error;
|
||||||
} Error;
|
} Error;
|
||||||
|
|
||||||
void matmul(double *a, double *b, double *prod, const uint64_t LDS, const uint64_t Dim);
|
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
|
||||||
void residual(double *a, double *res, const uint64_t Dim);
|
void residual(double *a, double *res, const uint64_t Dim);
|
||||||
double frobenius_norm2(double *A, const uint64_t LDS, const uint64_t Dim);
|
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
void print_matrix(double *A, const uint64_t LDS, const uint64_t Dim);
|
void print_matrix(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
double frobenius_norm(double *A, const uint64_t LDS, const uint64_t Dim);
|
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
double max_norm(double *A, const uint64_t LDS, const uint64_t Dim);
|
double max_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||||
double condition_number(double *A, double *Ainv, const uint64_t LDS, const uint64_t Dim);
|
double condition_number(double *A, double *Ainv, const uint64_t Lds, const uint64_t Dim);
|
||||||
void read_uint(hid_t file_id, const char *key, uint64_t *data);
|
void read_uint(hid_t file_id, const char *key, uint64_t *data);
|
||||||
void read_double(hid_t file_id, const char *key, double *data);
|
void read_double(hid_t file_id, const char *key, double *data);
|
||||||
|
|
||||||
@ -28,16 +28,16 @@ static __inline__ uint64_t rdtsc(void) {
|
|||||||
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
|
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
void update_slater_matrix(const uint64_t LDS, const uint64_t Dim,
|
void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
|
||||||
const uint64_t N_updates, const double *Updates,
|
const uint64_t N_updates, const double *Updates,
|
||||||
const uint64_t *Updates_index, double *Slater);
|
const uint64_t *Updates_index, double *Slater);
|
||||||
|
|
||||||
uint32_t check_error(const uint64_t LDS, const uint64_t Dim, double *Slater_invT,
|
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
|
||||||
double *Slater, const double tolerance);
|
double *Slater, const double tolerance);
|
||||||
|
|
||||||
int32_t check_error_better(const double max, const double tolerance);
|
int32_t check_error_better(const double max, const double tolerance);
|
||||||
|
|
||||||
uint32_t test_kernel(char *version, const uint64_t LDS, const uint64_t Dim,
|
uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
|
||||||
const uint64_t N_updates, const double *Updates,
|
const uint64_t N_updates, const double *Updates,
|
||||||
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
||||||
double *Slater, double *Slater_inv, double *determinant);
|
double *Slater, double *Slater_inv, double *determinant);
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
41 15 0 0 0 0 0 2.278445e-12 8.167505e-12 7.553484e+04 1.771759e+04 1.181172e+03 1.771759e+04 0
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
@ -17,10 +17,10 @@ uint32_t qmckl_sherman_morrison(
|
|||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
double __attribute__((aligned(8))) C[Dim];
|
double __attribute__((aligned(8))) C[DIM];
|
||||||
double __attribute__((aligned(8))) D[LDS];
|
double __attribute__((aligned(8))) D[LDS];
|
||||||
|
|
||||||
uint32_t l = 0;
|
uint32_t l = 0;
|
||||||
@ -31,8 +31,8 @@ uint32_t qmckl_sherman_morrison(
|
|||||||
C[i] = 0.0;
|
C[i] = 0.0;
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -51,17 +51,17 @@ uint32_t qmckl_sherman_morrison(
|
|||||||
|
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
D[j] = Slater_inv[cui * LDS + j]; // selecting proper column of v_l^T * S_inv
|
D[j] = Slater_inv[cui * Lds + j]; // selecting proper column of v_l^T * S_inv
|
||||||
}
|
}
|
||||||
|
|
||||||
// A^{-1} = A^{-1} - C x D / den
|
// A^{-1} = A^{-1} - C x D / den
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
const double update = C[i] * D[j] * iden;
|
const double update = C[i] * D[j] * iden;
|
||||||
Slater_inv[i * LDS + j] -= update;
|
Slater_inv[i * Lds + j] -= update;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
l += 1;
|
l += 1;
|
||||||
@ -69,6 +69,15 @@ uint32_t qmckl_sherman_morrison(
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS,
|
||||||
|
where S^{-1}P : Dim x LDS,
|
||||||
|
C := S^{-1}PP^TU : Dim x 2,
|
||||||
|
B := 1 + VC : 2 x 2,
|
||||||
|
D := VS^{-1}P : 2 x LDS,
|
||||||
|
P^TU : LDS x 2,
|
||||||
|
V : 2 x Dim
|
||||||
|
*/
|
||||||
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||||
const double *__restrict __attribute__((aligned(8)))
|
const double *__restrict __attribute__((aligned(8)))
|
||||||
Updates,
|
Updates,
|
||||||
@ -77,31 +86,23 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
double *__restrict __attribute__((aligned(8)))
|
double *__restrict __attribute__((aligned(8)))
|
||||||
Slater_inv,
|
Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
const uint32_t Dim = 21;
|
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Dim = DIM;
|
||||||
/*
|
const uint32_t Lds = LDS;
|
||||||
COMPUTE S^{-1}P - CB^{-1}D : Dim x LDS,
|
|
||||||
where S^{-1}P : Dim x LDS,
|
|
||||||
C := S^{-1}PP^TU : Dim x 2,
|
|
||||||
B := 1 + VC : 2 x 2,
|
|
||||||
D := VS^{-1}P : 2 x LDS,
|
|
||||||
P^TU : LDS x 2,
|
|
||||||
V : 2 x Dim
|
|
||||||
*/
|
|
||||||
|
|
||||||
const uint32_t row1 = (Updates_index[0] - 1);
|
const uint32_t row1 = (Updates_index[0] - 1);
|
||||||
const uint32_t row2 = (Updates_index[1] - 1);
|
const uint32_t row2 = (Updates_index[1] - 1);
|
||||||
|
|
||||||
// Compute C = (S^T)^{-1}U : Dim x 2
|
// Compute C = (S^T)^{-1}U : Dim x 2
|
||||||
double __attribute__((aligned(8))) C[2 * Dim];
|
double __attribute__((aligned(8))) C[2 * DIM];
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
C[i * 2] = 0;
|
C[i * 2] = 0;
|
||||||
C[i * 2 + 1] = 0;
|
C[i * 2 + 1] = 0;
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t k = 0; k < LDS; k++) {
|
for (uint32_t k = 0; k < Lds; k++) {
|
||||||
C[i * 2] += Slater_inv[i * LDS + k] * Updates[k];
|
C[i * 2] += Slater_inv[i * Lds + k] * Updates[k];
|
||||||
C[i * 2 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
|
C[i * 2 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,28 +131,37 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
|
|
||||||
// tmp = B^{-1}D : 2 x LDS
|
// tmp = B^{-1}D : 2 x LDS
|
||||||
double __attribute__((aligned(8))) tmp[2 * LDS];
|
double __attribute__((aligned(8))) tmp[2 * LDS];
|
||||||
double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
|
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
|
||||||
double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
|
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
|
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
|
||||||
tmp[LDS + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
|
tmp[Lds + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
|
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
Slater_inv[i * LDS + j] -= C[i * 2] * tmp[j];
|
Slater_inv[i * Lds + j] -= C[i * 2] * tmp[j];
|
||||||
Slater_inv[i * LDS + j] -= C[i * 2 + 1] * tmp[LDS + j];
|
Slater_inv[i * Lds + j] -= C[i * 2 + 1] * tmp[Lds + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS,
|
||||||
|
where S^T : Dim x LDS,
|
||||||
|
C := (S^T)^{-1}U : Dim x 3,
|
||||||
|
B := 1 + VC : 3 x 3,
|
||||||
|
D := V(S^T)^{-1} : 3 x LDS,
|
||||||
|
U : LDS x 3,
|
||||||
|
V : 3 x Dim
|
||||||
|
*/
|
||||||
uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
||||||
const double *__restrict __attribute__((aligned(8)))
|
const double *__restrict __attribute__((aligned(8)))
|
||||||
Updates,
|
Updates,
|
||||||
@ -160,34 +170,26 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
double *__restrict __attribute__((aligned(8)))
|
double *__restrict __attribute__((aligned(8)))
|
||||||
Slater_inv,
|
Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
const uint32_t Dim = 21;
|
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Dim = DIM;
|
||||||
/*
|
const uint32_t Lds = LDS;
|
||||||
COMPUTE (S^T)^{-1} - CB^{-1}D : Dim x LDS,
|
|
||||||
where S^T : Dim x LDS,
|
|
||||||
C := (S^T)^{-1}U : Dim x 3,
|
|
||||||
B := 1 + VC : 3 x 3,
|
|
||||||
D := V(S^T)^{-1} : 3 x LDS,
|
|
||||||
U : LDS x 3,
|
|
||||||
V : 3 x Dim
|
|
||||||
*/
|
|
||||||
|
|
||||||
const uint32_t row1 = (Updates_index[0] - 1);
|
const uint32_t row1 = (Updates_index[0] - 1);
|
||||||
const uint32_t row2 = (Updates_index[1] - 1);
|
const uint32_t row2 = (Updates_index[1] - 1);
|
||||||
const uint32_t row3 = (Updates_index[2] - 1);
|
const uint32_t row3 = (Updates_index[2] - 1);
|
||||||
|
|
||||||
// Compute C = (S^T)^{-1}U : Dim x 3
|
// Compute C = (S^T)^{-1}U : Dim x 3
|
||||||
double __attribute__((aligned(8))) C[3 * Dim];
|
double __attribute__((aligned(8))) C[3 * DIM];
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
C[i * 3] = 0;
|
C[i * 3] = 0;
|
||||||
C[i * 3 + 1] = 0;
|
C[i * 3 + 1] = 0;
|
||||||
C[i * 3 + 2] = 0;
|
C[i * 3 + 2] = 0;
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t k = 0; k < LDS; k++) {
|
for (uint32_t k = 0; k < Lds; k++) {
|
||||||
C[i * 3] += Slater_inv[i * LDS + k] * Updates[k];
|
C[i * 3] += Slater_inv[i * Lds + k] * Updates[k];
|
||||||
C[i * 3 + 1] += Slater_inv[i * LDS + k] * Updates[LDS + k];
|
C[i * 3 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
|
||||||
C[i * 3 + 2] += Slater_inv[i * LDS + k] * Updates[2 * LDS + k];
|
C[i * 3 + 2] += Slater_inv[i * Lds + k] * Updates[2 * Lds + k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -233,20 +235,20 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
|||||||
double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
|
double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
|
tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
|
||||||
tmp[LDS + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
|
tmp[Lds + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
|
||||||
tmp[2 * LDS + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
|
tmp[2 * Lds + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute (S^T)^{-1} - C * tmp : Dim x LDS
|
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
Slater_inv[i * LDS + j] -= C[i * 3] * tmp[j];
|
Slater_inv[i * Lds + j] -= C[i * 3] * tmp[j];
|
||||||
Slater_inv[i * LDS + j] -= C[i * 3 + 1] * tmp[LDS + j];
|
Slater_inv[i * Lds + j] -= C[i * 3 + 1] * tmp[Lds + j];
|
||||||
Slater_inv[i * LDS + j] -= C[i * 3 + 2] * tmp[2 * LDS + j];
|
Slater_inv[i * Lds + j] -= C[i * 3 + 2] * tmp[2 * Lds + j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -273,15 +275,15 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||||
double C[Dim * N_updates];
|
double C[DIM * N_updates];
|
||||||
double alpha = 1.0, beta = 0.0;
|
double alpha = 1.0, beta = 0.0;
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
Dim, N_updates, LDS,
|
Dim, N_updates, Lds,
|
||||||
alpha, Slater_inv, LDS, Updates, LDS,
|
alpha, Slater_inv, Lds, Updates, Lds,
|
||||||
beta, C, N_updates);
|
beta, C, N_updates);
|
||||||
|
|
||||||
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
||||||
@ -290,7 +292,7 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
|||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
const uint32_t row = Updates_index[i] - 1;
|
const uint32_t row = Updates_index[i] - 1;
|
||||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||||
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
|
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute determinant by LU decomposition
|
// Compute determinant by LU decomposition
|
||||||
@ -345,41 +347,34 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
|||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
|
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||||
|
// double C[Dim * N_updates];
|
||||||
|
double *C = malloc(DIM * N_updates * sizeof(double));
|
||||||
|
double alpha = 1.0, beta = 0.0;
|
||||||
|
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
|
// Dim, N_updates, Lds,
|
||||||
|
// alpha, Slater_inv, Lds, Updates, Lds,
|
||||||
|
// beta, C, N_updates);
|
||||||
//cuBLAS initialization
|
//cuBLAS initialization
|
||||||
cublasHandle_t handle;
|
cublasHandle_t handle;
|
||||||
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
|
||||||
fprintf(stdout, "cuBLAS initialization failed!\n");
|
fprintf(stdout, "cuBLAS initialization failed!\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
#pragma omp target enter data map(to:Slater_inv, Updates, C)
|
||||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
|
||||||
double C[Dim * N_updates];
|
{
|
||||||
double alpha = 1.0, beta = 0.0;
|
int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
|
||||||
|
Dim, N_updates, Lds,
|
||||||
// #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:M*N*walk_num], tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
|
&alpha, Slater_inv, Lds, Updates, Lds,
|
||||||
// #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
|
&beta, C, N_updates);
|
||||||
// {
|
}
|
||||||
// for (int nw=0; nw < walk_num; ++nw) {
|
#pragma omp target exit data map(from:C)
|
||||||
// int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
|
|
||||||
// &(een_rescaled_e[nw*(cord_num+1)]),
|
|
||||||
// LDA, af,
|
|
||||||
// &(een_rescaled_n[bf*nw]),
|
|
||||||
// LDB, 0,
|
|
||||||
// &beta,
|
|
||||||
// &(tmp_c[nw*cord_num]),
|
|
||||||
// LDC, cf, cord_num);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
|
|
||||||
cublasDestroy(handle);
|
cublasDestroy(handle);
|
||||||
|
|
||||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
|
||||||
Dim, N_updates, LDS,
|
|
||||||
alpha, Slater_inv, LDS, Updates, LDS,
|
|
||||||
beta, C, N_updates);
|
|
||||||
|
|
||||||
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
|
||||||
// Construct D = V S^{-1} : K x LDS
|
// Construct D = V S^{-1} : K x LDS
|
||||||
@ -387,7 +382,7 @@ uint32_t qmckl_woodbury_k_cublas_offload(const uint64_t vLDS,
|
|||||||
for (uint32_t i = 0; i < N_updates; i++) {
|
for (uint32_t i = 0; i < N_updates; i++) {
|
||||||
const uint32_t row = Updates_index[i] - 1;
|
const uint32_t row = Updates_index[i] - 1;
|
||||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||||
for (uint32_t j = 0; j < LDS; j++) D[i * LDS + j] = Slater_inv[row * LDS + j];
|
for (uint32_t j = 0; j < Lds; j++) D[i * Lds + j] = Slater_inv[row * Lds + j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute determinant by LU decomposition
|
// Compute determinant by LU decomposition
|
||||||
@ -443,8 +438,8 @@ uint32_t qmckl_slagel_splitting(
|
|||||||
uint64_t *__restrict later_index, uint64_t *__restrict later,
|
uint64_t *__restrict later_index, uint64_t *__restrict later,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
double __attribute__((aligned(8))) C[LDS];
|
double __attribute__((aligned(8))) C[LDS];
|
||||||
double __attribute__((aligned(8))) D[LDS];
|
double __attribute__((aligned(8))) D[LDS];
|
||||||
@ -457,8 +452,8 @@ uint32_t qmckl_slagel_splitting(
|
|||||||
C[i] = 0.0;
|
C[i] = 0.0;
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -474,8 +469,8 @@ uint32_t qmckl_slagel_splitting(
|
|||||||
// in later_updates
|
// in later_updates
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t i = 0; i < LDS; i++) {
|
for (uint32_t i = 0; i < Lds; i++) {
|
||||||
later_updates[*later * LDS + i] = Updates[l * LDS + i] / 2.0;
|
later_updates[*later * Lds + i] = Updates[l * Lds + i] / 2.0;
|
||||||
C[i] /= 2.0;
|
C[i] /= 2.0;
|
||||||
}
|
}
|
||||||
later_index[*later] = Updates_index[l];
|
later_index[*later] = Updates_index[l];
|
||||||
@ -490,17 +485,17 @@ uint32_t qmckl_slagel_splitting(
|
|||||||
// D = v^T x S^{-1} : 1 x LDS
|
// D = v^T x S^{-1} : 1 x LDS
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
D[j] = Slater_inv[cui * LDS + j];
|
D[j] = Slater_inv[cui * Lds + j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// S^{-1} = S^{-1} - C x D / den
|
// S^{-1} = S^{-1} - C x D / den
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
const double update = C[i] * D[j] * iden;
|
const double update = C[i] * D[j] * iden;
|
||||||
Slater_inv[i * LDS + j] -= update;
|
Slater_inv[i * Lds + j] -= update;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
l += 1;
|
l += 1;
|
||||||
@ -516,18 +511,18 @@ uint32_t qmckl_sherman_morrison_splitting(
|
|||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||||
uint64_t later_index[N_updates];
|
uint64_t later_index[N_updates];
|
||||||
uint64_t later = 0;
|
uint64_t later = 0;
|
||||||
// uint32_t rc;
|
// uint32_t rc;
|
||||||
|
|
||||||
(void) qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
|
(void) qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
|
||||||
breakdown, Slater_inv, later_updates, later_index,
|
breakdown, Slater_inv, later_updates, later_index,
|
||||||
&later, determinant);
|
&later, determinant);
|
||||||
// rc = qmckl_slagel_splitting(LDS, Dim, N_updates, Updates, Updates_index,
|
// rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
|
||||||
// breakdown, Slater_inv, later_updates, later_index,
|
// breakdown, Slater_inv, later_updates, later_index,
|
||||||
// &later, determinant);
|
// &later, determinant);
|
||||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||||
@ -535,11 +530,11 @@ uint32_t qmckl_sherman_morrison_splitting(
|
|||||||
if (later > 0) {
|
if (later > 0) {
|
||||||
recursive_calls++;
|
recursive_calls++;
|
||||||
// printf("Later > 0\n");
|
// printf("Later > 0\n");
|
||||||
(void) qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
(void) qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||||
later_index, breakdown, Slater_inv,
|
later_index, breakdown, Slater_inv,
|
||||||
determinant);
|
determinant);
|
||||||
|
|
||||||
// rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
// rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||||
// later_index, breakdown, Slater_inv,
|
// later_index, breakdown, Slater_inv,
|
||||||
// determinant);
|
// determinant);
|
||||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
||||||
@ -555,8 +550,8 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||||
uint64_t later_index[N_updates];
|
uint64_t later_index[N_updates];
|
||||||
@ -564,31 +559,31 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
uint32_t rc;
|
uint32_t rc;
|
||||||
|
|
||||||
if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2
|
if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2
|
||||||
rc = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
|
rc = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
|
||||||
breakdown, Slater_inv, determinant);
|
breakdown, Slater_inv, determinant);
|
||||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||||
block_fail += 1;
|
block_fail += 1;
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates,
|
rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates,
|
||||||
Updates_index, breakdown, Slater_inv,
|
Updates_index, breakdown, Slater_inv,
|
||||||
later_updates + (LDS * later),
|
later_updates + (Lds * later),
|
||||||
later_index + later, &l, determinant);
|
later_index + later, &l, determinant);
|
||||||
later += l;
|
later += l;
|
||||||
}
|
}
|
||||||
rc = qmckl_woodbury_2(LDS, Dim, &Updates[2*LDS], &Updates_index[2],
|
rc = qmckl_woodbury_2(Lds, Dim, &Updates[2*Lds], &Updates_index[2],
|
||||||
breakdown, Slater_inv, determinant);
|
breakdown, Slater_inv, determinant);
|
||||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||||
block_fail += 1;
|
block_fail += 1;
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
rc = qmckl_slagel_splitting(LDS, Dim, 2, &Updates[2*LDS],
|
rc = qmckl_slagel_splitting(Lds, Dim, 2, &Updates[2*Lds],
|
||||||
&Updates_index[2], breakdown, Slater_inv,
|
&Updates_index[2], breakdown, Slater_inv,
|
||||||
later_updates + (LDS * later),
|
later_updates + (Lds * later),
|
||||||
later_index + later, &l, determinant);
|
later_index + later, &l, determinant);
|
||||||
later += l;
|
later += l;
|
||||||
}
|
}
|
||||||
if (later > 0) {
|
if (later > 0) {
|
||||||
recursive_calls++;
|
recursive_calls++;
|
||||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||||
later_index, breakdown, Slater_inv,
|
later_index, breakdown, Slater_inv,
|
||||||
determinant);
|
determinant);
|
||||||
}
|
}
|
||||||
@ -600,21 +595,21 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
// Woodbury 3x3 kernel
|
// Woodbury 3x3 kernel
|
||||||
uint32_t n_of_3blocks = N_updates / 3;
|
uint32_t n_of_3blocks = N_updates / 3;
|
||||||
uint32_t remainder = N_updates % 3;
|
uint32_t remainder = N_updates % 3;
|
||||||
uint32_t length_3block = 3 * LDS;
|
uint32_t length_3block = 3 * Lds;
|
||||||
|
|
||||||
if (n_of_3blocks > 0) {
|
if (n_of_3blocks > 0) {
|
||||||
for (uint32_t i = 0; i < n_of_3blocks; i++) {
|
for (uint32_t i = 0; i < n_of_3blocks; i++) {
|
||||||
const double *Updates_3block = &Updates[i * length_3block];
|
const double *Updates_3block = &Updates[i * length_3block];
|
||||||
const uint64_t *Updates_index_3block = &Updates_index[i * 3];
|
const uint64_t *Updates_index_3block = &Updates_index[i * 3];
|
||||||
rc = qmckl_woodbury_3(LDS, Dim, Updates_3block, Updates_index_3block,
|
rc = qmckl_woodbury_3(Lds, Dim, Updates_3block, Updates_index_3block,
|
||||||
breakdown, Slater_inv, determinant);
|
breakdown, Slater_inv, determinant);
|
||||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||||
// printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
|
// printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
|
||||||
block_fail += 1;
|
block_fail += 1;
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
rc = qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block,
|
rc = qmckl_slagel_splitting(Lds, Dim, 3, Updates_3block,
|
||||||
Updates_index_3block, breakdown, Slater_inv,
|
Updates_index_3block, breakdown, Slater_inv,
|
||||||
later_updates + (LDS * later),
|
later_updates + (Lds * later),
|
||||||
later_index + later, &l, determinant);
|
later_index + later, &l, determinant);
|
||||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||||
later += l;
|
later += l;
|
||||||
@ -626,15 +621,15 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
if (remainder == 2) {
|
if (remainder == 2) {
|
||||||
const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
|
const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
|
||||||
const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
|
const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
|
||||||
rc = qmckl_woodbury_2(LDS, Dim, Updates_2block, Updates_index_2block,
|
rc = qmckl_woodbury_2(Lds, Dim, Updates_2block, Updates_index_2block,
|
||||||
breakdown, Slater_inv, determinant);
|
breakdown, Slater_inv, determinant);
|
||||||
if (rc != 0) { // Send the entire block to slagel_splitting
|
if (rc != 0) { // Send the entire block to slagel_splitting
|
||||||
// printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
|
// printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
|
||||||
block_fail += 1;
|
block_fail += 1;
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
rc = qmckl_slagel_splitting(LDS, Dim, 2, Updates_2block,
|
rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates_2block,
|
||||||
Updates_index_2block, breakdown, Slater_inv,
|
Updates_index_2block, breakdown, Slater_inv,
|
||||||
later_updates + (LDS * later),
|
later_updates + (Lds * later),
|
||||||
later_index + later, &l, determinant);
|
later_index + later, &l, determinant);
|
||||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||||
later += l;
|
later += l;
|
||||||
@ -647,9 +642,9 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
|
const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
|
||||||
const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
|
const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
|
||||||
uint64_t l = 0;
|
uint64_t l = 0;
|
||||||
rc = qmckl_slagel_splitting(LDS, Dim, 1, Updates_1block,
|
rc = qmckl_slagel_splitting(Lds, Dim, 1, Updates_1block,
|
||||||
Updates_index_1block, breakdown, Slater_inv,
|
Updates_index_1block, breakdown, Slater_inv,
|
||||||
later_updates + (LDS * later),
|
later_updates + (Lds * later),
|
||||||
later_index + later, &l, determinant);
|
later_index + later, &l, determinant);
|
||||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
|
||||||
later += l;
|
later += l;
|
||||||
@ -658,7 +653,7 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
|||||||
if (later > 0) {
|
if (later > 0) {
|
||||||
recursive_calls++;
|
recursive_calls++;
|
||||||
// printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
// printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
||||||
rc = qmckl_sherman_morrison_splitting(LDS, Dim, later, later_updates,
|
rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
|
||||||
later_index, breakdown, Slater_inv,
|
later_index, breakdown, Slater_inv,
|
||||||
determinant);
|
determinant);
|
||||||
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
|
||||||
@ -674,10 +669,10 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||||
double *__restrict determinant) {
|
double *__restrict determinant) {
|
||||||
|
|
||||||
const uint32_t Dim = 21;
|
const uint32_t Dim = DIM;
|
||||||
const uint32_t LDS = 24;
|
const uint32_t Lds = LDS;
|
||||||
|
|
||||||
double __attribute__((aligned(8))) C[Dim];
|
double __attribute__((aligned(8))) C[DIM];
|
||||||
double __attribute__((aligned(8))) D[LDS];
|
double __attribute__((aligned(8))) D[LDS];
|
||||||
|
|
||||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||||
@ -693,8 +688,8 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
C[i] = 0.0;
|
C[i] = 0.0;
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
C[i] += Slater_inv[i * LDS + j] * Updates[l * LDS + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
C[i] += Slater_inv[i * Lds + j] * Updates[l * Lds + j]; // regular mat-vec product, but actually working on S_inv^T * U_l.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -705,8 +700,8 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
// for (uint32_t i = 0; i < Dim; i++) {
|
// for (uint32_t i = 0; i < Dim; i++) {
|
||||||
for (uint32_t i = 0; i < LDS; i++) {
|
for (uint32_t i = 0; i < Lds; i++) {
|
||||||
later_updates[later * LDS + i] = Updates[l * LDS + i];
|
later_updates[later * Lds + i] = Updates[l * Lds + i];
|
||||||
}
|
}
|
||||||
later_index[later] = Updates_index[l];
|
later_index[later] = Updates_index[l];
|
||||||
later++;
|
later++;
|
||||||
@ -720,17 +715,17 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
// D = v^T x A^{-1}
|
// D = v^T x A^{-1}
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
D[j] = Slater_inv[cui * LDS + j];
|
D[j] = Slater_inv[cui * Lds + j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// S^{-1} = S^{-1} - C x D / den
|
// S^{-1} = S^{-1} - C x D / den
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
const double update = C[i] * D[j] * iden;
|
const double update = C[i] * D[j] * iden;
|
||||||
Slater_inv[i * LDS + j] -= update;
|
Slater_inv[i * Lds + j] -= update;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
l += 1;
|
l += 1;
|
||||||
@ -741,7 +736,7 @@ uint32_t qmckl_sherman_morrison_later(
|
|||||||
}
|
}
|
||||||
else if (later > 0) { // If some have failed, make a recursive call
|
else if (later > 0) { // If some have failed, make a recursive call
|
||||||
recursive_calls++;
|
recursive_calls++;
|
||||||
(void) qmckl_sherman_morrison_later(LDS, Dim, later, later_updates,
|
(void) qmckl_sherman_morrison_later(Lds, Dim, later, later_updates,
|
||||||
later_index, breakdown, Slater_inv, determinant);
|
later_index, breakdown, Slater_inv, determinant);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
|
#include <stdint.h>
|
||||||
#include "meuk.h"
|
#include "meuk.h"
|
||||||
#include "cycles.h"
|
#include "cycles.h"
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
|
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
|
||||||
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
|
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
|
||||||
@ -23,11 +23,11 @@ int main(int argc, char **argv) {
|
|||||||
char slater_key[32];
|
char slater_key[32];
|
||||||
char slater_inv_key[32];
|
char slater_inv_key[32];
|
||||||
char det_key[32];
|
char det_key[32];
|
||||||
const uint64_t Dim = 21;
|
const uint64_t Dim = DIM;
|
||||||
const uint64_t LDS = 24;
|
const uint64_t Lds = LDS;
|
||||||
uint64_t N_updates;
|
uint64_t N_updates;
|
||||||
double Slater[LDS * Dim ], SlaterT[LDS * Dim];
|
double Slater[LDS * DIM ], SlaterT[LDS * DIM];
|
||||||
double Slater_invT[LDS * Dim], Slater_invT_copy[LDS * Dim];
|
double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
|
||||||
double determinant, determinant_copy;
|
double determinant, determinant_copy;
|
||||||
|
|
||||||
// SETUP TEST PARAMETERS
|
// SETUP TEST PARAMETERS
|
||||||
@ -52,7 +52,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
sprintf(det_key, "/cycle_%d/determinant", cycle);
|
sprintf(det_key, "/cycle_%d/determinant", cycle);
|
||||||
read_uint(file_id, nupds_key, &N_updates);
|
read_uint(file_id, nupds_key, &N_updates);
|
||||||
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
|
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
|
||||||
double *Updates = malloc(LDS * N_updates * sizeof(double));
|
double *Updates = malloc(Lds * N_updates * sizeof(double));
|
||||||
read_uint(file_id, upd_idx_key, Updates_index);
|
read_uint(file_id, upd_idx_key, Updates_index);
|
||||||
read_double(file_id, upds_key, Updates);
|
read_double(file_id, upds_key, Updates);
|
||||||
read_double(file_id, slater_key, Slater);
|
read_double(file_id, slater_key, Slater);
|
||||||
@ -60,28 +60,28 @@ printf("#-----------------------------------------------------------------------
|
|||||||
read_double(file_id, det_key, &determinant);
|
read_double(file_id, det_key, &determinant);
|
||||||
|
|
||||||
// Compute transpose of S. ST: 24 x 21
|
// Compute transpose of S. ST: 24 x 21
|
||||||
for (int i = 0; i < LDS; i++) {
|
for (int i = 0; i < Lds; i++) {
|
||||||
for (int j = 0; j < Dim; j++) {
|
for (int j = 0; j < Dim; j++) {
|
||||||
SlaterT[i * Dim + j] = Slater[j * LDS + i];
|
SlaterT[i * Dim + j] = Slater[j * Lds + i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert repl. upds into additive upds.
|
// Convert repl. upds into additive upds.
|
||||||
for (int i = 0; i < N_updates; i++) {
|
for (int i = 0; i < N_updates; i++) {
|
||||||
int col = Updates_index[i] - 1;
|
int col = Updates_index[i] - 1;
|
||||||
for (int j = 0; j < LDS; j++) {
|
for (int j = 0; j < Lds; j++) {
|
||||||
Updates[i * LDS + j] -= SlaterT[col + j * Dim];
|
Updates[i * Lds + j] -= SlaterT[col + j * Dim];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
|
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
|
||||||
uint32_t err_inp = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
|
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
|
||||||
|
|
||||||
// Update Slater matrix
|
// Update Slater matrix
|
||||||
for (int i = 0; i < N_updates; i++) {
|
for (int i = 0; i < N_updates; i++) {
|
||||||
int col = Updates_index[i] - 1;
|
int col = Updates_index[i] - 1;
|
||||||
for (int j = 0; j < Dim; j++) {
|
for (int j = 0; j < Dim; j++) {
|
||||||
SlaterT[col + j * Dim] += Updates[i * LDS + j];
|
SlaterT[col + j * Dim] += Updates[i * Lds + j];
|
||||||
}
|
}
|
||||||
} // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
|
} // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
|
||||||
|
|
||||||
@ -98,7 +98,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
for (int rep = 0; rep < REPETITIONS; rep++) {
|
for (int rep = 0; rep < REPETITIONS; rep++) {
|
||||||
|
|
||||||
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
|
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
|
||||||
memcpy(Slater_invT_copy, Slater_invT, LDS * Dim * sizeof(double));
|
memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double));
|
||||||
determinant_copy = determinant;
|
determinant_copy = determinant;
|
||||||
|
|
||||||
// ### CHOOSE A KERNEL:
|
// ### CHOOSE A KERNEL:
|
||||||
@ -109,10 +109,10 @@ printf("#-----------------------------------------------------------------------
|
|||||||
|
|
||||||
// err_break = 0;
|
// err_break = 0;
|
||||||
|
|
||||||
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
|
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
|
||||||
|
|
||||||
// for (int i = 0; i < N_updates; i++) {
|
// for (int i = 0; i < N_updates; i++) {
|
||||||
// Upds = &Updates[i * LDS];
|
// Upds = &Updates[i * Lds];
|
||||||
// Ui = &Updates_index[i];
|
// Ui = &Updates_index[i];
|
||||||
// determinant_previous = determinant_copy;
|
// determinant_previous = determinant_copy;
|
||||||
|
|
||||||
@ -120,7 +120,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
// uint64_t before = rdtsc();
|
// uint64_t before = rdtsc();
|
||||||
|
|
||||||
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
// detupd(Dim, LDS, Upds, Ui, Slater_invT_copy, &determinant_copy);
|
// detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
|
||||||
|
|
||||||
// // 3. FETCH FINISH TIME
|
// // 3. FETCH FINISH TIME
|
||||||
// uint64_t after = rdtsc();
|
// uint64_t after = rdtsc();
|
||||||
@ -137,9 +137,9 @@ printf("#-----------------------------------------------------------------------
|
|||||||
// }
|
// }
|
||||||
|
|
||||||
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
|
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
|
||||||
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
|
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
|
||||||
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
|
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
|
||||||
// for (int i = 0; i < LDS * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
|
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
|
||||||
// }
|
// }
|
||||||
// } else if (version[0] == 'n') { // Naive
|
// } else if (version[0] == 'n') { // Naive
|
||||||
if (version[0] == 'n') { // Naive
|
if (version[0] == 'n') { // Naive
|
||||||
@ -147,7 +147,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_sherman_morrison(LDS, Dim, N_updates, Updates,
|
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -161,7 +161,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_sherman_morrison_later(LDS, Dim, N_updates, Updates,
|
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -175,7 +175,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_woodbury_2(LDS, Dim, Updates, Updates_index,
|
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
|
||||||
breakdown, Slater_invT_copy, &determinant);
|
breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -190,7 +190,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_woodbury_3(LDS, Dim, Updates, Updates_index,
|
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
|
||||||
breakdown, Slater_invT_copy, &determinant);
|
breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -205,7 +205,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_woodbury_k(LDS, Dim, N_updates, Updates,
|
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -220,7 +220,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_woodbury_k_cublas_offload(LDS, Dim, N_updates, Updates,
|
err_break = qmckl_woodbury_k_cublas_offload(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -235,7 +235,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_sherman_morrison_splitting(LDS, Dim, N_updates, Updates,
|
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -249,7 +249,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
uint64_t before = rdtsc();
|
uint64_t before = rdtsc();
|
||||||
|
|
||||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||||
err_break = qmckl_sherman_morrison_smw32s(LDS, Dim, N_updates, Updates,
|
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
|
||||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||||
|
|
||||||
// 3. FETCH FINISH TIME
|
// 3. FETCH FINISH TIME
|
||||||
@ -260,7 +260,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
} else if (version[0] == 'm') { // LAPACK/MKL
|
} else if (version[0] == 'm') { // LAPACK/MKL
|
||||||
|
|
||||||
// Only send upper Dim x Dim part of matrix to lapack
|
// Only send upper Dim x Dim part of matrix to lapack
|
||||||
double tmp[Dim*Dim];
|
double tmp[DIM *DIM];
|
||||||
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
|
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
|
||||||
|
|
||||||
// 1. FETCH START TIME
|
// 1. FETCH START TIME
|
||||||
@ -274,9 +274,9 @@ printf("#-----------------------------------------------------------------------
|
|||||||
|
|
||||||
// Copy elements of inverse back, adding 0-padding in "correct" place
|
// Copy elements of inverse back, adding 0-padding in "correct" place
|
||||||
for (uint32_t i = 0; i < Dim; i++) {
|
for (uint32_t i = 0; i < Dim; i++) {
|
||||||
for (uint32_t j = 0; j < LDS; j++) {
|
for (uint32_t j = 0; j < Lds; j++) {
|
||||||
if (j < Dim) Slater_invT_copy[i * LDS + j] = tmp[i * Dim + j];
|
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
|
||||||
else Slater_invT_copy[i * LDS + j] = 0.0;
|
else Slater_invT_copy[i * Lds + j] = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -290,7 +290,7 @@ printf("#-----------------------------------------------------------------------
|
|||||||
} // END OF REPETITIONS LOOP
|
} // END OF REPETITIONS LOOP
|
||||||
|
|
||||||
// 4. COPY RESULT BACK TO ORIGINAL
|
// 4. COPY RESULT BACK TO ORIGINAL
|
||||||
memcpy(Slater_invT, Slater_invT_copy, LDS * Dim * sizeof(double));
|
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
|
||||||
determinant = determinant_copy;
|
determinant = determinant_copy;
|
||||||
// At this point Slater_invT contains the correct inverse matrix
|
// At this point Slater_invT contains the correct inverse matrix
|
||||||
|
|
||||||
@ -306,20 +306,20 @@ printf("#-----------------------------------------------------------------------
|
|||||||
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
|
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
|
||||||
cumulative += accumulator;
|
cumulative += accumulator;
|
||||||
|
|
||||||
double SSi[Dim * Dim];
|
double SSi[DIM * DIM];
|
||||||
matmul(SlaterT, Slater_invT, SSi, LDS, Dim);
|
matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
|
||||||
double Res[Dim * Dim];
|
double Res[DIM * DIM];
|
||||||
residual(SSi, Res, Dim);
|
residual(SSi, Res, Dim);
|
||||||
const double max = max_norm(Res, Dim, Dim);
|
const double max = max_norm(Res, Dim, Dim);
|
||||||
|
|
||||||
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
|
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
|
||||||
uint32_t err_out = check_error(LDS, Dim, Slater_invT, SlaterT, tolerance);
|
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
|
||||||
// int32_t err_out = check_error_better(max, tolerance);
|
// int32_t err_out = check_error_better(max, tolerance);
|
||||||
|
|
||||||
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
|
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
|
||||||
|
|
||||||
// 8. COMPUTE CONDITION NUMBER
|
// 8. COMPUTE CONDITION NUMBER
|
||||||
const double condnr = condition_number(Slater, Slater_invT, LDS, Dim);
|
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
|
||||||
const double frob = frobenius_norm(Res, Dim, Dim);
|
const double frob = frobenius_norm(Res, Dim, Dim);
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
41 15 0 0 1 0 0 3.438043e+01 9.578187e+01 2.792190e+04 6.417383e+02 4.278255e+01 6.417383e+02 0
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
||||||
#1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
|
||||||
#CYCLE UPDS ERR_IN ERR_BREAK ERR_OUT SPLITS BLK_FAILS MAX FROB COND CPU_CYC CPU_CYC/UPD CUMUL REC
|
|
||||||
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
Loading…
Reference in New Issue
Block a user