Improved version.

- All static arrays replaced by dynamic ones
- All overhead induced by checking before and after running of the kernels replaced as much as possible with calls to MKL/DGEMMs.
- Solved bugs due to dimension mismatches.

Overhead time is dramatically reduced because no more calls to naive 'matmul'.
This commit is contained in:
Francois Coppens 2022-10-01 18:54:18 +02:00
parent 15f959099d
commit bba5cf5f2c
9 changed files with 252 additions and 217 deletions

View File

@ -1,6 +1,7 @@
#FC = ifx
#CC = nvc
#CFLAGS=-std=c99 -O0 -Wall -g -mp -target=gpu
CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu
INCLUDE=-I$(NVHPC_ROOT)/math_libs/include

View File

@ -0,0 +1,2 @@
const uint32_t n_cycles = 1;
uint32_t cycles[n_cycles] = {1};

View File

@ -1,5 +1,15 @@
#!/bin/bash
make
rm -v blocked kay cu
./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu
rm -v blocked kay cu em
make &&
./test m > em && \
./test b > blocked && \
./test k > kay && \
./test c > cu && \
head -n 4 em && \
awk 'NR==5' em && \
awk 'NR==5' blocked && \
awk 'NR==5' kay && \
awk 'NR==5' cu

View File

@ -1 +1 @@
cycles_329_dets/all_cycles.h
bench_cycles/1_cycle.h

View File

@ -1,8 +1,6 @@
#include <mkl_lapacke.h>
#include <mkl.h>
#define DIM 21
#define LDS 24
#define HAVE_CUBLAS_OFFLOAD
#ifdef HAVE_CUBLAS_OFFLOAD

View File

@ -2,6 +2,89 @@
#include <stdint.h>
#include <assert.h>
void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim) {
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Lds; j++) {
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
else Slater_invT_copy[i * Lds + j] = 0.0;
}
}
}
void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds) {
for (int i = 0; i < nupds; i++) {
int col = ui[i] - 1;
for (int j = 0; j < Dim; j++) {
slaterT[col + j * Dim] += upds[i * Lds + j];
}
}
}
void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds) {
for (int i = 0; i < nupds; i++) {
int col = ui[i] - 1;
for (int j = 0; j < Lds; j++) {
upds[i * Lds + j] -= slaterT[col + j * Dim];
}
}
}
double get_determinant(uint32_t cycle, hid_t file_id) {
char det_key[32];
sprintf(det_key, "/cycle_%d/determinant", cycle);
double determinant;
read_double(file_id, det_key, &determinant);
return determinant;
}
double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) {
char slater_inv_key[32];
sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
double *slater_inv = malloc(sizeof *slater_inv * Dim * Lds);
read_double(file_id, slater_inv_key, slater_inv);
return slater_inv;
}
double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) {
char slater_key[32];
sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
double *slater = malloc(sizeof *slater * Dim * Lds);
read_double(file_id, slater_key, slater);
return slater;
}
double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds) {
char upds_key[32];
sprintf(upds_key, "/cycle_%d/updates", cycle);
double *upds = malloc(sizeof *upds * Lds * nupds);
read_double(file_id, upds_key, upds);
return upds;
}
uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds) {
char upd_idx_key[32];
sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
uint64_t* uis = malloc(sizeof *uis * nupds);
read_uint(file_id, upd_idx_key, uis);
return uis;
}
uint64_t get_dim(uint32_t cycle, hid_t file_id) {
char dim_key[32];
sprintf(dim_key, "/cycle_%d/slater_matrix_dim", cycle);
uint64_t Dim;
read_uint(file_id, dim_key, &Dim);
return Dim;
}
uint64_t get_nupdates(uint32_t cycle, hid_t file_id) {
char nupds_key[32];
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
uint64_t N_updates;
read_uint(file_id, nupds_key, &N_updates);
return N_updates;
}
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
double sum2 = 0;
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
@ -65,16 +148,12 @@ void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
double *Slater, const double tolerance) {
double res[Dim*Dim];
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Dim; j++) {
res[i * Dim + j] = 0;
for (uint32_t k = 0; k < Dim; k++) {
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
}
}
}
double* res = malloc(sizeof *res * Dim * Dim);
double alpha = 1.0, beta = 0.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
Dim, Dim, Dim,
alpha, Slater, Dim, Slater_invT, Lds,
beta, res, Dim);
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Dim; j++) {
@ -84,20 +163,20 @@ uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT
if (i != j && fabs(elm) > tolerance) return 1;
}
}
free(res);
return 0;
}
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Dim; j++) {
prod[i * Dim + j] = 0;
for (uint32_t k = 0; k < Dim; k++) {
prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
}
}
}
}
//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
// for (uint32_t i = 0; i < Dim; i++) {
// for (uint32_t j = 0; j < Dim; j++) {
// prod[i * Dim + j] = 0;
// for (uint32_t k = 0; k < Dim; k++) {
// prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
// }
// }
// }
//}
int32_t check_error_better(const double max, const double tolerance) {
if (max < 0) return -1; // When max was a NaN
@ -119,21 +198,7 @@ uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
const uint64_t *Updates_index, const double breakdown, const double tolerance,
double *Slater, double *Slater_inv, double *determinant) {
uint32_t rc = 0;
// if (version[0] == 'a') { // Anthony
// const double *Upds;
// const uint64_t *Ui;
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
// for (int j = 0; j < N_updates; j++) {
// Upds = &Updates[j * Lds];
// Ui = &Updates_index[j];
// detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
// if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
// }
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
// update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
// rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
// if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
// } else if (version[0] == 'n') { // Naive
if (version[0] == 'n') { // Naive
rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
breakdown, Slater_inv, determinant);

View File

@ -5,6 +5,7 @@
#include <stdlib.h>
#include <string.h>
#include <hdf5/serial/hdf5.h>
#include "kernels.h"
typedef struct Error {
@ -12,7 +13,18 @@ typedef struct Error {
uint64_t error;
} Error;
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim);
void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds);
void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds);
double get_determinant(uint32_t cycle, hid_t file_id);
double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds);
uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds);
uint64_t get_dim(uint32_t cycle, hid_t file_id);
uint64_t get_nupdates(uint32_t cycle, hid_t file_id);
//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
void residual(double *a, double *res, const uint64_t Dim);
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);

View File

@ -2,7 +2,7 @@
#include <stdint.h>
#include <stdbool.h>
#include "kernels.h"
#include "debug.h"
//#include "debug.h"
extern uint64_t n_splits;
extern uint64_t block_fail;
@ -19,11 +19,11 @@ uint32_t qmckl_sherman_morrison(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
double __attribute__((aligned(8))) C[DIM];
double __attribute__((aligned(8))) D[LDS];
double __attribute__((aligned(8))) C[Dim];
double __attribute__((aligned(8))) D[Lds];
uint32_t l = 0;
// For each update
@ -48,7 +48,7 @@ uint32_t qmckl_sherman_morrison(
double iden = 1.0 / den;
// Update det(A)
if (!determinant)
if (determinant)
*determinant *= den;
#pragma ivdep
@ -89,14 +89,14 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
const uint32_t row1 = (Updates_index[0] - 1);
const uint32_t row2 = (Updates_index[1] - 1);
// Compute C = (S^T)^{-1}U : Dim x 2
double __attribute__((aligned(8))) C[2 * DIM];
double __attribute__((aligned(8))) C[2 * Dim];
for (uint32_t i = 0; i < Dim; i++) {
C[i * 2] = 0;
C[i * 2 + 1] = 0;
@ -132,7 +132,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
Binv[3] = idet * B0;
// tmp = B^{-1}D : 2 x LDS
double __attribute__((aligned(8))) tmp[2 * LDS];
double __attribute__((aligned(8))) tmp[2 * Lds];
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
#pragma ivdep
@ -173,15 +173,15 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
const uint32_t row1 = (Updates_index[0] - 1);
const uint32_t row2 = (Updates_index[1] - 1);
const uint32_t row3 = (Updates_index[2] - 1);
// Compute C = (S^T)^{-1}U : Dim x 3
double __attribute__((aligned(8))) C[3 * DIM];
double __attribute__((aligned(8))) C[3 * Dim];
for (uint32_t i = 0; i < Dim; i++) {
C[i * 3] = 0;
C[i * 3 + 1] = 0;
@ -231,10 +231,10 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
Binv[8] = (B0 * B4 - B3 * B1) * idet;
// tmp = B^{-1}D : 3 x LDS
double __attribute__((aligned(8))) tmp[3 * LDS];
double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
double __attribute__((aligned(8))) tmp[3 * Lds];
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
double *__restrict r3dim = &(Slater_inv[row3 * Lds]);
#pragma ivdep
#pragma vector aligned
for (uint32_t j = 0; j < Lds; j++) {
@ -277,11 +277,11 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
// Compute C = S^{-1} U : Dim x K : standard dgemm
double *C = calloc(1, DIM * N_updates * sizeof(double));
double *C = calloc(1, Dim * N_updates * sizeof(double));
double alpha = 1.0, beta = 0.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
Dim, N_updates, Lds,
@ -289,7 +289,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
beta, C, N_updates);
// Construct B = 1 + V C : K x K, construct D = V S^{-1} : K x LDS
double B[N_updates * N_updates], D[N_updates * LDS];
double* B = calloc(1, sizeof *B * N_updates * N_updates);
double* D = calloc(1, sizeof *D * N_updates * Lds);
for (uint32_t i = 0; i < N_updates; i++) {
const uint32_t row = Updates_index[i] - 1;
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
@ -316,19 +317,23 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
(void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, pivot);
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
double tmp1[N_updates * LDS];
double* tmp1 = calloc(1, sizeof *tmp1 * N_updates * Lds);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
N_updates, LDS, N_updates,
alpha, B, N_updates, D, LDS,
beta, tmp1, LDS);
N_updates, Lds, N_updates,
alpha, B, N_updates, D, Lds,
beta, tmp1, Lds);
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
alpha = -1.0, beta = 1.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
Dim, LDS, N_updates,
alpha, C, N_updates, tmp1, LDS,
beta, Slater_inv, LDS);
Dim, Lds, N_updates,
alpha, C, N_updates, tmp1, Lds,
beta, Slater_inv, Lds);
free(C);
free(B);
free(D);
free(tmp1);
free(pivot);
return 0;
}
@ -344,10 +349,12 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
double* Slater_inv,
double* determinant)
{
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
double alpha, beta;
bool swap;
uint32_t j;
double alpha, beta, det;
int* pivot = calloc(1, sizeof *pivot * N_updates);
double* C = calloc(1, sizeof *C * Dim * N_updates);
double* B = calloc(1, sizeof *B * N_updates * N_updates);
@ -396,19 +403,22 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
// Compute determinant by LU decomposition
(void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info);
bool swap = false; uint32_t j = 0; double det = 1.0f;
swap = false; j = 0; det = 1.0f;
#pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det)
for (uint32_t i = 0; i < N_updates; i++) {
swap = (bool)(pivot[i] - (i + 1)); // swap = {0->false: no swap, >0->true: swap}
j += (uint32_t)swap; // count # of swaps
det *= B[i * (N_updates + 1)]; // prod. of diag elm. of B
}
if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early.
}
if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early.
if (determinant) { // update det(Slater) if determinant != NULL
if ((j & 1) != 0) det = -det; // multiply det with -1 if # of swaps is odd
*determinant *= det;
}
#pragma omp target data use_device_ptr(Slater_inv, Updates, C, B, workspace, pivot, Binv, D, T1, T2)
{
// Compute B^{-1} : initialise as I for solving BX=I
#pragma omp target teams distribute parallel for
for (int i = 0; i < N_updates; ++i) {
@ -471,11 +481,11 @@ uint32_t qmckl_slagel_splitting(
uint64_t *__restrict later_index, uint64_t *__restrict later,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
double __attribute__((aligned(8))) C[LDS];
double __attribute__((aligned(8))) D[LDS];
double __attribute__((aligned(8))) C[Lds];
double __attribute__((aligned(8))) D[Lds];
uint32_t l = 0;
// For each update
@ -513,7 +523,7 @@ uint32_t qmckl_slagel_splitting(
} // From here onwards we continue with applying the first halve of the update to Slater_inv
double iden = 1.0 / den;
if (!determinant) *determinant *= den;
if (determinant) *determinant *= den;
// D = v^T x S^{-1} : 1 x LDS
#pragma ivdep
@ -544,10 +554,10 @@ uint32_t qmckl_sherman_morrison_splitting(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
double __attribute__((aligned(8))) later_updates[Lds * N_updates];
uint64_t later_index[N_updates];
uint64_t later = 0;
// uint32_t rc;
@ -583,10 +593,10 @@ uint32_t qmckl_sherman_morrison_smw32s(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
double __attribute__((aligned(8))) later_updates[Lds * N_updates];
uint64_t later_index[N_updates];
uint64_t later = 0;
uint32_t rc;
@ -702,13 +712,13 @@ uint32_t qmckl_sherman_morrison_later(
double *__restrict __attribute__((aligned(8))) Slater_inv,
double *__restrict determinant) {
const uint32_t Dim = DIM;
const uint32_t Lds = LDS;
const uint32_t Dim = vDim;
const uint32_t Lds = vLDS;
double __attribute__((aligned(8))) C[DIM];
double __attribute__((aligned(8))) D[LDS];
double __attribute__((aligned(8))) C[Dim];
double __attribute__((aligned(8))) D[Lds];
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
double __attribute__((aligned(8))) later_updates[Lds * N_updates];
uint64_t later_index[N_updates];
uint64_t later = 0;
@ -743,7 +753,7 @@ uint32_t qmckl_sherman_morrison_later(
}
double iden = 1.0 / den;
if (!determinant) *determinant *= den;
if (determinant) *determinant *= den;
// D = v^T x A^{-1}
#pragma ivdep

View File

@ -1,9 +1,9 @@
#include <stdint.h>
#include <stdint.h>
#include "meuk.h"
#include "cycles.h"
#include "debug.h"
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
#define DATASET "dataset"
#define REPETITIONS 1
uint64_t n_splits;
@ -27,77 +27,46 @@ int main(int argc, char **argv) {
}
#endif
// SETUP STORAGE AND DATA ACCESS
hid_t file_id;
file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
char nupds_key[32];
char upd_idx_key[32];
char upds_key[32];
char slater_key[32];
char slater_inv_key[32];
char det_key[32];
const uint64_t Dim = DIM;
const uint64_t Lds = LDS;
uint64_t N_updates;
double Slater[LDS * DIM ], SlaterT[LDS * DIM];
double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
double determinant, determinant_copy;
hid_t file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
// SETUP TEST PARAMETERS
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
const double tolerance = 0.001; // default = 0.001
double cumulative = 0;
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
// FOR EACH UPDATE CYCLE DO:
// for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) {
for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) {
// SETUP TEST PARAMETERS
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
const double tolerance = 0.001; // default = 0.001
double cumulative = 0;
// 1. READ DATA FROM DATASET
uint32_t cycle = cycles[cycles_index];
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
sprintf(upds_key, "/cycle_%d/updates", cycle);
sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
sprintf(det_key, "/cycle_%d/determinant", cycle);
read_uint(file_id, nupds_key, &N_updates);
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
double *Updates = malloc(Lds * N_updates * sizeof(double));
read_uint(file_id, upd_idx_key, Updates_index);
read_double(file_id, upds_key, Updates);
read_double(file_id, slater_key, Slater);
read_double(file_id, slater_inv_key, Slater_invT);
read_double(file_id, det_key, &determinant);
uint64_t Dim = get_dim(cycle, file_id);
uint64_t Lds = Dim;
uint64_t N_updates = get_nupdates(cycle, file_id);
uint64_t* Updates_index = get_upd_idcs(cycle, file_id, N_updates);
double* Updates = get_upds(cycle, file_id, N_updates, Lds);
double* Slater = get_slater(cycle, file_id, Dim, Lds);
double* Slater_invT = get_slater_inv(cycle, file_id, Dim, Lds);
double *Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds);
double determinant = get_determinant(cycle, file_id), determinant_copy;
// Compute transpose of S. ST: 24 x 21
for (int i = 0; i < Lds; i++) {
for (int j = 0; j < Dim; j++) {
SlaterT[i * Dim + j] = Slater[j * Lds + i];
}
}
double *SlaterT = calloc(1, sizeof *SlaterT * Dim * Lds);
transpose(Slater, Lds, SlaterT, Dim, Dim, Lds);
// Convert repl. upds into additive upds.
for (int i = 0; i < N_updates; i++) {
int col = Updates_index[i] - 1;
for (int j = 0; j < Lds; j++) {
Updates[i * Lds + j] -= SlaterT[col + j * Dim];
}
}
convert(Updates, N_updates, Updates_index, SlaterT, Dim, Lds);
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// Update Slater matrix
for (int i = 0; i < N_updates; i++) {
int col = Updates_index[i] - 1;
for (int j = 0; j < Dim; j++) {
SlaterT[col + j * Dim] += Updates[i * Lds + j];
}
} // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
update(SlaterT, Updates, Updates_index, N_updates, Dim, Lds);
int32_t err_break;
@ -116,53 +85,13 @@ printf("#-----------------------------------------------------------------------
determinant_copy = determinant;
// ### CHOOSE A KERNEL:
// if (version[0] == 'a') { // Anthony
// const double *Upds;
// const uint64_t *Ui;
// double determinant_previous;
// err_break = 0;
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
// for (int i = 0; i < N_updates; i++) {
// Upds = &Updates[i * Lds];
// Ui = &Updates_index[i];
// determinant_previous = determinant_copy;
// // 1. FETCH START TIME
// uint64_t before = rdtsc();
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
// detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
// // 3. FETCH FINISH TIME
// uint64_t after = rdtsc();
// // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
// accumulator += (double)(after - before);
// // 5. STOP APPLYING UPDATES IF BREAKDOWN DETECTED
// double lambda = determinant_copy / determinant_previous; // should be id. to lambda in detupd
// if (fabs(lambda) < breakdown) {
// err_break = 1;
// break;
// }
// }
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
// }
// } else if (version[0] == 'n') { // Naive
if (version[0] == 'n') { // Naive
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -176,7 +105,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -190,7 +119,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -205,7 +134,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -220,7 +149,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -236,7 +165,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k_cublas_offload(handle, s_handle, Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -251,7 +180,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -265,7 +194,7 @@ printf("#-----------------------------------------------------------------------
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
@ -275,8 +204,8 @@ printf("#-----------------------------------------------------------------------
} else if (version[0] == 'm') { // LAPACK/MKL
// Only send upper Dim x Dim part of matrix to lapack
double tmp[DIM *DIM];
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
double *tmp = malloc(sizeof *tmp * Dim * Dim);
memcpy(tmp, SlaterT, sizeof *SlaterT * Dim * Dim);
// 1. FETCH START TIME
uint64_t before = rdtsc();
@ -288,12 +217,8 @@ printf("#-----------------------------------------------------------------------
uint64_t after = rdtsc();
// Copy elements of inverse back, adding 0-padding in "correct" place
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Lds; j++) {
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
else Slater_invT_copy[i * Lds + j] = 0.0;
}
}
copy(Slater_invT_copy, Lds, tmp, Dim);
free(tmp);
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
@ -302,9 +227,10 @@ printf("#-----------------------------------------------------------------------
printf("Version '%c' not implemented.\n", version[0]);
return 1;
}
} // END OF REPETITIONS LOOP
// 4. COPY RESULT BACK TO ORIGINAL
// 4. COPY RESULT BACK TO ORIGINAL
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
determinant = determinant_copy;
// At this point Slater_invT contains the correct inverse matrix
@ -321,11 +247,19 @@ printf("#-----------------------------------------------------------------------
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
cumulative += accumulator;
double SSi[DIM * DIM];
matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
double Res[DIM * DIM];
// Compute Slater x Slater_inv
double* SSi = malloc(sizeof *SSi * Dim * Dim);
double alpha = 1.0, beta = 0.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
Dim, Dim, Dim,
alpha, SlaterT, Dim, Slater_invT, Lds,
beta, SSi, Dim);
// Compute residual matrix S * Sinv - Id
double* Res = calloc(1, sizeof *Res * Dim * Dim);
residual(SSi, Res, Dim);
const double max = max_norm(Res, Dim, Dim);
free(SSi);
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
@ -336,13 +270,16 @@ printf("#-----------------------------------------------------------------------
// 8. COMPUTE CONDITION NUMBER
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
const double frob = frobenius_norm(Res, Dim, Dim);
free(Res);
// 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
free(Updates_index);
free(Updates);
free(SlaterT);
free(Slater_invT);
free(Slater_invT_copy);
} // END OF CYCLE LOOP
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
@ -352,7 +289,7 @@ printf("#-----------------------------------------------------------------------
(void) H5Fclose(file_id);
#ifdef HAVE_CUBLAS_OFFLOAD
#ifdef HAVE_CUBLAS_OFFLOAD
cublasDestroy(handle);
cusolverDnDestroy(s_handle);
#endif