mirror of
https://github.com/TREX-CoE/Sherman-Morrison.git
synced 2025-01-13 06:28:35 +01:00
Improved version.
- All static arrays replaced by dynamic ones - All overhead induced by checking before and after running of the kernels replaced as much as possible with calls to MKL/DGEMMs. - Solved bugs due to dimension mismatches. Overhead time is dramatically reduced because no more calls to naive 'matmul'.
This commit is contained in:
parent
15f959099d
commit
bba5cf5f2c
@ -1,6 +1,7 @@
|
||||
#FC = ifx
|
||||
#CC = nvc
|
||||
|
||||
#CFLAGS=-std=c99 -O0 -Wall -g -mp -target=gpu
|
||||
CFLAGS=-std=c99 -O3 -Wall -g -mp -target=gpu
|
||||
|
||||
INCLUDE=-I$(NVHPC_ROOT)/math_libs/include
|
||||
|
2
independent_test_harness/bench_cycles/1_cycle.h
Normal file
2
independent_test_harness/bench_cycles/1_cycle.h
Normal file
@ -0,0 +1,2 @@
|
||||
const uint32_t n_cycles = 1;
|
||||
uint32_t cycles[n_cycles] = {1};
|
@ -1,5 +1,15 @@
|
||||
#!/bin/bash
|
||||
make
|
||||
rm -v blocked kay cu
|
||||
./test b > blocked && ./test k > kay && ./test c > cu && head -n 4 cu && awk 'NR==5' blocked && awk 'NR==5' kay && awk 'NR==5' cu
|
||||
|
||||
rm -v blocked kay cu em
|
||||
|
||||
make &&
|
||||
./test m > em && \
|
||||
./test b > blocked && \
|
||||
./test k > kay && \
|
||||
./test c > cu && \
|
||||
head -n 4 em && \
|
||||
awk 'NR==5' em && \
|
||||
awk 'NR==5' blocked && \
|
||||
awk 'NR==5' kay && \
|
||||
awk 'NR==5' cu
|
||||
|
||||
|
@ -1 +1 @@
|
||||
cycles_329_dets/all_cycles.h
|
||||
bench_cycles/1_cycle.h
|
@ -1,8 +1,6 @@
|
||||
#include <mkl_lapacke.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#define DIM 21
|
||||
#define LDS 24
|
||||
#define HAVE_CUBLAS_OFFLOAD
|
||||
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
|
@ -2,6 +2,89 @@
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim) {
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
|
||||
else Slater_invT_copy[i * Lds + j] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds) {
|
||||
for (int i = 0; i < nupds; i++) {
|
||||
int col = ui[i] - 1;
|
||||
for (int j = 0; j < Dim; j++) {
|
||||
slaterT[col + j * Dim] += upds[i * Lds + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds) {
|
||||
for (int i = 0; i < nupds; i++) {
|
||||
int col = ui[i] - 1;
|
||||
for (int j = 0; j < Lds; j++) {
|
||||
upds[i * Lds + j] -= slaterT[col + j * Dim];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double get_determinant(uint32_t cycle, hid_t file_id) {
|
||||
char det_key[32];
|
||||
sprintf(det_key, "/cycle_%d/determinant", cycle);
|
||||
double determinant;
|
||||
read_double(file_id, det_key, &determinant);
|
||||
return determinant;
|
||||
}
|
||||
|
||||
double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) {
|
||||
char slater_inv_key[32];
|
||||
sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
|
||||
double *slater_inv = malloc(sizeof *slater_inv * Dim * Lds);
|
||||
read_double(file_id, slater_inv_key, slater_inv);
|
||||
return slater_inv;
|
||||
}
|
||||
|
||||
double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds) {
|
||||
char slater_key[32];
|
||||
sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
|
||||
double *slater = malloc(sizeof *slater * Dim * Lds);
|
||||
read_double(file_id, slater_key, slater);
|
||||
return slater;
|
||||
}
|
||||
|
||||
double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds) {
|
||||
char upds_key[32];
|
||||
sprintf(upds_key, "/cycle_%d/updates", cycle);
|
||||
double *upds = malloc(sizeof *upds * Lds * nupds);
|
||||
read_double(file_id, upds_key, upds);
|
||||
return upds;
|
||||
}
|
||||
|
||||
uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds) {
|
||||
char upd_idx_key[32];
|
||||
sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
|
||||
uint64_t* uis = malloc(sizeof *uis * nupds);
|
||||
read_uint(file_id, upd_idx_key, uis);
|
||||
return uis;
|
||||
}
|
||||
|
||||
uint64_t get_dim(uint32_t cycle, hid_t file_id) {
|
||||
char dim_key[32];
|
||||
sprintf(dim_key, "/cycle_%d/slater_matrix_dim", cycle);
|
||||
uint64_t Dim;
|
||||
read_uint(file_id, dim_key, &Dim);
|
||||
return Dim;
|
||||
}
|
||||
|
||||
uint64_t get_nupdates(uint32_t cycle, hid_t file_id) {
|
||||
char nupds_key[32];
|
||||
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
|
||||
uint64_t N_updates;
|
||||
read_uint(file_id, nupds_key, &N_updates);
|
||||
return N_updates;
|
||||
}
|
||||
|
||||
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim) {
|
||||
double sum2 = 0;
|
||||
for (uint64_t i = 0; i < Lds * Dim; i++) sum2 += A[i] * A[i];
|
||||
@ -65,16 +148,12 @@ void update_slater_matrix(const uint64_t Lds, const uint64_t Dim,
|
||||
uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT,
|
||||
double *Slater, const double tolerance) {
|
||||
|
||||
double res[Dim*Dim];
|
||||
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < Dim; j++) {
|
||||
res[i * Dim + j] = 0;
|
||||
for (uint32_t k = 0; k < Dim; k++) {
|
||||
res[i * Dim + j] += Slater[i * Dim + k] * Slater_invT[k * Lds + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
double* res = malloc(sizeof *res * Dim * Dim);
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||
Dim, Dim, Dim,
|
||||
alpha, Slater, Dim, Slater_invT, Lds,
|
||||
beta, res, Dim);
|
||||
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < Dim; j++) {
|
||||
@ -84,20 +163,20 @@ uint32_t check_error(const uint64_t Lds, const uint64_t Dim, double *Slater_invT
|
||||
if (i != j && fabs(elm) > tolerance) return 1;
|
||||
}
|
||||
}
|
||||
|
||||
free(res);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < Dim; j++) {
|
||||
prod[i * Dim + j] = 0;
|
||||
for (uint32_t k = 0; k < Dim; k++) {
|
||||
prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim) {
|
||||
// for (uint32_t i = 0; i < Dim; i++) {
|
||||
// for (uint32_t j = 0; j < Dim; j++) {
|
||||
// prod[i * Dim + j] = 0;
|
||||
// for (uint32_t k = 0; k < Dim; k++) {
|
||||
// prod[i * Dim + j] += a[i * Dim + k] * b[k * Lds + j];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
int32_t check_error_better(const double max, const double tolerance) {
|
||||
if (max < 0) return -1; // When max was a NaN
|
||||
@ -119,21 +198,7 @@ uint32_t test_kernel(char *version, const uint64_t Lds, const uint64_t Dim,
|
||||
const uint64_t *Updates_index, const double breakdown, const double tolerance,
|
||||
double *Slater, double *Slater_inv, double *determinant) {
|
||||
uint32_t rc = 0;
|
||||
// if (version[0] == 'a') { // Anthony
|
||||
// const double *Upds;
|
||||
// const uint64_t *Ui;
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] *= *determinant;
|
||||
// for (int j = 0; j < N_updates; j++) {
|
||||
// Upds = &Updates[j * Lds];
|
||||
// Ui = &Updates_index[j];
|
||||
// detupd(Dim, Lds, Upds, Ui, Slater_inv, determinant);
|
||||
// if (determinant == 0) printf("TEST_KERNEL: det_update21 failed\n");
|
||||
// }
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_inv[i] /= *determinant;
|
||||
// update_slater_matrix(Lds, Dim, N_updates, Updates, Updates_index, Slater);
|
||||
// rc = check_error(Lds, Dim, Slater_inv, Slater, tolerance);
|
||||
// if (rc != 0) printf("TEST_KERNEL: check_error failed\n");
|
||||
// } else if (version[0] == 'n') { // Naive
|
||||
|
||||
if (version[0] == 'n') { // Naive
|
||||
rc = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates, Updates_index,
|
||||
breakdown, Slater_inv, determinant);
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <hdf5/serial/hdf5.h>
|
||||
|
||||
#include "kernels.h"
|
||||
|
||||
typedef struct Error {
|
||||
@ -12,7 +13,18 @@ typedef struct Error {
|
||||
uint64_t error;
|
||||
} Error;
|
||||
|
||||
void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
|
||||
void copy(double* Slater_invT_copy, uint64_t Lds, double* tmp, uint64_t Dim);
|
||||
void update(double* slaterT,double* upds, uint64_t* ui, uint64_t nupds,uint64_t Dim, u_int64_t Lds);
|
||||
void convert(double* upds, uint64_t nupds, uint64_t* ui, double* slaterT, uint64_t Dim, u_int64_t Lds);
|
||||
double get_determinant(uint32_t cycle, hid_t file_id);
|
||||
double* get_slater_inv(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
|
||||
double* get_slater(uint32_t cycle, hid_t file_id, uint64_t Dim, u_int64_t Lds);
|
||||
double* get_upds(uint32_t cycle, hid_t file_id, uint64_t nupds, u_int64_t Lds);
|
||||
uint64_t* get_upd_idcs(uint32_t cycle, hid_t file_id, uint64_t nupds);
|
||||
uint64_t get_dim(uint32_t cycle, hid_t file_id);
|
||||
uint64_t get_nupdates(uint32_t cycle, hid_t file_id);
|
||||
|
||||
//void matmul(double *a, double *b, double *prod, const uint64_t Lds, const uint64_t Dim);
|
||||
void residual(double *a, double *res, const uint64_t Dim);
|
||||
double frobenius_norm2(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||
double frobenius_norm(double *A, const uint64_t Lds, const uint64_t Dim);
|
||||
|
@ -2,7 +2,7 @@
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "kernels.h"
|
||||
#include "debug.h"
|
||||
//#include "debug.h"
|
||||
|
||||
extern uint64_t n_splits;
|
||||
extern uint64_t block_fail;
|
||||
@ -19,11 +19,11 @@ uint32_t qmckl_sherman_morrison(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
double __attribute__((aligned(8))) C[DIM];
|
||||
double __attribute__((aligned(8))) D[LDS];
|
||||
double __attribute__((aligned(8))) C[Dim];
|
||||
double __attribute__((aligned(8))) D[Lds];
|
||||
|
||||
uint32_t l = 0;
|
||||
// For each update
|
||||
@ -48,7 +48,7 @@ uint32_t qmckl_sherman_morrison(
|
||||
double iden = 1.0 / den;
|
||||
|
||||
// Update det(A)
|
||||
if (!determinant)
|
||||
if (determinant)
|
||||
*determinant *= den;
|
||||
|
||||
#pragma ivdep
|
||||
@ -89,14 +89,14 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||
Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
const uint32_t row1 = (Updates_index[0] - 1);
|
||||
const uint32_t row2 = (Updates_index[1] - 1);
|
||||
|
||||
// Compute C = (S^T)^{-1}U : Dim x 2
|
||||
double __attribute__((aligned(8))) C[2 * DIM];
|
||||
double __attribute__((aligned(8))) C[2 * Dim];
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
C[i * 2] = 0;
|
||||
C[i * 2 + 1] = 0;
|
||||
@ -132,7 +132,7 @@ uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
|
||||
Binv[3] = idet * B0;
|
||||
|
||||
// tmp = B^{-1}D : 2 x LDS
|
||||
double __attribute__((aligned(8))) tmp[2 * LDS];
|
||||
double __attribute__((aligned(8))) tmp[2 * Lds];
|
||||
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
|
||||
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
|
||||
#pragma ivdep
|
||||
@ -173,15 +173,15 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
||||
Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
const uint32_t row1 = (Updates_index[0] - 1);
|
||||
const uint32_t row2 = (Updates_index[1] - 1);
|
||||
const uint32_t row3 = (Updates_index[2] - 1);
|
||||
|
||||
// Compute C = (S^T)^{-1}U : Dim x 3
|
||||
double __attribute__((aligned(8))) C[3 * DIM];
|
||||
double __attribute__((aligned(8))) C[3 * Dim];
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
C[i * 3] = 0;
|
||||
C[i * 3 + 1] = 0;
|
||||
@ -231,10 +231,10 @@ uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
|
||||
Binv[8] = (B0 * B4 - B3 * B1) * idet;
|
||||
|
||||
// tmp = B^{-1}D : 3 x LDS
|
||||
double __attribute__((aligned(8))) tmp[3 * LDS];
|
||||
double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
|
||||
double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
|
||||
double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
|
||||
double __attribute__((aligned(8))) tmp[3 * Lds];
|
||||
double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
|
||||
double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
|
||||
double *__restrict r3dim = &(Slater_inv[row3 * Lds]);
|
||||
#pragma ivdep
|
||||
#pragma vector aligned
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
@ -277,11 +277,11 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
// Compute C = S^{-1} U : Dim x K : standard dgemm
|
||||
double *C = calloc(1, DIM * N_updates * sizeof(double));
|
||||
double *C = calloc(1, Dim * N_updates * sizeof(double));
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
Dim, N_updates, Lds,
|
||||
@ -289,7 +289,8 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||
beta, C, N_updates);
|
||||
|
||||
// Construct B = 1 + V C : K x K, construct D = V S^{-1} : K x LDS
|
||||
double B[N_updates * N_updates], D[N_updates * LDS];
|
||||
double* B = calloc(1, sizeof *B * N_updates * N_updates);
|
||||
double* D = calloc(1, sizeof *D * N_updates * Lds);
|
||||
for (uint32_t i = 0; i < N_updates; i++) {
|
||||
const uint32_t row = Updates_index[i] - 1;
|
||||
for (uint32_t j = 0; j < N_updates ; j++) B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
|
||||
@ -316,19 +317,23 @@ uint32_t qmckl_woodbury_k(const uint64_t vLDS,
|
||||
(void) LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, pivot);
|
||||
|
||||
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
|
||||
double tmp1[N_updates * LDS];
|
||||
double* tmp1 = calloc(1, sizeof *tmp1 * N_updates * Lds);
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||
N_updates, LDS, N_updates,
|
||||
alpha, B, N_updates, D, LDS,
|
||||
beta, tmp1, LDS);
|
||||
N_updates, Lds, N_updates,
|
||||
alpha, B, N_updates, D, Lds,
|
||||
beta, tmp1, Lds);
|
||||
|
||||
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
|
||||
alpha = -1.0, beta = 1.0;
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||
Dim, LDS, N_updates,
|
||||
alpha, C, N_updates, tmp1, LDS,
|
||||
beta, Slater_inv, LDS);
|
||||
Dim, Lds, N_updates,
|
||||
alpha, C, N_updates, tmp1, Lds,
|
||||
beta, Slater_inv, Lds);
|
||||
|
||||
free(C);
|
||||
free(B);
|
||||
free(D);
|
||||
free(tmp1);
|
||||
free(pivot);
|
||||
return 0;
|
||||
}
|
||||
@ -344,10 +349,12 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
|
||||
double* Slater_inv,
|
||||
double* determinant)
|
||||
{
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
double alpha, beta;
|
||||
bool swap;
|
||||
uint32_t j;
|
||||
double alpha, beta, det;
|
||||
int* pivot = calloc(1, sizeof *pivot * N_updates);
|
||||
double* C = calloc(1, sizeof *C * Dim * N_updates);
|
||||
double* B = calloc(1, sizeof *B * N_updates * N_updates);
|
||||
@ -396,19 +403,22 @@ uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHand
|
||||
|
||||
// Compute determinant by LU decomposition
|
||||
(void) cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, workspace, pivot, info);
|
||||
bool swap = false; uint32_t j = 0; double det = 1.0f;
|
||||
swap = false; j = 0; det = 1.0f;
|
||||
#pragma omp target teams distribute parallel for reduction(+: j) reduction(*: det)
|
||||
for (uint32_t i = 0; i < N_updates; i++) {
|
||||
swap = (bool)(pivot[i] - (i + 1)); // swap = {0->false: no swap, >0->true: swap}
|
||||
j += (uint32_t)swap; // count # of swaps
|
||||
det *= B[i * (N_updates + 1)]; // prod. of diag elm. of B
|
||||
}
|
||||
if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early.
|
||||
}
|
||||
if (fabs(det) < breakdown) return 1; // check if determinant of B is too close to zero. If so, exit early.
|
||||
if (determinant) { // update det(Slater) if determinant != NULL
|
||||
if ((j & 1) != 0) det = -det; // multiply det with -1 if # of swaps is odd
|
||||
*determinant *= det;
|
||||
}
|
||||
|
||||
|
||||
#pragma omp target data use_device_ptr(Slater_inv, Updates, C, B, workspace, pivot, Binv, D, T1, T2)
|
||||
{
|
||||
// Compute B^{-1} : initialise as I for solving BX=I
|
||||
#pragma omp target teams distribute parallel for
|
||||
for (int i = 0; i < N_updates; ++i) {
|
||||
@ -471,11 +481,11 @@ uint32_t qmckl_slagel_splitting(
|
||||
uint64_t *__restrict later_index, uint64_t *__restrict later,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
double __attribute__((aligned(8))) C[LDS];
|
||||
double __attribute__((aligned(8))) D[LDS];
|
||||
double __attribute__((aligned(8))) C[Lds];
|
||||
double __attribute__((aligned(8))) D[Lds];
|
||||
|
||||
uint32_t l = 0;
|
||||
// For each update
|
||||
@ -513,7 +523,7 @@ uint32_t qmckl_slagel_splitting(
|
||||
} // From here onwards we continue with applying the first halve of the update to Slater_inv
|
||||
double iden = 1.0 / den;
|
||||
|
||||
if (!determinant) *determinant *= den;
|
||||
if (determinant) *determinant *= den;
|
||||
|
||||
// D = v^T x S^{-1} : 1 x LDS
|
||||
#pragma ivdep
|
||||
@ -544,10 +554,10 @@ uint32_t qmckl_sherman_morrison_splitting(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||
double __attribute__((aligned(8))) later_updates[Lds * N_updates];
|
||||
uint64_t later_index[N_updates];
|
||||
uint64_t later = 0;
|
||||
// uint32_t rc;
|
||||
@ -583,10 +593,10 @@ uint32_t qmckl_sherman_morrison_smw32s(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||
double __attribute__((aligned(8))) later_updates[Lds * N_updates];
|
||||
uint64_t later_index[N_updates];
|
||||
uint64_t later = 0;
|
||||
uint32_t rc;
|
||||
@ -702,13 +712,13 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
double *__restrict __attribute__((aligned(8))) Slater_inv,
|
||||
double *__restrict determinant) {
|
||||
|
||||
const uint32_t Dim = DIM;
|
||||
const uint32_t Lds = LDS;
|
||||
const uint32_t Dim = vDim;
|
||||
const uint32_t Lds = vLDS;
|
||||
|
||||
double __attribute__((aligned(8))) C[DIM];
|
||||
double __attribute__((aligned(8))) D[LDS];
|
||||
double __attribute__((aligned(8))) C[Dim];
|
||||
double __attribute__((aligned(8))) D[Lds];
|
||||
|
||||
double __attribute__((aligned(8))) later_updates[LDS * N_updates];
|
||||
double __attribute__((aligned(8))) later_updates[Lds * N_updates];
|
||||
uint64_t later_index[N_updates];
|
||||
uint64_t later = 0;
|
||||
|
||||
@ -743,7 +753,7 @@ uint32_t qmckl_sherman_morrison_later(
|
||||
}
|
||||
double iden = 1.0 / den;
|
||||
|
||||
if (!determinant) *determinant *= den;
|
||||
if (determinant) *determinant *= den;
|
||||
|
||||
// D = v^T x A^{-1}
|
||||
#pragma ivdep
|
||||
|
@ -1,9 +1,9 @@
|
||||
#include <stdint.h>
|
||||
#include <stdint.h>
|
||||
#include "meuk.h"
|
||||
#include "cycles.h"
|
||||
#include "debug.h"
|
||||
|
||||
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
|
||||
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
|
||||
#define DATASET "dataset"
|
||||
#define REPETITIONS 1
|
||||
|
||||
uint64_t n_splits;
|
||||
@ -27,77 +27,46 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// SETUP STORAGE AND DATA ACCESS
|
||||
hid_t file_id;
|
||||
file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
|
||||
char nupds_key[32];
|
||||
char upd_idx_key[32];
|
||||
char upds_key[32];
|
||||
char slater_key[32];
|
||||
char slater_inv_key[32];
|
||||
char det_key[32];
|
||||
const uint64_t Dim = DIM;
|
||||
const uint64_t Lds = LDS;
|
||||
uint64_t N_updates;
|
||||
double Slater[LDS * DIM ], SlaterT[LDS * DIM];
|
||||
double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
|
||||
double determinant, determinant_copy;
|
||||
hid_t file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
|
||||
|
||||
// SETUP TEST PARAMETERS
|
||||
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
|
||||
const double tolerance = 0.001; // default = 0.001
|
||||
double cumulative = 0;
|
||||
|
||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
|
||||
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
|
||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
|
||||
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
|
||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||
// FOR EACH UPDATE CYCLE DO:
|
||||
// for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
|
||||
for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) {
|
||||
for (uint32_t cycles_index = 0; cycles_index < 1; cycles_index++) {
|
||||
|
||||
// SETUP TEST PARAMETERS
|
||||
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
|
||||
const double tolerance = 0.001; // default = 0.001
|
||||
double cumulative = 0;
|
||||
|
||||
// 1. READ DATA FROM DATASET
|
||||
uint32_t cycle = cycles[cycles_index];
|
||||
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
|
||||
sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
|
||||
sprintf(upds_key, "/cycle_%d/updates", cycle);
|
||||
sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
|
||||
sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
|
||||
sprintf(det_key, "/cycle_%d/determinant", cycle);
|
||||
read_uint(file_id, nupds_key, &N_updates);
|
||||
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
|
||||
double *Updates = malloc(Lds * N_updates * sizeof(double));
|
||||
read_uint(file_id, upd_idx_key, Updates_index);
|
||||
read_double(file_id, upds_key, Updates);
|
||||
read_double(file_id, slater_key, Slater);
|
||||
read_double(file_id, slater_inv_key, Slater_invT);
|
||||
read_double(file_id, det_key, &determinant);
|
||||
uint64_t Dim = get_dim(cycle, file_id);
|
||||
uint64_t Lds = Dim;
|
||||
uint64_t N_updates = get_nupdates(cycle, file_id);
|
||||
uint64_t* Updates_index = get_upd_idcs(cycle, file_id, N_updates);
|
||||
double* Updates = get_upds(cycle, file_id, N_updates, Lds);
|
||||
double* Slater = get_slater(cycle, file_id, Dim, Lds);
|
||||
double* Slater_invT = get_slater_inv(cycle, file_id, Dim, Lds);
|
||||
double *Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds);
|
||||
double determinant = get_determinant(cycle, file_id), determinant_copy;
|
||||
|
||||
// Compute transpose of S. ST: 24 x 21
|
||||
for (int i = 0; i < Lds; i++) {
|
||||
for (int j = 0; j < Dim; j++) {
|
||||
SlaterT[i * Dim + j] = Slater[j * Lds + i];
|
||||
}
|
||||
}
|
||||
double *SlaterT = calloc(1, sizeof *SlaterT * Dim * Lds);
|
||||
transpose(Slater, Lds, SlaterT, Dim, Dim, Lds);
|
||||
|
||||
// Convert repl. upds into additive upds.
|
||||
for (int i = 0; i < N_updates; i++) {
|
||||
int col = Updates_index[i] - 1;
|
||||
for (int j = 0; j < Lds; j++) {
|
||||
Updates[i * Lds + j] -= SlaterT[col + j * Dim];
|
||||
}
|
||||
}
|
||||
convert(Updates, N_updates, Updates_index, SlaterT, Dim, Lds);
|
||||
|
||||
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
|
||||
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
|
||||
|
||||
// Update Slater matrix
|
||||
for (int i = 0; i < N_updates; i++) {
|
||||
int col = Updates_index[i] - 1;
|
||||
for (int j = 0; j < Dim; j++) {
|
||||
SlaterT[col + j * Dim] += Updates[i * Lds + j];
|
||||
}
|
||||
} // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
|
||||
update(SlaterT, Updates, Updates_index, N_updates, Dim, Lds);
|
||||
|
||||
int32_t err_break;
|
||||
|
||||
@ -116,53 +85,13 @@ printf("#-----------------------------------------------------------------------
|
||||
determinant_copy = determinant;
|
||||
|
||||
// ### CHOOSE A KERNEL:
|
||||
// if (version[0] == 'a') { // Anthony
|
||||
// const double *Upds;
|
||||
// const uint64_t *Ui;
|
||||
// double determinant_previous;
|
||||
|
||||
// err_break = 0;
|
||||
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
|
||||
|
||||
// for (int i = 0; i < N_updates; i++) {
|
||||
// Upds = &Updates[i * Lds];
|
||||
// Ui = &Updates_index[i];
|
||||
// determinant_previous = determinant_copy;
|
||||
|
||||
// // 1. FETCH START TIME
|
||||
// uint64_t before = rdtsc();
|
||||
|
||||
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
// detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
|
||||
|
||||
// // 3. FETCH FINISH TIME
|
||||
// uint64_t after = rdtsc();
|
||||
|
||||
// // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||
// accumulator += (double)(after - before);
|
||||
|
||||
// // 5. STOP APPLYING UPDATES IF BREAKDOWN DETECTED
|
||||
// double lambda = determinant_copy / determinant_previous; // should be id. to lambda in detupd
|
||||
// if (fabs(lambda) < breakdown) {
|
||||
// err_break = 1;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
|
||||
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
|
||||
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
|
||||
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
|
||||
// }
|
||||
// } else if (version[0] == 'n') { // Naive
|
||||
if (version[0] == 'n') { // Naive
|
||||
// 1. FETCH START TIME
|
||||
uint64_t before = rdtsc();
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -176,7 +105,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -190,7 +119,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
|
||||
breakdown, Slater_invT_copy, &determinant);
|
||||
breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -205,7 +134,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
|
||||
breakdown, Slater_invT_copy, &determinant);
|
||||
breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -220,7 +149,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -236,7 +165,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_woodbury_k_cublas_offload(handle, s_handle, Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -251,7 +180,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -265,7 +194,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
|
||||
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
Updates_index, breakdown, Slater_invT_copy, &determinant);
|
||||
|
||||
// 3. FETCH FINISH TIME
|
||||
uint64_t after = rdtsc();
|
||||
@ -275,8 +204,8 @@ printf("#-----------------------------------------------------------------------
|
||||
} else if (version[0] == 'm') { // LAPACK/MKL
|
||||
|
||||
// Only send upper Dim x Dim part of matrix to lapack
|
||||
double tmp[DIM *DIM];
|
||||
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
|
||||
double *tmp = malloc(sizeof *tmp * Dim * Dim);
|
||||
memcpy(tmp, SlaterT, sizeof *SlaterT * Dim * Dim);
|
||||
|
||||
// 1. FETCH START TIME
|
||||
uint64_t before = rdtsc();
|
||||
@ -288,12 +217,8 @@ printf("#-----------------------------------------------------------------------
|
||||
uint64_t after = rdtsc();
|
||||
|
||||
// Copy elements of inverse back, adding 0-padding in "correct" place
|
||||
for (uint32_t i = 0; i < Dim; i++) {
|
||||
for (uint32_t j = 0; j < Lds; j++) {
|
||||
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
|
||||
else Slater_invT_copy[i * Lds + j] = 0.0;
|
||||
}
|
||||
}
|
||||
copy(Slater_invT_copy, Lds, tmp, Dim);
|
||||
free(tmp);
|
||||
|
||||
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
|
||||
accumulator += (double)(after - before);
|
||||
@ -302,9 +227,10 @@ printf("#-----------------------------------------------------------------------
|
||||
printf("Version '%c' not implemented.\n", version[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
} // END OF REPETITIONS LOOP
|
||||
|
||||
// 4. COPY RESULT BACK TO ORIGINAL
|
||||
// 4. COPY RESULT BACK TO ORIGINAL
|
||||
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
|
||||
determinant = determinant_copy;
|
||||
// At this point Slater_invT contains the correct inverse matrix
|
||||
@ -321,11 +247,19 @@ printf("#-----------------------------------------------------------------------
|
||||
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
|
||||
cumulative += accumulator;
|
||||
|
||||
double SSi[DIM * DIM];
|
||||
matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
|
||||
double Res[DIM * DIM];
|
||||
// Compute Slater x Slater_inv
|
||||
double* SSi = malloc(sizeof *SSi * Dim * Dim);
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||
Dim, Dim, Dim,
|
||||
alpha, SlaterT, Dim, Slater_invT, Lds,
|
||||
beta, SSi, Dim);
|
||||
|
||||
// Compute residual matrix S * Sinv - Id
|
||||
double* Res = calloc(1, sizeof *Res * Dim * Dim);
|
||||
residual(SSi, Res, Dim);
|
||||
const double max = max_norm(Res, Dim, Dim);
|
||||
free(SSi);
|
||||
|
||||
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
|
||||
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
|
||||
@ -336,13 +270,16 @@ printf("#-----------------------------------------------------------------------
|
||||
// 8. COMPUTE CONDITION NUMBER
|
||||
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
|
||||
const double frob = frobenius_norm(Res, Dim, Dim);
|
||||
|
||||
free(Res);
|
||||
|
||||
// 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
|
||||
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
|
||||
|
||||
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
|
||||
|
||||
free(Updates_index);
|
||||
free(Updates);
|
||||
free(SlaterT);
|
||||
free(Slater_invT);
|
||||
free(Slater_invT_copy);
|
||||
|
||||
} // END OF CYCLE LOOP
|
||||
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
|
||||
@ -352,7 +289,7 @@ printf("#-----------------------------------------------------------------------
|
||||
|
||||
(void) H5Fclose(file_id);
|
||||
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
cublasDestroy(handle);
|
||||
cusolverDnDestroy(s_handle);
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user