Sherman-Morrison/independent_test_harness/test.c

360 lines
13 KiB
C

#include <stdint.h>
#include "meuk.h"
#include "cycles.h"
#define DATASET "dataset_329d_zeropadded_cm.hdf5"
// #define DATASET "dataset_15784d_zeropadded_cm.hdf5"
#define REPETITIONS 1
uint64_t n_splits;
uint64_t block_fail;
uint64_t recursive_calls;
int main(int argc, char **argv) {
assert(argc == 2);
char *version = argv[1];
#ifdef HAVE_CUBLAS_OFFLOAD
cublasHandle_t handle;
cusolverDnHandle_t s_handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
fprintf(stdout, "cuBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
if (cusolverDnCreate(&s_handle) != CUSOLVER_STATUS_SUCCESS) {
fprintf(stdout, "cuSOLVER initialization failed!\n");
exit(EXIT_FAILURE);
}
#endif
// SETUP STORAGE AND DATA ACCESS
hid_t file_id;
file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
char nupds_key[32];
char upd_idx_key[32];
char upds_key[32];
char slater_key[32];
char slater_inv_key[32];
char det_key[32];
const uint64_t Dim = DIM;
const uint64_t Lds = LDS;
uint64_t N_updates;
double Slater[LDS * DIM ], SlaterT[LDS * DIM];
double Slater_invT[LDS * DIM], Slater_invT_copy[LDS * DIM];
double determinant, determinant_copy;
// SETUP TEST PARAMETERS
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
const double tolerance = 0.001; // default = 0.001
double cumulative = 0;
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
// FOR EACH UPDATE CYCLE DO:
// for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
for (uint32_t cycles_index = 40; cycles_index < 41; cycles_index++) {
// 1. READ DATA FROM DATASET
uint32_t cycle = cycles[cycles_index];
sprintf(nupds_key, "/cycle_%d/nupdates", cycle);
sprintf(upd_idx_key, "/cycle_%d/col_update_index", cycle);
sprintf(upds_key, "/cycle_%d/updates", cycle);
sprintf(slater_key, "/cycle_%d/slater_matrix", cycle);
sprintf(slater_inv_key, "/cycle_%d/slater_inverse_t", cycle);
sprintf(det_key, "/cycle_%d/determinant", cycle);
read_uint(file_id, nupds_key, &N_updates);
uint64_t *Updates_index = malloc(N_updates * sizeof(uint64_t));
double *Updates = malloc(Lds * N_updates * sizeof(double));
read_uint(file_id, upd_idx_key, Updates_index);
read_double(file_id, upds_key, Updates);
read_double(file_id, slater_key, Slater);
read_double(file_id, slater_inv_key, Slater_invT);
read_double(file_id, det_key, &determinant);
// Compute transpose of S. ST: 24 x 21
for (int i = 0; i < Lds; i++) {
for (int j = 0; j < Dim; j++) {
SlaterT[i * Dim + j] = Slater[j * Lds + i];
}
}
// Convert repl. upds into additive upds.
for (int i = 0; i < N_updates; i++) {
int col = Updates_index[i] - 1;
for (int j = 0; j < Lds; j++) {
Updates[i * Lds + j] -= SlaterT[col + j * Dim];
}
}
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// Update Slater matrix
for (int i = 0; i < N_updates; i++) {
int col = Updates_index[i] - 1;
for (int j = 0; j < Dim; j++) {
SlaterT[col + j * Dim] += Updates[i * Lds + j];
}
} // A this point SlaterT, Updates & the updated SlaterT are correct. Checked in GDB
int32_t err_break;
// 3. SET TIME- AND SPLIT ACCUMULATOR TO ZERO
double accumulator = 0;
double cycles_per_update = 0;
n_splits = 0;
block_fail = 0;
recursive_calls = 0;
// ## FOR A SET NUMBER OF REPETITIONS DO:
for (int rep = 0; rep < REPETITIONS; rep++) {
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double));
determinant_copy = determinant;
// ### CHOOSE A KERNEL:
// if (version[0] == 'a') { // Anthony
// const double *Upds;
// const uint64_t *Ui;
// double determinant_previous;
// err_break = 0;
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] *= determinant_copy; // Multiply inv(Slater-mat) by det(Slater-mat) to get adj(Slater_mat)
// for (int i = 0; i < N_updates; i++) {
// Upds = &Updates[i * Lds];
// Ui = &Updates_index[i];
// determinant_previous = determinant_copy;
// // 1. FETCH START TIME
// uint64_t before = rdtsc();
// // 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
// detupd(Dim, Lds, Upds, Ui, Slater_invT_copy, &determinant_copy);
// // 3. FETCH FINISH TIME
// uint64_t after = rdtsc();
// // 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
// accumulator += (double)(after - before);
// // 5. STOP APPLYING UPDATES IF BREAKDOWN DETECTED
// double lambda = determinant_copy / determinant_previous; // should be id. to lambda in detupd
// if (fabs(lambda) < breakdown) {
// err_break = 1;
// break;
// }
// }
// if (err_break == 1) { // Divide adj(Slater-mat) by OLD det(Slater-mat) to get inv(Slater_mat) again
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_previous;
// } else { // Divide adj(Slater-mat) by NEW det(Slater-mat) to get inv(Slater_mat) again
// for (int i = 0; i < Lds * Dim; i++) Slater_invT_copy[i] /= determinant_copy;
// }
// } else if (version[0] == 'n') { // Naive
if (version[0] == 'n') { // Naive
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'l') { // Later
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == '2') { // by twos
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == '3') { // by threes
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'k') { // Woodbury K
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
#ifdef HAVE_CUBLAS_OFFLOAD
} else if (version[0] == 'c') { // Woodbury K cuBLAS
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k_cublas_offload(handle, s_handle, Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
#endif
} else if (version[0] == 's') { // Splitting
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'b') { // Blocked
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else if (version[0] == 'm') { // LAPACK/MKL
// Only send upper Dim x Dim part of matrix to lapack
double tmp[DIM *DIM];
memcpy(tmp, SlaterT, Dim*Dim*sizeof(double));
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = inverse(tmp, Dim, Dim);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// Copy elements of inverse back, adding 0-padding in "correct" place
for (uint32_t i = 0; i < Dim; i++) {
for (uint32_t j = 0; j < Lds; j++) {
if (j < Dim) Slater_invT_copy[i * Lds + j] = tmp[i * Dim + j];
else Slater_invT_copy[i * Lds + j] = 0.0;
}
}
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
} else { // Exit
printf("Version '%c' not implemented.\n", version[0]);
return 1;
}
} // END OF REPETITIONS LOOP
// 4. COPY RESULT BACK TO ORIGINAL
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
determinant = determinant_copy;
// At this point Slater_invT contains the correct inverse matrix
// 5. DIVIDE CYCLE- AND SPLIT-ACCUMULATOR BY NUMBER OF REPETITIONS AND RECORD
// DIVIDE CYCLE-ACCUMULATOR BY NUMBER OF UPDATES AND RECORD
accumulator /= REPETITIONS;
cycles_per_update = accumulator / N_updates;
n_splits /= REPETITIONS;
block_fail /= REPETITIONS;
recursive_calls /= REPETITIONS;
// 6. ADD THE AVERAGED TIME PER CYCLE OF ACCUMULATER TO
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
cumulative += accumulator;
double SSi[DIM * DIM];
matmul(SlaterT, Slater_invT, SSi, Lds, Dim);
double Res[DIM * DIM];
residual(SSi, Res, Dim);
const double max = max_norm(Res, Dim, Dim);
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// int32_t err_out = check_error_better(max, tolerance);
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
// 8. COMPUTE CONDITION NUMBER
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
const double frob = frobenius_norm(Res, Dim, Dim);
// 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
free(Updates_index);
free(Updates);
} // END OF CYCLE LOOP
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
(void) H5Fclose(file_id);
#ifdef HAVE_CUBLAS_OFFLOAD
cublasDestroy(handle);
cusolverDnDestroy(s_handle);
#endif
}