Sherman-Morrison/independent_test_harness/test.c
Francois Coppens f7dbe3ddd8 - Sync and Async version
- OpenMP version
- PP defines cleanup
2022-11-08 15:35:25 +01:00

317 lines
12 KiB
C

#include <stdint.h>
#include "helper.h"
#include "cycles.h"
#define DATASET "dataset"
#define REPETITIONS 1
uint64_t n_splits;
uint64_t block_fail;
uint64_t recursive_calls;
int main(int argc, char **argv) {
assert(argc == 2);
char *version = argv[1];
#ifdef USE_OMP_OFFLOAD_CUDA
cublasHandle_t handle = init_cublas();
cusolverDnHandle_t s_handle = init_cusolver();
#endif
// SETUP DATA ACCESS
hid_t file_id = H5Fopen(DATASET, H5F_ACC_RDONLY, H5P_DEFAULT);
printf("\n# %d REPETITIONS\n", REPETITIONS);
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
// FOR EACH UPDATE CYCLE DO:
for (uint32_t cycles_index = 0; cycles_index < n_cycles; cycles_index++) {
// SETUP TEST PARAMETERS
// const uint32_t GHz = 2800000000; // 2.8 giga-cycles per second
const double breakdown = 0.001; // default = 0.001. 1e-9 might be too small
const double tolerance = 0.001; // default = 0.001
double cumulative = 0;
// 1. READ DATA FROM DATASET
uint32_t cycle = cycles[cycles_index];
uint64_t Dim = get_dim(cycle, file_id);
uint64_t Lds = Dim;
uint64_t N_updates = get_nupdates(cycle, file_id);
uint64_t* Updates_index = get_upd_idcs(cycle, file_id, N_updates);
double* Updates = get_upds(cycle, file_id, N_updates, Lds);
double* Slater = get_slater(cycle, file_id, Dim, Lds);
double* Slater_invT = get_slater_inv(cycle, file_id, Dim, Lds);
double* Slater_invT_copy = calloc(1, sizeof *Slater_invT_copy * Dim * Lds);
double determinant = get_determinant(cycle, file_id), determinant_copy;
// Compute transpose of S. ST: 24 x 21
double *SlaterT = calloc(1, sizeof *SlaterT * Dim * Lds);
transpose(Slater, Lds, SlaterT, Dim, Dim, Lds);
// Convert repl. upds into additive upds.
convert(Updates, N_updates, Updates_index, SlaterT, Dim, Lds);
// 2. CHECK ERROR ON THE INPUT DATA AND RECORD RESULT: ERR_INPUT
uint32_t err_inp = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// Update Slater matrix
update(SlaterT, Updates, Updates_index, N_updates, Dim, Lds);
int32_t err_break;
// 3. SET TIME- AND SPLIT ACCUMULATOR TO ZERO
double accumulator = 0;
double cycles_per_update = 0;
n_splits = 0;
block_fail = 0;
recursive_calls = 0;
// ## FOR A SET NUMBER OF REPETITIONS EXECUTE A CHOSEN KERNEL
for (int rep = 0; rep < REPETITIONS; rep++) {
// 1. MAKE A FRESH COPY OF THE SLATER INVERSE AND DETERMINANT AND USE THE COPY
memcpy(Slater_invT_copy, Slater_invT, Lds * Dim * sizeof(double));
determinant_copy = determinant;
// ### CHOOSE A KERNEL:
if (version[0] == 'n') { // Naive
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else if (version[0] == 'l') { // Later
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_later(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else if (version[0] == '2') { // by twos
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else if (version[0] == '3') { // by threes
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_3(Lds, Dim, Updates, Updates_index,
breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else if (version[0] == 'k') { // Woodbury K
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double) (after - before);
}
#ifdef USE_OMP
else if (version[0] == 'o') { // Woodbury K OMP
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k_omp(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double) (after - before);
}
#endif
#ifdef USE_OMP_OFFLOAD_CUDA
else if (version[0] == 'c') { // Woodbury K cuBLAS
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_woodbury_k_ompol_cuda_sync(handle, s_handle, Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double) (after - before);
}
#endif
else if (version[0] == 's') { // Splitting
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_splitting(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else if (version[0] == 'b') { // Blocked
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = qmckl_sherman_morrison_smw32s(Lds, Dim, N_updates, Updates,
Updates_index, breakdown, Slater_invT_copy, &determinant);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else if (version[0] == 'm') { // LAPACK/MKL
// Only send upper Dim x Dim part of matrix to lapack
double *tmp = malloc(sizeof *tmp * Dim * Dim);
memcpy(tmp, SlaterT, sizeof *SlaterT * Dim * Dim);
// 1. FETCH START TIME
uint64_t before = rdtsc();
// 2. EXECUTE KERNEL AND REMEMBER EXIT STATUS
err_break = inverse(tmp, Dim, Dim);
// 3. FETCH FINISH TIME
uint64_t after = rdtsc();
// Copy elements of inverse back, adding 0-padding in "correct" place
copy(Slater_invT_copy, Lds, tmp, Dim);
free(tmp);
// 4. ADD TIME DIFFERENCE TO TIME CUMMULATOR
accumulator += (double)(after - before);
}
else { // Exit
printf("Version '%c' not implemented.\n", version[0]);
return 1;
}
} // END OF REPETITIONS LOOP
// 4. COPY RESULT BACK TO ORIGINAL
memcpy(Slater_invT, Slater_invT_copy, Lds * Dim * sizeof(double));
determinant = determinant_copy;
// 5. DIVIDE CYCLE- AND SPLIT-ACCUMULATOR BY NUMBER OF REPETITIONS AND RECORD
// DIVIDE CYCLE-ACCUMULATOR BY NUMBER OF UPDATES AND RECORD
accumulator /= REPETITIONS;
cycles_per_update = accumulator / N_updates;
n_splits /= REPETITIONS;
block_fail /= REPETITIONS;
recursive_calls /= REPETITIONS;
// 6. ADD THE AVERAGED TIME PER CYCLE OF ACCUMULATER TO
// CUMULATIVE RESULT FOR THE ENTIRE DATASET
cumulative += accumulator;
// Compute Slater x Slater_inv
double* SSi = malloc(sizeof *SSi * Dim * Dim);
double alpha = 1.0, beta = 0.0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
Dim, Dim, Dim,
alpha, SlaterT, Dim, Slater_invT, Lds,
beta, SSi, Dim);
// Compute residual matrix S * Sinv - Id
double* Res = calloc(1, sizeof *Res * Dim * Dim);
residual(SSi, Res, Dim);
const double max = max_norm(Res, Dim, Dim);
free(SSi);
// 7. CHECK ERRROR ON THE UPDATED DATA AND RECORD THE RESULT: ERR_OUT
uint32_t err_out = check_error(Lds, Dim, Slater_invT, SlaterT, tolerance);
// if (err_out == 1) printf("cycle index %d: cycle %d with %lu upds failed!\n", cycles_index, cycle, N_updates);
// 8. COMPUTE CONDITION NUMBER
const double condnr = condition_number(Slater, Slater_invT, Lds, Dim);
const double frob = frobenius_norm(Res, Dim, Dim);
free(Res);
// 10. WRITE RESULTS TO FILE: CYCLE#, #UPDS, ERR_INP, ERR_BREAK, #SPLITS, ERR_OUT, COND, #CLCK_TCKS
printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%e\t%e\t%e\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, accumulator, cycles_per_update, cumulative, recursive_calls);
// printf("%u\t%lu\t%u\t%u\t\t%u\t%lu\t\t%lu\t\t%e\t%e\t%e\t%9.6f\t%9.6f\t%9.6f\t%lu\n", cycle, N_updates, err_inp, err_break, err_out, n_splits, block_fail, max, frob, condnr, (double)accumulator/GHz, (double)cycles_per_update/GHz, (double)cumulative/GHz, recursive_calls);
free(Updates_index);
free(Updates);
free(SlaterT);
free(Slater_invT);
free(Slater_invT_copy);
} // END OF CYCLE LOOP
// printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
// printf("#1\t2\t3\t4\t\t5\t6\t\t7\t\t8\t\t9\t\t10\t\t11\t\t12\t\t13\t\t14\n");
// printf("#CYCLE\tUPDS\tERR_IN\tERR_BREAK\tERR_OUT\tSPLITS\t\tBLK_FAILS\tMAX\t\tFROB\t\tCOND\t\tCPU_CYC\t\tCPU_CYC/UPD\tCUMUL\t\tREC\n");
// printf("#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
(void) H5Fclose(file_id);
#ifdef USE_OMP_OFFLOAD_CUDA
cublasDestroy_v2(handle);
cusolverDnDestroy(s_handle);
#endif
return 0;
}