From fd2addb37050dcaff7a6e27e91db41d5fe2c9670 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 14 Sep 2023 09:41:15 +0200 Subject: [PATCH 1/5] Added IVDEP and ALIGNED in configure.ac --- configure.ac | 88 +++++++++++++++++++++++++ org/qmckl_sherman_morrison_woodbury.org | 87 ++++++++++-------------- 2 files changed, 121 insertions(+), 54 deletions(-) diff --git a/configure.ac b/configure.ac index 727c585..856cf43 100644 --- a/configure.ac +++ b/configure.ac @@ -246,6 +246,94 @@ int simd=1; AC_MSG_RESULT([$SIMD_LENGTH]) AC_DEFINE_UNQUOTED([SIMD_LENGTH], [$SIMD_LENGTH], [Length of SIMD vectors]) +# Checking IVDEP +ivdep="" +AC_MSG_CHECKING([for ivdep pragma]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include +]], [[ + int main() { + #pragma ivdep + for (int i = 0; i < 10; ++i) { + printf("Testing: %d\n", i); + } + return 0; + } +]])], + [ivdep='_Pragma("ivdep")'], [ +]) + +AS_IF([test "x$ivdep" = "x"], [ + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include + ]], [[ + int main() { + #pragma clang loop vectorize(enable) + for (int i = 0; i < 10; ++i) { + printf("Testing: %d\n", i); + } + return 0; + } + ]])], + [ivdep='_Pragma("clang loop vectorize(enable)")'], [ + ]) +]) + +AS_IF([test "x$ivdep" = "x"], [ + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include + ]], [[ + int main() { + #pragma GCC ivdep + for (int i = 0; i < 10; ++i) { + printf("Testing: %d\n", i); + } + return 0; + } + ]])], + [ivdep='_Pragma("GCC ivdep")'], [ + ]) +]) + +AC_DEFINE_UNQUOTED([IVDEP], [$ivdep], [IVDEP pragma]) +AS_IF([test "x$ivdep" = "x"], [ + ivdep="no" +]) +AC_MSG_RESULT([$ivdep]) + + +# Checking ALIGNED + +AC_CHECK_FUNCS([aligned_alloc], [have_aligned_alloc=yes], [have_aligned_alloc=no]) +AS_IF([test "x$have_aligned_alloc" = "xyes"], [ + AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if you have the aligned_alloc function.]) +]) + +aligned="" +AC_MSG_CHECKING([for vector aligned pragma]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +]], [[ + int main() { + double __attribute__((aligned(8))) a[10] ; + #pragma vector aligned + for (int i = 0; i < 10; ++i) { + a[i] = (double) i; + } + return 0; + } +]])], + [aligned='_Pragma("vector aligned")'], [ +]) + +AS_IF([test "x$have_aligned_alloc" = "xno"], [ + aligned="" +]) + +AC_DEFINE_UNQUOTED([ALIGNED], [$aligned], [VECTOR ALIGNED pragma]) +AS_IF([test "x$aligned" = "x"], [ + aligned="no" +]) +AC_MSG_RESULT([$aligned]) # QMCKLDGEMM diff --git a/org/qmckl_sherman_morrison_woodbury.org b/org/qmckl_sherman_morrison_woodbury.org index 4ddd949..5b51b07 100644 --- a/org/qmckl_sherman_morrison_woodbury.org +++ b/org/qmckl_sherman_morrison_woodbury.org @@ -31,7 +31,7 @@ This is the range that determines the how many high performance kernel instantce #+begin_src python :noweb yes :exports none range(2, 22) #+end_src - + * Naïve Sherman-Morrison ** ~qmckl_sm_naive~ @@ -109,7 +109,7 @@ subroutine convert(upds, s_inv, Updates, Inverse, nupdates, lds, dim) implicit none integer*8 , intent(in) :: lds, dim, nupdates real*8 , intent(in) :: upds(nupdates * lds) - real*8 , intent(in) :: s_inv(dim * lds) + real*8 , intent(in) :: s_inv(dim * lds) real*8 , intent(out) , dimension(lds, nupdates) :: Updates real*8 , intent(out) , dimension(dim, lds) :: Inverse @@ -136,7 +136,7 @@ subroutine copy_back_inv(Inverse, s_inv, lds, dim) implicit none integer*8 , intent(in) :: lds, dim real*8 , intent(in) , dimension(dim, lds) :: Inverse - real*8 , intent(out) :: s_inv(dim * lds) + real*8 , intent(out) :: s_inv(dim * lds) integer*8 :: i, j @@ -154,7 +154,7 @@ subroutine copy_back_lu(Later_updates, later_upds, lds, nupdates) implicit none integer*8 , intent(in) :: lds, nupdates real*8 , intent(in) , dimension(lds, nupdates) :: Later_updates - real*8 , intent(out) :: later_upds(nupdates * lds) + real*8 , intent(out) :: later_upds(nupdates * lds) integer*8 :: i, j @@ -300,7 +300,7 @@ qmckl_exit_code qmckl_sm_naive ( const uint64_t* Updates_index, const double breakdown, double* Slater_inv, - double* determinant ); + double* determinant ); #+end_src #+CALL: generate_private_c_header(table=qmckl_sm_naive_args,rettyp=get_value("CRetType"),fname="qmckl_sm_naive_hpc") @@ -316,7 +316,7 @@ qmckl_exit_code qmckl_sm_naive_hpc ( const uint64_t* Updates_index, const double breakdown, double* Slater_inv, - double* determinant ); + double* determinant ); #+end_src #+CALL: generate_c_header(table=qmckl_sm_naive_args,rettyp=get_value("CRetType"),fname="qmckl_sm_naive_doc") @@ -332,7 +332,7 @@ qmckl_exit_code qmckl_sm_naive_doc ( const uint64_t* Updates_index, const double breakdown, double* Slater_inv, - double* determinant ); + double* determinant ); #+end_src *** C sources @@ -345,22 +345,6 @@ Common includes and macros used by all the Sherman-Morrison-Woodbury kernels. #include "assert.h" #include "stdio.h" -// Order important because -// __GNUC__ also set in ICC, ICX and CLANG -// __clang__ also set in ICX -#if defined(__INTEL_COMPILER) - #define IVDEP _Pragma("ivdep") - #define ALIGNED _Pragma("vector aligned") -#elif defined(__INTEL_LLVM_COMPILER) - #define IVDEP _Pragma("ivdep") - #define ALIGNED _Pragma("vector aligned") -#elif defined(__clang__) - #define IVDEP _Pragma("clang loop vectorize(enable)") - #define ALIGNED -#elif defined(__GNUC__) - #define IVDEP _Pragma("GCC ivdep") - #define ALIGNED -#endif #+end_src ~qmckl_sm_naive_hpc~ is a high performance variation of @@ -534,7 +518,7 @@ Python script that generated C switch cases that call individual kernel instance #+NAME:naive_switch-case_generator #+begin_src python :noweb yes text=""" -case {Dim}: +case {Dim}: return qmckl_sm_naive_{Dim}(context, N_updates, Updates, @@ -604,7 +588,7 @@ qmckl_exit_code qmckl_sm_naive(const qmckl_context context, Slater_inv, determinant); #endif - + return QMCKL_FAILURE; } #+end_src @@ -695,16 +679,16 @@ end interface #+end_src *** Performance -This function performs best when there is only 1 rank-1 update in the update cycle. It is -not useful to use Sherman-Morrison with update splitting for these cycles since splitting +This function performs best when there is only 1 rank-1 update in the update cycle. It is +not useful to use Sherman-Morrison with update splitting for these cycles since splitting can never resolve a situation where applying the update causes singular behaviour. *** Tests -The tests for the kernels are executed on datasets that are extracted from a run of -QMC=Chem on Benzene (21 spin-up/21 spin down electrons) using 329 unique alpha determinants. -The tests are run such that the kernels reject the computed inverse whenever the computed -intermediate determinants or denominators are smaller than 1e-3. This is the default value in -QMC=Chem. The tests will return QMCKL_SUCCESS whenever all the elements of the final matrix -$R=S.S^-1 - 1$ are smaller than the given tolerance value of 1e-3, and will return +The tests for the kernels are executed on datasets that are extracted from a run of +QMC=Chem on Benzene (21 spin-up/21 spin down electrons) using 329 unique alpha determinants. +The tests are run such that the kernels reject the computed inverse whenever the computed +intermediate determinants or denominators are smaller than 1e-3. This is the default value in +QMC=Chem. The tests will return QMCKL_SUCCESS whenever all the elements of the final matrix +$R=S.S^-1 - 1$ are smaller than the given tolerance value of 1e-3, and will return QMCKL_FAILURE if the values are larger than this tolerance value. #+begin_src c :tangle (eval c_test) @@ -777,8 +761,8 @@ It has three extra parameters in its API: It is up to the user to decide what to do with these updates once the kernel returns. Normally ~qmckl_sm_splitting_core~ is used as the core part of a recursive function, as is done in ~qmckl_sm_splitting~ or as part of a more complex -kernel like ~qmckl_sherman_morrison_smw32s~. - +kernel like ~qmckl_sherman_morrison_smw32s~. + If the determinant is passed it will only be partially updated if there were any update splits. *** API @@ -922,7 +906,7 @@ integer function qmckl_sm_splitting_core_doc_f( & info = QMCKL_SUCCESS write(*,*) "Leaving 'qmckl_sm_splittinig_core_doc_f'" - + end function qmckl_sm_splitting_core_doc_f #+end_src @@ -1003,7 +987,7 @@ qmckl_exit_code qmckl_sm_splitting_core ( double* later_updates, uint64_t* later_index, uint64_t* later, - double* determinant ); + double* determinant ); #+end_src #+CALL: generate_c_header(table=qmckl_sm_splitting_core_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_core_hpc") @@ -1022,7 +1006,7 @@ qmckl_exit_code qmckl_sm_splitting_core_hpc ( double* later_updates, uint64_t* later_index, uint64_t* later, - double* determinant ); + double* determinant ); #+end_src #+CALL: generate_c_header(table=qmckl_sm_splitting_core_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_core_doc") @@ -1041,7 +1025,7 @@ qmckl_exit_code qmckl_sm_splitting_core_doc ( double* later_updates, uint64_t* later_index, uint64_t* later, - double* determinant ); + double* determinant ); #+end_src *** C sources @@ -1242,7 +1226,6 @@ case {Dim}: { later_index, later, determinant); - break; }""" result = [] for Dim in <>: @@ -1493,7 +1476,7 @@ integer recursive function qmckl_sm_splitting_doc_f( & real*8 , intent(inout) :: determinant integer , external :: qmckl_sm_splitting_core_doc_f - + integer*8 :: Later integer*8 , dimension(nupdates) :: Later_index real*8 , dimension(lds * nupdates) :: Later_updates @@ -1523,7 +1506,7 @@ integer recursive function qmckl_sm_splitting_doc_f( & Later_index, & Later, & determinant) - + if (Later > 0) then info = qmckl_sm_splitting_doc_f( & context, & @@ -1539,7 +1522,7 @@ integer recursive function qmckl_sm_splitting_doc_f( & info = QMCKL_SUCCESS write(*,*) "Leaving 'qmckl_sm_splitting_doc_f'" - + end function qmckl_sm_splitting_doc_f #+end_src @@ -1574,12 +1557,12 @@ integer(c_int32_t) function qmckl_sm_splitting_doc & integer(c_int32_t), external :: qmckl_sm_splitting_doc_f write(*,*) "Entering 'qmckl_sm_splitting_doc'" - + info = qmckl_sm_splitting_doc_f & (context, LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant) write(*,*) "Leaving 'qmckl_sm_splitting_doc'" - + end function qmckl_sm_splitting_doc #+end_src @@ -1598,7 +1581,7 @@ qmckl_exit_code qmckl_sm_splitting ( const uint64_t* Updates_index, const double breakdown, double* Slater_inv, - double* determinant ); + double* determinant ); #+end_src #+CALL: generate_private_c_header(table=qmckl_sm_splitting_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_hpc") @@ -1614,7 +1597,7 @@ qmckl_exit_code qmckl_sm_splitting_hpc ( const uint64_t* Updates_index, const double breakdown, double* Slater_inv, - double* determinant ); + double* determinant ); #+end_src #+CALL: generate_c_header(table=qmckl_sm_splitting_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_doc") @@ -1630,7 +1613,7 @@ qmckl_exit_code qmckl_sm_splitting_doc ( const uint64_t* Updates_index, const double breakdown, double* Slater_inv, - double* determinant ); + double* determinant ); #+end_src *** C source @@ -1722,8 +1705,6 @@ qmckl_exit_code qmckl_sm_splitting( const double breakdown, double* Slater_inv, double* determinant) { - - printf("Entering 'qmckl_sm_splitting'\n"); if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return qmckl_failwith( @@ -1754,11 +1735,9 @@ qmckl_exit_code qmckl_sm_splitting( breakdown, Slater_inv, determinant); - #endif + #endif - printf("Leaving 'qmckl_sm_splitting'\n"); - - return QMCKL_SUCCESS; + return QMCKL_SUCCESS; } #+end_src From 10ee05005052e1b6f472f377b13c9bcd7c3f9fb3 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 14 Sep 2023 09:53:17 +0200 Subject: [PATCH 2/5] Removed IVPDEP from SM --- org/qmckl_memory.org | 11 +++++------ org/qmckl_sherman_morrison_woodbury.org | 16 ---------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/org/qmckl_memory.org b/org/qmckl_memory.org index caf6acd..64c4a1c 100644 --- a/org/qmckl_memory.org +++ b/org/qmckl_memory.org @@ -125,11 +125,10 @@ void* qmckl_malloc(qmckl_context context, ~qmckl_context~. 4. The function then allocates memory: - If the ~HAVE_HPC~ macro is defined, the memory allocation is done using - the ~aligned_alloc~ function with a 64-byte alignment, rounding up the - requested size to the nearest multiple of 64 bytes. If the ~HAVE_HPC~ - macro is not defined, the memory allocation is done using the standard - ~malloc~ function. + If the ~HAVE_HPC~ and ~HAVE_ALIGNED_ALLOC~ macros are defined, the memory + allocation is done using the ~aligned_alloc~ function with a 64-byte alignment, + rounding up the requested size to the nearest multiple of 64 bytes. Else, the + memory allocation is done using the standard ~malloc~ function. 5 If the allocation fails, the function returns ~NULL~. @@ -154,7 +153,7 @@ void* qmckl_malloc(qmckl_context context, const qmckl_memory_info_struct info) { qmckl_context_struct* const ctx = (qmckl_context_struct*) context; /* Allocate memory and zero it */ -#ifdef HAVE_HPC +#if defined(HAVE_HPC) && defined(HAVE_ALIGNED_ALLOC) assert( ((info.size+64) >> 6) << 6 >= info.size ); void * pointer = aligned_alloc(64, ((info.size+64) >> 6) << 6 ); #else diff --git a/org/qmckl_sherman_morrison_woodbury.org b/org/qmckl_sherman_morrison_woodbury.org index 6e7d779..c910525 100644 --- a/org/qmckl_sherman_morrison_woodbury.org +++ b/org/qmckl_sherman_morrison_woodbury.org @@ -348,22 +348,6 @@ Common includes and macros used by all the Sherman-Morrison-Woodbury kernels. #include "assert.h" #include "stdio.h" -// Order important because -// __GNUC__ also set in ICC, ICX and CLANG -// __clang__ also set in ICX -#if defined(__INTEL_COMPILER) - #define IVDEP _Pragma("ivdep") - #define ALIGNED _Pragma("vector aligned") -#elif defined(__INTEL_LLVM_COMPILER) - #define IVDEP _Pragma("ivdep") - #define ALIGNED _Pragma("vector aligned") -#elif defined(__clang__) - #define IVDEP _Pragma("clang loop vectorize(enable)") - #define ALIGNED -#elif defined(__GNUC__) - #define IVDEP _Pragma("GCC ivdep") - #define ALIGNED -#endif #+end_src ~qmckl_sm_naive_hpc~ is a high performance variation of From 932263d22ffbbb20ec468e4a4fc1a5852c86e5ee Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 14 Sep 2023 09:54:50 +0200 Subject: [PATCH 3/5] Cleaning in SM --- org/qmckl_sherman_morrison_woodbury.org | 2 -- 1 file changed, 2 deletions(-) diff --git a/org/qmckl_sherman_morrison_woodbury.org b/org/qmckl_sherman_morrison_woodbury.org index c910525..68ba0b6 100644 --- a/org/qmckl_sherman_morrison_woodbury.org +++ b/org/qmckl_sherman_morrison_woodbury.org @@ -1224,7 +1224,6 @@ case {Dim}: { later_index, later, determinant); - break; }""" result = [] for Dim in <>: @@ -3062,7 +3061,6 @@ qmckl_exit_code qmckl_sm_splitting( determinant); #endif - return QMCKL_SUCCESS; } #+end_src From c66188e64194bbe85f19ee808610e11a55e4e2a1 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 14 Sep 2023 09:56:28 +0200 Subject: [PATCH 4/5] Cleaning in SM --- org/qmckl_mo.org | 2 -- 1 file changed, 2 deletions(-) diff --git a/org/qmckl_mo.org b/org/qmckl_mo.org index 9f12d40..d16939f 100644 --- a/org/qmckl_mo.org +++ b/org/qmckl_mo.org @@ -255,7 +255,6 @@ qmckl_exit_code qmckl_set_mo_basis_mo_num(qmckl_context context, const int64_t m <> - return QMCKL_SUCCESS; } #+end_src @@ -1087,7 +1086,6 @@ qmckl_exit_code qmckl_provide_mo_basis_mo_value(qmckl_context context) } else { rc = qmckl_provide_en_distance(context); if (rc != QMCKL_SUCCESS) { - return rc; return qmckl_failwith( context, QMCKL_NOT_PROVIDED, "qmckl_provide_mo_basis_mo_value", From 561373fe4fa25aaf9057451eeb75932cb94540c6 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Thu, 14 Sep 2023 11:00:24 +0200 Subject: [PATCH 5/5] Improved configure for nvc --- configure.ac | 13 ++++++++++++- org/qmckl_ao.org | 8 ++++---- org/qmckl_mo.org | 4 ++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index 7818cf3..7ffd937 100644 --- a/configure.ac +++ b/configure.ac @@ -304,10 +304,21 @@ AC_MSG_RESULT([$ivdep]) # Checking ALIGNED -AC_CHECK_FUNCS([aligned_alloc], [have_aligned_alloc=yes], [have_aligned_alloc=no]) +AC_MSG_CHECKING([for aligned_alloc]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +]], [[ + int main() { + void * pointer = aligned_alloc(64, 100); + free(pointer); + return 0; + } +]])], + [have_aligned_alloc=yes], [have_aligned_alloc=no +]) AS_IF([test "x$have_aligned_alloc" = "xyes"], [ AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if you have the aligned_alloc function.]) ]) +AC_MSG_RESULT([$have_aligned_alloc]) aligned="" AC_MSG_CHECKING([for vector aligned pragma]) diff --git a/org/qmckl_ao.org b/org/qmckl_ao.org index e61b7f2..36d9f6e 100644 --- a/org/qmckl_ao.org +++ b/org/qmckl_ao.org @@ -3846,7 +3846,7 @@ print ( "[7][4][26] : %e"% lf(a,x,y)) assert(qmckl_electron_provided(context)); - const int64_t point_num = elec_num; + int64_t point_num = elec_num; rc = qmckl_set_point(context, 'N', point_num, elec_coord, point_num*3); assert(rc == QMCKL_SUCCESS); @@ -4261,7 +4261,7 @@ print ( "[1][4][26] : %25.15e"% lf(a,x,y)) assert(qmckl_electron_provided(context)); - const int64_t point_num = elec_num; + int64_t point_num = elec_num; rc = qmckl_set_point(context, 'N', point_num, elec_coord, point_num*3); assert(rc == QMCKL_SUCCESS); @@ -6310,7 +6310,7 @@ double* elec_coord = &(chbrclf_elec_coord[0][0][0]); assert(qmckl_electron_provided(context)); -const int64_t point_num = elec_num; +int64_t point_num = elec_num; rc = qmckl_set_point(context, 'N', point_num, elec_coord, point_num*3); assert(rc == QMCKL_SUCCESS); @@ -7261,7 +7261,7 @@ double* elec_coord = &(chbrclf_elec_coord[0][0][0]); assert(qmckl_electron_provided(context)); -const int64_t point_num = elec_num; +int64_t point_num = elec_num; rc = qmckl_set_point(context, 'N', point_num, elec_coord, point_num*3); assert(rc == QMCKL_SUCCESS); diff --git a/org/qmckl_mo.org b/org/qmckl_mo.org index d16939f..40b3190 100644 --- a/org/qmckl_mo.org +++ b/org/qmckl_mo.org @@ -2810,11 +2810,11 @@ print ( "[4][1][15][14] : %25.15e"% lf(a,x,y)) int64_t elec_up_num = chbrclf_elec_up_num; int64_t elec_dn_num = chbrclf_elec_dn_num; double* elec_coord = &(chbrclf_elec_coord[0][0][0]); -const int64_t nucl_num = chbrclf_nucl_num; +int64_t nucl_num = chbrclf_nucl_num; const double* nucl_charge = chbrclf_charge; const double* nucl_coord = &(chbrclf_nucl_coord[0][0]); -const int64_t point_num = walk_num*elec_num; +int64_t point_num = walk_num*elec_num; rc = qmckl_set_electron_num (context, elec_up_num, elec_dn_num); assert (rc == QMCKL_SUCCESS);