1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2024-12-22 20:36:01 +01:00

Added IVDEP and ALIGNED in configure.ac

This commit is contained in:
Anthony Scemama 2023-09-14 09:41:15 +02:00
parent 7bec8b7984
commit fd2addb370
2 changed files with 121 additions and 54 deletions

View File

@ -246,6 +246,94 @@ int simd=1;
AC_MSG_RESULT([$SIMD_LENGTH])
AC_DEFINE_UNQUOTED([SIMD_LENGTH], [$SIMD_LENGTH], [Length of SIMD vectors])
# Checking IVDEP
#
# Selects the loop-dependence pragma accepted by the C compiler and exports it
# as the IVDEP macro. Candidates are tried in this order: Intel "ivdep", then
# "clang loop vectorize(enable)", then "GCC ivdep". IVDEP is defined empty
# when none of them is accepted.
#
# Two details make the probes meaningful:
#  - Each test body is only the annotated loop. The program generator already
#    wraps the body in main, so writing another main inside it would create a
#    nested function, which is a GNU extension that clang rejects.
#  - Compilers merely warn about pragmas they do not know, so the probes are
#    compiled with -Wall -Werror to turn an unrecognized pragma into a failed
#    probe. The caller's CFLAGS are restored afterwards. If a compiler does
#    not accept these flags, every probe fails and IVDEP safely stays empty.
ivdep=""
AC_MSG_CHECKING([for ivdep pragma])
qmckl_ivdep_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -Wall -Werror"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdio.h>
]], [[
#pragma ivdep
for (int i = 0; i < 10; ++i) {
  printf("Testing: %d\n", i);
}
]])],
[ivdep='_Pragma("ivdep")'], [
])
AS_IF([test "x$ivdep" = "x"], [
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdio.h>
]], [[
#pragma clang loop vectorize(enable)
for (int i = 0; i < 10; ++i) {
  printf("Testing: %d\n", i);
}
]])],
[ivdep='_Pragma("clang loop vectorize(enable)")'], [
])
])
AS_IF([test "x$ivdep" = "x"], [
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdio.h>
]], [[
#pragma GCC ivdep
for (int i = 0; i < 10; ++i) {
  printf("Testing: %d\n", i);
}
]])],
[ivdep='_Pragma("GCC ivdep")'], [
])
])
CFLAGS="$qmckl_ivdep_save_CFLAGS"
AC_DEFINE_UNQUOTED([IVDEP], [$ivdep], [IVDEP pragma])
AS_IF([test "x$ivdep" = "x"], [
ivdep="no"
])
AC_MSG_RESULT([$ivdep])
# Checking ALIGNED
#
# Records whether aligned_alloc is available, then probes whether the compiler
# accepts the "vector aligned" pragma and exports the result as the ALIGNED
# macro. ALIGNED is forced empty when aligned_alloc is missing, presumably
# because buffer alignment cannot be guaranteed in that case.
#
# The test body is only the annotated loop: the program generator already
# wraps it in main, so a second main would be a nested function that clang
# rejects. The probe is compiled with -Wall -Werror because an unknown pragma
# is otherwise just a warning and the probe would succeed on every compiler.
# The array is read back in the return statement so that these flags do not
# trip on a set-but-unused variable. CFLAGS are restored afterwards.
AC_CHECK_FUNCS([aligned_alloc], [have_aligned_alloc=yes], [have_aligned_alloc=no])
AS_IF([test "x$have_aligned_alloc" = "xyes"], [
AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if you have the aligned_alloc function.])
])
aligned=""
AC_MSG_CHECKING([for vector aligned pragma])
qmckl_aligned_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -Wall -Werror"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
]], [[
double __attribute__((aligned(8))) a[10] ;
#pragma vector aligned
for (int i = 0; i < 10; ++i) {
  a[i] = (double) i;
}
return (int) a[9];
]])],
[aligned='_Pragma("vector aligned")'], [
])
CFLAGS="$qmckl_aligned_save_CFLAGS"
AS_IF([test "x$have_aligned_alloc" = "xno"], [
aligned=""
])
AC_DEFINE_UNQUOTED([ALIGNED], [$aligned], [VECTOR ALIGNED pragma])
AS_IF([test "x$aligned" = "x"], [
aligned="no"
])
AC_MSG_RESULT([$aligned])
# QMCKLDGEMM

View File

@ -31,7 +31,7 @@ This is the range that determines how many high performance kernel instances
#+begin_src python :noweb yes :exports none
range(2, 22)
#+end_src
* Naïve Sherman-Morrison
** ~qmckl_sm_naive~
@ -109,7 +109,7 @@ subroutine convert(upds, s_inv, Updates, Inverse, nupdates, lds, dim)
implicit none
integer*8 , intent(in) :: lds, dim, nupdates
real*8 , intent(in) :: upds(nupdates * lds)
real*8 , intent(in) :: s_inv(dim * lds)
real*8 , intent(in) :: s_inv(dim * lds)
real*8 , intent(out) , dimension(lds, nupdates) :: Updates
real*8 , intent(out) , dimension(dim, lds) :: Inverse
@ -136,7 +136,7 @@ subroutine copy_back_inv(Inverse, s_inv, lds, dim)
implicit none
integer*8 , intent(in) :: lds, dim
real*8 , intent(in) , dimension(dim, lds) :: Inverse
real*8 , intent(out) :: s_inv(dim * lds)
real*8 , intent(out) :: s_inv(dim * lds)
integer*8 :: i, j
@ -154,7 +154,7 @@ subroutine copy_back_lu(Later_updates, later_upds, lds, nupdates)
implicit none
integer*8 , intent(in) :: lds, nupdates
real*8 , intent(in) , dimension(lds, nupdates) :: Later_updates
real*8 , intent(out) :: later_upds(nupdates * lds)
real*8 , intent(out) :: later_upds(nupdates * lds)
integer*8 :: i, j
@ -300,7 +300,7 @@ qmckl_exit_code qmckl_sm_naive (
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant );
double* determinant );
#+end_src
#+CALL: generate_private_c_header(table=qmckl_sm_naive_args,rettyp=get_value("CRetType"),fname="qmckl_sm_naive_hpc")
@ -316,7 +316,7 @@ qmckl_exit_code qmckl_sm_naive_hpc (
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant );
double* determinant );
#+end_src
#+CALL: generate_c_header(table=qmckl_sm_naive_args,rettyp=get_value("CRetType"),fname="qmckl_sm_naive_doc")
@ -332,7 +332,7 @@ qmckl_exit_code qmckl_sm_naive_doc (
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant );
double* determinant );
#+end_src
*** C sources
@ -345,22 +345,6 @@ Common includes and macros used by all the Sherman-Morrison-Woodbury kernels.
#include "assert.h"
#include "stdio.h"
// Vectorization hint macros placed in front of loops.
//   IVDEP   : loop-dependence / vectorization pragma of the active compiler.
//   ALIGNED : "vector aligned" pragma; only the Intel compilers get one here.
//
// Branch order matters: ICC, ICX and Clang all define __GNUC__, and ICX also
// defines __clang__, so the most specific compiler must be tested first.
#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
#define IVDEP _Pragma("ivdep")
#define ALIGNED _Pragma("vector aligned")
#elif defined(__clang__)
#define IVDEP _Pragma("clang loop vectorize(enable)")
#define ALIGNED
#elif defined(__GNUC__)
#define IVDEP _Pragma("GCC ivdep")
#define ALIGNED
#else
// Unknown compiler: expand to nothing so annotated loops still compile.
#define IVDEP
#define ALIGNED
#endif
#+end_src
~qmckl_sm_naive_hpc~ is a high performance variation of
@ -534,7 +518,7 @@ Python script that generated C switch cases that call individual kernel instance
#+NAME:naive_switch-case_generator
#+begin_src python :noweb yes
text="""
case {Dim}:
case {Dim}:
return qmckl_sm_naive_{Dim}(context,
N_updates,
Updates,
@ -604,7 +588,7 @@ qmckl_exit_code qmckl_sm_naive(const qmckl_context context,
Slater_inv,
determinant);
#endif
return QMCKL_FAILURE;
}
#+end_src
@ -695,16 +679,16 @@ end interface
#+end_src
*** Performance
This function performs best when there is only 1 rank-1 update in the update cycle. It is
not useful to use Sherman-Morrison with update splitting for these cycles since splitting
This function performs best when there is only 1 rank-1 update in the update cycle. It is
not useful to use Sherman-Morrison with update splitting for these cycles since splitting
can never resolve a situation where applying the update causes singular behaviour.
*** Tests
The tests for the kernels are executed on datasets that are extracted from a run of
QMC=Chem on Benzene (21 spin-up/21 spin-down electrons) using 329 unique alpha determinants.
The tests are run such that the kernels reject the computed inverse whenever the computed
intermediate determinants or denominators are smaller than 1e-3. This is the default value in
QMC=Chem. The tests will return QMCKL_SUCCESS whenever all the elements of the final matrix
$R=S.S^{-1} - 1$ are smaller than the given tolerance value of 1e-3, and will return
The tests for the kernels are executed on datasets that are extracted from a run of
QMC=Chem on Benzene (21 spin-up/21 spin-down electrons) using 329 unique alpha determinants.
The tests are run such that the kernels reject the computed inverse whenever the computed
intermediate determinants or denominators are smaller than 1e-3. This is the default value in
QMC=Chem. The tests will return QMCKL_SUCCESS whenever all the elements of the final matrix
$R=S.S^{-1} - 1$ are smaller than the given tolerance value of 1e-3, and will return
QMCKL_FAILURE if the values are larger than this tolerance value.
#+begin_src c :tangle (eval c_test)
@ -777,8 +761,8 @@ It has three extra parameters in its API:
It is up to the user to decide what to do with these updates once the kernel returns. Normally ~qmckl_sm_splitting_core~ is
used as the core part of a recursive function, as is done in ~qmckl_sm_splitting~ or as part of a more complex
kernel like ~qmckl_sherman_morrison_smw32s~.
kernel like ~qmckl_sherman_morrison_smw32s~.
If the determinant is passed it will only be partially updated if there were any update splits.
*** API
@ -922,7 +906,7 @@ integer function qmckl_sm_splitting_core_doc_f( &
info = QMCKL_SUCCESS
write(*,*) "Leaving 'qmckl_sm_splittinig_core_doc_f'"
end function qmckl_sm_splitting_core_doc_f
#+end_src
@ -1003,7 +987,7 @@ qmckl_exit_code qmckl_sm_splitting_core (
double* later_updates,
uint64_t* later_index,
uint64_t* later,
double* determinant );
double* determinant );
#+end_src
#+CALL: generate_c_header(table=qmckl_sm_splitting_core_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_core_hpc")
@ -1022,7 +1006,7 @@ qmckl_exit_code qmckl_sm_splitting_core_hpc (
double* later_updates,
uint64_t* later_index,
uint64_t* later,
double* determinant );
double* determinant );
#+end_src
#+CALL: generate_c_header(table=qmckl_sm_splitting_core_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_core_doc")
@ -1041,7 +1025,7 @@ qmckl_exit_code qmckl_sm_splitting_core_doc (
double* later_updates,
uint64_t* later_index,
uint64_t* later,
double* determinant );
double* determinant );
#+end_src
*** C sources
@ -1242,7 +1226,6 @@ case {Dim}: {
later_index,
later,
determinant);
break;
}"""
result = []
for Dim in <<kernel_generator_range>>:
@ -1493,7 +1476,7 @@ integer recursive function qmckl_sm_splitting_doc_f( &
real*8 , intent(inout) :: determinant
integer , external :: qmckl_sm_splitting_core_doc_f
integer*8 :: Later
integer*8 , dimension(nupdates) :: Later_index
real*8 , dimension(lds * nupdates) :: Later_updates
@ -1523,7 +1506,7 @@ integer recursive function qmckl_sm_splitting_doc_f( &
Later_index, &
Later, &
determinant)
if (Later > 0) then
info = qmckl_sm_splitting_doc_f( &
context, &
@ -1539,7 +1522,7 @@ integer recursive function qmckl_sm_splitting_doc_f( &
info = QMCKL_SUCCESS
write(*,*) "Leaving 'qmckl_sm_splitting_doc_f'"
end function qmckl_sm_splitting_doc_f
#+end_src
@ -1574,12 +1557,12 @@ integer(c_int32_t) function qmckl_sm_splitting_doc &
integer(c_int32_t), external :: qmckl_sm_splitting_doc_f
write(*,*) "Entering 'qmckl_sm_splitting_doc'"
info = qmckl_sm_splitting_doc_f &
(context, LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant)
write(*,*) "Leaving 'qmckl_sm_splitting_doc'"
end function qmckl_sm_splitting_doc
#+end_src
@ -1598,7 +1581,7 @@ qmckl_exit_code qmckl_sm_splitting (
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant );
double* determinant );
#+end_src
#+CALL: generate_private_c_header(table=qmckl_sm_splitting_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_hpc")
@ -1614,7 +1597,7 @@ qmckl_exit_code qmckl_sm_splitting_hpc (
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant );
double* determinant );
#+end_src
#+CALL: generate_c_header(table=qmckl_sm_splitting_args,rettyp=get_value("CRetType"),fname="qmckl_sm_splitting_doc")
@ -1630,7 +1613,7 @@ qmckl_exit_code qmckl_sm_splitting_doc (
const uint64_t* Updates_index,
const double breakdown,
double* Slater_inv,
double* determinant );
double* determinant );
#+end_src
*** C source
@ -1722,8 +1705,6 @@ qmckl_exit_code qmckl_sm_splitting(
const double breakdown,
double* Slater_inv,
double* determinant) {
printf("Entering 'qmckl_sm_splitting'\n");
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return qmckl_failwith(
@ -1754,11 +1735,9 @@ qmckl_exit_code qmckl_sm_splitting(
breakdown,
Slater_inv,
determinant);
#endif
#endif
printf("Leaving 'qmckl_sm_splitting'\n");
return QMCKL_SUCCESS;
return QMCKL_SUCCESS;
}
#+end_src