diff --git a/configure.ac b/configure.ac index fe54fc2..ffa8f99 100644 --- a/configure.ac +++ b/configure.ac @@ -224,6 +224,47 @@ AS_IF([test "$HAVE_HPC" = "yes"], [ AC_DEFINE([HAVE_HPC], [1], [If defined, activate HPC routines]) ]) +# Enable Verificarlo tests +AC_ARG_ENABLE([vfc_ci], +[ --enable-vfc_ci Build the library with vfc_ci support], +[case "${enableval}" in + yes) vfc_ci=true && FCFLAGS="-D VFC_CI $FCFLAGS" && CFLAGS="-D VFC_CI $CFLAGS";; + no) vfc_ci=false ;; + *) AC_MSG_ERROR([bad value ${enableval} for --enable_vfc_ci]) ;; +esac],[vfc_ci=false]) +AM_CONDITIONAL([VFC_CI], [test x$vfc_ci = xtrue]) + +if test "$FC" = "verificarlo-f"; then + AC_MSG_NOTICE(verificarlo-f detected) + # Arguments order is important here + FCFLAGS="-Mpreprocess $FCFLAGS" +fi + +## Enable GPU offloading + +# OpenACC offloading +AC_ARG_ENABLE(openacc-offload, [AS_HELP_STRING([--openacc-offload],[Use OpenACC-offloaded functions])], HAVE_OPENACC_OFFLOAD=$enableval, HAVE_OPENACC_OFFLOAD=no) +AS_IF([test "$HAVE_OPENACC_OFFLOAD" = "yes"], [ + AC_DEFINE([HAVE_OPENACC_OFFLOAD], [1], [If defined, activate OpenACC-offloaded routines]) + CFLAGS="$OFFLOAD_FLAGS $OFFLOAD_CFLAGS $CFLAGS" + FCFLAGS="$OFFLOAD_FLAGS $OFFLOAD_FCFLAGS -DHAVE_OPENACC_OFFLOAD $FCFLAGS" +]) + +# cuBLAS offloading +AC_ARG_ENABLE(cublas-offload, [AS_HELP_STRING([--cublas-offload],[Use cuBLAS-offloaded functions])], HAVE_CUBLAS_OFFLOAD=$enableval, HAVE_CUBLAS_OFFLOAD=no) +AS_IF([test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [ + AC_DEFINE([HAVE_CUBLAS_OFFLOAD], [1], [If defined, activate cuBLAS-offloaded routines]) + FCFLAGS="-DHAVE_CUBLAS_OFFLOAD" +]) + +# General offload +AS_IF([test "$HAVE_OPENACC_OFFLOAD" = "yes" || test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [ + CFLAGS="$OFFLOAD_FLAGS $OFFLOAD_CFLAGS $CFLAGS" + FCFLAGS="$OFFLOAD_FLAGS $OFFLOAD_FCFLAGS $FCFLAGS" +]) + +## + AC_ARG_ENABLE(debug, [AS_HELP_STRING([--enable-debug],[compile for debugging])], ok=$enableval, ok=no) if test "$ok" = "yes"; then if test "$GCC" = "yes"; then @@ -319,21 +360,6 @@ if test "x${QMCKL_DEVEL}" != "x"; then fi -# Enable Verificarlo tests -AC_ARG_ENABLE([vfc_ci], -[ --enable-vfc_ci Build the library with vfc_ci support], -[case "${enableval}" in - yes) vfc_ci=true && FCFLAGS="-D VFC_CI $FCFLAGS" && CFLAGS="-D VFC_CI $CFLAGS";; - no) vfc_ci=false ;; - *) AC_MSG_ERROR([bad value ${enableval} for --enable_vfc_ci]) ;; -esac],[vfc_ci=false]) -AM_CONDITIONAL([VFC_CI], [test x$vfc_ci = xtrue]) - -if test "$FC" = "verificarlo-f"; then - AC_MSG_NOTICE(verificarlo-f detected) - # Arguments order is important here - FCFLAGS="-Mpreprocess $FCFLAGS" -fi #PKG-CONFIG #mkl-dynamic-lp64-seq @@ -369,6 +395,8 @@ LDFLAGS:........: ${LDFLAGS} LIBS............: ${LIBS} USE CHAMELEON...: ${with_chameleon} HPC version.....: ${HAVE_HPC} +OpenACC offload.: ${HAVE_OPENACC_OFFLOAD} +cuBLAS offload..: ${HAVE_CUBLAS_OFFLOAD} Package features: ${ARGS} diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index 6eaad62..c602d84 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -151,6 +151,7 @@ int main() { | ~factor_en_deriv_e_date~ | ~uint64_t~ | out | Keep track of the date for the en derivative | | ~factor_een_deriv_e~ | ~double[4][nelec][walk_num]~ | out | Derivative of the Jastrow factor: electron-electron-nucleus part | | ~factor_een_deriv_e_date~ | ~uint64_t~ | out | Keep track of the date for the een derivative | + | ~offload_type~ | ~qmckl_jastrow_offload_type~ | in | Enum type to change offload type at runtime | computed data: @@ -327,6 +328,14 @@ kappa_inv = 1.0/kappa ** Data structure +#+begin_src c :comments org :tangle (eval h_type) +typedef enum qmckl_jastrow_offload_type{ + OFFLOAD_NONE, + OFFLOAD_OPENACC, + OFFLOAD_CUBLAS +} qmckl_jastrow_offload_type; +#+end_src + #+begin_src c :comments org :tangle (eval h_private_type) typedef struct qmckl_jastrow_struct{ int32_t uninitialized; @@ -372,6 +381,7 @@ typedef struct qmckl_jastrow_struct{ uint64_t een_rescaled_n_deriv_e_date; bool provided; char * type; + qmckl_jastrow_offload_type offload_type; } qmckl_jastrow_struct; #+end_src @@ -416,6 +426,7 @@ qmckl_exit_code qmckl_get_jastrow_type_nucl_vector (qmckl_context context, int qmckl_exit_code qmckl_get_jastrow_aord_vector (qmckl_context context, double * const aord_vector, const int64_t size_max); qmckl_exit_code qmckl_get_jastrow_bord_vector (qmckl_context context, double * const bord_vector, const int64_t size_max); qmckl_exit_code qmckl_get_jastrow_cord_vector (qmckl_context context, double * const cord_vector, const int64_t size_max); +qmckl_exit_code qmckl_get_jastrow_offload_type (qmckl_context context, qmckl_jastrow_offload_type * const offload_type); #+end_src Along with these core functions, calculation of the jastrow factor @@ -713,6 +724,32 @@ qmckl_get_jastrow_cord_vector (const qmckl_context context, return QMCKL_SUCCESS; } +qmckl_exit_code qmckl_get_jastrow_offload_type (const qmckl_context context, qmckl_jastrow_offload_type* const offload_type) { + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return (char) 0; + } + + if (offload_type == NULL) { + return qmckl_failwith( context, + QMCKL_INVALID_ARG_2, + "qmckl_get_jastrow_offload_type", + "offload_type is a null pointer"); + } + + qmckl_context_struct* const ctx = (qmckl_context_struct* const) context; + assert (ctx != NULL); + + int32_t mask = 1 << 0; + + if ( (ctx->jastrow.uninitialized & mask) != 0) { + return QMCKL_NOT_PROVIDED; + } + + *offload_type = ctx->jastrow.offload_type; + return QMCKL_SUCCESS; +} + #+end_src ** Initialization functions @@ -727,6 +764,7 @@ qmckl_exit_code qmckl_set_jastrow_type_nucl_vector (qmckl_context context, con qmckl_exit_code qmckl_set_jastrow_aord_vector (qmckl_context context, const double * aord_vector, const int64_t size_max); qmckl_exit_code qmckl_set_jastrow_bord_vector (qmckl_context context, const double * bord_vector, const int64_t size_max); qmckl_exit_code qmckl_set_jastrow_cord_vector (qmckl_context context, const double * cord_vector, const int64_t size_max); +qmckl_exit_code qmckl_set_jastrow_offload_type (qmckl_context context, const qmckl_jastrow_offload_type offload_type); #+end_src #+NAME:pre2 @@ -1063,6 +1101,14 @@ qmckl_set_jastrow_cord_vector(qmckl_context context, <> } +qmckl_exit_code +qmckl_set_jastrow_offload_type(qmckl_context context, const qmckl_jastrow_offload_type offload_type) +{ +<> + ctx->jastrow.offload_type = offload_type; + return QMCKL_SUCCESS; +} + #+end_src When the required information is completely entered, other data structures are @@ -2560,6 +2606,7 @@ qmckl_exit_code qmckl_compute_factor_en ( double x, x1, power_ser; + if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } @@ -4812,7 +4859,6 @@ qmckl_exit_code qmckl_provide_lkpm_combined_index(qmckl_context context) qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) { - if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return QMCKL_NULL_CONTEXT; } @@ -4844,16 +4890,63 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) ctx->jastrow.tmp_c = tmp_c; } - rc = qmckl_compute_tmp_c(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walk_num, - ctx->jastrow.een_rescaled_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.tmp_c); - if (rc != QMCKL_SUCCESS) { - return rc; + /* Choose the correct compute function (depending on offload type) */ + switch(ctx->jastrow.offload_type) { + case OFFLOAD_OPENACC: + #ifdef HAVE_OPENACC_OFFLOAD + rc = + qmckl_compute_tmp_c_acc_offload(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.tmp_c); + #else + rc = qmckl_compute_tmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.tmp_c); + + #endif + break; + case OFFLOAD_CUBLAS: + #ifdef HAVE_CUBLAS_OFFLOAD + rc = + qmckl_compute_tmp_c_cublas_offload(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.tmp_c); + #else + rc = qmckl_compute_tmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.tmp_c); + #endif + break; + default: + rc = qmckl_compute_tmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.tmp_c); + break; } ctx->jastrow.tmp_c_date = ctx->date; @@ -4864,7 +4957,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context) qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) { - if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return QMCKL_NULL_CONTEXT; } @@ -4896,14 +4988,61 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context) ctx->jastrow.dtmp_c = dtmp_c; } - rc = qmckl_compute_dtmp_c(context, - ctx->jastrow.cord_num, - ctx->electron.num, - ctx->nucleus.num, - ctx->electron.walk_num, - ctx->jastrow.een_rescaled_e_deriv_e, - ctx->jastrow.een_rescaled_n, - ctx->jastrow.dtmp_c); + switch(ctx->jastrow.offload_type) { + case OFFLOAD_OPENACC: + #ifdef HAVE_OPENACC_OFFLOAD + rc = qmckl_compute_dtmp_c_acc_offload(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e_deriv_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.dtmp_c); + #else + rc = qmckl_compute_dtmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e_deriv_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.dtmp_c); + #endif + break; + case OFFLOAD_CUBLAS: + #ifdef HAVE_CUBLAS_OFFLOAD + rc = qmckl_compute_dtmp_c_acc_offload(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e_deriv_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.dtmp_c); + #else + rc = qmckl_compute_dtmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e_deriv_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.dtmp_c); + #endif + break; + default: + rc = qmckl_compute_dtmp_c(context, + ctx->jastrow.cord_num, + ctx->electron.num, + ctx->nucleus.num, + ctx->electron.walk_num, + ctx->jastrow.een_rescaled_e_deriv_e, + ctx->jastrow.een_rescaled_n, + ctx->jastrow.dtmp_c); + break; + } + if (rc != QMCKL_SUCCESS) { return rc; } @@ -5499,7 +5638,226 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context, #endif } #+end_src - + +*** Compute tmp_c (OpenACC offload) + :PROPERTIES: + :Name: qmckl_compute_tmp_c_acc_offload + :CRetType: qmckl_exit_code + :FRetType: qmckl_exit_code + :END: + + #+NAME: qmckl_factor_tmp_c_acc_offload_args + | Variable | Type | In/Out | Description | + |------------------+------------------------------------------------------------------+--------+-----------------------------------| + | ~context~ | ~qmckl_context~ | in | Global state | + | ~cord_num~ | ~int64_t~ | in | Order of polynomials | + | ~elec_num~ | ~int64_t~ | in | Number of electrons | + | ~nucl_num~ | ~int64_t~ | in | Number of nucleii | + | ~walk_num~ | ~int64_t~ | in | Number of walkers | + | ~een_rescaled_e~ | ~double[walk_num][0:cord_num][elec_num][elec_num]~ | in | Electron-electron rescaled factor | + | ~een_rescaled_n~ | ~double[walk_num][0:cord_num][nucl_num][elec_num]~ | in | Electron-nucleus rescaled factor | + | ~tmp_c~ | ~double[walk_num][0:cord_num-1][0:cord_num][nucl_num][elec_num]~ | out | vector of non-zero coefficients | + + + #+begin_src c :comments org :tangle (eval c) :noweb yes +#ifdef HAVE_OPENACC_OFFLOAD +qmckl_exit_code qmckl_compute_tmp_c_acc_offload ( + const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ) { + + if (context == QMCKL_NULL_CONTEXT) { + return QMCKL_INVALID_CONTEXT; + } + + if (cord_num <= 0) { + return QMCKL_INVALID_ARG_2; + } + + if (elec_num <= 0) { + return QMCKL_INVALID_ARG_3; + } + + if (nucl_num <= 0) { + return QMCKL_INVALID_ARG_4; + } + + // Compute array access strides: + // For tmp_c... + int stride_k_c = elec_num; + int stride_j_c = stride_k_c * nucl_num; + int stride_i_c = stride_j_c * (cord_num+1); + int stride_nw_c = stride_i_c * cord_num; + // For een_rescaled_e... + int stride_m_e = elec_num; + int stride_i_e = stride_m_e * elec_num; + int stride_nw_e = stride_i_e * (cord_num+1); + // For een_rescaled_n... + int stride_k_n = elec_num; + int stride_j_n = stride_k_n * nucl_num; + int stride_nw_n = stride_j_n * (cord_num+1); + + #pragma acc parallel + #pragma acc loop independent gang worker vector collapse(5) + for (int nw=0; nw < walk_num; ++nw) { + for (int i=0; i