diff --git a/configure.ac b/configure.ac index d3c9471..055ca86 100644 --- a/configure.ac +++ b/configure.ac @@ -310,20 +310,21 @@ AS_IF([test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [ case $CC in *gcc*) - CFLAGS="$CFLAGS -fopenacc" + CFLAGS="$CFLAGS -fopenmp" + LIBS="$LIBS -lcublas" ;; *nvc*) - CFLAGS="$CFLAGS -acc=gpu" + CFLAGS="$CFLAGS -mp=gpu -cudalib=cublas" ;; esac case $FC in *gfortran*) - FCFLAGS="$FCFLAGS -fopenacc" + FCFLAGS="$FCFLAGS -fopenmp" ;; *nvfortran*) - FCFLAGS="$FCFLAGS -acc=gpu" + FCFLAGS="$FCFLAGS -mp=gpu -cudalib=cublas" ;; esac ]) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index e1fc423..1e17c11 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -5870,14 +5870,15 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context, #+begin_src c :comments org :tangle (eval c) :noweb yes #ifdef HAVE_OPENACC_OFFLOAD -qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context, - const int64_t cord_num, - const int64_t elec_num, - const int64_t nucl_num, - const int64_t walk_num, - const double* een_rescaled_e, - const double* een_rescaled_n, - double* const tmp_c ) +qmckl_exit_code +qmckl_compute_tmp_c_acc_offload (const qmckl_context context, + const int64_t cord_num, + const int64_t elec_num, + const int64_t nucl_num, + const int64_t walk_num, + const double* een_rescaled_e, + const double* een_rescaled_n, + double* const tmp_c ) { if (context == QMCKL_NULL_CONTEXT) { @@ -6062,6 +6063,7 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context, #+begin_src c :comments org :tangle (eval c) :noweb yes #ifdef HAVE_CUBLAS_OFFLOAD +qmckl_exit_code qmckl_compute_tmp_c_cublas_offload (const qmckl_context context, const int64_t cord_num, const int64_t elec_num, @@ -6116,16 +6118,19 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context, const int64_t bf = elec_num*nucl_num*(cord_num+1); const int64_t cf = bf; + info = QMCKL_SUCCESS; + + #pragma omp target enter data 
map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) { - +#pragma omp target teams distribute parallel for collapse(2) for (int nw=0; nw < walk_num; ++nw) { for (int i=0; i