Merge pull request #1 from TREX-CoE/gpu

Gpu
2025-01-03 18:16:28 +01:00 · 2022-04-06 17:17:45 +02:00 · 2022-04-06 17:17:45 +02:00 · cba6477e4a
commit cba6477e4a
parent fe277b7a6e 7aad2a79a2
7 changed files with 1752 additions and 757 deletions
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@ -2,9 +2,7 @@ name: test-build

 on:
  push:
-    branches: [ master ]
  pull_request:
-    branches: [ master ]

 jobs:
  x86_ubuntu:
--- a/configure.ac
+++ b/configure.ac
@ -93,6 +93,7 @@ AC_PROG_F77
 m4_version_prereq([2.70],[], [AC_PROG_CC_C99])
 AS_IF([test "$ac_cv_prog_cc_c99" = "no"], [AC_MSG_ERROR([The compiler does not support C99])])
 AC_PROG_CC_C_O
+AM_PROG_CC_C_O
 AC_PROG_FC
 AC_PROG_FC_C_O
 AC_FC_PP_DEFINE
@ -137,10 +138,10 @@ case "$with_chameleon" in
      [PKG_CFLAGS="$PKG_CFLAGS $LIBCHAMELEON_CFLAGS"
      PKG_LIBS="$PKG_LIBS $LIBCHAMELEON_LIBS"]
                      ,[
-    
+
      ## something went wrong.
      ## try to find the package without pkg-config
-    
+
      ## check that the library is actually new enough.
      ## by testing for a 1.0.0+ function which we use
      AC_CHECK_LIB(chameleon,CHAMELEON_finalize,[LIBCHAMELEON_LIBS="-lchameleon"])
@ -205,15 +206,18 @@ case $FC in
      ;;

 *nvfortran*)
-      FCFLAGS="$FCFLAGS -fPIC -Mnomain -mp -target=gpu"
+      FCFLAGS="$FCFLAGS -fPIC -Mnomain"
      ;;

 esac

 case $CC in

+  *gcc*)
+        CFLAGS="$CFLAGS -fPIC"
+        ;;
  *nvc*)
-        CFLAGS="$CFLAGS -fPIC -mp -target=gpu"
+        CFLAGS="$CFLAGS -fPIC"
        ;;
 esac

@ -224,6 +228,109 @@ AS_IF([test "$HAVE_HPC" = "yes"], [
   AC_DEFINE([HAVE_HPC], [1], [If defined, activate HPC routines])
 ])

+# Enable Verificarlo tests
+AC_ARG_ENABLE([vfc_ci],
+[  --enable-vfc_ci    Build the library with vfc_ci support],
+[case "${enableval}" in
+  yes) vfc_ci=true &&  FCFLAGS="-D VFC_CI $FCFLAGS" && CFLAGS="-D VFC_CI $CFLAGS";;
+  no)  vfc_ci=false ;;
+  *) AC_MSG_ERROR([bad value ${enableval} for --enable_vfc_ci]) ;;
+esac],[vfc_ci=false])
+AM_CONDITIONAL([VFC_CI], [test x$vfc_ci = xtrue])
+
+if test "$FC" = "verificarlo-f"; then
+  AC_MSG_NOTICE(verificarlo-f detected)
+  # Arguments order is important here
+  FCFLAGS="-Mpreprocess $FCFLAGS"
+fi
+
+## Enable GPU offloading
+
+# GPU offloading
+AC_ARG_ENABLE(gpu, [AS_HELP_STRING([--enable-gpu],[openmp|openacc : Use GPU-offloaded functions])], enable_gpu=$enableval, enable_gpu=no)
+AS_IF([test "$enable_gpu" = "yes"], [enable_gpu="openmp"])
+
+# OpenMP offloading
+HAVE_OPENMP_OFFLOAD="no"
+AS_IF([test "$enable_gpu" = "openmp"], [
+  AC_DEFINE([HAVE_OPENMP_OFFLOAD], [1], [If defined, activate OpenMP-offloaded routines])
+  HAVE_OPENMP_OFFLOAD="yes"
+  case $CC in
+
+    *gcc*)
+          CFLAGS="$CFLAGS -fopenmp"
+          ;;
+    *nvc*)
+          CFLAGS="$CFLAGS -mp=gpu"
+          ;;
+  esac
+
+  case $FC in
+
+    *gfortran*)
+          FCFLAGS="$FCFLAGS -fopenmp"
+          ;;
+    *nvfortran*)
+          FCFLAGS="$FCFLAGS -mp=gpu"
+          ;;
+  esac]
+)
+
+# OpenACC offloading
+HAVE_OPENACC_OFFLOAD="no"
+AS_IF([test "$enable_gpu" = "openacc"], [
+  AC_DEFINE([HAVE_OPENACC_OFFLOAD], [1], [If defined, activate OpenACC-offloaded routines])
+  HAVE_OPENACC_OFFLOAD="yes"
+  case $CC in
+
+    *gcc*)
+          CFLAGS="$CFLAGS -fopenacc"
+          ;;
+    *nvc*)
+          CFLAGS="$CFLAGS -acc=gpu"
+          ;;
+  esac
+
+  case $FC in
+
+    *gfortran*)
+          FCFLAGS="$FCFLAGS -fopenacc"
+          ;;
+    *nvfortran*)
+          FCFLAGS="$FCFLAGS -acc=gpu"
+          ;;
+  esac
+
+])
+
+# cuBLAS offloading
+AC_ARG_ENABLE(cublas, [AS_HELP_STRING([--enable-cublas],[Use cuBLAS-offloaded functions])], HAVE_CUBLAS_OFFLOAD=$enableval, HAVE_CUBLAS_OFFLOAD=no)
+AS_IF([test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [
+  AC_DEFINE([HAVE_CUBLAS_OFFLOAD], [1], [If defined, activate cuBLAS-offloaded routines])
+  case $CC in
+
+    *gcc*)
+          CFLAGS="$CFLAGS -fopenacc"
+          ;;
+    *nvc*)
+          CFLAGS="$CFLAGS -acc=gpu"
+          ;;
+  esac
+
+  case $FC in
+
+    *gfortran*)
+          FCFLAGS="$FCFLAGS -fopenacc"
+          ;;
+    *nvfortran*)
+          FCFLAGS="$FCFLAGS -acc=gpu"
+          ;;
+  esac
+])
+
+
+##
+
 AC_ARG_ENABLE(debug, [AS_HELP_STRING([--enable-debug],[compile for debugging])], ok=$enableval, ok=no)
 if test "$ok" = "yes"; then
        if test "$GCC" = "yes"; then
@ -319,21 +426,6 @@ if test "x${QMCKL_DEVEL}" != "x"; then

 fi

-# Enable Verificarlo tests
-AC_ARG_ENABLE([vfc_ci],
-[  --enable-vfc_ci    Build the library with vfc_ci support],
-[case "${enableval}" in
-  yes) vfc_ci=true &&  FCFLAGS="-D VFC_CI $FCFLAGS" && CFLAGS="-D VFC_CI $CFLAGS";;
-  no)  vfc_ci=false ;;
-  *) AC_MSG_ERROR([bad value ${enableval} for --enable_vfc_ci]) ;;
-esac],[vfc_ci=false])
-AM_CONDITIONAL([VFC_CI], [test x$vfc_ci = xtrue])
-
-if test "$FC" = "verificarlo-f"; then
-  AC_MSG_NOTICE(verificarlo-f detected)
-  # Arguments order is important here
-  FCFLAGS="-Mpreprocess $FCFLAGS"
-fi

 #PKG-CONFIG
 #mkl-dynamic-lp64-seq
@ -369,6 +461,9 @@ LDFLAGS:........: ${LDFLAGS}
 LIBS............: ${LIBS}
 USE CHAMELEON...: ${with_chameleon}
 HPC version.....: ${HAVE_HPC}
+OpenMP offload..: ${HAVE_OPENMP_OFFLOAD}
+OpenACC offload.: ${HAVE_OPENACC_OFFLOAD}
+cuBLAS offload..: ${HAVE_CUBLAS_OFFLOAD}

 Package features:
  ${ARGS}
--- a/org/qmckl_ao.org
+++ b/org/qmckl_ao.org
@ -2634,9 +2634,10 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) {
    }
  }

-  rc = QMCKL_SUCCESS;
 #ifdef HAVE_HPC
  rc = qmckl_finalize_basis_hpc(context);
+#else
+  rc = QMCKL_SUCCESS;
 #endif

  return rc;
--- a/org/qmckl_blas.org
+++ b/org/qmckl_blas.org
@ -84,8 +84,8 @@ are not intended to be passed to external codes.

   #+begin_src c :comments org :tangle (eval h_private_type) :exports none
 typedef struct qmckl_vector {
-  int64_t size;
  double* restrict data;
+  int64_t size;
 } qmckl_vector;
   #+end_src

@ -160,8 +160,8 @@ qmckl_vector_free( qmckl_context context,
  
   #+begin_src c :comments org :tangle (eval h_private_type) :exports none
 typedef struct qmckl_matrix {
-  int64_t size[2];
  double* restrict data;
+  int64_t size[2];
 } qmckl_matrix;
   #+end_src

@ -245,9 +245,9 @@ qmckl_matrix_free( qmckl_context context,
 #define QMCKL_TENSOR_ORDER_MAX 16

 typedef struct qmckl_tensor {
+  double* restrict data;
  int64_t order;
  int64_t size[QMCKL_TENSOR_ORDER_MAX];
-  double* restrict data;
 } qmckl_tensor;
   #+end_src

--- a/org/qmckl_jastrow.org
+++ b/org/qmckl_jastrow.org
--- a/org/qmckl_mo.org
+++ b/org/qmckl_mo.org
@ -655,6 +655,7 @@ integer function qmckl_compute_mo_basis_mo_vgl_doc_f(context, &
        end if
     end do
  end do
+  info = QMCKL_SUCCESS

 end function qmckl_compute_mo_basis_mo_vgl_doc_f
    #+end_src
@ -790,7 +791,6 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,
    double  av4[ao_num];
    double  av5[ao_num];
    for (int64_t k=0 ; k<ao_num ; ++k) {
-      const double* restrict ck1 = coef_normalized_t + k*mo_num;
      if (avgl1[k] != 0.) {
        idx[nidx] = k;
        av1[nidx] = avgl1[k];
@ -804,7 +804,6 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,

    int64_t n;
    for (n=0 ; n < nidx-4 ; n+=4) {
-      int64_t k = idx[n];
      const double* restrict ck1 = coef_normalized_t + idx[n  ]*mo_num;
      const double* restrict ck2 = coef_normalized_t + idx[n+1]*mo_num;
      const double* restrict ck3 = coef_normalized_t + idx[n+2]*mo_num;
@ -849,13 +848,13 @@ qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context,

    int64_t n0 = nidx-4;
    n0 = n0 < 0 ? 0 : n0;
-    for (int64_t n=n0 ; n < nidx ; n+=1) {
-      const double* restrict ck = coef_normalized_t + idx[n]*mo_num;
-      const double a1 = av1[n];
-      const double a2 = av2[n];
-      const double a3 = av3[n];
-      const double a4 = av4[n];
-      const double a5 = av5[n];
+    for (int64_t m=n0 ; m < nidx ; m+=1) {
+      const double* restrict ck = coef_normalized_t + idx[m]*mo_num;
+      const double a1 = av1[m];
+      const double a2 = av2[m];
+      const double a3 = av3[m];
+      const double a4 = av4[m];
+      const double a5 = av5[m];

 #ifdef HAVE_OPENMP
  #pragma omp simd
--- a/org/qmckl_sherman_morrison_woodbury.org
+++ b/org/qmckl_sherman_morrison_woodbury.org
@ -965,7 +965,7 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context,
      rc = qmckl_woodbury_3(context, LDS, Dim, Updates_3block, Updates_index_3block, breakdown, Slater_inv, determinant);
      if (rc != 0) { // Send the entire block to slagel_splitting
        uint64_t l = 0;
-        rc = qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block, Updates_index_3block,
+        (void) qmckl_slagel_splitting(LDS, Dim, 3, Updates_3block, Updates_index_3block,
                breakdown, Slater_inv, later_updates + (Dim * later), later_index + later, &l, determinant);
        later = later + l;
      }