diff --git a/configure.ac b/configure.ac index 0ec94b8..17350c7 100644 --- a/configure.ac +++ b/configure.ac @@ -304,9 +304,10 @@ AS_IF([test "$enable_gpu" = "openacc"], [ ]) # cuBLAS offloading -AC_ARG_WITH(cublas, [AS_HELP_STRING([--with-cublas],[Use cuBLAS-offloaded functions])], HAVE_CUBLAS_OFFLOAD=$withval, HAVE_CUBLAS_OFFLOAD=no) -AS_IF([test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [ +HAVE_OPENACC_OFFLOAD="no" +AS_IF([test "$enable_gpu" = "cublas"], [ AC_DEFINE([HAVE_CUBLAS_OFFLOAD], [1], [If defined, activate cuBLAS-offloaded routines]) + HAVE_CUBLAS_OFFLOAD="yes" case $CC in *gcc*) diff --git a/org/qmckl_jastrow.org b/org/qmckl_jastrow.org index adf66d1..fb3cf3e 100644 --- a/org/qmckl_jastrow.org +++ b/org/qmckl_jastrow.org @@ -118,7 +118,6 @@ int main() { #include "qmckl_jastrow_private_type.h" #ifdef HAVE_CUBLAS_OFFLOAD -#include #include "cublas_v2.h" #endif @@ -6229,30 +6228,6 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context, { qmckl_exit_code info; - //Initialisation of cublas - - cublasHandle_t handle; - if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) - { - fprintf(stdout, "CUBLAS initialization failed!\n"); - exit(EXIT_FAILURE); - } - - - - qmckl_exit_code info; - - //Initialisation of cublas - - cublasHandle_t handle; - if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) - { - fprintf(stdout, "CUBLAS initialization failed!\n"); - exit(EXIT_FAILURE); - } - - - if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } @@ -6269,6 +6244,14 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context, return QMCKL_INVALID_ARG_4; } + //cuBLAS initialization + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) + { + fprintf(stdout, "CUBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + const double alpha = 1.0; const double beta = 0.0; @@ -6284,45 +6267,24 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context, const int64_t bf = elec_num*nucl_num*(cord_num+1); const int64_t cf = bf; - #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c) { for (int nw=0; nw < walk_num; ++nw) { - int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, - &(een_rescaled_e[nw*(cord_num+1)]), \ - LDA, af, \ - &(een_rescaled_n[bf*nw]), \ - LDB, 0, \ - &beta, \ - &(tmp_c[nw*cord_num]), \ + int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, + &(een_rescaled_e[nw*(cord_num+1)]), + LDA, af, + &(een_rescaled_n[bf*nw]), + LDB, 0, + &beta, + &(tmp_c[nw*cord_num]), LDC, cf, cord_num); - - - //Manage cublas ERROR - if(cublasError != CUBLAS_STATUS_SUCCESS){ - printf("CUBLAS ERROR %d", cublasError); - info = QMCKL_FAILURE; - - return info; - }else{ - info = QMCKL_SUCCESS; - } - - - - } } - cudaDeviceSynchronize(); - cublasDestroy(handle); - - #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num]) - - + cublasDestroy(handle); return info; } #endif @@ -6801,15 +6763,6 @@ qmckl_compute_dtmp_c_cublas_offload ( const double* een_rescaled_n, double* const dtmp_c ) { - cublasHandle_t handle; - if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) - { - fprintf(stdout, "CUBLAS initialization failed!\n"); - exit(EXIT_FAILURE); - } - - - if (context == QMCKL_NULL_CONTEXT) { return QMCKL_INVALID_CONTEXT; } @@ -6832,6 +6785,14 @@ qmckl_compute_dtmp_c_cublas_offload ( qmckl_exit_code info = QMCKL_SUCCESS; + //cuBLAS initialization + cublasHandle_t handle; + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) + { + fprintf(stdout, "CUBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + const double alpha = 1.0; const double beta = 0.0; @@ -6847,38 +6808,24 @@ qmckl_compute_dtmp_c_cublas_offload ( const int64_t bf = elec_num*nucl_num*(cord_num+1); const int64_t cf = elec_num*4*nucl_num*(cord_num+1); -#pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num]) -#pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c) + #pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num]) + #pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c) { - for (int64_t nw=0; nw < walk_num; ++nw) { - //Manage CUBLAS ERRORS - - int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, \ - &(een_rescaled_e_deriv_e[(nw*(cord_num+1))]), \ - LDA, af, \ - &(een_rescaled_n[bf*nw]), \ - LDB, 0, \ - &beta, \ - &(dtmp_c[(nw*cord_num)]), \ + for (int64_t nw=0; nw < walk_num; ++nw) { + int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, + &(een_rescaled_e_deriv_e[(nw*(cord_num+1))]), + LDA, af, + &(een_rescaled_n[bf*nw]), LDB, 0, + &beta, + &(dtmp_c[(nw*cord_num)]), LDC, cf, cord_num); - - if(cublasError != CUBLAS_STATUS_SUCCESS){ - printf("CUBLAS ERROR %d", cublasError); - info = QMCKL_FAILURE; - return info; - }else{ - info = QMCKL_SUCCESS; - } - - //} - } } - cudaDeviceSynchronize(); + } + + #pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num]) + cublasDestroy(handle); - -#pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num]) - return info; } #endif