
Merge branch 'gpu' into jastrow_hpc

Gianfranco Abrusci committed 2022-04-07 18:43:11 +02:00
commit 4ee83a48d0
2 changed files with 59 additions and 49 deletions


@@ -310,20 +310,21 @@ AS_IF([test "$HAVE_CUBLAS_OFFLOAD" = "yes"], [
   case $CC in
     *gcc*)
-      CFLAGS="$CFLAGS -fopenacc"
+      CFLAGS="$CFLAGS -fopenmp"
+      LDFLAGS="-lcublas"
       ;;
     *nvc*)
-      CFLAGS="$CFLAGS -acc=gpu"
+      CFLAGS="$CFLAGS -mp=gpu -cudalib=cublas"
       ;;
   esac
   case $FC in
     *gfortran*)
-      FCFLAGS="$FCFLAGS -fopenacc"
+      FCFLAGS="$FCFLAGS -fopenmp"
       ;;
     *nvfortran*)
-      FCFLAGS="$FCFLAGS -acc=gpu"
+      FCFLAGS="$FCFLAGS -mp=gpu -cudalib=cublas"
       ;;
   esac
 ])
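
The hunk above swaps the OpenACC flags (-fopenacc, -acc=gpu) for OpenMP offload flags (-fopenmp, -mp=gpu) and links cuBLAS. Below is a minimal sketch of the kind of kernel these flags compile for the GPU; the names are assumptions, not qmckl code, and gcc additionally needs to have been built with offload support for the target region to actually run on the device.

#+begin_src c
#include <stdio.h>

int main(void) {
  double x[1000];
  /* Offloaded when built with "gcc -fopenmp" or "nvc -mp=gpu" as
     configured above; falls back to the host otherwise. */
#pragma omp target teams distribute parallel for map(from: x[0:1000])
  for (int i = 0; i < 1000; ++i)
    x[i] = 2.0 * i;            /* trivially parallel work */
  printf("x[999] = %f\n", x[999]);
  return 0;
}
#+end_src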


@@ -6024,14 +6024,15 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
-                                                 const int64_t cord_num,
-                                                 const int64_t elec_num,
-                                                 const int64_t nucl_num,
-                                                 const int64_t walk_num,
-                                                 const double* een_rescaled_e,
-                                                 const double* een_rescaled_n,
-                                                 double* const tmp_c )
+qmckl_exit_code
+qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
+                                 const int64_t cord_num,
+                                 const int64_t elec_num,
+                                 const int64_t nucl_num,
+                                 const int64_t walk_num,
+                                 const double* een_rescaled_e,
+                                 const double* een_rescaled_n,
+                                 double* const tmp_c )
 {
   if (context == QMCKL_NULL_CONTEXT) {
@@ -6071,22 +6072,22 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
 #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
   {
-#pragma acc loop independent gang worker vector collapse(5)
-    for (int nw=0; nw < walk_num; ++nw) {
-      for (int i=0; i<cord_num; ++i){
+#pragma acc loop independent gang worker vector collapse(5)
+    for (int nw=0; nw < walk_num; ++nw) {
+      for (int i=0; i<cord_num; ++i){
-        // Replacement for single DGEMM
-        for (int j=0; j<cord_num+1; j++) {
-          for (int k=0; k<nucl_num; k++) {
-            for (int l=0; l<elec_num; l++) {
+        // Replacement for single DGEMM
+        for (int j=0; j<cord_num+1; j++) {
+          for (int k=0; k<nucl_num; k++) {
+            for (int l=0; l<elec_num; l++) {
-              // Single reduction
-              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
-              for (int m=0; m<elec_num; m++) {
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
-                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
-                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
-                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
+              // Single reduction
+              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
+              for (int m=0; m<elec_num; m++) {
+                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
+                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
+                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
+                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
               }
             }
           }
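
For each (nw, i) pair, the five collapsed loops above compute the contraction tmp_c(l,k,j) = Σ_m een_rescaled_e(l,m) * een_rescaled_n(m,k,j), i.e. the "single DGEMM" the comment refers to. A standalone sketch of that per-slice product follows; the helper is hypothetical, with shapes assumed from the strides in the surrounding code (column-major, unit stride on the leading index).

#+begin_src c
#include <stddef.h>

/* Hypothetical helper, not the qmckl API: C = A * B for one (nw,i)
   slice. A is the elec_num x elec_num een_rescaled_e slice, B the
   elec_num x nucl_num*(cord_num+1) een_rescaled_n slice, and C the
   matching tmp_c slice. */
static void tmp_c_slice(size_t elec_num, size_t nucl_num, size_t cord_num,
                        const double *A, const double *B, double *C)
{
  const size_t ncols = nucl_num * (cord_num + 1);  /* runs over (k,j) */
  for (size_t col = 0; col < ncols; ++col) {
    for (size_t l = 0; l < elec_num; ++l) {
      double acc = 0.;
      for (size_t m = 0; m < elec_num; ++m)
        acc += A[l + m*elec_num] * B[m + col*elec_num];
      C[l + col*elec_num] = acc;
    }
  }
}
#+end_src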
@@ -6164,32 +6165,33 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
   const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
   const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
+  // WARNING This implementation seems unoptimized
 #pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
   {
 #pragma omp teams distribute parallel for collapse(5)
-    for (int nw=0; nw < walk_num; ++nw) {
-      for (int i=0; i<cord_num; ++i){
+    for (int nw=0; nw < walk_num; ++nw) {
+      for (int i=0; i<cord_num; ++i){
-        // Replacement for single DGEMM
-        for (int j=0; j<cord_num+1; j++) {
-          for (int k=0; k<nucl_num; k++) {
-            for (int l=0; l<elec_num; l++) {
+        // Replacement for single DGEMM
+        for (int j=0; j<cord_num+1; j++) {
+          for (int k=0; k<nucl_num; k++) {
+            for (int l=0; l<elec_num; l++) {
-              // Single reduction
-              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
-              for (int m=0; m<elec_num; m++) {
-                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
-                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
-                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
-                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
+              // Single reduction
+              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
+              for (int m=0; m<elec_num; m++) {
+                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
+                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
+                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
+                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
               }
             }
           }
         }
       }
     }
   }
 }
   return QMCKL_SUCCESS;
 }
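
The WARNING added above points at the accumulation pattern: tmp_c is read back and rewritten through device memory on every m iteration. A hedged sketch of the usual first fix, a scalar accumulator for the innermost reduction, follows; it is not part of the commit, and the helper and its stride arguments are assumptions meant to mirror the definitions in the surrounding code.

#+begin_src c
#include <stdint.h>

/* Hypothetical rewrite of the innermost reduction: accumulate in a
   register-resident scalar and store tmp_c once per (nw,i,j,k,l). */
static void tmp_c_reduce(double *tmp_c,
                         const double *een_rescaled_e,
                         const double *een_rescaled_n,
                         const int64_t elec_num,
                         const int64_t idx_c,    /* l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c */
                         const int64_t base_e,   /* l + i*stride_i_e + nw*stride_nw_e */
                         const int64_t stride_m_e,
                         const int64_t base_n)   /* k*stride_k_n + j*stride_j_n + nw*stride_nw_n */
{
  double acc = 0.;
  for (int64_t m = 0; m < elec_num; ++m)
    acc += een_rescaled_e[base_e + m*stride_m_e]
         * een_rescaled_n[base_n + m];           /* m has unit stride here */
  tmp_c[idx_c] = acc;
}
#+end_src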
@@ -6214,6 +6216,7 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_CUBLAS_OFFLOAD
+qmckl_exit_code
 qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
                                     const int64_t cord_num,
                                     const int64_t elec_num,
@@ -6268,16 +6271,19 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
   const int64_t bf = elec_num*nucl_num*(cord_num+1);
   const int64_t cf = bf;
   info = QMCKL_SUCCESS;
+#pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
+#pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
   {
 #pragma omp target teams distribute parallel for collapse(2)
     for (int nw=0; nw < walk_num; ++nw) {
       for (int i=0; i<cord_num; ++i){
         //CuBlas implementation
-        int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
+        cublasStatus_t cublasError =
+          cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                       &(een_rescaled_e[af*(i+nw*(cord_num+1))]), \
                       LDA, \
                       &(een_rescaled_n[bf*nw]), \
@@ -6286,6 +6292,7 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
                       &(tmp_c[cf*(i+nw*cord_num)]), \
                       LDC);
+        /*
         //Manage cublas ERROR
         if(cublasError != CUBLAS_STATUS_SUCCESS){
           printf("CUBLAS ERROR %d", cublasError);
@@ -6294,6 +6301,7 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
         }else{
           info = QMCKL_SUCCESS;
         }
+        */
       }
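
The pragmas above stage the arrays on the device and hand their device addresses to cuBLAS, while the status check is left commented out. A self-contained sketch of the same interop pattern for a square n x n product follows; the helper name and shapes are assumptions, not the qmckl code, and it includes the kind of status check the commented block performs.

#+begin_src c
#include <stdio.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

/* Hypothetical helper: map a, b, c to the device, then call
   cublasDgemm on their device addresses via use_device_ptr. */
int dgemm_on_device(cublasHandle_t handle, int n,
                    const double *a, const double *b, double *c)
{
  const double alpha = 1., beta = 0.;
  int ok = 0;
#pragma omp target enter data map(to: a[0:n*n], b[0:n*n]) map(alloc: c[0:n*n])
#pragma omp target data use_device_ptr(a, b, c)
  {
    cublasStatus_t st = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                    n, n, n, &alpha, a, n, b, n,
                                    &beta, c, n);
    if (st != CUBLAS_STATUS_SUCCESS) {  /* the check commented out above */
      fprintf(stderr, "cublasDgemm failed: status %d\n", (int) st);
      ok = -1;
    }
  }
  cudaDeviceSynchronize();  /* let the GEMM finish before copying c back */
#pragma omp target exit data map(from: c[0:n*n])
  return ok;
}
#+end_src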
@@ -6314,7 +6322,8 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
 #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
 #ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code qmckl_compute_tmp_c_cublas_offload (
+qmckl_exit_code
+qmckl_compute_tmp_c_cublas_offload (
           const qmckl_context context,
           const int64_t cord_num,
           const int64_t elec_num,
@@ -6539,8 +6548,6 @@ qmckl_compute_dtmp_c_hpc (const qmckl_context context,
   const int64_t bf = elec_num*nucl_num*(cord_num+1);
   const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
-  printf("COUCOU\n");
 #ifdef HAVE_OPENMP
 #pragma omp parallel for collapse(2)
 #endif
@@ -6573,7 +6580,8 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_OPENACC_OFFLOAD
-qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
+qmckl_exit_code
+qmckl_compute_dtmp_c_acc_offload (
           const qmckl_context context,
           const int64_t cord_num,
           const int64_t elec_num,
@@ -6633,7 +6641,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
             for(int m=0; m<elec_num; m++) {
               // Single reduction
-              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
+              dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
               for(int n=0; n<elec_num; n++){
                 dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
                   dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
@@ -6772,7 +6780,8 @@ qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
 #+begin_src c :comments org :tangle (eval c) :noweb yes
 #ifdef HAVE_CUBLAS_OFFLOAD
-qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
+qmckl_exit_code
+qmckl_compute_dtmp_c_cublas_offload (
           const qmckl_context context,
           const int64_t cord_num,
           const int64_t elec_num,