1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-05 11:00:36 +01:00

Removed GPU from Jastrow

This commit is contained in:
Anthony Scemama 2023-03-01 14:47:32 +01:00
parent 9a779f2a94
commit ea21ec2ef7

View File

@ -20,16 +20,16 @@
\[ \[
J_{\text{eN}}(\mathbf{r},\mathbf{R}) = J_{\text{eN}}(\mathbf{r},\mathbf{R}) =
\sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}} \sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}}
\frac{a_{1,\alpha}\, g_\alpha(R_{i\alpha})}{1+a_{2,\alpha}\, g_\alpha(R_{i\alpha})} + \frac{a_{1\,\alpha}\, f_\alpha(R_{i\,\alpha})}{1+a_{2\,\alpha}\, f_\alpha(R_{i\alpha})} +
\sum_{p=2}^{N_\text{ord}^a} a_{p+1,\alpha}\, [g_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha} \sum_{p=2}^{N_\text{ord}^a} a_{p+1\,\alpha}\, [f_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
\] \]
$J_{\text{ee}}$ contains electron-electron terms: $J_{\text{ee}}$ contains electron-electron terms:
\[ \[
J_{\text{ee}}(\mathbf{r}) = J_{\text{ee}}(\mathbf{r}) =
\sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1} \sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1}
\frac{b_1\, f(r_{ij})}{1+b_2\, f(r_{ij})} + \frac{\frac{1}{2}(1+\delta^{\uparrow\downarrow}_{ij}) b_1\, f_{\text{ee}}(r_{ij})}{1+b_2\, f_{\text{ee}}(r_{ij})} +
\sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f(r_{ij})]^p - J_{ee}^\infty \sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f_{\text{ee}}(r_{ij})]^p - J_{ee}^\infty
\] \]
and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms: and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms:
@ -42,7 +42,7 @@
\sum_{p=2}^{N_{\text{ord}}} \sum_{p=2}^{N_{\text{ord}}}
\sum_{k=0}^{p-1} \sum_{k=0}^{p-1}
\sum_{l=0}^{p-k-2\delta_{k,0}} \sum_{l=0}^{p-k-2\delta_{k,0}}
c_{lkp\alpha} \left[ f({r}_{ij}) \right]^k c_{lkp\alpha} \left[ g_\text{ee}({r}_{ij}) \right]^k
\left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right] \left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right]
\left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2} \left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2}
\] \]
@ -52,7 +52,7 @@
$f$ and $g$ are scaling function defined as $f$ and $g$ are scaling function defined as
\[ \[
f(r) = \frac{1-e^{-\kappa\, r}}{\kappa} \text{ and } f_\alpha(r) = \frac{1-e^{-\kappa_\alpha\, r}}{\kappa_\alpha} \text{ and }
g_\alpha(r) = e^{-\kappa_\alpha\, r}. g_\alpha(r) = e^{-\kappa_\alpha\, r}.
\] \]
@ -118,11 +118,6 @@ int main() {
#include "qmckl_jastrow_private_func.h" #include "qmckl_jastrow_private_func.h"
#include "qmckl_jastrow_private_type.h" #include "qmckl_jastrow_private_type.h"
#ifdef HAVE_CUBLAS_OFFLOAD
#include "cublas_v2.h"
#endif
#+end_src #+end_src
* Context * Context
@ -404,13 +399,9 @@ typedef struct qmckl_jastrow_struct{
bool provided; bool provided;
char * type; char * type;
#ifdef HAVE_HPC
bool gpu_offload;
#endif
} qmckl_jastrow_struct; } qmckl_jastrow_struct;
#+end_src #+end_src
The ~uninitialized~ integer contains one bit set to one for each The ~uninitialized~ integer contains one bit set to one for each
initialization function which has not been called. It becomes equal initialization function which has not been called. It becomes equal
to zero after all initialization functions have been called. The to zero after all initialization functions have been called. The
@ -492,7 +483,6 @@ if (ctx->jastrow.provided) {
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
#+end_src #+end_src
#+begin_src c :comments org :tangle (eval c) :noweb yes :exports none #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none
qmckl_exit_code qmckl_exit_code
qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num) qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num)
@ -910,7 +900,7 @@ qmckl_set_jastrow_rescale_factor_en(qmckl_context context,
When the required information is completely entered, other data structures are When the required information is completely entered, other data structures are
computed to accelerate the calculations. The intermediates factors computed to accelerate the calculations. The intermediates factors
are precontracted using BLAS LEVEL 3 operations for an optimal flop count. are precontracted using BLAS LEVEL 3 operations.
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none #+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context); qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context);
@ -952,11 +942,6 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) {
NULL); NULL);
} }
/* Decide if the Jastrow if offloaded on GPU or not */
#if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD))
ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100;
#endif
qmckl_exit_code rc; qmckl_exit_code rc;
rc = qmckl_provide_jastrow_asymp_jasa(context); rc = qmckl_provide_jastrow_asymp_jasa(context);
@ -2016,16 +2001,16 @@ assert(fabs(asymp_jasb[1]-0.31567342786262853) < 1.e-12);
#+end_src #+end_src
** Electron-electron component \(f_{ee}\) ** Electron-electron component \(f_\text{ee}\)
Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~ Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~
componenet and the electron-electron rescaled distances ~ee_distance_rescaled~. component and the electron-electron rescaled distances ~ee_distance_rescaled~.
\[ \[
f_{ee} = \sum_{i,j<i} \left[ \frac{ \eta B_0 C_{ij}}{1 - B_1 C_{ij}} + \sum^{nord}_{k}B_k C_{ij}^k \right] - J_{\text{ee}}^{\infty} f_\text{ee} = \sum_{i,j<i} \left[ \frac{ \delta B_0\, C_{ij}}{1 - B_1\, C_{ij}} + \sum^{n_\text{ord}}_{k}B_k\, C_{ij}^k \right] - J_{\text{ee}}^{\infty}
\] \]
$\eta$ is the spin factor, $B$ is the vector of $b$ parameters, $\delta$ is the spin factor, $B$ is the vector of $b$ parameters,
$C$ is the array of scaled distances. $C$ is the array of scaled distances.
@ -7294,46 +7279,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
ctx->jastrow.tmp_c = tmp_c; ctx->jastrow.tmp_c = tmp_c;
} }
/* Choose the correct compute function (depending on offload type) */
#ifdef HAVE_HPC
const bool gpu_offload = ctx->jastrow.gpu_offload;
#else
const bool gpu_offload = false;
#endif
if (gpu_offload) {
#ifdef HAVE_CUBLAS_OFFLOAD
rc = qmckl_compute_tmp_c_cublas_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#elif HAVE_OPENACC_OFFLOAD
rc = qmckl_compute_tmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#elif HAVE_OPENMP_OFFLOAD
rc = qmckl_compute_tmp_c_omp_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#else
rc = QMCKL_FAILURE;
#endif
} else {
rc = qmckl_compute_tmp_c(context, rc = qmckl_compute_tmp_c(context,
ctx->jastrow.cord_num, ctx->jastrow.cord_num,
ctx->electron.num, ctx->electron.num,
@ -7342,8 +7287,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
ctx->jastrow.een_rescaled_e, ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n, ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c); ctx->jastrow.tmp_c);
}
ctx->jastrow.tmp_c_date = ctx->date; ctx->jastrow.tmp_c_date = ctx->date;
} }
@ -7394,44 +7337,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
} }
#ifdef HAVE_HPC
const bool gpu_offload = ctx->jastrow.gpu_offload;
#else
const bool gpu_offload = false;
#endif
if (gpu_offload) {
#ifdef HAVE_CUBLAS_OFFLOAD
rc = qmckl_compute_dtmp_c_cublas_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#elif HAVE_OPENACC_OFFLOAD
rc = qmckl_compute_dtmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#elif HAVE_OPENMP_OFFLOAD
rc = qmckl_compute_dtmp_c_omp_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#else
rc = QMCKL_FAILURE;
#endif
} else {
rc = qmckl_compute_dtmp_c(context, rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num, ctx->jastrow.cord_num,
ctx->electron.num, ctx->electron.num,
@ -7440,7 +7345,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
ctx->jastrow.een_rescaled_e_deriv_e, ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n, ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c); ctx->jastrow.dtmp_c);
}
if (rc != QMCKL_SUCCESS) { if (rc != QMCKL_SUCCESS) {
return rc; return rc;
@ -8139,293 +8043,6 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
double* const tmp_c ); double* const tmp_c );
#+end_src #+end_src
**** OpenACC offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c )
{
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute array access strides:
// For tmp_c...
const int64_t stride_k_c = elec_num;
const int64_t stride_j_c = stride_k_c * nucl_num;
const int64_t stride_i_c = stride_j_c * (cord_num+1);
const int64_t stride_nw_c = stride_i_c * cord_num;
// For een_rescaled_e...
const int64_t stride_m_e = elec_num;
const int64_t stride_i_e = stride_m_e * elec_num;
const int64_t stride_nw_e = stride_i_e * (cord_num+1);
// For een_rescaled_n...
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
#pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma acc loop independent gang worker vector collapse(5)
for (int nw=0; nw < walk_num; ++nw) {
for (int i=0; i<cord_num; ++i){
// Replacement for single DGEMM
for (int j=0; j<cord_num+1; j++) {
for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c );
#endif
#+end_src
**** OpenMP offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c )
{
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute array access strides:
// For tmp_c...
const int64_t stride_k_c = elec_num;
const int64_t stride_j_c = stride_k_c * nucl_num;
const int64_t stride_i_c = stride_j_c * (cord_num+1);
const int64_t stride_nw_c = stride_i_c * cord_num;
// For een_rescaled_e...
const int64_t stride_m_e = elec_num;
const int64_t stride_i_e = stride_m_e * elec_num;
const int64_t stride_nw_e = stride_i_e * (cord_num+1);
// For een_rescaled_n...
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
// WARNING This implementation seems unomptimized
#pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma omp teams distribute parallel for collapse(5)
for (int nw=0; nw < walk_num; ++nw) {
for (int i=0; i<cord_num; ++i){
// Replacement for single DGEMM
for (int j=0; j<cord_num+1; j++) {
for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c );
#endif
#+end_src
**** cuBLAS offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c )
{
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
//cuBLAS initialization
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
{
fprintf(stdout, "CUBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
const double alpha = 1.0;
const double beta = 0.0;
const int64_t M = elec_num;
const int64_t N = nucl_num*(cord_num + 1);
const int64_t K = elec_num;
const int64_t LDA = elec_num;
const int64_t LDB = elec_num;
const int64_t LDC = elec_num;
const int64_t af = elec_num*elec_num;
const int64_t bf = elec_num*nucl_num*(cord_num+1);
const int64_t cf = bf;
#pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
#pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
{
for (int nw=0; nw < walk_num; ++nw) {
// /!\ cublasError needs to be checked and return QMCKL_FAILURE if it fails
int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
&(een_rescaled_e[nw*(cord_num+1)]),
LDA, af,
&(een_rescaled_n[bf*nw]),
LDB, 0,
&beta,
&(tmp_c[nw*cord_num]),
LDC, cf, cord_num);
}
}
#pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
cublasDestroy(handle);
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_cublas_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c );
#endif
#+end_src
*** Compute dtmp_c *** Compute dtmp_c
:PROPERTIES: :PROPERTIES:
:Name: qmckl_compute_dtmp_c :Name: qmckl_compute_dtmp_c
@ -8667,304 +8284,6 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
double* const dtmp_c ); double* const dtmp_c );
#+end_src #+end_src
**** OpenACC offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_dtmp_c_acc_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute strides...
// For dtmp_c
const int64_t stride_l_d = elec_num;
const int64_t stride_k_d = stride_l_d * 4;
const int64_t stride_j_d = stride_k_d * nucl_num;
const int64_t stride_i_d = stride_j_d * (cord_num+1);
const int64_t stride_nw_d = stride_i_d * cord_num;
// For een_rescaled_e_deriv_e
const int64_t stride_l_e = elec_num;
const int64_t stride_n_e = stride_l_e * 4;
const int64_t stride_i_e = stride_n_e * elec_num;
const int64_t stride_nw_e = stride_i_e * cord_num;
// For een_rescaled_n
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
#pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma acc loop independent gang worker vector collapse(6)
for (int nw=0; nw < walk_num; nw++) {
for (int i=0; i < cord_num; i++) {
// Single DGEMM
for(int j=0; j<cord_num+1; j++) {
for(int k=0; k<nucl_num; k++) {
for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) {
// Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
for(int n=0; n<elec_num; n++){
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c );
#endif
#+end_src
**** OpenMP offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute strides...
// For dtmp_c
const int64_t stride_l_d = elec_num;
const int64_t stride_k_d = stride_l_d * 4;
const int64_t stride_j_d = stride_k_d * nucl_num;
const int64_t stride_i_d = stride_j_d * (cord_num+1);
const int64_t stride_nw_d = stride_i_d * cord_num;
// For een_rescaled_e_deriv_e
const int64_t stride_l_e = elec_num;
const int64_t stride_n_e = stride_l_e * 4;
const int64_t stride_i_e = stride_n_e * elec_num;
const int64_t stride_nw_e = stride_i_e * cord_num;
// For een_rescaled_n
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
// WARNING This implementation seems unomptimized
#pragma omp target map(from:dtmp_c[0:size_dtmp_c]) map(to:een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma omp teams distribute parallel for collapse(6)
for (int nw=0; nw < walk_num; nw++) {
for (int i=0; i < cord_num; i++) {
// Single DGEMM
for(int j=0; j<cord_num+1; j++) {
for(int k=0; k<nucl_num; k++) {
for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) {
// Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
for(int n=0; n<elec_num; n++){
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c );
#endif
#+end_src
**** cuBLAS offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_dtmp_c_cublas_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
if (walk_num <= 0) {
return QMCKL_INVALID_ARG_5;
}
qmckl_exit_code info = QMCKL_SUCCESS;
//cuBLAS initialization
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
{
fprintf(stdout, "CUBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
const double alpha = 1.0;
const double beta = 0.0;
const int64_t M = 4*elec_num;
const int64_t N = nucl_num*(cord_num + 1);
const int64_t K = elec_num;
const int64_t LDA = 4*elec_num;
const int64_t LDB = elec_num;
const int64_t LDC = 4*elec_num;
const int64_t af = elec_num*elec_num*4;
const int64_t bf = elec_num*nucl_num*(cord_num+1);
const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
#pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num])
#pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
{
for (int64_t nw=0; nw < walk_num; ++nw) {
int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
&(een_rescaled_e_deriv_e[(nw*(cord_num+1))]),
LDA, af,
&(een_rescaled_n[bf*nw]), LDB, 0,
&beta,
&(dtmp_c[(nw*cord_num)]),
LDC, cf, cord_num);
}
}
#pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num])
cublasDestroy(handle);
return info;
}
#endif
#+end_src
#+RESULTS:
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c );
#endif
#+end_src
*** Test *** Test
#+name: helper_funcs #+name: helper_funcs