1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2024-11-19 20:42:50 +01:00

Removed GPU from Jastrow

This commit is contained in:
Anthony Scemama 2023-03-01 14:47:32 +01:00
parent 9a779f2a94
commit ea21ec2ef7

View File

@ -20,16 +20,16 @@
\[
J_{\text{eN}}(\mathbf{r},\mathbf{R}) =
\sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}}
\frac{a_{1,\alpha}\, g_\alpha(R_{i\alpha})}{1+a_{2,\alpha}\, g_\alpha(R_{i\alpha})} +
\sum_{p=2}^{N_\text{ord}^a} a_{p+1,\alpha}\, [g_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
\frac{a_{1\,\alpha}\, f_\alpha(R_{i\,\alpha})}{1+a_{2\,\alpha}\, f_\alpha(R_{i\alpha})} +
\sum_{p=2}^{N_\text{ord}^a} a_{p+1\,\alpha}\, [f_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
\]
$J_{\text{ee}}$ contains electron-electron terms:
\[
J_{\text{ee}}(\mathbf{r}) =
\sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1}
\frac{b_1\, f(r_{ij})}{1+b_2\, f(r_{ij})} +
\sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f(r_{ij})]^p - J_{ee}^\infty
\frac{\frac{1}{2}(1+\delta^{\uparrow\downarrow}_{ij}) b_1\, f_{\text{ee}}(r_{ij})}{1+b_2\, f_{\text{ee}}(r_{ij})} +
\sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f_{\text{ee}}(r_{ij})]^p - J_{ee}^\infty
\]
and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms:
@ -42,7 +42,7 @@
\sum_{p=2}^{N_{\text{ord}}}
\sum_{k=0}^{p-1}
\sum_{l=0}^{p-k-2\delta_{k,0}}
c_{lkp\alpha} \left[ f({r}_{ij}) \right]^k
c_{lkp\alpha} \left[ g_\text{ee}({r}_{ij}) \right]^k
\left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right]
\left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2}
\]
@ -52,7 +52,7 @@
$f$ and $g$ are scaling function defined as
\[
f(r) = \frac{1-e^{-\kappa\, r}}{\kappa} \text{ and }
f_\alpha(r) = \frac{1-e^{-\kappa_\alpha\, r}}{\kappa_\alpha} \text{ and }
g_\alpha(r) = e^{-\kappa_\alpha\, r}.
\]
@ -118,11 +118,6 @@ int main() {
#include "qmckl_jastrow_private_func.h"
#include "qmckl_jastrow_private_type.h"
#ifdef HAVE_CUBLAS_OFFLOAD
#include "cublas_v2.h"
#endif
#+end_src
* Context
@ -404,13 +399,9 @@ typedef struct qmckl_jastrow_struct{
bool provided;
char * type;
#ifdef HAVE_HPC
bool gpu_offload;
#endif
} qmckl_jastrow_struct;
#+end_src
The ~uninitialized~ integer contains one bit set to one for each
initialization function which has not been called. It becomes equal
to zero after all initialization functions have been called. The
@ -458,9 +449,9 @@ qmckl_exit_code qmckl_set_jastrow_bord_num (qmckl_context context, con
qmckl_exit_code qmckl_set_jastrow_cord_num (qmckl_context context, const int64_t cord_num);
qmckl_exit_code qmckl_set_jastrow_type_nucl_num (qmckl_context context, const int64_t type_nucl_num);
qmckl_exit_code qmckl_set_jastrow_type_nucl_vector (qmckl_context context, const int64_t* type_nucl_vector, const int64_t nucl_num);
qmckl_exit_code qmckl_set_jastrow_a_vector (qmckl_context context, const double * a_vector, const int64_t size_max);
qmckl_exit_code qmckl_set_jastrow_b_vector (qmckl_context context, const double * b_vector, const int64_t size_max);
qmckl_exit_code qmckl_set_jastrow_c_vector (qmckl_context context, const double * c_vector, const int64_t size_max);
qmckl_exit_code qmckl_set_jastrow_a_vector (qmckl_context context, const double * a_vector, const int64_t size_max);
qmckl_exit_code qmckl_set_jastrow_b_vector (qmckl_context context, const double * b_vector, const int64_t size_max);
qmckl_exit_code qmckl_set_jastrow_c_vector (qmckl_context context, const double * c_vector, const int64_t size_max);
#+end_src
#+NAME:pre2
@ -492,7 +483,6 @@ if (ctx->jastrow.provided) {
return QMCKL_SUCCESS;
#+end_src
#+begin_src c :comments org :tangle (eval c) :noweb yes :exports none
qmckl_exit_code
qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num)
@ -910,7 +900,7 @@ qmckl_set_jastrow_rescale_factor_en(qmckl_context context,
When the required information is completely entered, other data structures are
computed to accelerate the calculations. The intermediates factors
are precontracted using BLAS LEVEL 3 operations for an optimal flop count.
are precontracted using BLAS LEVEL 3 operations.
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context);
@ -952,11 +942,6 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) {
NULL);
}
/* Decide if the Jastrow if offloaded on GPU or not */
#if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD))
ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100;
#endif
qmckl_exit_code rc;
rc = qmckl_provide_jastrow_asymp_jasa(context);
@ -2016,16 +2001,16 @@ assert(fabs(asymp_jasb[1]-0.31567342786262853) < 1.e-12);
#+end_src
** Electron-electron component \(f_{ee}\)
** Electron-electron component \(f_\text{ee}\)
Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~
componenet and the electron-electron rescaled distances ~ee_distance_rescaled~.
component and the electron-electron rescaled distances ~ee_distance_rescaled~.
\[
f_{ee} = \sum_{i,j<i} \left[ \frac{ \eta B_0 C_{ij}}{1 - B_1 C_{ij}} + \sum^{nord}_{k}B_k C_{ij}^k \right] - J_{\text{ee}}^{\infty}
f_\text{ee} = \sum_{i,j<i} \left[ \frac{ \delta B_0\, C_{ij}}{1 - B_1\, C_{ij}} + \sum^{n_\text{ord}}_{k}B_k\, C_{ij}^k \right] - J_{\text{ee}}^{\infty}
\]
$\eta$ is the spin factor, $B$ is the vector of $b$ parameters,
$\delta$ is the spin factor, $B$ is the vector of $b$ parameters,
$C$ is the array of scaled distances.
@ -7294,47 +7279,7 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
ctx->jastrow.tmp_c = tmp_c;
}
/* Choose the correct compute function (depending on offload type) */
#ifdef HAVE_HPC
const bool gpu_offload = ctx->jastrow.gpu_offload;
#else
const bool gpu_offload = false;
#endif
if (gpu_offload) {
#ifdef HAVE_CUBLAS_OFFLOAD
rc = qmckl_compute_tmp_c_cublas_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#elif HAVE_OPENACC_OFFLOAD
rc = qmckl_compute_tmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#elif HAVE_OPENMP_OFFLOAD
rc = qmckl_compute_tmp_c_omp_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#else
rc = QMCKL_FAILURE;
#endif
} else {
rc = qmckl_compute_tmp_c(context,
rc = qmckl_compute_tmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
@ -7342,8 +7287,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
}
ctx->jastrow.tmp_c_date = ctx->date;
}
@ -7394,54 +7337,15 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
}
#ifdef HAVE_HPC
const bool gpu_offload = ctx->jastrow.gpu_offload;
#else
const bool gpu_offload = false;
#endif
if (gpu_offload) {
#ifdef HAVE_CUBLAS_OFFLOAD
rc = qmckl_compute_dtmp_c_cublas_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#elif HAVE_OPENACC_OFFLOAD
rc = qmckl_compute_dtmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#elif HAVE_OPENMP_OFFLOAD
rc = qmckl_compute_dtmp_c_omp_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#else
rc = QMCKL_FAILURE;
#endif
} else {
rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
}
rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walker.num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
if (rc != QMCKL_SUCCESS) {
return rc;
}
@ -8139,293 +8043,6 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
double* const tmp_c );
#+end_src
**** OpenACC offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c )
{
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute array access strides:
// For tmp_c...
const int64_t stride_k_c = elec_num;
const int64_t stride_j_c = stride_k_c * nucl_num;
const int64_t stride_i_c = stride_j_c * (cord_num+1);
const int64_t stride_nw_c = stride_i_c * cord_num;
// For een_rescaled_e...
const int64_t stride_m_e = elec_num;
const int64_t stride_i_e = stride_m_e * elec_num;
const int64_t stride_nw_e = stride_i_e * (cord_num+1);
// For een_rescaled_n...
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
#pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma acc loop independent gang worker vector collapse(5)
for (int nw=0; nw < walk_num; ++nw) {
for (int i=0; i<cord_num; ++i){
// Replacement for single DGEMM
for (int j=0; j<cord_num+1; j++) {
for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c );
#endif
#+end_src
**** OpenMP offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c )
{
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute array access strides:
// For tmp_c...
const int64_t stride_k_c = elec_num;
const int64_t stride_j_c = stride_k_c * nucl_num;
const int64_t stride_i_c = stride_j_c * (cord_num+1);
const int64_t stride_nw_c = stride_i_c * cord_num;
// For een_rescaled_e...
const int64_t stride_m_e = elec_num;
const int64_t stride_i_e = stride_m_e * elec_num;
const int64_t stride_nw_e = stride_i_e * (cord_num+1);
// For een_rescaled_n...
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
// WARNING This implementation seems unomptimized
#pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma omp teams distribute parallel for collapse(5)
for (int nw=0; nw < walk_num; ++nw) {
for (int i=0; i<cord_num; ++i){
// Replacement for single DGEMM
for (int j=0; j<cord_num+1; j++) {
for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c );
#endif
#+end_src
**** cuBLAS offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c )
{
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
//cuBLAS initialization
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
{
fprintf(stdout, "CUBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
const double alpha = 1.0;
const double beta = 0.0;
const int64_t M = elec_num;
const int64_t N = nucl_num*(cord_num + 1);
const int64_t K = elec_num;
const int64_t LDA = elec_num;
const int64_t LDB = elec_num;
const int64_t LDC = elec_num;
const int64_t af = elec_num*elec_num;
const int64_t bf = elec_num*nucl_num*(cord_num+1);
const int64_t cf = bf;
#pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
#pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
{
for (int nw=0; nw < walk_num; ++nw) {
// /!\ cublasError needs to be checked and return QMCKL_FAILURE if it fails
int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
&(een_rescaled_e[nw*(cord_num+1)]),
LDA, af,
&(een_rescaled_n[bf*nw]),
LDB, 0,
&beta,
&(tmp_c[nw*cord_num]),
LDC, cf, cord_num);
}
}
#pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
cublasDestroy(handle);
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_cublas_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e,
const double* een_rescaled_n,
double* const tmp_c );
#endif
#+end_src
*** Compute dtmp_c
:PROPERTIES:
:Name: qmckl_compute_dtmp_c
@ -8667,304 +8284,6 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
double* const dtmp_c );
#+end_src
**** OpenACC offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_dtmp_c_acc_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute strides...
// For dtmp_c
const int64_t stride_l_d = elec_num;
const int64_t stride_k_d = stride_l_d * 4;
const int64_t stride_j_d = stride_k_d * nucl_num;
const int64_t stride_i_d = stride_j_d * (cord_num+1);
const int64_t stride_nw_d = stride_i_d * cord_num;
// For een_rescaled_e_deriv_e
const int64_t stride_l_e = elec_num;
const int64_t stride_n_e = stride_l_e * 4;
const int64_t stride_i_e = stride_n_e * elec_num;
const int64_t stride_nw_e = stride_i_e * cord_num;
// For een_rescaled_n
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
#pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma acc loop independent gang worker vector collapse(6)
for (int nw=0; nw < walk_num; nw++) {
for (int i=0; i < cord_num; i++) {
// Single DGEMM
for(int j=0; j<cord_num+1; j++) {
for(int k=0; k<nucl_num; k++) {
for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) {
// Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
for(int n=0; n<elec_num; n++){
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c );
#endif
#+end_src
**** OpenMP offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
// Compute strides...
// For dtmp_c
const int64_t stride_l_d = elec_num;
const int64_t stride_k_d = stride_l_d * 4;
const int64_t stride_j_d = stride_k_d * nucl_num;
const int64_t stride_i_d = stride_j_d * (cord_num+1);
const int64_t stride_nw_d = stride_i_d * cord_num;
// For een_rescaled_e_deriv_e
const int64_t stride_l_e = elec_num;
const int64_t stride_n_e = stride_l_e * 4;
const int64_t stride_i_e = stride_n_e * elec_num;
const int64_t stride_nw_e = stride_i_e * cord_num;
// For een_rescaled_n
const int64_t stride_k_n = elec_num;
const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
// WARNING This implementation seems unomptimized
#pragma omp target map(from:dtmp_c[0:size_dtmp_c]) map(to:een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
{
#pragma omp teams distribute parallel for collapse(6)
for (int nw=0; nw < walk_num; nw++) {
for (int i=0; i < cord_num; i++) {
// Single DGEMM
for(int j=0; j<cord_num+1; j++) {
for(int k=0; k<nucl_num; k++) {
for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) {
// Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
for(int n=0; n<elec_num; n++){
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
}
}
}
}
}
}
return QMCKL_SUCCESS;
}
#endif
#+end_src
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c );
#endif
#+end_src
**** cuBLAS offload :noexport:
#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_dtmp_c_cublas_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (cord_num < 0) {
return QMCKL_INVALID_ARG_2;
}
if (elec_num <= 0) {
return QMCKL_INVALID_ARG_3;
}
if (nucl_num <= 0) {
return QMCKL_INVALID_ARG_4;
}
if (walk_num <= 0) {
return QMCKL_INVALID_ARG_5;
}
qmckl_exit_code info = QMCKL_SUCCESS;
//cuBLAS initialization
cublasHandle_t handle;
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
{
fprintf(stdout, "CUBLAS initialization failed!\n");
exit(EXIT_FAILURE);
}
const double alpha = 1.0;
const double beta = 0.0;
const int64_t M = 4*elec_num;
const int64_t N = nucl_num*(cord_num + 1);
const int64_t K = elec_num;
const int64_t LDA = 4*elec_num;
const int64_t LDB = elec_num;
const int64_t LDC = 4*elec_num;
const int64_t af = elec_num*elec_num*4;
const int64_t bf = elec_num*nucl_num*(cord_num+1);
const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
#pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num])
#pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
{
for (int64_t nw=0; nw < walk_num; ++nw) {
int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
&(een_rescaled_e_deriv_e[(nw*(cord_num+1))]),
LDA, af,
&(een_rescaled_n[bf*nw]), LDB, 0,
&beta,
&(dtmp_c[(nw*cord_num)]),
LDC, cf, cord_num);
}
}
#pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num])
cublasDestroy(handle);
return info;
}
#endif
#+end_src
#+RESULTS:
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
const qmckl_context context,
const int64_t cord_num,
const int64_t elec_num,
const int64_t nucl_num,
const int64_t walk_num,
const double* een_rescaled_e_deriv_e,
const double* een_rescaled_n,
double* const dtmp_c );
#endif
#+end_src
*** Test
#+name: helper_funcs