Mirror of https://github.com/TREX-CoE/qmckl.git (synced 2024-11-04 05:03:59 +01:00)

Removed GPU from Jastrow

This commit is contained in:
parent 9a779f2a94
commit ea21ec2ef7
@@ -20,16 +20,16 @@
\[
J_{\text{eN}}(\mathbf{r},\mathbf{R}) =
\sum_{\alpha=1}^{N_\text{nucl}} \sum_{i=1}^{N_\text{elec}}
\frac{a_{1,\alpha}\, g_\alpha(R_{i\alpha})}{1+a_{2,\alpha}\, g_\alpha(R_{i\alpha})} +
\sum_{p=2}^{N_\text{ord}^a} a_{p+1,\alpha}\, [g_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
\frac{a_{1\,\alpha}\, f_\alpha(R_{i\,\alpha})}{1+a_{2\,\alpha}\, f_\alpha(R_{i\alpha})} +
\sum_{p=2}^{N_\text{ord}^a} a_{p+1\,\alpha}\, [f_\alpha(R_{i\alpha})]^p - J_{eN}^{\infty \alpha}
\]

$J_{\text{ee}}$ contains electron-electron terms:
\[
J_{\text{ee}}(\mathbf{r}) =
\sum_{i=1}^{N_\text{elec}} \sum_{j=1}^{i-1}
\frac{b_1\, f(r_{ij})}{1+b_2\, f(r_{ij})} +
\sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f(r_{ij})]^p - J_{ee}^\infty
\frac{\frac{1}{2}(1+\delta^{\uparrow\downarrow}_{ij}) b_1\, f_{\text{ee}}(r_{ij})}{1+b_2\, f_{\text{ee}}(r_{ij})} +
\sum_{p=2}^{N_\text{ord}^b} a_{p+1}\, [f_{\text{ee}}(r_{ij})]^p - J_{ee}^\infty
\]

and $J_{\text{eeN}}$ contains electron-electron-Nucleus terms:

@@ -42,7 +42,7 @@
\sum_{p=2}^{N_{\text{ord}}}
\sum_{k=0}^{p-1}
\sum_{l=0}^{p-k-2\delta_{k,0}}
c_{lkp\alpha} \left[ f({r}_{ij}) \right]^k
c_{lkp\alpha} \left[ g_\text{ee}({r}_{ij}) \right]^k
\left[ \left[ g_\alpha({R}_{i\alpha}) \right]^l + \left[ g_\alpha({R}_{j\alpha}) \right]^l \right]
\left[ g_\alpha({R}_{i\,\alpha}) \, g_\alpha({R}_{j\alpha}) \right]^{(p-k-l)/2}
\]

@@ -52,7 +52,7 @@
$f$ and $g$ are scaling functions defined as

\[
f(r) = \frac{1-e^{-\kappa\, r}}{\kappa} \text{ and }
f_\alpha(r) = \frac{1-e^{-\kappa_\alpha\, r}}{\kappa_\alpha} \text{ and }
g_\alpha(r) = e^{-\kappa_\alpha\, r}.
\]
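
For illustration, a minimal C sketch of these two scaling functions is given below; the helper names are hypothetical and not part of the QMCkl API.

#+begin_src c
#include <math.h>

/* f_alpha(r) = (1 - exp(-kappa_alpha * r)) / kappa_alpha */
static double scale_f(const double kappa, const double r) {
  return (1.0 - exp(-kappa * r)) / kappa;
}

/* g_alpha(r) = exp(-kappa_alpha * r) */
static double scale_g(const double kappa, const double r) {
  return exp(-kappa * r);
}
#+end_src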

@@ -118,11 +118,6 @@ int main() {
#include "qmckl_jastrow_private_func.h"
#include "qmckl_jastrow_private_type.h"

#ifdef HAVE_CUBLAS_OFFLOAD
#include "cublas_v2.h"
#endif

#+end_src

* Context

@@ -404,13 +399,9 @@ typedef struct qmckl_jastrow_struct{
  bool provided;
  char * type;

#ifdef HAVE_HPC
  bool gpu_offload;
#endif
} qmckl_jastrow_struct;
#+end_src

The ~uninitialized~ integer contains one bit set to one for each
initialization function which has not been called. It becomes equal
to zero after all initialization functions have been called. The
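
(The context above is cut off by the hunk boundary.) For illustration, here is a minimal sketch of the bit-mask bookkeeping this paragraph describes; the mask names and helper functions are hypothetical, not the actual QMCkl implementation.

#+begin_src c
#include <stdint.h>
#include <stdbool.h>

/* One hypothetical bit per initialization function. */
#define JASTROW_MASK_AORD_NUM  ((uint32_t) 1 << 0)
#define JASTROW_MASK_BORD_NUM  ((uint32_t) 1 << 1)

/* Each setter clears its own bit once it has been called. */
static void mark_initialized(uint32_t* uninitialized, const uint32_t mask) {
  *uninitialized &= ~mask;
}

/* The structure is fully provided when no bit remains set. */
static bool all_initialized(const uint32_t uninitialized) {
  return uninitialized == 0;
}
#+end_src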

@@ -492,7 +483,6 @@ if (ctx->jastrow.provided) {
return QMCKL_SUCCESS;
#+end_src

#+begin_src c :comments org :tangle (eval c) :noweb yes :exports none
qmckl_exit_code
qmckl_set_jastrow_aord_num(qmckl_context context, const int64_t aord_num)

@@ -910,7 +900,7 @@ qmckl_set_jastrow_rescale_factor_en(qmckl_context context,

When the required information is completely entered, other data structures are
computed to accelerate the calculations. The intermediate factors
are precontracted using BLAS LEVEL 3 operations for an optimal flop count.
are precontracted using BLAS LEVEL 3 operations.
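
The precontracted intermediate mentioned here corresponds, in effect, to the batched matrix products spelled out by the reference loops further down in this diff. As a sketch (indices follow those loops):

\[
\text{tmp\_c}[l,k,j,i,nw] \;=\; \sum_{m=1}^{N_\text{elec}}
\text{een\_rescaled\_e}[l,m,i,nw] \; \text{een\_rescaled\_n}[m,k,j,nw],
\]

i.e. one DGEMM per $(i, nw)$ over the contracted electron index $m$, which is what the offload variants shown below implement.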

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context);

@@ -952,11 +942,6 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) {
                       NULL);
}

/* Decide if the Jastrow if offloaded on GPU or not */
#if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD))
ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100;
#endif

qmckl_exit_code rc;

rc = qmckl_provide_jastrow_asymp_jasa(context);

@@ -2016,16 +2001,16 @@ assert(fabs(asymp_jasb[1]-0.31567342786262853) < 1.e-12);

#+end_src

** Electron-electron component \(f_{ee}\)
** Electron-electron component \(f_\text{ee}\)

Calculate the electron-electron jastrow component ~factor_ee~ using the ~asymp_jasb~
componenet and the electron-electron rescaled distances ~ee_distance_rescaled~.
component and the electron-electron rescaled distances ~ee_distance_rescaled~.

\[
f_{ee} = \sum_{i,j<i} \left[ \frac{ \eta B_0 C_{ij}}{1 - B_1 C_{ij}} + \sum^{nord}_{k}B_k C_{ij}^k \right] - J_{\text{ee}}^{\infty}
f_\text{ee} = \sum_{i,j<i} \left[ \frac{ \delta B_0\, C_{ij}}{1 - B_1\, C_{ij}} + \sum^{n_\text{ord}}_{k}B_k\, C_{ij}^k \right] - J_{\text{ee}}^{\infty}
\]

$\eta$ is the spin factor, $B$ is the vector of $b$ parameters,
$\delta$ is the spin factor, $B$ is the vector of $b$ parameters,
$C$ is the array of scaled distances.
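
As a rough illustration of this formula, here is a self-contained C sketch for a single walker. It is not the QMCkl implementation: the helper name, the per-pair spin-factor and asymptotic-value arrays, and the lower bound $k = 2$ (taken from the $J_{\text{ee}}$ definition earlier in this file) are all assumptions.

#+begin_src c
#include <stdint.h>

/* Hypothetical helper: evaluate f_ee for one walker.
   C     : elec_num x elec_num rescaled distances C_ij
   B     : b parameters B[0] ... B[nord]
   delta : per-pair spin factor (assumed precomputed)
   asymp : per-pair asymptotic value J_ee^infinity (assumed subtracted per pair) */
static double factor_ee_sketch(const int64_t elec_num, const int64_t nord,
                               const double* C, const double* B,
                               const double* delta, const double* asymp)
{
  double f_ee = 0.0;
  for (int64_t i = 0; i < elec_num; ++i) {
    for (int64_t j = 0; j < i; ++j) {
      const double c = C[j + i*elec_num];
      double term = delta[j + i*elec_num] * B[0] * c / (1.0 - B[1] * c);
      double ck = c;
      for (int64_t k = 2; k <= nord; ++k) {
        ck *= c;                      /* ck = c^k */
        term += B[k] * ck;
      }
      f_ee += term - asymp[j + i*elec_num];
    }
  }
  return f_ee;
}
#+end_src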

@@ -7294,46 +7279,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
    ctx->jastrow.tmp_c = tmp_c;
  }

  /* Choose the correct compute function (depending on offload type) */
#ifdef HAVE_HPC
  const bool gpu_offload = ctx->jastrow.gpu_offload;
#else
  const bool gpu_offload = false;
#endif

  if (gpu_offload) {
#ifdef HAVE_CUBLAS_OFFLOAD
    rc = qmckl_compute_tmp_c_cublas_offload(context,
                                            ctx->jastrow.cord_num,
                                            ctx->electron.num,
                                            ctx->nucleus.num,
                                            ctx->electron.walker.num,
                                            ctx->jastrow.een_rescaled_e,
                                            ctx->jastrow.een_rescaled_n,
                                            ctx->jastrow.tmp_c);
#elif HAVE_OPENACC_OFFLOAD
    rc = qmckl_compute_tmp_c_acc_offload(context,
                                         ctx->jastrow.cord_num,
                                         ctx->electron.num,
                                         ctx->nucleus.num,
                                         ctx->electron.walker.num,
                                         ctx->jastrow.een_rescaled_e,
                                         ctx->jastrow.een_rescaled_n,
                                         ctx->jastrow.tmp_c);
#elif HAVE_OPENMP_OFFLOAD
    rc = qmckl_compute_tmp_c_omp_offload(context,
                                         ctx->jastrow.cord_num,
                                         ctx->electron.num,
                                         ctx->nucleus.num,
                                         ctx->electron.walker.num,
                                         ctx->jastrow.een_rescaled_e,
                                         ctx->jastrow.een_rescaled_n,
                                         ctx->jastrow.tmp_c);
#else
    rc = QMCKL_FAILURE;
#endif
  } else {
    rc = qmckl_compute_tmp_c(context,
                             ctx->jastrow.cord_num,
                             ctx->electron.num,
@@ -7342,8 +7287,6 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
                             ctx->jastrow.een_rescaled_e,
                             ctx->jastrow.een_rescaled_n,
                             ctx->jastrow.tmp_c);
  }

  ctx->jastrow.tmp_c_date = ctx->date;
}

@@ -7394,44 +7337,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
  }

#ifdef HAVE_HPC
  const bool gpu_offload = ctx->jastrow.gpu_offload;
#else
  const bool gpu_offload = false;
#endif

  if (gpu_offload) {
#ifdef HAVE_CUBLAS_OFFLOAD
    rc = qmckl_compute_dtmp_c_cublas_offload(context,
                                             ctx->jastrow.cord_num,
                                             ctx->electron.num,
                                             ctx->nucleus.num,
                                             ctx->electron.walker.num,
                                             ctx->jastrow.een_rescaled_e_deriv_e,
                                             ctx->jastrow.een_rescaled_n,
                                             ctx->jastrow.dtmp_c);
#elif HAVE_OPENACC_OFFLOAD
    rc = qmckl_compute_dtmp_c_acc_offload(context,
                                          ctx->jastrow.cord_num,
                                          ctx->electron.num,
                                          ctx->nucleus.num,
                                          ctx->electron.walker.num,
                                          ctx->jastrow.een_rescaled_e_deriv_e,
                                          ctx->jastrow.een_rescaled_n,
                                          ctx->jastrow.dtmp_c);
#elif HAVE_OPENMP_OFFLOAD
    rc = qmckl_compute_dtmp_c_omp_offload(context,
                                          ctx->jastrow.cord_num,
                                          ctx->electron.num,
                                          ctx->nucleus.num,
                                          ctx->electron.walker.num,
                                          ctx->jastrow.een_rescaled_e_deriv_e,
                                          ctx->jastrow.een_rescaled_n,
                                          ctx->jastrow.dtmp_c);
#else
    rc = QMCKL_FAILURE;
#endif
  } else {
    rc = qmckl_compute_dtmp_c(context,
                              ctx->jastrow.cord_num,
                              ctx->electron.num,
@@ -7440,7 +7345,6 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
                              ctx->jastrow.een_rescaled_e_deriv_e,
                              ctx->jastrow.een_rescaled_n,
                              ctx->jastrow.dtmp_c);
  }

  if (rc != QMCKL_SUCCESS) {
    return rc;

@@ -8139,293 +8043,6 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
                                         double* const tmp_c );
#+end_src

**** OpenACC offload :noexport:

#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c )
{

  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }

  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }

  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }

  // Compute array access strides:
  // For tmp_c...
  const int64_t stride_k_c = elec_num;
  const int64_t stride_j_c = stride_k_c * nucl_num;
  const int64_t stride_i_c = stride_j_c * (cord_num+1);
  const int64_t stride_nw_c = stride_i_c * cord_num;
  // For een_rescaled_e...
  const int64_t stride_m_e = elec_num;
  const int64_t stride_i_e = stride_m_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
  // For een_rescaled_n...
  const int64_t stride_k_n = elec_num;
  const int64_t stride_j_n = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);

  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;

  #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
  {
    #pragma acc loop independent gang worker vector collapse(5)
    for (int nw=0; nw < walk_num; ++nw) {
      for (int i=0; i<cord_num; ++i){

        // Replacement for single DGEMM
        for (int j=0; j<cord_num+1; j++) {
          for (int k=0; k<nucl_num; k++) {
            for (int l=0; l<elec_num; l++) {

              // Single reduction
              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
              for (int m=0; m<elec_num; m++) {
                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
              }
            }
          }
        }
      }
    }
  }

  return QMCKL_SUCCESS;
}
#endif
#+end_src

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c );
#endif
#+end_src

**** OpenMP offload :noexport:

#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c )
{

  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }

  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }

  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }

  // Compute array access strides:
  // For tmp_c...
  const int64_t stride_k_c = elec_num;
  const int64_t stride_j_c = stride_k_c * nucl_num;
  const int64_t stride_i_c = stride_j_c * (cord_num+1);
  const int64_t stride_nw_c = stride_i_c * cord_num;
  // For een_rescaled_e...
  const int64_t stride_m_e = elec_num;
  const int64_t stride_i_e = stride_m_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
  // For een_rescaled_n...
  const int64_t stride_k_n = elec_num;
  const int64_t stride_j_n = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);

  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;

  // WARNING This implementation seems unomptimized
  #pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
  {
    #pragma omp teams distribute parallel for collapse(5)
    for (int nw=0; nw < walk_num; ++nw) {
      for (int i=0; i<cord_num; ++i){

        // Replacement for single DGEMM
        for (int j=0; j<cord_num+1; j++) {
          for (int k=0; k<nucl_num; k++) {
            for (int l=0; l<elec_num; l++) {

              // Single reduction
              tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
              for (int m=0; m<elec_num; m++) {
                tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
                  tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
                  een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
                  een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
              }
            }
          }
        }
      }
    }
  }

  return QMCKL_SUCCESS;
}
#endif
#+end_src

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
                                 const int64_t cord_num,
                                 const int64_t elec_num,
                                 const int64_t nucl_num,
                                 const int64_t walk_num,
                                 const double* een_rescaled_e,
                                 const double* een_rescaled_n,
                                 double* const tmp_c );
#endif
#+end_src

**** cuBLAS offload :noexport:

#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
                                    const int64_t cord_num,
                                    const int64_t elec_num,
                                    const int64_t nucl_num,
                                    const int64_t walk_num,
                                    const double* een_rescaled_e,
                                    const double* een_rescaled_n,
                                    double* const tmp_c )
{

  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }

  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }

  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }

  //cuBLAS initialization
  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stdout, "CUBLAS initialization failed!\n");
    exit(EXIT_FAILURE);
  }

  const double alpha = 1.0;
  const double beta = 0.0;

  const int64_t M = elec_num;
  const int64_t N = nucl_num*(cord_num + 1);
  const int64_t K = elec_num;

  const int64_t LDA = elec_num;
  const int64_t LDB = elec_num;
  const int64_t LDC = elec_num;

  const int64_t af = elec_num*elec_num;
  const int64_t bf = elec_num*nucl_num*(cord_num+1);
  const int64_t cf = bf;

  #pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
  #pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
  {
    for (int nw=0; nw < walk_num; ++nw) {

      // /!\ cublasError needs to be checked and return QMCKL_FAILURE if it fails
      int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                                                  &(een_rescaled_e[nw*(cord_num+1)]),
                                                  LDA, af,
                                                  &(een_rescaled_n[bf*nw]),
                                                  LDB, 0,
                                                  &beta,
                                                  &(tmp_c[nw*cord_num]),
                                                  LDC, cf, cord_num);
    }
  }
  #pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])

  cublasDestroy(handle);
  return QMCKL_SUCCESS;
}
#endif

#+end_src
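
The comment inside the loop above notes that the cuBLAS return status should be checked and ~QMCKL_FAILURE~ returned on error. A minimal sketch of such a check, hypothetical and not part of the original code, could look like this:

#+begin_src c
#include <cublas_v2.h>
#include "qmckl.h"

/* Hypothetical helper: translate a cuBLAS status into a QMCkl exit code. */
static qmckl_exit_code check_cublas(const cublasStatus_t status) {
  return (status == CUBLAS_STATUS_SUCCESS) ? QMCKL_SUCCESS : QMCKL_FAILURE;
}
#+end_src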

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_tmp_c_cublas_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e,
  const double* een_rescaled_n,
  double* const tmp_c );
#endif
#+end_src

*** Compute dtmp_c
    :PROPERTIES:
    :Name: qmckl_compute_dtmp_c

@@ -8667,304 +8284,6 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
  double* const dtmp_c );
#+end_src

**** OpenACC offload :noexport:

#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code
qmckl_compute_dtmp_c_acc_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e_deriv_e,
  const double* een_rescaled_n,
  double* const dtmp_c ) {

  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }

  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }

  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }

  // Compute strides...
  // For dtmp_c
  const int64_t stride_l_d = elec_num;
  const int64_t stride_k_d = stride_l_d * 4;
  const int64_t stride_j_d = stride_k_d * nucl_num;
  const int64_t stride_i_d = stride_j_d * (cord_num+1);
  const int64_t stride_nw_d = stride_i_d * cord_num;
  // For een_rescaled_e_deriv_e
  const int64_t stride_l_e = elec_num;
  const int64_t stride_n_e = stride_l_e * 4;
  const int64_t stride_i_e = stride_n_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * cord_num;
  // For een_rescaled_n
  const int64_t stride_k_n = elec_num;
  const int64_t stride_j_n = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);

  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;

  #pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
  {
    #pragma acc loop independent gang worker vector collapse(6)
    for (int nw=0; nw < walk_num; nw++) {
      for (int i=0; i < cord_num; i++) {

        // Single DGEMM
        for(int j=0; j<cord_num+1; j++) {
          for(int k=0; k<nucl_num; k++) {
            for(int l=0; l<4; l++) {
              for(int m=0; m<elec_num; m++) {

                // Single reduction
                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
                for(int n=0; n<elec_num; n++){
                  dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
                    dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
                    een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
                    een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
                }
              }
            }
          }
        }
      }
    }
  }

  return QMCKL_SUCCESS;
}
#endif
#+end_src

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENACC_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e_deriv_e,
  const double* een_rescaled_n,
  double* const dtmp_c );
#endif
#+end_src

**** OpenMP offload :noexport:

#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e_deriv_e,
  const double* een_rescaled_n,
  double* const dtmp_c ) {

  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }

  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }

  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }

  // Compute strides...
  // For dtmp_c
  const int64_t stride_l_d = elec_num;
  const int64_t stride_k_d = stride_l_d * 4;
  const int64_t stride_j_d = stride_k_d * nucl_num;
  const int64_t stride_i_d = stride_j_d * (cord_num+1);
  const int64_t stride_nw_d = stride_i_d * cord_num;
  // For een_rescaled_e_deriv_e
  const int64_t stride_l_e = elec_num;
  const int64_t stride_n_e = stride_l_e * 4;
  const int64_t stride_i_e = stride_n_e * elec_num;
  const int64_t stride_nw_e = stride_i_e * cord_num;
  // For een_rescaled_n
  const int64_t stride_k_n = elec_num;
  const int64_t stride_j_n = stride_k_n * nucl_num;
  const int64_t stride_nw_n = stride_j_n * (cord_num+1);

  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;

  // WARNING This implementation seems unomptimized
  #pragma omp target map(from:dtmp_c[0:size_dtmp_c]) map(to:een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
  {

    #pragma omp teams distribute parallel for collapse(6)
    for (int nw=0; nw < walk_num; nw++) {
      for (int i=0; i < cord_num; i++) {

        // Single DGEMM
        for(int j=0; j<cord_num+1; j++) {
          for(int k=0; k<nucl_num; k++) {
            for(int l=0; l<4; l++) {
              for(int m=0; m<elec_num; m++) {

                // Single reduction
                dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
                for(int n=0; n<elec_num; n++){
                  dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
                    dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
                    een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
                    een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
                }
              }
            }
          }
        }
      }
    }
  }

  return QMCKL_SUCCESS;
}
#endif
#+end_src

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_OPENMP_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e_deriv_e,
  const double* een_rescaled_n,
  double* const dtmp_c );
#endif
#+end_src

**** cuBLAS offload :noexport:

#+begin_src c :comments org :tangle (eval c) :noweb yes
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code
qmckl_compute_dtmp_c_cublas_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e_deriv_e,
  const double* een_rescaled_n,
  double* const dtmp_c ) {

  if (context == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (cord_num < 0) {
    return QMCKL_INVALID_ARG_2;
  }

  if (elec_num <= 0) {
    return QMCKL_INVALID_ARG_3;
  }

  if (nucl_num <= 0) {
    return QMCKL_INVALID_ARG_4;
  }

  if (walk_num <= 0) {
    return QMCKL_INVALID_ARG_5;
  }

  qmckl_exit_code info = QMCKL_SUCCESS;

  //cuBLAS initialization
  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stdout, "CUBLAS initialization failed!\n");
    exit(EXIT_FAILURE);
  }

  const double alpha = 1.0;
  const double beta = 0.0;

  const int64_t M = 4*elec_num;
  const int64_t N = nucl_num*(cord_num + 1);
  const int64_t K = elec_num;

  const int64_t LDA = 4*elec_num;
  const int64_t LDB = elec_num;
  const int64_t LDC = 4*elec_num;

  const int64_t af = elec_num*elec_num*4;
  const int64_t bf = elec_num*nucl_num*(cord_num+1);
  const int64_t cf = elec_num*4*nucl_num*(cord_num+1);

  #pragma omp target enter data map(to:een_rescaled_e_deriv_e[0:elec_num*4*elec_num*(cord_num+1)*walk_num], een_rescaled_n[0:elec_num*nucl_num*(cord_num+1)*walk_num], dtmp_c[0:elec_num*4*nucl_num*(cord_num+1)*cord_num*walk_num])
  #pragma omp target data use_device_ptr(een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
  {
    for (int64_t nw=0; nw < walk_num; ++nw) {
      int cublasError = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                                                  &(een_rescaled_e_deriv_e[(nw*(cord_num+1))]),
                                                  LDA, af,
                                                  &(een_rescaled_n[bf*nw]), LDB, 0,
                                                  &beta,
                                                  &(dtmp_c[(nw*cord_num)]),
                                                  LDC, cf, cord_num);

    }
  }

  #pragma omp target exit data map(from:dtmp_c[0:cf*cord_num*walk_num])

  cublasDestroy(handle);
  return info;
}
#endif
#+end_src

#+RESULTS:

#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
#ifdef HAVE_CUBLAS_OFFLOAD
qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
  const qmckl_context context,
  const int64_t cord_num,
  const int64_t elec_num,
  const int64_t nucl_num,
  const int64_t walk_num,
  const double* een_rescaled_e_deriv_e,
  const double* een_rescaled_n,
  double* const dtmp_c );
#endif
#+end_src

*** Test

#+name: helper_funcs