mirror of
https://github.com/TREX-CoE/qmckl.git
synced 2025-01-03 10:06:09 +01:00
Merge branch 'gpu' of github.com:TREX-CoE/qmckl into gpu
This commit is contained in:
commit
a7fac59f04
@ -108,6 +108,7 @@ int main() {
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "qmckl.h"
|
||||
@ -116,6 +117,13 @@ int main() {
|
||||
#include "qmckl_memory_private_func.h"
|
||||
#include "qmckl_jastrow_private_func.h"
|
||||
#include "qmckl_jastrow_private_type.h"
|
||||
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "cublas_v2.h"
|
||||
#endif
|
||||
|
||||
|
||||
#+end_src
|
||||
|
||||
* Context
|
||||
@ -1117,7 +1125,7 @@ qmckl_exit_code qmckl_finalize_jastrow(qmckl_context context) {
|
||||
#if defined(HAVE_HPC) && (defined(HAVE_CUBLAS_OFFLOAD) || defined(HAVE_OPENACC_OFFLOAD) || defined(HAVE_OPENMP_OFFLOAD))
|
||||
ctx->jastrow.gpu_offload = true; // ctx->electron.num > 100;
|
||||
#endif
|
||||
|
||||
|
||||
qmckl_exit_code rc = QMCKL_SUCCESS;
|
||||
return rc;
|
||||
|
||||
@ -1511,7 +1519,7 @@ qmckl_exit_code qmckl_compute_asymp_jasb (
|
||||
const int64_t bord_num,
|
||||
const double* bord_vector,
|
||||
const double rescale_factor_kappa_ee,
|
||||
double* const asymp_jasb );
|
||||
double* const asymp_jasb );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -1802,21 +1810,21 @@ qmckl_exit_code qmckl_compute_factor_ee (
|
||||
int ipar; // can we use a smaller integer?
|
||||
double x, x1, spin_fact, power_ser;
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
}
|
||||
|
||||
if (walk_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_2;
|
||||
}
|
||||
|
||||
|
||||
if (elec_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_3;
|
||||
}
|
||||
}
|
||||
|
||||
if (bord_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_4;
|
||||
}
|
||||
}
|
||||
|
||||
for (int nw = 0; nw < walk_num; ++nw) {
|
||||
factor_ee[nw] = 0.0; // put init array here.
|
||||
@ -1827,9 +1835,9 @@ qmckl_exit_code qmckl_compute_factor_ee (
|
||||
x1 = x;
|
||||
power_ser = 0.0;
|
||||
spin_fact = 1.0;
|
||||
ipar = 0; // index of asymp_jasb
|
||||
ipar = 0; // index of asymp_jasb
|
||||
|
||||
for (int p = 1; p < bord_num; ++p) {
|
||||
for (int p = 1; p < bord_num; ++p) {
|
||||
x = x * x1;
|
||||
power_ser = power_ser + bord_vector[p + 1] * x;
|
||||
}
|
||||
@ -1838,7 +1846,7 @@ qmckl_exit_code qmckl_compute_factor_ee (
|
||||
spin_fact = 0.5;
|
||||
ipar = 1;
|
||||
}
|
||||
|
||||
|
||||
factor_ee[nw] = factor_ee[nw] + spin_fact * bord_vector[0] * \
|
||||
x1 / \
|
||||
(1.0 + bord_vector[1] * \
|
||||
@ -1854,7 +1862,7 @@ qmckl_exit_code qmckl_compute_factor_ee (
|
||||
#+end_src
|
||||
|
||||
# #+CALL: generate_c_header(table=qmckl_factor_ee_args,rettyp=get_value("CRetType"),fname=get_value("Name"))
|
||||
|
||||
|
||||
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
|
||||
qmckl_exit_code qmckl_compute_factor_ee (
|
||||
const qmckl_context context,
|
||||
@ -1865,7 +1873,7 @@ qmckl_exit_code qmckl_compute_factor_ee (
|
||||
const double* bord_vector,
|
||||
const double* ee_distance_rescaled,
|
||||
const double* asymp_jasb,
|
||||
double* const factor_ee );
|
||||
double* const factor_ee );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -2177,7 +2185,7 @@ end function qmckl_compute_factor_ee_deriv_e_f
|
||||
const double* ee_distance_rescaled,
|
||||
const double* ee_distance_rescaled_deriv_e,
|
||||
const double* asymp_jasb,
|
||||
double* const factor_ee_deriv_e );
|
||||
double* const factor_ee_deriv_e );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -2451,7 +2459,7 @@ qmckl_exit_code qmckl_provide_factor_en(qmckl_context context)
|
||||
if (rc != QMCKL_SUCCESS) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
ctx->jastrow.factor_en_date = ctx->date;
|
||||
}
|
||||
|
||||
@ -2550,7 +2558,7 @@ integer function qmckl_compute_factor_en_f( &
|
||||
|
||||
end function qmckl_compute_factor_en_f
|
||||
#+end_src
|
||||
|
||||
|
||||
|
||||
#+begin_src c :comments org :tangle (eval c) :noweb yes
|
||||
qmckl_exit_code qmckl_compute_factor_en (
|
||||
@ -2619,7 +2627,7 @@ qmckl_exit_code qmckl_compute_factor_en (
|
||||
x1 = x;
|
||||
power_ser = 0.0;
|
||||
|
||||
for (int p = 2; p < aord_num+1; ++p) {
|
||||
for (int p = 2; p < aord_num+1; ++p) {
|
||||
x = x * x1;
|
||||
power_ser = power_ser + aord_vector[(p+1)-1 + (type_nucl_vector[a]-1) * aord_num] * x;
|
||||
}
|
||||
@ -2650,7 +2658,7 @@ qmckl_exit_code qmckl_compute_factor_en (
|
||||
const int64_t aord_num,
|
||||
const double* aord_vector,
|
||||
const double* en_distance_rescaled,
|
||||
double* const factor_en );
|
||||
double* const factor_en );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -2944,7 +2952,7 @@ end function qmckl_compute_factor_en_deriv_e_f
|
||||
const double* aord_vector,
|
||||
const double* en_distance_rescaled,
|
||||
const double* en_distance_rescaled_deriv_e,
|
||||
double* const factor_en_deriv_e );
|
||||
double* const factor_en_deriv_e );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -3337,7 +3345,7 @@ end function qmckl_compute_een_rescaled_e_doc_f
|
||||
const int64_t cord_num,
|
||||
const double rescale_factor_kappa_ee,
|
||||
const double* ee_distance,
|
||||
double* const een_rescaled_e );
|
||||
double* const een_rescaled_e );
|
||||
#+end_src
|
||||
|
||||
#+CALL: generate_c_interface(table=qmckl_factor_een_rescaled_e_args,rettyp=get_value("CRetType"),fname="qmckl_compute_een_rescaled_e_doc")
|
||||
@ -3376,13 +3384,13 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
const double rescale_factor_kappa_ee,
|
||||
const double* ee_distance,
|
||||
double* const een_rescaled_e ) {
|
||||
|
||||
double *een_rescaled_e_ij;
|
||||
|
||||
double *een_rescaled_e_ij;
|
||||
double x;
|
||||
const int64_t elec_pairs = (elec_num * (elec_num - 1)) / 2;
|
||||
const int64_t len_een_ij = elec_pairs * (cord_num + 1);
|
||||
int64_t k;
|
||||
|
||||
int64_t k;
|
||||
|
||||
// number of element for the een_rescaled_e_ij[N_e*(N_e-1)/2][cord+1]
|
||||
// probably in C is better [cord+1, Ne*(Ne-1)/2]
|
||||
//elec_pairs = (elec_num * (elec_num - 1)) / 2;
|
||||
@ -3391,7 +3399,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
}
|
||||
|
||||
if (walk_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_2;
|
||||
@ -3406,8 +3414,8 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
}
|
||||
|
||||
// Prepare table of exponentiated distances raised to appropriate power
|
||||
// init
|
||||
|
||||
// init
|
||||
|
||||
for (int kk = 0; kk < walk_num*(cord_num+1)*elec_num*elec_num; ++kk) {
|
||||
een_rescaled_e[kk]= 0.0;
|
||||
}
|
||||
@ -3425,14 +3433,14 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
*/
|
||||
|
||||
for (int nw = 0; nw < walk_num; ++nw) {
|
||||
|
||||
|
||||
for (int kk = 0; kk < len_een_ij; ++kk) {
|
||||
// this array initialized at 0 except een_rescaled_e_ij(:, 1) = 1.0d0
|
||||
// and the arrangement of indices is [cord_num+1, ne*(ne-1)/2]
|
||||
een_rescaled_e_ij[kk]= ( kk < (elec_pairs) ? 1.0 : 0.0 );
|
||||
}
|
||||
|
||||
k = 0;
|
||||
k = 0;
|
||||
for (int i = 0; i < elec_num; ++i) {
|
||||
for (int j = 0; j < i; ++j) {
|
||||
// een_rescaled_e_ij(k, 2) = dexp(-rescale_factor_kappa_ee * ee_distance(i, j, nw));
|
||||
@ -3450,7 +3458,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
een_rescaled_e_ij[k + elec_pairs];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// prepare the actual een table
|
||||
for (int i = 0; i < elec_num; ++i){
|
||||
@ -3458,7 +3466,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
een_rescaled_e[j + i*elec_num + 0 + nw*(cord_num+1)*elec_num*elec_num] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Up to here it should work.
|
||||
for ( int l = 1; l < (cord_num+1); ++l) {
|
||||
k = 0;
|
||||
@ -3481,7 +3489,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
}
|
||||
|
||||
free(een_rescaled_e_ij);
|
||||
|
||||
|
||||
return QMCKL_SUCCESS;
|
||||
}
|
||||
#+end_src
|
||||
@ -3520,7 +3528,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_e_hpc (
|
||||
const double* ee_distance,
|
||||
double* const een_rescaled_e );
|
||||
#+end_src
|
||||
|
||||
|
||||
#+begin_src c :comments org :tangle (eval c) :noweb yes
|
||||
qmckl_exit_code qmckl_compute_een_rescaled_e (
|
||||
const qmckl_context context,
|
||||
@ -3848,7 +3856,7 @@ end function qmckl_compute_factor_een_rescaled_e_deriv_e_f
|
||||
const double* coord_new,
|
||||
const double* ee_distance,
|
||||
const double* een_rescaled_e,
|
||||
double* const een_rescaled_e_deriv_e );
|
||||
double* const een_rescaled_e_deriv_e );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -4207,7 +4215,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_n (
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
}
|
||||
|
||||
if (walk_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_2;
|
||||
@ -4268,7 +4276,7 @@ qmckl_exit_code qmckl_compute_een_rescaled_n (
|
||||
const int64_t cord_num,
|
||||
const double rescale_factor_kappa_en,
|
||||
const double* en_distance,
|
||||
double* const een_rescaled_n );
|
||||
double* const een_rescaled_n );
|
||||
#+end_src
|
||||
|
||||
*** Test
|
||||
@ -4577,7 +4585,7 @@ end function qmckl_compute_factor_een_rescaled_n_deriv_e_f
|
||||
const double* coord,
|
||||
const double* en_distance,
|
||||
const double* een_rescaled_n,
|
||||
double* const een_rescaled_n_deriv_e );
|
||||
double* const een_rescaled_n_deriv_e );
|
||||
#+end_src
|
||||
|
||||
#+CALL: generate_c_interface(table=qmckl_compute_factor_een_rescaled_n_deriv_e_args,rettyp=get_value("CRetType"),fname=get_value("Name"))
|
||||
@ -5019,14 +5027,15 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
|
||||
ctx->jastrow.tmp_c = tmp_c;
|
||||
}
|
||||
|
||||
|
||||
/* Choose the correct compute function (depending on offload type) */
|
||||
#ifdef HAVE_HPC
|
||||
const bool gpu_offload = ctx->jastrow.gpu_offload;
|
||||
#else
|
||||
const bool gpu_offload = false;
|
||||
#endif
|
||||
|
||||
if (gpu_offload) {
|
||||
|
||||
if (gpu_offload) {
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
rc = qmckl_compute_tmp_c_cublas_offload(context,
|
||||
ctx->jastrow.cord_num,
|
||||
@ -5067,7 +5076,8 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
|
||||
ctx->jastrow.een_rescaled_n,
|
||||
ctx->jastrow.tmp_c);
|
||||
}
|
||||
|
||||
|
||||
|
||||
ctx->jastrow.tmp_c_date = ctx->date;
|
||||
}
|
||||
|
||||
@ -5107,13 +5117,14 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
|
||||
ctx->jastrow.dtmp_c = dtmp_c;
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_HPC
|
||||
const bool gpu_offload = ctx->jastrow.gpu_offload;
|
||||
#else
|
||||
const bool gpu_offload = false;
|
||||
#endif
|
||||
|
||||
if (gpu_offload) {
|
||||
|
||||
if (gpu_offload) {
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
rc = qmckl_compute_dtmp_c_cublas_offload(context,
|
||||
ctx->jastrow.cord_num,
|
||||
@ -5159,6 +5170,7 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
ctx->jastrow.dtmp_c_date = ctx->date;
|
||||
}
|
||||
|
||||
@ -5228,10 +5240,10 @@ qmckl_exit_code qmckl_compute_dim_cord_vect (
|
||||
const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
int64_t* const dim_cord_vect){
|
||||
|
||||
|
||||
int lmax;
|
||||
|
||||
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
@ -5241,7 +5253,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect (
|
||||
}
|
||||
|
||||
*dim_cord_vect = 0;
|
||||
|
||||
|
||||
for (int p=2; p <= cord_num; ++p){
|
||||
for (int k=p-1; k >= 0; --k) {
|
||||
if (k != 0) {
|
||||
@ -5255,7 +5267,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect (
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return QMCKL_SUCCESS;
|
||||
}
|
||||
#+end_src
|
||||
@ -5266,7 +5278,7 @@ qmckl_exit_code qmckl_compute_dim_cord_vect (
|
||||
qmckl_exit_code qmckl_compute_dim_cord_vect (
|
||||
const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
int64_t* const dim_cord_vect );
|
||||
int64_t* const dim_cord_vect );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -5531,15 +5543,15 @@ qmckl_exit_code qmckl_compute_lkpm_combined_index (
|
||||
|
||||
int kk, lmax, m;
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
|
||||
if (cord_num <= 0) {
|
||||
if (cord_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_2;
|
||||
}
|
||||
|
||||
if (dim_cord_vect <= 0) {
|
||||
if (dim_cord_vect <= 0) {
|
||||
return QMCKL_INVALID_ARG_3;
|
||||
}
|
||||
|
||||
@ -5576,7 +5588,7 @@ qmckl_exit_code qmckl_compute_lkpm_combined_index (
|
||||
const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
const int64_t dim_cord_vect,
|
||||
int64_t* const lkpm_combined_index );
|
||||
int64_t* const lkpm_combined_index );
|
||||
#+end_src
|
||||
|
||||
|
||||
@ -5617,7 +5629,7 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context,
|
||||
#endif
|
||||
}
|
||||
#+end_src
|
||||
|
||||
|
||||
# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c")
|
||||
|
||||
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
|
||||
@ -5629,7 +5641,7 @@ qmckl_exit_code qmckl_compute_tmp_c (const qmckl_context context,
|
||||
const int64_t walk_num,
|
||||
const double* een_rescaled_e,
|
||||
const double* een_rescaled_n,
|
||||
double* const tmp_c );
|
||||
double* const tmp_c );
|
||||
#+end_src
|
||||
|
||||
#+begin_src f90 :comments org :tangle (eval f) :noweb yes
|
||||
@ -5709,11 +5721,11 @@ qmckl_exit_code qmckl_compute_tmp_c_doc (
|
||||
const int64_t walk_num,
|
||||
const double* een_rescaled_e,
|
||||
const double* een_rescaled_n,
|
||||
double* const tmp_c );
|
||||
double* const tmp_c );
|
||||
#+end_src
|
||||
|
||||
#+CALL: generate_c_interface(table=qmckl_factor_tmp_c_args,rettyp=get_value("FRetType"),fname="qmckl_compute_tmp_c_doc")
|
||||
|
||||
|
||||
#+RESULTS:
|
||||
#+begin_src f90 :tangle (eval f) :comments org :exports none
|
||||
integer(c_int32_t) function qmckl_compute_tmp_c_doc &
|
||||
@ -5758,19 +5770,19 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (
|
||||
|
||||
if (cord_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_2;
|
||||
}
|
||||
}
|
||||
|
||||
if (elec_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_3;
|
||||
}
|
||||
}
|
||||
|
||||
if (nucl_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_4;
|
||||
}
|
||||
}
|
||||
|
||||
if (walk_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_5;
|
||||
}
|
||||
}
|
||||
|
||||
qmckl_exit_code info = QMCKL_SUCCESS;
|
||||
|
||||
@ -5807,6 +5819,42 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (
|
||||
}
|
||||
#+end_src
|
||||
|
||||
|
||||
|
||||
#+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c")
|
||||
|
||||
#+RESULTS:
|
||||
#+begin_src c :tangle (eval h_func) :comments org
|
||||
qmckl_exit_code qmckl_compute_tmp_c (
|
||||
const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
const int64_t elec_num,
|
||||
const int64_t nucl_num,
|
||||
const int64_t walk_num,
|
||||
const double* een_rescaled_e,
|
||||
const double* een_rescaled_n,
|
||||
double* const tmp_c );
|
||||
#+end_src
|
||||
|
||||
# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c_doc")
|
||||
|
||||
#+RESULTS:
|
||||
#+begin_src c :tangle (eval h_private_func) :comments org
|
||||
qmckl_exit_code qmckl_compute_tmp_c_doc (
|
||||
const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
const int64_t elec_num,
|
||||
const int64_t nucl_num,
|
||||
const int64_t walk_num,
|
||||
const double* een_rescaled_e,
|
||||
const double* een_rescaled_n,
|
||||
double* const tmp_c );
|
||||
#+end_src
|
||||
|
||||
# #+CALL: generate_c_header(table=qmckl_factor_tmp_c_args,rettyp=get_value("CRetType"),fname="qmckl_compute_tmp_c_hpc")
|
||||
|
||||
#+RESULTS:
|
||||
|
||||
#+begin_src c :tangle (eval h_private_func) :comments org
|
||||
qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
@ -5815,7 +5863,7 @@ qmckl_exit_code qmckl_compute_tmp_c_hpc (const qmckl_context context,
|
||||
const int64_t walk_num,
|
||||
const double* een_rescaled_e,
|
||||
const double* een_rescaled_n,
|
||||
double* const tmp_c );
|
||||
double* const tmp_c );
|
||||
#+end_src
|
||||
|
||||
**** OpenACC offload :noexport:
|
||||
@ -5865,7 +5913,7 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
|
||||
|
||||
const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
|
||||
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
|
||||
#pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
|
||||
{
|
||||
@ -5877,7 +5925,7 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
|
||||
for (int j=0; j<cord_num+1; j++) {
|
||||
for (int k=0; k<nucl_num; k++) {
|
||||
for (int l=0; l<elec_num; l++) {
|
||||
|
||||
|
||||
// Single reduction
|
||||
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
|
||||
for (int m=0; m<elec_num; m++) {
|
||||
@ -5886,7 +5934,7 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
|
||||
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
|
||||
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5961,7 +6009,7 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
|
||||
|
||||
const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
|
||||
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
|
||||
|
||||
#pragma omp target teams distribute parallel for collapse(5) \
|
||||
@ -5975,7 +6023,7 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
|
||||
for (int j=0; j<cord_num+1; j++) {
|
||||
for (int k=0; k<nucl_num; k++) {
|
||||
for (int l=0; l<elec_num; l++) {
|
||||
|
||||
|
||||
// Single reduction
|
||||
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0.;
|
||||
for (int m=0; m<elec_num; m++) {
|
||||
@ -5984,7 +6032,7 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
|
||||
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
|
||||
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -6012,9 +6060,8 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
|
||||
|
||||
**** cuBLAS offload :noexport:
|
||||
|
||||
#+begin_src c :comments org :tangle (eval c) :noweb yes
|
||||
#+begin_src c :comments org :tangle (eval c) :noweb yes
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
qmckl_exit_code
|
||||
qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
|
||||
const int64_t cord_num,
|
||||
const int64_t elec_num,
|
||||
@ -6025,6 +6072,19 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
|
||||
double* const tmp_c )
|
||||
{
|
||||
|
||||
qmckl_exit_code info;
|
||||
|
||||
//Initialisation of cublas
|
||||
|
||||
cublasHandle_t handle;
|
||||
if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
|
||||
{
|
||||
fprintf(stdout, "CUBLAS initialization failed!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
@ -6041,14 +6101,6 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
|
||||
return QMCKL_INVALID_ARG_4;
|
||||
}
|
||||
|
||||
if (walk_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_5;
|
||||
}
|
||||
|
||||
qmckl_exit_code info = QMCKL_SUCCESS;
|
||||
|
||||
const char TransA = 'N';
|
||||
const char TransB = 'N';
|
||||
const double alpha = 1.0;
|
||||
const double beta = 0.0;
|
||||
|
||||
@ -6064,19 +6116,49 @@ qmckl_compute_tmp_c_cublas_offload (const qmckl_context context,
|
||||
const int64_t bf = elec_num*nucl_num*(cord_num+1);
|
||||
const int64_t cf = bf;
|
||||
|
||||
for (int64_t nw=0; nw < walk_num; ++nw) {
|
||||
for (int64_t i=0; i<cord_num; ++i){
|
||||
info = qmckl_dgemm(context, TransA, TransB, M, N, K, alpha,
|
||||
&(een_rescaled_e[af*(i+nw*(cord_num+1))]), LDA,
|
||||
&(een_rescaled_n[bf*nw]), LDB, beta,
|
||||
&(tmp_c[cf*(i+nw*cord_num)]), LDC);
|
||||
#pragma omp target enter data map(to:een_rescaled_e[0:elec_num*elec_num*(cord_num+1)*walk_num],een_rescaled_n[0:M*N*walk_num],tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
|
||||
#pragma omp target data use_device_ptr(een_rescaled_e,een_rescaled_n,tmp_c)
|
||||
{
|
||||
|
||||
|
||||
for (int nw=0; nw < walk_num; ++nw) {
|
||||
for (int i=0; i<cord_num; ++i){
|
||||
|
||||
//CuBlas implementation
|
||||
int cublasError = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
|
||||
&(een_rescaled_e[af*(i+nw*(cord_num+1))]), \
|
||||
LDA, \
|
||||
&(een_rescaled_n[bf*nw]), \
|
||||
LDB, \
|
||||
&beta, \
|
||||
&(tmp_c[cf*(i+nw*cord_num)]), \
|
||||
LDC);
|
||||
|
||||
//Manage cublas ERROR
|
||||
if(cublasError != CUBLAS_STATUS_SUCCESS){
|
||||
printf("CUBLAS ERROR %d", cublasError);
|
||||
info = QMCKL_FAILURE;
|
||||
return info;
|
||||
}else{
|
||||
info = QMCKL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
cublasDestroy(handle);
|
||||
|
||||
|
||||
#pragma omp target exit data map(from:tmp_c[0:elec_num*nucl_num*(cord_num+1)*cord_num*walk_num])
|
||||
|
||||
|
||||
return info;
|
||||
}
|
||||
#endif
|
||||
#+end_src
|
||||
#+end_src
|
||||
|
||||
|
||||
#+begin_src c :comments org :tangle (eval h_private_func) :noweb yes :exports none
|
||||
#ifdef HAVE_CUBLAS_OFFLOAD
|
||||
@ -6209,7 +6291,7 @@ integer function qmckl_compute_dtmp_c_doc_f( &
|
||||
dtmp_c(1,1,1,0,i,nw),LDC)
|
||||
end do
|
||||
end do
|
||||
|
||||
|
||||
end function qmckl_compute_dtmp_c_doc_f
|
||||
#+end_src
|
||||
|
||||
@ -6253,7 +6335,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_doc (
|
||||
#+end_src
|
||||
|
||||
**** CPU :noexport:
|
||||
|
||||
|
||||
#+begin_src c :comments org :tangle (eval c) :noweb yes
|
||||
qmckl_exit_code
|
||||
qmckl_compute_dtmp_c_hpc (const qmckl_context context,
|
||||
@ -6268,7 +6350,7 @@ qmckl_compute_dtmp_c_hpc (const qmckl_context context,
|
||||
|
||||
if (context == QMCKL_NULL_CONTEXT) {
|
||||
return QMCKL_INVALID_CONTEXT;
|
||||
}
|
||||
}
|
||||
|
||||
if (cord_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_2;
|
||||
@ -6280,11 +6362,11 @@ qmckl_compute_dtmp_c_hpc (const qmckl_context context,
|
||||
|
||||
if (nucl_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_4;
|
||||
}
|
||||
}
|
||||
|
||||
if (walk_num <= 0) {
|
||||
return QMCKL_INVALID_ARG_5;
|
||||
}
|
||||
}
|
||||
|
||||
qmckl_exit_code info = QMCKL_SUCCESS;
|
||||
|
||||
@ -6332,7 +6414,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
|
||||
const double* een_rescaled_n,
|
||||
double* const dtmp_c );
|
||||
#+end_src
|
||||
|
||||
|
||||
**** OpenACC offload :noexport:
|
||||
|
||||
#+begin_src c :comments org :tangle (eval c) :noweb yes
|
||||
@ -6382,7 +6464,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
|
||||
|
||||
|
||||
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
|
||||
|
||||
#pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
|
||||
@ -6396,7 +6478,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
|
||||
for(int k=0; k<nucl_num; k++) {
|
||||
for(int l=0; l<4; l++) {
|
||||
for(int m=0; m<elec_num; m++) {
|
||||
|
||||
|
||||
// Single reduction
|
||||
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
|
||||
for(int n=0; n<elec_num; n++){
|
||||
@ -6481,7 +6563,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
|
||||
|
||||
|
||||
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
|
||||
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
|
||||
|
||||
|
||||
@ -6497,7 +6579,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
|
||||
for(int k=0; k<nucl_num; k++) {
|
||||
for(int l=0; l<4; l++) {
|
||||
for(int m=0; m<elec_num; m++) {
|
||||
|
||||
|
||||
// Single reduction
|
||||
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0.;
|
||||
for(int n=0; n<elec_num; n++){
|
||||
@ -6701,7 +6783,7 @@ rc = qmckl_get_jastrow_dtmp_c(context, &(dtmp_c[0][0][0][0][0][0]));
|
||||
printf("%e\n%e\n", tmp_c[0][0][1][0][0], 2.7083473948352403);
|
||||
assert(fabs(tmp_c[0][0][1][0][0] - 2.7083473948352403) < 1e-12);
|
||||
|
||||
printf("%e\n%e\n", tmp_c[0][1][0][0][0],0.237440520852232);
|
||||
printf("%e\n%e\n", tmp_c[0][1][0][0][0],0.237440520852232);
|
||||
assert(fabs(dtmp_c[0][1][0][0][0][0] - 0.237440520852232) < 1e-12);
|
||||
return QMCKL_SUCCESS;
|
||||
#+end_src
|
||||
|
Loading…
Reference in New Issue
Block a user