mirror of
https://github.com/TREX-CoE/qmckl.git
synced 2025-01-08 20:33:40 +01:00
Implement computation of tmp_c and dtmp_c in OpenACC
These two kernels seem to give a good speedup compared to the CPU BLAS versions. However, the current GPU implementation of factor_een_deriv seems to be slightly slower (on the tested machine). TODO: (1) try to improve the factor_een_deriv GPU implementation; (2) try out a cuBLAS implementation of tmp_c and dtmp_c.
This commit is contained in:
parent
99306473a4
commit
9428eaa19e
@ -4809,7 +4809,27 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
|
|||||||
}
|
}
|
||||||
ctx->jastrow.tmp_c = tmp_c;
|
ctx->jastrow.tmp_c = tmp_c;
|
||||||
}
|
}
|
||||||
|
/* Choose the correct compute function (depending on offload type) */
|
||||||
|
bool default_compute = true;
|
||||||
|
|
||||||
|
#ifdef HAVE_OPENACC_OFFLOAD
|
||||||
|
if(ctx->jastrow.offload_type == OFFLOAD_OPENACC) {
|
||||||
|
qmckl_exit_code rc =
|
||||||
|
qmckl_compute_tmp_c_acc_offload(context,
|
||||||
|
ctx->jastrow.cord_num,
|
||||||
|
ctx->electron.num,
|
||||||
|
ctx->nucleus.num,
|
||||||
|
ctx->electron.walk_num,
|
||||||
|
ctx->jastrow.een_rescaled_e,
|
||||||
|
ctx->jastrow.een_rescaled_n,
|
||||||
|
ctx->jastrow.tmp_c);
|
||||||
|
if (rc != QMCKL_SUCCESS) {
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(default_compute) {
|
||||||
qmckl_exit_code rc =
|
qmckl_exit_code rc =
|
||||||
qmckl_compute_tmp_c(context,
|
qmckl_compute_tmp_c(context,
|
||||||
ctx->jastrow.cord_num,
|
ctx->jastrow.cord_num,
|
||||||
@ -4822,6 +4842,8 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
|
|||||||
if (rc != QMCKL_SUCCESS) {
|
if (rc != QMCKL_SUCCESS) {
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
ctx->jastrow.tmp_c_date = ctx->date;
|
ctx->jastrow.tmp_c_date = ctx->date;
|
||||||
}
|
}
|
||||||
@ -5332,6 +5354,134 @@ end function qmckl_compute_tmp_c_f
|
|||||||
|
|
||||||
end function qmckl_compute_tmp_c
|
end function qmckl_compute_tmp_c
|
||||||
#+end_src
|
#+end_src
|
||||||
|
*** Compute tmp_c (OpenACC offload)
|
||||||
|
:PROPERTIES:
|
||||||
|
:Name: qmckl_compute_tmp_c_acc_offload
|
||||||
|
:CRetType: qmckl_exit_code
|
||||||
|
:FRetType: qmckl_exit_code
|
||||||
|
:END:
|
||||||
|
|
||||||
|
#+NAME: qmckl_factor_tmp_c_acc_offload_args
|
||||||
|
| Variable | Type | In/Out | Description |
|
||||||
|
|------------------+------------------------------------------------------------------+--------+-----------------------------------|
|
||||||
|
| ~context~ | ~qmckl_context~ | in | Global state |
|
||||||
|
| ~cord_num~ | ~int64_t~ | in | Order of polynomials |
|
||||||
|
| ~elec_num~ | ~int64_t~ | in | Number of electrons |
|
||||||
|
| ~nucl_num~ | ~int64_t~ | in | Number of nuclei |
|
||||||
|
| ~walk_num~ | ~int64_t~ | in | Number of walkers |
|
||||||
|
| ~een_rescaled_e~ | ~double[walk_num][0:cord_num][elec_num][elec_num]~ | in | Electron-electron rescaled factor |
|
||||||
|
| ~een_rescaled_n~ | ~double[walk_num][0:cord_num][nucl_num][elec_num]~ | in | Electron-nucleus rescaled factor |
|
||||||
|
| ~tmp_c~ | ~double[walk_num][0:cord_num-1][0:cord_num][nucl_num][elec_num]~ | out | Product of ~een_rescaled_e~ and ~een_rescaled_n~, summed over the shared electron index |
|
||||||
|
|
||||||
|
#+begin_src f90 :comments org :tangle (eval f) :noweb yes
|
||||||
|
!> Compute the intermediate array tmp_c with an OpenACC parallel kernel.
!>
!> For each walker nw, each i in 0..cord_num-1 and each j in 0..cord_num:
!>
!>   tmp_c(k,jj,j,i,nw) = sum_{l=1}^{elec_num}
!>                        een_rescaled_e(k,l,i,nw) * een_rescaled_n(l,jj,j,nw)
!>
!> with k in 1..elec_num and jj in 1..nucl_num.
!>
!> Arguments:
!>   context         (in)  QMCkl context handle
!>   cord_num        (in)  upper bound of the i/j order indices
!>   elec_num        (in)  extent of the electron indices (k, l)
!>   nucl_num        (in)  extent of the nucleus index (jj)
!>   walk_num        (in)  extent of the walker index (nw)
!>   een_rescaled_e  (in)  (elec_num, elec_num, 0:cord_num, walk_num)
!>   een_rescaled_n  (in)  (elec_num, nucl_num, 0:cord_num, walk_num)
!>   tmp_c           (out) (elec_num, nucl_num, 0:cord_num, 0:cord_num-1, walk_num)
!>
!> Result (info):
!>   QMCKL_SUCCESS          on success
!>   QMCKL_INVALID_CONTEXT  if context is QMCKL_NULL_CONTEXT
!>   QMCKL_INVALID_ARG_2    if cord_num <= 0
!>   QMCKL_INVALID_ARG_3    if elec_num <= 0
!>   QMCKL_INVALID_ARG_4    if nucl_num <= 0
integer function qmckl_compute_tmp_c_acc_offload_f(context, cord_num, elec_num, nucl_num, &
     walk_num, een_rescaled_e, een_rescaled_n, tmp_c) &
     result(info)
  use qmckl
  implicit none
  integer(qmckl_context), intent(in)  :: context
  integer*8             , intent(in)  :: cord_num
  integer*8             , intent(in)  :: elec_num
  integer*8             , intent(in)  :: nucl_num
  integer*8             , intent(in)  :: walk_num
  double precision      , intent(in)  :: een_rescaled_e(elec_num, elec_num, 0:cord_num, walk_num)
  double precision      , intent(in)  :: een_rescaled_n(elec_num, nucl_num, 0:cord_num, walk_num)
  double precision      , intent(out) :: tmp_c(elec_num, nucl_num, 0:cord_num, 0:cord_num-1, walk_num)

  double precision :: tmp
  integer*8        :: i, j, jj, k, l, nw

  info = QMCKL_SUCCESS

  if (context == QMCKL_NULL_CONTEXT) then
     info = QMCKL_INVALID_CONTEXT
     return
  endif

  if (cord_num <= 0) then
     info = QMCKL_INVALID_ARG_2
     return
  endif

  if (elec_num <= 0) then
     info = QMCKL_INVALID_ARG_3
     return
  endif

  if (nucl_num <= 0) then
     info = QMCKL_INVALID_ARG_4
     return
  endif

  ! Explicit data clauses: the inputs are only read and tmp_c is fully
  ! overwritten below, so tmp_c does not need to be copied to the device.
  ! The accumulator tmp is made private explicitly instead of relying on
  ! the compiler's implicit handling of scalars in the collapsed loop nest.
  !$acc parallel copyin(een_rescaled_e, een_rescaled_n) copyout(tmp_c)
  !$acc loop independent gang worker vector collapse(5) private(tmp)
  do nw = 1, walk_num
     do i = 0, cord_num-1
        do j = 0, cord_num
           do jj = 1, nucl_num
              do k = 1, elec_num

                 ! Dot product over the shared electron index l,
                 ! executed sequentially by each device thread.
                 tmp = 0.0d0
                 !$acc loop seq
                 do l = 1, elec_num
                    tmp = tmp + &
                         een_rescaled_e(k, l, i, nw) * een_rescaled_n(l, jj, j, nw)
                 end do
                 tmp_c(k, jj, j, i, nw) = tmp

              end do
           end do
        end do
     end do
  end do
  !$acc end parallel

end function qmckl_compute_tmp_c_acc_offload_f
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+CALL: generate_c_header(table=qmckl_factor_tmp_c_acc_offload_args,rettyp=get_value("CRetType"),fname=get_value("Name"))
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
#+begin_src c :tangle (eval h_func) :comments org
|
||||||
|
qmckl_exit_code qmckl_compute_tmp_c_acc_offload (
|
||||||
|
const qmckl_context context,
|
||||||
|
const int64_t cord_num,
|
||||||
|
const int64_t elec_num,
|
||||||
|
const int64_t nucl_num,
|
||||||
|
const int64_t walk_num,
|
||||||
|
const double* een_rescaled_e,
|
||||||
|
const double* een_rescaled_n,
|
||||||
|
double* const tmp_c );
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
|
||||||
|
#+CALL: generate_c_interface(table=qmckl_factor_tmp_c_acc_offload_args,rettyp=get_value("CRetType"),fname=get_value("Name"))
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
#+begin_src f90 :tangle (eval f) :comments org :exports none
|
||||||
|
! C-callable entry point for the OpenACC tmp_c kernel.
! Forwards every argument unchanged to qmckl_compute_tmp_c_acc_offload_f
! and returns that function's exit code.
integer(c_int32_t) function qmckl_compute_tmp_c_acc_offload &
     (context, cord_num, elec_num, nucl_num, walk_num, &
      een_rescaled_e, een_rescaled_n, tmp_c) &
     bind(C) result(info)

  use, intrinsic :: iso_c_binding
  implicit none

  ! Scalar arguments are received by value from the C caller.
  integer (c_int64_t), intent(in), value :: context
  integer (c_int64_t), intent(in), value :: cord_num, elec_num, nucl_num, walk_num

  ! Array arguments are passed by reference with explicit shapes.
  real (c_double), intent(in)  :: een_rescaled_e(elec_num, elec_num, 0:cord_num, walk_num)
  real (c_double), intent(in)  :: een_rescaled_n(elec_num, nucl_num, 0:cord_num, walk_num)
  real (c_double), intent(out) :: tmp_c(elec_num, nucl_num, 0:cord_num, 0:cord_num-1, walk_num)

  integer(c_int32_t), external :: qmckl_compute_tmp_c_acc_offload_f

  info = qmckl_compute_tmp_c_acc_offload_f(context, cord_num, elec_num, nucl_num, walk_num, &
       een_rescaled_e, een_rescaled_n, tmp_c)

end function qmckl_compute_tmp_c_acc_offload
|
||||||
|
#+end_src
|
||||||
|
|
||||||
*** Compute dtmp_c
|
*** Compute dtmp_c
|
||||||
:PROPERTIES:
|
:PROPERTIES:
|
||||||
@ -5495,19 +5645,10 @@ integer function qmckl_compute_dtmp_c_acc_offload_f(context, cord_num, elec_num,
|
|||||||
double precision , intent(in) :: een_rescaled_e_deriv_e(elec_num, 4, elec_num, 0:cord_num, walk_num)
|
double precision , intent(in) :: een_rescaled_e_deriv_e(elec_num, 4, elec_num, 0:cord_num, walk_num)
|
||||||
double precision , intent(in) :: een_rescaled_n(elec_num, nucl_num, 0:cord_num, walk_num)
|
double precision , intent(in) :: een_rescaled_n(elec_num, nucl_num, 0:cord_num, walk_num)
|
||||||
double precision , intent(out) :: dtmp_c(elec_num, 4, nucl_num,0:cord_num, 0:cord_num-1, walk_num)
|
double precision , intent(out) :: dtmp_c(elec_num, 4, nucl_num,0:cord_num, 0:cord_num-1, walk_num)
|
||||||
double precision :: x, tmp
|
double precision :: tmp
|
||||||
integer*8 :: i, j, jj, k2, a, l, kk, p, lmax, nw, ii
|
integer*8 :: nw, i, j, jj, k, kk, l
|
||||||
character :: TransA, TransB
|
|
||||||
double precision :: alpha, beta
|
|
||||||
integer*8 :: M, N, K, LDA, LDB, LDC
|
|
||||||
|
|
||||||
TransA = 'N'
|
|
||||||
TransB = 'N'
|
|
||||||
alpha = 1.0d0
|
|
||||||
beta = 0.0d0
|
|
||||||
|
|
||||||
info = QMCKL_SUCCESS
|
info = QMCKL_SUCCESS
|
||||||
|
|
||||||
if (context == QMCKL_NULL_CONTEXT) then
|
if (context == QMCKL_NULL_CONTEXT) then
|
||||||
info = QMCKL_INVALID_CONTEXT
|
info = QMCKL_INVALID_CONTEXT
|
||||||
return
|
return
|
||||||
@ -5528,43 +5669,31 @@ integer function qmckl_compute_dtmp_c_acc_offload_f(context, cord_num, elec_num,
|
|||||||
return
|
return
|
||||||
endif
|
endif
|
||||||
|
|
||||||
M = 4*elec_num
|
!$acc parallel
|
||||||
N = nucl_num*(cord_num + 1)
|
!$acc loop independent gang worker vector collapse(6)
|
||||||
K = elec_num
|
|
||||||
LDA = 4*size(een_rescaled_e_deriv_e,1)
|
|
||||||
LDB = size(een_rescaled_n,1)
|
|
||||||
LDC = 4*size(dtmp_c,1)
|
|
||||||
|
|
||||||
do nw=1, walk_num
|
do nw=1, walk_num
|
||||||
do i=0, cord_num-1
|
do i=0, cord_num-1
|
||||||
|
|
||||||
! Single DGEMM
|
|
||||||
do j=0,cord_num
|
do j=0,cord_num
|
||||||
do jj=1,nucl_num
|
do jj=1,nucl_num
|
||||||
do k2=1,4
|
do k=1,4
|
||||||
do kk=1,elec_num
|
do kk=1,elec_num
|
||||||
|
|
||||||
tmp = 0.0
|
tmp = 0.0
|
||||||
do l=1,K
|
do l=1,elec_num
|
||||||
tmp = tmp + &
|
tmp = tmp + &
|
||||||
een_rescaled_e_deriv_e(kk, k2, l, i, nw) * een_rescaled_n(l, jj, j, nw)
|
een_rescaled_e_deriv_e(kk, k, l, i, nw) * een_rescaled_n(l, jj, j, nw)
|
||||||
enddo
|
end do
|
||||||
! affect tmp
|
dtmp_c(kk, k, jj, j, i, nw ) = tmp
|
||||||
dtmp_c(kk, k2, jj, j, i, nw ) = tmp
|
|
||||||
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
|
|
||||||
|
|
||||||
!info = qmckl_dgemm(context,TransA, TransB, M, N, K, alpha, &
|
|
||||||
! een_rescaled_e_deriv_e(1,1,1,i,nw),LDA*1_8, &
|
|
||||||
! een_rescaled_n(1,1,0,nw),LDB*1_8, &
|
|
||||||
! beta, &
|
|
||||||
! dtmp_c(1,1,1,0,i,nw),LDC)
|
|
||||||
end do
|
end do
|
||||||
end do
|
end do
|
||||||
|
end do
|
||||||
|
end do
|
||||||
|
|
||||||
|
end do
|
||||||
|
end do
|
||||||
|
!$acc end parallel
|
||||||
|
|
||||||
end function qmckl_compute_dtmp_c_acc_offload_f
|
end function qmckl_compute_dtmp_c_acc_offload_f
|
||||||
#+end_src
|
#+end_src
|
||||||
@ -5605,8 +5734,8 @@ end function qmckl_compute_dtmp_c_acc_offload_f
|
|||||||
real (c_double ) , intent(in) :: een_rescaled_n(elec_num,nucl_num,0:cord_num,walk_num)
|
real (c_double ) , intent(in) :: een_rescaled_n(elec_num,nucl_num,0:cord_num,walk_num)
|
||||||
real (c_double ) , intent(out) :: dtmp_c(elec_num,nucl_num,0:cord_num,0:cord_num-1,walk_num)
|
real (c_double ) , intent(out) :: dtmp_c(elec_num,nucl_num,0:cord_num,0:cord_num-1,walk_num)
|
||||||
|
|
||||||
integer(c_int32_t), external :: qmckl_compute_dtmp_c_f
|
integer(c_int32_t), external :: qmckl_compute_dtmp_c_acc_offload_f
|
||||||
info = qmckl_compute_dtmp_c_f &
|
info = qmckl_compute_dtmp_c_acc_offload_f &
|
||||||
(context, cord_num, elec_num, nucl_num, walk_num, een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
|
(context, cord_num, elec_num, nucl_num, walk_num, een_rescaled_e_deriv_e, een_rescaled_n, dtmp_c)
|
||||||
|
|
||||||
end function qmckl_compute_dtmp_c_acc_offload
|
end function qmckl_compute_dtmp_c_acc_offload
|
||||||
@ -6316,7 +6445,11 @@ qmckl_exit_code qmckl_provide_factor_een_deriv_e(qmckl_context context)
|
|||||||
#ifdef HAVE_OPENACC_OFFLOAD
|
#ifdef HAVE_OPENACC_OFFLOAD
|
||||||
if(ctx->jastrow.offload_type == OFFLOAD_OPENACC) {
|
if(ctx->jastrow.offload_type == OFFLOAD_OPENACC) {
|
||||||
qmckl_exit_code rc =
|
qmckl_exit_code rc =
|
||||||
qmckl_compute_factor_een_deriv_e_acc_offload(context,
|
// CPU version
|
||||||
|
qmckl_compute_factor_een_deriv_e(context,
|
||||||
|
|
||||||
|
// GPU version : No speedup on this kernel yet
|
||||||
|
// qmckl_compute_factor_een_deriv_e_acc_offload(context,
|
||||||
ctx->electron.walk_num,
|
ctx->electron.walk_num,
|
||||||
ctx->electron.num,
|
ctx->electron.num,
|
||||||
ctx->nucleus.num,
|
ctx->nucleus.num,
|
||||||
|
Loading…
Reference in New Issue
Block a user