1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2024-09-27 03:51:09 +02:00

Fixed naive GPU kernels and ignored variable issue

This commit is contained in:
Aurélien Delval 2022-04-05 14:28:35 +02:00
parent bc43113b6f
commit eb71a752f5
2 changed files with 120 additions and 180 deletions

View File

@ -1,114 +0,0 @@
subroutine qmckl_check_error(rc, message)
use qmckl
implicit none
integer(qmckl_exit_code), intent(in) :: rc
character(len=*) , intent(in) :: message
character(len=128) :: str_buffer
if (rc /= QMCKL_SUCCESS) then
print *, message
call qmckl_string_of_error(rc, str_buffer)
print *, str_buffer
call exit(rc)
end if
end subroutine qmckl_check_error
program ao_grid
use qmckl
implicit none
integer(qmckl_context) :: qmckl_ctx ! QMCkl context
integer(qmckl_exit_code) :: rc ! Exit code of QMCkl functions
character(len=128) :: trexio_filename
character(len=128) :: str_buffer
integer :: ao_id
integer :: point_num_x
integer(c_int64_t) :: nucl_num
double precision, allocatable :: nucl_coord(:,:)
integer(c_int64_t) :: point_num
integer(c_int64_t) :: ao_num
integer(c_int64_t) :: ipoint, i, j, k
double precision :: x, y, z, dr(3)
double precision :: rmin(3), rmax(3)
double precision, allocatable :: points(:,:)
double precision, allocatable :: ao_vgl(:,:,:)
if (iargc() /= 3) then
print *, 'Syntax: ao_grid <trexio_file> <AO_id> <point_num>'
call exit(-1)
end if
call getarg(1, trexio_filename)
call getarg(2, str_buffer)
read(str_buffer, *) ao_id
call getarg(3, str_buffer)
read(str_buffer, *) point_num_x
if (point_num_x < 0 .or. point_num_x > 300) then
print *, 'Error: 0 < point_num < 300'
call exit(-1)
end if
qmckl_ctx = qmckl_context_create()
rc = qmckl_trexio_read(qmckl_ctx, trexio_filename, 1_8*len(trim(trexio_filename)))
call qmckl_check_error(rc, 'Read TREXIO')
rc = qmckl_get_ao_basis_ao_num(qmckl_ctx, ao_num)
call qmckl_check_error(rc, 'Getting ao_num')
if (ao_id < 0 .or. ao_id > ao_num) then
print *, 'Error: 0 < ao_id < ', ao_num
call exit(-1)
end if
rc = qmckl_get_nucleus_num(qmckl_ctx, nucl_num)
call qmckl_check_error(rc, 'Get nucleus num')
allocate( nucl_coord(3, nucl_num) )
rc = qmckl_get_nucleus_coord(qmckl_ctx, 'N', nucl_coord, 3_8*nucl_num)
call qmckl_check_error(rc, 'Get nucleus coord')
rmin(1) = minval( nucl_coord(1,:) ) - 5.d0
rmin(2) = minval( nucl_coord(2,:) ) - 5.d0
rmin(3) = minval( nucl_coord(3,:) ) - 5.d0
rmax(1) = maxval( nucl_coord(1,:) ) + 5.d0
rmax(2) = maxval( nucl_coord(2,:) ) + 5.d0
rmax(3) = maxval( nucl_coord(3,:) ) + 5.d0
dr(1:3) = (rmax(1:3) - rmin(1:3)) / dble(point_num_x-1)
point_num = point_num_x**3
allocate( points(point_num, 3) )
ipoint=0
z = rmin(3)
do k=1,point_num_x
y = rmin(2)
do j=1,point_num_x
x = rmin(1)
do i=1,point_num_x
ipoint = ipoint+1
points(ipoint,1) = x
points(ipoint,2) = y
points(ipoint,3) = z
x = x + dr(1)
end do
y = y + dr(2)
end do
z = z + dr(3)
end do
rc = qmckl_set_point(qmckl_ctx, 'T', points, point_num)
call qmckl_check_error(rc, 'Setting points')
allocate( ao_vgl(ao_num, 5, point_num) )
rc = qmckl_get_ao_basis_ao_vgl(qmckl_ctx, ao_vgl, ao_num*5_8*point_num)
call qmckl_check_error(rc, 'Setting points')
do ipoint=1, point_num
print '(3(F16.10,X),E20.10)', points(ipoint, 1:3), ao_vgl(ao_id,1,ipoint)
end do
deallocate( nucl_coord, points, ao_vgl )
end program ao_grid

View File

@ -4889,57 +4889,65 @@ qmckl_exit_code qmckl_provide_tmp_c(qmckl_context context)
}
ctx->jastrow.tmp_c = tmp_c;
}
/* Choose the correct compute function (depending on offload type) */
bool default_compute = true;
switch(ctx->jastrow.offload_type) {
case OFFLOAD_OPENACC:
#ifdef HAVE_OPENACC_OFFLOAD
rc =
qmckl_compute_tmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#elif
rc = qmckl_compute_tmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#ifdef HAVE_OPENACC_OFFLOAD
if(ctx->jastrow.offload_type == OFFLOAD_OPENACC) {
qmckl_exit_code rc =
qmckl_compute_tmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
default_compute = false;
if (rc != QMCKL_SUCCESS) {
return rc;
}
#endif
break;
case OFFLOAD_CUBLAS:
#ifdef HAVE_CUBLAS_OFFLOAD
rc =
qmckl_compute_tmp_c_cublas_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#elif
rc = qmckl_compute_tmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
#endif
break;
default:
rc = qmckl_compute_tmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
break;
}
#endif
#ifdef HAVE_CUBLAS_OFFLOAD
if(ctx->jastrow.offload_type == OFFLOAD_CUBLAS) {
qmckl_exit_code rc =
qmckl_compute_tmp_c_cublas_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
default_compute = false;
if (rc != QMCKL_SUCCESS) {
return rc;
}
}
#endif
rc = qmckl_compute_tmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.tmp_c);
if (rc != QMCKL_SUCCESS) {
return rc;
}
ctx->jastrow.tmp_c_date = ctx->date;
}
@ -4980,15 +4988,61 @@ qmckl_exit_code qmckl_provide_dtmp_c(qmckl_context context)
ctx->jastrow.dtmp_c = dtmp_c;
}
switch(ctx->jastrow.offload_type) {
case OFFLOAD_OPENACC:
#ifdef HAVE_OPENACC_OFFLOAD
rc = qmckl_compute_dtmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#elif
rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#endif
break;
case OFFLOAD_CUBLAS:
#ifdef HAVE_CUBLAS_OFFLOAD
rc = qmckl_compute_dtmp_c_acc_offload(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#elif
rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
#endif
break;
default:
rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
break;
}
rc = qmckl_compute_dtmp_c(context,
ctx->jastrow.cord_num,
ctx->electron.num,
ctx->nucleus.num,
ctx->electron.walk_num,
ctx->jastrow.een_rescaled_e_deriv_e,
ctx->jastrow.een_rescaled_n,
ctx->jastrow.dtmp_c);
if (rc != QMCKL_SUCCESS) {
return rc;
}
@ -5617,7 +5671,6 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (
const double* een_rescaled_n,
double* const tmp_c ) {
if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
@ -5649,8 +5702,8 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (
int stride_j_n = stride_k_n * nucl_num;
int stride_nw_n = stride_j_n * (cord_num+1);
//#pragma acc parallel
//#pragma acc loop independent gang worker vector collapse(5)
#pragma acc parallel
#pragma acc loop independent gang worker vector collapse(5)
for (int nw=0; nw < walk_num; ++nw) {
for (int i=0; i<cord_num; ++i){
@ -5660,9 +5713,10 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c, nw*stride_nw_c];
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
@ -6110,8 +6164,8 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
int stride_nw_n = stride_j_n * (cord_num+1);
//#pragma acc parallel
//#pragma loop independent gang worker vector collapse(6)
#pragma acc parallel
#pragma loop independent gang worker vector collapse(6)
for (int nw=0; nw < walk_num; nw++) {
for (int i=0; i < cord_num; i++) {
@ -6122,12 +6176,12 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
for(int m=0; m<elec_num; m++) {
// Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
for(int n=0; n<elec_num; n++){
// TODO Fix indexing issues
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d];
//een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
//een_rescaled_n[n * + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
}