1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-03 10:06:09 +01:00

Fix OpenACC and OpenMP implementations

This commit is contained in:
Aurelien Delval 2022-04-07 13:57:20 +02:00
parent 7dc02571e9
commit 3cd30bc8f3

View File

@ -5915,36 +5915,31 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (const qmckl_context context,
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num; const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
#pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n]) #pragma acc parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
{ {
#pragma acc loop independent gang worker vector #pragma acc loop independent gang worker vector collapse(5)
for (int64_t i=0 ; i<size_tmp_c ; ++i) for (int nw=0; nw < walk_num; ++nw) {
tmp_c[i] = 0.; for (int i=0; i<cord_num; ++i){
#pragma acc loop independent gang worker vector collapse(5) // Replacement for single DGEMM
for (int nw=0; nw < walk_num; ++nw) { for (int j=0; j<cord_num+1; j++) {
for (int i=0; i<cord_num; ++i){ for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Replacement for single DGEMM
for (int j=0; j<cord_num+1; j++) {
for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
} }
} }
} }
} }
} }
} }
}
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
} }
@ -6015,36 +6010,32 @@ qmckl_compute_tmp_c_omp_offload (const qmckl_context context,
const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num; const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
#pragma omp parallel copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n]) // WARNING This implementation seems unoptimized
#pragma omp target map(from:tmp_c[0:size_tmp_c]) map(to:een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
{ {
#pragma omp loop independent gang worker vector #pragma omp teams distribute parallel for collapse(5)
for (int64_t i=0 ; i<size_tmp_c ; ++i) for (int nw=0; nw < walk_num; ++nw) {
tmp_c[i] = 0.; for (int i=0; i<cord_num; ++i){
#pragma omp loop independent gang worker vector collapse(5) // Replacement for single DGEMM
for (int nw=0; nw < walk_num; ++nw) { for (int j=0; j<cord_num+1; j++) {
for (int i=0; i<cord_num; ++i){ for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Replacement for single DGEMM
for (int j=0; j<cord_num+1; j++) {
for (int k=0; k<nucl_num; k++) {
for (int l=0; l<elec_num; l++) {
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
}
// Single reduction
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] = 0;
for (int m=0; m<elec_num; m++) {
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] =
tmp_c[l + k*stride_k_c + j*stride_j_c + i*stride_i_c + nw*stride_nw_c] +
een_rescaled_e[l + m*stride_m_e + i*stride_i_e + nw*stride_nw_e] *
een_rescaled_n[m + k*stride_k_n + j*stride_j_n + nw*stride_nw_n];
} }
} }
} }
} }
} }
} }
}
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
} }
@ -6471,41 +6462,36 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
const int64_t stride_j_n = stride_k_n * nucl_num; const int64_t stride_j_n = stride_k_n * nucl_num;
const int64_t stride_nw_n = stride_j_n * (cord_num+1); const int64_t stride_nw_n = stride_j_n * (cord_num+1);
const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num; const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num; const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
#pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n]) #pragma acc parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
{ {
#pragma acc loop independent gang worker vector #pragma acc loop independent gang worker vector collapse(6)
for (int64_t i=0 ; i<size_dtmp_c ; ++i) for (int nw=0; nw < walk_num; nw++) {
dtmp_c[i] = 0.; for (int i=0; i < cord_num; i++) {
#pragma loop independent gang worker vector collapse(6) // Single DGEMM
for (int nw=0; nw < walk_num; nw++) { for(int j=0; j<cord_num+1; j++) {
for (int i=0; i < cord_num; i++) { for(int k=0; k<nucl_num; k++) {
for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) {
// Single DGEMM // Single reduction
for(int j=0; j<cord_num+1; j++) { dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
for(int k=0; k<nucl_num; k++) { for(int n=0; n<elec_num; n++){
for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) {
// Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
for(int n=0; n<elec_num; n++){
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] + dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] * een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n]; een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
} }
} }
} }
} }
} }
}
}
} }
return QMCKL_SUCCESS; return QMCKL_SUCCESS;
@ -6579,36 +6565,34 @@ qmckl_exit_code qmckl_compute_dtmp_c_omp_offload (
const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num;
const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num; const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
#pragma omp parallel copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n]) // WARNING This implementation seems unoptimized
#pragma omp target map(from:dtmp_c[0:size_dtmp_c]) map(to:een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
{ {
#pragma omp target
for (int64_t i=0 ; i<size_dtmp_c ; ++i)
dtmp_c[i] = 0.;
#pragma loop independent gang worker vector collapse(6) #pragma omp teams distribute parallel for collapse(6)
for (int nw=0; nw < walk_num; nw++) { for (int nw=0; nw < walk_num; nw++) {
for (int i=0; i < cord_num; i++) { for (int i=0; i < cord_num; i++) {
// Single DGEMM // Single DGEMM
for(int j=0; j<cord_num+1; j++) { for(int j=0; j<cord_num+1; j++) {
for(int k=0; k<nucl_num; k++) { for(int k=0; k<nucl_num; k++) {
for(int l=0; l<4; l++) { for(int l=0; l<4; l++) {
for(int m=0; m<elec_num; m++) { for(int m=0; m<elec_num; m++) {
// Single reduction // Single reduction
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0; dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = 0;
for(int n=0; n<elec_num; n++){ for(int n=0; n<elec_num; n++){
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] = dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] =
dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] + dtmp_c[m + l * stride_l_d + k * stride_k_d + j * stride_j_d + i * stride_i_d + nw * stride_nw_d] +
een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] * een_rescaled_e_deriv_e[m + l * stride_l_e + n * stride_n_e + i * stride_i_e + nw * stride_nw_e] *
een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n]; een_rescaled_n[n + k * stride_k_n + j * stride_j_n + nw * stride_nw_n];
}
}
} }
} }
} }
} }
} }
}
}
} }
return QMCKL_SUCCESS; return QMCKL_SUCCESS;