1
0
mirror of https://github.com/TREX-CoE/qmckl.git synced 2025-01-08 20:33:40 +01:00

Replace placeholder cuBLAS kernels with new C HPC implementation

This commit is contained in:
Aurélien Delval 2022-04-05 16:29:52 +02:00
parent f8e6d5f06b
commit 63c7f8ea72

View File

@@ -5783,17 +5783,6 @@ qmckl_exit_code qmckl_compute_tmp_c_cublas_offload (
const double* een_rescaled_n, const double* een_rescaled_n,
double* const tmp_c ) { double* const tmp_c ) {
qmckl_exit_code info;
int i, j, a, l, kk, p, lmax, nw;
char TransA, TransB;
double alpha, beta;
int M, N, K, LDA, LDB, LDC;
TransA = 'N';
TransB = 'N';
alpha = 1.0;
beta = 0.0;
if (context == QMCKL_NULL_CONTEXT) { if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT; return QMCKL_INVALID_CONTEXT;
} }
@@ -5810,29 +5799,40 @@ qmckl_exit_code qmckl_compute_tmp_c_cublas_offload (
return QMCKL_INVALID_ARG_4; return QMCKL_INVALID_ARG_4;
} }
M = elec_num; if (walk_num <= 0) {
N = nucl_num*(cord_num + 1); return QMCKL_INVALID_ARG_5;
K = elec_num; }
LDA = sizeof(een_rescaled_e)/sizeof(double); qmckl_exit_code info = QMCKL_SUCCESS;
LDB = sizeof(een_rescaled_n)/sizeof(double);
LDC = sizeof(tmp_c)/sizeof(double);
// TODO Replace with cuBLAS calls const char TransA = 'N';
for (int nw=0; nw < walk_num; ++nw) { const char TransB = 'N';
for (int i=0; i<cord_num; ++i){ const double alpha = 1.0;
const double beta = 0.0;
const int64_t M = elec_num;
const int64_t N = nucl_num*(cord_num + 1);
const int64_t K = elec_num;
const int64_t LDA = elec_num;
const int64_t LDB = elec_num;
const int64_t LDC = elec_num;
const int64_t af = elec_num*elec_num;
const int64_t bf = elec_num*nucl_num*(cord_num+1);
const int64_t cf = bf;
// TODO Replace with calls to cuBLAS
for (int64_t nw=0; nw < walk_num; ++nw) {
for (int64_t i=0; i<cord_num; ++i){
info = qmckl_dgemm(context, TransA, TransB, M, N, K, alpha, \ info = qmckl_dgemm(context, TransA, TransB, M, N, K, alpha, \
// &een_rescaled_e[0+0*elec_num+i*elec_num*elec_num+nw*elec_num*elec_num*(cord_num+1)], &(een_rescaled_e[af*(i+nw*(cord_num+1))]), \
&een_rescaled_e[ i*elec_num*elec_num+nw*elec_num*elec_num*(cord_num+1)], \
LDA, \ LDA, \
// &een_rescaled_n[0+0*elec_num+0*elec_num*nucl_num+nw*elec_num*nucl_num*(cord_num+1)], &(een_rescaled_n[bf*nw]), \
&een_rescaled_n[ nw*elec_num*nucl_num*(cord_num+1)], \
LDB, \ LDB, \
beta, \ beta, \
// &tmp_c[0+0*elec_num+0*elec_num*nucl_num+i*elec_num*nucl_num*(cord_num+1)+nw*elec_num*nucl_num*(cord_num+1)*cord_num], &(tmp_c[cf*(i+nw*cord_num)]), \
&tmp_c[ i*elec_num*nucl_num*(cord_num+1)+nw*elec_num*nucl_num*(cord_num+1)*cord_num], \
LDC); LDC);
} }
} }
@@ -6244,16 +6244,6 @@ qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
const double* een_rescaled_n, const double* een_rescaled_n,
double* const dtmp_c ) { double* const dtmp_c ) {
qmckl_exit_code info;
char TransA, TransB;
double alpha, beta;
int M, N, K, LDA, LDB, LDC;
TransA = 'N';
TransB = 'N';
alpha = 1.0;
beta = 0.0;
if (context == QMCKL_NULL_CONTEXT) { if (context == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT; return QMCKL_INVALID_CONTEXT;
} }
@@ -6270,27 +6260,39 @@ qmckl_exit_code qmckl_compute_dtmp_c_cublas_offload (
return QMCKL_INVALID_ARG_4; return QMCKL_INVALID_ARG_4;
} }
M = 4*elec_num; if (walk_num <= 0) {
N = nucl_num*(cord_num + 1); return QMCKL_INVALID_ARG_5;
K = elec_num; }
LDA = 4*sizeof(een_rescaled_e_deriv_e)/sizeof(double); qmckl_exit_code info = QMCKL_SUCCESS;
LDB = sizeof(een_rescaled_n)/sizeof(double);
LDC = 4*sizeof(dtmp_c)/sizeof(double);
// TODO Replace with cuBLAS calls const char TransA = 'N';
for (int nw=0; nw < walk_num; ++nw) { const char TransB = 'N';
for (int i=0; nw < cord_num; ++i) { const double alpha = 1.0;
const double beta = 0.0;
const int64_t M = 4*elec_num;
const int64_t N = nucl_num*(cord_num + 1);
const int64_t K = elec_num;
const int64_t LDA = 4*elec_num;
const int64_t LDB = elec_num;
const int64_t LDC = 4*elec_num;
const int64_t af = elec_num*elec_num*4;
const int64_t bf = elec_num*nucl_num*(cord_num+1);
const int64_t cf = elec_num*4*nucl_num*(cord_num+1);
// TODO Replace with calls to cuBLAS
for (int64_t nw=0; nw < walk_num; ++nw) {
for (int64_t i=0; i < cord_num; ++i) {
info = qmckl_dgemm(context, TransA, TransB, M, N, K, alpha, \ info = qmckl_dgemm(context, TransA, TransB, M, N, K, alpha, \
//&een_rescaled_e_deriv_e[0+0*elec_num+0*elec_num*4+i*elec_num*4*elec_num+nw*elec_num*4*elec_num*(cord_num+1)], &(een_rescaled_e_deriv_e[af*(i+nw*(cord_num+1))]), \
&een_rescaled_e_deriv_e[i*elec_num*4*elec_num+nw*elec_num*4*elec_num*(cord_num+1)], \
LDA, \ LDA, \
//&een_rescaled_n[0+0*elec_num+0*elec_num*nucl_num+nw*elec_num*nucl_num*(cord_num+1)], &(een_rescaled_n[bf*nw]), \
&een_rescaled_n[nw*elec_num*nucl_num*(cord_num+1)], \
LDB, \ LDB, \
beta, \ beta, \
//&dtmp_c[0+0*elec_num+0*elec_num*4+0*elec_num*4*nucl_num+i*elec_num*4*nucl_num*(cord_num+1)+nw*elec_num*4*nucl_num*(cord_num+1)*cord_num], &(dtmp_c[cf*(i+nw*cord_num)]), \
&dtmp_c[i*elec_num*4*nucl_num*(cord_num+1)+nw*elec_num*4*nucl_num*(cord_num+1)*cord_num], \
LDC); LDC);
} }
} }