Replace `+=` with `= ... +` to enable better FMA (fused multiply-add) code generation

This commit is contained in:
Anthony Scemama 2024-01-30 11:31:07 +01:00
parent ffbeb97df4
commit dd3db966b0
3 changed files with 79 additions and 77 deletions

View File

@ -5910,7 +5910,7 @@ IVDEP
for (int l=1 ; l<coef_mat_sparse_idx[inucl][i][0]; ++l) {
const int k = idx[l];
if (k >= nidx) break;
ce_mat[i] += v[l] * exp_mat[k];
ce_mat[i] = ce_mat[i] + v[l] * exp_mat[k];
}
}
@ -6730,7 +6730,7 @@ IVDEP
#pragma omp simd
#endif
for (int j=0 ; j<8 ; ++j) {
ce_mat[i][j] += v[l] * exp_mat[k][j];
ce_mat[i][j] = ce_mat[i][j] + v[l] * exp_mat[k][j];
}
}
}

View File

@ -1994,7 +1994,7 @@ qmckl_compute_jastrow_champ_asymp_jasb_hpc (const qmckl_context context,
double x = kappa_inv;
for (int k = 2; k <= bord_num; ++k) {
x *= kappa_inv;
f += b_vector[k]*x;
f = f + b_vector[k]*x;
}
asymp_jasb[0] = spin_independent == 1 ? asym_one + f : 0.5 * asym_one + f;
@ -2491,7 +2491,7 @@ qmckl_compute_jastrow_champ_factor_ee_hpc (const qmckl_context context,
for (int j = 0; j < elec_num; ++j ) {
const double* xj = &(ee_distance_rescaled[j * elec_num + ishift]);
for (int i = 0; i < j ; ++i) {
factor_ee[nw] += b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
factor_ee[nw] = factor_ee[nw] + b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
}
}
@ -2500,23 +2500,23 @@ qmckl_compute_jastrow_champ_factor_ee_hpc (const qmckl_context context,
for (int j = 0; j < up_num; ++j ) {
const double* xj = &(ee_distance_rescaled[j * elec_num + ishift]);
for (int i = 0; i < j ; ++i) {
factor_ee[nw] += 0.5 * b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
factor_ee[nw] = factor_ee[nw] + 0.5 * b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
}
}
for (int j = up_num ; j < elec_num; ++j ) {
const double* xj = &(ee_distance_rescaled[j * elec_num + ishift]);
for (int i = 0; i < up_num; ++i) {
factor_ee[nw] += b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
factor_ee[nw] = factor_ee[nw] + b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
}
for (int i = up_num ; i < j ; ++i) {
factor_ee[nw] += 0.5 * b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
factor_ee[nw] = factor_ee[nw] + 0.5 * b_vector[0]*xj[i] / (1. + b_vector[1]*xj[i]);
}
}
}
factor_ee[nw] -= fshift;
factor_ee[nw] = factor_ee[nw] - fshift;
for (int j=0; j < elec_num; ++j ) {
const double* xj = &(ee_distance_rescaled[j * elec_num + ishift]);
@ -2525,7 +2525,7 @@ qmckl_compute_jastrow_champ_factor_ee_hpc (const qmckl_context context,
double xk = x;
for (int k = 2; k <= bord_num; ++k) {
xk *= x;
factor_ee[nw] += b_vector[k] * xk;
factor_ee[nw] = factor_ee[nw] + b_vector[k] * xk;
}
}
@ -2991,11 +2991,11 @@ qmckl_compute_jastrow_champ_factor_ee_gl_hpc(const qmckl_context context,
f *= 0.5;
}
factor_ee_gl_0[i] += f*dx[0];
factor_ee_gl_1[i] += f*dx[1];
factor_ee_gl_2[i] += f*dx[2];
factor_ee_gl_3[i] += f*dx[3];
factor_ee_gl_3[i] -= f*grad_c2*invdenom*2.0 * b_vector[1];
factor_ee_gl_0[i] = factor_ee_gl_0[i] + f*dx[0];
factor_ee_gl_1[i] = factor_ee_gl_1[i] + f*dx[1];
factor_ee_gl_2[i] = factor_ee_gl_2[i] + f*dx[2];
factor_ee_gl_3[i] = factor_ee_gl_3[i] + f*dx[3];
factor_ee_gl_3[i] = factor_ee_gl_3[i] - f*grad_c2*invdenom*2.0 * b_vector[1];
double xk[bord_num+1]; // Nvidia C 23.1-0 compiler crashes here (skylake avx512) nvc nvfortran --enable-hpc
@ -3007,11 +3007,11 @@ qmckl_compute_jastrow_champ_factor_ee_gl_hpc(const qmckl_context context,
for (int k=2 ; k<= bord_num ; ++k) {
const double f1 = b_vector[k] * kf[k] * xk[k-2];
const double f2 = f1*xk[1];
factor_ee_gl_0[i] += f2*dx[0];
factor_ee_gl_1[i] += f2*dx[1];
factor_ee_gl_2[i] += f2*dx[2];
factor_ee_gl_3[i] += f2*dx[3];
factor_ee_gl_3[i] += f1*kf[k-1]*grad_c2;
factor_ee_gl_0[i] = factor_ee_gl_0[i] + f2*dx[0];
factor_ee_gl_1[i] = factor_ee_gl_1[i] + f2*dx[1];
factor_ee_gl_2[i] = factor_ee_gl_2[i] + f2*dx[2];
factor_ee_gl_3[i] = factor_ee_gl_3[i] + f2*dx[3];
factor_ee_gl_3[i] = factor_ee_gl_3[i] + f1*kf[k-1]*grad_c2;
}
}
}
@ -4483,15 +4483,15 @@ qmckl_exit_code qmckl_compute_jastrow_champ_factor_en_hpc (
const double* en_distance_rescaled__ = &(en_distance_rescaled_[a*elec_num]);
const double* a_vec = &(a_vector[(aord_num+1)*type_nucl_vector[a]]);
factor_en[nw] -= asymp_jasa[type_nucl_vector[a]]*de;
factor_en[nw] = factor_en[nw] - asymp_jasa[type_nucl_vector[a]]*de;
for (int64_t i=0 ; i<elec_num ; ++i) {
double x = en_distance_rescaled__[i];
factor_en[nw] += a_vec[0]*x / (1.0 + a_vec[1]*x);
factor_en[nw] = factor_en[nw] + a_vec[0]*x / (1.0 + a_vec[1]*x);
for (int64_t p=2 ; p <= aord_num ; ++p) {
x *= en_distance_rescaled__[i];
factor_en[nw] += a_vec[p]*x;
factor_en[nw] = factor_en[nw] + a_vec[p]*x;
}
}
}
@ -4881,11 +4881,11 @@ qmckl_compute_jastrow_champ_factor_en_gl_hpc (const qmckl_context context,
double f = a_vec[0] * invdenom2;
factor_en_gl_0[i] += f*dx[0];
factor_en_gl_1[i] += f*dx[1];
factor_en_gl_2[i] += f*dx[2];
factor_en_gl_3[i] += f*dx[3];
factor_en_gl_3[i] -= f*grad_c2*invdenom*2.0 * a_vec[1];
factor_en_gl_0[i] = factor_en_gl_0[i] + f*dx[0];
factor_en_gl_1[i] = factor_en_gl_1[i] + f*dx[1];
factor_en_gl_2[i] = factor_en_gl_2[i] + f*dx[2];
factor_en_gl_3[i] = factor_en_gl_3[i] + f*dx[3];
factor_en_gl_3[i] = factor_en_gl_3[i] - f*grad_c2*invdenom*2.0 * a_vec[1];
double xk[aord_num+1];
@ -4897,11 +4897,11 @@ qmckl_compute_jastrow_champ_factor_en_gl_hpc (const qmckl_context context,
for (int k=2 ; k<= aord_num ; ++k) {
const double f1 = a_vec[k] * kf[k] * xk[k-2];
const double f2 = f1*xk[1];
factor_en_gl_0[i] += f2*dx[0];
factor_en_gl_1[i] += f2*dx[1];
factor_en_gl_2[i] += f2*dx[2];
factor_en_gl_3[i] += f2*dx[3];
factor_en_gl_3[i] += f1*kf[k-1]*grad_c2;
factor_en_gl_0[i] = factor_en_gl_0[i] + f2*dx[0];
factor_en_gl_1[i] = factor_en_gl_1[i] + f2*dx[1];
factor_en_gl_2[i] = factor_en_gl_2[i] + f2*dx[2];
factor_en_gl_3[i] = factor_en_gl_3[i] + f2*dx[3];
factor_en_gl_3[i] = factor_en_gl_3[i] + f1*kf[k-1]*grad_c2;
}
}
}
@ -10214,7 +10214,7 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context,
double tmp3[elec_num];
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_0nw[j] += cn *
factor_een_gl_0nw[j] = factor_een_gl_0nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_0amlnw[j] +
dtmp_c_0amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_0amlknw[j] * een_rescaled_n_amnw[j] +
@ -10225,29 +10225,29 @@ qmckl_compute_jastrow_champ_factor_een_gl_hpc(const qmckl_context context,
}
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_1nw[j] += cn *
factor_een_gl_1nw[j] = factor_een_gl_1nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_1amlnw[j] +
dtmp_c_1amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_1amnw[j]);
tmp3[j] +=
tmp3[j] = tmp3[j] +
dtmp_c_1amknw[j] * een_rescaled_n_gl_1amlnw[j] +
dtmp_c_1amlknw[j] * een_rescaled_n_gl_1amnw[j];
}
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_2nw[j] += cn *
factor_een_gl_2nw[j] = factor_een_gl_2nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_2amlnw[j] +
dtmp_c_2amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_2amlknw[j] * een_rescaled_n_amnw[j] +
tmp_c_amlkn[j] * een_rescaled_n_gl_2amnw[j]);
tmp3[j] +=
tmp3[j] = tmp3[j] +
dtmp_c_2amknw[j] * een_rescaled_n_gl_2amlnw[j] +
dtmp_c_2amlknw[j] * een_rescaled_n_gl_2amnw[j];
}
for (size_t j = 0; j < (size_t) elec_num; ++j) {
factor_een_gl_3nw[j] += cn *
factor_een_gl_3nw[j] = factor_een_gl_3nw[j] + cn *
(tmp_c_amkn[j] * een_rescaled_n_gl_3amlnw[j] +
dtmp_c_3amknw[j] * een_rescaled_n_amlnw[j] +
dtmp_c_3amlknw[j] * een_rescaled_n_amnw[j] +

View File

@ -526,9 +526,9 @@ qmckl_set_mo_basis_r_cusp(qmckl_context context,
for (int64_t k=0 ; k<ctx->ao_basis.ao_num ; ++k) {
if ( ctx->ao_basis.ao_nucl[k] == inucl && ctx->ao_basis.ao_ang_mom[k] == 0) {
const double ck = ctx->mo_basis.coefficient[k + i*ctx->ao_basis.ao_num];
qmckl_ten3(mo_vgl_at_r_cusp_s,i,0,inucl) += ck * qmckl_ten3(ao_vgl_at_r_cusp_s,k,0,inucl);
qmckl_ten3(mo_vgl_at_r_cusp_s,i,1,inucl) += ck * qmckl_ten3(ao_vgl_at_r_cusp_s,k,3,inucl);
qmckl_ten3(mo_vgl_at_r_cusp_s,i,2,inucl) += ck * qmckl_ten3(ao_vgl_at_r_cusp_s,k,4,inucl);
qmckl_ten3(mo_vgl_at_r_cusp_s,i,0,inucl) = qmckl_ten3(mo_vgl_at_r_cusp_s,i,0,inucl) + ck * qmckl_ten3(ao_vgl_at_r_cusp_s,k,0,inucl);
qmckl_ten3(mo_vgl_at_r_cusp_s,i,1,inucl) = qmckl_ten3(mo_vgl_at_r_cusp_s,i,1,inucl) + ck * qmckl_ten3(ao_vgl_at_r_cusp_s,k,3,inucl);
qmckl_ten3(mo_vgl_at_r_cusp_s,i,2,inucl) = qmckl_ten3(mo_vgl_at_r_cusp_s,i,2,inucl) + ck * qmckl_ten3(ao_vgl_at_r_cusp_s,k,4,inucl);
}
}
}
@ -1363,7 +1363,8 @@ qmckl_compute_mo_basis_mo_value_hpc_sp (const qmckl_context context,
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl_sp[i] += ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41 +
vgl_sp[i] = vgl_sp[i] +
ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41 +
ck5[i] * a51 + ck6[i] * a61 + ck7[i] * a71 + ck8[i] * a81;
}
}
@ -1377,7 +1378,7 @@ qmckl_compute_mo_basis_mo_value_hpc_sp (const qmckl_context context,
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl_sp[i] += ck[i] * a1;
vgl_sp[i] = vgl_sp[i] + ck[i] * a1;
}
}
@ -1486,7 +1487,8 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context,
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41 +
vgl1[i] = vgl1[i] +
ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41 +
ck5[i] * a51 + ck6[i] * a61 + ck7[i] * a71 + ck8[i] * a81;
}
}
@ -1500,7 +1502,7 @@ qmckl_compute_mo_basis_mo_value_hpc (const qmckl_context context,
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += ck[i] * a1;
vgl1[i] = vgl1[i] + ck[i] * a1;
}
}
}
@ -2041,11 +2043,11 @@ IVDEP
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
vgl2[i] += ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
vgl3[i] += ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
vgl4[i] += ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
vgl5[i] += ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
vgl1[i] = vgl1[i] + ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
vgl2[i] = vgl2[i] + ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
vgl3[i] = vgl3[i] + ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
vgl4[i] = vgl4[i] + ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
vgl5[i] = vgl5[i] + ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
}
}
@ -2062,11 +2064,11 @@ IVDEP
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += ck[i] * a1;
vgl2[i] += ck[i] * a2;
vgl3[i] += ck[i] * a3;
vgl4[i] += ck[i] * a4;
vgl5[i] += ck[i] * a5;
vgl1[i] = vgl1[i] + ck[i] * a1;
vgl2[i] = vgl2[i] + ck[i] * a2;
vgl3[i] = vgl3[i] + ck[i] * a3;
vgl4[i] = vgl4[i] + ck[i] * a4;
vgl5[i] = vgl5[i] + ck[i] * a5;
}
}
}
@ -2222,11 +2224,11 @@ qmckl_compute_mo_basis_mo_vgl_hpc_sp (const qmckl_context context,
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl_sp1[i] += ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
vgl_sp2[i] += ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
vgl_sp3[i] += ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
vgl_sp4[i] += ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
vgl_sp5[i] += ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
vgl_sp1[i] = vgl_sp1[i] + ck1[i] * a11 + ck2[i] * a21 + ck3[i] * a31 + ck4[i] * a41;
vgl_sp2[i] = vgl_sp2[i] + ck1[i] * a12 + ck2[i] * a22 + ck3[i] * a32 + ck4[i] * a42;
vgl_sp3[i] = vgl_sp3[i] + ck1[i] * a13 + ck2[i] * a23 + ck3[i] * a33 + ck4[i] * a43;
vgl_sp4[i] = vgl_sp4[i] + ck1[i] * a14 + ck2[i] * a24 + ck3[i] * a34 + ck4[i] * a44;
vgl_sp5[i] = vgl_sp5[i] + ck1[i] * a15 + ck2[i] * a25 + ck3[i] * a35 + ck4[i] * a45;
}
}
@ -2243,11 +2245,11 @@ qmckl_compute_mo_basis_mo_vgl_hpc_sp (const qmckl_context context,
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl_sp1[i] += ck[i] * a1;
vgl_sp2[i] += ck[i] * a2;
vgl_sp3[i] += ck[i] * a3;
vgl_sp4[i] += ck[i] * a4;
vgl_sp5[i] += ck[i] * a5;
vgl_sp1[i] = vgl_sp1[i] + ck[i] * a1;
vgl_sp2[i] = vgl_sp2[i] + ck[i] * a2;
vgl_sp3[i] = vgl_sp3[i] + ck[i] * a3;
vgl_sp4[i] = vgl_sp4[i] + ck[i] * a4;
vgl_sp5[i] = vgl_sp5[i] + ck[i] * a5;
}
}
IVDEP
@ -2599,7 +2601,7 @@ IVDEP
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += ck[i] * a1;
vgl1[i] = vgl1[i] + ck[i] * a1;
}
}
@ -2608,7 +2610,7 @@ IVDEP
const double r = ria[inucl];
IVDEP
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*(
vgl1[i] = vgl1[i] + qmckl_ten3(cusp_param,i,0,inucl) + r*(
qmckl_ten3(cusp_param,i,1,inucl) + r*(
qmckl_ten3(cusp_param,i,2,inucl) + r*(
qmckl_ten3(cusp_param,i,3,inucl) )));
@ -3032,11 +3034,11 @@ IVDEP
#pragma omp simd
#endif
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += ck[i] * a1;
vgl2[i] += ck[i] * a2;
vgl3[i] += ck[i] * a3;
vgl4[i] += ck[i] * a4;
vgl5[i] += ck[i] * a5;
vgl1[i] = vgl1[i] + ck[i] * a1;
vgl2[i] = vgl2[i] + ck[i] * a2;
vgl3[i] = vgl3[i] + ck[i] * a3;
vgl4[i] = vgl4[i] + ck[i] * a4;
vgl5[i] = vgl5[i] + ck[i] * a5;
}
}
@ -3052,7 +3054,7 @@ IVDEP
IVDEP
for (int64_t i=0 ; i<mo_num ; ++i) {
vgl1[i] += qmckl_ten3(cusp_param,i,0,inucl) + r*(
vgl1[i] = vgl1[i] + qmckl_ten3(cusp_param,i,0,inucl) + r*(
qmckl_ten3(cusp_param,i,1,inucl) + r*(
qmckl_ten3(cusp_param,i,2,inucl) + r*(
qmckl_ten3(cusp_param,i,3,inucl) )));
@ -3061,11 +3063,11 @@ IVDEP
2.0*qmckl_ten3(cusp_param,i,2,inucl) +
r * 3.0 * qmckl_ten3(cusp_param,i,3,inucl);
vgl2[i] += r_vec[0] * c1;
vgl3[i] += r_vec[1] * c1;
vgl4[i] += r_vec[2] * c1;
vgl2[i] = vgl2[i] + r_vec[0] * c1;
vgl3[i] = vgl3[i] + r_vec[1] * c1;
vgl4[i] = vgl4[i] + r_vec[2] * c1;
vgl5[i] += 2.0*qmckl_ten3(cusp_param,i,1,inucl)*r_inv +
vgl5[i] = vgl5[i] + 2.0*qmckl_ten3(cusp_param,i,1,inucl)*r_inv +
6.0*qmckl_ten3(cusp_param,i,2,inucl) +
12.0*qmckl_ten3(cusp_param,i,3,inucl)*r;
}