diff --git a/org/qmckl_sherman_morrison_woodbury.org b/org/qmckl_sherman_morrison_woodbury.org index 54b8b98..71d6544 100644 --- a/org/qmckl_sherman_morrison_woodbury.org +++ b/org/qmckl_sherman_morrison_woodbury.org @@ -26,11 +26,16 @@ int main() { qmckl_exit_code rc; #+end_src + #+NAME:kernel_generator_range + #+begin_src python :noweb yes :exports none +range(2, 22) + #+end_src + * Naïve Sherman-Morrison -** ~qmckl_sherman_morrison~ +** ~qmckl_sherman_morrison_naive~ :PROPERTIES: - :Name: qmckl_sherman_morrison + :Name: qmckl_sherman_morrison_naive :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: @@ -63,7 +68,7 @@ int main() { If the determinant of the Slater-matrix is passed, it will be updated to the determinant resulting from applying the updates to the original matrix. - #+NAME: qmckl_sherman_morrison_args + #+NAME: qmckl_sherman_morrison_naive_args | qmckl_context | context | in | Global state | | uint64_t | LDS | in | Leading dimension of Slater_inv | | uint64_t | Dim | in | Dimension of Slater_inv | @@ -87,11 +92,11 @@ int main() { *** C header - #+CALL: generate_c_header(table=qmckl_sherman_morrison_args,rettyp=get_value("CRetType"),fname=get_value("Name")) + #+CALL: generate_c_header(table=qmckl_sherman_morrison_naive_args,rettyp=get_value("CRetType"),fname=get_value("Name")) #+RESULTS: #+begin_src c :tangle (eval h_func) :comments org - qmckl_exit_code qmckl_sherman_morrison( + qmckl_exit_code qmckl_sherman_morrison_naive( const qmckl_context context, const uint64_t LDS, const uint64_t Dim, @@ -130,7 +135,7 @@ int main() { #define ALIGNED #endif -qmckl_exit_code qmckl_sherman_morrison_hpc( +qmckl_exit_code qmckl_sherman_morrison_naive_hpc( const qmckl_context context, const uint64_t LDS, const uint64_t Dim, @@ -203,9 +208,9 @@ qmckl_exit_code qmckl_sherman_morrison_hpc( - #+NAME:naive_template + #+NAME:naive_template_code #+begin_src c -static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}( +static inline qmckl_exit_code qmckl_sherman_morrison_naive_{Dim}( const qmckl_context context, const uint64_t N_updates, const double* __restrict Updates, @@ -217,7 +222,7 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}( if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return qmckl_failwith( context, QMCKL_NULL_CONTEXT, - "qmckl_sherman_morrison_{Dim}", + "qmckl_sherman_morrison_naive_{Dim}", NULL); } @@ -280,13 +285,13 @@ static inline qmckl_exit_code qmckl_sherman_morrison_{Dim}( - #+NAME:naive-python + #+NAME:naive_kernel_generator #+begin_src python :noweb yes :exports none text=""" -<> +<> """ result = [] -for Dim in range(2, 22): +for Dim in <>: Dim=str(Dim) result.append(text.replace("{Dim}",Dim) ) @@ -297,11 +302,11 @@ return '\n'.join(result) - #+NAME:naive-python-switch + #+NAME:naive_switch-case_generator #+begin_src python :noweb yes :exports none text=""" case {Dim}: - return qmckl_sherman_morrison_{Dim}(context, + return qmckl_sherman_morrison_naive_{Dim}(context, N_updates, Updates, Updates_index, @@ -310,7 +315,7 @@ case {Dim}: determinant); """ result = [] -for Dim in range(2, 22): +for Dim in <>: Dim=str(Dim) result.append(text.replace("{Dim}",Dim) ) @@ -322,9 +327,9 @@ return '\n'.join(result) #+begin_src c :tangle (eval c) :comments org :noweb yes -<> +<> -qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, +qmckl_exit_code qmckl_sherman_morrison_naive(const qmckl_context context, const uint64_t LDS, const uint64_t Dim, const uint64_t N_updates, @@ -337,17 +342,17 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return qmckl_failwith(context, QMCKL_NULL_CONTEXT, - "qmckl_sherman_morrison", + "qmckl_sherman_morrison_naive", NULL); } if (LDS == (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) { // Most cases switch (Dim) { - <> + <> } } else { // When SIMD_LENGTH > 1, called with LDS == Dim AND Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) - return qmckl_sherman_morrison_hpc(context, + return qmckl_sherman_morrison_naive_hpc(context, LDS, Dim, N_updates, @@ -359,7 +364,7 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, } - return QMCKL_SUCCESS; + return QMCKL_FAILURE; } #+end_src @@ -375,17 +380,17 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, ** Fortran interface :noexport: :PROPERTIES: - :Name: qmckl_sherman_morrison + :Name: qmckl_sherman_morrison_naive :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: - #+CALL: generate_f_interface(table=qmckl_sherman_morrison_args,rettyp=get_value("FRetType"),fname=get_value("Name")) + #+CALL: generate_f_interface(table=qmckl_sherman_morrison_naive_args,rettyp=get_value("FRetType"),fname=get_value("Name")) #+RESULTS: #+begin_src f90 :tangle (eval fh_func) :comments org :exports none interface - integer(c_int32_t) function qmckl_sherman_morrison & + integer(c_int32_t) function qmckl_sherman_morrison_naive & (context, LDS, Dim, N_updates, Updates, Updates_index, breakdown, Slater_inv, determinant) & bind(C) @@ -403,7 +408,7 @@ qmckl_exit_code qmckl_sherman_morrison(const qmckl_context context, real (c_double ) , intent(inout) :: Slater_inv(LDS*Dim) real (c_double ) , intent(inout) :: determinant - end function qmckl_sherman_morrison + end function qmckl_sherman_morrison_naive end interface #+end_src @@ -426,7 +431,7 @@ assert(Slater_inv1 != NULL); // original determinant of Slater1 (before applying updates) double det = 3.407025646103221e-10; -rc = qmckl_sherman_morrison(context, +rc = qmckl_sherman_morrison_naive(context, LDS, Dim, N_updates1, @@ -463,9 +468,9 @@ assert(rc == QMCKL_SUCCESS); * Woodbury 2x2 -** ~qmckl_woodbury_2~ +** ~qmckl_woodbury_2x2~ :PROPERTIES: - :Name: qmckl_woodbury_2 + :Name: qmckl_woodbury_2x2 :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: @@ -488,7 +493,7 @@ assert(rc == QMCKL_SUCCESS); - #+NAME: qmckl_woodbury_2_args + #+NAME: qmckl_woodbury_2x2_args | qmckl_context | context | in | Global state | | uint64_t | LDS | in | Leading dimension of Slater_inv | | uint64_t | Dim | in | Dimension of Slater_inv | @@ -510,11 +515,11 @@ assert(rc == QMCKL_SUCCESS); *** C header - #+CALL: generate_c_header(table=qmckl_woodbury_2_args,rettyp=get_value("CRetType"),fname=get_value("Name")) + #+CALL: generate_c_header(table=qmckl_woodbury_2x2_args,rettyp=get_value("CRetType"),fname=get_value("Name")) #+RESULTS: #+begin_src c :tangle (eval h_func) :comments org - qmckl_exit_code qmckl_woodbury_2( + qmckl_exit_code qmckl_woodbury_2x2( const qmckl_context context, const uint64_t LDS, const uint64_t Dim, @@ -528,14 +533,14 @@ assert(rc == QMCKL_SUCCESS); *** C source #+begin_src c :tangle (eval c) :comments org -qmckl_exit_code qmckl_woodbury_2(const qmckl_context context, - const uint64_t LDS, - const uint64_t Dim, - const double* Updates, - const uint64_t* Updates_index, - const double breakdown, - double* Slater_inv, - double* determinant) { +qmckl_exit_code qmckl_woodbury_2x2_hpc(const qmckl_context context, + const uint64_t LDS, + const uint64_t Dim, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, + const double breakdown, + double* __restrict Slater_inv, + double* __restrict determinant) { /* C := S^{-1} * U, dim x 2 B := 1 + V * C, 2 x 2 @@ -545,7 +550,7 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context, if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return qmckl_failwith(context, QMCKL_NULL_CONTEXT, - "qmckl_woodbury_2", + "qmckl_woodbury_2x2_hpc", NULL); } @@ -613,6 +618,179 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context, } #+end_src + + + + + #+NAME:woodbury_2x2_kernel_template + #+begin_src c +static inline qmckl_exit_code qmckl_woodbury_2x2_{Dim}( + const qmckl_context context, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, + const double breakdown, + double* __restrict Slater_inv, + double* __restrict determinant) { +/* + C := S^{-1} * U, dim x 2 + B := 1 + V * C, 2 x 2 + D := V * S^{-1}, 2 x dim +*/ + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return qmckl_failwith(context, + QMCKL_NULL_CONTEXT, + "qmckl_woodbury_2x2_{Dim}", + NULL); + } + + const uint64_t row1 = (Updates_index[0] - 1); + const uint64_t row2 = (Updates_index[1] - 1); + + // Compute C = (S^T)^{-1}U : {Dim} x 2 + double __attribute__((aligned(8))) C[2 * {Dim}]; + for (uint64_t i = 0; i < {Dim}; i++) { + C[i * 2] = 0; + C[i * 2 + 1] = 0; + IVDEP + ALIGNED + for (uint64_t k = 0; k < D{Dim}_P; k++) { + C[i * 2] += Slater_inv[i * D{Dim}_P + k] * Updates[k]; + C[i * 2 + 1] += Slater_inv[i * D{Dim}_P + k] * Updates[D{Dim}_P + k]; + } + } + + // Compute B = 1 + VC : 2 x 2 + const double B0 = C[row1 * 2] + 1; + const double B1 = C[row1 * 2 + 1]; + const double B2 = C[row2 * 2]; + const double B3 = C[row2 * 2 + 1] + 1; + + // Check if determinant of inverted matrix is not zero + double det = B0 * B3 - B1 * B2; + if (fabs(det) < breakdown) { + return QMCKL_FAILURE; + } + + // Update det(S) when passed + if (determinant) + *determinant *= det; + + // Compute B^{-1} with explicit formula for 2 x 2 inversion + double __attribute__((aligned(8))) Binv[4], idet = 1.0 / det; + Binv[0] = idet * B3; + Binv[1] = -1.0 * idet * B1; + Binv[2] = -1.0 * idet * B2; + Binv[3] = idet * B0; + + // tmp = B^{-1}D : 2 x D{Dim}_P + double __attribute__((aligned(8))) tmp[2 * D{Dim}_P]; + double* r1dim = &(Slater_inv[row1 * D{Dim}_P]); + double* r2dim = &(Slater_inv[row2 * D{Dim}_P]); + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j]; + tmp[D{Dim}_P + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j]; + } + + // Compute (S^T)^{-1} - C * tmp : {Dim} x D{Dim}_P + for (uint64_t i = 0; i < {Dim}; i++) { + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + Slater_inv[i * D{Dim}_P + j] -= C[i * 2] * tmp[j]; + Slater_inv[i * D{Dim}_P + j] -= C[i * 2 + 1] * tmp[D{Dim}_P + j]; + } + } + + return QMCKL_SUCCESS; +} + #+end_src + + + + #+NAME:woodbury_2x2_kernel_generator + #+begin_src python :noweb yes :exports none +text=""" +<> +""" +result = [] +for Dim in <>: + Dim=str(Dim) + result.append(text.replace("{Dim}",Dim) ) + +return '\n'.join(result) + #+end_src + + + + #+NAME:woodbury_2x2_switch-case_generator + #+begin_src python :noweb yes :exports none +text=""" +case {Dim}: + return qmckl_woodbury_2x2_{Dim}(context, + Updates, + Updates_index, + breakdown, + Slater_inv, + determinant); +""" +result = [] +for Dim in <>: + Dim=str(Dim) + result.append(text.replace("{Dim}",Dim) ) + +return '\n'.join(result) + #+end_src + + + + + #+begin_src c :tangle (eval c) :comments org :noweb yes +<> + +qmckl_exit_code qmckl_woodbury_2x2(const qmckl_context context, + const uint64_t LDS, + const uint64_t Dim, + const double* Updates, + const uint64_t* Updates_index, + const double breakdown, + double* Slater_inv, + double* determinant) { + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return qmckl_failwith(context, + QMCKL_NULL_CONTEXT, + "qmckl_woodbury_2x2", + NULL); + } + + if (LDS == (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) { // Most cases + switch (Dim) { + <> + } + } + else { // When SIMD_LENGTH > 1, called with LDS == Dim AND Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) + return qmckl_woodbury_2x2_hpc(context, + LDS, + Dim, + Updates, + Updates_index, + breakdown, + Slater_inv, + determinant); + + } + + return QMCKL_FAILURE; +} + #+end_src + + + + + *** Performance This function is most efficient when used in cases where there are only 2 rank-1 updates and @@ -620,17 +798,17 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context, ** Fortran interface :noexport: :PROPERTIES: - :Name: qmckl_woodbury_2 + :Name: qmckl_woodbury_2x2 :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: - #+CALL: generate_f_interface(table=qmckl_woodbury_2_args,rettyp=get_value("FRetType"),fname=get_value("Name")) + #+CALL: generate_f_interface(table=qmckl_woodbury_2x2_args,rettyp=get_value("FRetType"),fname=get_value("Name")) #+RESULTS: #+begin_src f90 :tangle (eval fh_func) :comments org :exports none interface - integer(c_int32_t) function qmckl_woodbury_2 & + integer(c_int32_t) function qmckl_woodbury_2x2 & (context, LDS, Dim, Updates, Updates_index, breakdown, Slater_inv, determinant) & bind(C) use, intrinsic :: iso_c_binding @@ -646,7 +824,7 @@ qmckl_exit_code qmckl_woodbury_2(const qmckl_context context, real (c_double ) , intent(inout) :: Slater_inv(LDS*Dim) real (c_double ) , intent(inout) :: determinant - end function qmckl_woodbury_2 + end function qmckl_woodbury_2x2 end interface #+end_src @@ -657,7 +835,7 @@ assert(Updates2 != NULL); assert(Updates_index2 != NULL); assert(Slater_inv2 != NULL); det = -1.4432116661319376e-11; -rc = qmckl_woodbury_2(context, LDS, Dim, Updates2, Updates_index2, breakdown, Slater_inv2, &det); +rc = qmckl_woodbury_2x2(context, LDS, Dim, Updates2, Updates_index2, breakdown, Slater_inv2, &det); assert(fabs(det-2.367058141251457e-10) < 1e-15); for (unsigned int i = 0; i < Dim; i++) { for (unsigned int j = 0; j < Dim; j++) { @@ -683,9 +861,9 @@ assert(rc == QMCKL_SUCCESS); * Woodbury 3x3 -** ~qmckl_woodbury_3~ +** ~qmckl_woodbury_3x3~ :PROPERTIES: - :Name: qmckl_woodbury_3 + :Name: qmckl_woodbury_3x3 :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: @@ -703,7 +881,7 @@ assert(rc == QMCKL_SUCCESS); #pragma ivdep #pragma vector aligned - #+NAME: qmckl_woodbury_3_args + #+NAME: qmckl_woodbury_3x3_args | qmckl_context | context | in | Global state | | uint64_t | LDS | in | Leading dimension of Slater_inv | | uint64_t | Dim | in | Dimension of Slater_inv | @@ -725,11 +903,11 @@ assert(rc == QMCKL_SUCCESS); *** C header - #+CALL: generate_c_header(table=qmckl_woodbury_3_args,rettyp=get_value("CRetType"),fname=get_value("Name")) + #+CALL: generate_c_header(table=qmckl_woodbury_3x3_args,rettyp=get_value("CRetType"),fname=get_value("Name")) #+RESULTS: #+begin_src c :tangle (eval h_func) :comments org - qmckl_exit_code qmckl_woodbury_3( + qmckl_exit_code qmckl_woodbury_3x3( const qmckl_context context, const uint64_t LDS, const uint64_t Dim, @@ -743,14 +921,14 @@ assert(rc == QMCKL_SUCCESS); *** C source #+begin_src c :tangle (eval c) :comments org -qmckl_exit_code qmckl_woodbury_3(const qmckl_context context, +qmckl_exit_code qmckl_woodbury_3x3_hpc(const qmckl_context context, const uint64_t LDS, const uint64_t Dim, - const double* Updates, - const uint64_t* Updates_index, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, const double breakdown, - double* Slater_inv, - double* determinant) { + double* __restrict Slater_inv, + double* __restrict determinant) { /* C := S^{-1} * U, dim x 3 B := 1 + V * C, 3 x 3 @@ -760,7 +938,7 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context, if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return qmckl_failwith(context, QMCKL_NULL_CONTEXT, - "qmckl_woodbury_3", + "qmckl_woodbury_3x3_hpc", NULL); } @@ -848,6 +1026,190 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context, } #+end_src + + #+NAME:woodbury_3x3_kernel_template + #+begin_src c +qmckl_exit_code qmckl_woodbury_3x3_{Dim}( + const qmckl_context context, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, + const double breakdown, + double* __restrict Slater_inv, + double* __restrict determinant) { +/* + C := S^{-1} * U, dim x 3 + B := 1 + V * C, 3 x 3 + D := V * S^{-1}, 3 x dim +,*/ + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return qmckl_failwith(context, + QMCKL_NULL_CONTEXT, + "qmckl_woodbury_3x3_{Dim}", + NULL); + } + + const uint64_t row1 = (Updates_index[0] - 1); + const uint64_t row2 = (Updates_index[1] - 1); + const uint64_t row3 = (Updates_index[2] - 1); + + // Compute C = (S^T)^{-1}U : {Dim} x 3 + double __attribute__((aligned(8))) C[3 * {Dim}]; + for (uint64_t i = 0; i < {Dim}; i++) { + C[i * 3] = 0; + C[i * 3 + 1] = 0; + C[i * 3 + 2] = 0; + IVDEP + ALIGNED + for (uint64_t k = 0; k < D{Dim}_P; k++) { + C[i * 3] += Slater_inv[i * D{Dim}_P + k] * Updates[k]; + C[i * 3 + 1] += Slater_inv[i * D{Dim}_P + k] * Updates[D{Dim}_P + k]; + C[i * 3 + 2] += Slater_inv[i * D{Dim}_P + k] * Updates[2 * D{Dim}_P + k]; + } + } + + // Compute B = 1 + VC : 3 x 3 + const double B0 = C[row1 * 3] + 1; + const double B1 = C[row1 * 3 + 1]; + const double B2 = C[row1 * 3 + 2]; + const double B3 = C[row2 * 3]; + const double B4 = C[row2 * 3 + 1] + 1; + const double B5 = C[row2 * 3 + 2]; + const double B6 = C[row3 * 3]; + const double B7 = C[row3 * 3 + 1]; + const double B8 = C[row3 * 3 + 2] + 1; + + // Check if determinant of B is not too close to zero + double det; + det = B0 * (B4 * B8 - B5 * B7) - B1 * (B3 * B8 - B5 * B6) + + B2 * (B3 * B7 - B4 * B6); + if (fabs(det) < breakdown) { + return QMCKL_FAILURE; + } + + // Update det(Slater) if passed + if (determinant) + *determinant *= det; + + // Compute B^{-1} with explicit formula for 3 x 3 inversion + double __attribute__((aligned(8))) Binv[9], idet = 1.0 / det; + Binv[0] = (B4 * B8 - B7 * B5) * idet; + Binv[1] = -(B1 * B8 - B7 * B2) * idet; + Binv[2] = (B1 * B5 - B4 * B2) * idet; + Binv[3] = -(B3 * B8 - B6 * B5) * idet; + Binv[4] = (B0 * B8 - B6 * B2) * idet; + Binv[5] = -(B0 * B5 - B3 * B2) * idet; + Binv[6] = (B3 * B7 - B6 * B4) * idet; + Binv[7] = -(B0 * B7 - B6 * B1) * idet; + Binv[8] = (B0 * B4 - B3 * B1) * idet; + + // tmp = B^{-1}D : 3 x D{Dim}_P + double __attribute__((aligned(8))) tmp[3 * D{Dim}_P]; + double* r1dim = &(Slater_inv[row1 * D{Dim}_P]); + double* r2dim = &(Slater_inv[row2 * D{Dim}_P]); + double* r3dim = &(Slater_inv[row3 * D{Dim}_P]); + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j]; + tmp[D{Dim}_P + j] = + Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j]; + tmp[2 * D{Dim}_P + j] = + Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j]; + } + + // Compute (S^T)^{-1} - C * tmp : {Dim} x D{Dim}_P + for (uint64_t i = 0; i < {Dim}; i++) { + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + Slater_inv[i * D{Dim}_P + j] -= C[i * 3] * tmp[j]; + Slater_inv[i * D{Dim}_P + j] -= C[i * 3 + 1] * tmp[D{Dim}_P + j]; + Slater_inv[i * D{Dim}_P + j] -= C[i * 3 + 2] * tmp[2 * D{Dim}_P + j]; + } + } + + return QMCKL_SUCCESS; +} + #+end_src + + + #+NAME:woodbury_3x3_kernel_generator + #+begin_src python :noweb yes :exports none +text=""" +<> +""" +result = [] +for Dim in <>: + Dim=str(Dim) + result.append(text.replace("{Dim}",Dim) ) + +return '\n'.join(result) + #+end_src + + + #+NAME:woodbury_3x3_switch-case_generator + #+begin_src python :noweb yes :exports none +text=""" +case {Dim}: + return qmckl_woodbury_3x3_{Dim}(context, + Updates, + Updates_index, + breakdown, + Slater_inv, + determinant); +""" +result = [] +for Dim in <>: + Dim=str(Dim) + result.append(text.replace("{Dim}",Dim) ) + +return '\n'.join(result) + #+end_src + + + #+begin_src c :tangle (eval c) :comments org :noweb yes +<> + +qmckl_exit_code qmckl_woodbury_3x3(const qmckl_context context, + const uint64_t LDS, + const uint64_t Dim, + const double* Updates, + const uint64_t* Updates_index, + const double breakdown, + double* Slater_inv, + double* determinant) { + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return qmckl_failwith(context, + QMCKL_NULL_CONTEXT, + "qmckl_woodbury_3x3", + NULL); + } + + if (LDS == (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) { // Most cases + switch (Dim) { + <> + } + } + else { // When SIMD_LENGTH > 1, called with LDS == Dim AND Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) + return qmckl_woodbury_3x3_hpc(context, + LDS, + Dim, + Updates, + Updates_index, + breakdown, + Slater_inv, + determinant); + + } + + return QMCKL_FAILURE; +} + #+end_src + + + *** Performance... This function is most efficient when used in cases where there are only 3 rank-1 updates and @@ -855,17 +1217,17 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context, ** Fortran interface :noexport: :PROPERTIES: - :Name: qmckl_woodbury_3 + :Name: qmckl_woodbury_3x3 :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: - #+CALL: generate_f_interface(table=qmckl_woodbury_3_args,rettyp=get_value("FRetType"),fname=get_value("Name")) + #+CALL: generate_f_interface(table=qmckl_woodbury_3x3_args,rettyp=get_value("FRetType"),fname=get_value("Name")) #+RESULTS: #+begin_src f90 :tangle (eval fh_func) :comments org :exports none interface - integer(c_int32_t) function qmckl_woodbury_3 & + integer(c_int32_t) function qmckl_woodbury_3x3 & (context, LDS, Dim, Updates, Updates_index, breakdown, Slater_inv, determinant) & bind(C) use, intrinsic :: iso_c_binding @@ -881,7 +1243,7 @@ qmckl_exit_code qmckl_woodbury_3(const qmckl_context context, real (c_double ) , intent(inout) :: Slater_inv(LDS*Dim) real (c_double ) , intent(inout) :: determinant - end function qmckl_woodbury_3 + end function qmckl_woodbury_3x3 end interface #+end_src @@ -892,7 +1254,7 @@ assert(Updates3 != NULL); assert(Updates_index3 != NULL); assert(Slater_inv3_1 != NULL); det = -1.23743195512859e-09; -rc = qmckl_woodbury_3(context, LDS, Dim, Updates3, Updates_index3, breakdown, Slater_inv3_1, &det); +rc = qmckl_woodbury_3x3(context, LDS, Dim, Updates3, Updates_index3, breakdown, Slater_inv3_1, &det); assert(fabs(det - 1.602708950725074e-10) < 1e-15); for (unsigned int i = 0; i < Dim; i++) { for (unsigned int j = 0; j < Dim; j++) { @@ -1177,7 +1539,7 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context, // Special case for 4 rank-1 updates: 2+2 if (N_updates == 4) { qmckl_exit_code rc = - qmckl_woodbury_2(context, LDS, Dim, Updates, Updates_index, + qmckl_woodbury_2x2(context, LDS, Dim, Updates, Updates_index, breakdown, Slater_inv, determinant); if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting uint64_t l = 0; @@ -1187,7 +1549,7 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context, later_index + later, &l, determinant); later += l; } - rc = qmckl_woodbury_2(context, LDS, Dim, &Updates[2 * LDS], + rc = qmckl_woodbury_2x2(context, LDS, Dim, &Updates[2 * LDS], &Updates_index[2], breakdown, Slater_inv, determinant); if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting @@ -1217,7 +1579,7 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context, for (uint64_t i = 0; i < n_of_3blocks; i++) { const double* Updates_3block = &Updates[i * length_3block]; const uint64_t* Updates_index_3block = &Updates_index[i * 3]; - qmckl_exit_code rc = qmckl_woodbury_3( + qmckl_exit_code rc = qmckl_woodbury_3x3( context, LDS, Dim, Updates_3block, Updates_index_3block, breakdown, Slater_inv, determinant); if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting @@ -1235,7 +1597,7 @@ qmckl_exit_code qmckl_sherman_morrison_smw32s(const qmckl_context context, if (remainder == 2) { const double* Updates_2block = &Updates[n_of_3blocks * length_3block]; const uint64_t* Updates_index_2block = &Updates_index[3 * n_of_3blocks]; - qmckl_exit_code rc = qmckl_woodbury_2( + qmckl_exit_code rc = qmckl_woodbury_2x2( context, LDS, Dim, Updates_2block, Updates_index_2block, breakdown, Slater_inv, determinant); if (rc != QMCKL_SUCCESS) { // Send the entire block to slagel_splitting @@ -1400,33 +1762,34 @@ These functions can only be used internally by the kernels in this module. #+RESULTS: #+begin_src c :tangle (eval h_func) :comments org qmckl_exit_code qmckl_slagel_splitting ( - const uint64_t LDS, - const uint64_t Dim, - const uint64_t N_updates, - const double* Updates, - const uint64_t* Updates_index, - const double breakdown, - double* Slater_inv, - double* later_updates, - uint64_t* later_index, - uint64_t* later, - double* determinant); + const uint64_t LDS, + const uint64_t Dim, + const uint64_t N_updates, + const double* Updates, + const uint64_t* Updates_index, + const double breakdown, + double* Slater_inv, + double* later_updates, + uint64_t* later_index, + uint64_t* later, + double* determinant); #+end_src *** C source #+begin_src c :tangle (eval c) :comments org -qmckl_exit_code qmckl_slagel_splitting(uint64_t LDS, - uint64_t Dim, - uint64_t N_updates, - const double* Updates, - const uint64_t* Updates_index, - const double breakdown, - double* Slater_inv, - double* later_updates, - uint64_t* later_index, - uint64_t* later, - double* determinant) { +qmckl_exit_code qmckl_slagel_splitting_hpc( + uint64_t LDS, + uint64_t Dim, + uint64_t N_updates, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, + const double breakdown, + double* __restrict Slater_inv, + double* __restrict later_updates, + uint64_t* __restrict later_index, + uint64_t* __restrict later, + double* __restrict determinant) { double __attribute__((aligned(8))) C[LDS]; double __attribute__((aligned(8))) D[LDS]; @@ -1488,9 +1851,164 @@ qmckl_exit_code qmckl_slagel_splitting(uint64_t LDS, return QMCKL_SUCCESS; } - #+end_src + + #+NAME:slagel_splitting_template_code + #+begin_src c +static inline qmckl_exit_code qmckl_slagel_splitting_{Dim}( + uint64_t N_updates, + const double* __restrict Updates, + const uint64_t* __restrict Updates_index, + const double breakdown, + double* __restrict Slater_inv, + double* __restrict later_updates, + uint64_t* __restrict later_index, + uint64_t* __restrict later, + double* __restrict determinant) { + + double __attribute__((aligned(8))) C[D{Dim}_P]; + double __attribute__((aligned(8))) D[D{Dim}_P]; + + uint64_t l = 0; + // For each update + while (l < N_updates) { + // C = S^{-1} x U_l + for (uint64_t i = 0; i < {Dim}; i++) { + C[i] = 0.0f; + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + C[i] += Slater_inv[i * D{Dim}_P + j] * Updates[l * D{Dim}_P + j]; + } + } + + // Denominator + const int cui = Updates_index[l] - 1; + double den = 1.0f + C[cui]; + if (fabs(den) < breakdown) { + // U_l = U_l / 2: split the update in 2 equal halves and save the + // second halve in later_updates + IVDEP + ALIGNED + for (uint64_t i = 0; i < D{Dim}_P; i++) { + later_updates[*later * D{Dim}_P + i] = Updates[l * D{Dim}_P + i] * 0.5f; + C[i] *= 0.5f; + } + later_index[*later] = Updates_index[l]; + (*later)++; + + den = 1.0f + C[cui]; + } // From here onwards we continue with applying the first halve of the + // update to Slater_inv + double iden = 1.0f / den; + + if (determinant) + *determinant *= den; + + // D = v^T x S^{-1} : 1 x D{Dim}_P + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + D[j] = Slater_inv[cui * D{Dim}_P + j]; + } + + // S^{-1} = S^{-1} - C x D / den + for (uint64_t i = 0; i < {Dim}; i++) { + IVDEP + ALIGNED + for (uint64_t j = 0; j < D{Dim}_P; j++) { + const double update = C[i] * D[j] * iden; + Slater_inv[i * D{Dim}_P + j] -= update; + } + } + l += 1; + } + + return QMCKL_SUCCESS; +} + #+end_src + + + #+NAME:slagel_splitting_kernel_generator + #+begin_src python :noweb yes :exports none +text=""" +<> +""" +result = [] +for Dim in <>: + Dim=str(Dim) + result.append(text.replace("{Dim}",Dim) ) + +return '\n'.join(result) + #+end_src + + + #+NAME:slagel_splitting_switch-case_generator + #+begin_src python :noweb yes :exports none +text=""" +case {Dim}: + return qmckl_slagel_splitting_{Dim}( + N_updates, + Updates, + Updates_index, + breakdown, + Slater_inv, + later_updates, + later_index, + later, + determinant); +""" +result = [] +for Dim in <>: + Dim=str(Dim) + result.append(text.replace("{Dim}",Dim) ) + +return '\n'.join(result) + #+end_src + + + #+begin_src c :tangle (eval c) :comments org :noweb yes +<> + +qmckl_exit_code qmckl_slagel_splitting( + const uint64_t LDS, + const uint64_t Dim, + const uint64_t N_updates, + const double* Updates, + const uint64_t* Updates_index, + const double breakdown, + double* Slater_inv, + double* later_updates, + uint64_t* later_index, + uint64_t* later, + double* determinant) { + + if (LDS == (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) { // Most cases + switch (Dim) { + <> + } + } + else { // When SIMD_LENGTH > 1, called with LDS == Dim AND Dim != (1+(Dim-1)/SIMD_LENGTH)*SIMD_LENGTH) + return qmckl_slagel_splitting_hpc( + LDS, + Dim, + N_updates, + Updates, + Updates_index, + breakdown, + Slater_inv, + later_updates, + later_index, + later, + determinant); + } + + return QMCKL_FAILURE; +} + #+end_src + + *** Performance This function cannot be used by itself and is used in Sherman-Morrison with update splitting and Woodbury 3x3 and 2x2