2022-07-11 14:48:59 +02:00
# include <math.h>
# include <stdint.h>
# include "kernels.h"
2022-09-09 17:15:12 +02:00
# include "debug.h"
2022-07-11 14:48:59 +02:00
extern uint64_t n_splits ;
extern uint64_t block_fail ;
extern uint64_t recursive_calls ;
2022-07-20 19:09:55 +02:00
// Return the smaller of two integers.
int min(int a, int b) {
  if (a < b)
    return a;
  return b;
}
2022-07-11 14:48:59 +02:00
uint32_t qmckl_sherman_morrison (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
double __attribute__ ( ( aligned ( 8 ) ) ) C [ DIM ] ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) D [ LDS ] ;
uint32_t l = 0 ;
// For each update
while ( l < N_updates ) {
// C = S^{-1} x u_l
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i ] = 0.0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
C [ i ] + = Slater_inv [ i * Lds + j ] * Updates [ l * Lds + j ] ; // regular mat-vec product, but actually working on S_inv^T * U_l.
2022-07-11 14:48:59 +02:00
}
}
// Denominator: v_l^T * C
const int cui = Updates_index [ l ] - 1 ;
double den = 1.0 + C [ cui ] ;
if ( fabs ( den ) < breakdown ) {
return 1 ;
}
double iden = 1.0 / den ;
// Update det(A)
if ( ! determinant )
* determinant * = den ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
D [ j ] = Slater_inv [ cui * Lds + j ] ; // selecting proper column of v_l^T * S_inv
2022-07-11 14:48:59 +02:00
}
// A^{-1} = A^{-1} - C x D / den
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
const double update = C [ i ] * D [ j ] * iden ;
2022-07-22 11:34:29 +02:00
Slater_inv [ i * Lds + j ] - = update ;
2022-07-11 14:48:59 +02:00
}
}
l + = 1 ;
}
return 0 ;
}
2022-07-22 11:34:29 +02:00
/*
  COMPUTE S^{-1}P - C B^{-1} D : Dim x LDS,
  where S^{-1}P           : Dim x LDS,
        C := S^{-1}P P^T U : Dim x 2,
        B := 1 + V C       : 2 x 2,
        D := V S^{-1}P     : 2 x LDS,
        P^T U              : LDS x 2,
        V                  : 2 x Dim
*/
2022-07-11 14:48:59 +02:00
// Apply two rank-1 updates at once with the Woodbury 2x2 formula.
// Updates       : 2 rows of length LDS (update vectors)
// Updates_index : 1-based indices of the two updated rows/columns
// Slater_inv    : DIM x LDS row-major matrix, updated in place
// determinant   : if non-NULL, multiplied by det(B)
// Returns 0 on success, 1 if det(B) is below 'breakdown'.
uint32_t qmckl_woodbury_2(const uint64_t vLDS, const uint64_t vDim,
                          const double *__restrict __attribute__((aligned(8)))
                              Updates,
                          const uint64_t *__restrict Updates_index,
                          const double breakdown,
                          double *__restrict __attribute__((aligned(8)))
                              Slater_inv,
                          double *__restrict determinant) {

  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;

  // 1-based indices from the caller -> 0-based rows
  const uint32_t row1 = (Updates_index[0] - 1);
  const uint32_t row2 = (Updates_index[1] - 1);

  // Compute C = (S^T)^{-1}U : Dim x 2 (stored row-major, 2 per row)
  double __attribute__((aligned(8))) C[2 * DIM];
  for (uint32_t i = 0; i < Dim; i++) {
    C[i * 2] = 0;
    C[i * 2 + 1] = 0;
#pragma ivdep
#pragma vector aligned
    for (uint32_t k = 0; k < Lds; k++) {
      C[i * 2] += Slater_inv[i * Lds + k] * Updates[k];
      C[i * 2 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
    }
  }

  // Compute B = 1 + VC : 2 x 2 (V selects rows row1 and row2)
  const double B0 = C[row1 * 2] + 1;
  const double B1 = C[row1 * 2 + 1];
  const double B2 = C[row2 * 2];
  const double B3 = C[row2 * 2 + 1] + 1;

  // Check if determinant of inverted matrix is not zero
  double det = B0 * B3 - B1 * B2;
  if (fabs(det) < breakdown) {
    return 1;
  }

  // Update det(S) when passed
  if (determinant != NULL)
    *determinant *= det;

  // Compute B^{-1} with explicit formula for 2 x 2 inversion
  double __attribute__((aligned(8))) Binv[4], idet = 1.0 / det;
  Binv[0] = idet * B3;
  Binv[1] = -1.0 * idet * B1;
  Binv[2] = -1.0 * idet * B2;
  Binv[3] = idet * B0;

  // tmp = B^{-1}D : 2 x LDS, where D is rows row1/row2 of S^{-1}
  double __attribute__((aligned(8))) tmp[2 * LDS];
  double *__restrict r1dim = &(Slater_inv[row1 * Lds]);
  double *__restrict r2dim = &(Slater_inv[row2 * Lds]);
#pragma ivdep
#pragma vector aligned
  for (uint32_t j = 0; j < Lds; j++) {
    tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j];
    tmp[Lds + j] = Binv[2] * r1dim[j] + Binv[3] * r2dim[j];
  }

  // Compute (S^T)^{-1} - C * tmp : Dim x Lds, in place
  for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
    for (uint32_t j = 0; j < Lds; j++) {
      Slater_inv[i * Lds + j] -= C[i * 2] * tmp[j];
      Slater_inv[i * Lds + j] -= C[i * 2 + 1] * tmp[Lds + j];
    }
  }
  return 0;
}
2022-07-22 11:34:29 +02:00
/*
  COMPUTE (S^T)^{-1} - C B^{-1} D : Dim x LDS,
  where S^T            : Dim x LDS,
        C := (S^T)^{-1} U : Dim x 3,
        B := 1 + V C      : 3 x 3,
        D := V (S^T)^{-1} : 3 x LDS,
        U                 : LDS x 3,
        V                 : 3 x Dim
*/
2022-07-11 14:48:59 +02:00
// Apply three rank-1 updates at once with the Woodbury 3x3 formula.
// Updates       : 3 rows of length LDS (update vectors)
// Updates_index : 1-based indices of the three updated rows/columns
// Slater_inv    : DIM x LDS row-major matrix, updated in place
// determinant   : if non-NULL, multiplied by det(B)
// Returns 0 on success, 1 if det(B) is below 'breakdown'.
uint32_t qmckl_woodbury_3(const uint64_t vLDS, const uint64_t vDim,
                          const double *__restrict __attribute__((aligned(8)))
                              Updates,
                          const uint64_t *__restrict Updates_index,
                          const double breakdown,
                          double *__restrict __attribute__((aligned(8)))
                              Slater_inv,
                          double *__restrict determinant) {

  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;

  // 1-based indices from the caller -> 0-based rows
  const uint32_t row1 = (Updates_index[0] - 1);
  const uint32_t row2 = (Updates_index[1] - 1);
  const uint32_t row3 = (Updates_index[2] - 1);

  // Compute C = (S^T)^{-1}U : Dim x 3 (stored row-major, 3 per row)
  double __attribute__((aligned(8))) C[3 * DIM];
  for (uint32_t i = 0; i < Dim; i++) {
    C[i * 3] = 0;
    C[i * 3 + 1] = 0;
    C[i * 3 + 2] = 0;
#pragma ivdep
#pragma vector aligned
    for (uint32_t k = 0; k < Lds; k++) {
      C[i * 3] += Slater_inv[i * Lds + k] * Updates[k];
      C[i * 3 + 1] += Slater_inv[i * Lds + k] * Updates[Lds + k];
      C[i * 3 + 2] += Slater_inv[i * Lds + k] * Updates[2 * Lds + k];
    }
  }

  // Compute B = 1 + VC : 3 x 3 (V selects rows row1/row2/row3)
  const double B0 = C[row1 * 3] + 1;
  const double B1 = C[row1 * 3 + 1];
  const double B2 = C[row1 * 3 + 2];
  const double B3 = C[row2 * 3];
  const double B4 = C[row2 * 3 + 1] + 1;
  const double B5 = C[row2 * 3 + 2];
  const double B6 = C[row3 * 3];
  const double B7 = C[row3 * 3 + 1];
  const double B8 = C[row3 * 3 + 2] + 1;

  // Check if determinant of B is not too close to zero
  double det;
  det = B0 * (B4 * B8 - B5 * B7) - B1 * (B3 * B8 - B5 * B6) +
        B2 * (B3 * B7 - B4 * B6);
  if (fabs(det) < breakdown) {
    return 1;
  }

  // Update det(Slater) if passed
  if (determinant != NULL)
    *determinant *= det;

  // Compute B^{-1} with explicit formula for 3 x 3 inversion (adjugate / det)
  double __attribute__((aligned(8))) Binv[9], idet = 1.0 / det;
  Binv[0] = (B4 * B8 - B7 * B5) * idet;
  Binv[1] = -(B1 * B8 - B7 * B2) * idet;
  Binv[2] = (B1 * B5 - B4 * B2) * idet;
  Binv[3] = -(B3 * B8 - B6 * B5) * idet;
  Binv[4] = (B0 * B8 - B6 * B2) * idet;
  Binv[5] = -(B0 * B5 - B3 * B2) * idet;
  Binv[6] = (B3 * B7 - B6 * B4) * idet;
  Binv[7] = -(B0 * B7 - B6 * B1) * idet;
  Binv[8] = (B0 * B4 - B3 * B1) * idet;

  // tmp = B^{-1}D : 3 x LDS, where D is rows row1/row2/row3 of S^{-1}
  // NOTE(review): these offsets use the LDS macro directly; LDS == Lds here.
  double __attribute__((aligned(8))) tmp[3 * LDS];
  double *__restrict r1dim = &(Slater_inv[row1 * LDS]);
  double *__restrict r2dim = &(Slater_inv[row2 * LDS]);
  double *__restrict r3dim = &(Slater_inv[row3 * LDS]);
#pragma ivdep
#pragma vector aligned
  for (uint32_t j = 0; j < Lds; j++) {
    tmp[j] = Binv[0] * r1dim[j] + Binv[1] * r2dim[j] + Binv[2] * r3dim[j];
    tmp[Lds + j] = Binv[3] * r1dim[j] + Binv[4] * r2dim[j] + Binv[5] * r3dim[j];
    tmp[2 * Lds + j] = Binv[6] * r1dim[j] + Binv[7] * r2dim[j] + Binv[8] * r3dim[j];
  }

  // Compute (S^T)^{-1} - C * tmp : Dim x Lds, in place
  for (uint32_t i = 0; i < Dim; i++) {
#pragma ivdep
#pragma vector aligned
    for (uint32_t j = 0; j < Lds; j++) {
      Slater_inv[i * Lds + j] -= C[i * 3] * tmp[j];
      Slater_inv[i * Lds + j] -= C[i * 3 + 1] * tmp[Lds + j];
      Slater_inv[i * Lds + j] -= C[i * 3 + 2] * tmp[2 * Lds + j];
    }
  }
  return 0;
}
2022-07-20 19:09:55 +02:00
/*
  COMPUTE S^{-1} - C B^{-1} D : Dim x LDS,
  where S^{-1}          : Dim x LDS,
        C := S^{-1} U   : Dim x K,   dgemm
        B := 1 + V C    : K x K,     copy
        D := V S^{-1}   : K x LDS,   copy
        U               : LDS x K,
        V               : K x Dim
        tmp := B^{-1} D : K x LDS,   dgemm
        S = S - C tmp   : Dim x LDS, dgemm
*/
uint32_t qmckl_woodbury_k ( const uint64_t vLDS ,
const uint64_t vDim ,
const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index ,
const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-20 19:09:55 +02:00
// Compute C = S^{-1} U : Dim x K : standard dgemm
2022-09-09 17:15:12 +02:00
double * C = calloc ( 1 , DIM * N_updates * sizeof ( double ) ) ;
2022-07-20 19:09:55 +02:00
double alpha = 1.0 , beta = 0.0 ;
2022-09-09 17:15:12 +02:00
2022-07-20 19:09:55 +02:00
cblas_dgemm ( CblasRowMajor , CblasNoTrans , CblasTrans ,
2022-07-22 11:34:29 +02:00
Dim , N_updates , Lds ,
alpha , Slater_inv , Lds , Updates , Lds ,
2022-07-20 19:09:55 +02:00
beta , C , N_updates ) ;
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
// Construct D = V S^{-1} : K x LDS
double B [ N_updates * N_updates ] , D [ N_updates * LDS ] ;
for ( uint32_t i = 0 ; i < N_updates ; i + + ) {
const uint32_t row = Updates_index [ i ] - 1 ;
for ( uint32_t j = 0 ; j < N_updates ; j + + ) B [ i * N_updates + j ] = C [ row * N_updates + j ] + ( i = = j ) ;
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) D [ i * Lds + j ] = Slater_inv [ row * Lds + j ] ;
2022-07-20 19:09:55 +02:00
}
// Compute determinant by LU decomposition
int ipiv [ N_updates ] ;
lapack_int ret ;
ret = LAPACKE_dgetrf ( LAPACK_ROW_MAJOR , N_updates , N_updates , B , N_updates , ipiv ) ;
if ( ret ! = 0 ) return ret ;
double det = 1.0 ;
int j = 0 ;
for ( uint32_t i = 0 ; i < N_updates ; i + + ) {
2022-07-21 13:57:28 +02:00
j + = min ( ipiv [ i ] - i , 1 ) ;
2022-07-20 19:09:55 +02:00
det * = B [ ( N_updates + 1 ) * i ] ;
}
2022-07-21 12:21:51 +02:00
if ( ( j & 1 ) = = 0 ) det = - det ; // multiply det with -1 if j is even
2022-07-20 19:09:55 +02:00
// Check if determinant of B is not too close to zero
if ( fabs ( det ) < breakdown ) {
return 1 ;
}
// Update det(Slater) if passed
if ( determinant ) * determinant * = det ;
// Compute B^{-1} with explicit formula for K x K inversion
ret = LAPACKE_dgetri ( LAPACK_ROW_MAJOR , N_updates , B , N_updates , ipiv ) ;
if ( ret ! = 0 ) return ret ;
2022-09-09 17:15:12 +02:00
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
double tmp1 [ N_updates * LDS ] ;
2022-07-20 19:09:55 +02:00
cblas_dgemm ( CblasRowMajor , CblasNoTrans , CblasNoTrans ,
N_updates , LDS , N_updates ,
alpha , B , N_updates , D , LDS ,
2022-09-09 17:15:12 +02:00
beta , tmp1 , LDS ) ;
print_m ( tmp1 , N_updates , LDS , LDS , " tmp1_cblas " ) ;
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
// alpha = -1.0, beta = 1.0;
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
// Dim, LDS, N_updates,
// alpha, C, N_updates, tmp1, LDS,
// beta, Slater_inv, LDS);
double * tmp2 = calloc ( 1 , DIM * LDS * sizeof ( double ) ) ;
cblas_dgemm ( CblasRowMajor ,
CblasNoTrans , CblasNoTrans ,
2022-07-20 19:09:55 +02:00
Dim , LDS , N_updates ,
2022-09-09 17:15:12 +02:00
alpha , C , N_updates , tmp1 , LDS ,
beta , tmp2 , LDS ) ;
print_m ( tmp2 , DIM , LDS , LDS , " tmp2_cblas " ) ;
double * tmp3 = calloc ( 1 , DIM * LDS * sizeof ( double ) ) ;
for ( int i = 0 ; i < DIM * LDS ; + + i )
{
tmp3 [ i ] = Slater_inv [ i ] - tmp2 [ i ] ;
}
print_m ( tmp3 , DIM , LDS , LDS , " tmp3_cblas " ) ;
for ( int i = 0 ; i < DIM * LDS ; + + i )
{
Slater_inv [ i ] = tmp3 [ i ] ;
}
free ( tmp2 ) ;
free ( tmp3 ) ;
2022-07-20 19:09:55 +02:00
return 0 ;
}
2022-07-21 12:21:51 +02:00
#ifdef HAVE_CUBLAS_OFFLOAD
// Woodbury K-update kernel with the three dense products off-loaded to
// cuBLAS through OpenMP target data regions. cuBLAS is column-major, so
// every call computes the transpose of the row-major result (dimensions
// and operand order are swapped accordingly).
// Returns 0 on success, 1 on breakdown/allocation failure, or the
// LAPACKE return code if the LU factorization/inversion fails.
// BUGFIXES vs. previous revision:
//  - the first cublasDgemm had its dimensions hard-coded to 15, 21, 24
//    (one benchmark size); it now uses N_updates, Dim, Lds;
//  - cublasDgeam used (DIM, LDS) as column-major dims, which over-runs
//    the DIM*LDS buffers whenever LDS > DIM; it now uses (LDS, DIM),
//    consistent with the transposed convention of the dgemm calls;
//  - tmp3 and the early-return paths no longer leak heap buffers.
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t handle,
    const uint64_t vLDS,
    const uint64_t vDim,
    const uint64_t N_updates,
    const double *__restrict __attribute__((aligned(8))) Updates,
    const uint64_t *__restrict Updates_index,
    const double breakdown,
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {

  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;

  // Compute C = S^{-1} U : Dim x K, on the device
  double *C = calloc(1, DIM * N_updates * sizeof(double));
  if (C == NULL) return 1;
  double alpha = 1.0, beta = 0.0;
#pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], Updates[0:LDS*N_updates], C[0:DIM*N_updates])
#pragma omp target data use_device_ptr(Slater_inv, Updates, C)
  {
    // column-major: C^T (K x Dim) = Updates (as-stored) * Slater_inv^T
    (void)cublasDgemm(handle,
                      CUBLAS_OP_T, CUBLAS_OP_N,
                      N_updates, Dim, Lds,
                      &alpha, Updates, Lds, Slater_inv, Lds,
                      &beta, C, N_updates);
  }
#pragma omp target exit data map(from:C[0:DIM*N_updates])

  // Construct B = 1 + V C : K x K (select rows of C, add identity)
  // Construct D = V S^{-1} : K x LDS (select rows of S^{-1})
  double *B = calloc(1, N_updates * N_updates * sizeof(double));
  double *D = calloc(1, N_updates * LDS * sizeof(double));
  if (B == NULL || D == NULL) {
    free(B);
    free(C);
    free(D);
    return 1;
  }
  for (uint32_t i = 0; i < N_updates; i++) {
    const uint32_t row = Updates_index[i] - 1;
    for (uint32_t j = 0; j < N_updates; j++)
      B[i * N_updates + j] = C[row * N_updates + j] + (i == j);
    for (uint32_t j = 0; j < Lds; j++)
      D[i * Lds + j] = Slater_inv[row * Lds + j];
  }

  // Compute determinant of B by LU decomposition
  lapack_int ipiv[N_updates];
  lapack_int ret;
  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N_updates, N_updates, B, N_updates, ipiv);
  if (ret != 0) {
    free(B);
    free(C);
    free(D);
    return ret;
  }
  double det = 1.0;
  int j = 0;
  for (uint32_t i = 0; i < N_updates; i++) {
    j += min(ipiv[i] - i, 1);
    det *= B[(N_updates + 1) * i]; // update determinant
  }
  if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even

  // Check if determinant of B is not too close to zero
  if (fabs(det) < breakdown) {
    free(B);
    free(C);
    free(D);
    return 1;
  }
  // Update det(Slater) if passed
  if (determinant) *determinant *= det;

  // Compute B^{-1} from the LU factors
  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N_updates, B, N_updates, ipiv);
  if (ret != 0) {
    free(B);
    free(C);
    free(D);
    return ret;
  }

  // tmp1 = B^{-1} D : KxLDS = KxK X KxLDS, on the device
  double *tmp1 = calloc(1, N_updates * LDS * sizeof(double));
  double *tmp2 = calloc(1, DIM * LDS * sizeof(double));
  double *tmp3 = calloc(1, DIM * LDS * sizeof(double));
  if (tmp1 == NULL || tmp2 == NULL || tmp3 == NULL) {
    free(B);
    free(C);
    free(D);
    free(tmp1);
    free(tmp2);
    free(tmp3);
    return 1;
  }
#pragma omp target enter data map(to:D[0:N_updates*LDS], B[0:N_updates*N_updates], tmp1[0:N_updates*LDS])
#pragma omp target data use_device_ptr(D, B, tmp1)
  {
    // column-major: tmp1^T (LDS x K) = D^T * B^T
    (void)cublasDgemm(handle,
                      CUBLAS_OP_N, CUBLAS_OP_N,
                      LDS, N_updates, N_updates,
                      &alpha, D, LDS, B, N_updates,
                      &beta, tmp1, LDS);
  }
#pragma omp target exit data map(from:tmp1[0:N_updates*LDS])
  print_m(tmp1, N_updates, LDS, LDS, "tmp1_cublas");

  // Compute tmp2 = C * tmp1 : Dim x LDS, on the device
#pragma omp target enter data map(to:tmp1[0:N_updates*LDS], C[0:DIM*N_updates], tmp2[0:DIM*LDS])
#pragma omp target data use_device_ptr(tmp1, C, tmp2)
  {
    // column-major: tmp2^T (LDS x Dim) = tmp1^T * C^T
    (void)cublasDgemm(handle,
                      CUBLAS_OP_N, CUBLAS_OP_N,
                      LDS, Dim, N_updates,
                      &alpha, tmp1, LDS, C, N_updates,
                      &beta, tmp2, LDS);
  }
#pragma omp target exit data map(from:tmp2[0:DIM*LDS])
  print_m(tmp2, DIM, LDS, LDS, "tmp2_cublas");

  // Compute tmp3 = S^{-1} - tmp2, element-wise on the device
  beta = -1.0;
#pragma omp target enter data map(to:Slater_inv[0:DIM*LDS], tmp2[0:DIM*LDS], tmp3[0:DIM*LDS])
#pragma omp target data use_device_ptr(Slater_inv, tmp2, tmp3)
  {
    // column-major view of the row-major DIM x LDS buffers is LDS x DIM
    (void)cublasDgeam(handle,
                      CUBLAS_OP_N, CUBLAS_OP_N,
                      LDS, DIM,
                      &alpha, Slater_inv, LDS,
                      &beta, tmp2, LDS,
                      tmp3, LDS);
  }
#pragma omp target exit data map(from:tmp3[0:DIM*LDS])
  print_m(tmp3, DIM, LDS, LDS, "tmp3_cublas");

  for (int i = 0; i < DIM * LDS; ++i) {
    Slater_inv[i] = tmp3[i];
  }

  free(B);
  free(C);
  free(D);
  free(tmp1);
  free(tmp2);
  free(tmp3);
  return 0;
}
#endif
2022-07-11 14:48:59 +02:00
uint32_t qmckl_slagel_splitting (
const uint64_t vLDS , const uint64_t vDim , uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) later_updates ,
uint64_t * __restrict later_index , uint64_t * __restrict later ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
2022-07-21 08:16:25 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) C [ LDS ] ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) D [ LDS ] ;
uint32_t l = 0 ;
// For each update
while ( l < N_updates ) {
// C = S^{-1} x U_l
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i ] = 0.0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
C [ i ] + = Slater_inv [ i * Lds + j ] * Updates [ l * Lds + j ] ; // regular mat-vec product, but actually working on S_inv^T * U_l.
2022-07-11 14:48:59 +02:00
}
}
// Denominator
const int cui = Updates_index [ l ] - 1 ;
double den = 1.0 + C [ cui ] ;
// printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui);
if ( fabs ( den ) < breakdown ) { // Here is decided to split the update, or not.
// printf("Split! breakdown = %f\n", breakdown);
n_splits + = 1 ;
// U_l = U_l / 2: split the update in 2 equal halves and save the second halve
// in later_updates
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t i = 0 ; i < Lds ; i + + ) {
later_updates [ * later * Lds + i ] = Updates [ l * Lds + i ] / 2.0 ;
2022-07-11 14:48:59 +02:00
C [ i ] / = 2.0 ;
}
later_index [ * later ] = Updates_index [ l ] ;
( * later ) + + ;
den = 1.0 + C [ cui ] ;
} // From here onwards we continue with applying the first halve of the update to Slater_inv
double iden = 1.0 / den ;
if ( ! determinant ) * determinant * = den ;
// D = v^T x S^{-1} : 1 x LDS
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
D [ j ] = Slater_inv [ cui * Lds + j ] ;
2022-07-11 14:48:59 +02:00
}
// S^{-1} = S^{-1} - C x D / den
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
const double update = C [ i ] * D [ j ] * iden ;
2022-07-22 11:34:29 +02:00
Slater_inv [ i * Lds + j ] - = update ;
2022-07-11 14:48:59 +02:00
}
}
l + = 1 ;
}
return 0 ;
}
uint32_t qmckl_sherman_morrison_splitting (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) later_updates [ LDS * N_updates ] ;
uint64_t later_index [ N_updates ] ;
uint64_t later = 0 ;
2022-07-21 12:21:51 +02:00
// uint32_t rc;
2022-07-11 14:48:59 +02:00
2022-07-22 11:34:29 +02:00
( void ) qmckl_slagel_splitting ( Lds , Dim , N_updates , Updates , Updates_index ,
2022-07-11 14:48:59 +02:00
breakdown , Slater_inv , later_updates , later_index ,
& later , determinant ) ;
2022-07-22 11:34:29 +02:00
// rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
2022-07-21 12:21:51 +02:00
// breakdown, Slater_inv, later_updates, later_index,
// &later, determinant);
2022-07-11 14:48:59 +02:00
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
if ( later > 0 ) {
recursive_calls + + ;
// printf("Later > 0\n");
2022-07-22 11:34:29 +02:00
( void ) qmckl_sherman_morrison_splitting ( Lds , Dim , later , later_updates ,
2022-07-11 14:48:59 +02:00
later_index , breakdown , Slater_inv ,
determinant ) ;
2022-07-21 12:21:51 +02:00
2022-07-22 11:34:29 +02:00
// rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
2022-07-21 12:21:51 +02:00
// later_index, breakdown, Slater_inv,
// determinant);
2022-07-11 14:48:59 +02:00
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
}
return 0 ;
}
// Blocked update driver: applies the N_updates rank-1 updates in blocks of
// 3 with the Woodbury 3x3 kernel (special case N_updates == 4: two 2x2
// blocks), a remainder of 2 with the Woodbury 2x2 kernel, and a remainder
// of 1 with qmckl_slagel_splitting. Any block whose Woodbury solve breaks
// down is re-sent through qmckl_slagel_splitting, and all deferred halves
// are finally applied with qmckl_sherman_morrison_splitting.
// Always returns 0 (individual block failures are handled internally).
uint32_t qmckl_sherman_morrison_smw32s(
    const uint64_t vLDS, const uint64_t vDim, const uint64_t N_updates,
    const double *__restrict __attribute__((aligned(8))) Updates,
    const uint64_t *__restrict Updates_index, const double breakdown,
    double *__restrict __attribute__((aligned(8))) Slater_inv,
    double *__restrict determinant) {

  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;

  // Deferred update halves produced by qmckl_slagel_splitting.
  double __attribute__((aligned(8))) later_updates[LDS * N_updates];
  uint64_t later_index[N_updates];
  uint64_t later = 0;
  uint32_t rc;

  if (N_updates == 4) { // Special case for 4 rank-1 updates: 2+2
    // First pair of updates via Woodbury 2x2.
    rc = qmckl_woodbury_2(Lds, Dim, Updates, Updates_index,
                          breakdown, Slater_inv, determinant);
    if (rc != 0) { // Send the entire block to slagel_splitting
      block_fail += 1;
      uint64_t l = 0;
      rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates,
                                  Updates_index, breakdown, Slater_inv,
                                  later_updates + (Lds * later),
                                  later_index + later, &l, determinant);
      later += l;
    }
    // Second pair of updates via Woodbury 2x2.
    rc = qmckl_woodbury_2(Lds, Dim, &Updates[2 * Lds], &Updates_index[2],
                          breakdown, Slater_inv, determinant);
    if (rc != 0) { // Send the entire block to slagel_splitting
      block_fail += 1;
      uint64_t l = 0;
      rc = qmckl_slagel_splitting(Lds, Dim, 2, &Updates[2 * Lds],
                                  &Updates_index[2], breakdown, Slater_inv,
                                  later_updates + (Lds * later),
                                  later_index + later, &l, determinant);
      later += l;
    }
    if (later > 0) {
      recursive_calls++;
      rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
                                            later_index, breakdown, Slater_inv,
                                            determinant);
    }
    return 0;
  }

  // And for the other cases != 4, 6
  // Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates
  // with Woodbury 3x3 kernel
  uint32_t n_of_3blocks = N_updates / 3;
  uint32_t remainder = N_updates % 3;
  uint32_t length_3block = 3 * Lds;

  if (n_of_3blocks > 0) {
    for (uint32_t i = 0; i < n_of_3blocks; i++) {
      const double *Updates_3block = &Updates[i * length_3block];
      const uint64_t *Updates_index_3block = &Updates_index[i * 3];
      rc = qmckl_woodbury_3(Lds, Dim, Updates_3block, Updates_index_3block,
                            breakdown, Slater_inv, determinant);
      if (rc != 0) { // Send the entire block to slagel_splitting
        block_fail += 1;
        uint64_t l = 0;
        rc = qmckl_slagel_splitting(Lds, Dim, 3, Updates_3block,
                                    Updates_index_3block, breakdown, Slater_inv,
                                    later_updates + (Lds * later),
                                    later_index + later, &l, determinant);
        later += l;
      }
    }
  }

  // Apply last remaining block of 2 updates with Woodbury 2x2 kernel
  if (remainder == 2) {
    const double *Updates_2block = &Updates[n_of_3blocks * length_3block];
    const uint64_t *Updates_index_2block = &Updates_index[3 * n_of_3blocks];
    rc = qmckl_woodbury_2(Lds, Dim, Updates_2block, Updates_index_2block,
                          breakdown, Slater_inv, determinant);
    if (rc != 0) { // Send the entire block to slagel_splitting
      block_fail += 1;
      uint64_t l = 0;
      rc = qmckl_slagel_splitting(Lds, Dim, 2, Updates_2block,
                                  Updates_index_2block, breakdown, Slater_inv,
                                  later_updates + (Lds * later),
                                  later_index + later, &l, determinant);
      later += l;
    }
  }

  // Apply last remaining update with slagel_splitting
  if (remainder == 1) {
    const double *Updates_1block = &Updates[n_of_3blocks * length_3block];
    const uint64_t *Updates_index_1block = &Updates_index[3 * n_of_3blocks];
    uint64_t l = 0;
    rc = qmckl_slagel_splitting(Lds, Dim, 1, Updates_1block,
                                Updates_index_1block, breakdown, Slater_inv,
                                later_updates + (Lds * later),
                                later_index + later, &l, determinant);
    later += l;
  }

  // Re-apply everything that was deferred by the splitting kernel.
  if (later > 0) {
    recursive_calls++;
    rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
                                          later_index, breakdown, Slater_inv,
                                          determinant);
  }
  return 0;
}
// Sherman Morrison, leaving zero denominators for later
uint32_t qmckl_sherman_morrison_later (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
2022-07-22 11:34:29 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) C [ DIM ] ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) D [ LDS ] ;
double __attribute__ ( ( aligned ( 8 ) ) ) later_updates [ LDS * N_updates ] ;
uint64_t later_index [ N_updates ] ;
uint64_t later = 0 ;
uint32_t l = 0 ;
// For each update
while ( l < N_updates ) {
// C = A^{-1} x U_l
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i ] = 0.0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
C [ i ] + = Slater_inv [ i * Lds + j ] * Updates [ l * Lds + j ] ; // regular mat-vec product, but actually working on S_inv^T * U_l.
2022-07-11 14:48:59 +02:00
}
}
// Denominator
const int cui = Updates_index [ l ] - 1 ;
double den = 1.0 + C [ cui ] ;
if ( fabs ( den ) < breakdown ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-11 14:48:59 +02:00
// for (uint32_t i = 0; i < Dim; i++) {
2022-07-22 11:34:29 +02:00
for ( uint32_t i = 0 ; i < Lds ; i + + ) {
later_updates [ later * Lds + i ] = Updates [ l * Lds + i ] ;
2022-07-11 14:48:59 +02:00
}
later_index [ later ] = Updates_index [ l ] ;
later + + ;
l + = 1 ;
continue ;
}
double iden = 1.0 / den ;
if ( ! determinant ) * determinant * = den ;
// D = v^T x A^{-1}
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
D [ j ] = Slater_inv [ cui * Lds + j ] ;
2022-07-11 14:48:59 +02:00
}
// S^{-1} = S^{-1} - C x D / den
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
const double update = C [ i ] * D [ j ] * iden ;
2022-07-22 11:34:29 +02:00
Slater_inv [ i * Lds + j ] - = update ;
2022-07-11 14:48:59 +02:00
}
}
l + = 1 ;
}
if ( later = = N_updates ) { // If all the updates have failed, exit early with an error
return 1 ;
}
else if ( later > 0 ) { // If some have failed, make a recursive call
recursive_calls + + ;
2022-07-22 11:34:29 +02:00
( void ) qmckl_sherman_morrison_later ( Lds , Dim , later , later_updates ,
2022-07-11 14:48:59 +02:00
later_index , breakdown , Slater_inv , determinant ) ;
}
return 0 ;
}
// Inplace inverse of the matrix 'a' (row-major, m x n as factorized,
// inverted as n x n — callers are expected to pass m == n).
// returns:
//   ret = 0 on success
//   ret < 0 illegal argument value
//   ret > 0 singular matrix
lapack_int inverse(double *a, uint64_t m, uint64_t n) {
  // BUGFIX: pivot array was 'int', but LAPACKE expects lapack_int
  // (a mismatch whenever lapack_int is not plain int, e.g. ILP64 builds).
  lapack_int ipiv[m + 1];
  lapack_int ret;
  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, m, n, a, n, ipiv);
  if (ret != 0) return ret;
  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
  return ret;
}