2022-07-11 14:48:59 +02:00
# include <math.h>
# include <stdint.h>
# include "kernels.h"
2022-09-09 17:15:12 +02:00
# include "debug.h"
2022-07-11 14:48:59 +02:00
extern uint64_t n_splits ;
extern uint64_t block_fail ;
extern uint64_t recursive_calls ;
2022-07-20 19:09:55 +02:00
/* Return the smaller of two integers. */
int min(int a, int b) {
  return (a < b) ? a : b;
}
2022-07-11 14:48:59 +02:00
uint32_t qmckl_sherman_morrison (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
double __attribute__ ( ( aligned ( 8 ) ) ) C [ DIM ] ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) D [ LDS ] ;
uint32_t l = 0 ;
// For each update
while ( l < N_updates ) {
// C = S^{-1} x u_l
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i ] = 0.0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
C [ i ] + = Slater_inv [ i * Lds + j ] * Updates [ l * Lds + j ] ; // regular mat-vec product, but actually working on S_inv^T * U_l.
2022-07-11 14:48:59 +02:00
}
}
// Denominator: v_l^T * C
const int cui = Updates_index [ l ] - 1 ;
double den = 1.0 + C [ cui ] ;
if ( fabs ( den ) < breakdown ) {
return 1 ;
}
double iden = 1.0 / den ;
// Update det(A)
if ( ! determinant )
* determinant * = den ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
D [ j ] = Slater_inv [ cui * Lds + j ] ; // selecting proper column of v_l^T * S_inv
2022-07-11 14:48:59 +02:00
}
// A^{-1} = A^{-1} - C x D / den
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
const double update = C [ i ] * D [ j ] * iden ;
2022-07-22 11:34:29 +02:00
Slater_inv [ i * Lds + j ] - = update ;
2022-07-11 14:48:59 +02:00
}
}
l + = 1 ;
}
return 0 ;
}
2022-07-22 11:34:29 +02:00
/*
COMPUTE S ^ { - 1 } P - CB ^ { - 1 } D : Dim x LDS ,
where S ^ { - 1 } P : Dim x LDS ,
C : = S ^ { - 1 } PP ^ TU : Dim x 2 ,
B : = 1 + VC : 2 x 2 ,
D : = VS ^ { - 1 } P : 2 x LDS ,
P ^ TU : LDS x 2 ,
V : 2 x Dim
*/
2022-07-11 14:48:59 +02:00
uint32_t qmckl_woodbury_2 ( const uint64_t vLDS , const uint64_t vDim ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) )
Updates ,
const uint64_t * __restrict Updates_index ,
const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) )
Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
const uint32_t row1 = ( Updates_index [ 0 ] - 1 ) ;
const uint32_t row2 = ( Updates_index [ 1 ] - 1 ) ;
// Compute C = (S^T)^{-1}U : Dim x 2
2022-07-22 11:34:29 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) C [ 2 * DIM ] ;
2022-07-11 14:48:59 +02:00
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i * 2 ] = 0 ;
C [ i * 2 + 1 ] = 0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t k = 0 ; k < Lds ; k + + ) {
C [ i * 2 ] + = Slater_inv [ i * Lds + k ] * Updates [ k ] ;
C [ i * 2 + 1 ] + = Slater_inv [ i * Lds + k ] * Updates [ Lds + k ] ;
2022-07-11 14:48:59 +02:00
}
}
// Compute B = 1 + VC : 2 x 2
const double B0 = C [ row1 * 2 ] + 1 ;
const double B1 = C [ row1 * 2 + 1 ] ;
const double B2 = C [ row2 * 2 ] ;
const double B3 = C [ row2 * 2 + 1 ] + 1 ;
// Check if determinant of inverted matrix is not zero
double det = B0 * B3 - B1 * B2 ;
if ( fabs ( det ) < breakdown ) {
return 1 ;
}
// Update det(S) when passed
if ( determinant ! = NULL )
* determinant * = det ;
// Compute B^{-1} with explicit formula for 2 x 2 inversion
double __attribute__ ( ( aligned ( 8 ) ) ) Binv [ 4 ] , idet = 1.0 / det ;
Binv [ 0 ] = idet * B3 ;
Binv [ 1 ] = - 1.0 * idet * B1 ;
Binv [ 2 ] = - 1.0 * idet * B2 ;
Binv [ 3 ] = idet * B0 ;
// tmp = B^{-1}D : 2 x LDS
double __attribute__ ( ( aligned ( 8 ) ) ) tmp [ 2 * LDS ] ;
2022-07-22 11:34:29 +02:00
double * __restrict r1dim = & ( Slater_inv [ row1 * Lds ] ) ;
double * __restrict r2dim = & ( Slater_inv [ row2 * Lds ] ) ;
2022-07-11 14:48:59 +02:00
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
tmp [ j ] = Binv [ 0 ] * r1dim [ j ] + Binv [ 1 ] * r2dim [ j ] ;
2022-07-22 11:34:29 +02:00
tmp [ Lds + j ] = Binv [ 2 ] * r1dim [ j ] + Binv [ 3 ] * r2dim [ j ] ;
2022-07-11 14:48:59 +02:00
}
2022-07-22 11:34:29 +02:00
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
2022-07-11 14:48:59 +02:00
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
Slater_inv [ i * Lds + j ] - = C [ i * 2 ] * tmp [ j ] ;
Slater_inv [ i * Lds + j ] - = C [ i * 2 + 1 ] * tmp [ Lds + j ] ;
2022-07-11 14:48:59 +02:00
}
}
return 0 ;
}
2022-07-22 11:34:29 +02:00
/*
COMPUTE ( S ^ T ) ^ { - 1 } - CB ^ { - 1 } D : Dim x LDS ,
where S ^ T : Dim x LDS ,
C : = ( S ^ T ) ^ { - 1 } U : Dim x 3 ,
B : = 1 + VC : 3 x 3 ,
D : = V ( S ^ T ) ^ { - 1 } : 3 x LDS ,
U : LDS x 3 ,
V : 3 x Dim
*/
2022-07-11 14:48:59 +02:00
uint32_t qmckl_woodbury_3 ( const uint64_t vLDS , const uint64_t vDim ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) )
Updates ,
const uint64_t * __restrict Updates_index ,
const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) )
Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
const uint32_t row1 = ( Updates_index [ 0 ] - 1 ) ;
const uint32_t row2 = ( Updates_index [ 1 ] - 1 ) ;
const uint32_t row3 = ( Updates_index [ 2 ] - 1 ) ;
// Compute C = (S^T)^{-1}U : Dim x 3
2022-07-22 11:34:29 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) C [ 3 * DIM ] ;
2022-07-11 14:48:59 +02:00
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i * 3 ] = 0 ;
C [ i * 3 + 1 ] = 0 ;
C [ i * 3 + 2 ] = 0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t k = 0 ; k < Lds ; k + + ) {
C [ i * 3 ] + = Slater_inv [ i * Lds + k ] * Updates [ k ] ;
C [ i * 3 + 1 ] + = Slater_inv [ i * Lds + k ] * Updates [ Lds + k ] ;
C [ i * 3 + 2 ] + = Slater_inv [ i * Lds + k ] * Updates [ 2 * Lds + k ] ;
2022-07-11 14:48:59 +02:00
}
}
// Compute B = 1 + VC : 3 x 3
const double B0 = C [ row1 * 3 ] + 1 ;
const double B1 = C [ row1 * 3 + 1 ] ;
const double B2 = C [ row1 * 3 + 2 ] ;
const double B3 = C [ row2 * 3 ] ;
const double B4 = C [ row2 * 3 + 1 ] + 1 ;
const double B5 = C [ row2 * 3 + 2 ] ;
const double B6 = C [ row3 * 3 ] ;
const double B7 = C [ row3 * 3 + 1 ] ;
const double B8 = C [ row3 * 3 + 2 ] + 1 ;
// Check if determinant of B is not too close to zero
double det ;
det = B0 * ( B4 * B8 - B5 * B7 ) - B1 * ( B3 * B8 - B5 * B6 ) +
B2 * ( B3 * B7 - B4 * B6 ) ;
if ( fabs ( det ) < breakdown ) {
return 1 ;
}
// Update det(Slater) if passed
if ( determinant ! = NULL )
* determinant * = det ;
// Compute B^{-1} with explicit formula for 3 x 3 inversion
double __attribute__ ( ( aligned ( 8 ) ) ) Binv [ 9 ] , idet = 1.0 / det ;
Binv [ 0 ] = ( B4 * B8 - B7 * B5 ) * idet ;
Binv [ 1 ] = - ( B1 * B8 - B7 * B2 ) * idet ;
Binv [ 2 ] = ( B1 * B5 - B4 * B2 ) * idet ;
Binv [ 3 ] = - ( B3 * B8 - B6 * B5 ) * idet ;
Binv [ 4 ] = ( B0 * B8 - B6 * B2 ) * idet ;
Binv [ 5 ] = - ( B0 * B5 - B3 * B2 ) * idet ;
Binv [ 6 ] = ( B3 * B7 - B6 * B4 ) * idet ;
Binv [ 7 ] = - ( B0 * B7 - B6 * B1 ) * idet ;
Binv [ 8 ] = ( B0 * B4 - B3 * B1 ) * idet ;
// tmp = B^{-1}D : 3 x LDS
double __attribute__ ( ( aligned ( 8 ) ) ) tmp [ 3 * LDS ] ;
double * __restrict r1dim = & ( Slater_inv [ row1 * LDS ] ) ;
double * __restrict r2dim = & ( Slater_inv [ row2 * LDS ] ) ;
double * __restrict r3dim = & ( Slater_inv [ row3 * LDS ] ) ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
tmp [ j ] = Binv [ 0 ] * r1dim [ j ] + Binv [ 1 ] * r2dim [ j ] + Binv [ 2 ] * r3dim [ j ] ;
2022-07-22 11:34:29 +02:00
tmp [ Lds + j ] = Binv [ 3 ] * r1dim [ j ] + Binv [ 4 ] * r2dim [ j ] + Binv [ 5 ] * r3dim [ j ] ;
tmp [ 2 * Lds + j ] = Binv [ 6 ] * r1dim [ j ] + Binv [ 7 ] * r2dim [ j ] + Binv [ 8 ] * r3dim [ j ] ;
2022-07-11 14:48:59 +02:00
}
2022-07-22 11:34:29 +02:00
// Compute (S^T)^{-1} - C * tmp : Dim x Lds
2022-07-11 14:48:59 +02:00
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
Slater_inv [ i * Lds + j ] - = C [ i * 3 ] * tmp [ j ] ;
Slater_inv [ i * Lds + j ] - = C [ i * 3 + 1 ] * tmp [ Lds + j ] ;
Slater_inv [ i * Lds + j ] - = C [ i * 3 + 2 ] * tmp [ 2 * Lds + j ] ;
2022-07-11 14:48:59 +02:00
}
}
return 0 ;
}
2022-07-20 19:09:55 +02:00
/*
COMPUTE S ^ { - 1 } - C B ^ { - 1 } D : Dim x LDS ,
where S ^ { - 1 } : Dim x LDS ,
C : = S ^ { - 1 } U : Dim x K , dgemm
B : = 1 + V C : K x K , copy
D : = V S ^ { - 1 } : K x LDS , copy
U : LDS x K ,
V : K x Dim
tmp : = B ^ { - 1 } D : K x LDS , dgemm
S = S - C tmp : Dim x LDS , dgemm
*/
uint32_t qmckl_woodbury_k ( const uint64_t vLDS ,
const uint64_t vDim ,
const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index ,
const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-20 19:09:55 +02:00
// Compute C = S^{-1} U : Dim x K : standard dgemm
2022-09-09 17:15:12 +02:00
double * C = calloc ( 1 , DIM * N_updates * sizeof ( double ) ) ;
2022-07-20 19:09:55 +02:00
double alpha = 1.0 , beta = 0.0 ;
2022-09-09 17:15:12 +02:00
2022-07-20 19:09:55 +02:00
cblas_dgemm ( CblasRowMajor , CblasNoTrans , CblasTrans ,
2022-07-22 11:34:29 +02:00
Dim , N_updates , Lds ,
alpha , Slater_inv , Lds , Updates , Lds ,
2022-07-20 19:09:55 +02:00
beta , C , N_updates ) ;
// Construct B = 1 + V C : K x K : selecting and copying row from C into B. Can maybe be off-loaded to GPU by splitting in N_updates tiles of N_updates strides, using PARALLEL and SIMD
// Construct D = V S^{-1} : K x LDS
double B [ N_updates * N_updates ] , D [ N_updates * LDS ] ;
for ( uint32_t i = 0 ; i < N_updates ; i + + ) {
const uint32_t row = Updates_index [ i ] - 1 ;
for ( uint32_t j = 0 ; j < N_updates ; j + + ) B [ i * N_updates + j ] = C [ row * N_updates + j ] + ( i = = j ) ;
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) D [ i * Lds + j ] = Slater_inv [ row * Lds + j ] ;
2022-07-20 19:09:55 +02:00
}
// Compute determinant by LU decomposition
2022-09-23 18:57:54 +02:00
int * ipiv = calloc ( 1 , sizeof * ipiv * N_updates ) ;
2022-09-22 14:37:00 +02:00
( void ) LAPACKE_dgetrf ( LAPACK_ROW_MAJOR , N_updates , N_updates , B , N_updates , ipiv ) ;
2022-07-20 19:09:55 +02:00
double det = 1.0 ;
int j = 0 ;
for ( uint32_t i = 0 ; i < N_updates ; i + + ) {
2022-07-21 13:57:28 +02:00
j + = min ( ipiv [ i ] - i , 1 ) ;
2022-07-20 19:09:55 +02:00
det * = B [ ( N_updates + 1 ) * i ] ;
}
2022-07-21 12:21:51 +02:00
if ( ( j & 1 ) = = 0 ) det = - det ; // multiply det with -1 if j is even
2022-07-20 19:09:55 +02:00
// Check if determinant of B is not too close to zero
if ( fabs ( det ) < breakdown ) {
return 1 ;
}
// Update det(Slater) if passed
if ( determinant ) * determinant * = det ;
// Compute B^{-1} with explicit formula for K x K inversion
2022-09-22 14:37:00 +02:00
( void ) LAPACKE_dgetri ( LAPACK_ROW_MAJOR , N_updates , B , N_updates , ipiv ) ;
2022-07-20 19:09:55 +02:00
2022-09-09 17:15:12 +02:00
// tmp1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
double tmp1 [ N_updates * LDS ] ;
2022-07-20 19:09:55 +02:00
cblas_dgemm ( CblasRowMajor , CblasNoTrans , CblasNoTrans ,
N_updates , LDS , N_updates ,
alpha , B , N_updates , D , LDS ,
2022-09-09 17:15:12 +02:00
beta , tmp1 , LDS ) ;
// Compute S^{-1} - C * tmp1 : Dim x LDS : standard dgemm
2022-09-22 14:37:00 +02:00
alpha = - 1.0 , beta = 1.0 ;
cblas_dgemm ( CblasRowMajor , CblasNoTrans , CblasNoTrans ,
2022-07-20 19:09:55 +02:00
Dim , LDS , N_updates ,
2022-09-09 17:15:12 +02:00
alpha , C , N_updates , tmp1 , LDS ,
2022-09-22 14:37:00 +02:00
beta , Slater_inv , LDS ) ;
2022-07-20 19:09:55 +02:00
2022-09-23 18:57:54 +02:00
free ( ipiv ) ;
2022-07-20 19:09:55 +02:00
return 0 ;
}
2022-07-21 12:21:51 +02:00
# ifdef HAVE_CUBLAS_OFFLOAD
2022-09-23 18:57:54 +02:00
/* Woodbury rank-K update of the inverse Slater matrix, off-loaded to the GPU
 * with cuBLAS/cuSOLVER through OpenMP target data mapping.
 *
 * Returns 0 on success, 1 when det(B) falls below 'breakdown'.
 * 'determinant', when non-NULL, is multiplied by det(B).
 *
 * BUG FIXES vs. original:
 *  - the breakdown path returned '(uint32_t) det', which truncates any
 *    |det| < 1 to 0 (i.e. reports SUCCESS); it also leaked every host
 *    allocation and left all device mappings alive. It now cleans up and
 *    returns 1;
 *  - 'd_work' was never freed and 'free(ipiv)' was commented out; both are
 *    now released (and 'd_work' is also removed from the device).
 */
uint32_t qmckl_woodbury_k_cublas_offload(cublasHandle_t b_handle, cusolverDnHandle_t s_handle,
                                         const uint64_t vLDS,
                                         const uint64_t vDim,
                                         const uint64_t N_updates,
                                         const double *Updates,
                                         const uint64_t *Updates_index,
                                         const double breakdown,
                                         double *Slater_inv,
                                         double *determinant)
{
  const uint32_t Dim = DIM;
  const uint32_t Lds = LDS;

  double alpha, beta;
  int *ipiv = calloc(1, sizeof *ipiv * N_updates);
  double *C = calloc(1, sizeof *C * Dim * N_updates);
  double *B = calloc(1, sizeof *B * N_updates * N_updates);
  double *D = calloc(1, sizeof *D * N_updates * Lds);
  double *T1 = calloc(1, sizeof *T1 * N_updates * Lds);
  double *T2 = calloc(1, sizeof *T2 * Dim * Lds);

  // Workspace query for the device LU factorisation.
  // NOTE(review): 'info' is passed as NULL to cusolverDnDgetrf below; the
  // cuSOLVER API expects a valid device pointer for the info output — confirm.
  int lwork = 0, *info = NULL; double *d_work = NULL;
  cusolverDnDgetrf_bufferSize(s_handle, N_updates, N_updates, B, N_updates, &lwork);
  printf("Size of lwork = %d\n", lwork);
  d_work = calloc(1, sizeof *d_work * lwork);

#pragma omp target enter data map(to: Updates[0:Lds*N_updates], \
                                  Updates_index[0:N_updates], \
                                  Slater_inv[0:Dim*Lds]) \
                              map(alloc: B[0:N_updates*N_updates], \
                                  C[0:Dim*N_updates], \
                                  D[0:N_updates*Lds], \
                                  T1[0:N_updates*Lds], \
                                  T2[0:Dim*Lds], \
                                  ipiv[0:N_updates], \
                                  d_work[0:lwork])

#pragma omp target data use_device_ptr(Slater_inv, Updates, C) // compute C ON DEVICE
  {
    alpha = 1.0, beta = 0.0;
    (void)cublasDgemm(b_handle,
                      CUBLAS_OP_T, CUBLAS_OP_N,
                      N_updates, Dim, Lds,
                      &alpha, Updates, Lds, Slater_inv, Lds,
                      &beta, C, N_updates);
  }

  // Construct B = 1 + V C : K x K
  // Construct D = V S^{-1} : K x LDS
#pragma omp target teams distribute parallel for // compute B, D ON DEVICE
  for (uint32_t i = 0; i < N_updates; i++)
  {
    const uint32_t row = Updates_index[i] - 1;
    for (uint32_t j = 0; j < N_updates; j++)
    {
      B[j * N_updates + i] = C[row * N_updates + j] + (i == j); // B NEEDS TO BE IN COL-MAJ FOR cusolverDnDgetrf !
    }
    for (uint32_t j = 0; j < Lds; j++)
    {
      D[i * Lds + j] = Slater_inv[row * Lds + j];
    }
  }

  // Compute determinant by LU decomposition
#pragma omp target data use_device_ptr(B, d_work, ipiv) // factorise B ON DEVICE
  {
    (void)cusolverDnDgetrf(s_handle, N_updates, N_updates, B, N_updates, d_work, ipiv, info);
  }

  // NOTE(review): 'det' and 'j' are accumulated inside a target parallel
  // region without reduction clauses or explicit mapping — verify this
  // actually produces the host-side values relied on below.
  double det = 1.0; uint32_t j = 0;
#pragma omp target teams distribute parallel for // compute det ON DEVICE
  for (uint32_t i = 0; i < N_updates; i++)
  {
    int row = ipiv[i] - i;
    j += min(row, 1);
    det *= B[(N_updates + 1) * i]; // update determinant
  }
  if ((j & 1) == 0) det = -det; // multiply det with -1 if j is even

  // Check if determinant of B is not too close to zero
  if (fabs(det) < breakdown)
  {
    // BUG FIX: was 'return det;' — see header comment. Release device
    // mappings and host buffers before reporting the breakdown.
#pragma omp target exit data map(delete: Updates[0:Lds*N_updates], \
                                 Updates_index[0:N_updates], \
                                 Slater_inv[0:Dim*Lds], \
                                 B[0:N_updates*N_updates], \
                                 C[0:Dim*N_updates], \
                                 D[0:N_updates*Lds], \
                                 T1[0:N_updates*Lds], \
                                 T2[0:Dim*Lds], \
                                 ipiv[0:N_updates], \
                                 d_work[0:lwork])
    free(B);
    free(C);
    free(D);
    free(T1);
    free(T2);
    free(ipiv);
    free(d_work);
    return 1;
  }

  // Update det(Slater) if passed
  if (determinant) *determinant *= det;

  // Compute B^{-1} on the host (col-major, matching the device layout)
#pragma omp target update from(B[0:N_updates*N_updates], ipiv[0:N_updates])
  (void)LAPACKE_dgetri(LAPACK_COL_MAJOR, N_updates, B, N_updates, ipiv); // compute B^-1 ON HOST
#pragma omp target update to(B[:N_updates*N_updates]) // Update B^-1 TO DEVICE

  // T1 = B^{-1} D : KxLDS = KxK X KxLDS : standard dgemm
#pragma omp target data use_device_ptr(D, B, T1) // compute T1 ON DEVICE
  {
    alpha = 1.0, beta = 0.0;
    (void)cublasDgemm(b_handle,
                      CUBLAS_OP_N, CUBLAS_OP_T, // B is transposed: col-major from the LAPACKE call above
                      Lds, N_updates, N_updates,
                      &alpha, D, Lds, B, N_updates,
                      &beta, T1, Lds);
  }

  // Compute T2 <- C * T1 : Dim x LDS : standard dgemm
#pragma omp target data use_device_ptr(T1, C, T2) // compute T2 ON DEVICE
  {
    alpha = 1.0, beta = 0.0;
    (void)cublasDgemm(b_handle,
                      CUBLAS_OP_N, CUBLAS_OP_N,
                      Dim, Lds, N_updates,
                      &alpha, T1, Lds, C, N_updates,
                      &beta, T2, Lds);
  }

  // Compute S^{-1} <- S^{-1} - T2 : Dim x LDS : element-wise subtraction
#pragma omp target teams distribute parallel for // compute S^-1 ON DEVICE
  for (uint32_t i = 0; i < Dim * Lds; i++)
  {
    Slater_inv[i] = Slater_inv[i] - T2[i];
  }

#pragma omp target update from(Slater_inv[0:Dim*Lds]) // update S^-1 ON HOST
#pragma omp target exit data map(delete: Updates[0:Lds*N_updates], \
                                 Updates_index[0:N_updates], \
                                 Slater_inv[0:Dim*Lds], \
                                 B[0:N_updates*N_updates], \
                                 C[0:Dim*N_updates], \
                                 D[0:N_updates*Lds], \
                                 T1[0:N_updates*Lds], \
                                 T2[0:Dim*Lds], \
                                 ipiv[0:N_updates], \
                                 d_work[0:lwork])
  free(B);
  free(C);
  free(D);
  free(T1);
  free(T2);
  free(ipiv);
  free(d_work);
  return 0;
}
# endif
2022-07-11 14:48:59 +02:00
uint32_t qmckl_slagel_splitting (
const uint64_t vLDS , const uint64_t vDim , uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) later_updates ,
uint64_t * __restrict later_index , uint64_t * __restrict later ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
2022-07-21 08:16:25 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) C [ LDS ] ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) D [ LDS ] ;
uint32_t l = 0 ;
// For each update
while ( l < N_updates ) {
// C = S^{-1} x U_l
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i ] = 0.0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
C [ i ] + = Slater_inv [ i * Lds + j ] * Updates [ l * Lds + j ] ; // regular mat-vec product, but actually working on S_inv^T * U_l.
2022-07-11 14:48:59 +02:00
}
}
// Denominator
const int cui = Updates_index [ l ] - 1 ;
2022-09-22 14:37:00 +02:00
double den = 1.0 + C [ cui ] ;
2022-07-11 14:48:59 +02:00
// printf("test breakdown = %f, den = %f, C[cui] = %f, cui = %d\n", breakdown, fabs(den), C[cui], cui);
if ( fabs ( den ) < breakdown ) { // Here is decided to split the update, or not.
// printf("Split! breakdown = %f\n", breakdown);
n_splits + = 1 ;
// U_l = U_l / 2: split the update in 2 equal halves and save the second halve
// in later_updates
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t i = 0 ; i < Lds ; i + + ) {
later_updates [ * later * Lds + i ] = Updates [ l * Lds + i ] / 2.0 ;
2022-07-11 14:48:59 +02:00
C [ i ] / = 2.0 ;
}
later_index [ * later ] = Updates_index [ l ] ;
( * later ) + + ;
den = 1.0 + C [ cui ] ;
} // From here onwards we continue with applying the first halve of the update to Slater_inv
double iden = 1.0 / den ;
if ( ! determinant ) * determinant * = den ;
// D = v^T x S^{-1} : 1 x LDS
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
D [ j ] = Slater_inv [ cui * Lds + j ] ;
2022-07-11 14:48:59 +02:00
}
// S^{-1} = S^{-1} - C x D / den
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
const double update = C [ i ] * D [ j ] * iden ;
2022-07-22 11:34:29 +02:00
Slater_inv [ i * Lds + j ] - = update ;
2022-07-11 14:48:59 +02:00
}
}
l + = 1 ;
}
return 0 ;
}
uint32_t qmckl_sherman_morrison_splitting (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) later_updates [ LDS * N_updates ] ;
uint64_t later_index [ N_updates ] ;
uint64_t later = 0 ;
2022-07-21 12:21:51 +02:00
// uint32_t rc;
2022-07-11 14:48:59 +02:00
2022-07-22 11:34:29 +02:00
( void ) qmckl_slagel_splitting ( Lds , Dim , N_updates , Updates , Updates_index ,
2022-07-11 14:48:59 +02:00
breakdown , Slater_inv , later_updates , later_index ,
& later , determinant ) ;
2022-07-22 11:34:29 +02:00
// rc = qmckl_slagel_splitting(Lds, Dim, N_updates, Updates, Updates_index,
2022-07-21 12:21:51 +02:00
// breakdown, Slater_inv, later_updates, later_index,
// &later, determinant);
2022-07-11 14:48:59 +02:00
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
if ( later > 0 ) {
recursive_calls + + ;
// printf("Later > 0\n");
2022-07-22 11:34:29 +02:00
( void ) qmckl_sherman_morrison_splitting ( Lds , Dim , later , later_updates ,
2022-07-11 14:48:59 +02:00
later_index , breakdown , Slater_inv ,
determinant ) ;
2022-07-21 12:21:51 +02:00
2022-07-22 11:34:29 +02:00
// rc = qmckl_sherman_morrison_splitting(Lds, Dim, later, later_updates,
2022-07-21 12:21:51 +02:00
// later_index, breakdown, Slater_inv,
// determinant);
2022-07-11 14:48:59 +02:00
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
}
return 0 ;
}
uint32_t qmckl_sherman_morrison_smw32s (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) later_updates [ LDS * N_updates ] ;
uint64_t later_index [ N_updates ] ;
uint64_t later = 0 ;
uint32_t rc ;
if ( N_updates = = 4 ) { // Special case for 4 rank-1 updates: 2+2
2022-07-22 11:34:29 +02:00
rc = qmckl_woodbury_2 ( Lds , Dim , Updates , Updates_index ,
2022-07-11 14:48:59 +02:00
breakdown , Slater_inv , determinant ) ;
if ( rc ! = 0 ) { // Send the entire block to slagel_splitting
block_fail + = 1 ;
uint64_t l = 0 ;
2022-07-22 11:34:29 +02:00
rc = qmckl_slagel_splitting ( Lds , Dim , 2 , Updates ,
2022-07-11 14:48:59 +02:00
Updates_index , breakdown , Slater_inv ,
2022-07-22 11:34:29 +02:00
later_updates + ( Lds * later ) ,
2022-07-11 14:48:59 +02:00
later_index + later , & l , determinant ) ;
later + = l ;
}
2022-07-22 11:34:29 +02:00
rc = qmckl_woodbury_2 ( Lds , Dim , & Updates [ 2 * Lds ] , & Updates_index [ 2 ] ,
2022-07-11 14:48:59 +02:00
breakdown , Slater_inv , determinant ) ;
if ( rc ! = 0 ) { // Send the entire block to slagel_splitting
block_fail + = 1 ;
uint64_t l = 0 ;
2022-07-22 11:34:29 +02:00
rc = qmckl_slagel_splitting ( Lds , Dim , 2 , & Updates [ 2 * Lds ] ,
2022-07-11 14:48:59 +02:00
& Updates_index [ 2 ] , breakdown , Slater_inv ,
2022-07-22 11:34:29 +02:00
later_updates + ( Lds * later ) ,
2022-07-11 14:48:59 +02:00
later_index + later , & l , determinant ) ;
later + = l ;
}
if ( later > 0 ) {
recursive_calls + + ;
2022-07-22 11:34:29 +02:00
rc = qmckl_sherman_morrison_splitting ( Lds , Dim , later , later_updates ,
2022-07-11 14:48:59 +02:00
later_index , breakdown , Slater_inv ,
determinant ) ;
}
return 0 ;
}
// And for the other cases != 4, 6
// Apply first 3*n_of_3blocks updates in n_of_3blocks blocks of 3 updates with
2022-09-22 14:37:00 +02:00
// Woodbury 3x3 kernel
2022-07-11 14:48:59 +02:00
uint32_t n_of_3blocks = N_updates / 3 ;
uint32_t remainder = N_updates % 3 ;
2022-07-22 11:34:29 +02:00
uint32_t length_3block = 3 * Lds ;
2022-07-11 14:48:59 +02:00
if ( n_of_3blocks > 0 ) {
for ( uint32_t i = 0 ; i < n_of_3blocks ; i + + ) {
const double * Updates_3block = & Updates [ i * length_3block ] ;
const uint64_t * Updates_index_3block = & Updates_index [ i * 3 ] ;
2022-07-22 11:34:29 +02:00
rc = qmckl_woodbury_3 ( Lds , Dim , Updates_3block , Updates_index_3block ,
2022-07-11 14:48:59 +02:00
breakdown , Slater_inv , determinant ) ;
if ( rc ! = 0 ) { // Send the entire block to slagel_splitting
// printf("QMCKL_WOODBURY_3 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
block_fail + = 1 ;
uint64_t l = 0 ;
2022-07-22 11:34:29 +02:00
rc = qmckl_slagel_splitting ( Lds , Dim , 3 , Updates_3block ,
2022-07-11 14:48:59 +02:00
Updates_index_3block , breakdown , Slater_inv ,
2022-07-22 11:34:29 +02:00
later_updates + ( Lds * later ) ,
2022-07-11 14:48:59 +02:00
later_index + later , & l , determinant ) ;
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
later + = l ;
}
}
}
// Apply last remaining block of 2 updates with Woodbury 2x2 kernel
if ( remainder = = 2 ) {
const double * Updates_2block = & Updates [ n_of_3blocks * length_3block ] ;
const uint64_t * Updates_index_2block = & Updates_index [ 3 * n_of_3blocks ] ;
2022-07-22 11:34:29 +02:00
rc = qmckl_woodbury_2 ( Lds , Dim , Updates_2block , Updates_index_2block ,
2022-07-11 14:48:59 +02:00
breakdown , Slater_inv , determinant ) ;
if ( rc ! = 0 ) { // Send the entire block to slagel_splitting
// printf("QMCKL_WOODBURY_2 failed. Sending to QMCKL_SLAGEL_SPLITTING\n");
block_fail + = 1 ;
uint64_t l = 0 ;
2022-07-22 11:34:29 +02:00
rc = qmckl_slagel_splitting ( Lds , Dim , 2 , Updates_2block ,
2022-07-11 14:48:59 +02:00
Updates_index_2block , breakdown , Slater_inv ,
2022-07-22 11:34:29 +02:00
later_updates + ( Lds * later ) ,
2022-07-11 14:48:59 +02:00
later_index + later , & l , determinant ) ;
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
later + = l ;
}
}
// Apply last remaining update with slagel_splitting
if ( remainder = = 1 ) {
// // printf("Sending single update to QMCKL_SLAGEL_SPLITTING\n");
const double * Updates_1block = & Updates [ n_of_3blocks * length_3block ] ;
const uint64_t * Updates_index_1block = & Updates_index [ 3 * n_of_3blocks ] ;
uint64_t l = 0 ;
2022-07-22 11:34:29 +02:00
rc = qmckl_slagel_splitting ( Lds , Dim , 1 , Updates_1block ,
2022-07-11 14:48:59 +02:00
Updates_index_1block , breakdown , Slater_inv ,
2022-07-22 11:34:29 +02:00
later_updates + ( Lds * later ) ,
2022-07-11 14:48:59 +02:00
later_index + later , & l , determinant ) ;
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SLAGEL_SPLITTING\n");
later + = l ;
}
if ( later > 0 ) {
recursive_calls + + ;
// printf("Sending remaining updates to QMCKL_SHERMAN_MORRISON_SPLITTING\n");
2022-07-22 11:34:29 +02:00
rc = qmckl_sherman_morrison_splitting ( Lds , Dim , later , later_updates ,
2022-07-11 14:48:59 +02:00
later_index , breakdown , Slater_inv ,
determinant ) ;
// if (rc != 0) printf("Something when catastrophically wrong in QMCKL_SHERMAN_MORRISON_SPLITTING\n");
}
return 0 ;
}
// Sherman Morrison, leaving zero denominators for later
uint32_t qmckl_sherman_morrison_later (
const uint64_t vLDS , const uint64_t vDim , const uint64_t N_updates ,
const double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Updates ,
const uint64_t * __restrict Updates_index , const double breakdown ,
double * __restrict __attribute__ ( ( aligned ( 8 ) ) ) Slater_inv ,
double * __restrict determinant ) {
2022-07-22 11:34:29 +02:00
const uint32_t Dim = DIM ;
const uint32_t Lds = LDS ;
2022-07-11 14:48:59 +02:00
2022-07-22 11:34:29 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) C [ DIM ] ;
2022-07-11 14:48:59 +02:00
double __attribute__ ( ( aligned ( 8 ) ) ) D [ LDS ] ;
double __attribute__ ( ( aligned ( 8 ) ) ) later_updates [ LDS * N_updates ] ;
uint64_t later_index [ N_updates ] ;
uint64_t later = 0 ;
uint32_t l = 0 ;
// For each update
while ( l < N_updates ) {
// C = A^{-1} x U_l
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
C [ i ] = 0.0 ;
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
C [ i ] + = Slater_inv [ i * Lds + j ] * Updates [ l * Lds + j ] ; // regular mat-vec product, but actually working on S_inv^T * U_l.
2022-07-11 14:48:59 +02:00
}
}
// Denominator
const int cui = Updates_index [ l ] - 1 ;
double den = 1.0 + C [ cui ] ;
if ( fabs ( den ) < breakdown ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-11 14:48:59 +02:00
// for (uint32_t i = 0; i < Dim; i++) {
2022-07-22 11:34:29 +02:00
for ( uint32_t i = 0 ; i < Lds ; i + + ) {
later_updates [ later * Lds + i ] = Updates [ l * Lds + i ] ;
2022-07-11 14:48:59 +02:00
}
later_index [ later ] = Updates_index [ l ] ;
later + + ;
l + = 1 ;
continue ;
}
double iden = 1.0 / den ;
if ( ! determinant ) * determinant * = den ;
// D = v^T x A^{-1}
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
D [ j ] = Slater_inv [ cui * Lds + j ] ;
2022-07-11 14:48:59 +02:00
}
// S^{-1} = S^{-1} - C x D / den
for ( uint32_t i = 0 ; i < Dim ; i + + ) {
# pragma ivdep
2022-07-21 13:57:28 +02:00
# pragma vector aligned
2022-07-22 11:34:29 +02:00
for ( uint32_t j = 0 ; j < Lds ; j + + ) {
2022-07-11 14:48:59 +02:00
const double update = C [ i ] * D [ j ] * iden ;
2022-07-22 11:34:29 +02:00
Slater_inv [ i * Lds + j ] - = update ;
2022-07-11 14:48:59 +02:00
}
}
l + = 1 ;
2022-09-22 14:37:00 +02:00
}
2022-07-11 14:48:59 +02:00
if ( later = = N_updates ) { // If all the updates have failed, exit early with an error
return 1 ;
}
else if ( later > 0 ) { // If some have failed, make a recursive call
recursive_calls + + ;
2022-07-22 11:34:29 +02:00
( void ) qmckl_sherman_morrison_later ( Lds , Dim , later , later_updates ,
2022-07-11 14:48:59 +02:00
later_index , breakdown , Slater_inv , determinant ) ;
}
return 0 ;
}
// Inplace inverse n x n matrix A.
// returns:
// ret = 0 on success
// ret < 0 illegal argument value
// ret > 0 singular matrix
// In-place inversion of the m x n matrix 'a' (row-major, leading dimension n)
// via LU factorization followed by dgetri.
// returns:
//   ret = 0 on success
//   ret < 0 illegal argument value
//   ret > 0 singular matrix
lapack_int inverse(double *a, uint64_t m, uint64_t n) {
  // BUG FIX: the pivot array must use LAPACK's integer type; a plain 'int'
  // array silently corrupts memory when LAPACKE is built with 64-bit
  // integers (ILP64).
  lapack_int ipiv[m + 1];
  lapack_int ret;
  ret = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, m, n, a, n, ipiv);
  if (ret != 0)
    return ret;
  ret = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
  return ret;
}