15 KiB
Tiled arrays
To increase performance, matrices may be stored as tiled arrays. Instead of storing a matrix in a two-dimensional array, it may be stored as a two dimensional array of small matrices (a rank 4 tensor). This improves the locality of the data in matrix multiplications, and also enables the possibility to use BLAS3 while also exploiting part of the sparse structure of the matrices.
Tile │ ┌──────┬──────┬──────┐ │ │1 4 7 │ │ │ └───────►│2 5 8 │ T_12 │ T_13 │ │3 6 9 │ │ │ ├──────┼──────┼──────┤ │ │ │ │ │ T_21 │ T_22 │ T_23 │ │ │ │ │ ├──────┼──────┼──────┤ │ │ │ │ │ T_31 │ T_32 │ T_33 │ │ │ │ │ └──────┴──────┴──────┘
In this file, tiled matrice will be produced for the following types:
float |
double |
Data structures
Tile
A tile is a small matrix of fixed size. The dimensions of the tiles is fixed at compile-time to increase performance. It is defined as $2^s$:
s | tile size |
---|---|
2 | 4 |
3 | 8 |
4 | 16 |
5 | 32 |
6 | 64 |
7 | 128 |
#define TILE_SIZE_SHIFT 3
#define TILE_SIZE 8
#define VEC_SIZE 8
typedef struct $T$_tile_struct {
$T$ element[TILE_SIZE][TILE_SIZE];
int64_t is_null;
int64_t padding[VEC_SIZE-1];
} $T$_tile_struct;
Tiled matrix
A tiled matrix is a two-dimensional array of tiles.
typedef struct $T$_tiled_matrix {
$T$_tile_struct** tile;
size_t n_row;
size_t n_col;
size_t n_tile_row;
size_t n_tile_col;
} $T$_tiled_matrix;
When a tiled matrix is initialized, it is set to zero.
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
$T$_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col);
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
$T$_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
} $T$_tiled_matrix;
When a tiled matrix is initialized, it is set to zero.
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
$T$_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col);
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
$T$_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (m == NULL) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"$T$_tiled_matrix_init",
NULL);
}
if (n_tile_row == (size_t) 0) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_3,
"$T$_tiled_matrix_init",
NULL);
}
if (n_tile_col == (size_t) 0) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_4,
"$T$_tiled_matrix_init",
NULL);
}
qmckl_memory_info_struct info = qmckl_memory_info_struct_zero;
size_t n = n_tile_row * n_tile_col;
/* Check overflow */
if (n/n_tile_col != n_tile_row
|| n > SIZE_MAX / sizeof($T$_tile_struct) ) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"$T$_tiled_matrix_init",
"n_tile_row * n_tile_col overflows" );
}
/* Allocate array of column pointers */
info.size = n_tile_col * sizeof($T$_tile_struct*) ;
m->tile = ($T$_tile_struct**) qmckl_malloc(context, info);
if (m->tile == NULL) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"$T$_tiled_matrix_init",
NULL);
}
/* Allocate array of tiles */
info.size = n * sizeof($T$_tile_struct) ;
m->tile[0] = ($T$_tile_struct*) qmckl_malloc(context, info);
if (m->tile[0] == NULL) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"$T$_tiled_matrix_init",
NULL);
}
/* Compute array of pointers to the 1st element of columns */
for (size_t i=1 ; i<n_tile_col ; ++i) {
m->tile[i] = m->tile[i-1] + n_tile_row;
}
m->n_tile_row = n_tile_row;
m->n_tile_col = n_tile_col;
return QMCKL_SUCCESS;
}
Write templates
typedef struct float_tile_struct {
float element[TILE_SIZE][TILE_SIZE];
int32_t is_null;
int32_t padding;
} float_tile_struct;
typedef struct float_tiled_matrix {
float_tile_struct** tile;
size_t n_tile_row;
size_t n_tile_col;
} float_tiled_matrix;
typedef struct double_tile_struct {
double element[TILE_SIZE][TILE_SIZE];
int32_t is_null;
int32_t padding;
} double_tile_struct;
typedef struct double_tiled_matrix {
double_tile_struct** tile;
size_t n_tile_row;
size_t n_tile_col;
} double_tiled_matrix;
qmckl_exit_code float_tiled_matrix_init (qmckl_context context,
float_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col);
qmckl_exit_code double_tiled_matrix_init (qmckl_context context,
double_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col);
qmckl_exit_code float_tiled_matrix_init (qmckl_context context,
float_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (m == NULL) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"float_tiled_matrix_init",
NULL);
}
if (n_tile_row == (size_t) 0) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_3,
"float_tiled_matrix_init",
NULL);
}
if (n_tile_col == (size_t) 0) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_4,
"float_tiled_matrix_init",
NULL);
}
qmckl_memory_info_struct info = qmckl_memory_info_struct_zero;
size_t n = n_tile_row * n_tile_col;
/* Check overflow */
if (n/n_tile_col != n_tile_row
|| n > SIZE_MAX / sizeof(float_tile_struct) ) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"float_tiled_matrix_init",
"n_tile_row * n_tile_col overflows" );
}
/* Allocate array of column pointers */
info.size = n_tile_col * sizeof(float_tile_struct*) ;
m->tile = (float_tile_struct**) qmckl_malloc(context, info);
if (m->tile == NULL) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"float_tiled_matrix_init",
NULL);
}
/* Allocate array of tiles */
info.size = n * sizeof(float_tile_struct) ;
m->tile[0] = (float_tile_struct*) qmckl_malloc(context, info);
if (m->tile[0] == NULL) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"float_tiled_matrix_init",
NULL);
}
/* Compute array of pointers to the 1st element of columns */
for (size_t i=1 ; i<n_tile_col ; ++i) {
m->tile[i] = m->tile[i-1] + n_tile_row;
}
m->n_tile_row = n_tile_row;
m->n_tile_col = n_tile_col;
return QMCKL_SUCCESS;
}
qmckl_exit_code double_tiled_matrix_init (qmckl_context context,
double_tiled_matrix* m,
size_t n_tile_row,
size_t n_tile_col) {
if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
return QMCKL_INVALID_CONTEXT;
}
if (m == NULL) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_2,
"double_tiled_matrix_init",
NULL);
}
if (n_tile_row == (size_t) 0) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_3,
"double_tiled_matrix_init",
NULL);
}
if (n_tile_col == (size_t) 0) {
return qmckl_failwith(context,
QMCKL_INVALID_ARG_4,
"double_tiled_matrix_init",
NULL);
}
qmckl_memory_info_struct info = qmckl_memory_info_struct_zero;
size_t n = n_tile_row * n_tile_col;
/* Check overflow */
if (n/n_tile_col != n_tile_row
|| n > SIZE_MAX / sizeof(double_tile_struct) ) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"double_tiled_matrix_init",
"n_tile_row * n_tile_col overflows" );
}
/* Allocate array of column pointers */
info.size = n_tile_col * sizeof(double_tile_struct*) ;
m->tile = (double_tile_struct**) qmckl_malloc(context, info);
if (m->tile == NULL) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"double_tiled_matrix_init",
NULL);
}
/* Allocate array of tiles */
info.size = n * sizeof(double_tile_struct) ;
m->tile[0] = (double_tile_struct*) qmckl_malloc(context, info);
if (m->tile[0] == NULL) {
return qmckl_failwith(context,
QMCKL_ALLOCATION_FAILED,
"double_tiled_matrix_init",
NULL);
}
/* Compute array of pointers to the 1st element of columns */
for (size_t i=1 ; i<n_tile_col ; ++i) {
m->tile[i] = m->tile[i-1] + n_tile_row;
}
m->n_tile_row = n_tile_row;
m->n_tile_col = n_tile_col;
return QMCKL_SUCCESS;
}