qmckl/org/hpc/qmckl_tile.org

15 KiB

Tiled arrays

To increase performance, matrices may be stored as tiled arrays. Instead of storing a matrix in a two-dimensional array, it may be stored as a two-dimensional array of small matrices (a rank-4 tensor). This improves data locality in matrix multiplications, and makes it possible to use BLAS3 routines while also exploiting part of the sparse structure of the matrices.

  Tile
    │       ┌──────┬──────┬──────┐
    │       │1 4 7 │      │      │
    └──────►│2 5 8 │ T_12 │ T_13 │
            │3 6 9 │      │      │
            ├──────┼──────┼──────┤
            │      │      │      │
            │ T_21 │ T_22 │ T_23 │
            │      │      │      │
            ├──────┼──────┼──────┤
            │      │      │      │
            │ T_31 │ T_32 │ T_33 │
            │      │      │      │
            └──────┴──────┴──────┘

In this file, tiled matrices will be produced for the following types:

float
double

Data structures

Tile

A tile is a small square matrix of fixed size. The dimension of the tiles is fixed at compile time to increase performance. The tile size is defined as $2^s$:

| s | tile size |
|---+-----------|
| 2 |         4 |
| 3 |         8 |
| 4 |        16 |
| 5 |        32 |
| 6 |        64 |
| 7 |       128 |
/* Compile-time tile geometry.
 * TILE_SIZE_SHIFT is the exponent s and TILE_SIZE the matching tile
 * dimension 2^s (here s = 3, so tiles are 8 x 8). The two values must be
 * changed together. */
#define TILE_SIZE_SHIFT 3
#define TILE_SIZE       8
/* Length unit used to size the padding member of the tile struct.
 * NOTE(review): presumably a SIMD vector width in elements -- confirm. */
#define VEC_SIZE        8
/* Template for one tile: a TILE_SIZE x TILE_SIZE block of $T$ values
 * followed by a flag and padding words. $T$ is replaced by the element
 * type when the template is instantiated. */
typedef struct $T$_tile_struct {
/* Values stored in the tile. */
$T$ element[TILE_SIZE][TILE_SIZE];
/* NOTE(review): presumably non-zero when the tile holds only zeros and can
 * be skipped -- the code shown here never reads or writes it; confirm. */
int64_t is_null;
/* Unused words; together with is_null they occupy VEC_SIZE 64-bit words.
 * NOTE(review): presumably keeps consecutive tiles aligned -- confirm. */
int64_t padding[VEC_SIZE-1];
} $T$_tile_struct;

Tiled matrix

A tiled matrix is a two-dimensional array of tiles.

/* Template for a tiled matrix: a two-dimensional array of tiles reached
 * through an array of column pointers. */
typedef struct $T$_tiled_matrix {
/* tile[j] points to the first tile of tile-column j; $T$_tiled_matrix_init
 * makes the columns consecutive slices of one buffer, so tile[j][i] is the
 * tile at tile-row i, tile-column j. */
$T$_tile_struct** tile;
/* NOTE(review): presumably the matrix dimensions counted in elements;
 * $T$_tiled_matrix_init does not assign these two members -- confirm who
 * sets them. */
size_t n_row;
size_t n_col;
/* Number of tiles along each direction, as passed to the init function. */
size_t n_tile_row;
size_t n_tile_col;
} $T$_tiled_matrix;

When a tiled matrix is initialized, it is set to zero.

/* Allocate the storage of the tiled matrix m for n_tile_row x n_tile_col
 * tiles and record both counts in m.
 * m must not be NULL and both counts must be non-zero; otherwise an error
 * is reported through qmckl_failwith. Returns QMCKL_SUCCESS on success. */
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
                                     $T$_tiled_matrix* m,
                                     size_t n_tile_row,
                                     size_t n_tile_col);
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
                                     $T$_tiled_matrix* m,
                                     size_t n_tile_row,
                                     size_t n_tile_col) {

if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
} $T$_tiled_matrix;

When a tiled matrix is initialized, it is set to zero.

/* Allocate the storage of the tiled matrix m for n_tile_row x n_tile_col
 * tiles and record both counts in m.
 * m must not be NULL and both counts must be non-zero; otherwise an error
 * is reported through qmckl_failwith. Returns QMCKL_SUCCESS on success. */
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
                                     $T$_tiled_matrix* m,
                                     size_t n_tile_row,
                                     size_t n_tile_col);
/* Allocate the storage of a tiled matrix of n_tile_row x n_tile_col tiles.
 *
 * Two buffers are requested from qmckl_malloc: an array of n_tile_col
 * column pointers (m->tile) and one contiguous buffer holding all the
 * tiles. Column j occupies the j-th slice of n_tile_row tiles, so
 * m->tile[j][i] is the tile at tile-row i, tile-column j.
 *
 * context    : checked with qmckl_context_check; QMCKL_INVALID_CONTEXT is
 *              returned when the check yields QMCKL_NULL_CONTEXT.
 * m          : matrix to set up; NULL is reported as QMCKL_INVALID_ARG_2.
 * n_tile_row : number of tile rows; 0 is reported as QMCKL_INVALID_ARG_3.
 * n_tile_col : number of tile columns; 0 is reported as QMCKL_INVALID_ARG_4.
 *
 * A size overflow or a failed allocation is reported through
 * qmckl_failwith with QMCKL_ALLOCATION_FAILED. In that case no memory stays
 * allocated by this call and m->tile is NULL once an allocation has been
 * attempted. Returns QMCKL_SUCCESS otherwise.
 *
 * NOTE(review): the tiles are not cleared explicitly here; whether they
 * start out as zero depends on qmckl_malloc -- confirm.
 * NOTE(review): the n_row and n_col members of $T$_tiled_matrix are not
 * assigned by this function -- confirm who sets them. */
qmckl_exit_code $T$_tiled_matrix_init (qmckl_context context,
                                     $T$_tiled_matrix* m,
                                     size_t n_tile_row,
                                     size_t n_tile_col) {

if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
  return QMCKL_INVALID_CONTEXT;
}

if (m == NULL) {
  return qmckl_failwith(context,
                        QMCKL_INVALID_ARG_2,
                        "$T$_tiled_matrix_init",
                        NULL);
}

if (n_tile_row == (size_t) 0) {
  return qmckl_failwith(context,
                        QMCKL_INVALID_ARG_3,
                        "$T$_tiled_matrix_init",
                        NULL);
}

if (n_tile_col == (size_t) 0) {
  return qmckl_failwith(context,
                        QMCKL_INVALID_ARG_4,
                        "$T$_tiled_matrix_init",
                        NULL);
}

qmckl_memory_info_struct info = qmckl_memory_info_struct_zero;
size_t n = n_tile_row * n_tile_col;

/* Check overflow of the tile count and of the byte size of the tile buffer.
   Since n >= n_tile_col and a tile is larger than a pointer, this also
   covers the size of the column-pointer array. */
if (n/n_tile_col != n_tile_row
    || n > SIZE_MAX / sizeof($T$_tile_struct) ) {
  return qmckl_failwith(context,
                        QMCKL_ALLOCATION_FAILED,
                        "$T$_tiled_matrix_init",
                        "n_tile_row * n_tile_col overflows" );
}

/* Allocate array of column pointers */
info.size = n_tile_col * sizeof($T$_tile_struct*) ;
m->tile = ($T$_tile_struct**) qmckl_malloc(context, info);

if (m->tile == NULL) {
  return qmckl_failwith(context,
                        QMCKL_ALLOCATION_FAILED,
                        "$T$_tiled_matrix_init",
                        NULL);
}


/* Allocate array of tiles */
info.size = n * sizeof($T$_tile_struct) ;
m->tile[0] = ($T$_tile_struct*) qmckl_malloc(context, info);

if (m->tile[0] == NULL) {
  /* Give back the column-pointer array so that it is not leaked and m does
     not keep a pointer to a half-built matrix. The status of qmckl_free is
     ignored: the allocation failure is the error being reported. */
  (void) qmckl_free(context, m->tile);
  m->tile = NULL;
  return qmckl_failwith(context,
                        QMCKL_ALLOCATION_FAILED,
                        "$T$_tiled_matrix_init",
                        NULL);
}

/* Compute array of pointers to the 1st element of columns */
for (size_t i=1 ; i<n_tile_col ; ++i) {
  m->tile[i] = m->tile[i-1] + n_tile_row;
}

m->n_tile_row = n_tile_row;
m->n_tile_col = n_tile_col;
return QMCKL_SUCCESS;
}

Write templates

/* One tile of a single-precision tiled matrix: a TILE_SIZE x TILE_SIZE
 * block of floats followed by a flag and one padding word. */
typedef struct float_tile_struct {
  /* Values stored in the tile. */
  float element[TILE_SIZE][TILE_SIZE];
  /* NOTE(review): presumably non-zero when the tile holds only zeros and
   * can be skipped -- the code shown here never reads or writes it;
   * confirm. */
  int32_t is_null;
  /* Unused word following the flag. */
  int32_t padding;
} float_tile_struct;

/* Single-precision tiled matrix: a two-dimensional array of tiles reached
 * through an array of column pointers. */
typedef struct float_tiled_matrix {
  /* tile[j] points to the first tile of tile-column j;
   * float_tiled_matrix_init makes the columns consecutive slices of one
   * buffer, so tile[j][i] is the tile at tile-row i, tile-column j. */
  float_tile_struct** tile;
  /* Number of tiles along each direction, as passed to the init function. */
  size_t n_tile_row;
  size_t n_tile_col;
} float_tiled_matrix;



/* One tile of a double-precision tiled matrix: a TILE_SIZE x TILE_SIZE
 * block of doubles followed by a flag and one padding word. */
typedef struct double_tile_struct {
  /* Values stored in the tile. */
  double element[TILE_SIZE][TILE_SIZE];
  /* NOTE(review): presumably non-zero when the tile holds only zeros and
   * can be skipped -- the code shown here never reads or writes it;
   * confirm. */
  int32_t is_null;
  /* Unused word following the flag. */
  int32_t padding;
} double_tile_struct;

/* Double-precision tiled matrix: a two-dimensional array of tiles reached
 * through an array of column pointers. */
typedef struct double_tiled_matrix {
  /* tile[j] points to the first tile of tile-column j;
   * double_tiled_matrix_init makes the columns consecutive slices of one
   * buffer, so tile[j][i] is the tile at tile-row i, tile-column j. */
  double_tile_struct** tile;
  /* Number of tiles along each direction, as passed to the init function. */
  size_t n_tile_row;
  size_t n_tile_col;
} double_tiled_matrix;
/* Allocate the storage of the single-precision tiled matrix m for
 * n_tile_row x n_tile_col tiles and record both counts in m.
 * m must not be NULL and both counts must be non-zero; otherwise an error
 * is reported through qmckl_failwith. Returns QMCKL_SUCCESS on success. */
qmckl_exit_code float_tiled_matrix_init (qmckl_context context,
                                       float_tiled_matrix* m,
                                       size_t n_tile_row,
                                       size_t n_tile_col);



/* Allocate the storage of the double-precision tiled matrix m for
 * n_tile_row x n_tile_col tiles and record both counts in m.
 * m must not be NULL and both counts must be non-zero; otherwise an error
 * is reported through qmckl_failwith. Returns QMCKL_SUCCESS on success. */
qmckl_exit_code double_tiled_matrix_init (qmckl_context context,
                                       double_tiled_matrix* m,
                                       size_t n_tile_row,
                                       size_t n_tile_col);
/* Allocate the storage of a single-precision tiled matrix of
 * n_tile_row x n_tile_col tiles.
 *
 * Two buffers are requested from qmckl_malloc: an array of n_tile_col
 * column pointers (m->tile) and one contiguous buffer holding all the
 * tiles. Column j occupies the j-th slice of n_tile_row tiles, so
 * m->tile[j][i] is the tile at tile-row i, tile-column j.
 *
 * context    : checked with qmckl_context_check; QMCKL_INVALID_CONTEXT is
 *              returned when the check yields QMCKL_NULL_CONTEXT.
 * m          : matrix to set up; NULL is reported as QMCKL_INVALID_ARG_2.
 * n_tile_row : number of tile rows; 0 is reported as QMCKL_INVALID_ARG_3.
 * n_tile_col : number of tile columns; 0 is reported as QMCKL_INVALID_ARG_4.
 *
 * A size overflow or a failed allocation is reported through
 * qmckl_failwith with QMCKL_ALLOCATION_FAILED. In that case no memory stays
 * allocated by this call and m->tile is NULL once an allocation has been
 * attempted. Returns QMCKL_SUCCESS otherwise.
 *
 * NOTE(review): the tiles are not cleared explicitly here; whether they
 * start out as zero depends on qmckl_malloc -- confirm. */
qmckl_exit_code float_tiled_matrix_init (qmckl_context context,
                                       float_tiled_matrix* m,
                                       size_t n_tile_row,
                                       size_t n_tile_col) {

  if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (m == NULL) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_2,
                          "float_tiled_matrix_init",
                          NULL);
  }

  if (n_tile_row == (size_t) 0) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_3,
                          "float_tiled_matrix_init",
                          NULL);
  }

  if (n_tile_col == (size_t) 0) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_4,
                          "float_tiled_matrix_init",
                          NULL);
  }

  qmckl_memory_info_struct info = qmckl_memory_info_struct_zero;
  size_t n = n_tile_row * n_tile_col;

  /* Check overflow of the tile count and of the byte size of the tile
     buffer. Since n >= n_tile_col and a tile is larger than a pointer, this
     also covers the size of the column-pointer array. */
  if (n/n_tile_col != n_tile_row
      || n > SIZE_MAX / sizeof(float_tile_struct) ) {
    return qmckl_failwith(context,
                          QMCKL_ALLOCATION_FAILED,
                          "float_tiled_matrix_init",
                          "n_tile_row * n_tile_col overflows" );
  }

  /* Allocate array of column pointers */
  info.size = n_tile_col * sizeof(float_tile_struct*) ;
  m->tile = (float_tile_struct**) qmckl_malloc(context, info);

  if (m->tile == NULL) {
    return qmckl_failwith(context,
                          QMCKL_ALLOCATION_FAILED,
                          "float_tiled_matrix_init",
                          NULL);
  }


  /* Allocate array of tiles */
  info.size = n * sizeof(float_tile_struct) ;
  m->tile[0] = (float_tile_struct*) qmckl_malloc(context, info);

  if (m->tile[0] == NULL) {
    /* Give back the column-pointer array so that it is not leaked and m
       does not keep a pointer to a half-built matrix. The status of
       qmckl_free is ignored: the allocation failure is the error being
       reported. */
    (void) qmckl_free(context, m->tile);
    m->tile = NULL;
    return qmckl_failwith(context,
                          QMCKL_ALLOCATION_FAILED,
                          "float_tiled_matrix_init",
                          NULL);
  }

  /* Compute array of pointers to the 1st element of columns */
  for (size_t i=1 ; i<n_tile_col ; ++i) {
    m->tile[i] = m->tile[i-1] + n_tile_row;
  }

  m->n_tile_row = n_tile_row;
  m->n_tile_col = n_tile_col;
  return QMCKL_SUCCESS;
}




/* Allocate the storage of a double-precision tiled matrix of
 * n_tile_row x n_tile_col tiles.
 *
 * Two buffers are requested from qmckl_malloc: an array of n_tile_col
 * column pointers (m->tile) and one contiguous buffer holding all the
 * tiles. Column j occupies the j-th slice of n_tile_row tiles, so
 * m->tile[j][i] is the tile at tile-row i, tile-column j.
 *
 * context    : checked with qmckl_context_check; QMCKL_INVALID_CONTEXT is
 *              returned when the check yields QMCKL_NULL_CONTEXT.
 * m          : matrix to set up; NULL is reported as QMCKL_INVALID_ARG_2.
 * n_tile_row : number of tile rows; 0 is reported as QMCKL_INVALID_ARG_3.
 * n_tile_col : number of tile columns; 0 is reported as QMCKL_INVALID_ARG_4.
 *
 * A size overflow or a failed allocation is reported through
 * qmckl_failwith with QMCKL_ALLOCATION_FAILED. In that case no memory stays
 * allocated by this call and m->tile is NULL once an allocation has been
 * attempted. Returns QMCKL_SUCCESS otherwise.
 *
 * NOTE(review): the tiles are not cleared explicitly here; whether they
 * start out as zero depends on qmckl_malloc -- confirm. */
qmckl_exit_code double_tiled_matrix_init (qmckl_context context,
                                       double_tiled_matrix* m,
                                       size_t n_tile_row,
                                       size_t n_tile_col) {

  if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) {
    return QMCKL_INVALID_CONTEXT;
  }

  if (m == NULL) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_2,
                          "double_tiled_matrix_init",
                          NULL);
  }

  if (n_tile_row == (size_t) 0) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_3,
                          "double_tiled_matrix_init",
                          NULL);
  }

  if (n_tile_col == (size_t) 0) {
    return qmckl_failwith(context,
                          QMCKL_INVALID_ARG_4,
                          "double_tiled_matrix_init",
                          NULL);
  }

  qmckl_memory_info_struct info = qmckl_memory_info_struct_zero;
  size_t n = n_tile_row * n_tile_col;

  /* Check overflow of the tile count and of the byte size of the tile
     buffer. Since n >= n_tile_col and a tile is larger than a pointer, this
     also covers the size of the column-pointer array. */
  if (n/n_tile_col != n_tile_row
      || n > SIZE_MAX / sizeof(double_tile_struct) ) {
    return qmckl_failwith(context,
                          QMCKL_ALLOCATION_FAILED,
                          "double_tiled_matrix_init",
                          "n_tile_row * n_tile_col overflows" );
  }

  /* Allocate array of column pointers */
  info.size = n_tile_col * sizeof(double_tile_struct*) ;
  m->tile = (double_tile_struct**) qmckl_malloc(context, info);

  if (m->tile == NULL) {
    return qmckl_failwith(context,
                          QMCKL_ALLOCATION_FAILED,
                          "double_tiled_matrix_init",
                          NULL);
  }


  /* Allocate array of tiles */
  info.size = n * sizeof(double_tile_struct) ;
  m->tile[0] = (double_tile_struct*) qmckl_malloc(context, info);

  if (m->tile[0] == NULL) {
    /* Give back the column-pointer array so that it is not leaked and m
       does not keep a pointer to a half-built matrix. The status of
       qmckl_free is ignored: the allocation failure is the error being
       reported. */
    (void) qmckl_free(context, m->tile);
    m->tile = NULL;
    return qmckl_failwith(context,
                          QMCKL_ALLOCATION_FAILED,
                          "double_tiled_matrix_init",
                          NULL);
  }

  /* Compute array of pointers to the 1st element of columns */
  for (size_t i=1 ; i<n_tile_col ; ++i) {
    m->tile[i] = m->tile[i-1] + n_tile_row;
  }

  m->n_tile_row = n_tile_row;
  m->n_tile_col = n_tile_col;
  return QMCKL_SUCCESS;
}