1
0
mirror of https://github.com/TREX-CoE/trexio.git synced 2025-01-05 02:48:59 +01:00

optimize storage of indices depending on the size_max variable [TEXT]

This commit is contained in:
q-posev 2021-12-17 15:32:03 +01:00
parent 1e457c497d
commit 2639b76a6e
3 changed files with 84 additions and 45 deletions

View File

@ -2455,7 +2455,7 @@ trexio_read_$group_dset$(trexio_t* const file,
switch (file->back_end) { switch (file->back_end) {
case TREXIO_TEXT: case TREXIO_TEXT:
rc = trexio_text_read_$group_dset$(file, offset_file, *buffer_size, size_max, &eof_read_size, index_sparse, value_sparse); rc = trexio_text_read_$group_dset$(file, offset_file, *buffer_size, num, &eof_read_size, index_sparse, value_sparse);
break; break;
case TREXIO_HDF5: case TREXIO_HDF5:
@ -2570,7 +2570,7 @@ trexio_write_$group_dset$(trexio_t* const file,
switch (file->back_end) { switch (file->back_end) {
case TREXIO_TEXT: case TREXIO_TEXT:
rc = trexio_text_write_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse_p, value_sparse); rc = trexio_text_write_$group_dset$(file, offset_file, buffer_size, num, size_max, index_sparse_p, value_sparse);
break; break;
case TREXIO_HDF5: case TREXIO_HDF5:

View File

@ -1014,7 +1014,7 @@ trexio_text_has_$group_str$ (trexio_t* const file)
#+begin_src c :tangle hrw_dset_sparse_text.h :exports none #+begin_src c :tangle hrw_dset_sparse_text.h :exports none
trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file); trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse); trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse); trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int64_t size_start, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max); trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
#+end_src #+end_src
@ -1024,6 +1024,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
const int64_t offset_file, const int64_t offset_file,
const int64_t size, const int64_t size,
const int64_t size_max, const int64_t size_max,
const int64_t size_start,
const int32_t* index_sparse, const int32_t* index_sparse,
const double* value_sparse) const double* value_sparse)
{ {
@ -1045,41 +1046,49 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
FILE* f = fopen(file_full_path, "a"); FILE* f = fopen(file_full_path, "a");
if(f == NULL) return TREXIO_FILE_ERROR; if(f == NULL) return TREXIO_FILE_ERROR;
/* Specify the line length in order to offset properly. For example, for 4-index quantities /* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char. the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char.
CURRENTLY NO OFFSET IS USED WHEN WRITING ! CURRENTLY NO OFFSET IS USED WHEN WRITING !
*/ ,*/
const int64_t line_length = $group_dset_sparse_line_length$L; int64_t line_length = 0L;
char format_str[256] = "\0";
/* Determine the optimal type for storing indices depending on the size_max (usually mo_num or ao_num) */
if (size_max < UINT8_MAX) {
line_length = $sparse_line_length_8$; // 41 for 4 indices
strncpy(format_str, $sparse_format_printf_8$, 256);
} else if (size_max < UINT16_MAX) {
line_length = $sparse_line_length_16$; // 49 for 4 indices
strncpy(format_str, $sparse_format_printf_16$, 256);
} else {
line_length = $sparse_line_length_32$; //69 for 4 indices
strncpy(format_str, $sparse_format_printf_32$, 256);
}
strncat(format_str, "\n", 2);
/* Get the starting position of the IO stream to be written in the .size file. /* Get the starting position of the IO stream to be written in the .size file.
This is error-prone due to the fact that for large files (>2 GB) in 32-bit systems ftell will fail. This is error-prone due to the fact that for large files (>2 GB) in 32-bit systems ftell will fail.
One can use ftello function which is adapted for large files. One can use ftello function which is adapted for large files.
For now, we can use front-end-provided size_max, which has been checked for INT64_MAX overflow. For now, we can use front-end-provided size_start, which has been checked for INT64_MAX overflow.
*/ */
//int64_t io_start_pos = (int64_t) ftell(f); int64_t io_start_pos = size_start * line_length;
int64_t io_start_pos = size_max * line_length;
//fseek(f, (long) offset_file * line_length, SEEK_SET);
/* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */ /* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */
int rc; int rc;
for (uint64_t i=0UL; i<size; ++i) { for (uint64_t i=0UL; i<size; ++i) {
rc = fprintf(f, format_str,
rc = fprintf(f, "$group_dset_format_printf$\n", $group_dset_sparse_indices_printf$,
$group_dset_sparse_indices_printf$, *(value_sparse + i));
*(value_sparse + i)); if(rc <= 0) {
fclose(f);
if(rc <= 0) { return TREXIO_FAILURE;
fclose(f); }
return TREXIO_FAILURE;
}
} }
/* Close the TXT file */ /* Close the TXT file */
rc = fclose(f); rc = fclose(f);
if(rc != 0) return TREXIO_FILE_ERROR; if (rc != 0) return TREXIO_FILE_ERROR;
/* Append .size to the file_full_path in order to write additional info about the written buffer of data */ /* Append .size to the file_full_path in order to write additional info about the written buffer of data */
strncat(file_full_path, ".size", 6); strncat(file_full_path, ".size", 6);
@ -1097,8 +1106,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
/* Close the TXT file */ /* Close the TXT file */
rc = fclose(f_wSize); rc = fclose(f_wSize);
if(rc != 0) return TREXIO_FILE_ERROR; if (rc != 0) return TREXIO_FILE_ERROR;
/* Exit upon success */ /* Exit upon success */
return TREXIO_SUCCESS; return TREXIO_SUCCESS;
@ -1120,7 +1128,7 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Build the name of the file with sparse data. /* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed? The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
*/ ,*/
const char $group_dset$_file_name[256] = "/$group_dset$.txt"; const char $group_dset$_file_name[256] = "/$group_dset$.txt";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */ /* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH]; char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
@ -1137,9 +1145,18 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Specify the line length in order to offset properly. For example, for 4-index quantities /* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char
*/ ,*/
const uint64_t line_length = $group_dset_sparse_line_length$L; uint64_t line_length = 0UL;
/* Determine the line length depending on the size_max (usually mo_num or ao_num) */
if (size_max < UINT8_MAX) {
line_length = $sparse_line_length_8$; // 41 for 4 indices
} else if (size_max < UINT16_MAX) {
line_length = $sparse_line_length_16$; // 49 for 4 indices
} else {
line_length = $sparse_line_length_32$; //69 for 4 indices
}
/* Offset in the file according to the provided value of offset_file and optimal line_length */
fseek(f, (long) offset_file * line_length, SEEK_SET); fseek(f, (long) offset_file * line_length, SEEK_SET);
/* Read the data from the file and check the return code of fscanf to verify that > 0 bytes have been read or reached EOF */ /* Read the data from the file and check the return code of fscanf to verify that > 0 bytes have been read or reached EOF */

View File

@ -108,7 +108,9 @@ def recursive_populate_file(fname: str, paths: dict, detailed_source: dict) -> N
'group_num_dtype_default', 'group_num_dtype_double', 'group_num_dtype_single', 'group_num_dtype_default', 'group_num_dtype_double', 'group_num_dtype_single',
'group_num_h5_dtype', 'group_num_py_dtype', 'group_num_h5_dtype', 'group_num_py_dtype',
'group_dset_format_scanf', 'group_dset_format_printf', 'group_dset_sparse_dim', 'group_dset_format_scanf', 'group_dset_format_printf', 'group_dset_sparse_dim',
'group_dset_sparse_line_length', 'group_dset_sparse_indices_printf', 'group_dset_sparse_indices_scanf', 'group_dset_sparse_indices_printf', 'group_dset_sparse_indices_scanf',
'sparse_format_printf_8', 'sparse_format_printf_16', 'sparse_format_printf_32',
'sparse_line_length_8', 'sparse_line_length_16', 'sparse_line_length_32',
'group_dset', 'group_num', 'group_str', 'group'] 'group_dset', 'group_num', 'group_str', 'group']
for item in detailed_source.keys(): for item in detailed_source.keys():
@ -468,7 +470,9 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
dtype (str) : dtype corresponding to the trex.json (i.e. int/dim/float/float sparse/str) dtype (str) : dtype corresponding to the trex.json (i.e. int/dim/float/float sparse/str)
target (str) : `num` or `dset` target (str) : `num` or `dset`
rank (int) : [optional] value of n in n-index (sparse) dset; needed to build the printf/scanf format string rank (int) : [optional] value of n in n-index (sparse) dset; needed to build the printf/scanf format string
int_len_printf (int): [optional] length reserved for one index when printing n-index (sparse) dset (e.g. 10 for int32_t) int_len_printf(dict): [optional]
keys: precision (e.g. 32 for int32_t)
values: lengths reserved for one index when printing n-index (sparse) dset (e.g. 10 for int32_t)
Returns: Returns:
dtype_dict (dict) : dictionary dtype-related substitutions dtype_dict (dict) : dictionary dtype-related substitutions
@ -480,8 +484,8 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
raise Exception("Both rank and int_len_printf arguments has to be provided to build the dtype_dict for sparse data.") raise Exception("Both rank and int_len_printf arguments has to be provided to build the dtype_dict for sparse data.")
if rank is not None and rank <= 1: if rank is not None and rank <= 1:
raise Exception('Rank of sparse quantity cannot be lower than 2.') raise Exception('Rank of sparse quantity cannot be lower than 2.')
if int_len_printf is not None and int_len_printf <= 0: if int_len_printf is not None and not isinstance(int_len_printf, dict):
raise Exception('Length of an index of sparse quantity has to be positive value.') raise Exception('int_len_printf has to be a dictionary of lengths for different precisions.')
dtype_dict = {} dtype_dict = {}
# set up the key-value pairs depending on the dtype # set up the key-value pairs depending on the dtype
@ -532,15 +536,23 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
}) })
elif 'sparse' in dtype: elif 'sparse' in dtype:
# build format string for n-index sparse quantity # build format string for n-index sparse quantity
item_printf = f'%{int_len_printf}" PRId32 " ' item_printf_8 = f'%{int_len_printf[8]}" PRIu8 " '
item_printf_16 = f'%{int_len_printf[16]}" PRIu16 " '
item_printf_32 = f'%{int_len_printf[32]}" PRId32 " '
item_scanf = '%" SCNd32 " ' item_scanf = '%" SCNd32 " '
group_dset_format_printf = '' group_dset_format_printf_8 = '"'
group_dset_format_printf_16 = '"'
group_dset_format_printf_32 = '"'
group_dset_format_scanf = '' group_dset_format_scanf = ''
for i in range(rank): for i in range(rank):
group_dset_format_printf += item_printf group_dset_format_printf_8 += item_printf_8
group_dset_format_printf_16 += item_printf_16
group_dset_format_printf_32 += item_printf_32
group_dset_format_scanf += item_scanf group_dset_format_scanf += item_scanf
# append the format string for float values # append the format string for float values
group_dset_format_printf += '%24.16e' group_dset_format_printf_8 += '%24.16e" '
group_dset_format_printf_16 += '%24.16e" '
group_dset_format_printf_32 += '%24.16e" '
group_dset_format_scanf += '%lf' group_dset_format_scanf += '%lf'
# set up the dictionary for sparse # set up the dictionary for sparse
@ -554,7 +566,9 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
f'group_{target}_dtype_default' : '', f'group_{target}_dtype_default' : '',
f'group_{target}_dtype_double' : '', f'group_{target}_dtype_double' : '',
f'group_{target}_dtype_single' : '', f'group_{target}_dtype_single' : '',
f'group_{target}_format_printf' : group_dset_format_printf, f'sparse_format_printf_8' : group_dset_format_printf_8,
f'sparse_format_printf_16' : group_dset_format_printf_16,
f'sparse_format_printf_32' : group_dset_format_printf_32,
f'group_{target}_format_scanf' : group_dset_format_scanf, f'group_{target}_format_scanf' : group_dset_format_scanf,
f'group_{target}_py_dtype' : '' f'group_{target}_py_dtype' : ''
}) })
@ -664,17 +678,18 @@ def split_dset_dict_detailed (datasets: dict) -> tuple:
# define whether the dset is sparse # define whether the dset is sparse
is_sparse = False is_sparse = False
int_len_printf = {}
if 'sparse' in datatype: if 'sparse' in datatype:
is_sparse = True is_sparse = True
int32_len_printf = 10 int_len_printf[32] = 10
# int64_len_printf = ?? int_len_printf[16] = 5
# int16_len_printf = ?? int_len_printf[8] = 3
# get the dtype-related substitutions required to replace templated variables later # get the dtype-related substitutions required to replace templated variables later
if not is_sparse: if not is_sparse:
dtype_dict = get_dtype_dict(datatype, 'dset') dtype_dict = get_dtype_dict(datatype, 'dset')
else: else:
dtype_dict = get_dtype_dict(datatype, 'dset', rank, int32_len_printf) dtype_dict = get_dtype_dict(datatype, 'dset', rank, int_len_printf)
tmp_dict.update(dtype_dict) tmp_dict.update(dtype_dict)
@ -713,20 +728,27 @@ def split_dset_dict_detailed (datasets: dict) -> tuple:
index_printf = f'*(index_sparse + {str(rank)}*i' index_printf = f'*(index_sparse + {str(rank)}*i'
index_scanf = f'index_sparse + {str(rank)}*i' index_scanf = f'index_sparse + {str(rank)}*i'
# one index item consumes up to index_length characters (int32_len_printf for int32 + 1 for space) # one index item consumes up to index_length characters (int32_len_printf for int32 + 1 for space)
index_len = int32_len_printf + 1
group_dset_sparse_indices_printf = index_printf + ')' group_dset_sparse_indices_printf = index_printf + ')'
group_dset_sparse_indices_scanf = index_scanf group_dset_sparse_indices_scanf = index_scanf
group_dset_sparse_line_len = index_len sparse_line_length_32 = int_len_printf[32] + 1
sparse_line_length_16 = int_len_printf[16] + 1
sparse_line_length_8 = int_len_printf[8] + 1
# loop from 1 because we already have stored one index # loop from 1 because we already have stored one index
for index_count in range(1,rank): for index_count in range(1,rank):
group_dset_sparse_indices_printf += f', {index_printf} + {index_count})' group_dset_sparse_indices_printf += f', {index_printf} + {index_count})'
group_dset_sparse_indices_scanf += f', {index_scanf} + {index_count}' group_dset_sparse_indices_scanf += f', {index_scanf} + {index_count}'
group_dset_sparse_line_len += index_len sparse_line_length_32 += int_len_printf[32] + 1
sparse_line_length_16 += int_len_printf[16] + 1
sparse_line_length_8 += int_len_printf[8] + 1
# add 24 chars occupied by the floating point value of sparse dataset + 1 char for "\n" # add 24 chars occupied by the floating point value of sparse dataset + 1 char for "\n"
group_dset_sparse_line_len += 24 + 1 sparse_line_length_32 += 24 + 1
sparse_line_length_16 += 24 + 1
sparse_line_length_8 += 24 + 1
tmp_dict['group_dset_sparse_line_length'] = str(group_dset_sparse_line_len) tmp_dict['sparse_line_length_32'] = str(sparse_line_length_32)
tmp_dict['sparse_line_length_16'] = str(sparse_line_length_16)
tmp_dict['sparse_line_length_8'] = str(sparse_line_length_8)
tmp_dict['group_dset_sparse_indices_printf'] = group_dset_sparse_indices_printf tmp_dict['group_dset_sparse_indices_printf'] = group_dset_sparse_indices_printf
tmp_dict['group_dset_sparse_indices_scanf'] = group_dset_sparse_indices_scanf tmp_dict['group_dset_sparse_indices_scanf'] = group_dset_sparse_indices_scanf