1
0
mirror of https://github.com/TREX-CoE/trexio.git synced 2025-01-03 01:56:13 +01:00

optimize storage of indices depending on the size_max variable [TEXT]

This commit is contained in:
q-posev 2021-12-17 15:32:03 +01:00
parent 1e457c497d
commit 2639b76a6e
3 changed files with 84 additions and 45 deletions

View File

@ -2455,7 +2455,7 @@ trexio_read_$group_dset$(trexio_t* const file,
switch (file->back_end) {
case TREXIO_TEXT:
rc = trexio_text_read_$group_dset$(file, offset_file, *buffer_size, size_max, &eof_read_size, index_sparse, value_sparse);
rc = trexio_text_read_$group_dset$(file, offset_file, *buffer_size, num, &eof_read_size, index_sparse, value_sparse);
break;
case TREXIO_HDF5:
@ -2570,7 +2570,7 @@ trexio_write_$group_dset$(trexio_t* const file,
switch (file->back_end) {
case TREXIO_TEXT:
rc = trexio_text_write_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse_p, value_sparse);
rc = trexio_text_write_$group_dset$(file, offset_file, buffer_size, num, size_max, index_sparse_p, value_sparse);
break;
case TREXIO_HDF5:

View File

@ -1014,7 +1014,7 @@ trexio_text_has_$group_str$ (trexio_t* const file)
#+begin_src c :tangle hrw_dset_sparse_text.h :exports none
trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int64_t size_start, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
#+end_src
@ -1024,6 +1024,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
const int64_t offset_file,
const int64_t size,
const int64_t size_max,
const int64_t size_start,
const int32_t* index_sparse,
const double* value_sparse)
{
@ -1045,41 +1046,49 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
FILE* f = fopen(file_full_path, "a");
if(f == NULL) return TREXIO_FILE_ERROR;
/* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char.
CURRENTLY NO OFFSET IS USED WHEN WRITING !
*/
const int64_t line_length = $group_dset_sparse_line_length$L;
,*/
int64_t line_length = 0L;
char format_str[256] = "\0";
/* Determine the optimal type for storing indices depending on the size_max (usually mo_num or ao_num) */
if (size_max < UINT8_MAX) {
line_length = $sparse_line_length_8$; // 41 for 4 indices
strncpy(format_str, $sparse_format_printf_8$, 256);
} else if (size_max < UINT16_MAX) {
line_length = $sparse_line_length_16$; // 49 for 4 indices
strncpy(format_str, $sparse_format_printf_16$, 256);
} else {
line_length = $sparse_line_length_32$; //69 for 4 indices
strncpy(format_str, $sparse_format_printf_32$, 256);
}
strncat(format_str, "\n", 2);
/* Get the starting position of the IO stream to be written in the .size file.
This is error-prone due to the fact that for large files (>2 GB) in 32-bit systems ftell will fail.
One can use ftello function which is adapted for large files.
For now, we can use front-end-provided size_max, which has been checked for INT64_MAX overflow.
For now, we can use front-end-provided size_start, which has been checked for INT64_MAX overflow.
*/
//int64_t io_start_pos = (int64_t) ftell(f);
int64_t io_start_pos = size_max * line_length;
//fseek(f, (long) offset_file * line_length, SEEK_SET);
int64_t io_start_pos = size_start * line_length;
/* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */
int rc;
for (uint64_t i=0UL; i<size; ++i) {
rc = fprintf(f, "$group_dset_format_printf$\n",
$group_dset_sparse_indices_printf$,
*(value_sparse + i));
if(rc <= 0) {
fclose(f);
return TREXIO_FAILURE;
}
rc = fprintf(f, format_str,
$group_dset_sparse_indices_printf$,
*(value_sparse + i));
if(rc <= 0) {
fclose(f);
return TREXIO_FAILURE;
}
}
/* Close the TXT file */
rc = fclose(f);
if(rc != 0) return TREXIO_FILE_ERROR;
if (rc != 0) return TREXIO_FILE_ERROR;
/* Append .size to the file_full_path in order to write additional info about the written buffer of data */
strncat(file_full_path, ".size", 6);
@ -1097,8 +1106,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
/* Close the TXT file */
rc = fclose(f_wSize);
if(rc != 0) return TREXIO_FILE_ERROR;
if (rc != 0) return TREXIO_FILE_ERROR;
/* Exit upon success */
return TREXIO_SUCCESS;
@ -1120,7 +1128,7 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
*/
,*/
const char $group_dset$_file_name[256] = "/$group_dset$.txt";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
@ -1137,9 +1145,18 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char
*/
const uint64_t line_length = $group_dset_sparse_line_length$L;
,*/
uint64_t line_length = 0UL;
/* Determine the line length depending on the size_max (usually mo_num or ao_num) */
if (size_max < UINT8_MAX) {
line_length = $sparse_line_length_8$; // 41 for 4 indices
} else if (size_max < UINT16_MAX) {
line_length = $sparse_line_length_16$; // 49 for 4 indices
} else {
line_length = $sparse_line_length_32$; //69 for 4 indices
}
/* Offset in the file according to the provided value of offset_file and optimal line_length */
fseek(f, (long) offset_file * line_length, SEEK_SET);
/* Read the data from the file and check the result of each read to verify that > 0 bytes have been read or that EOF has been reached */

View File

@ -108,7 +108,9 @@ def recursive_populate_file(fname: str, paths: dict, detailed_source: dict) -> N
'group_num_dtype_default', 'group_num_dtype_double', 'group_num_dtype_single',
'group_num_h5_dtype', 'group_num_py_dtype',
'group_dset_format_scanf', 'group_dset_format_printf', 'group_dset_sparse_dim',
'group_dset_sparse_line_length', 'group_dset_sparse_indices_printf', 'group_dset_sparse_indices_scanf',
'group_dset_sparse_indices_printf', 'group_dset_sparse_indices_scanf',
'sparse_format_printf_8', 'sparse_format_printf_16', 'sparse_format_printf_32',
'sparse_line_length_8', 'sparse_line_length_16', 'sparse_line_length_32',
'group_dset', 'group_num', 'group_str', 'group']
for item in detailed_source.keys():
@ -468,7 +470,9 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
dtype (str) : dtype corresponding to the trex.json (i.e. int/dim/float/float sparse/str)
target (str) : `num` or `dset`
rank (int) : [optional] value of n in n-index (sparse) dset; needed to build the printf/scanf format string
int_len_printf (int): [optional] length reserved for one index when printing n-index (sparse) dset (e.g. 10 for int32_t)
int_len_printf(dict): [optional]
keys: precision (e.g. 32 for int32_t)
values: lengths reserved for one index when printing n-index (sparse) dset (e.g. 10 for int32_t)
Returns:
dtype_dict (dict) : dictionary dtype-related substitutions
@ -480,8 +484,8 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
raise Exception("Both rank and int_len_printf arguments has to be provided to build the dtype_dict for sparse data.")
if rank is not None and rank <= 1:
raise Exception('Rank of sparse quantity cannot be lower than 2.')
if int_len_printf is not None and int_len_printf <= 0:
raise Exception('Length of an index of sparse quantity has to be positive value.')
if int_len_printf is not None and not isinstance(int_len_printf, dict):
raise Exception('int_len_printf has to be a dictionary of lengths for different precisions.')
dtype_dict = {}
# set up the key-value pairs depending on the dtype
@ -532,15 +536,23 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
})
elif 'sparse' in dtype:
# build format string for n-index sparse quantity
item_printf = f'%{int_len_printf}" PRId32 " '
item_printf_8 = f'%{int_len_printf[8]}" PRIu8 " '
item_printf_16 = f'%{int_len_printf[16]}" PRIu16 " '
item_printf_32 = f'%{int_len_printf[32]}" PRId32 " '
item_scanf = '%" SCNd32 " '
group_dset_format_printf = ''
group_dset_format_printf_8 = '"'
group_dset_format_printf_16 = '"'
group_dset_format_printf_32 = '"'
group_dset_format_scanf = ''
for i in range(rank):
group_dset_format_printf += item_printf
group_dset_format_printf_8 += item_printf_8
group_dset_format_printf_16 += item_printf_16
group_dset_format_printf_32 += item_printf_32
group_dset_format_scanf += item_scanf
# append the format string for float values
group_dset_format_printf += '%24.16e'
group_dset_format_printf_8 += '%24.16e" '
group_dset_format_printf_16 += '%24.16e" '
group_dset_format_printf_32 += '%24.16e" '
group_dset_format_scanf += '%lf'
# set up the dictionary for sparse
@ -554,7 +566,9 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
f'group_{target}_dtype_default' : '',
f'group_{target}_dtype_double' : '',
f'group_{target}_dtype_single' : '',
f'group_{target}_format_printf' : group_dset_format_printf,
f'sparse_format_printf_8' : group_dset_format_printf_8,
f'sparse_format_printf_16' : group_dset_format_printf_16,
f'sparse_format_printf_32' : group_dset_format_printf_32,
f'group_{target}_format_scanf' : group_dset_format_scanf,
f'group_{target}_py_dtype' : ''
})
@ -664,17 +678,18 @@ def split_dset_dict_detailed (datasets: dict) -> tuple:
# define whether the dset is sparse
is_sparse = False
int_len_printf = {}
if 'sparse' in datatype:
is_sparse = True
int32_len_printf = 10
# int64_len_printf = ??
# int16_len_printf = ??
int_len_printf[32] = 10
int_len_printf[16] = 5
int_len_printf[8] = 3
# get the dtype-related substitutions required to replace templated variables later
if not is_sparse:
dtype_dict = get_dtype_dict(datatype, 'dset')
else:
dtype_dict = get_dtype_dict(datatype, 'dset', rank, int32_len_printf)
dtype_dict = get_dtype_dict(datatype, 'dset', rank, int_len_printf)
tmp_dict.update(dtype_dict)
@ -713,20 +728,27 @@ def split_dset_dict_detailed (datasets: dict) -> tuple:
index_printf = f'*(index_sparse + {str(rank)}*i'
index_scanf = f'index_sparse + {str(rank)}*i'
# one index item consumes up to index_length characters (int32_len_printf for int32 + 1 for space)
index_len = int32_len_printf + 1
group_dset_sparse_indices_printf = index_printf + ')'
group_dset_sparse_indices_scanf = index_scanf
group_dset_sparse_line_len = index_len
sparse_line_length_32 = int_len_printf[32] + 1
sparse_line_length_16 = int_len_printf[16] + 1
sparse_line_length_8 = int_len_printf[8] + 1
# loop from 1 because we already have stored one index
for index_count in range(1,rank):
group_dset_sparse_indices_printf += f', {index_printf} + {index_count})'
group_dset_sparse_indices_scanf += f', {index_scanf} + {index_count}'
group_dset_sparse_line_len += index_len
sparse_line_length_32 += int_len_printf[32] + 1
sparse_line_length_16 += int_len_printf[16] + 1
sparse_line_length_8 += int_len_printf[8] + 1
# add 24 chars occupied by the floating point value of sparse dataset + 1 char for "\n"
group_dset_sparse_line_len += 24 + 1
sparse_line_length_32 += 24 + 1
sparse_line_length_16 += 24 + 1
sparse_line_length_8 += 24 + 1
tmp_dict['group_dset_sparse_line_length'] = str(group_dset_sparse_line_len)
tmp_dict['sparse_line_length_32'] = str(sparse_line_length_32)
tmp_dict['sparse_line_length_16'] = str(sparse_line_length_16)
tmp_dict['sparse_line_length_8'] = str(sparse_line_length_8)
tmp_dict['group_dset_sparse_indices_printf'] = group_dset_sparse_indices_printf
tmp_dict['group_dset_sparse_indices_scanf'] = group_dset_sparse_indices_scanf