mirror of
https://github.com/TREX-CoE/trexio.git
synced 2025-01-03 01:56:13 +01:00
optimize storage of indices depending on the size_max variable [TEXT]
This commit is contained in:
parent
1e457c497d
commit
2639b76a6e
@ -2455,7 +2455,7 @@ trexio_read_$group_dset$(trexio_t* const file,
|
||||
switch (file->back_end) {
|
||||
|
||||
case TREXIO_TEXT:
|
||||
rc = trexio_text_read_$group_dset$(file, offset_file, *buffer_size, size_max, &eof_read_size, index_sparse, value_sparse);
|
||||
rc = trexio_text_read_$group_dset$(file, offset_file, *buffer_size, num, &eof_read_size, index_sparse, value_sparse);
|
||||
break;
|
||||
|
||||
case TREXIO_HDF5:
|
||||
@ -2570,7 +2570,7 @@ trexio_write_$group_dset$(trexio_t* const file,
|
||||
switch (file->back_end) {
|
||||
|
||||
case TREXIO_TEXT:
|
||||
rc = trexio_text_write_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse_p, value_sparse);
|
||||
rc = trexio_text_write_$group_dset$(file, offset_file, buffer_size, num, size_max, index_sparse_p, value_sparse);
|
||||
break;
|
||||
|
||||
case TREXIO_HDF5:
|
||||
|
@ -1014,7 +1014,7 @@ trexio_text_has_$group_str$ (trexio_t* const file)
|
||||
#+begin_src c :tangle hrw_dset_sparse_text.h :exports none
|
||||
trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file);
|
||||
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
|
||||
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
|
||||
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int64_t size_start, const int32_t* index_sparse, const double* value_sparse);
|
||||
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
|
||||
#+end_src
|
||||
|
||||
@ -1024,6 +1024,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
|
||||
const int64_t offset_file,
|
||||
const int64_t size,
|
||||
const int64_t size_max,
|
||||
const int64_t size_start,
|
||||
const int32_t* index_sparse,
|
||||
const double* value_sparse)
|
||||
{
|
||||
@ -1045,41 +1046,49 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
|
||||
FILE* f = fopen(file_full_path, "a");
|
||||
if(f == NULL) return TREXIO_FILE_ERROR;
|
||||
|
||||
|
||||
/* Specify the line length in order to offset properly. For example, for 4-index quantities
|
||||
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char.
|
||||
CURRENTLY NO OFFSET IS USED WHEN WRITING !
|
||||
*/
|
||||
const int64_t line_length = $group_dset_sparse_line_length$L;
|
||||
,*/
|
||||
int64_t line_length = 0L;
|
||||
char format_str[256] = "\0";
|
||||
|
||||
/* Determine the optimal type for storing indices depending on the size_max (usually mo_num or ao_num) */
|
||||
if (size_max < UINT8_MAX) {
|
||||
line_length = $sparse_line_length_8$; // 41 for 4 indices
|
||||
strncpy(format_str, $sparse_format_printf_8$, 256);
|
||||
} else if (size_max < UINT16_MAX) {
|
||||
line_length = $sparse_line_length_16$; // 49 for 4 indices
|
||||
strncpy(format_str, $sparse_format_printf_16$, 256);
|
||||
} else {
|
||||
line_length = $sparse_line_length_32$; //69 for 4 indices
|
||||
strncpy(format_str, $sparse_format_printf_32$, 256);
|
||||
}
|
||||
strncat(format_str, "\n", 2);
|
||||
|
||||
/* Get the starting position of the IO stream to be written in the .size file.
|
||||
This is error-prone because for large files (>2 GB) on 32-bit systems ftell will fail.
|
||||
One can use ftello function which is adapted for large files.
|
||||
For now, we can use front-end-provided size_max, which has been checked for INT64_MAX overflow.
|
||||
For now, we can use front-end-provided size_start, which has been checked for INT64_MAX overflow.
|
||||
*/
|
||||
//int64_t io_start_pos = (int64_t) ftell(f);
|
||||
int64_t io_start_pos = size_max * line_length;
|
||||
|
||||
//fseek(f, (long) offset_file * line_length, SEEK_SET);
|
||||
int64_t io_start_pos = size_start * line_length;
|
||||
|
||||
/* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */
|
||||
int rc;
|
||||
for (uint64_t i=0UL; i<size; ++i) {
|
||||
|
||||
rc = fprintf(f, "$group_dset_format_printf$\n",
|
||||
$group_dset_sparse_indices_printf$,
|
||||
*(value_sparse + i));
|
||||
|
||||
if(rc <= 0) {
|
||||
fclose(f);
|
||||
return TREXIO_FAILURE;
|
||||
}
|
||||
|
||||
rc = fprintf(f, format_str,
|
||||
$group_dset_sparse_indices_printf$,
|
||||
*(value_sparse + i));
|
||||
if(rc <= 0) {
|
||||
fclose(f);
|
||||
return TREXIO_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
/* Close the TXT file */
|
||||
rc = fclose(f);
|
||||
if(rc != 0) return TREXIO_FILE_ERROR;
|
||||
|
||||
if (rc != 0) return TREXIO_FILE_ERROR;
|
||||
|
||||
/* Append .size to the file_full_path in order to write additional info about the written buffer of data */
|
||||
strncat(file_full_path, ".size", 6);
|
||||
@ -1097,8 +1106,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
|
||||
|
||||
/* Close the TXT file */
|
||||
rc = fclose(f_wSize);
|
||||
if(rc != 0) return TREXIO_FILE_ERROR;
|
||||
|
||||
if (rc != 0) return TREXIO_FILE_ERROR;
|
||||
|
||||
/* Exit upon success */
|
||||
return TREXIO_SUCCESS;
|
||||
@ -1120,7 +1128,7 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
|
||||
|
||||
/* Build the name of the file with sparse data.
|
||||
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
|
||||
*/
|
||||
,*/
|
||||
const char $group_dset$_file_name[256] = "/$group_dset$.txt";
|
||||
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
|
||||
char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
|
||||
@ -1137,9 +1145,18 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
|
||||
|
||||
/* Specify the line length in order to offset properly. For example, for 4-index quantities
|
||||
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char
|
||||
*/
|
||||
const uint64_t line_length = $group_dset_sparse_line_length$L;
|
||||
,*/
|
||||
uint64_t line_length = 0UL;
|
||||
/* Determine the line length depending on the size_max (usually mo_num or ao_num) */
|
||||
if (size_max < UINT8_MAX) {
|
||||
line_length = $sparse_line_length_8$; // 41 for 4 indices
|
||||
} else if (size_max < UINT16_MAX) {
|
||||
line_length = $sparse_line_length_16$; // 49 for 4 indices
|
||||
} else {
|
||||
line_length = $sparse_line_length_32$; //69 for 4 indices
|
||||
}
|
||||
|
||||
/* Offset in the file according to the provided value of offset_file and optimal line_length */
|
||||
fseek(f, (long) offset_file * line_length, SEEK_SET);
|
||||
|
||||
/* Read the data from the file and check the return code of fscanf to verify that > 0 bytes have been read or reached EOF */
|
||||
|
@ -108,7 +108,9 @@ def recursive_populate_file(fname: str, paths: dict, detailed_source: dict) -> N
|
||||
'group_num_dtype_default', 'group_num_dtype_double', 'group_num_dtype_single',
|
||||
'group_num_h5_dtype', 'group_num_py_dtype',
|
||||
'group_dset_format_scanf', 'group_dset_format_printf', 'group_dset_sparse_dim',
|
||||
'group_dset_sparse_line_length', 'group_dset_sparse_indices_printf', 'group_dset_sparse_indices_scanf',
|
||||
'group_dset_sparse_indices_printf', 'group_dset_sparse_indices_scanf',
|
||||
'sparse_format_printf_8', 'sparse_format_printf_16', 'sparse_format_printf_32',
|
||||
'sparse_line_length_8', 'sparse_line_length_16', 'sparse_line_length_32',
|
||||
'group_dset', 'group_num', 'group_str', 'group']
|
||||
|
||||
for item in detailed_source.keys():
|
||||
@ -468,7 +470,9 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
|
||||
dtype (str) : dtype corresponding to the trex.json (i.e. int/dim/float/float sparse/str)
|
||||
target (str) : `num` or `dset`
|
||||
rank (int) : [optional] value of n in n-index (sparse) dset; needed to build the printf/scanf format string
|
||||
int_len_printf (int): [optional] length reserved for one index when printing n-index (sparse) dset (e.g. 10 for int32_t)
|
||||
int_len_printf(dict): [optional]
|
||||
keys: precision (e.g. 32 for int32_t)
|
||||
values: lengths reserved for one index when printing n-index (sparse) dset (e.g. 10 for int32_t)
|
||||
|
||||
Returns:
|
||||
dtype_dict (dict) : dictionary dtype-related substitutions
|
||||
@ -480,8 +484,8 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
|
||||
raise Exception("Both rank and int_len_printf arguments has to be provided to build the dtype_dict for sparse data.")
|
||||
if rank is not None and rank <= 1:
|
||||
raise Exception('Rank of sparse quantity cannot be lower than 2.')
|
||||
if int_len_printf is not None and int_len_printf <= 0:
|
||||
raise Exception('Length of an index of sparse quantity has to be positive value.')
|
||||
if int_len_printf is not None and not isinstance(int_len_printf, dict):
|
||||
raise Exception('int_len_printf has to be a dictionary of lengths for different precisions.')
|
||||
|
||||
dtype_dict = {}
|
||||
# set up the key-value pairs depending on the dtype
|
||||
@ -532,15 +536,23 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
|
||||
})
|
||||
elif 'sparse' in dtype:
|
||||
# build format string for n-index sparse quantity
|
||||
item_printf = f'%{int_len_printf}" PRId32 " '
|
||||
item_printf_8 = f'%{int_len_printf[8]}" PRIu8 " '
|
||||
item_printf_16 = f'%{int_len_printf[16]}" PRIu16 " '
|
||||
item_printf_32 = f'%{int_len_printf[32]}" PRId32 " '
|
||||
item_scanf = '%" SCNd32 " '
|
||||
group_dset_format_printf = ''
|
||||
group_dset_format_printf_8 = '"'
|
||||
group_dset_format_printf_16 = '"'
|
||||
group_dset_format_printf_32 = '"'
|
||||
group_dset_format_scanf = ''
|
||||
for i in range(rank):
|
||||
group_dset_format_printf += item_printf
|
||||
group_dset_format_printf_8 += item_printf_8
|
||||
group_dset_format_printf_16 += item_printf_16
|
||||
group_dset_format_printf_32 += item_printf_32
|
||||
group_dset_format_scanf += item_scanf
|
||||
# append the format string for float values
|
||||
group_dset_format_printf += '%24.16e'
|
||||
group_dset_format_printf_8 += '%24.16e" '
|
||||
group_dset_format_printf_16 += '%24.16e" '
|
||||
group_dset_format_printf_32 += '%24.16e" '
|
||||
group_dset_format_scanf += '%lf'
|
||||
|
||||
# set up the dictionary for sparse
|
||||
@ -554,7 +566,9 @@ def get_dtype_dict (dtype: str, target: str, rank = None, int_len_printf = None)
|
||||
f'group_{target}_dtype_default' : '',
|
||||
f'group_{target}_dtype_double' : '',
|
||||
f'group_{target}_dtype_single' : '',
|
||||
f'group_{target}_format_printf' : group_dset_format_printf,
|
||||
f'sparse_format_printf_8' : group_dset_format_printf_8,
|
||||
f'sparse_format_printf_16' : group_dset_format_printf_16,
|
||||
f'sparse_format_printf_32' : group_dset_format_printf_32,
|
||||
f'group_{target}_format_scanf' : group_dset_format_scanf,
|
||||
f'group_{target}_py_dtype' : ''
|
||||
})
|
||||
@ -664,17 +678,18 @@ def split_dset_dict_detailed (datasets: dict) -> tuple:
|
||||
|
||||
# define whether the dset is sparse
|
||||
is_sparse = False
|
||||
int_len_printf = {}
|
||||
if 'sparse' in datatype:
|
||||
is_sparse = True
|
||||
int32_len_printf = 10
|
||||
# int64_len_printf = ??
|
||||
# int16_len_printf = ??
|
||||
int_len_printf[32] = 10
|
||||
int_len_printf[16] = 5
|
||||
int_len_printf[8] = 3
|
||||
|
||||
# get the dtype-related substitutions required to replace templated variables later
|
||||
if not is_sparse:
|
||||
dtype_dict = get_dtype_dict(datatype, 'dset')
|
||||
else:
|
||||
dtype_dict = get_dtype_dict(datatype, 'dset', rank, int32_len_printf)
|
||||
dtype_dict = get_dtype_dict(datatype, 'dset', rank, int_len_printf)
|
||||
|
||||
tmp_dict.update(dtype_dict)
|
||||
|
||||
@ -713,20 +728,27 @@ def split_dset_dict_detailed (datasets: dict) -> tuple:
|
||||
index_printf = f'*(index_sparse + {str(rank)}*i'
|
||||
index_scanf = f'index_sparse + {str(rank)}*i'
|
||||
# one index item consumes up to index_length characters (int32_len_printf for int32 + 1 for space)
|
||||
index_len = int32_len_printf + 1
|
||||
group_dset_sparse_indices_printf = index_printf + ')'
|
||||
group_dset_sparse_indices_scanf = index_scanf
|
||||
group_dset_sparse_line_len = index_len
|
||||
sparse_line_length_32 = int_len_printf[32] + 1
|
||||
sparse_line_length_16 = int_len_printf[16] + 1
|
||||
sparse_line_length_8 = int_len_printf[8] + 1
|
||||
# loop from 1 because we already have stored one index
|
||||
for index_count in range(1,rank):
|
||||
group_dset_sparse_indices_printf += f', {index_printf} + {index_count})'
|
||||
group_dset_sparse_indices_scanf += f', {index_scanf} + {index_count}'
|
||||
group_dset_sparse_line_len += index_len
|
||||
sparse_line_length_32 += int_len_printf[32] + 1
|
||||
sparse_line_length_16 += int_len_printf[16] + 1
|
||||
sparse_line_length_8 += int_len_printf[8] + 1
|
||||
|
||||
# add 24 chars occupied by the floating point value of sparse dataset + 1 char for "\n"
|
||||
group_dset_sparse_line_len += 24 + 1
|
||||
sparse_line_length_32 += 24 + 1
|
||||
sparse_line_length_16 += 24 + 1
|
||||
sparse_line_length_8 += 24 + 1
|
||||
|
||||
tmp_dict['group_dset_sparse_line_length'] = str(group_dset_sparse_line_len)
|
||||
tmp_dict['sparse_line_length_32'] = str(sparse_line_length_32)
|
||||
tmp_dict['sparse_line_length_16'] = str(sparse_line_length_16)
|
||||
tmp_dict['sparse_line_length_8'] = str(sparse_line_length_8)
|
||||
tmp_dict['group_dset_sparse_indices_printf'] = group_dset_sparse_indices_printf
|
||||
tmp_dict['group_dset_sparse_indices_scanf'] = group_dset_sparse_indices_scanf
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user