1
0
mirror of https://github.com/TREX-CoE/trexio.git synced 2024-11-03 20:54:07 +01:00

only shift indices that have been read to the buffer (EOF case)

This commit is contained in:
q-posev 2021-12-10 14:02:05 +01:00
parent 7afb53be49
commit 9f5ee463e1
4 changed files with 77 additions and 43 deletions

View File

@ -2445,19 +2445,19 @@ trexio_read_$group_dset$(trexio_t* const file,
/* Read the max number of integrals stored in the file */
rc = trexio_read_$group_dset$_size(file, &size_max);
if (rc != TREXIO_SUCCESS) return rc;
/* Cannot read more data points than there is already in the file */
// TODO: YOU CAN AND WILL REACH EOF SO NO PROBLEM
if (buffer_size > size_max) return TREXIO_INVALID_ARG_3;
// introduce a new variable which will be modified with the number of integrals being read if EOF is encountered
uint64_t eof_read_size = 0UL;
switch (file->back_end) {
case TREXIO_TEXT:
rc = trexio_text_read_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse, value_sparse);
rc = trexio_text_read_$group_dset$(file, offset_file, buffer_size, size_max, &eof_read_size, index_sparse, value_sparse);
break;
case TREXIO_HDF5:
#ifdef HAVE_HDF5
rc = trexio_hdf5_read_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse, value_sparse);
rc = trexio_hdf5_read_$group_dset$(file, offset_file, buffer_size, size_max, &eof_read_size, index_sparse, value_sparse);
break;
#else
rc = TREXIO_BACK_END_MISSING;
@ -2471,21 +2471,18 @@ trexio_read_$group_dset$(trexio_t* const file,
rc = TREXIO_FAILURE; /* Impossible case */
}
if (rc != TREXIO_SUCCESS) return rc;
if (rc != TREXIO_SUCCESS && rc != TREXIO_END) return rc;
// shift indices to be one-based if Fortran API is used
// TODO :
// THIS WILL SHIFT ALL INDICES, HOWEVER IF EOF IS ENCOUNTERED THIS IS NOT DESIRABLE
// WE CAN MODIFY BY ADDRESS AND INT VALUE INDICATING THE NUMBER OF INTEGRALS READ BEFORE EOF
// AND ONLY SHIFT THEM !
if (file->one_based) {
uint64_t index_size = 4L*buffer_size;
// if EOF is reached - shift only indices that have been read, not an entire buffer
uint64_t index_size = (rc == TREXIO_END) ? (4UL*eof_read_size) : (4UL*buffer_size) ;
for (uint64_t i=0; i<index_size; ++i){
index_sparse[i] += 1;
}
}
return TREXIO_SUCCESS;
return rc;
}
#+end_src
@ -2551,7 +2548,7 @@ trexio_write_$group_dset$(trexio_t* const file,
// shift indices to be zero-based if Fortran API is used
if (file->one_based) {
uint64_t index_size = 4L*buffer_size;
uint64_t index_size = 4UL*buffer_size;
index_sparse_p = CALLOC(index_size, int32_t);
if (index_sparse_p == NULL) return TREXIO_ALLOCATION_FAILED;

View File

@ -379,7 +379,7 @@ trexio_hdf5_has_$group_dset$ (trexio_t* const file)
#+begin_src c :tangle hrw_dset_sparse_hdf5.h :exports none
trexio_exit_code trexio_hdf5_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_hdf5_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_hdf5_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_hdf5_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_hdf5_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
#+end_src
@ -483,11 +483,13 @@ trexio_hdf5_read_$group_dset$ (trexio_t* const file,
const int64_t offset_file,
const int64_t size,
const int64_t size_max,
uint64_t* const eof_read_size,
int32_t* const index_read,
double* const value_read)
{
if (file == NULL) return TREXIO_INVALID_ARG_1;
if (eof_read_size == NULL) return TREXIO_INVALID_ARG_5;
const trexio_hdf5_t* f = (const trexio_hdf5_t*) file;
@ -515,6 +517,8 @@ trexio_hdf5_read_$group_dset$ (trexio_t* const file,
eof_reachable = 1;
// lower the value of count to reduce the number of elements which will be read
count[0] -= max_offset - ddims[0];
// modify the eof_read_size accordingly
*eof_read_size = (uint64_t) (count[0] / 4UL);
}
herr_t status = H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);

View File

@ -520,16 +520,16 @@ trexio_text_read_$group$ (trexio_text_t* const file)
}
/* WARNING: this tmp array allows to avoid allocation of space for each element of array of string
, BUT it's size has to be number_of_str*max_len_str where max_len_str is somewhat arbitrary, e.g. 32.
,*/
BUT its size has to be number_of_str*max_len_str where max_len_str is somewhat arbitrary, e.g. 32.
*/
char* tmp_$group_dset$;
tmp_$group_dset$ = CALLOC(size_$group_dset$*32, char);
for (uint64_t i=0 ; i<size_$group_dset$ ; ++i) {
$group$->$group_dset$[i] = tmp_$group_dset$;
/* conventional fscanf with "%s" only returns the string before the first space character
,* to read string with spaces use "%[^\n]" possible with space before or after, i.e. " %[^\n]"
,* Q: depending on what ? */
* to read string with spaces use "%[^\n]" possible with space before or after, i.e. " %[^\n]"
* Q: depending on what ? */
rc = fscanf(f, " %1023[^\n]", tmp_$group_dset$);
assert(!(rc != 1));
if (rc != 1) {
@ -1003,21 +1003,29 @@ trexio_text_has_$group_str$ (trexio_t* const file)
#+end_src
** Template for has/read/write the dataset of sparse data
Each sparse array is stored in a separate =.txt= file due to the fact that sparse I/O has to be decoupled
from conventional write/read/flush behaviour of the TEXT back end. Chunks are used to read/write sparse data
to prevent memory overflow. Chunks have a given ~int64_t size~
(size specifies the number of sparse data items, e.g. integrals).
User provides indices and values of the sparse array as two separate variables.
#+begin_src c :tangle hrw_dset_sparse_text.h :exports none
trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
#+end_src
#+begin_src c :tangle write_dset_sparse_text.c
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
const int64_t offset_file,
const int64_t size,
const int64_t size_max,
const int32_t* index_sparse,
const double* value_sparse)
const int64_t offset_file,
const int64_t size,
const int64_t size_max,
const int32_t* index_sparse,
const double* value_sparse)
{
if (file == NULL) return TREXIO_INVALID_ARG_1;
@ -1040,14 +1048,14 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
/* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char.
CURRENTLY NO OFFSET IS USED WHEN WRITING !
*/
*/
const int64_t line_length = $group_dset_sparse_line_length$L;
/* Get the starting position of the IO stream to be written in the .size file.
This is error-prone due to the fact that for large files (>2 GB) in 32-bit systems ftell will fail.
One can use ftello function which is adapted for large files.
For now, we can use front-end-provided size_max, which has been checked for INT64_MAX overflow.
*/
*/
//int64_t io_start_pos = (int64_t) ftell(f);
int64_t io_start_pos = size_max * line_length;
@ -1055,7 +1063,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
/* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */
int rc;
for (uint64_t i=0L; i<size; ++i) {
for (uint64_t i=0UL; i<size; ++i) {
rc = fprintf(f, "$group_dset_format_printf$\n",
$group_dset_sparse_indices_printf$,
@ -1100,17 +1108,19 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
#+begin_src c :tangle read_dset_sparse_text.c
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
const int64_t offset_file,
const int64_t size,
const int64_t size_max,
int32_t* const index_sparse,
double* const value_sparse)
const int64_t offset_file,
const int64_t size,
const int64_t size_max,
uint64_t* const eof_read_size,
int32_t* const index_sparse,
double* const value_sparse)
{
if (file == NULL) return TREXIO_INVALID_ARG_1;
if (eof_read_size == NULL) return TREXIO_INVALID_ARG_5;
/* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
,*/
*/
const char $group_dset$_file_name[256] = "/$group_dset$.txt";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
@ -1127,7 +1137,7 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char
,*/
*/
const uint64_t line_length = $group_dset_sparse_line_length$L;
fseek(f, (long) offset_file * line_length, SEEK_SET);
@ -1135,13 +1145,15 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Read the data from the file line by line: fgets returning NULL is treated as EOF, and the return code of sscanf is checked to verify that > 0 items have been parsed */
int rc;
char buffer[1024];
for (uint64_t i=0L; i<size; ++i) {
uint64_t count = 0UL;
for (uint64_t i=0UL; i<size; ++i) {
memset(buffer,0,sizeof(buffer));
if(fgets(buffer, 1023, f) == NULL){
fclose(f);
*eof_read_size = count;
return TREXIO_END;
} else {
@ -1149,11 +1161,11 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
rc = sscanf(buffer, "$group_dset_format_scanf$",
$group_dset_sparse_indices_scanf$,
value_sparse + i);
if(rc <= 0) {
fclose(f);
return TREXIO_FAILURE;
}
count += 1UL;
}
}
@ -1163,7 +1175,6 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
if(rc != 0) return TREXIO_FILE_ERROR;
return TREXIO_SUCCESS;
}
#+end_src
@ -1175,7 +1186,7 @@ trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_
/* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
,*/
*/
const char $group_dset$_file_name[256] = "/$group_dset$.txt.size";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
@ -1225,7 +1236,7 @@ trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file)
/* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
,*/
*/
const char $group_dset$_file_name[256] = "/$group_dset$.txt";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH];

View File

@ -195,6 +195,9 @@ subroutine test_read(file_name, back_end)
double precision :: value_sparse_mo_2e_int_eri(20)
integer(8) :: read_buf_size = 10
integer(8) :: offset_read = 40
integer(8) :: offset_data_read = 5
integer(8) :: offset_eof = 97
integer(8) :: offset_data_eof = 1
integer(8) :: size_toread = 0
character*(128) :: str
@ -271,10 +274,11 @@ subroutine test_read(file_name, back_end)
rc = trexio_read_mo_2e_int_eri(trex_file, offset_read, read_buf_size, &
index_sparse_mo_2e_int_eri(1,5+1), &
value_sparse_mo_2e_int_eri(5+1))
index_sparse_mo_2e_int_eri(1, offset_data_read + 1), &
value_sparse_mo_2e_int_eri(offset_data_read + 1))
call trexio_assert(rc, TREXIO_SUCCESS)
if (index_sparse_mo_2e_int_eri(1,1) == 0 .and. index_sparse_mo_2e_int_eri(1,5+1) == offset_read*4+1) then
if (index_sparse_mo_2e_int_eri(1, 1) == 0 .and. &
index_sparse_mo_2e_int_eri(1, offset_data_read + 1) == offset_read*4 + 1) then
write(*,*) 'SUCCESS READ SPARSE DATA'
else
print *, 'FAILURE SPARSE DATA CHECK'
@ -282,6 +286,24 @@ subroutine test_read(file_name, back_end)
endif
! attempt to read reaching EOF: should return TREXIO_END and
! NOT increment the existing values in the buffer (only update the entries that have actually been read)
rc = trexio_read_mo_2e_int_eri(trex_file, offset_eof, read_buf_size, &
index_sparse_mo_2e_int_eri(1, offset_data_eof + 1), &
value_sparse_mo_2e_int_eri(offset_data_eof + 1))
call trexio_assert(rc, TREXIO_END)
!do i = 1,20
! write(*,*) index_sparse_mo_2e_int_eri(1,i)
!enddo
if (index_sparse_mo_2e_int_eri(1, 1) == 0 .and. &
index_sparse_mo_2e_int_eri(1, offset_data_read + 1) == offset_read*4 + 1 .and. &
index_sparse_mo_2e_int_eri(1, offset_data_eof + 1) == offset_eof*4 + 1) then
write(*,*) 'SUCCESS READ SPARSE DATA EOF'
else
print *, 'FAILURE SPARSE DATA EOF CHECK'
call exit(-1)
endif
rc = trexio_read_mo_2e_int_eri_size(trex_file, size_toread)
call trexio_assert(rc, TREXIO_SUCCESS)
if (size_toread == 100) then