1
0
mirror of https://github.com/TREX-CoE/trexio.git synced 2025-04-29 11:54:45 +02:00

only shift indices that have been read to the buffer (EOF case)

This commit is contained in:
q-posev 2021-12-10 14:02:05 +01:00
parent 7afb53be49
commit 9f5ee463e1
4 changed files with 77 additions and 43 deletions

View File

@ -2445,19 +2445,19 @@ trexio_read_$group_dset$(trexio_t* const file,
/* Read the max number of integrals stored in the file */ /* Read the max number of integrals stored in the file */
rc = trexio_read_$group_dset$_size(file, &size_max); rc = trexio_read_$group_dset$_size(file, &size_max);
if (rc != TREXIO_SUCCESS) return rc; if (rc != TREXIO_SUCCESS) return rc;
/* Cannot read more data points than there is already in the file */
// TODO: YOU CAN AND WILL REACH EOF SO NO PROBLEM // introduce a new variable which will be modified with the number of integrals being read if EOF is encountered
if (buffer_size > size_max) return TREXIO_INVALID_ARG_3; uint64_t eof_read_size = 0UL;
switch (file->back_end) { switch (file->back_end) {
case TREXIO_TEXT: case TREXIO_TEXT:
rc = trexio_text_read_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse, value_sparse); rc = trexio_text_read_$group_dset$(file, offset_file, buffer_size, size_max, &eof_read_size, index_sparse, value_sparse);
break; break;
case TREXIO_HDF5: case TREXIO_HDF5:
#ifdef HAVE_HDF5 #ifdef HAVE_HDF5
rc = trexio_hdf5_read_$group_dset$(file, offset_file, buffer_size, size_max, index_sparse, value_sparse); rc = trexio_hdf5_read_$group_dset$(file, offset_file, buffer_size, size_max, &eof_read_size, index_sparse, value_sparse);
break; break;
#else #else
rc = TREXIO_BACK_END_MISSING; rc = TREXIO_BACK_END_MISSING;
@ -2471,21 +2471,18 @@ trexio_read_$group_dset$(trexio_t* const file,
rc = TREXIO_FAILURE; /* Impossible case */ rc = TREXIO_FAILURE; /* Impossible case */
} }
if (rc != TREXIO_SUCCESS) return rc; if (rc != TREXIO_SUCCESS && rc != TREXIO_END) return rc;
// shift indices to be one-based if Fortran API is used // shift indices to be one-based if Fortran API is used
// TODO :
// THIS WILL SHIFT ALL INDICES, HOWEVER IF EOF IS ENCOUNTERED THIS IS NOT DESIRABLE
// WE CAN MODIFY BY ADDRESS AND INT VALUE INDICATING THE NUMBER OF INTEGRALS READ BEFORE EOF
// AND ONLY SHIFT THEM !
if (file->one_based) { if (file->one_based) {
uint64_t index_size = 4L*buffer_size; // if EOF is reached - shift only indices that have been read, not an entire buffer
uint64_t index_size = (rc == TREXIO_END) ? (4UL*eof_read_size) : (4UL*buffer_size) ;
for (uint64_t i=0; i<index_size; ++i){ for (uint64_t i=0; i<index_size; ++i){
index_sparse[i] += 1; index_sparse[i] += 1;
} }
} }
return TREXIO_SUCCESS; return rc;
} }
#+end_src #+end_src
@ -2551,7 +2548,7 @@ trexio_write_$group_dset$(trexio_t* const file,
// shift indices to be zero-based if Fortran API is used // shift indices to be zero-based if Fortran API is used
if (file->one_based) { if (file->one_based) {
uint64_t index_size = 4L*buffer_size; uint64_t index_size = 4UL*buffer_size;
index_sparse_p = CALLOC(index_size, int32_t); index_sparse_p = CALLOC(index_size, int32_t);
if (index_sparse_p == NULL) return TREXIO_ALLOCATION_FAILED; if (index_sparse_p == NULL) return TREXIO_ALLOCATION_FAILED;

View File

@ -379,7 +379,7 @@ trexio_hdf5_has_$group_dset$ (trexio_t* const file)
#+begin_src c :tangle hrw_dset_sparse_hdf5.h :exports none #+begin_src c :tangle hrw_dset_sparse_hdf5.h :exports none
trexio_exit_code trexio_hdf5_has_$group_dset$(trexio_t* const file); trexio_exit_code trexio_hdf5_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_hdf5_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, int32_t* const index_sparse, double* const value_sparse); trexio_exit_code trexio_hdf5_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_hdf5_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse); trexio_exit_code trexio_hdf5_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_hdf5_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max); trexio_exit_code trexio_hdf5_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
#+end_src #+end_src
@ -483,11 +483,13 @@ trexio_hdf5_read_$group_dset$ (trexio_t* const file,
const int64_t offset_file, const int64_t offset_file,
const int64_t size, const int64_t size,
const int64_t size_max, const int64_t size_max,
uint64_t* const eof_read_size,
int32_t* const index_read, int32_t* const index_read,
double* const value_read) double* const value_read)
{ {
if (file == NULL) return TREXIO_INVALID_ARG_1; if (file == NULL) return TREXIO_INVALID_ARG_1;
if (eof_read_size == NULL) return TREXIO_INVALID_ARG_5;
const trexio_hdf5_t* f = (const trexio_hdf5_t*) file; const trexio_hdf5_t* f = (const trexio_hdf5_t*) file;
@ -515,6 +517,8 @@ trexio_hdf5_read_$group_dset$ (trexio_t* const file,
eof_reachable = 1; eof_reachable = 1;
// lower the value of count to reduce the number of elements which will be read // lower the value of count to reduce the number of elements which will be read
count[0] -= max_offset - ddims[0]; count[0] -= max_offset - ddims[0];
// modify the eof_read_size accordingly
*eof_read_size = (uint64_t) (count[0] / 4UL);
} }
herr_t status = H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL); herr_t status = H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);

View File

@ -520,16 +520,16 @@ trexio_text_read_$group$ (trexio_text_t* const file)
} }
/* WARNING: this tmp array allows to avoid allocation of space for each element of array of string /* WARNING: this tmp array allows to avoid allocation of space for each element of array of string
, BUT it's size has to be number_of_str*max_len_str where max_len_str is somewhat arbitrary, e.g. 32. BUT it's size has to be number_of_str*max_len_str where max_len_str is somewhat arbitrary, e.g. 32.
,*/ */
char* tmp_$group_dset$; char* tmp_$group_dset$;
tmp_$group_dset$ = CALLOC(size_$group_dset$*32, char); tmp_$group_dset$ = CALLOC(size_$group_dset$*32, char);
for (uint64_t i=0 ; i<size_$group_dset$ ; ++i) { for (uint64_t i=0 ; i<size_$group_dset$ ; ++i) {
$group$->$group_dset$[i] = tmp_$group_dset$; $group$->$group_dset$[i] = tmp_$group_dset$;
/* conventional fcanf with "%s" only return the string before the first space character /* conventional fcanf with "%s" only return the string before the first space character
,* to read string with spaces use "%[^\n]" possible with space before or after, i.e. " %[^\n]" * to read string with spaces use "%[^\n]" possible with space before or after, i.e. " %[^\n]"
,* Q: depending on what ? */ * Q: depending on what ? */
rc = fscanf(f, " %1023[^\n]", tmp_$group_dset$); rc = fscanf(f, " %1023[^\n]", tmp_$group_dset$);
assert(!(rc != 1)); assert(!(rc != 1));
if (rc != 1) { if (rc != 1) {
@ -1003,21 +1003,29 @@ trexio_text_has_$group_str$ (trexio_t* const file)
#+end_src #+end_src
** Template for has/read/write the dataset of sparse data ** Template for has/read/write the dataset of sparse data
Each sparse array is stored in a separate =.txt= file because sparse I/O has to be decoupled
from the conventional write/read/flush behaviour of the TEXT back end. Chunks are used to read/write sparse data
in order to prevent memory overflow. Each chunk has a given ~int64_t size~
(size specifies the number of sparse data items, e.g. integrals).
The user provides the indices and values of the sparse array as two separate variables.
#+begin_src c :tangle hrw_dset_sparse_text.h :exports none #+begin_src c :tangle hrw_dset_sparse_text.h :exports none
trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file); trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, int32_t* const index_sparse, double* const value_sparse); trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, uint64_t* const eof_read_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse); trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
#+end_src #+end_src
#+begin_src c :tangle write_dset_sparse_text.c #+begin_src c :tangle write_dset_sparse_text.c
trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file, trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
const int64_t offset_file, const int64_t offset_file,
const int64_t size, const int64_t size,
const int64_t size_max, const int64_t size_max,
const int32_t* index_sparse, const int32_t* index_sparse,
const double* value_sparse) const double* value_sparse)
{ {
if (file == NULL) return TREXIO_INVALID_ARG_1; if (file == NULL) return TREXIO_INVALID_ARG_1;
@ -1040,14 +1048,14 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
/* Specify the line length in order to offset properly. For example, for 4-index quantities /* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char. the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char.
CURRENTLY NO OFFSET IS USED WHEN WRITING ! CURRENTLY NO OFFSET IS USED WHEN WRITING !
*/ */
const int64_t line_length = $group_dset_sparse_line_length$L; const int64_t line_length = $group_dset_sparse_line_length$L;
/* Get the starting position of the IO stream to be written in the .size file. /* Get the starting position of the IO stream to be written in the .size file.
This is error-prone due to the fact that for large files (>2 GB) in 32-bit systems ftell will fail. This is error-prone due to the fact that for large files (>2 GB) in 32-bit systems ftell will fail.
One can use ftello function which is adapted for large files. One can use ftello function which is adapted for large files.
For now, we can use front-end-provided size_max, which has been checked for INT64_MAX overflow. For now, we can use front-end-provided size_max, which has been checked for INT64_MAX overflow.
*/ */
//int64_t io_start_pos = (int64_t) ftell(f); //int64_t io_start_pos = (int64_t) ftell(f);
int64_t io_start_pos = size_max * line_length; int64_t io_start_pos = size_max * line_length;
@ -1055,7 +1063,7 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
/* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */ /* Write the data in the file and check the return code of fprintf to verify that > 0 bytes have been written */
int rc; int rc;
for (uint64_t i=0L; i<size; ++i) { for (uint64_t i=0UL; i<size; ++i) {
rc = fprintf(f, "$group_dset_format_printf$\n", rc = fprintf(f, "$group_dset_format_printf$\n",
$group_dset_sparse_indices_printf$, $group_dset_sparse_indices_printf$,
@ -1100,17 +1108,19 @@ trexio_exit_code trexio_text_write_$group_dset$(trexio_t* const file,
#+begin_src c :tangle read_dset_sparse_text.c #+begin_src c :tangle read_dset_sparse_text.c
trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file, trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
const int64_t offset_file, const int64_t offset_file,
const int64_t size, const int64_t size,
const int64_t size_max, const int64_t size_max,
int32_t* const index_sparse, uint64_t* const eof_read_size,
double* const value_sparse) int32_t* const index_sparse,
double* const value_sparse)
{ {
if (file == NULL) return TREXIO_INVALID_ARG_1; if (file == NULL) return TREXIO_INVALID_ARG_1;
if (eof_read_size == NULL) return TREXIO_INVALID_ARG_5;
/* Build the name of the file with sparse data. /* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed? The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
,*/ */
const char $group_dset$_file_name[256] = "/$group_dset$.txt"; const char $group_dset$_file_name[256] = "/$group_dset$.txt";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */ /* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH]; char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
@ -1127,7 +1137,7 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Specify the line length in order to offset properly. For example, for 4-index quantities /* Specify the line length in order to offset properly. For example, for 4-index quantities
the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char the line_length is 69 because 10 per index + 4 spaces + 24 for floating point value + 1 for the new line char
,*/ */
const uint64_t line_length = $group_dset_sparse_line_length$L; const uint64_t line_length = $group_dset_sparse_line_length$L;
fseek(f, (long) offset_file * line_length, SEEK_SET); fseek(f, (long) offset_file * line_length, SEEK_SET);
@ -1135,13 +1145,15 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
/* Read the data from the file and check the return code of fprintf to verify that > 0 bytes have been read or reached EOF */ /* Read the data from the file and check the return code of fprintf to verify that > 0 bytes have been read or reached EOF */
int rc; int rc;
char buffer[1024]; char buffer[1024];
for (uint64_t i=0L; i<size; ++i) { uint64_t count = 0UL;
for (uint64_t i=0UL; i<size; ++i) {
memset(buffer,0,sizeof(buffer)); memset(buffer,0,sizeof(buffer));
if(fgets(buffer, 1023, f) == NULL){ if(fgets(buffer, 1023, f) == NULL){
fclose(f); fclose(f);
*eof_read_size = count;
return TREXIO_END; return TREXIO_END;
} else { } else {
@ -1149,11 +1161,11 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
rc = sscanf(buffer, "$group_dset_format_scanf$", rc = sscanf(buffer, "$group_dset_format_scanf$",
$group_dset_sparse_indices_scanf$, $group_dset_sparse_indices_scanf$,
value_sparse + i); value_sparse + i);
if(rc <= 0) { if(rc <= 0) {
fclose(f); fclose(f);
return TREXIO_FAILURE; return TREXIO_FAILURE;
} }
count += 1UL;
} }
} }
@ -1163,7 +1175,6 @@ trexio_exit_code trexio_text_read_$group_dset$(trexio_t* const file,
if(rc != 0) return TREXIO_FILE_ERROR; if(rc != 0) return TREXIO_FILE_ERROR;
return TREXIO_SUCCESS; return TREXIO_SUCCESS;
} }
#+end_src #+end_src
@ -1175,7 +1186,7 @@ trexio_exit_code trexio_text_read_$group_dset$_size(trexio_t* const file, int64_
/* Build the name of the file with sparse data. /* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed? The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
,*/ */
const char $group_dset$_file_name[256] = "/$group_dset$.txt.size"; const char $group_dset$_file_name[256] = "/$group_dset$.txt.size";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */ /* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH]; char file_full_path[TREXIO_MAX_FILENAME_LENGTH];
@ -1225,7 +1236,7 @@ trexio_exit_code trexio_text_has_$group_dset$(trexio_t* const file)
/* Build the name of the file with sparse data. /* Build the name of the file with sparse data.
The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed? The $group_dset$.txt is limited to 256 symbols for the moment. What are the chances that it will exceed?
,*/ */
const char $group_dset$_file_name[256] = "/$group_dset$.txt"; const char $group_dset$_file_name[256] = "/$group_dset$.txt";
/* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */ /* The full path to the destination TXT file with sparse data. This will include TREXIO directory name. */
char file_full_path[TREXIO_MAX_FILENAME_LENGTH]; char file_full_path[TREXIO_MAX_FILENAME_LENGTH];

View File

@ -195,6 +195,9 @@ subroutine test_read(file_name, back_end)
double precision :: value_sparse_mo_2e_int_eri(20) double precision :: value_sparse_mo_2e_int_eri(20)
integer(8) :: read_buf_size = 10 integer(8) :: read_buf_size = 10
integer(8) :: offset_read = 40 integer(8) :: offset_read = 40
integer(8) :: offset_data_read = 5
integer(8) :: offset_eof = 97
integer(8) :: offset_data_eof = 1
integer(8) :: size_toread = 0 integer(8) :: size_toread = 0
character*(128) :: str character*(128) :: str
@ -271,10 +274,11 @@ subroutine test_read(file_name, back_end)
rc = trexio_read_mo_2e_int_eri(trex_file, offset_read, read_buf_size, & rc = trexio_read_mo_2e_int_eri(trex_file, offset_read, read_buf_size, &
index_sparse_mo_2e_int_eri(1,5+1), & index_sparse_mo_2e_int_eri(1, offset_data_read + 1), &
value_sparse_mo_2e_int_eri(5+1)) value_sparse_mo_2e_int_eri(offset_data_read + 1))
call trexio_assert(rc, TREXIO_SUCCESS) call trexio_assert(rc, TREXIO_SUCCESS)
if (index_sparse_mo_2e_int_eri(1,1) == 0 .and. index_sparse_mo_2e_int_eri(1,5+1) == offset_read*4+1) then if (index_sparse_mo_2e_int_eri(1, 1) == 0 .and. &
index_sparse_mo_2e_int_eri(1, offset_data_read + 1) == offset_read*4 + 1) then
write(*,*) 'SUCCESS READ SPARSE DATA' write(*,*) 'SUCCESS READ SPARSE DATA'
else else
print *, 'FAILURE SPARSE DATA CHECK' print *, 'FAILURE SPARSE DATA CHECK'
@ -282,6 +286,24 @@ subroutine test_read(file_name, back_end)
endif endif
! attempt a read that reaches EOF: it should return TREXIO_END and
! NOT increment the existing values in the buffer (only update what has actually been read)
rc = trexio_read_mo_2e_int_eri(trex_file, offset_eof, read_buf_size, &
index_sparse_mo_2e_int_eri(1, offset_data_eof + 1), &
value_sparse_mo_2e_int_eri(offset_data_eof + 1))
call trexio_assert(rc, TREXIO_END)
!do i = 1,20
! write(*,*) index_sparse_mo_2e_int_eri(1,i)
!enddo
if (index_sparse_mo_2e_int_eri(1, 1) == 0 .and. &
index_sparse_mo_2e_int_eri(1, offset_data_read + 1) == offset_read*4 + 1 .and. &
index_sparse_mo_2e_int_eri(1, offset_data_eof + 1) == offset_eof*4 + 1) then
write(*,*) 'SUCCESS READ SPARSE DATA EOF'
else
print *, 'FAILURE SPARSE DATA EOF CHECK'
call exit(-1)
endif
rc = trexio_read_mo_2e_int_eri_size(trex_file, size_toread) rc = trexio_read_mo_2e_int_eri_size(trex_file, size_toread)
call trexio_assert(rc, TREXIO_SUCCESS) call trexio_assert(rc, TREXIO_SUCCESS)
if (size_toread == 100) then if (size_toread == 100) then