1
0
mirror of https://github.com/TREX-CoE/trexio.git synced 2024-11-03 20:54:07 +01:00

Merge pull request #71 from TREX-CoE/add-sparse-datasets

Add Python API for sparse datasets
This commit is contained in:
Evgeny Posenitskiy 2022-01-05 13:57:32 +01:00 committed by GitHub
commit c687c80f92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 440 additions and 50 deletions

View File

@ -67,7 +67,7 @@ charges = [6., 6., 6., 6., 6., 6., 1., 1., 1., 1., 1., 1.]
#charges_np = np.array(charges, dtype=np.float32)
charges_np = np.array(charges, dtype=np.int32)
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# from the size of the list/array by SWIG using typemaps from numpy.i
trexio.write_nucleus_charge(test_file, charges_np)
@ -80,7 +80,7 @@ indices_np = np.array(indices, dtype=np.int64)
# first write basis_shell_num because it is needed to check dimensions of basis_nucleus_index in TREXIO >= 2.0.0
trexio.write_basis_shell_num(test_file, basis_shell_num)
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# from the size of the list/array by SWIG using typemaps from numpy.i
trexio.write_basis_nucleus_index(test_file, indices_np)
@ -103,11 +103,25 @@ coords = [
# write coordinates in the file
trexio.write_nucleus_coord(test_file, coords)
point_group = 'B3U'
# write mo_num (needed later to write sparse mo_2e_int_eri integrals)
trexio.write_mo_num(test_file, 600)
# write sparse data in the file
num_integrals = 100
indices = [i for i in range(num_integrals*4)]
values = [(3.14 + float(i)) for i in range(num_integrals)]
trexio.write_mo_2e_int_eri(test_file, 0, num_integrals, indices, values)
# write nucleus_point_group in the file
point_group = 'B3U'
trexio.write_nucleus_point_group(test_file, point_group)
# write nucleus_label in the file
labels = [
'C',
'C',
@ -122,14 +136,13 @@ labels = [
'H',
'H']
# write nucleus_label in the file
trexio.write_nucleus_label(test_file,labels)
# close TREXIO file
# close TREXIO file
# this call is no longer needed as we introduced TREXIO_File class which has a destructor that closes the file
#trexio.close(test_file)
# without calling destructor on test_file the TREXIO_FILE is not getting created and the data is not written when using TEXT back end.
# Thus, the user still has to explicitly call destructor on test_file object instead of the trexio.close function.
# without calling destructor on test_file the TREXIO_FILE is not getting created and the data is not written when using TEXT back end.
# Thus, the user still has to explicitly call destructor on test_file object instead of the trexio.close function.
# This is only an issue when the data is getting written and read in the same session (e.g. in Jupyter notebook)
del test_file
@ -147,6 +160,7 @@ assert trexio.has_nucleus_charge
assert trexio.has_nucleus_coord
assert trexio.has_nucleus_label
assert trexio.has_nucleus_point_group
assert trexio.has_mo_2e_int_eri
# read nucleus_num from file
rnum = trexio.read_nucleus_num(test_file2)
@ -189,6 +203,33 @@ np.testing.assert_array_almost_equal(rcoords_np, np.array(coords).reshape(nucleu
# set doReshape to False to get a flat 1D array (e.g. when reading matrices like nuclear coordinates)
#rcoords_reshaped_2 = trexio.read_nucleus_coord(test_file2, doReshape=False)
# read number of integrals already present in the file
assert trexio.has_mo_2e_int_eri(test_file2)
assert trexio.read_mo_2e_int_eri_size(test_file2)==num_integrals
# read sparse arrays on mo_2e_int_eri integrals
buf_size = 60
offset_file = 0
# read full buf_size (i.e. the one that does not reach EOF)
indices_sparse_np, value_sparse_np, read_buf_size, eof = trexio.read_mo_2e_int_eri(test_file2, offset_file, buf_size)
print(f'First complete sparse read size: {read_buf_size}')
#print(indices_sparse_np)
assert not eof
assert read_buf_size==buf_size
assert indices_sparse_np[0][0]==0
assert indices_sparse_np[read_buf_size-1][3]==read_buf_size*4-1
offset_file += buf_size
# read incomplete buf_size (i.e. the one that does reach EOF)
indices_sparse_np, value_sparse_np, read_buf_size, eof2 = trexio.read_mo_2e_int_eri(test_file2, offset_file, buf_size)
print(f'Second incomplete sparse read size: {read_buf_size}')
#print(indices_sparse_np)
assert eof2
assert read_buf_size==(num_integrals - buf_size)
assert indices_sparse_np[0][0]==offset_file*4
assert indices_sparse_np[read_buf_size-1][3]==(offset_file+read_buf_size)*4-1
# read array of nuclear labels
rlabels_2d = trexio.read_nucleus_label(test_file2, dim=nucleus_num)
print(rlabels_2d)
@ -197,13 +238,13 @@ for i in range(nucleus_num):
# read a string corresponding to nuclear point group
rpoint_group = trexio.read_nucleus_point_group(test_file2)
assert rpoint_group==point_group
assert rpoint_group==point_group
# another way to read only if the variable exists
if trexio.has_mo_num(test_file2):
rmo_num = trexio.read_mo_num(test_file2)
if trexio.has_ao_num(test_file2):
rmo_num = trexio.read_ao_num(test_file2)
else:
print("Pass on reading the non-existing variable mo_num: checked")
print("Pass on reading the non-existing variable ao_num: checked")
# close TREXIO file
#trexio.close(test_file2)

View File

@ -54,7 +54,7 @@ assert rc==0
charges = [6., 6., 6., 6., 6., 6., 1., 1., 1., 1., 1., 1.]
charges_np = np.array(charges, dtype=np.float64)
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# from the size of the list/array by SWIG using typemaps from numpy.i
rc = trexio_write_safe_nucleus_charge(test_file, charges_np)
assert rc==0
@ -78,16 +78,29 @@ indices_np = np.array(indices, dtype=np.int32)
rc = trexio_write_basis_shell_num(test_file, basis_num)
assert rc==0
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# function call below works with both lists and numpy arrays, dimension needed for memory-safety is derived
# from the size of the list/array by SWIG using typemaps from numpy.i
rc = trexio_write_safe_basis_nucleus_index(test_file, indices_np)
assert rc==0
# test writing of sparse data
rc = trexio_write_mo_num(test_file, 600)
assert rc==0
num_integrals = 100
indices = [i for i in range(num_integrals*4)]
values = [(3.14 + float(i)) for i in range(num_integrals)]
rc = trexio_write_safe_mo_2e_int_eri(test_file, 0, num_integrals, indices, values)
assert rc==0
# test writing of single string
point_group = 'B3U'
rc = trexio_write_nucleus_point_group(test_file, point_group, 10)
assert rc==0
# test writing of array of strings
labels = [
'C',
'C',
@ -172,6 +185,36 @@ print(f'Read point group: {rpoint_group}')
assert rc==0
assert rpoint_group==point_group
# test reading sparse quantities
rc, mo_2e_int_size = trexio_read_mo_2e_int_eri_size(test_file2)
assert rc==0
assert mo_2e_int_size==num_integrals
buf_size = 60
offset_file = 0
# read full buf_size (i.e. the one that does not reach EOF)
rc, read_buf_size, indices_sparse_np, value_sparse_np = trexio_read_safe_mo_2e_int_eri(test_file2, offset_file, buf_size, buf_size*4, buf_size)
print(f'First complete sparse read size: {read_buf_size}')
#print(indices_sparse_np)
assert rc==0
assert read_buf_size==buf_size
assert indices_sparse_np[0]==0
assert indices_sparse_np[read_buf_size*4-1]==read_buf_size*4-1
offset_file += buf_size
# read incomplete buf_size (i.e. the one that does reach EOF)
rc, read_buf_size, indices_sparse_np, value_sparse_np = trexio_read_safe_mo_2e_int_eri(test_file2, offset_file, buf_size, buf_size*4, buf_size)
print(f'Second incomplete sparse read size: {read_buf_size}')
# Incomplete read still allocates NumPy array of buf_size=60 elements but only 40 elements read upon encounter of EOF,
# Thus the remaining 20 elements are filled with garbage rather than zeros. Handle this in the front end ?
print(indices_sparse_np)
# trexio_exit_code = 6 corresponds to TREXIO_END
assert rc==6
assert read_buf_size==(num_integrals - buf_size)
assert indices_sparse_np[0]==offset_file*4
assert indices_sparse_np[read_buf_size*4-1]==(offset_file+read_buf_size)*4-1
rc = trexio_close(test_file2)
assert rc==0
@ -184,4 +227,3 @@ except:
print (f'No output file {output_filename} has been produced')
#==========================================================#

View File

@ -12,10 +12,10 @@
%include <stdint.i>
/* NOTE:
carrays was useful before numpy.i was introduced.
carrays was useful before numpy.i was introduced.
For Python interface it's better to use numpy arrays instead of carrays, because the latter are less python-ic.
On the other hand, carrays might be more portable to other target languages.
// Include carrays to work with C pointers to arrays
// Include carrays to work with C pointers to arrays
%include "carrays.i"
// Include classes that correspond to integer and float arrays
%array_class(double, doubleArray);
@ -24,20 +24,26 @@
%array_class(int64_t, int64Array);
*/
/* Include typemaps to play with input/output re-casting
/* Include typemaps to play with input/output re-casting
Useful when working with C pointers
*/
%include typemaps.i
/* Redefine the [int32_t*, int64_t*, float*, double*] num
/* Redefine the [int32_t*, int64_t*, float*, double*] num
pattern to be appended to the output tuple.
Useful for TREXIO read_num functions where the
Useful for TREXIO read_num functions where the
num variable is modified by address
*/
/* Return num variables as part of the output tuple */
%apply int *OUTPUT { int32_t* const num};
%apply int *OUTPUT { int64_t* const num};
%apply float *OUTPUT { float* const num};
%apply float *OUTPUT { double* const num};
/* Return TREXIO exit code from trexio_open as part of the output tuple */
%apply int *OUTPUT { trexio_exit_code* const rc_open};
/* Return number of sparse data points stored in the file as part of the output tuple */
%apply int *OUTPUT { int64_t* const size_max};
/* Return number of sparse data points read from the file as part of the output tuple */
%apply int *INOUT { int64_t* const buffer_size_read};
/* Does not work for arrays (SIGSEGV) */
@ -47,13 +53,13 @@
%include <cstring.i>
/* This enables read of long strings with TREXIO_DELIM delimiters that can be further converted into an array of strings */
%cstring_bounded_output(char* dset_out, 4096);
/* This enables read of single string attributes with pre-defined max_str_len
/* This enables read of single string attributes with pre-defined max_str_len
for Python we pre-define max_str_len = PYTREXIO_MAX_STR_LENGTH everywhere for simplicity
*/
%cstring_output_maxsize(char* const str_out, const int32_t max_str_len);
/* This block is needed to make SWIG treat (double * dset_out|_in, int64_t dim_out|_in) pattern
/* This block is needed to make SWIG treat (double * dset_out|_in, int64_t dim_out|_in) pattern
as a special case in order to return the NumPy array to Python from C pointer to array
provided by trexio_read_safe_[dset_num] function.
NOTE: numpy.i is currently not part of SWIG but included in the numpy distribution (under numpy/tools/swig/numpy.i)
@ -81,8 +87,14 @@ import_array();
/* Enable write|read_safe functions to convert numpy arrays from/to int64 arrays */
%apply (int64_t* ARGOUT_ARRAY1, int64_t DIM1) {(int64_t* const dset_out, const int64_t dim_out)};
%apply (int64_t* IN_ARRAY1, int64_t DIM1) {(const int64_t* dset_in, const int64_t dim_in)};
/* Enable write|read_safe functions to convert numpy arrays from/to sparse arrays */
%apply (double* IN_ARRAY1, int64_t DIM1) {(const double* value_sparse_write, const int64_t size_value_write)};
%apply (int32_t* IN_ARRAY1, int64_t DIM1) {(const int32_t* index_sparse_write, const int64_t size_index_write)};
/* This tells SWIG to treat char ** dset_in pattern as a special case
%apply (double* ARGOUT_ARRAY1, int64_t DIM1) {(double* const value_sparse_read, const int64_t size_value_read)};
%apply (int32_t* ARGOUT_ARRAY1, int64_t DIM1) {(int32_t* const index_sparse_read, const int64_t size_index_read)};
/* This tells SWIG to treat char ** dset_in pattern as a special case
Enables access to trexio_[...]_write_dset_str set of functions directly, i.e.
by converting input list of strings from Python into char ** of C
*/
@ -116,4 +128,3 @@ import_array();
/* Parse the header files to generate wrappers */
%include "trexio_s.h"
%include "trexio.h"

View File

@ -2378,35 +2378,58 @@ def has_$group_dset$(trexio_file) -> bool:
}
#+end_src
The electron repulsion integral (eri) $\langle ij | kl \rangle$ is
represented as a quartet of integers $(i,j,k,l)$ and a floating
point value.
The electron repulsion integral (eri) $\langle ij | kl \rangle$ is
represented as a quartet of integers $(i,j,k,l)$ and a floating
point value.
To store $N$ integrals in the file, we store
To store $N$ integrals in the file, we store
- An array of quartets of integers
- An array of values (floats)
- An array of quartets of integers
- An array of values (floats)
Both arrays have the same size, $N$, the number of non-zero integrals.
Knowing the maximum dimensions allows to check that the integers are
in a valid range, and also lets the library choose the smallest
integer representation to compress the storage.
Both arrays have the same size, $N$, the number of non-zero integrals.
Knowing the maximum dimensions allows to check that the integers are
in a valid range, and also lets the library choose the smallest
integer representation to compress the storage.
Fortran uses 1-based array indexing, while C uses 0-based indexing.
Internally, we use a 0-based representation but the Fortran binding
does the appropriate conversion when reading or writing.
Fortran uses 1-based array indexing, while C uses 0-based indexing.
Internally, we use a 0-based representation but the Fortran binding
does the appropriate conversion when reading or writing.
As the number of integrals to store can be prohibitively large, we
provide the possibility to read/write the integrals in chunks. So the
functions take two extra parameters:
As the number of integrals to store can be prohibitively large, we
provide the possibility to read/write the integrals in chunks. So the
functions take two extra parameters:
- ~offset~ : how many integrals in the file should be skipped when reading.
An offset of zero implies to read the first integral.
- ~size~ : the number of integrals to read.
- ~offset_file~ : how many integrals in the file should be skipped when reading/writing.
An offset of zero implies to read the first integral.
- ~buffer_size~ : the number of integrals to read/write.
If EOF is encountered upon reading, the ~buffer_size~ is overwritten with the number
of integrals that have been read before EOF and the ~trexio_read_~ function returns
~TREXIO_END~ exit code instead of ~TREXIO_SUCCESS~.
We provide a function to read a chunk of indices, and a function to
read a chunk of values, because some users might want to read only
the values of the integrals, or only the indices.
The storage of ~int~ indices is internally compressed based on the maximum possible value of an index,
which is derived from the corresponding dimension of the sparse array (e.g. ~ao_num~ is the upper bound
of indices in the aforementioned ~ao_2e_int_eri~ dataset).
The upper bounds for different ~int~ types (e.g. ~uint16_t~) can be found in the =stdint.h= C library.
Currently implemented list of compressions based on the upper bound of indices can be found below:
| Max value of indices | Internal representation (in the TREXIO file) |
|---------------------------------+----------------------------------------------|
| ~UINT8_MAX~ (e.g. $< 255$) | 8-bit unsigned int |
| ~UINT16_MAX~ (e.g. $< 65535$) | 16-bit unsigned int |
| Otherwise (e.g. $\ge 65535$) | 32-bit signed int |
This section concerns API calls related to sparse data structures.
| Function name | Description | Precision |
|----------------------------------+-------------------------------------------------------------+----------------------------------|
| ~trexio_has_$group_dset$~ | Check if a sparse dset is present in a file | --- |
| ~trexio_read_$group_dset$~ | Read indices and values of a sparse dset | Single/Double for indices/values |
| ~trexio_read_$group_dset$_size~ | Read the number of sparse data elements stored in the file | Double for size |
| ~trexio_write_$group_dset$~ | Write indices and values of a sparse dset | Single/Double for indices/values |
| ~trexio_read_safe_$group_dset$~ | Safe (bounded) read of indices and values (for Python API) | Single/Double for indices/values |
| ~trexio_write_safe_$group_dset$~ | Safe (bounded) write of indices and values (for Python API) | Single/Double for indices/values |
*** C templates for front end
**** Function declarations
@ -2416,11 +2439,25 @@ trexio_exit_code trexio_has_$group_dset$(trexio_t* const file);
trexio_exit_code trexio_read_$group_dset$(trexio_t* const file, const int64_t offset_file, int64_t* const buffer_size, int32_t* const index_sparse, double* const value_sparse);
trexio_exit_code trexio_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
trexio_exit_code trexio_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t buffer_size, const int32_t* index_sparse, const double* value_sparse);
trexio_exit_code trexio_read_safe_$group_dset$(trexio_t* const file, const int64_t offset_file, int64_t* const buffer_size_read, int32_t* const index_sparse_read, const int64_t size_index_read, double* const value_sparse_read, const int64_t size_value_read);
trexio_exit_code trexio_write_safe_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t buffer_size, const int32_t* index_sparse_write, const int64_t size_index_write, const double* value_sparse_write, const int64_t size_value_write);
#+end_src
**** Source code for default functions
#+begin_src c :tangle read_dset_sparse_front.c
trexio_exit_code trexio_read_safe_$group_dset$(trexio_t* const file,
const int64_t offset_file,
int64_t* const buffer_size_read,
int32_t* const index_sparse_read,
const int64_t size_index_read,
double* const value_sparse_read,
const int64_t size_value_read
)
{
return trexio_read_$group_dset$(file, offset_file, buffer_size_read, index_sparse_read, value_sparse_read);
}
trexio_exit_code
trexio_read_$group_dset$(trexio_t* const file,
const int64_t offset_file,
@ -2525,13 +2562,25 @@ trexio_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max)
#+begin_src c :tangle write_dset_sparse_front.c
trexio_exit_code trexio_write_safe_$group_dset$(trexio_t* const file,
const int64_t offset_file,
const int64_t buffer_size,
const int32_t* index_sparse_write,
const int64_t size_index_write,
const double* value_sparse_write,
const int64_t size_value_write
)
{
return trexio_write_$group_dset$(file, offset_file, buffer_size, index_sparse_write, value_sparse_write);
}
trexio_exit_code
trexio_write_$group_dset$(trexio_t* const file,
const int64_t offset_file,
const int64_t buffer_size,
const int32_t* index_sparse,
const double* value_sparse
)
const int64_t offset_file,
const int64_t buffer_size,
const int32_t* index_sparse,
const double* value_sparse
)
{
if (file == NULL) return TREXIO_INVALID_ARG_1;
if (offset_file < 0L) return TREXIO_INVALID_ARG_2;
@ -2648,6 +2697,22 @@ interface
double precision, intent(in) :: value_sparse(*)
end function trexio_write_$group_dset$
end interface
interface
integer function trexio_write_safe_$group_dset$ (trex_file, &
offset_file, buffer_size, &
index_sparse, index_size, &
value_sparse, value_size) bind(C)
use, intrinsic :: iso_c_binding
integer(8), intent(in), value :: trex_file
integer(8), intent(in), value :: offset_file
integer(8), intent(in), value :: buffer_size
integer(4), intent(in) :: index_sparse(*)
integer(8), intent(in), value :: index_size
double precision, intent(in) :: value_sparse(*)
integer(8), intent(in), value :: value_size
end function trexio_write_safe_$group_dset$
end interface
#+end_src
#+begin_src f90 :tangle read_dset_sparse_front_fortran.f90
@ -2663,6 +2728,22 @@ interface
double precision, intent(out) :: value_sparse(*)
end function trexio_read_$group_dset$
end interface
interface
integer function trexio_read_safe_$group_dset$ (trex_file, &
offset_file, buffer_size, &
index_sparse, index_size, &
value_sparse, value_size) bind(C)
use, intrinsic :: iso_c_binding
integer(8), intent(in), value :: trex_file
integer(8), intent(in), value :: offset_file
integer(8), intent(inout) :: buffer_size
integer(4), intent(out) :: index_sparse(*)
integer(8), intent(in), value :: index_size
double precision, intent(out) :: value_sparse(*)
integer(8), intent(in), value :: value_size
end function trexio_read_safe_$group_dset$
end interface
#+end_src
#+begin_src f90 :tangle read_dset_sparse_size_front_fortran.f90
@ -2685,6 +2766,221 @@ interface
end interface
#+end_src
*** Python templates for front end
#+begin_src python :tangle write_dset_sparse_front.py
def write_$group_dset$(trexio_file: File, offset_file: int, buffer_size: int, indices: list, values: list) -> None:
"""Write the $group_dset$ indices and values in the TREXIO file.
Parameters:
trexio_file:
TREXIO File object.
offset_file: int
The number of integrals to be skipped in the file when writing.
buffer_size: int
The number of integrals to write in the file from the provided sparse arrays.
indices: list OR numpy.ndarray
Array of $group_dset$ indices to be written. If array data type does not correspond to int32, the conversion is performed.
values: list OR numpy.ndarray
Array of $group_dset$ values to be written. If array data type does not correspond to float64, the conversion is performed.
Raises:
- Exception from AssertionError if TREXIO return code ~rc~ is different from TREXIO_SUCCESS and prints the error message using trexio_string_of_error.
- Exception from some other error (e.g. RuntimeError).
"""
try:
import numpy as np
except ImportError:
raise Exception("NumPy cannot be imported.")
if not isinstance(offset_file, int):
raise TypeError("offset_file argument has to be an integer.")
if not isinstance(buffer_size, int):
raise TypeError("buffer_size argument has to be an integer.")
if not isinstance(indices, (list, tuple, np.ndarray)):
raise TypeError("indices argument has to be an array (list, tuple or NumPy ndarray).")
if not isinstance(values, (list, tuple, np.ndarray)):
raise TypeError("values argument has to be an array (list, tuple or NumPy ndarray).")
convertIndices = False
convertValues = False
flattenIndices = False
if isinstance(indices, np.ndarray):
# convert to int32 if input indices are in a different precision
if not indices.dtype==np.int32:
convertIndices = True
if len(indices.shape) > 1:
flattenIndices = True
if convertIndices:
indices_32 = np.int32(indices).flatten()
else:
indices_32 = np.array(indices, dtype=np.int32).flatten()
else:
if convertIndices:
indices_32 = np.int32(indices)
else:
# if input array is a multidimensional list or tuple, we have to convert it
try:
doFlatten = True
# if list of indices is flat - the attempt to compute len(indices[0]) will raise a TypeError
ncol = len(indices[0])
indices_32 = np.array(indices, dtype=np.int32).flatten()
except TypeError:
doFlatten = False
pass
if isinstance(values, np.ndarray):
# convert to float64 if input values are in a different precision
if not values.dtype==np.float64:
convertValues = True
if convertValues:
values_64 = np.float64(values)
if (convertIndices or flattenIndices) and convertValues:
rc = pytr.trexio_write_safe_$group_dset$(trexio_file.pytrexio_s, offset_file, buffer_size, indices_32, values_64)
elif (convertIndices or flattenIndices) and not convertValues:
rc = pytr.trexio_write_safe_$group_dset$(trexio_file.pytrexio_s, offset_file, buffer_size, indices_32, values)
elif not (convertIndices or flattenIndices) and convertValues:
rc = pytr.trexio_write_safe_$group_dset$(trexio_file.pytrexio_s, offset_file, buffer_size, indices, values_64)
else:
rc = pytr.trexio_write_safe_$group_dset$(trexio_file.pytrexio_s, offset_file, buffer_size, indices, values)
if rc != TREXIO_SUCCESS:
raise Error(rc)
#+end_src
#+begin_src python :tangle read_dset_sparse_front.py
def read_$group_dset$(trexio_file: File, offset_file: int, buffer_size: int) -> tuple:
"""Read the $group_dset$ indices and values from the TREXIO file.
Parameters:
trexio_file:
TREXIO File object.
offset_file: int
The number of integrals to be skipped in the file when reading.
buffer_size: int
The number of integrals to read from the file.
Returns:
(indices, values, n_int_read, eof_flag) tuple where
- indices and values are NumPy arrays [numpy.ndarray] with the default int32 and float64 precision, respectively;
- n_int_read [int] is the number of integrals read from the trexio_file
(either strictly equal to buffer_size or less than buffer_size if EOF has been reached);
- eof_flag [bool] is True when EOF has been reached (i.e. when call to low-level pytrexio API returns TREXIO_END)
False otherwise.
Raises:
- Exception from AssertionError if TREXIO return code ~rc~ is different from TREXIO_SUCCESS
and prints the error message using trexio_string_of_error.
- Exception from some other error (e.g. RuntimeError).
"""
try:
import numpy as np
except ImportError:
raise Exception("NumPy cannot be imported.")
if not isinstance(offset_file, int):
raise TypeError("offset_file argument has to be an integer.")
if not isinstance(buffer_size, int):
raise TypeError("buffer_size argument has to be an integer.")
# read the number of integrals already in the file
integral_num = read_$group_dset$_size(trexio_file)
# additional modification needed to avoid allocating more memory than needed if EOF will be reached during read
overflow = offset_file + buffer_size - integral_num
eof_flag = False
if overflow > 0:
verified_size = buffer_size - overflow
eof_flag = True
else:
verified_size = buffer_size
# main call to the low-level (SWIG-wrapped) trexio_read function, which also requires the sizes of the output to be provided
# as the last 2 arguments (for numpy arrays of indices and values, respectively)
# read_buf_size contains the number of elements being read from the file, useful when EOF has been reached
rc, n_int_read, indices, values = pytr.trexio_read_safe_$group_dset$(trexio_file.pytrexio_s,
offset_file,
verified_size,
verified_size * $group_dset_rank$,
verified_size)
if rc != TREXIO_SUCCESS:
raise Error(rc)
if n_int_read == 0:
raise ValueError("No integrals have been read from the file.")
if indices is None or values is None:
raise ValueError("Returned NULL array from the low-level pytrexio API.")
# conversion to custom types can be performed on the user side, here we only reshape the returned flat array of indices according to group_dset_rank
shape = tuple([verified_size, $group_dset_rank$])
indices_reshaped = np.reshape(indices, shape, order='C')
return (indices_reshaped, values, n_int_read, eof_flag)
def read_$group_dset$_size(trexio_file) -> int:
"""Read the number of $group_dset$ integrals stored in the TREXIO file.
Parameter is a ~TREXIO File~ object that has been created by a call to ~open~ function.
Returns:
~num_integral~: int
Integer value corresponding to the size of the $group_dset$ sparse array from ~trexio_file~.
Raises:
- Exception from AssertionError if TREXIO return code ~rc~ is different from TREXIO_SUCCESS and prints the error message using trexio_string_of_error.
- Exception from some other error (e.g. RuntimeError).
"""
try:
rc, num_integral = pytr.trexio_read_$group_dset$_size(trexio_file.pytrexio_s)
if rc != TREXIO_SUCCESS:
raise Error(rc)
except:
raise
return num_integral
#+end_src
#+begin_src python :tangle has_dset_sparse_front.py
def has_$group_dset$(trexio_file) -> bool:
"""Check that $group_dset$ variable exists in the TREXIO file.
Parameter is a ~TREXIO File~ object that has been created by a call to ~open~ function.
Returns:
True if the variable exists, False otherwise
Raises:
- Exception from trexio.Error class if TREXIO return code ~rc~ is TREXIO_FAILURE and prints the error message using string_of_error.
- Exception from some other error (e.g. RuntimeError).
"""
try:
rc = pytr.trexio_has_$group_dset$(trexio_file.pytrexio_s)
if rc == TREXIO_FAILURE:
raise Error(rc)
except:
raise
if rc == TREXIO_SUCCESS:
return True
else:
return False
#+end_src
** Templates for front end has/read/write a dataset of strings
*** Introduction
This section concerns API calls related to datasets of strings.