[WIP] working write_ for chunked extensible (sparse) datasets

2025-04-28 11:24:44 +02:00 · 2021-12-08 17:26:55 +01:00 · 2021-12-08 17:26:55 +01:00 · d001844c2f
commit d001844c2f
parent ddcfff0f83
3 changed files with 229 additions and 28 deletions
--- a/src/templates_hdf5/templator_hdf5.org
+++ b/src/templates_hdf5/templator_hdf5.org
@ -372,6 +372,202 @@ trexio_hdf5_has_$group_dset$ (trexio_t* const file)
 }
   #+end_src

+** Template for HDF5 has/read/write the dataset of sparse data
+
+  Sparse data is stored in extensible HDF5 datasets. Extensibility is required
+  because the sparse data is written in chunks of user-defined size.
+
+   #+begin_src c :tangle hrw_dset_sparse_hdf5.h :exports none
+trexio_exit_code trexio_hdf5_has_$group_dset$(trexio_t* const file);
+trexio_exit_code trexio_hdf5_read_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, int32_t* const index_sparse, double* const value_sparse);
+trexio_exit_code trexio_hdf5_write_$group_dset$(trexio_t* const file, const int64_t offset_file, const int64_t size, const int64_t size_max, const int32_t* index_sparse, const double* value_sparse);
+trexio_exit_code trexio_hdf5_read_$group_dset$_size(trexio_t* const file, int64_t* const size_max);
+   #+end_src
+
+
+   #+begin_src c :tangle write_dset_sparse_hdf5.c
+/*
+ * Write one chunk of sparse indices into the extensible HDF5 dataset
+ * $GROUP_DSET$_NAME of the $group$ group.
+ *
+ * The dataset is a rank-1 array of int32 that holds 4 integers per sparse
+ * element (hard-coded for now, see the TODO below). If the dataset does not
+ * exist yet, it is created as a chunked, unlimited dataset and the chunk is
+ * written at its beginning. Otherwise the dataset is extended by one chunk and
+ * the data is written at position 4*offset_file.
+ *
+ * file         : TREXIO file handle, cast to the HDF5 back end
+ * offset_file  : offset of the chunk in the file, in sparse elements
+ * size         : number of sparse elements in the chunk (must be positive)
+ * size_max     : currently unused
+ * index_sparse : buffer with 4*size indices to be written
+ * value_sparse : currently unused (values are not written by this function)
+ *
+ * Returns TREXIO_SUCCESS on success, TREXIO_INVALID_ARG_{1,2,3} for invalid
+ * arguments, TREXIO_INVALID_ID if the existing dataset cannot be opened and
+ * TREXIO_FAILURE if any other HDF5 call fails. All HDF5 handles opened here are
+ * released on every path.
+ */
+trexio_exit_code
+trexio_hdf5_write_$group_dset$ (trexio_t* const file,
+                                const int64_t offset_file,
+                                const int64_t size,
+                                const int64_t size_max,
+                                const int32_t* index_sparse,
+                                const double* value_sparse)
+{
+
+  if (file == NULL) return TREXIO_INVALID_ARG_1;
+  /* negative values would wrap around when converted to hsize_t */
+  if (offset_file < 0) return TREXIO_INVALID_ARG_2;
+  /* HDF5 rejects chunks with a zero-sized dimension */
+  if (size <= 0) return TREXIO_INVALID_ARG_3;
+
+  trexio_hdf5_t* f = (trexio_hdf5_t*) file;
+
+  // TODO: generator (rank and number of indices per element are hard-coded)
+  const int rank = 1;
+  const hsize_t chunk_dims[1] = { (hsize_t) size * 4 };
+  const hsize_t maxdims[1] = { H5S_UNLIMITED };
+
+  /* negative means "not opened"; checked in the cleanup section */
+  hid_t dset_id = -1;
+  hid_t fspace  = -1;
+  hid_t mspace  = -1;
+  hid_t prop    = -1;
+  trexio_exit_code rc = TREXIO_FAILURE;
+
+  if ( H5LTfind_dataset(f->$group$_group, $GROUP_DSET$_NAME) != 1 ) {
+
+    /* first write: create a chunked dataset that can grow without limit */
+    fspace = H5Screate_simple(rank, chunk_dims, maxdims);
+    if (fspace < 0) goto cleanup;
+
+    prop = H5Pcreate(H5P_DATASET_CREATE);
+    if (prop < 0) goto cleanup;
+
+    if (H5Pset_chunk(prop, rank, chunk_dims) < 0) goto cleanup;
+
+    dset_id = H5Dcreate(f->$group$_group,
+                        $GROUP_DSET$_NAME,
+                        H5T_NATIVE_INT32,
+                        fspace,
+                        H5P_DEFAULT,
+                        prop,
+                        H5P_DEFAULT);
+    if (dset_id < 0) goto cleanup;
+
+    /* the initial extent equals one chunk, so the whole dataset is written */
+    if (H5Dwrite(dset_id, H5T_NATIVE_INT32, H5S_ALL, H5S_ALL,
+                 H5P_DEFAULT, index_sparse) < 0) goto cleanup;
+
+  } else {
+
+    dset_id = H5Dopen(f->$group$_group, $GROUP_DSET$_NAME, H5P_DEFAULT);
+    /* nothing else is open at this point, so a direct return leaks nothing */
+    if (dset_id <= 0) return TREXIO_INVALID_ID;
+
+    fspace = H5Dget_space(dset_id);
+    if (fspace < 0) goto cleanup;
+
+    /* verify the rank first so that the 1-element ddims cannot be overrun */
+    hsize_t ddims[1] = {0};
+    if (H5Sget_simple_extent_ndims(fspace) != rank) goto cleanup;
+    if (H5Sget_simple_extent_dims(fspace, ddims, NULL) < 0) goto cleanup;
+
+    /* extend the dataset by one chunk */
+    ddims[0] += chunk_dims[0];
+    if (H5Dset_extent(dset_id, ddims) < 0) goto cleanup;
+
+    /* the dataspace obtained before the extension still has the old extent */
+    H5Sclose(fspace);
+    fspace = H5Dget_space(dset_id);
+    if (fspace < 0) goto cleanup;
+
+    /* select the region of the file where this chunk goes */
+    const hsize_t offset[1] = { (hsize_t) offset_file * 4 };
+    if (H5Sselect_hyperslab(fspace, H5S_SELECT_SET,
+                            offset, NULL, chunk_dims, NULL) < 0) goto cleanup;
+
+    /* memory dataspace describing the user buffer */
+    mspace = H5Screate_simple(rank, chunk_dims, NULL);
+    if (mspace < 0) goto cleanup;
+
+    if (H5Dwrite(dset_id, H5T_NATIVE_INT32, mspace, fspace,
+                 H5P_DEFAULT, index_sparse) < 0) goto cleanup;
+
+  }
+
+  rc = TREXIO_SUCCESS;
+
+cleanup:
+  if (mspace  >= 0) H5Sclose(mspace);
+  if (fspace  >= 0) H5Sclose(fspace);
+  if (prop    >= 0) H5Pclose(prop);
+  if (dset_id >= 0) H5Dclose(dset_id);
+
+  return rc;
+
+}
+   #+end_src
+
+
+   #+begin_src c :tangle read_dset_sparse_hdf5.c
+/*
+ * Read one chunk of sparse indices from the HDF5 dataset $GROUP_DSET$_NAME of
+ * the $group$ group.
+ *
+ * This mirrors the layout produced by trexio_hdf5_write_$group_dset$: a rank-1
+ * int32 dataset with 4 integers per sparse element (hard-coded for now).
+ *
+ * file         : TREXIO file handle, cast to the HDF5 back end
+ * offset_file  : offset of the chunk in the file, in sparse elements
+ * size         : number of sparse elements to read (must be positive)
+ * size_max     : currently unused
+ * index_sparse : output buffer with room for 4*size indices
+ * value_sparse : currently unused (the writer does not store values)
+ *
+ * Returns TREXIO_SUCCESS on success, TREXIO_INVALID_ARG_{1,2} for invalid
+ * file/offset, TREXIO_INVALID_ARG_3 for a non-positive size or a dataset whose
+ * rank is not the expected one, TREXIO_INVALID_ID if the dataset cannot be
+ * opened and TREXIO_FAILURE if the requested window exceeds the stored data or
+ * an HDF5 call fails. All HDF5 handles opened here are released on every path.
+ */
+trexio_exit_code
+trexio_hdf5_read_$group_dset$ (trexio_t* const file,
+                               const int64_t offset_file,
+                               const int64_t size,
+                               const int64_t size_max,
+                               int32_t* const index_sparse,
+                               double* const value_sparse)
+{
+
+  if (file == NULL) return TREXIO_INVALID_ARG_1;
+  /* negative values would wrap around when converted to hsize_t */
+  if (offset_file < 0) return TREXIO_INVALID_ARG_2;
+  if (size <= 0) return TREXIO_INVALID_ARG_3;
+
+  const trexio_hdf5_t* f = (const trexio_hdf5_t*) file;
+
+  // TODO: generator (rank and number of indices per element are hard-coded)
+  const int rank = 1;
+  const hsize_t count[1]  = { (hsize_t) size * 4 };
+  const hsize_t offset[1] = { (hsize_t) offset_file * 4 };
+
+  // open the dataset to get its dimensions
+  hid_t dset_id = H5Dopen(f->$group$_group, $GROUP_DSET$_NAME, H5P_DEFAULT);
+  if (dset_id <= 0) return TREXIO_INVALID_ID;
+
+  /* negative means "not opened"; checked in the cleanup section */
+  hid_t fspace = -1;
+  hid_t mspace = -1;
+  hsize_t ddims[1] = {0};
+  trexio_exit_code rc = TREXIO_FAILURE;
+
+  // get the dataspace of the dataset
+  fspace = H5Dget_space(dset_id);
+  if (fspace < 0) goto cleanup;
+
+  /* check the rank before fetching the dimensions: ddims has one slot only */
+  if (H5Sget_simple_extent_ndims(fspace) != rank) {
+    rc = TREXIO_INVALID_ARG_3;
+    goto cleanup;
+  }
+
+  if (H5Sget_simple_extent_dims(fspace, ddims, NULL) < 0) goto cleanup;
+
+  /* the requested window has to lie entirely inside the stored data */
+  if (offset[0] + count[0] > ddims[0]) goto cleanup;
+
+  /* select the region of the file to be read */
+  if (H5Sselect_hyperslab(fspace, H5S_SELECT_SET,
+                          offset, NULL, count, NULL) < 0) goto cleanup;
+
+  /* memory dataspace describing the user buffer */
+  mspace = H5Screate_simple(rank, count, NULL);
+  if (mspace < 0) goto cleanup;
+
+  if (H5Dread(dset_id, H5T_NATIVE_INT32, mspace, fspace,
+              H5P_DEFAULT, index_sparse) < 0) goto cleanup;
+
+  rc = TREXIO_SUCCESS;
+
+cleanup:
+  if (mspace >= 0) H5Sclose(mspace);
+  if (fspace >= 0) H5Sclose(fspace);
+  H5Dclose(dset_id);
+
+  return rc;
+}
+   #+end_src
+
+
+   #+begin_src c :tangle read_dset_sparse_hdf5.c
+/*
+ * Get the number of sparse elements stored in the HDF5 dataset
+ * $GROUP_DSET$_NAME of the $group$ group.
+ *
+ * The value is derived from the extent of the rank-1 dataset produced by
+ * trexio_hdf5_write_$group_dset$, which stores 4 integers per sparse element
+ * (hard-coded for now).
+ *
+ * file     : TREXIO file handle, cast to the HDF5 back end
+ * size_max : output, number of sparse elements in the file; it is written
+ *            only when TREXIO_SUCCESS is returned
+ *
+ * Returns TREXIO_SUCCESS on success, TREXIO_INVALID_ARG_{1,2} for NULL
+ * arguments, TREXIO_INVALID_ID if the dataset cannot be opened and
+ * TREXIO_FAILURE if the dataset has an unexpected rank or an HDF5 call fails.
+ */
+trexio_exit_code
+trexio_hdf5_read_$group_dset$_size (trexio_t* const file, int64_t* const size_max)
+{
+
+  if (file == NULL) return TREXIO_INVALID_ARG_1;
+  if (size_max == NULL) return TREXIO_INVALID_ARG_2;
+
+  const trexio_hdf5_t* f = (const trexio_hdf5_t*) file;
+
+  hid_t dset_id = H5Dopen(f->$group$_group, $GROUP_DSET$_NAME, H5P_DEFAULT);
+  if (dset_id <= 0) return TREXIO_INVALID_ID;
+
+  hsize_t ddims[1] = {0};
+  trexio_exit_code rc = TREXIO_FAILURE;
+
+  hid_t fspace = H5Dget_space(dset_id);
+  if (fspace >= 0) {
+
+    /* check the rank before fetching the dimensions: ddims has one slot only */
+    if (H5Sget_simple_extent_ndims(fspace) == 1 &&
+        H5Sget_simple_extent_dims(fspace, ddims, NULL) >= 0) {
+      // TODO: generator (number of indices per element is hard-coded)
+      *size_max = (int64_t) (ddims[0] / 4);
+      rc = TREXIO_SUCCESS;
+    }
+
+    H5Sclose(fspace);
+  }
+
+  H5Dclose(dset_id);
+
+  return rc;
+}
+   #+end_src
+
+
+   #+begin_src c :tangle has_dset_sparse_hdf5.c
+/*
+ * Check whether the dataset $GROUP_DSET$_NAME is present in the $group$ group.
+ *
+ * Returns TREXIO_INVALID_ARG_1 for a NULL file, TREXIO_SUCCESS if the dataset
+ * was found, TREXIO_HAS_NOT if it was not found and TREXIO_FAILURE for any
+ * other result of the lookup.
+ */
+trexio_exit_code
+trexio_hdf5_has_$group_dset$ (trexio_t* const file)
+{
+
+  if (file == NULL) return TREXIO_INVALID_ARG_1;
+
+  trexio_hdf5_t* backend = (trexio_hdf5_t*) file;
+
+  /* H5LTfind_dataset returns 1 if dataset exists, 0 otherwise */
+  const herr_t found = H5LTfind_dataset(backend->$group$_group, $GROUP_DSET$_NAME);
+
+  switch (found) {
+    case 1:
+      return TREXIO_SUCCESS;
+    case 0:
+      return TREXIO_HAS_NOT;
+    default:
+      return TREXIO_FAILURE;
+  }
+
+}
+   #+end_src
+
 ** Template for HDF5 has/read/write the dataset of strings

   #+begin_src c :tangle hrw_dset_str_hdf5.h :exports none
@ -709,5 +905,3 @@ trexio_hdf5_has_$group_str$ (trexio_t* const file)

 #endif
  #+end_src
-
-
--- a/tools/generator.py
+++ b/tools/generator.py
@ -15,6 +15,12 @@ dsets = get_dset_dict(trex_config)
 detailed_dsets_nostr, detailed_dsets_str, detailed_dsets_sparse = split_dset_dict_detailed(dsets)
 detailed_dsets = detailed_dsets_nostr.copy()
 detailed_dsets.update(detailed_dsets_str)
+# build a big dictionary with all pre-processed data
+detailed_all = {}
+detailed_all['datasets'] = dict(detailed_dsets_nostr, **detailed_dsets_str, **detailed_dsets_sparse)
+detailed_all['groups']   = group_dict
+detailed_all['numbers']  = detailed_nums
+detailed_all['strings']  = detailed_strs
 # consistency check for dimensioning variables
 check_dim_consistency(detailed_nums, dsets)
 # --------------------------------------------------------------------------- #
@ -33,7 +39,7 @@ files_todo = get_files_todo(source_files)

 # populate files with iterative scheme, i.e. for unique functions
 for fname in files_todo['auxiliary']:
-    iterative_populate_file(fname, template_paths, group_dict, detailed_dsets, detailed_nums, detailed_strs)
+    iterative_populate_file(fname, template_paths, detailed_all)

 # populate has/read/write_num functions with recursive scheme
 for fname in files_todo['attr_num']:
--- a/tools/generator_tools.py
+++ b/tools/generator_tools.py
@ -181,17 +181,18 @@ def recursive_replace_line (input_line: str, triggers: list, source: dict) -> st
    return output_line


-def iterative_populate_file (filename: str, paths: dict, groups: dict, datasets: dict, numbers: dict, strings: dict) -> None:
+def iterative_populate_file (filename: str, paths: dict, detailed_all: dict) -> None:
    """
    Iteratively populate files with unique functions that contain templated variables.

            Parameters:
                    filename (str)          : template file to be populated
                    paths (dict)            : dictionary of paths per source directory
-                    groups (dict)           : dictionary of groups
-                    datasets (dict)         : dictionary of datasets with substitution details
-                    numbers (dict)          : dictionary of numbers with substitution details
-                    strings (dict)          : dictionary of strings with substitution details
+                    detailed_all(dict)      : dictionary with substitution details with the following keys:
+                        'groups'            : dictionary of groups with substitution details
+                        'datasets'          : dictionary of datasets with substitution details
+                        'numbers'           : dictionary of numbers with substitution details
+                        'strings'           : dictionary of strings with substitution details

            Returns:
                    None
@ -211,19 +212,19 @@ def iterative_populate_file (filename: str, paths: dict, groups: dict, datasets:
                if id == 0:
                    # special case for proper error handling when deallocting text groups
                    error_handler = '  if (rc != TREXIO_SUCCESS) return rc;\n'
-                    populated_line = iterative_replace_line(line, '$group$', groups, add_line=error_handler)
+                    populated_line = iterative_replace_line(line, '$group$', detailed_all['groups'], add_line=error_handler)
                    f_out.write(populated_line)
                elif id == 1:
-                    populated_line = iterative_replace_line(line, triggers[id], datasets, None)
+                    populated_line = iterative_replace_line(line, triggers[id], detailed_all['datasets'], None)
                    f_out.write(populated_line)
                elif id == 2:
-                    populated_line = iterative_replace_line(line, triggers[id], numbers, None)
+                    populated_line = iterative_replace_line(line, triggers[id], detailed_all['numbers'], None)
                    f_out.write(populated_line)
                elif id == 3:
-                    populated_line = iterative_replace_line(line, triggers[id], strings, None)
+                    populated_line = iterative_replace_line(line, triggers[id], detailed_all['strings'], None)
                    f_out.write(populated_line)
                elif id == 4:
-                    populated_line = iterative_replace_line(line, triggers[id], groups, None)
+                    populated_line = iterative_replace_line(line, triggers[id], detailed_all['groups'], None)
                    f_out.write(populated_line)
                else:
                    f_out.write(line)