Add CI dataset and the tool to generate it.

Since the dataset must be accessible from the CI runner, the best solution is probably to commit a small dataset containing only the required cycles. It's included in this commit, and can be generated by extract-from-h5.py using the same cycles list as the one used by vfc_test_h5.cpp. Moreover, the cycle number in the group name that vfc_test_h5.cpp passes to vfc_put_probe is now zero-padded, so that the exported probes sort in numerical order in the results.
2025-01-12 14:08:34 +01:00 · 2021-05-03 13:59:43 +02:00 · 2021-05-03 13:59:43 +02:00 · d81777e347
commit d81777e347
parent 18d1e2b785
3 changed files with 81 additions and 3 deletions
--- a/datasets/ci_dataset.hdf5
+++ b/datasets/ci_dataset.hdf5
--- a/tests/vfc_test_h5.cpp
+++ b/tests/vfc_test_h5.cpp
@ -19,7 +19,7 @@
 using namespace H5;
 // #define DEBUG

-const H5std_string FILE_NAME( "datasets/dataset.hdf5" );
+const H5std_string FILE_NAME( "datasets/ci_dataset.hdf5" );

 double residual_max(double * A, unsigned int Dim) {
  double max = 0.0;
@ -79,6 +79,12 @@ int test_cycle(H5File file, int cycle, std::string version, vfc_probes * probes)

  std::string group = "cycle_" + std::to_string(cycle);

+  // This will result in the same string as group but with the cycle number
+  // being zero-padded. This is used when calling vfc_put_probe later on.
+  std::string zero_padded_group = std::to_string(cycle);
+  zero_padded_group = "cycle_" +
+  std::string(5 - zero_padded_group.length(), '0') + zero_padded_group;
+
  try{
    file.openGroup(group);
  } catch(H5::Exception& e){
@ -155,8 +161,8 @@ int test_cycle(H5File file, int cycle, std::string version, vfc_probes * probes)
  showMatrix(res, dim, "Result");
 #endif

-  vfc_put_probe(probes, &(group)[0], &("res_max_" + version)[0], res_max);
-  vfc_put_probe(probes, &(group)[0], &("res2_" + version)[0], res2);
+  vfc_put_probe(probes, &(zero_padded_group)[0], &("res_max_" + version)[0], res_max);
+  vfc_put_probe(probes, &(zero_padded_group)[0], &("res2_" + version)[0], res2);

  delete [] res, updates, u, col_update_index,
            slater_matrix, slater_inverse;
--- a/tools/extract-from-h5.py
+++ b/tools/extract-from-h5.py
@ -0,0 +1,72 @@
+#!/usr/bin/env python
+import h5py
+import sys
+
+# Helper script to extract a few cycles from a large dataset. This will be
+# especially useful for the Verificarlo CI, since vfc_ci_cycles.txt can be
+# used to both extract (with this script), and read the small dataset (in a CI
+# run).
+
+
+    # Parse arguments
+
+if len(sys.argv) != 4:
+    sys.stderr.write(
+        "Error: Wrong number of arguments. Usage : extract_h5.py "\
+        "<source_dataset.hdf5> <cycles_list.txt> <destination_dataset.hdf5>\n"
+    )
+    exit(1)
+
+source_dataset_path = sys.argv[1]
+cycles_list_path = sys.argv[2]
+destination_dataset_path = sys.argv[3]
+
+
+    # Read the cycles list
+
+cycles_list = []
+
+try:
+    f = open(cycles_list_path)
+    for line in f:
+        cycles_list.extend([cycle for cycle in line.split()])
+except IOError:
+    sys.stderr.write("Error: Could not read " + cycles_list_path + "\n")
+    exit(1)
+finally:
+    f.close()
+
+
+    # Read the source dataset, and extract the cycles to the destination dataset
+
+try:
+    fs = h5py.File(source_dataset_path, "r")
+
+except IOError:
+    sys.stderr.write("Error: Could not read " + source_dataset_path + "\n")
+    exit(1)
+
+fd = h5py.File(destination_dataset_path, "w")
+
+# Copy cycles groups
+
+groups = [
+    "slater_matrix_dim",
+    "nupdates",
+    "slater_matrix",
+    "slater_inverse",
+    "col_update_index",
+    "updates"
+]
+
+for cycle in cycles_list:
+    cycle_name = "cycle_" + cycle
+
+    new_cycle = fd.create_group(cycle_name)
+
+    # Copy all datasets
+    for group_name in groups:
+        fs.copy(cycle_name + "/" + group_name, new_cycle)
+
+
+print("Dataset successfully exported to %s" % source_dataset_path)