Unify handling of collective metadata reads status (#1206) (#1417)

author: jhendersonHDF <jhenderson@hdfgroup.org> 2022-02-03 22:19:38 (GMT)
committer: GitHub <noreply@github.com> 2022-02-03 22:19:38 (GMT)
commit: 1c9f219463e236487a27cee6cb839379d670cda4 (patch)
tree: cb696ac5138940d8c22a376654a80c67b9266ba0 /testpar
parent: 331cf6926d5f07a1edb75674299f9787d79d1d68 (diff)
download: hdf5-1c9f219463e236487a27cee6cb839379d670cda4.zip
hdf5-1c9f219463e236487a27cee6cb839379d670cda4.tar.gz
hdf5-1c9f219463e236487a27cee6cb839379d670cda4.tar.bz2
2 files changed, 77 insertions, 56 deletions
diff --git a/testpar/t_cache.c b/testpar/t_cache.c
index 263b467..39c5721 100644
--- a/testpar/t_cache.c
+++ b/testpar/t_cache.c
@@ -6617,13 +6617,15 @@ trace_file_check(int metadata_write_strategy)
 static hbool_t
 smoke_check_6(int metadata_write_strategy)
 {
-    hbool_t       success = TRUE;
-    int           i;
-    int           max_nerrors;
-    hid_t         fid       = -1;
-    H5F_t *       file_ptr  = NULL;
-    H5C_t *       cache_ptr = NULL;
-    struct mssg_t mssg;
+    H5P_coll_md_read_flag_t md_reads_file_flag;
+    hbool_t                 md_reads_context_flag;
+    hbool_t                 success = TRUE;
+    int                     i;
+    int                     max_nerrors;
+    hid_t                   fid       = -1;
+    H5F_t *                 file_ptr  = NULL;
+    H5C_t *                 cache_ptr = NULL;
+    struct mssg_t           mssg;
 
     switch (metadata_write_strategy) {
 
@@ -6679,7 +6681,9 @@ smoke_check_6(int metadata_write_strategy)
         virt_num_data_entries = NUM_DATA_ENTRIES;
 
         /* insert the first half collectively */
-        H5CX_set_coll_metadata_read(TRUE);
+        md_reads_file_flag    = H5P_USER_TRUE;
+        md_reads_context_flag = TRUE;
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
         for (i = 0; i < virt_num_data_entries / 2; i++) {
             struct datum *entry_ptr;
             entry_ptr = &(data[i]);
@@ -6698,9 +6702,13 @@ smoke_check_6(int metadata_write_strategy)
             H5_CHECK_OVERFLOW(cache_ptr->max_cache_size, size_t, double);
             HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
         }
+        /* Restore collective metadata reads state */
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
 
         /* insert the other half independently */
-        H5CX_set_coll_metadata_read(FALSE);
+        md_reads_file_flag    = H5P_USER_FALSE;
+        md_reads_context_flag = FALSE;
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
         for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
             struct datum *entry_ptr;
             entry_ptr = &(data[i]);
@@ -6718,6 +6726,8 @@ smoke_check_6(int metadata_write_strategy)
             /* Make sure coll entries do not cross the 80% threshold */
             HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
         }
+        /* Restore collective metadata reads state */
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
 
         /* flush the file */
         if (H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0) {
@@ -6728,7 +6738,9 @@ smoke_check_6(int metadata_write_strategy)
         }
 
         /* Protect the first half of the entries collectively */
-        H5CX_set_coll_metadata_read(TRUE);
+        md_reads_file_flag    = H5P_USER_TRUE;
+        md_reads_context_flag = TRUE;
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
         for (i = 0; i < (virt_num_data_entries / 2); i++) {
             struct datum *entry_ptr;
             entry_ptr = &(data[i]);
@@ -6746,9 +6758,13 @@ smoke_check_6(int metadata_write_strategy)
             /* Make sure coll entries do not cross the 80% threshold */
             HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
         }
+        /* Restore collective metadata reads state */
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
 
         /* protect the other half independently */
-        H5CX_set_coll_metadata_read(FALSE);
+        md_reads_file_flag    = H5P_USER_FALSE;
+        md_reads_context_flag = FALSE;
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
         for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
             struct datum *entry_ptr;
             entry_ptr = &(data[i]);
@@ -6766,6 +6782,8 @@ smoke_check_6(int metadata_write_strategy)
             /* Make sure coll entries do not cross the 80% threshold */
             HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
         }
+        /* Restore collective metadata reads state */
+        H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
 
         for (i = 0; i < (virt_num_data_entries); i++) {
             unlock_entry(file_ptr, i, H5AC__NO_FLAGS_SET);
diff --git a/testpar/t_coll_md_read.c b/testpar/t_coll_md_read.c
index fd62eb6..cabdea0 100644
--- a/testpar/t_coll_md_read.c
+++ b/testpar/t_coll_md_read.c
@@ -34,10 +34,9 @@
 
 #define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2
 
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE  20000
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE   1
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS         1
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM 10000
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME    "linked_chunk_io_sort_chunk_issue"
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS            1
 
 /*
  * A test for issue HDFFV-10501. A parallel hang was reported which occurred
@@ -339,21 +338,34 @@ test_multi_chunk_io_addrmap_issue(void)
  * collective metadata reads being made only by process 0 in H5D__sort_chunk().
  *
  * NOTE: Due to the way that the threshold value which pertains to this test
- * is currently calculated within HDF5, there are several conditions that this
- * test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for
- * a better idea of why.
+ * is currently calculated within HDF5, the following two conditions must be
+ * true to trigger the issue:
  *
- * Condition 1: We need to make sure that the test always selects every single
- * chunk in the dataset. It is fine if the selection is split up among multiple
- * ranks, but their combined selection must cover the whole dataset.
+ * Condition 1: A certain threshold ratio must be met in order to have HDF5
+ * obtain all chunk addresses collectively inside H5D__sort_chunk(). This is
+ * given by the following:
  *
- * Condition 2: The number of chunks in the dataset divided by the number of MPI
- * ranks must exceed or equal 10000. In other words, each MPI rank must be
- * responsible for 10000 or more unique chunks.
+ *     (sum_chunk * 100) / (dataset_nchunks * mpi_size) >= 30%
  *
- * Condition 3: This test will currently only be reliably reproducable for 2 or 3
- * MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size,
- * and is compared against a default value of 30%.
+ * where:
+ *     * `sum_chunk` is the combined sum of the number of chunks selected in
+ *       the dataset by all ranks (chunks selected by more than one rank count
+ *       individually toward the sum for each rank selecting that chunk)
+ *     * `dataset_nchunks` is the number of chunks in the dataset (selected
+ *       or not)
+ *     * `mpi_size` is the size of the MPI Communicator
+ *
+ * Condition 2: `sum_chunk` divided by `mpi_size` must be greater than or equal
+ * to a certain threshold (as of this writing, 10000).
+ *
+ * To satisfy both these conditions, we #define a macro,
+ * LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM, which corresponds to the
+ * value of the H5D_ALL_CHUNK_ADDR_THRES_COL_NUM macro in H5Dmpio.c (the
+ * 10000 threshold from condition 2). We then create a dataset of that many
+ * chunks and have each MPI rank write to and read from a piece of every single
+ * chunk in the dataset. This ensures chunk utilization is the max possible
+ * and exceeds our 30% target ratio, while always exactly matching the numeric
+ * chunk threshold value of condition 2.
  *
  * Failure in this test may either cause a hang, or, due to how the MPI calls
  * pertaining to this issue might mistakenly match up, may cause an MPI error
@@ -375,10 +387,9 @@ void
 test_link_chunk_io_sort_chunk_issue(void)
 {
     const char *filename;
-    hsize_t *   dataset_dims = NULL;
-    hsize_t     max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
-    hsize_t     sel_dims[1];
-    hsize_t     chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = {LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS};
+    hsize_t     dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+    hsize_t     sel_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+    hsize_t     chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
     hsize_t     start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
     hsize_t     stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
     hsize_t     count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
@@ -412,14 +423,13 @@ test_link_chunk_io_sort_chunk_issue(void)
     file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
     VRFY((file_id >= 0), "H5Fcreate succeeded");
 
-    dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims));
-    VRFY((dataset_dims != NULL), "malloc succeeded");
-
-    dataset_dims[0] = (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * (hsize_t)mpi_size *
-                      (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE;
-    max_dataset_dims[0] = H5S_UNLIMITED;
+    /*
+     * Create a one-dimensional dataset of exactly LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM
+     * chunks, where every rank writes to a piece of every single chunk to keep utilization high.
+     */
+    dataset_dims[0] = (hsize_t)mpi_size * (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
 
-    fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims);
+    fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, NULL);
     VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
 
     /*
@@ -428,6 +438,9 @@ test_link_chunk_io_sort_chunk_issue(void)
     dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
     VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
 
+    /* Chunk size is equal to MPI size since each rank writes to a piece of every chunk */
+    chunk_dims[0] = (hsize_t)mpi_size;
+
     VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0),
          "H5Pset_chunk succeeded");
 
@@ -437,23 +450,21 @@ test_link_chunk_io_sort_chunk_issue(void)
 
     /*
      * Setup hyperslab selection to split the dataset among the ranks.
-     *
-     * The ranks will write rows across the dataset.
      */
-    stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
-    count[0]  = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / (hsize_t)mpi_size;
-    start[0]  = count[0] * (hsize_t)mpi_rank;
-    block[0]  = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
+    start[0]  = (hsize_t)mpi_rank;
+    stride[0] = (hsize_t)mpi_size;
+    count[0]  = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
+    block[0]  = 1;
 
     VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
          "H5Sselect_hyperslab succeeded");
 
-    sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
+    sel_dims[0] = count[0];
 
     mspace_id = H5Screate_simple(1, sel_dims, NULL);
     VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
 
-    data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+    data = HDcalloc(1, count[0] * sizeof(int));
     VRFY((data != NULL), "calloc succeeded");
 
     dxpl_id = H5Pcreate(H5P_DATASET_XFER);
@@ -476,33 +487,25 @@ test_link_chunk_io_sort_chunk_issue(void)
     VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0),
          "H5Pset_dxpl_mpio_chunk_opt succeeded");
 
-    read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+    read_buf = HDmalloc(count[0] * sizeof(int));
     VRFY((read_buf != NULL), "malloc succeeded");
 
     VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
          "H5Sselect_hyperslab succeeded");
 
-    sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
+    sel_dims[0] = count[0];
 
     VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
 
     mspace_id = H5Screate_simple(1, sel_dims, NULL);
     VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
 
-    read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
-    VRFY((read_buf != NULL), "realloc succeeded");
-
     /*
      * Finally have each rank read their section of data back from the dataset.
      */
     VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0),
          "H5Dread succeeded");
 
-    if (dataset_dims) {
-        HDfree(dataset_dims);
-        dataset_dims = NULL;
-    }
-
     if (data) {
         HDfree(data);
         data = NULL;
author	jhendersonHDF <jhenderson@hdfgroup.org>	2022-02-03 22:19:38 (GMT)
committer	GitHub <noreply@github.com>	2022-02-03 22:19:38 (GMT)
commit	1c9f219463e236487a27cee6cb839379d670cda4 (patch)
tree	cb696ac5138940d8c22a376654a80c67b9266ba0 /testpar
parent	331cf6926d5f07a1edb75674299f9787d79d1d68 (diff)
download	hdf5-1c9f219463e236487a27cee6cb839379d670cda4.zip hdf5-1c9f219463e236487a27cee6cb839379d670cda4.tar.gz hdf5-1c9f219463e236487a27cee6cb839379d670cda4.tar.bz2