From 9c9f70ed695a0022a55bf53d085b7540ae5b5281 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Tue, 16 Aug 2022 16:09:24 -0500 Subject: [1.10 Merge] Hdf5 1 10 develop merges (#2013) * Fix invalid comment about character encoding in H5Fint.c (#1845) * Convert assertion on (possibly corrupt) file contents to normal error check (#1861) * Convert assertion on (possibly corrupt) file contents to normal error check * Committing clang-format changes Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> * Avoid allocating chunk map for contiguous and compact dataset I/O (#1927) * Add documentation for parallel compression feature (#1981) Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- doc/parallel-compression.md | 313 ++++++++++++++++++++++++++++++++++++++++++++ release_docs/RELEASE.txt | 11 +- src/H5Dio.c | 20 ++- src/H5Fint.c | 7 +- src/H5Fsuper.c | 4 +- 5 files changed, 346 insertions(+), 9 deletions(-) create mode 100644 doc/parallel-compression.md diff --git a/doc/parallel-compression.md b/doc/parallel-compression.md new file mode 100644 index 0000000..afd9903 --- /dev/null +++ b/doc/parallel-compression.md @@ -0,0 +1,313 @@ +# HDF5 Parallel Compression + +## Introduction + +When an HDF5 dataset is created, the application can specify +optional data filters to be applied to the dataset (as long as +the dataset uses a chunked data layout). These filters may +perform compression, shuffling, checksumming/error detection +and more on the dataset data. The filters are added to a filter +pipeline for the dataset and are automatically applied to the +data during dataset writes and reads. + +Prior to the HDF5 1.10.2 release, a parallel HDF5 application +could read datasets with filters applied to them, but could +not write to those datasets in parallel. The datasets would +have to first be written in a serial HDF5 application or from +a single MPI rank in a parallel HDF5 application. This +restriction was in place because: + + - Updating the data in filtered datasets requires management + of file metadata, such as the dataset's chunk index and file + space for data chunks, which must be done collectively in + order for MPI ranks to have a consistent view of the file. + At the time, HDF5 lacked the collective coordination of + this metadata management. + + - When multiple MPI ranks are writing independently to the + same chunk in a dataset (even if their selected portions of + the chunk don't overlap), the whole chunk has to be read, + unfiltered, modified, re-filtered and then written back to + disk. This read-modify-write style of operation would cause + conflicts among the MPI ranks and lead to an inconsistent + view of the file. + +Introduced in the HDF5 1.10.2 release, the parallel compression +feature allows an HDF5 application to write in parallel to +datasets with filters applied to them, as long as collective +I/O is used. The feature introduces new internal infrastructure +that coordinates the collective management of the file metadata +between MPI ranks during dataset writes. It also accounts for +multiple MPI ranks writing to a chunk by assigning ownership to +one of the MPI ranks, at which point the other MPI ranks send +their modifications to the owning MPI rank. + +The parallel compression feature is always enabled when HDF5 +is built with parallel enabled, but the feature may be disabled +if the necessary MPI-3 routines are not available. 
Therefore,
+HDF5 conditionally defines the macro `H5_HAVE_PARALLEL_FILTERED_WRITES`,
+which an application can check for to see whether the feature is
+available.
+
+## Examples
+
+Using the parallel compression feature is very similar to using
+compression in serial HDF5, except that dataset writes **must**
+be collective:
+
+```
+hid_t dxpl_id = H5Pcreate(H5P_DATASET_XFER);
+H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE);
+H5Dwrite(..., dxpl_id, ...);
+```
+
+The following are two simple examples of using the parallel compression
+feature:
+
+[ph5_filtered_writes.c](https://github.com/HDFGroup/hdf5/blob/develop/examples/ph5_filtered_writes.c)
+
+[ph5_filtered_writes_no_sel.c](https://github.com/HDFGroup/hdf5/blob/develop/examples/ph5_filtered_writes_no_sel.c)
+
+The former contains simple examples of using the parallel
+compression feature to write to compressed datasets, while the
+latter contains an example of how to write to compressed datasets
+when one or more MPI ranks have no data to write to a dataset.
+Remember that the feature requires these writes to use collective
+I/O, so the MPI ranks which have nothing to contribute must still
+participate in the collective write call.
+
+## Incremental file space allocation support
+
+HDF5's [file space allocation time](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALLOC_TIME)
+is a dataset creation property that can have significant effects
+on application performance, especially if the application uses
+parallel HDF5. In a serial HDF5 application, the default file space
+allocation time for chunked datasets is "incremental". This means
+that allocation of space in the HDF5 file for data chunks is
+deferred until data is first written to those chunks. In parallel
+HDF5, the file space allocation time was previously always forced
+to "early", which allocates space in the file for all of a dataset's
+data chunks at creation time (or during the first open of a dataset
+if it was created serially). This would ensure that all the necessary
+file space was allocated so MPI ranks could perform independent I/O
+operations on a dataset without needing further coordination of file
+metadata as described previously.
+
+While this strategy has worked in the past, it has some noticeable
+drawbacks. For one, the larger the chunked dataset being created,
+the more noticeable overhead there will be during dataset creation
+as all of the data chunks are being allocated in the HDF5 file.
+Further, these data chunks will, by default, be [filled](https://portal.hdfgroup.org/display/HDF5/H5P_SET_FILL_VALUE)
+with HDF5's default fill data value, leading to considerable
+dataset creation overhead and resulting in pre-filling large
+portions of a dataset that the application might have been planning
+to overwrite anyway. Even worse, there will be more initial overhead
+from compressing that fill data before writing it out, only to have
+it read back in, unfiltered and modified the first time a chunk is
+written to. In the past, it was typically suggested that parallel
+HDF5 applications should use [H5Pset_fill_time](https://portal.hdfgroup.org/display/HDF5/H5P_SET_FILL_TIME)
+with a value of `H5D_FILL_TIME_NEVER` in order to disable writing of
+the fill value to dataset chunks, but this isn't ideal if the
+application actually wishes to make use of fill values.
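+
+For reference, a minimal sketch of that legacy workaround might look
+like the following. This is an illustration only; the `chunk_dims`,
+`file_id` and `space_id` variables are hypothetical placeholders the
+application would have set up, and deflate is just an example filter.
+
+```
+hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+H5Pset_chunk(dcpl_id, 2, chunk_dims);  /* chunked layout is required for filters */
+H5Pset_deflate(dcpl_id, 6);            /* example filter: deflate (gzip) level 6 */
+/* Legacy workaround: never write fill values to dataset chunks */
+H5Pset_fill_time(dcpl_id, H5D_FILL_TIME_NEVER);
+hid_t dset_id = H5Dcreate2(file_id, "dataset", H5T_NATIVE_INT, space_id,
+                           H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+```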
+ +With [improvements made](https://www.hdfgroup.org/2022/03/parallel-compression-improvements-in-hdf5-1-13-1/) +to the parallel compression feature for the HDF5 1.13.1 release +(and backported to the HDF5 1.10 branch), "incremental" file space +allocation is now the default for datasets created in parallel *only if they have filters applied to them*. +"Early" file space allocation is still supported for these datasets +if desired and is still forced for datasets created in parallel that +do *not* have filters applied to them. This change should significantly +reduce the overhead of creating filtered datasets in parallel HDF5 +applications and should be helpful to applications that wish to +use a fill value for these datasets. It should also help significantly +reduce the size of the HDF5 file, as file space for the data chunks +is allocated as needed rather than all at once. + +## Performance Considerations + +Since getting good performance out of HDF5's parallel compression +feature involves several factors, the following is a list of +performance considerations (generally from most to least important) +and best practices to take into account when trying to get the +optimal performance out of the parallel compression feature. + +### Begin with a good chunking strategy + +[Starting with a good chunking strategy](https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5) +will generally have the largest impact on overall application +performance. The different chunking parameters can be difficult +to fine-tune, but it is essential to start with a well-performing +chunking layout before adding compression and parallel I/O into +the mix. Compression itself adds overhead and may have side +effects that necessitate further adjustment of the chunking +parameters and HDF5 application settings. Consider that the +chosen chunk size becomes a very important factor when compression +is involved, as data chunks have to be completely read and +re-written to perform partial writes to the chunk. + +[Improving I/O performance with HDF5 compressed datasets](http://portal.hdfgroup.org/display/HDF5/Improving+IO+Performance+When+Working+with+HDF5+Compressed+Datasets) +is a useful reference for more information on getting good +performance when using a chunked dataset layout. + +### Avoid chunk sharing + +Since the parallel compression feature has to assign ownership +of data chunks to a single MPI rank in order to avoid the +previously described read-modify-write issue, an HDF5 application +may need to take care when determining how a dataset will be +divided up among the MPI ranks writing to it. Each dataset data +chunk that is written to by more than 1 MPI rank will incur extra +MPI overhead as one of the ranks takes ownership and the other +ranks send it their data and information about where in the chunk +that data belongs. While not always possible to do, an HDF5 +application will get the best performance out of parallel compression +if it can avoid writing in a way that causes more than 1 MPI rank +to write to any given data chunk in a dataset. + +### Collective metadata operations + +The parallel compression feature typically works with a significant +amount of metadata related to the management of the data chunks +in datasets. In initial performance results gathered from various +HPC machines, it was found that the parallel compression feature +did not scale well at around 8k MPI ranks and beyond. 
On further
+investigation, it became obvious that the bottleneck was due to
+heavy filesystem pressure from the metadata management for dataset
+data chunks as they changed size (as a result of data compression)
+and moved around in the HDF5 file.
+
+Enabling collective metadata operations in the HDF5 application
+(as in the snippet below) showed a significant improvement in
+performance and scalability and is generally recommended unless
+it measurably degrades the application's performance.
+
+```
+...
+hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
+H5Pset_all_coll_metadata_ops(fapl_id, 1);
+H5Pset_coll_metadata_write(fapl_id, 1);
+hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+...
+```
+
+### Align chunks in the file
+
+The natural layout of an HDF5 file may cause dataset data
+chunks to end up at addresses in the file that do not align
+well with the underlying file system, possibly leading to
+poor performance. As an example, Lustre performance is generally
+good when writes are aligned with the chosen stripe size.
+The HDF5 application can use [H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT)
+to have a bit more control over where objects in the HDF5
+file end up. However, do note that setting the alignment
+of objects generally wastes space in the file and has the
+potential to dramatically increase its resulting size, so
+caution should be used when choosing the alignment parameters.
+
+[H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT)
+has two parameters that control the alignment of objects in
+the HDF5 file, the "threshold" value and the "alignment"
+value. The threshold value specifies that any object greater
+than or equal in size to that value will be aligned in the
+file at addresses which are multiples of the chosen alignment
+value. While a threshold value of 0 can be specified to align
+every object in the file according to the alignment value,
+this isn't generally recommended, as it will likely waste an
+excessive amount of space in the file.
+
+In the example below, the chosen dataset chunk size is
+provided for the threshold value and 1MiB is specified for
+the alignment value. Assuming that 1MiB is an optimal
+alignment value (e.g., assuming that it matches well with
+the Lustre stripe size), this should cause dataset data
+chunks to be well-aligned and generally give good write
+performance.
+
+```
+hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
+/* Assuming Lustre stripe size is 1MiB, align data chunks
+   in the file to address multiples of 1MiB. */
+H5Pset_alignment(fapl_id, dataset_chunk_size, 1048576);
+hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+```
+
+### File free space managers
+
+As data chunks in a dataset get written to and compressed,
+they can change in size and be relocated in the HDF5 file.
+Since parallel compression usually involves many data chunks
+in a file, this can create significant amounts of free space
+in the file over its lifetime and eventually cause performance
+issues.
+ +An HDF5 application can use [H5Pset_file_space_strategy](http://portal.hdfgroup.org/display/HDF5/H5P_SET_FILE_SPACE_STRATEGY) +with a value of `H5F_FSPACE_STRATEGY_PAGE` to enable the paged +aggregation feature, which can accumulate metadata and raw +data for dataset data chunks into well-aligned, configurably +sized "pages" for better performance. However, note that using +the paged aggregation feature will cause any setting from +[H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT) +to be ignored. While an application should be able to get +comparable performance effects by [setting the size of these pages](http://portal.hdfgroup.org/display/HDF5/H5P_SET_FILE_SPACE_PAGE_SIZE) to be equal to the value that +would have been set for [H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT), +this may not necessarily be the case and should be studied. + +Note that [H5Pset_file_space_strategy](http://portal.hdfgroup.org/display/HDF5/H5P_SET_FILE_SPACE_STRATEGY) +has a `persist` parameter. This determines whether or not the +file free space manager should include extra metadata in the +HDF5 file about free space sections in the file. If this +parameter is `false`, any free space in the HDF5 file will +become unusable once the HDF5 file is closed. For parallel +compression, it's generally recommended that `persist` be set +to `true`, as this will keep better track of file free space +for data chunks between accesses to the HDF5 file. + +``` +hid_t fcpl_id = H5Pcreate(H5P_FILE_CREATE); +/* Use persistent free space manager with paged aggregation */ +H5Pset_file_space_strategy(fcpl_id, H5F_FSPACE_STRATEGY_PAGE, 1, 1); +/* Assuming Lustre stripe size is 1MiB, set page size to that */ +H5Pset_file_space_page_size(fcpl_id, 1048576); +... +hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, fcpl_id, fapl_id); +``` + +### Low-level collective vs. independent I/O + +While the parallel compression feature requires that the HDF5 +application set and maintain collective I/O at the application +interface level (via [H5Pset_dxpl_mpio](https://portal.hdfgroup.org/display/HDF5/H5P_SET_DXPL_MPIO)), +it does not require that the actual MPI I/O that occurs at +the lowest layers of HDF5 be collective; independent I/O may +perform better depending on the application I/O patterns and +parallel file system performance, among other factors. The +application may use [H5Pset_dxpl_mpio_collective_opt](https://portal.hdfgroup.org/display/HDF5/H5P_SET_DXPL_MPIO_COLLECTIVE_OPT) +to control this setting and see which I/O method provides the +best performance. + +``` +hid_t dxpl_id = H5Pcreate(H5P_DATASET_XFER); +H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE); +H5Pset_dxpl_mpio_collective_opt(dxpl_id, H5FD_MPIO_INDIVIDUAL_IO); /* Try independent I/O */ +H5Dwrite(..., dxpl_id, ...); +``` + +### Runtime HDF5 Library version + +An HDF5 application can use the [H5Pset_libver_bounds](http://portal.hdfgroup.org/display/HDF5/H5P_SET_LIBVER_BOUNDS) +routine to set the upper and lower bounds on library versions +to use when creating HDF5 objects. For parallel compression +specifically, setting the library version to the latest available +version can allow access to better/more efficient chunk indexing +types and data encoding methods. For example: + +``` +... +hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS); +H5Pset_libver_bounds(fapl_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST); +hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); +... 
+``` diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 08cec29..e21c391 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -163,7 +163,16 @@ Bug Fixes since HDF5-1.10.9 release =================================== Library ------- - - + - Converted an assertion on (possibly corrupt) file contents to a normal + error check + + Previously, the library contained an assertion check that a read superblock + doesn't contain a superblock extension message when the superblock + version < 2. When a corrupt HDF5 file is read, this assertion can be triggered + in debug builds of HDF5. In production builds, this situation could cause + either a library error or a crash, depending on the platform. + + (JTH - 2022/07/08, HDFFV-11316, HDFFV-11317) Java Library diff --git a/src/H5Dio.c b/src/H5Dio.c index 0236eb0..22aacb0 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -405,6 +405,7 @@ H5D__read(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_space H5D_chunk_map_t *fm = NULL; /* Chunk file<->memory mapping */ H5D_io_info_t io_info; /* Dataset I/O info */ H5D_type_info_t type_info; /* Datatype info for operation */ + H5D_layout_t layout_type; /* Dataset's layout type (contig, chunked, compact, etc.) */ hbool_t type_info_init = FALSE; /* Whether the datatype info has been initialized */ H5S_t *projected_mem_space = NULL; /* If not NULL, ptr to dataspace containing a */ /* projection of the supplied mem_space to a new */ @@ -436,6 +437,8 @@ H5D__read(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_space mem_space = file_space; nelmts = H5S_GET_SELECT_NPOINTS(mem_space); + layout_type = dataset->shared->layout.type; + /* Set up datatype info for operation */ if (H5D__typeinfo_init(dataset, mem_type_id, FALSE, &type_info) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to set up type info") @@ -560,11 +563,13 @@ H5D__read(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_space HDassert((*dataset->shared->layout.ops->is_space_alloc)(&dataset->shared->layout.storage) || (dataset->shared->layout.ops->is_data_cached && (*dataset->shared->layout.ops->is_data_cached)(dataset->shared)) || - dataset->shared->dcpl_cache.efl.nused > 0 || dataset->shared->layout.type == H5D_COMPACT); + dataset->shared->dcpl_cache.efl.nused > 0 || layout_type == H5D_COMPACT); /* Allocate the chunk map */ - if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) - HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + if (H5D_CONTIGUOUS != layout_type && H5D_COMPACT != layout_type) { + if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + } /* Call storage method's I/O initialization routine */ if (io_info.layout_ops.io_init && @@ -620,6 +625,7 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac H5D_chunk_map_t *fm = NULL; /* Chunk file<->memory mapping */ H5D_io_info_t io_info; /* Dataset I/O info */ H5D_type_info_t type_info; /* Datatype info for operation */ + H5D_layout_t layout_type; /* Dataset's layout type (contig, chunked, compact, etc.) 
*/ hbool_t type_info_init = FALSE; /* Whether the datatype info has been initialized */ hbool_t should_alloc_space = FALSE; /* Whether or not to initialize dataset's storage */ H5S_t *projected_mem_space = NULL; /* If not NULL, ptr to dataspace containing a */ @@ -646,6 +652,8 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac /* check args */ HDassert(dataset && dataset->oloc.file); + layout_type = dataset->shared->layout.type; + /* All filters in the DCPL must have encoding enabled. */ if (!dataset->shared->checked_filters) { if (H5Z_can_apply(dataset->shared->dcpl_id, dataset->shared->type_id) < 0) @@ -792,8 +800,10 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac } /* end if */ /* Allocate the chunk map */ - if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) - HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + if (H5D_CONTIGUOUS != layout_type && H5D_COMPACT != layout_type) { + if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + } /* Call storage method's I/O initialization routine */ if (io_info.layout_ops.io_init && diff --git a/src/H5Fint.c b/src/H5Fint.c index e52cb91..8190227 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -2528,10 +2528,13 @@ H5F__build_actual_name(const H5F_t *f, const H5P_genplist_t *fapl, const char *n if (NULL == (new_fapl = (H5P_genplist_t *)H5I_object(new_fapl_id))) HGOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL, "can't get property list") - /* Set the character encoding on the new property list */ + /* + * Set the private property for retrieving the backing store + * POSIX file descriptor from the Core VFD + */ want_posix_fd = TRUE; if (H5P_set(new_fapl, H5F_ACS_WANT_POSIX_FD_NAME, &want_posix_fd) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set character encoding") + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set property for retrieving file descriptor") /* Retrieve the file handle */ if (H5F_get_vfd_handle(f, new_fapl_id, (void **)&fd) < 0) diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c index 093121b..4c4f087 100644 --- a/src/H5Fsuper.c +++ b/src/H5Fsuper.c @@ -695,7 +695,9 @@ H5F__super_read(H5F_t *f, H5P_genplist_t *fa_plist, hbool_t initial_read) /* Sanity check - superblock extension should only be defined for * superblock version >= 2. */ - HDassert(sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2); + if (sblock->super_vers < HDF5_SUPERBLOCK_VERSION_2) + HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "invalid superblock - extension message should not be defined for version < 2") /* Check for superblock extension being located "outside" the stored * 'eoa' value, which can occur with the split/multi VFD. -- cgit v0.12