From 9c9f70ed695a0022a55bf53d085b7540ae5b5281 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Tue, 16 Aug 2022 16:09:24 -0500 Subject: [1.10 Merge] Hdf5 1 10 develop merges (#2013) * Fix invalid comment about character encoding in H5Fint.c (#1845) * Convert assertion on (possibly corrupt) file contents to normal error check (#1861) * Convert assertion on (possibly corrupt) file contents to normal error check * Committing clang-format changes Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> * Avoid allocating chunk map for contiguous and compact dataset I/O (#1927) * Add documentation for parallel compression feature (#1981) Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- doc/parallel-compression.md | 313 ++++++++++++++++++++++++++++++++++++++++++++ release_docs/RELEASE.txt | 11 +- src/H5Dio.c | 20 ++- src/H5Fint.c | 7 +- src/H5Fsuper.c | 4 +- 5 files changed, 346 insertions(+), 9 deletions(-) create mode 100644 doc/parallel-compression.md diff --git a/doc/parallel-compression.md b/doc/parallel-compression.md new file mode 100644 index 0000000..afd9903 --- /dev/null +++ b/doc/parallel-compression.md @@ -0,0 +1,313 @@ +# HDF5 Parallel Compression + +## Introduction + +When an HDF5 dataset is created, the application can specify +optional data filters to be applied to the dataset (as long as +the dataset uses a chunked data layout). These filters may +perform compression, shuffling, checksumming/error detection +and more on the dataset data. The filters are added to a filter +pipeline for the dataset and are automatically applied to the +data during dataset writes and reads. + +Prior to the HDF5 1.10.2 release, a parallel HDF5 application +could read datasets with filters applied to them, but could +not write to those datasets in parallel. The datasets would +have to first be written in a serial HDF5 application or from +a single MPI rank in a parallel HDF5 application. This +restriction was in place because: + + - Updating the data in filtered datasets requires management + of file metadata, such as the dataset's chunk index and file + space for data chunks, which must be done collectively in + order for MPI ranks to have a consistent view of the file. + At the time, HDF5 lacked the collective coordination of + this metadata management. + + - When multiple MPI ranks are writing independently to the + same chunk in a dataset (even if their selected portions of + the chunk don't overlap), the whole chunk has to be read, + unfiltered, modified, re-filtered and then written back to + disk. This read-modify-write style of operation would cause + conflicts among the MPI ranks and lead to an inconsistent + view of the file. + +Introduced in the HDF5 1.10.2 release, the parallel compression +feature allows an HDF5 application to write in parallel to +datasets with filters applied to them, as long as collective +I/O is used. The feature introduces new internal infrastructure +that coordinates the collective management of the file metadata +between MPI ranks during dataset writes. It also accounts for +multiple MPI ranks writing to a chunk by assigning ownership to +one of the MPI ranks, at which point the other MPI ranks send +their modifications to the owning MPI rank. + +The parallel compression feature is always enabled when HDF5 +is built with parallel enabled, but the feature may be disabled +if the necessary MPI-3 routines are not available. 
Therefore,
+HDF5 conditionally defines the macro `H5_HAVE_PARALLEL_FILTERED_WRITES`,
+which an application can check for to see whether the feature is
+available.
+
+## Examples
+
+Using the parallel compression feature is very similar to using
+compression in serial HDF5, except that dataset writes **must**
+be collective:
+
+```
+hid_t dxpl_id = H5Pcreate(H5P_DATASET_XFER);
+H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE);
+H5Dwrite(..., dxpl_id, ...);
+```
+
+The following are two simple examples of using the parallel compression
+feature:
+
+[ph5_filtered_writes.c](https://github.com/HDFGroup/hdf5/blob/develop/examples/ph5_filtered_writes.c)
+
+[ph5_filtered_writes_no_sel.c](https://github.com/HDFGroup/hdf5/blob/develop/examples/ph5_filtered_writes_no_sel.c)
+
+The former contains simple examples of using the parallel
+compression feature to write to compressed datasets, while the
+latter contains an example of how to write to compressed datasets
+when one or more MPI ranks have no data to write to a dataset.
+Remember that the feature requires these writes to use collective
+I/O, so the MPI ranks which have nothing to contribute must still
+participate in the collective write call.
+
+## Incremental file space allocation support
+
+HDF5's [file space allocation time](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALLOC_TIME)
+is a dataset creation property that can have significant effects
+on application performance, especially if the application uses
+parallel HDF5. In a serial HDF5 application, the default file space
+allocation time for chunked datasets is "incremental". This means
+that allocation of space in the HDF5 file for data chunks is
+deferred until data is first written to those chunks. In parallel
+HDF5, the file space allocation time was previously always forced
+to "early", which allocates space in the file for all of a dataset's
+data chunks at creation time (or during the first open of a dataset
+if it was created serially). This would ensure that all the necessary
+file space was allocated so MPI ranks could perform independent I/O
+operations on a dataset without needing further coordination of file
+metadata as described previously.
+
+While this strategy has worked in the past, it has some noticeable
+drawbacks. For one, the larger the chunked dataset being created,
+the more noticeable overhead there will be during dataset creation
+as all of the data chunks are being allocated in the HDF5 file.
+Further, these data chunks will, by default, be [filled](https://portal.hdfgroup.org/display/HDF5/H5P_SET_FILL_VALUE)
+with HDF5's default fill data value, leading to considerable
+dataset creation overhead and resulting in pre-filling large
+portions of a dataset that the application might have been planning
+to overwrite anyway. Even worse, there will be more initial overhead
+from compressing that fill data before writing it out, only to have
+it read back in, unfiltered and modified the first time a chunk is
+written to. In the past, it was typically suggested that parallel
+HDF5 applications should use [H5Pset_fill_time](https://portal.hdfgroup.org/display/HDF5/H5P_SET_FILL_TIME)
+with a value of `H5D_FILL_TIME_NEVER` in order to disable writing of
+the fill value to dataset chunks, but this isn't ideal if the
+application actually wishes to make use of fill values.
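+
+For reference, a minimal sketch of that legacy workaround might look
+like the following. This is an illustration only; the `chunk_dims`,
+`file_id` and `space_id` variables are hypothetical placeholders the
+application would have set up, and deflate is just an example filter.
+
+```
+hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+H5Pset_chunk(dcpl_id, 2, chunk_dims);  /* chunked layout is required for filters */
+H5Pset_deflate(dcpl_id, 6);            /* example filter: deflate (gzip) level 6 */
+/* Legacy workaround: never write fill values to dataset chunks */
+H5Pset_fill_time(dcpl_id, H5D_FILL_TIME_NEVER);
+hid_t dset_id = H5Dcreate2(file_id, "dataset", H5T_NATIVE_INT, space_id,
+                           H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+```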
+ +With [improvements made](https://www.hdfgroup.org/2022/03/parallel-compression-improvements-in-hdf5-1-13-1/) +to the parallel compression feature for the HDF5 1.13.1 release +(and backported to the HDF5 1.10 branch), "incremental" file space +allocation is now the default for datasets created in parallel *only if they have filters applied to them*. +"Early" file space allocation is still supported for these datasets +if desired and is still forced for datasets created in parallel that +do *not* have filters applied to them. This change should significantly +reduce the overhead of creating filtered datasets in parallel HDF5 +applications and should be helpful to applications that wish to +use a fill value for these datasets. It should also help significantly +reduce the size of the HDF5 file, as file space for the data chunks +is allocated as needed rather than all at once. + +## Performance Considerations + +Since getting good performance out of HDF5's parallel compression +feature involves several factors, the following is a list of +performance considerations (generally from most to least important) +and best practices to take into account when trying to get the +optimal performance out of the parallel compression feature. + +### Begin with a good chunking strategy + +[Starting with a good chunking strategy](https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5) +will generally have the largest impact on overall application +performance. The different chunking parameters can be difficult +to fine-tune, but it is essential to start with a well-performing +chunking layout before adding compression and parallel I/O into +the mix. Compression itself adds overhead and may have side +effects that necessitate further adjustment of the chunking +parameters and HDF5 application settings. Consider that the +chosen chunk size becomes a very important factor when compression +is involved, as data chunks have to be completely read and +re-written to perform partial writes to the chunk. + +[Improving I/O performance with HDF5 compressed datasets](http://portal.hdfgroup.org/display/HDF5/Improving+IO+Performance+When+Working+with+HDF5+Compressed+Datasets) +is a useful reference for more information on getting good +performance when using a chunked dataset layout. + +### Avoid chunk sharing + +Since the parallel compression feature has to assign ownership +of data chunks to a single MPI rank in order to avoid the +previously described read-modify-write issue, an HDF5 application +may need to take care when determining how a dataset will be +divided up among the MPI ranks writing to it. Each dataset data +chunk that is written to by more than 1 MPI rank will incur extra +MPI overhead as one of the ranks takes ownership and the other +ranks send it their data and information about where in the chunk +that data belongs. While not always possible to do, an HDF5 +application will get the best performance out of parallel compression +if it can avoid writing in a way that causes more than 1 MPI rank +to write to any given data chunk in a dataset. + +### Collective metadata operations + +The parallel compression feature typically works with a significant +amount of metadata related to the management of the data chunks +in datasets. In initial performance results gathered from various +HPC machines, it was found that the parallel compression feature +did not scale well at around 8k MPI ranks and beyond. 
On further
+investigation, it became obvious that the bottleneck was due to
+heavy filesystem pressure from the metadata management for dataset
+data chunks as they changed size (as a result of data compression)
+and moved around in the HDF5 file.
+
+Enabling collective metadata operations in the HDF5 application
+(as in the snippet below) showed a significant improvement in
+performance and scalability and is generally recommended unless
+it measurably degrades the application's performance.
+
+```
+...
+hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
+H5Pset_all_coll_metadata_ops(fapl_id, 1);
+H5Pset_coll_metadata_write(fapl_id, 1);
+hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+...
+```
+
+### Align chunks in the file
+
+The natural layout of an HDF5 file may cause dataset data
+chunks to end up at addresses in the file that do not align
+well with the underlying file system, possibly leading to
+poor performance. As an example, Lustre performance is generally
+good when writes are aligned with the chosen stripe size.
+The HDF5 application can use [H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT)
+to have a bit more control over where objects in the HDF5
+file end up. However, do note that setting the alignment
+of objects generally wastes space in the file and has the
+potential to dramatically increase its resulting size, so
+caution should be used when choosing the alignment parameters.
+
+[H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT)
+has two parameters that control the alignment of objects in
+the HDF5 file, the "threshold" value and the "alignment"
+value. The threshold value specifies that any object greater
+than or equal in size to that value will be aligned in the
+file at addresses which are multiples of the chosen alignment
+value. While a threshold value of 0 can be specified to align
+every object in the file according to the alignment value,
+this isn't generally recommended, as it will likely waste an
+excessive amount of space in the file.
+
+In the example below, the chosen dataset chunk size is
+provided for the threshold value and 1MiB is specified for
+the alignment value. Assuming that 1MiB is an optimal
+alignment value (e.g., assuming that it matches well with
+the Lustre stripe size), this should cause dataset data
+chunks to be well-aligned and generally give good write
+performance.
+
+```
+hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
+/* Assuming Lustre stripe size is 1MiB, align data chunks
+   in the file to address multiples of 1MiB. */
+H5Pset_alignment(fapl_id, dataset_chunk_size, 1048576);
+hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+```
+
+### File free space managers
+
+As data chunks in a dataset get written to and compressed,
+they can change in size and be relocated in the HDF5 file.
+Since parallel compression usually involves many data chunks
+in a file, this can create significant amounts of free space
+in the file over its lifetime and eventually cause performance
+issues.
+ +An HDF5 application can use [H5Pset_file_space_strategy](http://portal.hdfgroup.org/display/HDF5/H5P_SET_FILE_SPACE_STRATEGY) +with a value of `H5F_FSPACE_STRATEGY_PAGE` to enable the paged +aggregation feature, which can accumulate metadata and raw +data for dataset data chunks into well-aligned, configurably +sized "pages" for better performance. However, note that using +the paged aggregation feature will cause any setting from +[H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT) +to be ignored. While an application should be able to get +comparable performance effects by [setting the size of these pages](http://portal.hdfgroup.org/display/HDF5/H5P_SET_FILE_SPACE_PAGE_SIZE) to be equal to the value that +would have been set for [H5Pset_alignment](https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT), +this may not necessarily be the case and should be studied. + +Note that [H5Pset_file_space_strategy](http://portal.hdfgroup.org/display/HDF5/H5P_SET_FILE_SPACE_STRATEGY) +has a `persist` parameter. This determines whether or not the +file free space manager should include extra metadata in the +HDF5 file about free space sections in the file. If this +parameter is `false`, any free space in the HDF5 file will +become unusable once the HDF5 file is closed. For parallel +compression, it's generally recommended that `persist` be set +to `true`, as this will keep better track of file free space +for data chunks between accesses to the HDF5 file. + +``` +hid_t fcpl_id = H5Pcreate(H5P_FILE_CREATE); +/* Use persistent free space manager with paged aggregation */ +H5Pset_file_space_strategy(fcpl_id, H5F_FSPACE_STRATEGY_PAGE, 1, 1); +/* Assuming Lustre stripe size is 1MiB, set page size to that */ +H5Pset_file_space_page_size(fcpl_id, 1048576); +... +hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, fcpl_id, fapl_id); +``` + +### Low-level collective vs. independent I/O + +While the parallel compression feature requires that the HDF5 +application set and maintain collective I/O at the application +interface level (via [H5Pset_dxpl_mpio](https://portal.hdfgroup.org/display/HDF5/H5P_SET_DXPL_MPIO)), +it does not require that the actual MPI I/O that occurs at +the lowest layers of HDF5 be collective; independent I/O may +perform better depending on the application I/O patterns and +parallel file system performance, among other factors. The +application may use [H5Pset_dxpl_mpio_collective_opt](https://portal.hdfgroup.org/display/HDF5/H5P_SET_DXPL_MPIO_COLLECTIVE_OPT) +to control this setting and see which I/O method provides the +best performance. + +``` +hid_t dxpl_id = H5Pcreate(H5P_DATASET_XFER); +H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE); +H5Pset_dxpl_mpio_collective_opt(dxpl_id, H5FD_MPIO_INDIVIDUAL_IO); /* Try independent I/O */ +H5Dwrite(..., dxpl_id, ...); +``` + +### Runtime HDF5 Library version + +An HDF5 application can use the [H5Pset_libver_bounds](http://portal.hdfgroup.org/display/HDF5/H5P_SET_LIBVER_BOUNDS) +routine to set the upper and lower bounds on library versions +to use when creating HDF5 objects. For parallel compression +specifically, setting the library version to the latest available +version can allow access to better/more efficient chunk indexing +types and data encoding methods. For example: + +``` +... +hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS); +H5Pset_libver_bounds(fapl_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST); +hid_t file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); +... 
+``` diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 08cec29..e21c391 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -163,7 +163,16 @@ Bug Fixes since HDF5-1.10.9 release =================================== Library ------- - - + - Converted an assertion on (possibly corrupt) file contents to a normal + error check + + Previously, the library contained an assertion check that a read superblock + doesn't contain a superblock extension message when the superblock + version < 2. When a corrupt HDF5 file is read, this assertion can be triggered + in debug builds of HDF5. In production builds, this situation could cause + either a library error or a crash, depending on the platform. + + (JTH - 2022/07/08, HDFFV-11316, HDFFV-11317) Java Library diff --git a/src/H5Dio.c b/src/H5Dio.c index 0236eb0..22aacb0 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -405,6 +405,7 @@ H5D__read(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_space H5D_chunk_map_t *fm = NULL; /* Chunk file<->memory mapping */ H5D_io_info_t io_info; /* Dataset I/O info */ H5D_type_info_t type_info; /* Datatype info for operation */ + H5D_layout_t layout_type; /* Dataset's layout type (contig, chunked, compact, etc.) */ hbool_t type_info_init = FALSE; /* Whether the datatype info has been initialized */ H5S_t *projected_mem_space = NULL; /* If not NULL, ptr to dataspace containing a */ /* projection of the supplied mem_space to a new */ @@ -436,6 +437,8 @@ H5D__read(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_space mem_space = file_space; nelmts = H5S_GET_SELECT_NPOINTS(mem_space); + layout_type = dataset->shared->layout.type; + /* Set up datatype info for operation */ if (H5D__typeinfo_init(dataset, mem_type_id, FALSE, &type_info) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to set up type info") @@ -560,11 +563,13 @@ H5D__read(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_space HDassert((*dataset->shared->layout.ops->is_space_alloc)(&dataset->shared->layout.storage) || (dataset->shared->layout.ops->is_data_cached && (*dataset->shared->layout.ops->is_data_cached)(dataset->shared)) || - dataset->shared->dcpl_cache.efl.nused > 0 || dataset->shared->layout.type == H5D_COMPACT); + dataset->shared->dcpl_cache.efl.nused > 0 || layout_type == H5D_COMPACT); /* Allocate the chunk map */ - if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) - HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + if (H5D_CONTIGUOUS != layout_type && H5D_COMPACT != layout_type) { + if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + } /* Call storage method's I/O initialization routine */ if (io_info.layout_ops.io_init && @@ -620,6 +625,7 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac H5D_chunk_map_t *fm = NULL; /* Chunk file<->memory mapping */ H5D_io_info_t io_info; /* Dataset I/O info */ H5D_type_info_t type_info; /* Datatype info for operation */ + H5D_layout_t layout_type; /* Dataset's layout type (contig, chunked, compact, etc.) 
*/ hbool_t type_info_init = FALSE; /* Whether the datatype info has been initialized */ hbool_t should_alloc_space = FALSE; /* Whether or not to initialize dataset's storage */ H5S_t *projected_mem_space = NULL; /* If not NULL, ptr to dataspace containing a */ @@ -646,6 +652,8 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac /* check args */ HDassert(dataset && dataset->oloc.file); + layout_type = dataset->shared->layout.type; + /* All filters in the DCPL must have encoding enabled. */ if (!dataset->shared->checked_filters) { if (H5Z_can_apply(dataset->shared->dcpl_id, dataset->shared->type_id) < 0) @@ -792,8 +800,10 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac } /* end if */ /* Allocate the chunk map */ - if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) - HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + if (H5D_CONTIGUOUS != layout_type && H5D_COMPACT != layout_type) { + if (NULL == (fm = H5FL_CALLOC(H5D_chunk_map_t))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "can't allocate chunk map") + } /* Call storage method's I/O initialization routine */ if (io_info.layout_ops.io_init && diff --git a/src/H5Fint.c b/src/H5Fint.c index e52cb91..8190227 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -2528,10 +2528,13 @@ H5F__build_actual_name(const H5F_t *f, const H5P_genplist_t *fapl, const char *n if (NULL == (new_fapl = (H5P_genplist_t *)H5I_object(new_fapl_id))) HGOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL, "can't get property list") - /* Set the character encoding on the new property list */ + /* + * Set the private property for retrieving the backing store + * POSIX file descriptor from the Core VFD + */ want_posix_fd = TRUE; if (H5P_set(new_fapl, H5F_ACS_WANT_POSIX_FD_NAME, &want_posix_fd) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set character encoding") + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set property for retrieving file descriptor") /* Retrieve the file handle */ if (H5F_get_vfd_handle(f, new_fapl_id, (void **)&fd) < 0) diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c index 093121b..4c4f087 100644 --- a/src/H5Fsuper.c +++ b/src/H5Fsuper.c @@ -695,7 +695,9 @@ H5F__super_read(H5F_t *f, H5P_genplist_t *fa_plist, hbool_t initial_read) /* Sanity check - superblock extension should only be defined for * superblock version >= 2. */ - HDassert(sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2); + if (sblock->super_vers < HDF5_SUPERBLOCK_VERSION_2) + HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "invalid superblock - extension message should not be defined for version < 2") /* Check for superblock extension being located "outside" the stored * 'eoa' value, which can occur with the split/multi VFD. -- cgit v0.12