From a77d8bfcd7d066c6759b0346c02cbd612f90b7c2 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Wed, 2 Aug 2023 17:46:52 -0500 Subject: Fix h5repack for variable-length datatyped datasets (#3331) (#3333) --- release_docs/RELEASE.txt | 16 +++++++++++++++- tools/src/h5repack/h5repack_copy.c | 13 ++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index aa48654..b226664 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -184,7 +184,21 @@ Bug Fixes since HDF5-1.14.1 release Tools ----- - - + - Fixed an issue in h5repack for variable-length typed datasets + + When repacking datasets into a new file, h5repack tries to determines whether + it can use H5Ocopy to copy each dataset into the new file, or if it needs to + manually re-create the dataset, then read data from the old dataset and write + it to the new dataset. H5repack was previously using H5Ocopy for datasets with + variable-length datatypes, but this can be problematic if the global heap + addresses involved do not match exactly between the old and new files. These + addresses could change for a variety of reasons, such as the command-line options + provided to h5repack, how h5repack allocate space in the repacked file, etc. + Since H5Ocopy does not currently perform any translation when these addresses + change, datasets that were repacked with H5Ocopy could become unreadable in the + new file. H5repack has been fixed to repack variable-length typed datasets without + using H5Ocopy to ensure that the new datasets always have the correct global heap + addresses. Performance diff --git a/tools/src/h5repack/h5repack_copy.c b/tools/src/h5repack/h5repack_copy.c index 0f270c5..81b33c3 100644 --- a/tools/src/h5repack/h5repack_copy.c +++ b/tools/src/h5repack/h5repack_copy.c @@ -653,6 +653,7 @@ do_copy_objects(hid_t fidin, hid_t fidout, trav_table_t *travt, pack_opt_t *opti int ifil; int is_ref = 0; htri_t is_named; + htri_t is_vlen = 0; hbool_t limit_maxdims; hsize_t size_dset; int ret_value = 0; @@ -806,6 +807,10 @@ do_copy_objects(hid_t fidin, hid_t fidout, trav_table_t *travt, pack_opt_t *opti if (H5T_REFERENCE == H5Tget_class(ftype_id)) is_ref = 1; + /* early detection of variable-length types */ + if ((is_vlen = H5Tdetect_class(ftype_id, H5T_VLEN)) < 0) + H5TOOLS_GOTO_ERROR((-1), "H5Tdetect_class failed"); + /* Check if the datatype is committed */ if ((is_named = H5Tcommitted(ftype_id)) < 0) H5TOOLS_GOTO_ERROR((-1), "H5Tcommitted failed"); @@ -823,10 +828,16 @@ do_copy_objects(hid_t fidin, hid_t fidout, trav_table_t *travt, pack_opt_t *opti * check if we should use H5Ocopy or not * if there is a request for filters/layout, we read/write the object * otherwise we do a copy using H5Ocopy + * + * Note that H5Ocopy is currently unsafe to use for objects that reside in + * or interact with global heaps, such as variable-length datatypes. This + * appears to be due to H5Ocopy not correctly translating in the case where + * these objects move to different global heap addresses in the repacked + * file. *------------------------------------------------------------------------- */ use_h5ocopy = !(options->op_tbl->nelems || options->all_filter == 1 || - options->all_layout == 1 || is_ref || is_named); + options->all_layout == 1 || is_ref || is_vlen || is_named); /* * Check if we are using different source and destination VOL connectors. -- cgit v0.12