From 840476ead85229ac4d7be1b6c9dd87ad5f8e3a07 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Sun, 17 Mar 2024 20:48:16 -0500 Subject: Fix an issue where the Subfiling VFD's context cache grows too large (#4159) --- release_docs/RELEASE.txt | 15 +++++++++++++++ src/H5FDsubfiling/H5FDsubfiling.c | 3 +++ src/H5FDsubfiling/H5subfiling_common.c | 33 ++++++++++++++++++++------------- src/H5FDsubfiling/H5subfiling_common.h | 1 + 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 2d8eb32..d82167e 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -591,6 +591,21 @@ Bug Fixes since HDF5-1.14.0 release Library ------- + - Fixed an issue where the Subfiling VFD's context object cache could + grow too large + + The Subfiling VFD keeps a cache of its internal context objects to + speed up access to a context object for a particular file, as well + as access to that object across multiple opens of the same file. + However, opening a large amount of files with the Subfiling VFD over + the course of an application's lifetime could cause this cache to grow + too large and result in the application running out of available MPI + communicator objects. On file close, the Subfiling VFD now simply + evicts context objects out of its cache and frees them. It is assumed + that multiple opens of a file will be a less common use case for the + Subfiling VFD, but this can be revisited if it proves to be an issue + for performance. + - Fixed error when overwriting certain nested variable length types Previously, when using a datatype that included a variable length type diff --git a/src/H5FDsubfiling/H5FDsubfiling.c b/src/H5FDsubfiling/H5FDsubfiling.c index 4c39f0f..9594f67 100644 --- a/src/H5FDsubfiling/H5FDsubfiling.c +++ b/src/H5FDsubfiling/H5FDsubfiling.c @@ -1358,6 +1358,9 @@ done: H5MM_free(file_ptr->file_dir); file_ptr->file_dir = NULL; + if (file_ptr->context_id >= 0 && H5_free_subfiling_object(file_ptr->context_id) < 0) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "can't free subfiling context object"); + /* Release the file info */ file_ptr = H5FL_FREE(H5FD_subfiling_t, file_ptr); diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c index 1127ae0..47d0624 100644 --- a/src/H5FDsubfiling/H5subfiling_common.c +++ b/src/H5FDsubfiling/H5subfiling_common.c @@ -48,7 +48,6 @@ static int sf_file_map_size = 0; #define DEFAULT_TOPOLOGY_CACHE_SIZE 4 #define DEFAULT_FILE_MAP_ENTRIES 8 -static herr_t H5_free_subfiling_object(int64_t object_id); static herr_t H5_free_subfiling_object_int(subfiling_context_t *sf_context); static herr_t H5_free_subfiling_topology(sf_topology_t *topology); @@ -280,18 +279,25 @@ done: * Purpose: Frees the underlying subfiling object for a given subfiling * object ID. * - * NOTE: Currently we assume that all created subfiling - * objects are cached in the (very simple) context/topology - * cache until application exit, so the only time a subfiling - * object should be freed by this routine is if something - * fails right after creating one. Otherwise, the internal - * indexing for the relevant cache will be invalid. + * NOTE: Because we want to avoid the potentially large + * overhead of determining the application topology on every + * file open, we currently assume that all created subfiling + * topology objects are cached in the (very simple) topology + * cache until application exit. This allows us to quickly + * find and assign a cached topology object to a subfiling + * context object for a file when opened. Therefore, a + * subfiling topology object should (currently) only ever be + * freed by this routine if a function fails right after + * creating a topology object. Otherwise, the internal + * indexing for the topology cache will be invalid and we will + * either leak memory or assign invalid topology pointers to + * subfiling context objects after that point. * * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ -static herr_t +herr_t H5_free_subfiling_object(int64_t object_id) { int64_t obj_type = (object_id >> 32) & 0x0FFFF; @@ -305,30 +311,31 @@ H5_free_subfiling_object(int64_t object_id) "couldn't get subfiling context for subfiling object ID"); if (H5_free_subfiling_object_int(sf_context) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling context object"); assert(sf_context_cache_num_entries > 0); assert(sf_context == sf_context_cache[sf_context_cache_num_entries - 1]); sf_context_cache[sf_context_cache_num_entries - 1] = NULL; sf_context_cache_num_entries--; } - else { + else if (obj_type == SF_TOPOLOGY) { sf_topology_t *sf_topology; - assert(obj_type == SF_TOPOLOGY); - if (NULL == (sf_topology = H5_get_subfiling_object(object_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't get subfiling context for subfiling object ID"); if (H5_free_subfiling_topology(sf_topology) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology"); + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology object"); assert(sf_topology_cache_num_entries > 0); assert(sf_topology == sf_topology_cache[sf_topology_cache_num_entries - 1]); sf_topology_cache[sf_topology_cache_num_entries - 1] = NULL; sf_topology_cache_num_entries--; } + else + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "couldn't free subfiling object - invalid object type"); done: H5_SUBFILING_FUNC_LEAVE; diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h index 156902a..6b5cfa4 100644 --- a/src/H5FDsubfiling/H5subfiling_common.h +++ b/src/H5FDsubfiling/H5subfiling_common.h @@ -269,6 +269,7 @@ H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type); H5_DLL void *H5_get_subfiling_object(int64_t object_id); +H5_DLL herr_t H5_free_subfiling_object(int64_t object_id); H5_DLL herr_t H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size, int64_t *num_subfiles); H5_DLL herr_t H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath); -- cgit v0.12