diff options
Diffstat (limited to 'src/H5FDsubfiling/H5FDioc_priv.h')
-rw-r--r-- | src/H5FDsubfiling/H5FDioc_priv.h | 440 |
1 files changed, 440 insertions, 0 deletions
diff --git a/src/H5FDsubfiling/H5FDioc_priv.h b/src/H5FDsubfiling/H5FDioc_priv.h new file mode 100644 index 0000000..07eb124 --- /dev/null +++ b/src/H5FDsubfiling/H5FDioc_priv.h @@ -0,0 +1,440 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Private definitions for HDF5 IOC VFD + */ + +#ifndef H5FDioc_priv_H +#define H5FDioc_priv_H + +/********************/ +/* Standard Headers */ +/********************/ + +#include <stdatomic.h> +#include <libgen.h> + +/**************/ +/* H5 Headers */ +/**************/ + +#include "H5private.h" /* Generic Functions */ +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Dprivate.h" /* Datasets */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5FDioc.h" /* IOC VFD */ +#include "H5Iprivate.h" /* IDs */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ + +#include "H5subfiling_common.h" +#include "H5subfiling_err.h" + +#include "mercury_thread.h" +#include "mercury_thread_mutex.h" +#include "mercury_thread_pool.h" + +/* + * Some definitions for debugging the IOC VFD + */ + +/* #define H5FD_IOC_DEBUG */ +/* #define H5FD_IOC_REQUIRE_FLUSH */ +/* #define H5FD_IOC_COLLECT_STATS */ + +/**************************************************************************** + * + * IOC I/O Queue management macros: + * + * The following macros perform the necessary operations on the IOC I/O + * Queue, which is implemented as a doubly linked list of instances of + * ioc_io_queue_entry_t. + * + * WARNING: q_ptr->q_mutex must be held when these macros are executed.. + * + * At present, the necessary operations are append (insert an entry at the + * end of the queue), and delete (remove an entry from the queue). + * + * At least initially, all sanity checking is done with asserts, as the + * the existing I/O concentrator code is not well integrated into the HDF5 + * error reporting system. This will have to be revisited for a production + * version, but it should be sufficient for now. + * + * JRM -- 11/2/21 + * + ****************************************************************************/ + +#define H5FD_IOC__IO_Q_ENTRY_MAGIC 0x1357 + +/* clang-format off */ + +#define H5FD_IOC__Q_APPEND(q_ptr, entry_ptr) \ +do { \ + HDassert(q_ptr); \ + HDassert((q_ptr)->magic == H5FD_IOC__IO_Q_MAGIC); \ + HDassert((((q_ptr)->q_len == 0) && ((q_ptr)->q_head == NULL) && ((q_ptr)->q_tail == NULL)) || \ + (((q_ptr)->q_len > 0) && ((q_ptr)->q_head != NULL) && ((q_ptr)->q_tail != NULL))); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); \ + HDassert((entry_ptr)->next == NULL); \ + HDassert((entry_ptr)->prev == NULL); \ + HDassert((entry_ptr)->in_progress == FALSE); \ + \ + if ( ((q_ptr)->q_head) == NULL ) \ + { \ + ((q_ptr)->q_head) = (entry_ptr); \ + ((q_ptr)->q_tail) = (entry_ptr); \ + } \ + else \ + { \ + ((q_ptr)->q_tail)->next = (entry_ptr); \ + (entry_ptr)->prev = ((q_ptr)->q_tail); \ + ((q_ptr)->q_tail) = (entry_ptr); \ + } \ + ((q_ptr)->q_len)++; \ +} while ( FALSE ) /* H5FD_IOC__Q_APPEND() */ + +#define H5FD_IOC__Q_REMOVE(q_ptr, entry_ptr) \ +do { \ + HDassert(q_ptr); \ + HDassert((q_ptr)->magic == H5FD_IOC__IO_Q_MAGIC); \ + HDassert((((q_ptr)->q_len == 1) && ((q_ptr)->q_head ==((q_ptr)->q_tail)) && ((q_ptr)->q_head == (entry_ptr))) || \ + (((q_ptr)->q_len > 0) && ((q_ptr)->q_head != NULL) && ((q_ptr)->q_tail != NULL))); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); \ + HDassert((((q_ptr)->q_len == 1) && ((entry_ptr)->next == NULL) && ((entry_ptr)->prev == NULL)) || \ + (((q_ptr)->q_len > 1) && (((entry_ptr)->next != NULL) || ((entry_ptr)->prev != NULL)))); \ + HDassert((entry_ptr)->in_progress == TRUE); \ + \ + { \ + if ( (((q_ptr)->q_head)) == (entry_ptr) ) \ + { \ + (((q_ptr)->q_head)) = (entry_ptr)->next; \ + if ( (((q_ptr)->q_head)) != NULL ) \ + (((q_ptr)->q_head))->prev = NULL; \ + } \ + else \ + { \ + (entry_ptr)->prev->next = (entry_ptr)->next; \ + } \ + if (((q_ptr)->q_tail) == (entry_ptr) ) \ + { \ + ((q_ptr)->q_tail) = (entry_ptr)->prev; \ + if ( ((q_ptr)->q_tail) != NULL ) \ + ((q_ptr)->q_tail)->next = NULL; \ + } \ + else \ + { \ + (entry_ptr)->next->prev = (entry_ptr)->prev; \ + } \ + (entry_ptr)->next = NULL; \ + (entry_ptr)->prev = NULL; \ + ((q_ptr)->q_len)--; \ + } \ +} while ( FALSE ) /* H5FD_IOC__Q_REMOVE() */ + +/* clang-format on */ + +/**************************************************************************** + * + * structure ioc_io_queue_entry + * + * magic: Unsigned 32 bit integer always set to H5FD_IOC__IO_Q_ENTRY_MAGIC. + * This field is used to validate pointers to instances of + * ioc_io_queue_entry_t. + * + * next: Next pointer in the doubly linked list used to implement + * the IOC I/O Queue. This field points to the next entry + * in the queue, or NULL if there is no next entry. + * + * prev: Prev pointer in the doubly linked list used to implement + * the IOC I/O Queue. This field points to the previous entry + * in the queue, or NULL if there is no previous entry. + * + * in_progress: Boolean flag that must be FALSE when the entry is inserted + * into the IOC I/O Queue, and set to TRUE when the entry is dispatched + * to the worker thread pool for execution. + * + * When in_progress is FALS, the entry is said to be pending. + * + * counter: uint32_t containing a serial number assigned to this IOC + * I/O Queue entry. Note that this will roll over on long + * computations, and thus is not in general unique. + * + * The counter fields is used to construct a tag to distinguish + * multiple concurrent I/O requests from a give rank, and thus + * this should not be a problem as long as there is sufficient + * time between roll overs. As only the lower bits of the counter + * are used in tag construction, this is more frequent than the + * size of the counter field would suggest -- albeit hopefully + * still infrequent enough. + * + * wk_req: Instance of sf_work_request_t. Replace with individual + * fields when convenient. + * + * + * Statistics: + * + * The following fields are only defined if H5FD_IOC_COLLECT_STATS is TRUE. + * They are intended to allow collection of basic statistics on the + * behaviour of the IOC I/O Queue for purposes of debugging and performance + * optimization. + * + * q_time: uint64_t containing the time the entry was place on the + * IOC I/O Queue in usec after the UNIX epoch. + * + * This value is used to compute the queue wait time, and the + * total processing time for the entry. + * + * dispatch_time: uint64_t containing the time the entry is dispatched in + * usec after the UNIX epoch. This field is undefined if the + * entry is pending. + * + * This value is used to compute the execution time for the + * entry. + * + ****************************************************************************/ + +typedef struct ioc_io_queue_entry { + + uint32_t magic; + struct ioc_io_queue_entry *next; + struct ioc_io_queue_entry *prev; + hbool_t in_progress; + uint32_t counter; + + sf_work_request_t wk_req; + struct hg_thread_work thread_wk; + int wk_ret; + + /* statistics */ +#ifdef H5FD_IOC_COLLECT_STATS + + uint64_t q_time; + uint64_t dispatch_time; + +#endif + +} ioc_io_queue_entry_t; + +/**************************************************************************** + * + * structure ioc_io_queue + * + * This is a temporary structure -- its fields should be moved to an I/O + * concentrator Catchall structure eventually. + * + * The fields of this structure support the io queue used to receive and + * sequence I/O requests for execution by the worker threads. The rules + * for sequencing are as follows: + * + * 1) Non-overlaping I/O requests must be fed to the worker threads in + * the order received, and may execute concurrently + * + * 2) Overlapping read requests must be fed to the worker threads in + * the order received, but may execute concurrently. + * + * 3) If any pair of I/O requests overlap, and at least one is a write + * request, they must be executed in strict arrival order, and the + * first must complete before the second starts. + * + * Due to the strict ordering requirement in rule 3, entries must be + * inserted at the tail of the queue in receipt order, and retained on + * the queue until completed. Entries in the queue are marked pending + * when inserted on the queue, in progress when handed to a worker + * thread, and deleted from the queue when completed. + * + * The dispatch algorithm is as follows: + * + * 1) Set X equal to the element at the head of the queue. + * + * 2) If X is pending, and there exists no prior element (i.e. between X + * and the head of the queue) that intersects with X, goto 5). + * + * 3) If X is pending, X is a read, and all prior intersecting elements + * are reads, goto 5). + * + * 4) If X is in progress, or if any prior intersecting element is a + * write, or if X is a write, set X equal to its successor in the + * queue (i.e. the next element further down the queue from the head) + * and goto 2) If there is no next element, exit without dispatching + * any I/O request. + * + * 5) If we get to 5, X must be pending. Mark it in progress, and + * dispatch it. If the number of in progress entries is less than + * the number of worker threads, and X has a successor in the queue, + * set X equal to its predecessor, and goto 2). Otherwise exit without + * dispatching further I/O requests. + * + * Note that the above dispatch algorithm doesn't address collective + * I/O requests -- this should be OK for now, but it will have to + * addressed prior to production release. + * + * On I/O request completion, worker threads must delete their assigned + * I/O requests from the queue, check to see if there are any pending + * requests, and trigger the dispatch algorithm if there are. + * + * The fields in the structure are discussed individually below. + * + * magic: Unsigned 32 bit integer always set to H5FD_IOC__IO_Q_MAGIC. + * This field is used to validate pointers to instances of + * H5C_t. + * + * q_head: Pointer to the head of the doubly linked list of entries in + * the I/O queue. + * + * This field is NULL if the I/O queue is empty. + * + * q_tail: Pointer to the tail of the doubly linked list of entries in + * the I/O queue. + * + * This field is NULL if the I/O queue is empty. + * + * num_pending: Number of I/O request pending on the I/O queue. + * + * num_in_progress: Number of I/O requests in progress on the I/O queue. + * + * q_len: Number of I/O requests on the I/O queue. Observe that q_len + * must equal (num_pending + num_in_progress). + * + * req_counter: unsigned 16 bit integer used to provide a "unique" tag for + * each I/O request. This value is incremented by 1, and then + * passed to the worker thread where its lower bits are incorporated + * into the tag used to disambiguate multiple, concurrent I/O + * requests from a single rank. The value is 32 bits, as MPI tags + * are limited to 32 bits. The value is unsigned as it is expected + * to wrap around once its maximum value is reached. + * + * q_mutex: Mutex used to ensure that only one thread accesses the IOC I/O + * Queue at once. This mutex must be held to access of modify + * all fields of the + * + * + * Statistics: + * + * The following fields are only defined if H5FD_IOC_COLLECT_STATS is TRUE. + * They are intended to allow collection of basic statistics on the + * behaviour of the IOC I/O Queue for purposes of debugging and performance + * optimization. + * + * max_q_len: Maximum number of requests residing on the IOC I/O Queue at + * any point in time in the current run. + * + * max_num_pending: Maximum number of pending requests residing on the IOC + * I/O Queue at any point in time in the current run. + * + * max_num_in_progress: Maximum number of in progress requests residing on + * the IOC I/O Queue at any point in time in the current run. + * + * ind_read_requests: Number of independent read requests received by the + * IOC to date. + * + * ind_write_requests Number of independent write requests received by the + * IOC to date. + * + * truncate_requests: Number of truncate requests received by the IOC to + * date. + * + * get_eof_requests: Number fo get EOF request received by the IO to date. + * + * requests_queued: Number of I/O requests received and placed on the IOC + * I/O queue. + * + * requests_dispatched: Number of I/O requests dispatched for execution by + * the worker threads. + * + * requests_completed: Number of I/O requests completed by the worker threads. + * Observe that on file close, requests_queued, requests_dispatched, + * and requests_completed should be equal. + * + ****************************************************************************/ + +#define H5FD_IOC__IO_Q_MAGIC 0x2468 + +typedef struct ioc_io_queue { + + uint32_t magic; + ioc_io_queue_entry_t *q_head; + ioc_io_queue_entry_t *q_tail; + int32_t num_pending; + int32_t num_in_progress; + int32_t num_failed; + int32_t q_len; + uint32_t req_counter; + hg_thread_mutex_t q_mutex; + + /* statistics */ +#ifdef H5FD_IOC_COLLECT_STATS + int32_t max_q_len; + int32_t max_num_pending; + int32_t max_num_in_progress; + int64_t ind_read_requests; + int64_t ind_write_requests; + int64_t truncate_requests; + int64_t get_eof_requests; + int64_t requests_queued; + int64_t requests_dispatched; + int64_t requests_completed; +#endif + +} ioc_io_queue_t; + +/* + * Structure definitions to enable async io completions + * We first define a structure which contains the basic + * input arguments for the functions which were originally + * invoked. See below. + */ +typedef struct _client_io_args { + int ioc; /* ID of the IO Concentrator handling this IO. */ + int64_t context_id; /* The context id provided for the read or write */ + int64_t offset; /* The file offset for the IO operation */ + int64_t elements; /* How many bytes */ + void * data; /* A pointer to the (contiguous) data segment */ + MPI_Request io_req; /* An MPI request to allow the code to loop while */ + /* making progress on multiple IOs */ +} io_args_t; + +typedef struct _client_io_func { + int (*io_function)(void *this_io); /* pointer to a completion function */ + io_args_t io_args; /* arguments passed to the completion function */ + int pending; /* The function is complete (0) or pending (1)? */ +} io_func_t; + +typedef struct _io_req { + struct _io_req *prev; /* A simple list structure containing completion */ + struct _io_req *next; /* functions. These should get removed as IO ops */ + io_func_t completion_func; /* are completed */ +} io_req_t; + +extern int *H5FD_IOC_tag_ub_val_ptr; + +#ifdef __cplusplus +extern "C" { +#endif + +H5_DLL int initialize_ioc_threads(void *_sf_context); +H5_DLL int finalize_ioc_threads(void *_sf_context); + +H5_DLL herr_t ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, + int64_t elements, const void *data, io_req_t **io_req); +H5_DLL herr_t ioc__read_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, + int64_t elements, void *data, io_req_t **io_req); + +H5_DLL int wait_for_thread_main(void); + +#ifdef __cplusplus +} +#endif + +#endif /* H5FDioc_priv_H */ |