Diffstat (limited to 'src/H5FDsubfiling/H5FDioc_priv.h')
-rw-r--r--  src/H5FDsubfiling/H5FDioc_priv.h  440
1 file changed, 440 insertions(+), 0 deletions(-)
diff --git a/src/H5FDsubfiling/H5FDioc_priv.h b/src/H5FDsubfiling/H5FDioc_priv.h
new file mode 100644
index 0000000..07eb124
--- /dev/null
+++ b/src/H5FDsubfiling/H5FDioc_priv.h
@@ -0,0 +1,440 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Private definitions for HDF5 IOC VFD
+ */
+
+#ifndef H5FDioc_priv_H
+#define H5FDioc_priv_H
+
+/********************/
+/* Standard Headers */
+/********************/
+
+#include <stdatomic.h>
+#include <libgen.h>
+
+/**************/
+/* H5 Headers */
+/**************/
+
+#include "H5private.h" /* Generic Functions */
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Dprivate.h" /* Datasets */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5FDioc.h" /* IOC VFD */
+#include "H5Iprivate.h" /* IDs */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
+
+#include "H5subfiling_common.h"
+#include "H5subfiling_err.h"
+
+#include "mercury_thread.h"
+#include "mercury_thread_mutex.h"
+#include "mercury_thread_pool.h"
+
+/*
+ * Some definitions for debugging the IOC VFD
+ */
+
+/* #define H5FD_IOC_DEBUG */
+/* #define H5FD_IOC_REQUIRE_FLUSH */
+/* #define H5FD_IOC_COLLECT_STATS */
+
+/****************************************************************************
+ *
+ * IOC I/O Queue management macros:
+ *
+ * The following macros perform the necessary operations on the IOC I/O
+ * Queue, which is implemented as a doubly linked list of instances of
+ * ioc_io_queue_entry_t.
+ *
+ * WARNING: q_ptr->q_mutex must be held when these macros are executed.
+ *
+ * At present, the necessary operations are append (insert an entry at the
+ * end of the queue), and delete (remove an entry from the queue).
+ *
+ * At least initially, all sanity checking is done with asserts, as
+ * the existing I/O concentrator code is not well integrated into the HDF5
+ * error reporting system. This will have to be revisited for a production
+ * version, but it should be sufficient for now.
+ *
+ * JRM -- 11/2/21
+ *
+ ****************************************************************************/
+
+#define H5FD_IOC__IO_Q_ENTRY_MAGIC 0x1357
+
+/* clang-format off */
+
+#define H5FD_IOC__Q_APPEND(q_ptr, entry_ptr) \
+do { \
+ HDassert(q_ptr); \
+ HDassert((q_ptr)->magic == H5FD_IOC__IO_Q_MAGIC); \
+ HDassert((((q_ptr)->q_len == 0) && ((q_ptr)->q_head == NULL) && ((q_ptr)->q_tail == NULL)) || \
+ (((q_ptr)->q_len > 0) && ((q_ptr)->q_head != NULL) && ((q_ptr)->q_tail != NULL))); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); \
+ HDassert((entry_ptr)->next == NULL); \
+ HDassert((entry_ptr)->prev == NULL); \
+ HDassert((entry_ptr)->in_progress == FALSE); \
+ \
+ if ( ((q_ptr)->q_head) == NULL ) \
+ { \
+ ((q_ptr)->q_head) = (entry_ptr); \
+ ((q_ptr)->q_tail) = (entry_ptr); \
+ } \
+ else \
+ { \
+ ((q_ptr)->q_tail)->next = (entry_ptr); \
+ (entry_ptr)->prev = ((q_ptr)->q_tail); \
+ ((q_ptr)->q_tail) = (entry_ptr); \
+ } \
+ ((q_ptr)->q_len)++; \
+} while ( FALSE ) /* H5FD_IOC__Q_APPEND() */
+
+#define H5FD_IOC__Q_REMOVE(q_ptr, entry_ptr) \
+do { \
+ HDassert(q_ptr); \
+ HDassert((q_ptr)->magic == H5FD_IOC__IO_Q_MAGIC); \
+ HDassert((((q_ptr)->q_len == 1) && ((q_ptr)->q_head ==((q_ptr)->q_tail)) && ((q_ptr)->q_head == (entry_ptr))) || \
+ (((q_ptr)->q_len > 0) && ((q_ptr)->q_head != NULL) && ((q_ptr)->q_tail != NULL))); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); \
+ HDassert((((q_ptr)->q_len == 1) && ((entry_ptr)->next == NULL) && ((entry_ptr)->prev == NULL)) || \
+ (((q_ptr)->q_len > 1) && (((entry_ptr)->next != NULL) || ((entry_ptr)->prev != NULL)))); \
+ HDassert((entry_ptr)->in_progress == TRUE); \
+ \
+ { \
+ if ( (((q_ptr)->q_head)) == (entry_ptr) ) \
+ { \
+ (((q_ptr)->q_head)) = (entry_ptr)->next; \
+ if ( (((q_ptr)->q_head)) != NULL ) \
+ (((q_ptr)->q_head))->prev = NULL; \
+ } \
+ else \
+ { \
+ (entry_ptr)->prev->next = (entry_ptr)->next; \
+ } \
+ if (((q_ptr)->q_tail) == (entry_ptr) ) \
+ { \
+ ((q_ptr)->q_tail) = (entry_ptr)->prev; \
+ if ( ((q_ptr)->q_tail) != NULL ) \
+ ((q_ptr)->q_tail)->next = NULL; \
+ } \
+ else \
+ { \
+ (entry_ptr)->next->prev = (entry_ptr)->prev; \
+ } \
+ (entry_ptr)->next = NULL; \
+ (entry_ptr)->prev = NULL; \
+ ((q_ptr)->q_len)--; \
+ } \
+} while ( FALSE ) /* H5FD_IOC__Q_REMOVE() */
+
+/* clang-format on */
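+
+/*
+ * Illustrative usage sketch (not part of the build): how the queue macros
+ * above might be exercised under q_mutex.  The setup of the entry shown
+ * here is an assumption made for the example; the real producer also fills
+ * in wk_req and the related fields before appending.
+ */
+#if 0
+static void
+example_queue_usage(ioc_io_queue_t *q_ptr, ioc_io_queue_entry_t *entry_ptr)
+{
+    /* q_ptr->q_mutex must be held while the queue is manipulated */
+    hg_thread_mutex_lock(&q_ptr->q_mutex);
+
+    /* insert a freshly initialized, pending entry at the tail */
+    entry_ptr->magic       = H5FD_IOC__IO_Q_ENTRY_MAGIC;
+    entry_ptr->next        = NULL;
+    entry_ptr->prev        = NULL;
+    entry_ptr->in_progress = FALSE;
+    H5FD_IOC__Q_APPEND(q_ptr, entry_ptr);
+
+    /* ... later, once the entry has been dispatched and has completed ... */
+    entry_ptr->in_progress = TRUE;
+    H5FD_IOC__Q_REMOVE(q_ptr, entry_ptr);
+
+    hg_thread_mutex_unlock(&q_ptr->q_mutex);
+}
+#endif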
+
+/****************************************************************************
+ *
+ * structure ioc_io_queue_entry
+ *
+ * magic: Unsigned 32 bit integer always set to H5FD_IOC__IO_Q_ENTRY_MAGIC.
+ * This field is used to validate pointers to instances of
+ * ioc_io_queue_entry_t.
+ *
+ * next: Next pointer in the doubly linked list used to implement
+ * the IOC I/O Queue. This field points to the next entry
+ * in the queue, or NULL if there is no next entry.
+ *
+ * prev: Prev pointer in the doubly linked list used to implement
+ * the IOC I/O Queue. This field points to the previous entry
+ * in the queue, or NULL if there is no previous entry.
+ *
+ * in_progress: Boolean flag that must be FALSE when the entry is inserted
+ * into the IOC I/O Queue, and set to TRUE when the entry is dispatched
+ * to the worker thread pool for execution.
+ *
+ * When in_progress is FALSE, the entry is said to be pending.
+ *
+ * counter: uint32_t containing a serial number assigned to this IOC
+ * I/O Queue entry. Note that this will roll over on long
+ * computations, and thus is not in general unique.
+ *
+ * The counter field is used to construct a tag to distinguish
+ * multiple concurrent I/O requests from a given rank, and thus
+ * roll over should not be a problem as long as there is sufficient
+ * time between roll overs. As only the lower bits of the counter
+ * are used in tag construction, roll overs are effectively more
+ * frequent than the size of the counter field would suggest --
+ * albeit hopefully still infrequent enough.
+ *
+ * wk_req: Instance of sf_work_request_t. Replace with individual
+ * fields when convenient.
+ *
+ *
+ * Statistics:
+ *
+ * The following fields are only defined if H5FD_IOC_COLLECT_STATS is defined.
+ * They are intended to allow collection of basic statistics on the
+ * behaviour of the IOC I/O Queue for purposes of debugging and performance
+ * optimization.
+ *
+ * q_time: uint64_t containing the time the entry was placed on the
+ * IOC I/O Queue in usec after the UNIX epoch.
+ *
+ * This value is used to compute the queue wait time, and the
+ * total processing time for the entry.
+ *
+ * dispatch_time: uint64_t containing the time the entry is dispatched in
+ * usec after the UNIX epoch. This field is undefined if the
+ * entry is pending.
+ *
+ * This value is used to compute the execution time for the
+ * entry.
+ *
+ ****************************************************************************/
+
+typedef struct ioc_io_queue_entry {
+
+ uint32_t magic;
+ struct ioc_io_queue_entry *next;
+ struct ioc_io_queue_entry *prev;
+ hbool_t in_progress;
+ uint32_t counter;
+
+ sf_work_request_t wk_req;
+ struct hg_thread_work thread_wk;
+ int wk_ret;
+
+ /* statistics */
+#ifdef H5FD_IOC_COLLECT_STATS
+
+ uint64_t q_time;
+ uint64_t dispatch_time;
+
+#endif
+
+} ioc_io_queue_entry_t;
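+
+/*
+ * Illustrative sketch (not part of the build): one way a new queue entry
+ * might be set up before being appended to the IOC I/O Queue.  The helper
+ * name and the use of H5_now_usec() for the q_time stamp are assumptions
+ * made for this example, not necessarily what the IOC code itself does.
+ */
+#if 0
+static ioc_io_queue_entry_t *
+example_alloc_io_q_entry(ioc_io_queue_t *q_ptr)
+{
+    /* caller is assumed to hold q_ptr->q_mutex */
+    ioc_io_queue_entry_t *entry_ptr = HDcalloc(1, sizeof(ioc_io_queue_entry_t));
+
+    if (entry_ptr) {
+        entry_ptr->magic       = H5FD_IOC__IO_Q_ENTRY_MAGIC;
+        entry_ptr->next        = NULL;
+        entry_ptr->prev        = NULL;
+        entry_ptr->in_progress = FALSE;
+        entry_ptr->counter     = q_ptr->req_counter++; /* lower bits feed the MPI tag */
+#ifdef H5FD_IOC_COLLECT_STATS
+        entry_ptr->q_time      = H5_now_usec();        /* time queued, in usec */
+#endif
+    }
+
+    return entry_ptr;
+}
+#endif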
+
+/****************************************************************************
+ *
+ * structure ioc_io_queue
+ *
+ * This is a temporary structure -- its fields should be moved to an I/O
+ * concentrator Catchall structure eventually.
+ *
+ * The fields of this structure support the io queue used to receive and
+ * sequence I/O requests for execution by the worker threads. The rules
+ * for sequencing are as follows:
+ *
+ * 1) Non-overlapping I/O requests must be fed to the worker threads in
+ * the order received, and may execute concurrently.
+ *
+ * 2) Overlapping read requests must be fed to the worker threads in
+ * the order received, but may execute concurrently.
+ *
+ * 3) If any pair of I/O requests overlap, and at least one is a write
+ * request, they must be executed in strict arrival order, and the
+ * first must complete before the second starts.
+ *
+ * Due to the strict ordering requirement in rule 3, entries must be
+ * inserted at the tail of the queue in receipt order, and retained on
+ * the queue until completed. Entries in the queue are marked pending
+ * when inserted on the queue, in progress when handed to a worker
+ * thread, and deleted from the queue when completed.
+ *
+ * The dispatch algorithm is as follows:
+ *
+ * 1) Set X equal to the element at the head of the queue.
+ *
+ * 2) If X is pending, and there exists no prior element (i.e. between X
+ * and the head of the queue) that intersects with X, goto 5).
+ *
+ * 3) If X is pending, X is a read, and all prior intersecting elements
+ * are reads, goto 5).
+ *
+ * 4) If X is in progress, or if any prior intersecting element is a
+ * write, or if X is a write, set X equal to its successor in the
+ * queue (i.e. the next element further down the queue from the head)
+ * and goto 2). If there is no next element, exit without dispatching
+ * any I/O request.
+ *
+ * 5) If we get to 5, X must be pending. Mark it in progress, and
+ * dispatch it. If the number of in progress entries is less than
+ * the number of worker threads, and X has a successor in the queue,
+ * set X equal to its successor, and goto 2). Otherwise exit without
+ * dispatching further I/O requests.
+ *
+ * Note that the above dispatch algorithm doesn't address collective
+ * I/O requests -- this should be OK for now, but it will have to be
+ * addressed prior to production release. (An illustrative sketch of the
+ * dispatch scan appears after the structure definition below.)
+ *
+ * On I/O request completion, worker threads must delete their assigned
+ * I/O requests from the queue, check to see if there are any pending
+ * requests, and trigger the dispatch algorithm if there are.
+ *
+ * The fields in the structure are discussed individually below.
+ *
+ * magic: Unsigned 32 bit integer always set to H5FD_IOC__IO_Q_MAGIC.
+ * This field is used to validate pointers to instances of
+ * ioc_io_queue_t.
+ *
+ * q_head: Pointer to the head of the doubly linked list of entries in
+ * the I/O queue.
+ *
+ * This field is NULL if the I/O queue is empty.
+ *
+ * q_tail: Pointer to the tail of the doubly linked list of entries in
+ * the I/O queue.
+ *
+ * This field is NULL if the I/O queue is empty.
+ *
+ * num_pending: Number of I/O requests pending on the I/O queue.
+ *
+ * num_in_progress: Number of I/O requests in progress on the I/O queue.
+ *
+ * q_len: Number of I/O requests on the I/O queue. Observe that q_len
+ * must equal (num_pending + num_in_progress).
+ *
+ * req_counter: Unsigned 32 bit integer used to provide a "unique" tag for
+ * each I/O request. This value is incremented by 1, and then
+ * passed to the worker thread where its lower bits are incorporated
+ * into the tag used to disambiguate multiple, concurrent I/O
+ * requests from a single rank. Only the lower bits are usable, as
+ * MPI tags are limited in range (see MPI_TAG_UB). The value is
+ * unsigned, as it is expected to wrap around once its maximum value
+ * is reached.
+ *
+ * q_mutex: Mutex used to ensure that only one thread accesses the IOC I/O
+ * Queue at once. This mutex must be held to access or modify
+ * any field of the IOC I/O Queue.
+ *
+ *
+ * Statistics:
+ *
+ * The following fields are only defined if H5FD_IOC_COLLECT_STATS is defined.
+ * They are intended to allow collection of basic statistics on the
+ * behaviour of the IOC I/O Queue for purposes of debugging and performance
+ * optimization.
+ *
+ * max_q_len: Maximum number of requests residing on the IOC I/O Queue at
+ * any point in time in the current run.
+ *
+ * max_num_pending: Maximum number of pending requests residing on the IOC
+ * I/O Queue at any point in time in the current run.
+ *
+ * max_num_in_progress: Maximum number of in progress requests residing on
+ * the IOC I/O Queue at any point in time in the current run.
+ *
+ * ind_read_requests: Number of independent read requests received by the
+ * IOC to date.
+ *
+ * ind_write_requests: Number of independent write requests received by the
+ * IOC to date.
+ *
+ * truncate_requests: Number of truncate requests received by the IOC to
+ * date.
+ *
+ * get_eof_requests: Number of get EOF requests received by the IOC to date.
+ *
+ * requests_queued: Number of I/O requests received and placed on the IOC
+ * I/O queue.
+ *
+ * requests_dispatched: Number of I/O requests dispatched for execution by
+ * the worker threads.
+ *
+ * requests_completed: Number of I/O requests completed by the worker threads.
+ * Observe that on file close, requests_queued, requests_dispatched,
+ * and requests_completed should be equal.
+ *
+ ****************************************************************************/
+
+#define H5FD_IOC__IO_Q_MAGIC 0x2468
+
+typedef struct ioc_io_queue {
+
+ uint32_t magic;
+ ioc_io_queue_entry_t *q_head;
+ ioc_io_queue_entry_t *q_tail;
+ int32_t num_pending;
+ int32_t num_in_progress;
+ int32_t num_failed;
+ int32_t q_len;
+ uint32_t req_counter;
+ hg_thread_mutex_t q_mutex;
+
+ /* statistics */
+#ifdef H5FD_IOC_COLLECT_STATS
+ int32_t max_q_len;
+ int32_t max_num_pending;
+ int32_t max_num_in_progress;
+ int64_t ind_read_requests;
+ int64_t ind_write_requests;
+ int64_t truncate_requests;
+ int64_t get_eof_requests;
+ int64_t requests_queued;
+ int64_t requests_dispatched;
+ int64_t requests_completed;
+#endif
+
+} ioc_io_queue_t;
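+
+/*
+ * Illustrative sketch (not part of the build) of the dispatch scan described
+ * in the comment above.  The helpers example_requests_intersect(),
+ * example_is_write(), and example_dispatch() are hypothetical stand-ins for
+ * the real overlap test, request-type test, and thread-pool hand-off, and
+ * num_worker_threads is an assumed parameter.
+ */
+#if 0
+static void
+example_dispatch_pending(ioc_io_queue_t *q_ptr, int32_t num_worker_threads)
+{
+    ioc_io_queue_entry_t *x;
+
+    hg_thread_mutex_lock(&q_ptr->q_mutex);
+
+    /* 1) start at the head of the queue and walk toward the tail */
+    for (x = q_ptr->q_head; x != NULL; x = x->next) {
+        hbool_t               conflict = FALSE;
+        ioc_io_queue_entry_t *scan;
+
+        if (x->in_progress)
+            continue; /* 4) already handed to a worker thread */
+
+        /* 2) / 3) look for a prior intersecting entry that forces ordering */
+        for (scan = q_ptr->q_head; scan != x; scan = scan->next) {
+            if (example_requests_intersect(scan, x) &&
+                (example_is_write(scan) || example_is_write(x))) {
+                conflict = TRUE; /* rule 3: the earlier request must finish first */
+                break;
+            }
+        }
+
+        if (conflict)
+            continue; /* 4) try the next entry further down the queue */
+
+        /* 5) X is pending and safe to run -- mark it in progress and dispatch */
+        x->in_progress = TRUE;
+        q_ptr->num_pending--;
+        q_ptr->num_in_progress++;
+        example_dispatch(x);
+
+        if (q_ptr->num_in_progress >= num_worker_threads)
+            break; /* no idle worker threads -- stop scanning */
+    }
+
+    hg_thread_mutex_unlock(&q_ptr->q_mutex);
+}
+#endif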
+
+/*
+ * Structure definitions to enable async I/O completions.
+ * We first define a structure which contains the basic
+ * input arguments for the functions that were originally
+ * invoked. See below.
+ */
+typedef struct _client_io_args {
+ int ioc; /* ID of the IO Concentrator handling this IO. */
+ int64_t context_id; /* The context id provided for the read or write */
+ int64_t offset; /* The file offset for the IO operation */
+ int64_t elements; /* How many bytes */
+ void * data; /* A pointer to the (contiguous) data segment */
+ MPI_Request io_req; /* An MPI request to allow the code to loop while */
+ /* making progress on multiple IOs */
+} io_args_t;
+
+typedef struct _client_io_func {
+ int (*io_function)(void *this_io); /* pointer to a completion function */
+ io_args_t io_args; /* arguments passed to the completion function */
+ int pending; /* Whether the function is still pending (1) or complete (0) */
+} io_func_t;
+
+typedef struct _io_req {
+ struct _io_req *prev; /* A simple list structure containing completion */
+ struct _io_req *next; /* functions. These should get removed as IO ops */
+ io_func_t completion_func; /* are completed */
+} io_req_t;
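+
+/*
+ * Illustrative sketch (not part of the build): one way a list of pending
+ * completions built from the structures above might be drained.  The list
+ * handling and the direct use of MPI_Test here are assumptions made for the
+ * example; the real progress loop lives in the IOC/subfiling code.
+ */
+#if 0
+static void
+example_progress_io_reqs(io_req_t **list_head_ptr)
+{
+    io_req_t *req = *list_head_ptr;
+
+    while (req != NULL) {
+        io_req_t *next = req->next;
+        int       done = 0;
+
+        /* check whether the underlying MPI operation has finished */
+        MPI_Test(&req->completion_func.io_args.io_req, &done, MPI_STATUS_IGNORE);
+
+        if (done) {
+            /* run the completion callback, then unlink and free the node */
+            req->completion_func.io_function(&req->completion_func);
+            req->completion_func.pending = 0;
+
+            if (req->prev)
+                req->prev->next = next;
+            else
+                *list_head_ptr = next;
+            if (next)
+                next->prev = req->prev;
+
+            HDfree(req);
+        }
+
+        req = next;
+    }
+}
+#endif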
+
+extern int *H5FD_IOC_tag_ub_val_ptr;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+H5_DLL int initialize_ioc_threads(void *_sf_context);
+H5_DLL int finalize_ioc_threads(void *_sf_context);
+
+H5_DLL herr_t ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset,
+ int64_t elements, const void *data, io_req_t **io_req);
+H5_DLL herr_t ioc__read_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset,
+ int64_t elements, void *data, io_req_t **io_req);
+
+H5_DLL int wait_for_thread_main(void);
+
+#ifdef __cplusplus
+}
+#endif
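+
+/*
+ * Illustrative sketch (not part of the build): issuing an independent
+ * asynchronous write through the API above and then waiting on the MPI
+ * request it carries.  The direct MPI_Wait on the embedded request and the
+ * HDfree() of the returned io_req_t are assumptions made for this example.
+ */
+#if 0
+static herr_t
+example_async_write(int64_t context_id, int n_io_concentrators, int64_t offset,
+                    int64_t elements, const void *buf)
+{
+    io_req_t *io_req = NULL;
+
+    if (ioc__write_independent_async(context_id, n_io_concentrators, offset,
+                                     elements, buf, &io_req) < 0)
+        return FAIL;
+
+    /* block until the underlying MPI operation completes */
+    MPI_Wait(&io_req->completion_func.io_args.io_req, MPI_STATUS_IGNORE);
+    HDfree(io_req);
+
+    return SUCCEED;
+}
+#endif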
+
+#endif /* H5FDioc_priv_H */