summaryrefslogtreecommitdiffstats
path: root/src/H5FDsubfiling/H5subfiling_common.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5FDsubfiling/H5subfiling_common.h')
-rw-r--r--src/H5FDsubfiling/H5subfiling_common.h257
1 files changed, 257 insertions, 0 deletions
diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h
new file mode 100644
index 0000000..cfcbf4a
--- /dev/null
+++ b/src/H5FDsubfiling/H5subfiling_common.h
@@ -0,0 +1,257 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Header file for shared code between the HDF5 Subfiling VFD and IOC VFD
+ */
+
+#ifndef H5_SUBFILING_COMMON_H
+#define H5_SUBFILING_COMMON_H
+
+#include <stdatomic.h>
+
+#include "H5private.h"
+#include "H5Iprivate.h"
+
+/* TODO: needed for ioc_selection_t, which also needs to be public */
+#include "H5FDioc.h"
+
+/*
+ * Some definitions for debugging the Subfiling feature
+ */
+/* #define H5_SUBFILING_DEBUG */
+
+/*
+ * The following is our basic template for a subfile filename.
+ * Note that eventually we shouldn't use 0_of_N since we
+ * intend to use the user defined HDF5 filename for a
+ * zeroth subfile as well as for all metadata.
+ */
+#define SF_FILENAME_TEMPLATE ".subfile_%" PRIu64 "_%0*d_of_%d"
+
+/*
+ * The following is our basic template for a subfiling
+ * configuration filename.
+ */
+#define SF_CONFIG_FILENAME_TEMPLATE ".subfile_%" PRIu64 ".config"
+
+/*
+ * Environment variables interpreted by the HDF5 subfiling feature
+ */
+#define H5_IOC_SELECTION_CRITERIA "H5_IOC_SELECTION_CRITERIA"
+#define H5_IOC_COUNT_PER_NODE "H5_IOC_COUNT_PER_NODE"
+#define H5_IOC_STRIPE_SIZE "H5_IOC_STRIPE_SIZE"
+#define H5_IOC_SUBFILE_PREFIX "H5_IOC_SUBFILE_PREFIX"
+
+#define H5FD_DEFAULT_STRIPE_DEPTH (32 * 1024 * 1024)
+
+/*
+ * MPI Tags are 32 bits, we treat them as unsigned
+ * to allow the use of the available bits for RPC
+ * selections, i.e. a message from the VFD read or write functions
+ * to an IO Concentrator. The messages themselves are in general
+ * ONLY 3 int64_t values which define a) the data size to be read
+ * or written, b) the file offset where the data will be read from
+ * or stored, and c) the context_id allows the IO concentrator to
+ * locate the IO context for the new IO transaction.
+ *
+ * 0000
+ * 0001 READ_OP (Independent)
+ * 0010 WRITE_OP (Independent)
+ * 0011 /////////
+ * 0100 CLOSE_OP (Independent)
+ * -----
+ * 1000
+ * 1001 COLLECTIVE_READ
+ * 1010 COLLECTIVE_WRITE
+ * 1011 /////////
+ * 1100 COLLECTIVE_CLOSE
+ *
+ * 31 28 24 20 16 12 8 4 0|
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | | | ACKS | OP |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ */
+
+/* Bit 3 SET indicates collectives */
+#define COLL_FUNC (0x1 << 3)
+
+#define ACK_PART (0x01 << 8)
+#define DATA_PART (0x02 << 8)
+#define READY (0x04 << 8)
+#define COMPLETED (0x08 << 8)
+
+#define INT32_MASK 0x07FFFFFFFFFFFFFFF
+
+#define READ_INDEP (READ_OP)
+#define READ_COLL (COLL_FUNC | READ_OP)
+#define WRITE_INDEP (WRITE_OP)
+#define WRITE_COLL (COLL_FUNC | WRITE_OP)
+
+#define GET_EOF_COMPLETED (COMPLETED | GET_EOF_OP)
+
+#define SET_LOGGING (LOGGING_OP)
+
+/* MPI tag values for data communicator */
+#define WRITE_INDEP_ACK 0
+#define READ_INDEP_DATA 1
+#define WRITE_TAG_BASE 2
+
+/*
+ * Object type definitions for subfiling objects.
+ * Used when generating a new subfiling object ID
+ * or accessing the cache of stored subfiling
+ * objects.
+ */
+typedef enum {
+ SF_BADID = (-1),
+ SF_TOPOLOGY = 1,
+ SF_CONTEXT = 2,
+ SF_NTYPES /* number of subfiling object types, MUST BE LAST */
+} sf_obj_type_t;
+
+/* The following are the basic 'op codes' used when
+ * constructing a RPC message for IO Concentrators.
+ * These are defined in the low 8 bits of the
+ * message.
+ */
+typedef enum io_ops {
+ READ_OP = 1,
+ WRITE_OP = 2,
+ OPEN_OP = 3,
+ CLOSE_OP = 4,
+ TRUNC_OP = 5,
+ GET_EOF_OP = 6,
+ FINI_OP = 8,
+ LOGGING_OP = 16
+} io_op_t;
+
+/* Every application rank will record their MPI rank
+ * and hostid as a structure. These eventually get
+ * communicated to MPI rank zero(0) and sorted before
+ * being broadcast. The resulting sorted vector
+ * provides a basis for determining which MPI ranks
+ * will host an IO Concentrator (IOC), e.g. For
+ * default behavior, we choose the first vector entry
+ * associated with a "new" hostid.
+ */
+typedef struct {
+ long rank;
+ long hostid;
+} layout_t;
+
+/* This typedef defines a fixed process layout which
+ * can be reused for any number of file open operations
+ */
+typedef struct app_layout_t {
+ long hostid; /* value returned by gethostid() */
+ layout_t *layout; /* Vector of {rank,hostid} values */
+ int * node_ranks; /* ranks extracted from sorted layout */
+ int node_count; /* Total nodes (different hostids) */
+ int node_index; /* My node: index into node_ranks */
+ int local_peers; /* How may local peers on my node */
+ int world_rank; /* My MPI rank */
+ int world_size; /* Total number of MPI ranks */
+} app_layout_t;
+
+/* This typedef defines things related to IOC selections */
+typedef struct topology {
+ app_layout_t * app_layout; /* Pointer to our layout struct */
+ bool rank_is_ioc; /* Indicates that we host an IOC */
+ int subfile_rank; /* Valid only if rank_is_ioc */
+ int n_io_concentrators; /* Number of IO concentrators */
+ int * io_concentrators; /* Vector of ranks which are IOCs */
+ int * subfile_fd; /* file descriptor (if IOC) */
+ ioc_selection_t selection_type; /* Cache our IOC selection criteria */
+} sf_topology_t;
+
+typedef struct {
+ int64_t sf_context_id; /* Generated context ID which embeds the cache index */
+ uint64_t h5_file_id; /* GUID (basically the inode value) */
+ int sf_fid; /* value returned by open(file,..) */
+ size_t sf_write_count; /* Statistics: write_count */
+ size_t sf_read_count; /* Statistics: read_count */
+ haddr_t sf_eof; /* File eof */
+ int64_t sf_stripe_size; /* Stripe-depth */
+ int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */
+ int64_t sf_base_addr; /* For an IOC, our base address */
+ MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */
+ MPI_Comm sf_data_comm; /* MPI comm used to move data */
+ MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */
+ MPI_Comm sf_barrier_comm; /* MPI comm used for barrier operations */
+ MPI_Comm sf_group_comm; /* Not used: for IOC collectives */
+ MPI_Comm sf_intercomm; /* Not used: for msgs to all IOC */
+ int sf_group_size; /* IOC count (in sf_group_comm) */
+ int sf_group_rank; /* IOC rank (in sf_group_comm) */
+ int sf_intercomm_root; /* Not used: for IOC comms */
+ char * subfile_prefix; /* If subfiles are node-local */
+ char * sf_filename; /* A generated subfile name */
+ char * h5_filename; /* The user supplied file name */
+ void * ioc_data; /* Private data for underlying IOC */
+ sf_topology_t *topology; /* pointer to our topology */
+
+#ifdef H5_SUBFILING_DEBUG
+ char sf_logfile_name[PATH_MAX];
+ FILE *sf_logfile;
+#endif
+
+} subfiling_context_t;
+
+/* The following is a somewhat augmented input (by the IOC) which captures
+ * the basic RPC from a 'source'. The fields are filled out to allow
+ * an easy gathering of statistics by the IO Concentrator.
+ */
+typedef struct {
+ /* {Datasize, Offset, FileID} */
+ int64_t header[3]; /* The basic RPC input plus */
+ int tag; /* the supplied OPCODE tag */
+ int source; /* Rank of who sent the message */
+ int subfile_rank; /* The IOC rank */
+ int64_t context_id; /* context to be used to complete */
+ double start_time; /* the request, + time of receipt */
+ /* from which we calc Time(queued) */
+ void *buffer; /* for writes, we keep the buffer */
+ /* around for awhile... */
+ volatile int in_progress; /* Not used! */
+ volatile int serialize; /* worker thread needs to wait while true */
+ volatile int dependents; //* If current work item has dependents */
+ int depend_id; /* work queue index of the dependent */
+} sf_work_request_t;
+
+extern int sf_verbose_flag;
+
+extern app_layout_t *sf_app_layout;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+H5_DLL herr_t H5_open_subfiles(const char *base_filename, uint64_t h5_file_id,
+ ioc_selection_t ioc_selection_type, int file_acc_flags, MPI_Comm file_comm,
+ int64_t *context_id_out);
+H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id);
+
+H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val);
+H5_DLL void * H5_get_subfiling_object(int64_t object_id);
+H5_DLL int64_t H5_subfile_fid_to_context(uint64_t h5_fid);
+H5_DLL herr_t H5_free_subfiling_object(int64_t object_id);
+
+H5_DLL void H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...);
+
+void set_verbose_flag(int subfile_rank, int new_value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* H5_SUBFILING_COMMON_H */