1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
|
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright by The HDF Group. *
* All rights reserved. *
* *
* This file is part of HDF5. The full HDF5 copyright notice, including *
* terms governing use, modification, and redistribution, is contained in *
* the COPYING file, which can be found at the root of the source code *
* distribution tree, or in https://www.hdfgroup.org/licenses. *
* If you do not have access to either file, you may request a copy from *
* help@hdfgroup.org. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
* Header file for shared code between the HDF5 Subfiling VFD and IOC VFD
*/
#ifndef H5_SUBFILING_COMMON_H
#define H5_SUBFILING_COMMON_H
#include <stdatomic.h>
#include "H5private.h"
#include "H5FDprivate.h"
#include "H5Iprivate.h"
#include "H5Pprivate.h"
#include "H5FDsubfiling.h"
#include "H5FDioc.h"
/*
 * Fallback definition for builds where none of the headers included
 * above provide PATH_MAX. Within this header the value sizes the
 * sf_logfile_name buffer of subfiling_context_t (H5_SUBFILING_DEBUG
 * builds only).
 */
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
/*
 * Compile-time switch for debugging the Subfiling feature.
 * Uncomment the definition below to enable it. Within this header it
 * adds the sf_logfile_name and sf_logfile members to subfiling_context_t.
 */
/* #define H5_SUBFILING_DEBUG */
/*
 * Compile-time switch for controlling performance across different
 * machines, where some types of MPI operations may be better optimized
 * than others. Uncomment the definition below to enable it.
 */
/* #define H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY */
/*
 * When the allgather-based topology is not preferred, the build is
 * rejected unless H5_CHECK_MPI_VERSION(3, 0) holds; per the error
 * message, MPI_Comm_split_type is what requires MPI 3.
 */
#ifndef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY
#if !H5_CHECK_MPI_VERSION(3, 0)
#error "MPI 3 required for MPI_Comm_split_type"
#endif
#endif
/*
 * Name of the HDF5 FAPL property that the Subfiling VFD
 * uses to pass its configuration down to the underlying
 * IOC VFD.
 */
#define H5FD_SUBFILING_CONFIG_PROP "H5FD_SUBFILING_CONFIG_PROP"
/*
 * Name of the HDF5 FAPL property that the Subfiling VFD
 * uses to pass the inode value of the HDF5 stub file to
 * the underlying IOC VFD.
 */
#define H5FD_SUBFILING_STUB_FILE_ID "H5FD_SUBFILING_STUB_FILE_ID"
/*
 * MPI tags are 32 bits wide; we treat them as unsigned to allow the use
 * of the available bits for RPC selections, i.e. a message from the VFD
 * read or write functions to an I/O Concentrator (IOC). The messages
 * themselves are in general ONLY 3 int64_t values, which define
 *   a) the data size to be read or written,
 *   b) the file offset where the data will be read from or stored, and
 *   c) the context_id, which allows the IOC to locate the I/O context
 *      for the new I/O transaction.
 *
 * Low four bits of the tag:
 *
 *   0000
 *   0001 READ_OP (Independent)
 *   0010 WRITE_OP (Independent)
 *   0011 /////////
 *   0100 CLOSE_OP (Independent)
 *   -----
 *   1000
 *   1001 COLLECTIVE_READ
 *   1010 COLLECTIVE_WRITE
 *   1011 /////////
 *   1100 COLLECTIVE_CLOSE
 *
 * NOTE(review): this table does not list every io_op_t value. OPEN_OP (3)
 * is shown as unused, TRUNC_OP (5), GET_EOF_OP (6) and LOGGING_OP (16) are
 * absent, and FINI_OP (8) has the same value as COLL_FUNC. Confirm against
 * the code that dispatches on these tags.
 *
 * Tag layout, as implied by the macros below (flag bits shifted left by 8)
 * and by the io_op_t comment (opcodes live in the low 8 bits):
 *
 *   31      28      24      20      16      12      8       4       0
 *   +-------+-------+-------+-------+-------+-------+-------+-------+
 *   |                                       | ACKS  |      OP       |
 *   +-------+-------+-------+-------+-------+-------+-------+-------+
 *
 */
/* Bit 3 SET indicates collectives */
#define COLL_FUNC (0x1 << 3)
/* Flag bits of the ACKS field (bits 8-11 of the tag) */
#define ACK_PART (0x01 << 8)
#define DATA_PART (0x02 << 8)
#define READY (0x04 << 8)
#define COMPLETED (0x08 << 8)
/*
 * NOTE(review): despite its name, this literal has more than eight
 * significant hex digits and therefore does not fit in 32 bits. Confirm
 * the intended width against its users before relying on it.
 */
#define INT32_MASK 0x07FFFFFFFFFFFFFFF
/* Composite tags: an io_op_t opcode, optionally OR'ed with COLL_FUNC */
#define READ_INDEP (READ_OP)
#define READ_COLL (COLL_FUNC | READ_OP)
#define WRITE_INDEP (WRITE_OP)
#define WRITE_COLL (COLL_FUNC | WRITE_OP)
/* Composite tags: an io_op_t opcode OR'ed with the COMPLETED flag */
#define GET_EOF_COMPLETED (COMPLETED | GET_EOF_OP)
#define TRUNC_COMPLETED (COMPLETED | TRUNC_OP)
/* Alias for the LOGGING_OP opcode */
#define SET_LOGGING (LOGGING_OP)
/*
 * MPI tag values for the data communicator (presumably the sf_data_comm
 * member of subfiling_context_t -- confirm against the senders and
 * receivers).
 *
 * NOTE(review): IO_TAG_BASE looks like the first tag value after the four
 * fixed tags, i.e. a base for further generated tags -- confirm.
 */
#define WRITE_INDEP_ACK 0
#define READ_INDEP_ACK 1
#define READ_INDEP_DATA 2
#define WRITE_DATA_DONE 3
#define IO_TAG_BASE 4
/*
 * Object type definitions for subfiling objects.
 * Used when generating a new subfiling object ID
 * or accessing the cache of stored subfiling
 * objects.
 *
 * NOTE(review): with the explicit values below, SF_NTYPES evaluates to 3,
 * which is one more than the largest object type value, not a count of
 * the two object types. Confirm how it is used before sizing or
 * bounds-checking with it.
 */
typedef enum {
SF_BADID = (-1),
SF_TOPOLOGY = 1,
SF_CONTEXT = 2,
SF_NTYPES /* number of subfiling object types, MUST BE LAST */
} sf_obj_type_t;
/*
 * The following are the basic 'op codes' used when
 * constructing an RPC message for I/O Concentrators.
 * These are defined in the low 8 bits of the
 * message.
 *
 * The READ_INDEP / READ_COLL / WRITE_INDEP / WRITE_COLL,
 * GET_EOF_COMPLETED / TRUNC_COMPLETED and SET_LOGGING macros in this
 * header are built from these values.
 *
 * NOTE(review): the value 7 is not assigned, and FINI_OP (8) is
 * numerically equal to the COLL_FUNC flag (0x1 << 3). Confirm that tag
 * decoding cannot confuse the two.
 */
typedef enum io_ops {
READ_OP = 1,
WRITE_OP = 2,
OPEN_OP = 3,
CLOSE_OP = 4,
TRUNC_OP = 5,
GET_EOF_OP = 6,
FINI_OP = 8,
LOGGING_OP = 16
} io_op_t;
/*
 * Every MPI rank in a file's communicator will
 * record its MPI rank for the file communicator
 * and its node-local MPI rank for the node's
 * communicator. The resulting information is
 * then broadcast to all MPI ranks and provides
 * a basis for determining which MPI ranks
 * will host an I/O concentrator.
 *
 * Members:
 *   rank            - MPI rank in the file communicator
 *   node_local_rank - MPI rank in the node's communicator
 *   node_local_size - presumably the size of the node's communicator
 *                     (TODO confirm)
 *   node_lead_rank  - presumably the file-communicator rank of the
 *                     node's lead process (TODO confirm)
 */
typedef struct {
int rank;
int node_local_rank;
int node_local_size;
int node_lead_rank;
} layout_t;
/*
 * This typedef defines a fixed process layout which
 * can be reused for any number of file open operations.
 */
typedef struct app_layout_t {
layout_t *layout; /* Array of layout_t entries (rank, node local rank, node local size, node lead rank) */
int *node_ranks; /* Array of lowest MPI rank values on each node */
int node_count; /* Total number of nodes */
int world_rank; /* MPI rank in file communicator */
int world_size; /* Size of file communicator */
int node_local_rank; /* MPI rank on node */
int node_local_size; /* Size of node intra-communicator */
} app_layout_t;
/*
 * This typedef defines things related to I/O concentrator (IOC)
 * selection: the application layout it is based on, the communicator
 * it applies to, and the resulting set of IOC ranks.
 */
typedef struct topology {
app_layout_t *app_layout; /* Pointer to our layout struct */
MPI_Comm app_comm; /* MPI communicator for this topology */
bool rank_is_ioc; /* Indicates that we host an IOC */
int ioc_idx; /* Valid only if rank_is_ioc */
int n_io_concentrators; /* Number of I/O concentrators */
int *io_concentrators; /* Vector of ranks which are IOCs */
H5FD_subfiling_ioc_select_t selection_type; /* Cache our IOC selection criteria */
} sf_topology_t;
/*
 * Per-file subfiling context: identifies the logical HDF5 file and the
 * subfiles this rank owns, and holds the striping parameters, the MPI
 * communicators and the topology used for it.
 *
 * The last two members exist only when H5_SUBFILING_DEBUG is defined, so
 * the size of this struct depends on that setting.
 */
typedef struct {
int64_t sf_context_id; /* Generated context ID which embeds the cache index */
uint64_t h5_file_id; /* GUID (basically the inode value) */
int *sf_fids; /* Array of file IDs for subfiles this rank owns */
int sf_num_fids; /* Number of subfiles this rank owns */
int sf_num_subfiles; /* Total number of subfiles for logical HDF5 file */
size_t sf_write_count; /* Statistics: write_count */
size_t sf_read_count; /* Statistics: read_count */
haddr_t sf_eof; /* File eof */
int64_t sf_stripe_size; /* Stripe-depth */
int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */
int64_t sf_base_addr; /* For an IOC, our base address */
MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */
MPI_Comm sf_data_comm; /* MPI comm used to move data */
MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */
MPI_Comm sf_node_comm; /* MPI comm used for intra-node comms */
MPI_Comm sf_group_comm; /* Not used: for IOC collectives */
int sf_group_size; /* IOC count (in sf_group_comm) */
int sf_group_rank; /* IOC rank (in sf_group_comm) */
char *subfile_prefix; /* If subfiles are node-local */
char *config_file_prefix; /* Prefix added to config file name */
char *h5_filename; /* The user supplied file name */
void *ioc_data; /* Private data for underlying IOC */
sf_topology_t *topology; /* Pointer to our topology */
#ifdef H5_SUBFILING_DEBUG
/* Log file name (at most PATH_MAX bytes) and its stream; debug builds only */
char sf_logfile_name[PATH_MAX];
FILE *sf_logfile;
#endif
} subfiling_context_t;
/*
 * The following is a somewhat augmented input (by the IOC) which captures
 * the basic RPC from a 'source'. The fields are filled out to allow
 * an easy gathering of statistics by the I/O Concentrator.
 *
 * NOTE(review): header[] presumably carries the three int64_t values of
 * an RPC message (data size, file offset, context_id); the order of the
 * elements is not visible from this header -- confirm.
 */
typedef struct {
int64_t header[3]; /* The basic RPC input */
int tag; /* The supplied OPCODE tag */
int source; /* Rank of who sent the message */
int ioc_idx; /* The IOC rank */
int64_t context_id; /* Context to be used to complete the request */
double start_time; /* Time of receipt of the request, */
/* from which we calc Time(queued) */
} sf_work_request_t;
/*
 * MPI Datatype used to send/receive an RPC message.
 * Declaration only; the definition is not part of this header.
 */
extern MPI_Datatype H5_subfiling_rpc_msg_type;
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Routines shared between the Subfiling VFD and the IOC VFD.
 *
 * Only the declarations are visible in this header. The notes below are
 * derived from the names and signatures and must be confirmed against
 * the implementations. Routines returning herr_t presumably follow the
 * usual HDF5 convention (non-negative on success, negative on failure)
 * -- TODO confirm.
 */

/*
 * Stub file and subfile lifecycle. Judging by the pointer parameters,
 * H5_open_subfiling_stub_file presumably returns the opened file through
 * file_ptr and its ID through file_id, and H5_open_subfiles presumably
 * returns a subfiling context ID through context_id_out, which is the kind
 * of value H5_close_subfiles takes.
 */
H5_DLL herr_t H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm,
H5FD_t **file_ptr, uint64_t *file_id);
H5_DLL herr_t H5_open_subfiles(const char *base_filename, uint64_t file_id,
H5FD_subfiling_params_t *subfiling_config, int file_acc_flags,
MPI_Comm file_comm, int64_t *context_id_out);
H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm);
/*
 * Subfiling object IDs: create an ID for an object of the given
 * sf_obj_type_t, and look an object up by ID. The lookup returns void *,
 * so the caller must know the object's actual type. What is returned for
 * an unknown ID is not visible here -- TODO confirm.
 */
H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type);
H5_DLL void *H5_get_subfiling_object(int64_t object_id);
/*
 * Reads subfiling settings from an already-open configuration file.
 * stripe_size and num_subfiles are presumably outputs.
 */
H5_DLL herr_t H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size,
int64_t *num_subfiles);
/*
 * Resolves filepath. The result is presumably returned through
 * resolved_filepath; who owns and frees that buffer is not visible here
 * -- TODO confirm.
 */
H5_DLL herr_t H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath);
/*
 * Property list helpers: setter/getter pairs for the subfiling
 * configuration and for the stub file ID. They presumably operate on the
 * properties named by H5FD_SUBFILING_CONFIG_PROP and
 * H5FD_SUBFILING_STUB_FILE_ID -- TODO confirm.
 */
H5_DLL herr_t H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr,
const H5FD_subfiling_params_t *vfd_config);
H5_DLL herr_t H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config);
H5_DLL herr_t H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id);
H5_DLL herr_t H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id);
/*
 * Maps a file ID to a subfiling context ID. The value returned when no
 * context matches is not visible here -- TODO confirm.
 */
H5_DLL int64_t H5_subfile_fid_to_context(uint64_t file_id);
/* Checks a subfiling configuration; takes it as const, so it is not modified */
H5_DLL herr_t H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config);
/* Shuts down the shared subfiling state; exactly what is released is not visible here */
H5_DLL herr_t H5_subfiling_terminate(void);
/*
 * printf-style logging for the given subfiling context. Presumably tied
 * to the log file members that subfiling_context_t has when
 * H5_SUBFILING_DEBUG is defined -- TODO confirm.
 */
H5_DLL void H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...);
#ifdef __cplusplus
}
#endif
#endif /* H5_SUBFILING_COMMON_H */
|