1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
|
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright by The HDF Group. *
* All rights reserved. *
* *
* This file is part of HDF5. The full HDF5 copyright notice, including *
* terms governing use, modification, and redistribution, is contained in *
* the COPYING file, which can be found at the root of the source code *
* distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
* If you do not have access to either file, you may request a copy from *
* help@hdfgroup.org. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
* Header file for shared code between the HDF5 Subfiling VFD and IOC VFD
*/
#ifndef H5_SUBFILING_COMMON_H
#define H5_SUBFILING_COMMON_H
#include <stdatomic.h>
#include "H5private.h"
#include "H5Iprivate.h"
/* TODO: needed for ioc_selection_t, which also needs to be public */
#include "H5FDioc.h"
/*
* Some definitions for debugging the Subfiling feature
*/
/* #define H5_SUBFILING_DEBUG */
/*
 * printf-style template for a subfile filename.
 *
 * The format consumes four arguments, in this order:
 *   1. a uint64_t, printed with PRIu64
 *   2. an int giving the minimum field width for the next value (the `*`)
 *   3. an int, zero-padded to that width
 *   4. an int
 * Presumably these are the HDF5 file ID, the digit count, the subfile
 * index and the subfile total -- confirm against the callers.
 *
 * Note that eventually we shouldn't use 0_of_N since we
 * intend to use the user defined HDF5 filename for a
 * zeroth subfile as well as for all metadata.
 */
#define SF_FILENAME_TEMPLATE ".subfile_%" PRIu64 "_%0*d_of_%d"
/*
 * printf-style template for a subfiling configuration filename.
 * The format consumes a single uint64_t argument (PRIu64).
 */
#define SF_CONFIG_FILENAME_TEMPLATE ".subfile_%" PRIu64 ".config"
/*
 * Environment variables interpreted by the HDF5 subfiling feature.
 * Each macro expands to the name of the variable as a string literal,
 * i.e. the macro name and the environment variable name are identical.
 */
#define H5_IOC_SELECTION_CRITERIA "H5_IOC_SELECTION_CRITERIA"
#define H5_IOC_COUNT_PER_NODE "H5_IOC_COUNT_PER_NODE"
#define H5_IOC_STRIPE_SIZE "H5_IOC_STRIPE_SIZE"
#define H5_IOC_SUBFILE_PREFIX "H5_IOC_SUBFILE_PREFIX"
/*
 * Default stripe depth: 32 * 1024 * 1024 = 33554432.
 * Presumably a byte count (32 MiB) -- confirm against the code that uses it.
 */
#define H5FD_DEFAULT_STRIPE_DEPTH (32 * 1024 * 1024)
/*
* MPI tags are 32 bits wide; we treat them as unsigned so that the
* available bits can be used for RPC selections, i.e. a message from
* the VFD read or write functions to an IO Concentrator. The messages
* themselves are in general ONLY 3 int64_t values which define
* a) the data size to be read or written, b) the file offset where
* the data will be read from or stored, and c) the context_id, which
* allows the IO concentrator to locate the IO context for the new IO
* transaction.
*
* 0000
* 0001 READ_OP (Independent)
* 0010 WRITE_OP (Independent)
* 0011 /////////
* 0100 CLOSE_OP (Independent)
* -----
* 1000
* 1001 COLLECTIVE_READ
* 1010 COLLECTIVE_WRITE
* 1011 /////////
* 1100 COLLECTIVE_CLOSE
*
* 31 28 24 20 16 12 8 4 0|
* +-------+-------+-------+-------+-------+-------+-------+-------+
* | | | ACKS | OP |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
*/
/* Bit 3 SET indicates collectives */
#define COLL_FUNC (0x1 << 3)
/* Single-bit flags occupying bits 8 through 11, i.e. directly above the low 8 bits */
#define ACK_PART (0x01 << 8)
#define DATA_PART (0x02 << 8)
#define READY (0x04 << 8)
#define COMPLETED (0x08 << 8)
/*
 * NOTE(review): despite the name, this literal is far wider than 32 bits
 * (it looks like the maximum int64_t value). Confirm which width the users
 * of this mask actually expect before relying on the name.
 */
#define INT32_MASK 0x07FFFFFFFFFFFFFFF
/*
 * Composite tag values. These expand to expressions over the io_ops
 * enumerators (READ_OP, WRITE_OP, GET_EOF_OP, LOGGING_OP), so those
 * enumerators must be declared wherever these macros are used.
 */
#define READ_INDEP (READ_OP)
#define READ_COLL (COLL_FUNC | READ_OP)
#define WRITE_INDEP (WRITE_OP)
#define WRITE_COLL (COLL_FUNC | WRITE_OP)
#define GET_EOF_COMPLETED (COMPLETED | GET_EOF_OP)
#define SET_LOGGING (LOGGING_OP)
/* MPI tag values for data communicator */
#define WRITE_INDEP_ACK 0
#define READ_INDEP_DATA 1
#define WRITE_TAG_BASE 2
/*
 * Kinds of subfiling objects.
 *
 * A value of this type is supplied when a new subfiling object ID is
 * generated and when the cache of stored subfiling objects is accessed.
 */
typedef enum {
    SF_BADID    = -1,
    SF_TOPOLOGY = 1,
    SF_CONTEXT  = 2,
    SF_NTYPES /* number of subfiling object types, MUST BE LAST (evaluates to 3) */
} sf_obj_type_t;
/*
 * Basic 'op codes' used when constructing an RPC message for
 * IO Concentrators. They live in the low 8 bits of the message.
 *
 * The values are written in hexadecimal to make the bit patterns easy
 * to read; they are numerically 1, 2, 3, 4, 5, 6, 8 and 16.
 */
typedef enum io_ops {
    READ_OP    = 0x01,
    WRITE_OP   = 0x02,
    OPEN_OP    = 0x03,
    CLOSE_OP   = 0x04,
    TRUNC_OP   = 0x05,
    GET_EOF_OP = 0x06,
    FINI_OP    = 0x08,
    LOGGING_OP = 0x10
} io_op_t;
/*
 * One {rank, hostid} record per application rank.
 *
 * Every application rank records its MPI rank and hostid in this
 * structure. The records eventually get communicated to MPI rank
 * zero(0), where they are sorted before being broadcast. The sorted
 * vector is the basis for deciding which MPI ranks will host an
 * IO Concentrator (IOC); e.g. the default behavior chooses the first
 * vector entry associated with a "new" hostid.
 */
typedef struct {
    long rank;   /* MPI rank of the recording process */
    long hostid; /* hostid of the recording process */
} layout_t;
/* This typedef defines a fixed process layout which
 * can be reused for any number of file open operations
 */
typedef struct app_layout_t {
long hostid; /* value returned by gethostid() */
layout_t *layout; /* Vector of {rank,hostid} values */
int * node_ranks; /* ranks extracted from sorted layout */
int node_count; /* Total nodes (different hostids) */
int node_index; /* My node: index into node_ranks */
int local_peers; /* How many local peers on my node */
int world_rank; /* My MPI rank */
int world_size; /* Total number of MPI ranks */
} app_layout_t;
/*
 * This typedef defines things related to IOC selections.
 *
 * ioc_selection_t is not declared in this header; it is pulled in
 * through H5FDioc.h.
 */
typedef struct topology {
app_layout_t * app_layout; /* Pointer to our layout struct */
bool rank_is_ioc; /* Indicates that we host an IOC */
int subfile_rank; /* Valid only if rank_is_ioc */
int n_io_concentrators; /* Number of IO concentrators */
int * io_concentrators; /* Vector of ranks which are IOCs (presumably n_io_concentrators entries -- confirm) */
int * subfile_fd; /* file descriptor (if IOC); NOTE(review): declared as a pointer, confirm what it points to */
ioc_selection_t selection_type; /* Cache our IOC selection criteria */
} sf_topology_t;
/*
 * Subfiling context: groups the identifiers, statistics counters,
 * striping parameters, MPI communicators, filenames and topology
 * pointer associated with one sf_context_id.
 *
 * The two log-file members exist only when H5_SUBFILING_DEBUG is
 * defined, so the size and layout of this struct depend on that macro;
 * every translation unit using it must agree on the setting.
 */
typedef struct {
int64_t sf_context_id; /* Generated context ID which embeds the cache index */
uint64_t h5_file_id; /* GUID (basically the inode value) */
int sf_fid; /* value returned by open(file,..) */
size_t sf_write_count; /* Statistics: write_count */
size_t sf_read_count; /* Statistics: read_count */
haddr_t sf_eof; /* File eof */
int64_t sf_stripe_size; /* Stripe-depth */
int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */
int64_t sf_base_addr; /* For an IOC, our base address */
MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */
MPI_Comm sf_data_comm; /* MPI comm used to move data */
MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */
MPI_Comm sf_barrier_comm; /* MPI comm used for barrier operations */
MPI_Comm sf_group_comm; /* Not used: for IOC collectives */
MPI_Comm sf_intercomm; /* Not used: for msgs to all IOC */
int sf_group_size; /* IOC count (in sf_group_comm) */
int sf_group_rank; /* IOC rank (in sf_group_comm) */
int sf_intercomm_root; /* Not used: for IOC comms */
char * subfile_prefix; /* If subfiles are node-local */
char * sf_filename; /* A generated subfile name */
char * h5_filename; /* The user supplied file name */
void * ioc_data; /* Private data for underlying IOC */
sf_topology_t *topology; /* pointer to our topology */
#ifdef H5_SUBFILING_DEBUG
char sf_logfile_name[PATH_MAX]; /* Debug builds only: log file name buffer */
FILE *sf_logfile; /* Debug builds only: log file stream */
#endif
} subfiling_context_t;
/* The following is a somewhat augmented input (by the IOC) which captures
 * the basic RPC from a 'source'. The fields are filled out to allow
 * an easy gathering of statistics by the IO Concentrator.
 *
 * NOTE(review): in_progress, serialize and dependents are declared
 * volatile, and the comments describe a worker thread waiting on them.
 * volatile alone does not make accesses atomic or ordered between
 * threads; this header already includes <stdatomic.h>. Confirm how
 * these fields are synchronized where they are used.
 */
typedef struct {
/* {Datasize, Offset, FileID} */
int64_t header[3]; /* The basic RPC input plus */
int tag; /* the supplied OPCODE tag */
int source; /* Rank of who sent the message */
int subfile_rank; /* The IOC rank */
int64_t context_id; /* context to be used to complete */
double start_time; /* the request, + time of receipt */
/* from which we calc Time(queued) */
void *buffer; /* for writes, we keep the buffer */
/* around for awhile... */
volatile int in_progress; /* Not used! */
volatile int serialize; /* worker thread needs to wait while true */
volatile int dependents; /* If current work item has dependents */
int depend_id; /* work queue index of the dependent */
} sf_work_request_t;
/*
 * Globals shared by code that includes this header. They are only
 * declared here; each must be defined in exactly one source file.
 */
extern int sf_verbose_flag;
extern app_layout_t *sf_app_layout;
/*
 * Function prototypes, given C linkage when the header is included
 * from C++.
 *
 * Only the signatures are visible in this header. The short notes
 * below are read off the names and parameter types and should be
 * confirmed against the implementation.
 */
#ifdef __cplusplus
extern "C" {
#endif
/* Presumably opens the subfiles for base_filename and reports the new
 * context ID through context_id_out -- confirm. */
H5_DLL herr_t H5_open_subfiles(const char *base_filename, uint64_t h5_file_id,
ioc_selection_t ioc_selection_type, int file_acc_flags, MPI_Comm file_comm,
int64_t *context_id_out);
/* Takes a subfiling context ID; presumably the counterpart of H5_open_subfiles. */
H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id);
/* Takes an object type and an index value and returns an int64_t object ID. */
H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val);
/* Returns an untyped pointer for the given object ID; the caller must
 * know the concrete type. */
H5_DLL void * H5_get_subfiling_object(int64_t object_id);
/* Maps a uint64_t HDF5 file ID to an int64_t value (a context ID, by name). */
H5_DLL int64_t H5_subfile_fid_to_context(uint64_t h5_fid);
/* Takes an object ID; presumably releases the associated object. */
H5_DLL herr_t H5_free_subfiling_object(int64_t object_id);
/* printf-style variadic logging call keyed by a subfiling context ID. */
H5_DLL void H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...);
/* NOTE(review): unlike the prototypes above, this one is not marked
 * H5_DLL and has no H5_ prefix -- confirm that this is intentional. */
void set_verbose_flag(int subfile_rank, int new_value);
#ifdef __cplusplus
}
#endif
#endif /* H5_SUBFILING_COMMON_H */
|