summaryrefslogtreecommitdiffstats
path: root/examples/h5_shared_mesg.c
blob: a2edb38e077ae1a7ea699dd12c9746d717f7230a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * Copyright by the Board of Trustees of the University of Illinois.         *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the files COPYING and Copyright.html.  COPYING can be found at the root   *
 * of the source code distribution tree; Copyright.html can be found at the  *
 * root level of an installed copy of the electronic HDF5 document set and   *
 * is linked from the top-level documents page.  It can also be found at     *
 * http://hdfgroup.org/HDF5/doc/Copyright.html.  If you do not have          *
 * access to either file, you may request a copy from help@hdfgroup.org.     *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 *  This program illustrates the usage of HDF5's implicit message sharing
 *  feature, which can be used to save space when the same messages are
 *  used many times in a file.
 *
 *  This example creates a standard file using file creation property lists
 *  to control which messages are shared.  Messages that can be shared are
 *  datatypes, dataspaces, attributes, fill values, and filter pipelines.
 *
 */

#include <stdlib.h>

#include "hdf5.h"

#define NUM_DATASETS 40
const char* DSETNAME[] = {
    "dataset0",    "dataset1",
    "dataset2",    "dataset3",
    "dataset4",    "dataset5",
    "dataset6",    "dataset7",
    "dataset8",    "dataset9",
    "dataset10",   "dataset11",
    "dataset12",   "dataset13",
    "dataset14",   "dataset15",
    "dataset16",   "dataset17",
    "dataset18",   "dataset19",
    "dataset20",   "dataset21",
    "dataset22",   "dataset23",
    "dataset24",   "dataset25",
    "dataset26",   "dataset27",
    "dataset28",   "dataset29",
    "dataset30",   "dataset31",
    "dataset32",   "dataset33",
    "dataset34",   "dataset35",
    "dataset36",   "dataset37",
    "dataset38",   "dataset39",
    NULL
};

herr_t create_standard_file(const char *filename, hid_t fcpl);

/*-------------------------------------------------------------------------
 * Function:    main
 *
 * Purpose:     Enables shared messages using File Creation Property Lists
 *              and creates files using these settings.
 *
 *-------------------------------------------------------------------------
 */
int main(void)
{
    hid_t fcpl_id;
    herr_t ret;

    /* Create a file creation property list */
    fcpl_id = H5Pcreate(H5P_FILE_CREATE);
    if(fcpl_id < 0) goto error;

    /* The file creation property list is the default list right now.
     * Create a file using it (this is the same as creating a file with
     * H5P_DEFAULT).  Implicit shared messages will be disabled.
     */
    ret = create_standard_file("default_file.h5", fcpl_id);
    if(ret < 0) goto error;

    /* There are five kinds of messages that can be shared: datatypes,
     * dataspaces, attributes, fill values, and filter pipelines.
     * Shared messages are stored in up to five "indexes," where each
     * index can contain one or more types of message.  Using more indexes
     * will result in more overhead for sharing, but can also provide
     * more "tunability" and may affect caching performance.
     */
    /* To begin with, use only one index. */
    ret = H5Pset_shared_mesg_nindexes(fcpl_id, 1);
    if(ret < 0) goto error;

    /* Each index has a "minimum message size" for a message of that
     * type to be shared.  Since sharing a message creates some overhead,
     * this is to prevent this overhead for very small messages when little
     * space would be saved by sharing them anyway.
     * If the content of the file isn't known beforehand, it's probably best
     * to set the minimum size "high"; over 100 or 200 bytes.  If the content
     * of the file is known, this value can be used to trade space saved for
     * performance lost.  The smaller this value is, the more messages will
     * be shared, so the more overhead will be incurred.
     * This value is in bytes.  A shared message involves about 30 bytes of
     * overhead.  Note that even messages that are only written once will
     * require this overhead (since they "might" be shared in the future),
     * so setting the minimum size too low may result in a file actually growing
     * in size.
     * For this example case, we'll set the minimum sharing size to be small
     * since we know that every message the "standard" file uses will be
     * repeated many times.
     */
    /* The other property that each index has is the kinds of messages that
     * it holds.  For the simple case, we'll put every message that could be
     * shared in this single index.
     */
    ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_ALL_FLAG, 40);
    if(ret < 0) goto error;

    /* The other property that can be set for shared messages is the
     * list/B-tree cutoff for the indexes.
     * Each shared message index beins life as a simple list of messages
     * and becomes a B-tree when "too many" messages are written to it.
     * This keeps the indexes simple when only a few messages are shared,
     * but allows them to scale for many messages.  If many messages are
     * deleted from the B-tree, it scales back down into a list.
     * A "resonable" setting for maximum list size and minimum btree size
     * depends on what kinds of messages will be stored in the file.
     * These numbers are the same for all indexes in a file.
     * We'll guess at some numbers, though we could just as easily have kept
     * the default values.  The first value is the maximum list size, the
     * second the minimum B-tree size.
     */
    ret = H5Pset_shared_mesg_phase_change(fcpl_id, 30, 20);
    if(ret < 0) goto error;

    /* Now create a file with this property list.  After the FCPL is used,
     * everything is automatic; messages will be shared and this will be
     * completely transparent to the user.  Even if the file is closed
     * and re-opened, this settings will be saved and applied to messages
     * written later.
     */
    ret = create_standard_file("one_index_file.h5", fcpl_id);
    if(ret < 0) goto error;

    /* Now try some variations on this.  The FCPL hasn't been closed, so
     * we don't need to re-create it.
     * For instance, if we set the index to only share very large
     * messages, none of the messages we write will qualify and the file
     * will be about the same size as a normal file (with just a little extra
     * overhead).
     */
    ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_ALL_FLAG, 1000);
    if(ret < 0) goto error;

    ret = create_standard_file("only_huge_mesgs_file.h5", fcpl_id);
    if(ret < 0) goto error;


    /* Or, suppose we only wanted to shared dataspaces and
     * attributes (which might make sense if we were going to use committed
     * datatypes).  We could change the flags on the index:
     */
    ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_SDSPACE_FLAG | H5O_SHMESG_ATTR_FLAG, 40);
    if(ret < 0) goto error;

    ret = create_standard_file("only_dspaces_and_attrs_file.h5", fcpl_id);
    if(ret < 0) goto error;


    /* We could create a second index and put attributes in it to separate them
     * from datatypes and dataspaces (and then run some performance metrics to
     * see whether this improved caching performance).
     */
    ret = H5Pset_shared_mesg_nindexes(fcpl_id, 2);
    if(ret < 0) goto error;
    ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_DTYPE_FLAG | H5O_SHMESG_SDSPACE_FLAG, 40);
    if(ret < 0) goto error;
    ret = H5Pset_shared_mesg_index(fcpl_id, 1, H5O_SHMESG_ATTR_FLAG, 40);
    if(ret < 0) goto error;

    ret = create_standard_file("separate_indexes_file.h5", fcpl_id);
    if(ret < 0) goto error;

    /* We can try twiddling the "phase change" values and see what it does to
     * the file size.  Since there's only a few different messages (two
     * datatypes, two dataspaces, and one attribute), using smaller lists will
     * save some space.
     */
    ret = H5Pset_shared_mesg_nindexes(fcpl_id, 1);
    if(ret < 0) goto error;
    ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_ALL_FLAG, 40);
    if(ret < 0) goto error;

    ret = H5Pset_shared_mesg_phase_change(fcpl_id, 5, 0);
    if(ret < 0) goto error;

    ret = create_standard_file("small_lists_file.h5", fcpl_id);
    if(ret < 0) goto error;

    /* Or we could create indexes that are never lists, but are created as
     * B-trees.  We do this by setting the "maximum list size" to zero.
     */
    ret = H5Pset_shared_mesg_phase_change(fcpl_id, 0, 0);
    if(ret < 0) goto error;

    ret = create_standard_file("btrees_file.h5", fcpl_id);
    if(ret < 0) goto error;


    /* Obviously there are a lot more permutations of these options possible.
     * Performance will often be a tradeoff of speed for space, but will
     * depend a great deal on the specific application.  If performance is
     * important, the best thing to do is to play with these settings to find
     * the ones that work best for you.
     * Please let The HDF Group (help@hdfgroup.org) know what you find!
     */


    /* Close the property list */
    ret = H5Pclose(fcpl_id);
    if(ret < 0) goto error;
    return 0;

error:
    return -1;
}

/*-------------------------------------------------------------------------
 * Function:    create_standard_file
 *
 * Purpose:     A helper functon for the example.  Creates an HDF5 file
 *              with many repeated messages using the file creation
 *              property list FCPL.
 *
 *              This function only uses datatypes, dataspaces, and
 *              attributes.  Fill values and filter pipelines can also
 *              be shared in the same way (i.e., by enabling sharing in
 *              the FCPL and writing the same message more than once).
 *-------------------------------------------------------------------------
 */
herr_t
create_standard_file(const char *filename, hid_t fcpl_id)
{
    hid_t file_id=-1;
    hid_t type_id=-1, temp_type_id=-1;
    hsize_t dims[] = {10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    hid_t space_id=-1;
    hid_t attr_type_id = -1;
    hid_t attr_space_id = -1;
    int attr_data[] = {1,2,3,4,5,6,7,8,9,0};
    hid_t dset_id=-1;
    hid_t attr_id=-1;
    int x;
    herr_t ret;

    /* Create the file */
    file_id = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl_id, H5P_DEFAULT);
    if(file_id < 0) goto error;

    /* Create the datatype we'll be using.  Generally, sharing messages
     * is most useful when the message is complex and takes more space on
     * disk, so this type will be an array type rather than an atomic type.
     * However, any type can be shared.
     */
    temp_type_id = H5Tarray_create2(H5T_NATIVE_INT, 10, dims);
    if(temp_type_id < 0) goto error;
    type_id  = H5Tarray_create2(temp_type_id, 10, dims);
    if(type_id < 0) goto error;
    ret = H5Tclose(temp_type_id);
    if(ret < 0) goto error;

    /* Create the dataspace we'll be using.
     * Again, create a more complex dataspace so that more space will
     * be saved when we share it.
     */
    space_id = H5Screate_simple(10, dims, dims);
    if(space_id < 0) goto error;

    /* Create a datatype and dataspace for the attributes we'll be creating.
     * The datatype will be a single integer, and each attribute will hold
     * 10 integers.
     */
    attr_type_id = H5Tcopy(H5T_NATIVE_INT);
    if(attr_type_id < 0) goto error;
    attr_space_id = H5Screate_simple(1, dims, dims);
    if(attr_space_id < 0) goto error;


    /* Begin using the messages many times.  Do this by creating datasets
     * that use this datatype, dataspace, and have this attribute.
     */
    for(x = 0; x < NUM_DATASETS; ++x) {
       /* Create a dataset */
       dset_id = H5Dcreate2(file_id, DSETNAME[x], type_id, space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
       if(dset_id < 0) goto error;

       /* Create an attribute on the dataset */
       attr_id = H5Acreate2(dset_id, "attr_name", attr_type_id, attr_space_id, H5P_DEFAULT, H5P_DEFAULT);
       if(attr_id < 0) goto error;

       /* Write data to the attribute */
       ret = H5Awrite(attr_id, H5T_NATIVE_INT, attr_data);
       if(ret < 0) goto error;

       ret = H5Aclose(attr_id);
       if(ret < 0) goto error;
       ret = H5Dclose(dset_id);
       if(ret < 0) goto error;
    }

    /* Close all open IDs */
    ret = H5Tclose(attr_type_id);
    if(ret < 0) goto error;
    ret = H5Sclose(attr_space_id);
    if(ret < 0) goto error;
    ret = H5Tclose(type_id);
    if(ret < 0) goto error;
    ret = H5Sclose(space_id);
    if(ret < 0) goto error;
    ret = H5Fclose(file_id);
    if(ret < 0) goto error;

    return 0;

error:
    return -1;
}