/* path: root/src/H5AC2public.h (blob 8c4051c5525871f3b1f8e265a8990d45e3d57b29) */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * Copyright by the Board of Trustees of the University of Illinois.         *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the files COPYING and Copyright.html.  COPYING can be found at the root   *
 * of the source code distribution tree; Copyright.html can be found at the  *
 * root level of an installed copy of the electronic HDF5 document set and   *
 * is linked from the top-level documents page.  It can also be found at     *
 * http://hdfgroup.org/HDF5/doc/Copyright.html.  If you do not have          *
 * access to either file, you may request a copy from help@hdfgroup.org.     *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*-------------------------------------------------------------------------
 *
 * Created:             H5AC2public.h
 *                      Jul 10 1997
 *                      Robb Matzke <matzke@llnl.gov>
 *
 * Purpose:             Public include file for cache functions.
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
#ifndef _H5AC2public_H
#define _H5AC2public_H

/* Public headers needed by this file */
#include "H5public.h"
#include "H5C2public.h"

#ifdef __cplusplus
extern "C" {
#endif

/****************************************************************************
 *
 * structure H5AC2_cache_config_t
 *
 * H5AC2_cache_config_t is a public structure intended for use in public APIs.
 * At least in its initial incarnation, it is basically a copy of struct
 * H5C2_auto_size_ctl_t, minus the report_fcn field, and plus the
 * dirty_bytes_threshold field.
 *
 * The report_fcn field is omitted, as including it would require us to
 * make H5C2_t structure public.
 *
 * The dirty_bytes_threshold field does not appear in H5C2_auto_size_ctl_t,
 * as synchronization between caches on different processes is handled at
 * the H5AC2 level, not at the level of H5C2.  Note however that there is
 * considerable interaction between this value and the other fields in this
 * structure.
 *
 * Similarly, the open_trace_file, close_trace_file, and trace_file_name
 * fields do not appear in H5C2_auto_size_ctl_t, as most trace file 
 * issues are handled at the H5AC2 level.  The one exception is storage of
 * the pointer to the trace file, which is handled by H5C2.
 *
 * The structure is in H5AC2public.h as we may wish to allow different
 * configuration options for metadata and raw data caches.
 *
 * The fields of the structure are discussed individually below:
 *
 * version: Integer field containing the version number of this version
 *      of the H5AC2_cache_config_t structure.  Any instance of
 *      H5AC2_cache_config_t passed to the cache must have a known
 *      version number, or an error will be flagged.
 *
 * rpt_fcn_enabled: Boolean field used to enable and disable the default
 *	reporting function.  This function is invoked every time the
 *	automatic cache resize code is run, and reports on its activities.
 *
 *	This is a debugging function, and should normally be turned off.
 *
 * open_trace_file: Boolean field indicating whether the trace_file_name
 * 	field should be used to open a trace file for the cache.
 *
 * 	The trace file is a debugging feature that allows the capture of
 * 	top level metadata cache requests for purposes of debugging and/or
 * 	optimization.  This field should normally be set to FALSE, as 
 * 	trace file collection imposes considerable overhead.
 *
 * 	This field should only be set to TRUE when the trace_file_name
 * 	contains the full path of the desired trace file, and either
 * 	there is no open trace file on the cache, or the close_trace_file
 * 	field is also TRUE.
 *
 * close_trace_file: Boolean field indicating whether the current trace
 * 	file (if any) should be closed.
 *
 * 	See the above comments on the open_trace_file field.  This field
 * 	should be set to FALSE unless there is an open trace file on the
 * 	cache that you wish to close.
 *
 * trace_file_name: Full path of the trace file to be opened if the
 * 	open_trace_file field is TRUE.
 *
 * 	In the parallel case, an ascii representation of the mpi rank of 
 * 	the process will be appended to the file name to yield a unique
 * 	trace file name for each process.
 *
 * 	The length of the path must not exceed H5AC2__MAX_TRACE_FILE_NAME_LEN
 * 	characters.
 *
 * evictions_enabled:  Boolean field used to either report the current
 * 	evictions enabled status of the cache, or to set the cache's 
 *	evictions enabled status.
 *
 * 	In general, the metadata cache should always be allowed to 
 * 	evict entries.  However, in some cases it is advantageous to 
 * 	disable evictions briefly, and thereby postpone metadata 
 * 	writes.  However, this must be done with care, as the cache
 * 	can grow quickly.  If you do this, re-enable evictions as
 * 	soon as possible and monitor cache size.
 *
 * 	At present, evictions can only be disabled if automatic
 * 	cache resizing is also disabled (that is, ( incr_mode ==
 *	H5C2_incr__off ) && ( decr_mode == H5C2_decr__off )).  There
 *	is no logical reason why this should be so, but it simplifies
 *	implementation and testing, and I can't think of any reason
 *	why it would be desirable.  If you can think of one, I'll
 *	revisit the issue.
 *
 * set_initial_size: Boolean flag indicating whether the initial size of
 *      the cache is to be set to the value given in
 *      the initial_size field.  If set_initial_size is FALSE, the
 *      initial_size field is ignored.
 *
 * initial_size: If enabled, this field contains the size the cache is
 *      to be set to upon receipt of this structure.  Needless to say,
 *      initial_size must lie in the closed interval [min_size, max_size].
 *
 * min_clean_fraction: double in the range 0 to 1 indicating the fraction
 *      of the cache that is to be kept clean.  This field is only used
 *      in parallel mode.  Typical values are 0.1 to 0.5.
 *
 * max_size: Maximum size to which the cache can be adjusted.  The
 *      supplied value must fall in the closed interval
 *      [H5C2__MIN_MAX_CACHE_SIZE, H5C2__MAX_MAX_CACHE_SIZE].  Also, max_size
 *      must be greater than or equal to min_size.
 *
 * min_size: Minimum size to which the cache can be adjusted.  The
 *      supplied value must fall in the closed interval
 *      [H5C2__MIN_MAX_CACHE_SIZE, H5C2__MAX_MAX_CACHE_SIZE].  Also, min_size
 *      must be less than or equal to max_size.
 *
 * epoch_length: Number of accesses on the cache over which to collect
 *      hit rate stats before running the automatic cache resize code,
 *      if it is enabled.
 *
 *      At the end of an epoch, we discard prior hit rate data and start
 *      collecting afresh.  The epoch_length must lie in the closed
 *      interval [H5C2__MIN_AR_EPOCH_LENGTH, H5C2__MAX_AR_EPOCH_LENGTH].
 *
 *
 * Cache size increase control fields:
 *
 * incr_mode: Instance of the H5C2_cache_incr_mode enumerated type whose
 *      value indicates how we determine whether the cache size should be
 *      increased.  At present there are two possible values:
 *
 *      H5C2_incr__off:  Don't attempt to increase the size of the cache
 *              automatically.
 *
 *              When this increment mode is selected, the remaining fields
 *              in the cache size increase section are ignored.
 *
 *      H5C2_incr__threshold: Attempt to increase the size of the cache
 *              whenever the average hit rate over the last epoch drops
 *              below the value supplied in the lower_hr_threshold
 *              field.
 *
 *              Note that this attempt will fail if the cache is already
 *              at its maximum size, or if the cache is not already using
 *              all available space.
 *
 *      Note that you must set incr_mode to H5C2_incr__off if you
 *      disable metadata cache entry evictions.
 *
 * lower_hr_threshold: Lower hit rate threshold.  If the increment mode
 *      (incr_mode) is H5C2_incr__threshold and the hit rate drops below the
 *      value supplied in this field in an epoch, increment the cache size by
 *      size_increment.  Note that cache size may not be incremented above
 *      max_size, and that the increment may be further restricted by the
 *      max_increment field if it is enabled.
 *
 *      When enabled, this field must contain a value in the range [0.0, 1.0].
 *      Depending on the incr_mode selected, it may also have to be less than
 *      upper_hr_threshold.
 *
 * increment:  Double containing the multiplier used to derive the new
 *      cache size from the old if a cache size increment is triggered.
 *      The increment must be greater than 1.0, and should not exceed 2.0.
 *
 *      The new cache size is obtained by multiplying the current max cache
 *      size by the increment, and then clamping to max_size and to stay
 *      within the max_increment as necessary.
 *
 * apply_max_increment:  Boolean flag indicating whether the max_increment
 *      field should be used to limit the maximum cache size increment.
 *
 * max_increment: If enabled by the apply_max_increment field described
 *      above, this field contains the maximum number of bytes by which the
 *      cache size can be increased in a single re-size.
 *
 * flash_incr_mode:  Instance of the H5C2_cache_flash_incr_mode enumerated
 *      type whose value indicates whether and by which algorithm we should
 *      make flash increases in the size of the cache to accommodate insertion
 *      of large entries and large increases in the size of a single entry.
 *
 *      The addition of the flash increment mode was occasioned by performance
 *      problems that appear when a local heap is increased to a size in excess
 *      of the current cache size.  While the existing re-size code dealt with
 *      this eventually, performance was very bad for the remainder of the
 *      epoch.
 *
 *      At present, there are two possible values for the flash_incr_mode:
 *
 *      H5C_flash_incr__off:  Don't perform flash increases in the size of
 *              the cache.
 *
 *      H5C_flash_incr__add_space:  Let x be either the size of a newly
 *              inserted entry, or the number of bytes by which the
 *              size of an existing entry has been increased.
 *
 *              If
 *                      x > flash_threshold * current max cache size,
 *
 *              increase the current maximum cache size by x * flash_multiple
 *              less any free space in the cache, and start a new epoch.  For
 *              now at least, pay no attention to the maximum increment.
 *
 *      In both of the above cases, the flash increment pays no attention to
 *      the maximum increment (at least in this first incarnation), but DOES
 *      stay within max_size.
 *
 *      With a little thought, it should be obvious that the above flash
 *      cache size increase algorithm is not sufficient for all circumstances
 *      -- for example, suppose the user round robins through
 *      (1/flash_threshold) +1 groups, adding one data set to each on each
 *      pass.  Then all will increase in size at about the same time, requiring
 *      the max cache size to at least double to maintain acceptable
 *      performance, however the above flash increment algorithm will not be
 *      triggered.
 *
 *      Hopefully, the add space algorithms detailed above will be sufficient
 *      for the performance problems encountered to date.  However, we should
 *      expect to revisit the issue.
 *
 * flash_multiple: Double containing the multiple described above in the
 *      H5C_flash_incr__add_space section of the discussion of the
 *      flash_incr_mode section.  This field is ignored unless flash_incr_mode
 *      is H5C_flash_incr__add_space.
 *
 * flash_threshold: Double containing the factor by which current max cache
 *      size is multiplied to obtain the size threshold for the add_space flash
 *      increment algorithm.  The field is ignored unless flash_incr_mode is
 *      H5C_flash_incr__add_space.
 *
 *
 * Cache size decrease control fields:
 *
 * decr_mode: Instance of the H5C2_cache_decr_mode enumerated type whose
 *      value indicates how we determine whether the cache size should be
 *      decreased.  At present there are four possibilities.
 *
 *      H5C2_decr__off:  Don't attempt to decrease the size of the cache
 *              automatically.
 *
 *              When this decrement mode is selected, the remaining fields
 *              in the cache size decrease section are ignored.
 *
 *      H5C2_decr__threshold: Attempt to decrease the size of the cache
 *              whenever the average hit rate over the last epoch rises
 *              above the value supplied in the upper_hr_threshold
 *              field.
 *
 *      H5C2_decr__age_out:  At the end of each epoch, search the cache for
 *              entries that have not been accessed for at least the number
 *              of epochs specified in the epochs_before_eviction field, and
 *              evict these entries.  Conceptually, the maximum cache size
 *              is then decreased to match the new actual cache size.  However,
 *              this reduction may be modified by the min_size, the
 *              max_decrement, and/or the empty_reserve.
 *
 *      H5C2_decr__age_out_with_threshold:  Same as age_out, but we only
 *              attempt to reduce the cache size when the hit rate observed
 *              over the last epoch exceeds the value provided in the
 *              upper_hr_threshold field.
 *
 *      Note that you must set decr_mode to H5C2_decr__off if you 
 *      disable metadata cache entry evictions.
 *
 * upper_hr_threshold: Upper hit rate threshold.  The use of this field
 *      varies according to the current decr_mode:
 *
 *      H5C2_decr__off or H5C2_decr__age_out:  The value of this field is
 *              ignored.
 *
 *      H5C2_decr__threshold:  If the hit rate exceeds this threshold in any
 *              epoch, attempt to decrement the cache size by size_decrement.
 *
 *              Note that cache size may not be decremented below min_size.
 *
 *              Note also that if the upper_hr_threshold is 1.0, the cache size
 *              will never be reduced.
 *
 *      H5C2_decr__age_out_with_threshold:  If the hit rate exceeds this
 *              threshold in any epoch, attempt to reduce the cache size
 *              by evicting entries that have not been accessed for more
 *              than the specified number of epochs.
 *
 * decrement: This field is only used when the decr_mode is
 *      H5C2_decr__threshold.
 *
 *      The field is a double containing the multiplier used to derive the
 *      new cache size from the old if a cache size decrement is triggered.
 *      The decrement must be in the range 0.0 (in which case the cache will
 *      try to contract to its minimum size) to 1.0 (in which case the
 *      cache will never shrink).
 *
 * apply_max_decrement:  Boolean flag used to determine whether decrements
 *      in cache size are to be limited by the max_decrement field.
 *
 * max_decrement: Maximum number of bytes by which the cache size can be
 *      decreased in a single re-size.  Note that decrements may also be
 *      restricted by the min_size of the cache, and (in age out modes) by
 *      the empty_reserve field.
 *
 * epochs_before_eviction:  Integer field used in H5C2_decr__age_out and
 *      H5C2_decr__age_out_with_threshold decrement modes.
 *
 *      This field contains the number of epochs an entry must remain
 *      unaccessed before it is evicted in an attempt to reduce the
 *      cache size.  If applicable, this field must lie in the range
 *      [1, H5C2__MAX_EPOCH_MARKERS].
 *
 * apply_empty_reserve:  Boolean field controlling whether the empty_reserve
 *      field is to be used in computing the new cache size when the
 *      decr_mode is H5C2_decr__age_out or H5C2_decr__age_out_with_threshold.
 *
 * empty_reserve:  To avoid a constant ratcheting down of cache size by small
 *      amounts in the H5C2_decr__age_out and H5C2_decr__age_out_with_threshold
 *      modes, this field allows one to require that any cache size
 *      reductions leave the specified fraction of unused space in the cache.
 *
 *      The value of this field must be in the range [0.0, 1.0].  I would
 *      expect typical values to be in the range of 0.01 to 0.1.
 *
 *
 * Parallel Configuration Fields:
 *
 * In PHDF5, all operations that modify metadata must be executed collectively.
 * We used to think that this was enough to ensure consistency across the
 * metadata caches, but since we allow processes to read metadata individually,
 * the order of dirty entries in the LRU list can vary across processes,
 * which can result in inconsistencies between the caches.
 *
 * To prevent this, only the metadata cache on process 0 is allowed to write
 * to file, and then only after synchronizing with the other caches.  After
 * it writes entries to file, it sends the base addresses of the now clean
 * entries to the other caches, so they can mark these entries clean as well.
 *
 * The different caches know when to synchronize caches by counting the
 * number of bytes of dirty metadata created by the collective operations
 * modifying metadata.  Whenever this count exceeds a user specified
 * threshold (see below), process 0 flushes down to its minimum clean size,
 * and then sends the list of newly cleaned entries to the other caches.
 *
 * dirty_bytes_threshold:  Threshold of dirty byte creation used to
 * 	synchronize updates between caches. (See above for outline and
 *	motivation.)
 *
 *	This value MUST be consistent across all processes accessing the
 *	file.  This field is ignored unless HDF5 has been compiled for
 *	parallel.
 *
 ****************************************************************************/

/* Current version number of the H5AC2_cache_config_t structure. */
#define H5AC2__CURR_CACHE_CONFIG_VERSION    1

/* Maximum length, in characters, of the trace file path.  The
 * trace_file_name array is declared one byte larger than this
 * (presumably for the terminating NUL -- confirm against the code that
 * copies the name).
 */
#define H5AC2__MAX_TRACE_FILE_NAME_LEN      1024

/*
 * Public metadata cache configuration.  Each field is described in detail
 * in the "structure H5AC2_cache_config_t" comment that precedes this
 * definition; the notes here are brief reminders only.
 *
 * The order and types of the fields are part of the public interface and
 * must not be changed.
 */
typedef struct H5AC2_cache_config_t {

    /* ---- general configuration fields ---- */

    /* Version number of this structure; must be a known version. */
    int version;

    /* Enable the default automatic-resize reporting function (debugging). */
    hbool_t rpt_fcn_enabled;

    /* Trace file control: open the file named in trace_file_name, and/or
     * close the currently open trace file (if any).
     */
    hbool_t open_trace_file;
    hbool_t close_trace_file;
    char    trace_file_name[H5AC2__MAX_TRACE_FILE_NAME_LEN+1];

    /* Report or set the cache's evictions-enabled status. */
    hbool_t evictions_enabled;

    /* If set_initial_size is FALSE, initial_size is ignored; otherwise it
     * must lie in [min_size, max_size].
     */
    hbool_t set_initial_size;
    size_t  initial_size;

    /* Fraction (0 to 1) of the cache to keep clean; parallel mode only. */
    double min_clean_fraction;

    /* Bounds between which the cache size may be adjusted. */
    size_t max_size;
    size_t min_size;

    /* Number of cache accesses over which hit rate stats are collected
     * before the automatic resize code is run.
     */
    long int epoch_length;


    /* ---- size increase control fields ---- */

    /* How we decide whether the cache size should be increased. */
    enum H5C2_cache_incr_mode incr_mode;

    /* Hit rate below which a size increase is attempted. */
    double lower_hr_threshold;

    /* Multiplier applied to the current max cache size on an increase. */
    double increment;

    /* Optional cap, in bytes, on a single size increase. */
    hbool_t apply_max_increment;
    size_t  max_increment;

    /* Flash size increase: algorithm selector, the multiple applied to the
     * triggering size, and the threshold factor applied to the current max
     * cache size.
     */
    enum H5C2_cache_flash_incr_mode flash_incr_mode;
    double                          flash_multiple;
    double                          flash_threshold;


    /* ---- size decrease control fields ---- */

    /* How we decide whether the cache size should be decreased. */
    enum H5C2_cache_decr_mode decr_mode;

    /* Hit rate above which a size decrease is attempted (use depends on
     * decr_mode).
     */
    double upper_hr_threshold;

    /* Multiplier applied to the cache size on a decrease; used only when
     * decr_mode is H5C2_decr__threshold.
     */
    double decrement;

    /* Optional cap, in bytes, on a single size decrease. */
    hbool_t apply_max_decrement;
    size_t  max_decrement;

    /* Age-out modes: number of epochs an entry must remain unaccessed
     * before it is evicted.
     */
    int epochs_before_eviction;

    /* Age-out modes: optionally require that size reductions leave the
     * given fraction of the cache unused.
     */
    hbool_t apply_empty_reserve;
    double  empty_reserve;


    /* ---- parallel configuration fields ---- */

    /* Threshold of dirty byte creation used to synchronize updates between
     * caches; must be the same on all processes accessing the file.
     */
    int dirty_bytes_threshold;

} H5AC2_cache_config_t;


/****************************************************************************
 *
 * structure H5AC2_jnl_config_t
 *
 * H5AC2_jnl_config_t is a public structure intended for use in public APIs.
 * At least in its initial incarnation, it is intended to package all the 
 * data needed to configure metadata journaling.  In the future, we may 
 * use it to package configuration data for other types of journaling as well.
 *
 * The fields of the structure are discussed individually below.  Note 
 * that the fields with the "jbrb_" prefix are used to configure the
 * journal buffer ring buffer -- a ring buffer of buffers used to buffer
 * output of journal messages.
 *
 * version: Integer field containing the version number of this version
 *      of the H5AC2_jnl_config_t structure.  Any instance of
 *      H5AC2_jnl_config_t passed to the cache must have a known
 *      version number, or an error will be flagged.
 *
 * enable_journaling:  Boolean flag that is set to TRUE if journaling is 
 * 	to be enabled, and to FALSE otherwise.  
 *
 * 	When the cache configuration is reported, this field is TRUE iff
 * 	journaling is enabled.
 *
 * journal_file_path:  Full path of the file to be used to store the 
 * 	metadata journal.  This field is only defined if enable_journaling
 * 	is TRUE.
 *
 * 	At present, the length of the journal file path is restricted to 
 * 	no more than H5AC2__MAX_JOURNAL_FILE_NAME_LEN.
 *
 * journal_recovered:  Boolean flag used to indicate that we are opening
 * 	a journaled file that was not closed correctly, and on which the
 * 	journal recovery tool has been run.
 *
 * 	Unless you are the writer of a new journal recovery tool, you
 * 	should always set this field to FALSE.
 *
 * jbrb_buf_size: size_t containing the size of each individual buffer 
 * 	in the journal buffer ring buffer.  This size should be chosen
 * 	to be some multiple of the block size used by the file system 
 * 	on which the journal file will be written.
 *
 * jbrb_num_bufs: Integer containing the number of buffers in the journal
 * 	buffer ring buffer.  If synchronous I/O is used, one or two buffers
 * 	is sufficient.  If asynchronous I/O is used, the number of buffers
 * 	should be sufficiently large that a write on a buffer is likely to
 * 	complete before that buffer is needed again.
 *
 * jbrb_use_aio:  Boolean flag indicating whether we should use 
 * 	asynchronous I/O for journal entry writes.
 *
 * jbrb_human_readable: Boolean flag which determines whether the journal
 * 	file will be written in human readable form.  In general, this 
 * 	field should be set to false, as the human readable journal
 * 	file is at least twice as large as the machine readable version.
 *
 ****************************************************************************/

/* Current version number of the H5AC2_jnl_config_t structure. */
#define H5AC2__CURR_JNL_CONFIG_VER          1

/* Maximum length of the journal file path.  The journal_file_path array
 * is declared one byte larger than this (presumably for the terminating
 * NUL -- confirm against the code that copies the path).
 */
#define H5AC2__MAX_JOURNAL_FILE_NAME_LEN    1024

/*
 * Public metadata journaling configuration.  Each field is described in
 * detail in the "structure H5AC2_jnl_config_t" comment that precedes this
 * definition.  Fields prefixed with "jbrb_" configure the journal buffer
 * ring buffer.
 *
 * The order and types of the fields are part of the public interface and
 * must not be changed.
 */
typedef struct H5AC2_jnl_config_t {

    /* Version number of this structure; must be a known version. */
    int version;

    /* ---- metadata journaling configuration fields ---- */

    /* TRUE if journaling is to be enabled (or, when the configuration is
     * reported, if it is enabled).
     */
    hbool_t enable_journaling;

    /* Full path of the journal file; only defined when enable_journaling
     * is TRUE.
     */
    char journal_file_path[H5AC2__MAX_JOURNAL_FILE_NAME_LEN + 1];

    /* Set only by journal recovery tools; should otherwise be FALSE. */
    hbool_t journal_recovered;

    /* Size of each individual buffer in the journal buffer ring buffer. */
    size_t jbrb_buf_size;

    /* Number of buffers in the journal buffer ring buffer. */
    int jbrb_num_bufs;

    /* Whether asynchronous I/O is used for journal entry writes. */
    hbool_t jbrb_use_aio;

    /* Whether the journal file is written in human readable form. */
    hbool_t jbrb_human_readable;

} H5AC2_jnl_config_t;


#ifdef __cplusplus
}
#endif
#endif