/* Source: src/H5PBprivate.h (blob ad59150f6f48238fef0ca731d1357bde02160e7e) */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * Copyright by the Board of Trustees of the University of Illinois.         *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the COPYING file, which can be found at the root of the source code       *
 * distribution tree, or in https://www.hdfgroup.org/licenses.               *
 * If you do not have access to either file, you may request a copy from     *
 * help@hdfgroup.org.                                                        *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 * File:        H5PBprivate.h
 *
 * Purpose:     This file contains declarations which are normally visible
 *              within the HDF5 library, but are not visible at the user
 *              level
 *
 * Programmer: John Mainzer -- 10/07/18
 */

#ifndef H5PBprivate_H
#define H5PBprivate_H

/* Private headers needed by this header */
#include "H5private.h" /* Generic Functions			*/

/**************************/
/* Library Private Macros */
/**************************/

/* Number of buckets in the page buffer hash table (the ht array of H5PB_t).
 * The length must be a power of 2 because the hash function simply clips
 * the high order bits off the page offset of the entry.
 */
#define H5PB__HASH_TABLE_LEN 4096 /* must be a power of 2 */

/****************************/
/* Library Private Typedefs */
/****************************/

/* Typedef for the page buffer entry structure (defined in H5PBpkg.h).
 * Only the incomplete type is declared here; H5PB_t refers to entries
 * exclusively through pointers, so the full definition is not required.
 */
typedef struct H5PB_entry_t H5PB_entry_t;

/******************************************************************************
 *
 * structure H5PB_t
 *
 * Catchall structure for all variables specific to an instance of the page
 * buffer.
 *
 * At present, the page buffer serves two purposes in the HDF5 library.
 *
 * Under normal operating conditions, it serves as a normal page buffer whose
 * purpose is to minimize and optimize file I/O by aggregating small metadata
 * and raw data writes into pages, and by caching frequently used pages.
 *
 * In addition, when a file is opened for VFD SWMR writing, the page buffer is
 * used to retain copies of all metadata pages and multi-page metadata entries
 * that are written in a given tick, and under certain cases, to delay metadata
 * page and/or multi-page metadata entry writes for some number of ticks.
 * If the entry has not appeared in the VFD SWMR index for at least max_lag
 * ticks, this is necessary to avoid message from the future bugs.  See the
 * VFD SWMR RFC for further details.
 *
 * To reflect this, the fields of this structure are divided into three
 * sections.  Specifically fields needed for general operations, fields needed
 * for VFD SWMR, and statistics.
 *
 * FIELDS FOR GENERAL OPERATIONS:
 *
 * magic:       Unsigned 32 bit integer that must always be set to
 *              H5PB__H5PB_T_MAGIC.  This field is used to validate pointers to
 *              instances of H5PB_t.
 *
 * page_size:   size_t containing the page buffer page size in bytes.
 *
 * max_pages:   64 bit integer containing the nominal maximum number
 *              of pages in the page buffer.  Note that on creation, the page
 *              buffer is empty, and that under certain circumstances (mostly
 *              related to VFD SWMR) this limit can be exceeded by large
 *              amounts.
 *
 * curr_pages:  64 bit integer containing the current number of pages
 *              in the page buffer.  curr_pages must always equal the sum of
 *              curr_md_pages + curr_rd_pages.
 *
 *              Note that in the context of VFD SWMR, this count does NOT
 *              include multi-page metadata entries.
 *
 * curr_md_pages: 64 bit integer containing the current number of
 *              metadata pages in the page buffer.
 *
 *              Note that in the context of VFD SWMR, this count does NOT
 *              include multi-page metadata entries.
 *
 * curr_rd_pages: 64 bit integer containing the current number of
 *              raw data pages in the page buffer.
 *
 * min_md_pages: 64 bit integer containing the number of pages in the
 *              page buffer reserved for metadata.  No metadata page may be
 *              evicted from the page buffer if curr_md_pages is less than or
 *              equal to this value.
 *
 * min_rd_pages: 64 bit integer containing the number of pages in the
 *              page buffer reserved for raw data.  No page of raw data may be
 *              evicted from the page buffer if curr_rd_pages is less than or
 *              equal to this value.
 *
 * The FAPL fields are used to store the page buffer configuration data
 * provided to the page buffer in the H5PB_create() call.
 *
 * max_size:    Maximum page buffer size supplied by the FAPL.
 *
 * min_meta_perc: Percent of the page buffer reserved for metadata as
 *              supplied in the FAPL.
 *
 * min_raw_perc: Percent of the page buffer reserved for raw data as
 *              supplied in the FAPL.
 *
 * The purpose of the index is to allow us to efficiently look up all pages
 * (and multi-page metadata entries in the context of VFD SWMR) in the
 * page buffer.
 *
 * This function is provided by a hash table with chaining, albeit with one
 * unusual feature.
 *
 * Specifically hash table size must be a power of two, and the hash function
 * simply clips the high order bits off the page offset of the entry.
 *
 * This should work, as space is typically allocated sequentially, and thus
 * via a reverse principle of locality argument, hot pages are unlikely to
 * hash to the same bucket.  That said, we must collect statistics to alert
 * us should this not be the case.
 *
 * We also maintain a linked list of all entries in the index to facilitate
 * flush operations.
 *
 * ht:          Array of pointers to H5PB_entry_t of size
 *              H5PB__HASH_TABLE_LEN.  This size must be a power of 2,
 *              not the usual prime number.
 *
 * index_len:   Number of entries currently in the hash table used to index
 *              the page buffer.  index_len should always equal
 *              clean_index_len + dirty_index_len.
 *
 * clean_index_len: Number of clean entries currently in the hash table
 *              used to index the page buffer.
 *
 * dirty_index_len: Number of dirty entries currently in the hash table
 *              used to index the page buffer.
 *
 * index_size:  Number of bytes currently stored in the hash table used to
 *              index the page buffer.  Under normal circumstances, this
 *              value will be index_len * page size.  However, if
 *              vfd_swmr_writer is TRUE, it may be larger.
 *
 *              index_size should always equal clean_index_size +
 *              dirty_index_size.
 *
 * clean_index_size: Number of bytes of clean entries currently stored in
 *              the hash table used to index the page buffer.
 *
 * dirty_index_size: Number of bytes of dirty entries currently stored in
 *              the hash table used to index the page buffer.
 *
 * il_len:      Number of entries on the index list.
 *
 *              This must always be equal to index_len.  As such, this
 *              field is redundant.  However, the existing linked list
 *              management macros expect to maintain a length field, so
 *              this field exists primarily to avoid adding complexity to
 *              these macros.
 *
 * il_size:     Number of bytes of cache entries currently stored in the
 *              index list.
 *
 *              This must always be equal to index_size.  As such, this
 *              field is redundant.  However, the existing linked list
 *              management macros expect to maintain a size field, so
 *              this field exists primarily to avoid adding complexity to
 *              these macros.
 *
 * il_head:     Pointer to the head of the doubly linked list of entries in
 *              the index list.  Note that cache entries on this list are
 *              linked by their il_next and il_prev fields.
 *
 *              This field is NULL if the index is empty.
 *
 * il_tail:     Pointer to the tail of the doubly linked list of entries in
 *              the index list.  Note that cache entries on this list are
 *              linked by their il_next and il_prev fields.
 *
 *              This field is NULL if the index is empty.
 *
 *
 * Fields supporting the modified LRU policy:
 *
 * See most any OS text for a discussion of the LRU replacement policy.
 *
 * Under normal operating circumstances (i.e. vfd_swmr_writer is FALSE)
 * all entries will reside both in the index and in the LRU.  Further,
 * all entries will be of size page_size.
 *
 * The VFD SWMR writer case (i.e. vfd_swmr_writer is TRUE) is complicated
 * by the requirements that we:
 *
 * 1) buffer all metadata writes (including multi-page metadata writes) that
 *    occur during a tick, and
 *
 * 2) when necessary, delay metadata writes for up to max_lag ticks to
 *    avoid message from the future bugs on the VFD SWMR readers.
 *
 * See discussion of fields supporting VFD SWMR below for details.
 *
 * Discussions of the individual fields used by the modified LRU replacement
 * policy follow:
 *
 * LRU_len:     Number of page buffer entries currently on the LRU.
 *
 *              Observe that LRU_len + dwl_len must always equal
 *              index_len.
 *
 * LRU_size:    Number of bytes of page buffer entries currently residing
 *              on the LRU list.
 *
 *              Observe that LRU_size + dwl_size must always equal
 *              index_size.
 *
 * LRU_head_ptr:  Pointer to the head of the doubly linked LRU list.  Page
 *              buffer entries on this list are linked by their next and
 *              prev fields.
 *
 *              This field is NULL if the list is empty.
 *
 * LRU_tail_ptr:  Pointer to the tail of the doubly linked LRU list.  Page
 *              buffer entries on this list are linked by their next and
 *              prev fields.
 *
 *              This field is NULL if the list is empty.
 *
 *
 * FIELDS SUPPORTING VFD SWMR:
 *
 * If the file is opened in VFD SWMR mode (i.e. vfd_swmr == TRUE), all
 * raw data I/O must be passed through to the HDF5 file.
 *
 * If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE),
 * the page buffer must retain the data necessary to update the metadata
 * file at the end of each tick, and also delay writes as necessary so as
 * to avoid message from the future bugs on the VFD SWMR readers.
 *
 * The tick list exists to allow us to buffer copies of all metadata writes
 * during a tick, and the delayed write list supports delayed writes.
 *
 * If a regular page is written to during a tick, it is placed on the tick
 * list.  If there is no reason to delay its write to file (i.e. either
 * it was just allocated, or it has existed in the metadata file index for
 * at least max_lag ticks), it is also placed on the LRU, where it may be
 * flushed, but not evicted.  If its write must be delayed, it is placed on
 * the delayed write list, where it must remain until its write delay is
 * satisfied -- at which point it is moved to the LRU.
 *
 * If a multi-page metadata entry is written during a tick, it is placed on
 * the tick list.  If, in addition, the write of the entry must be delayed,
 * it is also placed on the delayed write list.  Note that multi-page metadata
 * entries may never appear on the LRU.
 *
 * At the end of each tick, the tick list is emptied.
 *
 * Regular pages are simply removed from the tick list, as they must already
 * appear on either the LRU or the delayed write list.
 *
 * Multi-page metadata entries that are not also on the delayed write list
 * are simply flushed and evicted.
 *
 * The delayed write list is also scanned at the end of each tick.  Regular
 * entries that are now flushable are placed at the head of the LRU.  Multi-
 * page metadata entries that are flushable are flushed and evicted.
 *
 * The remainder of this section contains discussions of the fields and
 * data structures used to support the above operations.
 *
 * vfd_swmr:    Boolean flag that is set to TRUE IFF the file is opened
 *              in VFD SWMR mode -- either reader or writer.  This field
 *              is used to exclude raw data from the page buffer.
 *
 * vfd_swmr_writer: Boolean flag that is set to TRUE iff the file is
 *              opened in VFD SWMR writer mode.  The remaining
 *              VFD SWMR fields are defined iff vfd_swmr_writer is TRUE.
 *
 * mpmde_count: int64_t containing the number of multi-page metadata
 *              entries currently resident in the page buffer.  Observe
 *              that index_len should always equal curr_pages + mpmde_count.
 *
 * cur_tick:    uint64_t containing the current tick.  This is a copy of
 *              the same field in the associated instance of H5F_file_t,
 *              and is maintained as a convenience.
 *
 * In the context of VFD SWMR the delayed write list allows us to delay
 * metadata writes to the HDF5 file until it appears in all indexes in the
 * last max_lag ticks.  This is essential if a version of the page or
 * multi-page metadata entry already exists in the HDF5 file -- failure to
 * delay the write can result in a message from the future which will
 * likely be perceived as file corruption by the reader.
 *
 * To facilitate identification of entries that must be removed from the
 * DWL during the end of tick scan, the list always observes the following
 * invariant for any entry on the list:
 *
 *    entry_ptr->next == NULL ||
 *    entry_ptr->delay_write_until >= entry_ptr->next->delay_write_until
 *
 * Discussion of the fields used to implement the delayed write list follows:
 *
 * max_delay:   Maximum of the delay_write_until fields of the entries on
 *              the delayed write list.  This must never be more than max_lag
 *              ticks in advance of the current tick, and should be set to
 *              zero if the delayed write list is empty.
 *
 * dwl_len:     Number of page buffer entries currently on the delayed
 *              write list.
 *
 *              Observe that LRU_len + dwl_len must always equal
 *              index_len.
 *
 * dwl_size:    Number of bytes of page buffer entries currently residing
 *              on the delayed write list.
 *
 *              Observe that LRU_size + dwl_size must always equal
 *              index_size.
 *
 * dwl_head_ptr:  Pointer to the head of the doubly linked delayed write list.
 *              Page buffer entries on this list are linked by their next and
 *              prev fields.
 *
 *              This field is NULL if the list is empty.
 *
 * dwl_tail_ptr:  Pointer to the tail of the doubly linked delayed write list.
 *              Page buffer entries on this list are linked by their next and
 *              prev fields.
 *
 *              This field is NULL if the list is empty.
 *
 * For VFD SWMR to function, copies of all pages modified during a tick must
 * be retained in the page buffer to allow correct updates to the index and
 * metadata file at the end of tick.
 *
 * To implement this, all entries modified during the current tick are placed
 * on the tick list.  Entries are removed from the tick list during end of
 * tick processing, so each tick starts with an empty tick list.
 *
 * Unless the entry also resides on the delayed write list, entries on the
 * tick list may be flushed, but they may not be evicted.
 *
 * Discussion of the fields used to implement the tick list follows:
 *
 * tl_len:      Number of page buffer entries currently on the tick list
 *
 * tl_size:     Number of bytes of page buffer entries currently residing
 *              on the tick list.
 *
 * tl_head_ptr:  Pointer to the head of the doubly linked tick list.
 *              Page buffer entries on this list are linked by their tl_next
 *              and tl_prev fields.
 *
 *              This field is NULL if the list is empty.
 *
 * tl_tail_ptr:  Pointer to the tail of the doubly linked tick list.
 *              Page buffer entries on this list are linked by their tl_next
 *              and tl_prev fields.
 *
 *              This field is NULL if the list is empty.
 *
 *
 * STATISTICS:
 *
 * Multi-page metadata entries (which may only appear in VFD
 * SWMR mode) are NOT counted in the following statistics.
 *
 * Note that all statistics fields contain only data since the last time
 * that statistics were reset.
 *
 * bypasses:    Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of times that the page buffer has been
 *              bypassed for metadata, raw data, and for multi-page
 *              metadata entries (VFD SWMR only) as indexed by H5PB__STATS_MD,
 *              H5PB__STATS_RD, and H5PB__STATS_MPMDE respectively.
 *
 * accesses:    Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer accesses for metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * hits:        Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer hits for metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * misses:      Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer misses for metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * loads:       Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer loads for metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * insertions:  Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer insertions of metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * flushes:     Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer flushes of metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * evictions:   Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer evictions of metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * clears:      Array of int64_t of length H5PB__NUM_STAT_TYPES containing
 *              the number of page buffer entry clears of metadata, raw data,
 *              and for multi-page metadata entries (VFD SWMR only) as
 *              indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
 *              H5PB__STATS_MPMDE respectively.
 *
 * max_lru_len: int64_t containing the maximum number of entries that
 *              have appeared in the LRU.
 *
 * max_lru_size: int64_t containing the maximum size of the LRU.
 *
 * lru_md_skips: When searching for an entry to evict, metadata entries on
 *              the LRU must be skipped if the number of metadata pages
 *              in the page buffer fails to exceed min_md_pages.
 *
 *              This int64_t is used to keep a count of these skips.
 *
 *              If this number becomes excessive, it will be necessary to
 *              add a holding tank for such entries.
 *
 * lru_rd_skips: When searching for an entry to evict, raw data entries on
 *              the LRU must be skipped if the number of raw data pages
 *              in the page buffer fails to exceed min_rd_pages.
 *
 *              This int64_t is used to keep a count of these skips.
 *
 *              If this number becomes excessive, it will be necessary to
 *              add a holding tank for such entries.
 *
 * Multi-page metadata entries (which appear only in VFD SWMR mode) are
 * listed in the hash table, and thus they are counted in the following
 * statistics.
 *
 * total_ht_insertions: Number of times entries have been inserted into the
 *              hash table.
 *
 * total_ht_deletions: Number of times entries have been deleted from the
 *              hash table.
 *
 * successful_ht_searches: int64 containing the total number of successful
 *              searches of the hash table.
 *
 * total_successful_ht_search_depth: int64 containing the total number of
 *              entries other than the targets examined in successful
 *              searches of the hash table.
 *
 * failed_ht_searches: int64 containing the total number of unsuccessful
 *              searches of the hash table.
 *
 * total_failed_ht_search_depth: int64 containing the total number of
 *              entries examined in unsuccessful searches of the hash
 *              table.
 *
 * max_index_len:  Largest value attained by the index_len field.
 *
 * max_clean_index_len:  Largest value attained by the clean_index_len field.
 *
 * max_dirty_index_len:  Largest value attained by the dirty_index_len field.
 *
 * max_index_size:  Largest value attained by the index_size field.
 *
 * max_clean_index_size:  Largest value attained by the clean_index_size field.
 *
 * max_dirty_index_size:  Largest value attained by the dirty_index_size field.
 *
 * max_rd_pages: Maximum number of raw data pages in the page buffer.
 *
 * max_md_pages: Maximum number of metadata pages in the page buffer.
 *
 *
 * Statistics pertaining to VFD SWMR.
 *
 * max_mpmde_count: Maximum number of multi-page metadata entries in the
 *              page buffer.
 *
 * lru_tl_skips: When searching for an entry to evict, metadata entries on
 *              the LRU must be skipped if they also reside on the tick list.
 *
 *              This int64_t is used to keep a count of these skips.
 *
 *              If this number becomes excessive, it will be necessary to
 *              add a holding tank for such entries.
 *
 * max_tl_len:  int64_t containing the maximum value of tl_len.
 *
 * max_tl_size: int64_t containing the maximum value of tl_size.
 *
 * delayed_writes: int64_t containing the total number of delayed writes.
 *
 * total_delay: int64_t containing the total number of ticks by which
 *              entry writes have been delayed.
 *
 * max_dwl_len: int64_t containing the maximum value of dwl_len.
 *
 * max_dwl_size: int64_t containing the maximum value of dwl_size.
 *
 * total_dwl_ins_depth: int64_t containing the total insertion depth
 *              required to maintain the ordering invariant on the
 *              delayed write list.
 *
 * md_read_splits:  int64_t containing the number of metadata reads that
 *              are split into two or three sub-reads to manage the
 *              case in which a group of metadata cache clients
 *              sub-allocate entries from a single file space allocation.
 *
 * md_write_splits:  int64_t containing the number of metadata writes that
 *              are split into two or three sub-writes to manage the
 *              case in which a group of metadata cache clients
 *              sub-allocate entries from a single file space allocation.
 *
 ******************************************************************************/

/* Value that the magic field of H5PB_t must always hold; it is used to
 * validate pointers to instances of H5PB_t.
 */
#define H5PB__H5PB_T_MAGIC 0x01020304

/* Indices into the per-type statistics arrays of H5PB_t (bypasses, accesses,
 * hits, misses, loads, insertions, flushes, evictions, clears).  Each of
 * those arrays has H5PB__NUM_STAT_TYPES elements.
 */
#define H5PB__STATS_MD       0 /* metadata */
#define H5PB__STATS_RD       1 /* raw data */
#define H5PB__STATS_MPMDE    2 /* multi-page metadata entries (VFD SWMR only) */
#define H5PB__NUM_STAT_TYPES 3 /* number of statistics types listed above */

/* One instance of the page buffer.  The fields fall into three groups:
 * general operations, VFD SWMR support, and statistics.  The short notes
 * on each field below summarize the "structure H5PB_t" header comment
 * that precedes this definition; refer to it for the full discussion and
 * for the invariants relating the fields.
 */
typedef struct H5PB_t {

    /* Fields for general operations: */

    uint32_t magic;         /* always H5PB__H5PB_T_MAGIC; used to validate H5PB_t pointers */
    size_t   page_size;     /* page buffer page size in bytes */
    int64_t  max_pages;     /* nominal maximum number of pages (can be exceeded) */
    int64_t  curr_pages;    /* current number of pages == curr_md_pages + curr_rd_pages */
    int64_t  curr_md_pages; /* current number of metadata pages */
    int64_t  curr_rd_pages; /* current number of raw data pages */
    int64_t  min_md_pages;  /* number of pages reserved for metadata */
    int64_t  min_rd_pages;  /* number of pages reserved for raw data */

    /* FAPL fields: configuration data provided in the H5PB_create() call */
    size_t   max_size;      /* maximum page buffer size */
    unsigned min_meta_perc; /* minimum metadata percentage */
    unsigned min_raw_perc;  /* minimum raw data percentage */

    /* index: a hash table with chaining, plus a doubly linked list (the
     * index list, il_*) of all entries in the index to facilitate flushes
     */
    H5PB_entry_t *(ht[H5PB__HASH_TABLE_LEN]); /* hash table buckets */
    int64_t       index_len;        /* entries in the index == clean_index_len + dirty_index_len */
    int64_t       clean_index_len;  /* clean entries in the index */
    int64_t       dirty_index_len;  /* dirty entries in the index */
    int64_t       index_size;       /* bytes in the index == clean_index_size + dirty_index_size */
    int64_t       clean_index_size; /* bytes of clean entries in the index */
    int64_t       dirty_index_size; /* bytes of dirty entries in the index */
    int64_t       il_len;           /* entries on the index list; must equal index_len */
    int64_t       il_size;          /* bytes on the index list; must equal index_size */
    H5PB_entry_t *il_head;          /* head of the index list; NULL if the index is empty */
    H5PB_entry_t *il_tail;          /* tail of the index list; NULL if the index is empty */

    /* LRU: list used by the modified LRU replacement policy */
    int64_t       LRU_len;      /* entries currently on the LRU */
    int64_t       LRU_size;     /* bytes of entries currently on the LRU */
    H5PB_entry_t *LRU_head_ptr; /* head of the LRU list; NULL if the list is empty */
    H5PB_entry_t *LRU_tail_ptr; /* tail of the LRU list; NULL if the list is empty */

    /* Fields for VFD SWMR operations: */

    hbool_t  vfd_swmr;        /* TRUE iff the file is opened in VFD SWMR mode (reader or writer) */
    hbool_t  vfd_swmr_writer; /* TRUE iff the file is opened in VFD SWMR writer mode */
    int64_t  mpmde_count;     /* multi-page metadata entries resident in the page buffer */
    uint64_t cur_tick;        /* copy of the current tick, kept as a convenience */

    /* delayed write list */
    uint64_t      max_delay;    /* max delay_write_until of entries on the list; zero if list is empty */
    int64_t       dwl_len;      /* entries currently on the delayed write list */
    int64_t       dwl_size;     /* bytes of entries currently on the delayed write list */
    H5PB_entry_t *dwl_head_ptr; /* head of the delayed write list; NULL if the list is empty */
    H5PB_entry_t *dwl_tail_ptr; /* tail of the delayed write list; NULL if the list is empty */

    /* tick list: entries modified during the current tick */
    int64_t       tl_len;      /* entries currently on the tick list */
    int64_t       tl_size;     /* bytes of entries currently on the tick list */
    H5PB_entry_t *tl_head_ptr; /* head of the tick list; NULL if the list is empty */
    H5PB_entry_t *tl_tail_ptr; /* tail of the tick list; NULL if the list is empty */

    /* Statistics: */

    /* general operations statistics: */
    /* these statistics count pages only, not multi-page metadata entries
     * (that occur only in the VFD SWMR writer case).
     *
     * The arrays of length H5PB__NUM_STAT_TYPES are indexed by
     * H5PB__STATS_MD, H5PB__STATS_RD, and H5PB__STATS_MPMDE.
     */
    int64_t  bypasses[H5PB__NUM_STAT_TYPES];   /* times the page buffer was bypassed */
    int64_t  accesses[H5PB__NUM_STAT_TYPES];   /* page buffer accesses */
    int64_t  hits[H5PB__NUM_STAT_TYPES];       /* page buffer hits */
    int64_t  misses[H5PB__NUM_STAT_TYPES];     /* page buffer misses */
    int64_t  loads[H5PB__NUM_STAT_TYPES];      /* page buffer loads */
    int64_t  insertions[H5PB__NUM_STAT_TYPES]; /* page buffer insertions */
    int64_t  flushes[H5PB__NUM_STAT_TYPES];    /* page buffer flushes */
    int64_t  evictions[H5PB__NUM_STAT_TYPES];  /* page buffer evictions */
    int64_t  clears[H5PB__NUM_STAT_TYPES];     /* page buffer entry clears */
    /* NOTE(review): access_size_count is not described in the structure
     * header comment; presumably it counts accesses bucketed by size into
     * six bins -- confirm against the code that updates it.
     */
    uint64_t access_size_count[6];
    int64_t  max_lru_len;  /* maximum number of entries that have appeared in the LRU */
    int64_t  max_lru_size; /* maximum size of the LRU */
    int64_t  lru_md_skips; /* metadata entries skipped during eviction searches */
    int64_t  lru_rd_skips; /* raw data entries skipped during eviction searches */

    /* In the VFD SWMR case, both pages and multi-page metadata entries
     * are stored in the index.  Thus multi-page metadata entries are
     * included in the index related statistics.
     */
    int64_t total_ht_insertions;              /* insertions into the hash table */
    int64_t total_ht_deletions;               /* deletions from the hash table */
    int64_t successful_ht_searches;           /* successful hash table searches */
    int64_t total_successful_ht_search_depth; /* non-target entries examined in successful searches */
    int64_t failed_ht_searches;               /* unsuccessful hash table searches */
    int64_t total_failed_ht_search_depth;     /* entries examined in unsuccessful searches */
    int64_t max_index_len;                    /* largest value attained by index_len */
    int64_t max_clean_index_len;              /* largest value attained by clean_index_len */
    int64_t max_dirty_index_len;              /* largest value attained by dirty_index_len */
    int64_t max_index_size;                   /* largest value attained by index_size */
    int64_t max_clean_index_size;             /* largest value attained by clean_index_size */
    int64_t max_dirty_index_size;             /* largest value attained by dirty_index_size */
    int64_t max_rd_pages;                     /* maximum number of raw data pages */
    int64_t max_md_pages;                     /* maximum number of metadata pages */

    /* vfd swmr statistics */
    int64_t max_mpmde_count;     /* maximum number of multi-page metadata entries */
    int64_t lru_tl_skips;        /* LRU entries skipped during eviction because they are on the tick list */
    int64_t max_tl_len;          /* maximum value of tl_len */
    int64_t max_tl_size;         /* maximum value of tl_size */
    int64_t delayed_writes;      /* total number of delayed writes */
    int64_t total_delay;         /* total number of ticks by which entry writes were delayed */
    int64_t max_dwl_len;         /* maximum value of dwl_len */
    int64_t max_dwl_size;        /* maximum value of dwl_size */
    int64_t total_dwl_ins_depth; /* total insertion depth needed to keep the delayed write list ordered */
    int64_t md_read_splits;      /* metadata reads split into two or three sub-reads */
    int64_t md_write_splits;     /* metadata writes split into two or three sub-writes */

} H5PB_t;

/*****************************/
/* Library-private Variables */
/*****************************/

/***************************************/
/* Library-private Function Prototypes */
/***************************************/

/* General routines
 *
 * All of these return herr_t.  The definitions are not part of this header,
 * so the one-line summaries below (other than for H5PB_create) are
 * NOTE(review): inferred from the routine names -- confirm against the
 * definitions before relying on them.
 */

/* Receives the page buffer configuration (maximum size and the minimum
 * metadata / raw data percentages) that is recorded in the FAPL fields of
 * H5PB_t.
 */
H5_DLL herr_t H5PB_create(H5F_shared_t *shared, size_t page_buffer_size, unsigned page_buf_min_meta_perc,
                          unsigned page_buf_min_raw_perc);

/* Presumably flushes the page buffer of the file described by shared. */
H5_DLL herr_t H5PB_flush(H5F_shared_t *shared);

/* Presumably destroys the page buffer of the file described by shared. */
H5_DLL herr_t H5PB_dest(H5F_shared_t *shared);

/* Presumably adds a newly allocated page of the given memory type. */
H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *shared, H5FD_mem_t type, haddr_t page_addr);

/* Presumably updates buffered data at addr with size bytes taken from buf. */
H5_DLL herr_t H5PB_update_entry(H5PB_t *page_buf, haddr_t addr, size_t size, const void *buf);

/* Presumably removes the entry at addr from the page buffer. */
H5_DLL herr_t H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr);

/* Presumably removes the entries in the range starting at addr. */
H5_DLL herr_t H5PB_remove_entries(H5F_shared_t *shared, haddr_t addr, hsize_t size);

/* Presumably reads size bytes at addr into buf via the page buffer. */
H5_DLL herr_t H5PB_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
                        void *buf /*out*/);

/* Presumably writes size bytes from buf to addr via the page buffer. */
H5_DLL herr_t H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
                         const void *buf);

/* VFD SWMR specific routines
 *
 * NOTE(review): the definitions are not part of this header; the summaries
 * below are inferred from the routine names and from the description of
 * the tick list and delayed write list in the H5PB_t header comment --
 * confirm against the definitions.
 */

/* Presumably performs the end of tick scan of the delayed write list. */
H5_DLL herr_t H5PB_vfd_swmr__release_delayed_writes(H5F_shared_t *shared);

/* Presumably empties the tick list at the end of a tick. */
H5_DLL herr_t H5PB_vfd_swmr__release_tick_list(H5F_shared_t *shared);

/* Presumably refreshes the page buffer's copy of the current tick (cur_tick). */
H5_DLL herr_t H5PB_vfd_swmr__set_tick(H5F_shared_t *shared);

/* Presumably updates the VFD SWMR index, reporting counts of index entries
 * through the four uint32_t output pointers.
 */
H5_DLL herr_t H5PB_vfd_swmr__update_index(H5F_t *f, uint32_t *idx_ent_added_ptr,
                                          uint32_t *idx_ent_modified_ptr, uint32_t *idx_ent_not_in_tl_ptr,
                                          uint32_t *idx_ent_not_in_tl_flushed_ptr);

/* Statistics routines
 *
 * NOTE(review): the definitions are not part of this header; the summaries
 * below are inferred from the routine names -- confirm against the
 * definitions.
 */

/* Presumably resets the statistics fields of page_buf. */
H5_DLL herr_t H5PB_reset_stats(H5PB_t *page_buf);

/* Presumably copies statistics out of page_buf.  Each output array has two
 * elements, whereas the counters in H5PB_t have H5PB__NUM_STAT_TYPES (3);
 * presumably only the metadata and raw data counts are reported -- confirm.
 */
H5_DLL herr_t H5PB_get_stats(const H5PB_t *page_buf, unsigned accesses[2], unsigned hits[2],
                             unsigned misses[2], unsigned evictions[2], unsigned bypasses[2]);

/* Presumably prints the statistics held in page_buf. */
H5_DLL herr_t H5PB_print_stats(const H5PB_t *page_buf);

/* test & debug functions */

/* NOTE(review): definition not part of this header; presumably reports
 * through page_exists_ptr whether a page at addr is resident in the page
 * buffer of the file described by shared -- confirm against the definition.
 */
H5_DLL herr_t H5PB_page_exists(H5F_shared_t *shared, haddr_t addr, hbool_t *page_exists_ptr);

#endif /* H5PBprivate_H */