From e1d1bf607f2128d0066b6f4a50c61d91e8732db7 Mon Sep 17 00:00:00 2001
From: Robb Matzke
Date: Fri, 23 Oct 1998 10:44:02 -0500
Subject: [svn-r779] Changes since 19981022
----------------------

./MANIFEST
./doc/html/study.html			[DELETED]
./doc/html/study_1000x1000.gif		[DELETED]
./doc/html/study_250x250.gif		[DELETED]
./doc/html/study_499x499.gif		[DELETED]
./doc/html/study_5000x1000.gif		[DELETED]
./doc/html/study_500x500.gif		[DELETED]
./doc/html/study_p1.gif			[DELETED]
./doc/html/study_p1.obj			[DELETED]
	Removed these old files -- the data was from before the chunk
	cache was implemented and therefore the whole file was pretty
	much garbage.

./MANIFEST
./doc/html/Chunking.html		[NEW]
./doc/html/Chunk_f1.gif			[NEW]
./doc/html/Chunk_f1.obj			[NEW]
./doc/html/Chunk_f2.gif			[NEW]
./doc/html/Chunk_f2.obj			[NEW]
./doc/html/Chunk_f3.gif			[NEW]
./doc/html/Chunk_f4.gif			[NEW]
./doc/html/Chunk_f5.gif			[NEW]
./doc/html/Chunk_f6.gif			[NEW]
./doc/html/Chunk_f6.obj			[NEW]
	New documentation for chunking. It's not quite complete but it
	gives a pretty good idea of some of the issues affecting
	performance.
---
 MANIFEST                     |  18 +++-
 doc/html/study.html          | 172 -------------------------------------------
 doc/html/study_1000x1000.gif | Bin 6594 -> 0 bytes
 doc/html/study_250x250.gif   | Bin 6914 -> 0 bytes
 doc/html/study_499x499.gif   | Bin 10429 -> 0 bytes
 doc/html/study_5000x1000.gif | Bin 10653 -> 0 bytes
 doc/html/study_500x500.gif   | Bin 6842 -> 0 bytes
 doc/html/study_p1.gif        | Bin 6550 -> 0 bytes
 doc/html/study_p1.obj        | 113 ----------------------------
 9 files changed, 10 insertions(+), 293 deletions(-)
 delete mode 100644 doc/html/study.html
 delete mode 100644 doc/html/study_1000x1000.gif
 delete mode 100644 doc/html/study_250x250.gif
 delete mode 100644 doc/html/study_499x499.gif
 delete mode 100644 doc/html/study_5000x1000.gif
 delete mode 100644 doc/html/study_500x500.gif
 delete mode 100644 doc/html/study_p1.gif
 delete mode 100644 doc/html/study_p1.obj

diff --git a/MANIFEST b/MANIFEST
index 8112a82..129bd01 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -58,6 +58,16 @@
 ./doc/html/Attributes.html
 ./doc/html/Big.html
 ./doc/html/Caching.html
+./doc/html/Chunking.html
+./doc/html/Chunk_f1.gif
+./doc/html/Chunk_f1.obj _DO_NOT_DISTRIBUTE_
+./doc/html/Chunk_f2.gif
+./doc/html/Chunk_f2.obj _DO_NOT_DISTRIBUTE_
+./doc/html/Chunk_f3.gif
+./doc/html/Chunk_f4.gif
+./doc/html/Chunk_f5.gif
+./doc/html/Chunk_f6.gif
+./doc/html/Chunk_f6.obj _DO_NOT_DISTRIBUTE_
 ./doc/html/CodeReview.html _DO_NOT_DISTRIBUTE_
 ./doc/html/Coding.html
 ./doc/html/Copyright.html
@@ -132,14 +142,6 @@
 ./doc/html/review1.html _DO_NOT_DISTRIBUTE_
 ./doc/html/review1a.html _DO_NOT_DISTRIBUTE_
 ./doc/html/storage.html _DO_NOT_DISTRIBUTE_
-./doc/html/study.html _DO_NOT_DISTRIBUTE_
-./doc/html/study_1000x1000.gif _DO_NOT_DISTRIBUTE_
-./doc/html/study_250x250.gif _DO_NOT_DISTRIBUTE_
-./doc/html/study_499x499.gif _DO_NOT_DISTRIBUTE_
-./doc/html/study_5000x1000.gif _DO_NOT_DISTRIBUTE_
-./doc/html/study_500x500.gif _DO_NOT_DISTRIBUTE_
-./doc/html/study_p1.gif _DO_NOT_DISTRIBUTE_
-./doc/html/study_p1.obj _DO_NOT_DISTRIBUTE_
 ./doc/html/symtab _DO_NOT_DISTRIBUTE_
 ./doc/html/version.gif
 ./doc/html/version.obj _DO_NOT_DISTRIBUTE_

diff --git a/doc/html/study.html b/doc/html/study.html
deleted file mode 100644
index f9e192d..0000000
--- a/doc/html/study.html
+++ /dev/null
@@ -1,172 +0,0 @@

Testing the chunked layout of HDF5


These are the results of a study of the chunked layout policy in HDF5. A 1000 by 1000 array of integers was written to a file dataset, extending the dataset with each write to create, in the end, a 5000 by 5000 array of 4-byte integers for a total data storage size of 100 million bytes.
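In outline, the write loop looks like the sketch below. It is only illustrative: it is written against the modern HDF5 C API (H5Dcreate2 and H5Dset_extent postdate this study; the contemporary call was H5Dextend), and the file and dataset names are invented.

    #include "hdf5.h"

    #define N    5000                 /* final extent of the square array */
    #define BLK  1000                 /* extent of one output block       */

    int main(void)
    {
        static int block[BLK][BLK];   /* one 1000x1000 block of integers  */
        hsize_t dims[2]    = {0, 0};  /* dataset starts empty             */
        hsize_t maxdims[2] = {H5S_UNLIMITED, H5S_UNLIMITED};
        hsize_t chunk[2]   = {500, 500};   /* chunk size under test       */
        hsize_t mdims[2]   = {BLK, BLK};
        hsize_t cur[2]     = {0, 0};       /* extent written so far       */

        hid_t file  = H5Fcreate("study.h5", H5F_ACC_TRUNC,
                                H5P_DEFAULT, H5P_DEFAULT);
        hid_t space = H5Screate_simple(2, dims, maxdims);
        hid_t dcpl  = H5Pcreate(H5P_DATASET_CREATE);
        H5Pset_chunk(dcpl, 2, chunk);
        hid_t dset  = H5Dcreate2(file, "study", H5T_NATIVE_INT, space,
                                 H5P_DEFAULT, dcpl, H5P_DEFAULT);
        hid_t mspace = H5Screate_simple(2, mdims, NULL);

        /* Write the 25 output blocks in row-major order, extending the
         * dataset just enough to cover each new block before writing it. */
        for (hsize_t i = 0; i < N; i += BLK) {
            for (hsize_t j = 0; j < N; j += BLK) {
                if (i + BLK > cur[0]) cur[0] = i + BLK;
                if (j + BLK > cur[1]) cur[1] = j + BLK;
                H5Dset_extent(dset, cur);

                hid_t   fspace   = H5Dget_space(dset);
                hsize_t start[2] = {i, j};
                H5Sselect_hyperslab(fspace, H5S_SELECT_SET, start, NULL,
                                    mdims, NULL);
                H5Dwrite(dset, H5T_NATIVE_INT, mspace, fspace,
                         H5P_DEFAULT, block);
                H5Sclose(fspace);
            }
        }

        H5Sclose(mspace); H5Pclose(dcpl); H5Sclose(space);
        H5Dclose(dset);   H5Fclose(file);
        return 0;
    }

With the 500 by 500 chunks of the first test, each H5Dwrite above touches four chunks, which is why the write half of Fig 2 below shows four file-level requests per block.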

[Figure: order that data was written]
Fig 1: Write-order of Output Blocks

After the array was written, it was read back in blocks of 500 by 500 elements in row-major order (that is, the top-left quadrant of output block one, then the top-right quadrant of output block one, then the top-left quadrant of output block two, etc.).
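The read pass can be sketched the same way (continuing from the fragment above, with dset still open; each H5Dread maps to one or more file-level requests depending on how the selection lines up with the chunks):

    /* Read back 100 blocks of 500x500 elements in row-major block order. */
    static int rblock[500][500];
    hsize_t rdims[2] = {500, 500};
    hid_t rmem    = H5Screate_simple(2, rdims, NULL);
    hid_t rfspace = H5Dget_space(dset);

    for (hsize_t i = 0; i < 5000; i += 500) {
        for (hsize_t j = 0; j < 5000; j += 500) {
            hsize_t start[2] = {i, j};
            H5Sselect_hyperslab(rfspace, H5S_SELECT_SET, start, NULL,
                                rdims, NULL);
            H5Dread(dset, H5T_NATIVE_INT, rmem, rfspace, H5P_DEFAULT, rblock);
        }
    }
    H5Sclose(rfspace);
    H5Sclose(rmem);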

I tried to answer two questions: how the chunk size affects the total file overhead, and how it affects the number and pattern of file-level I/O requests.

I started with chunk sizes that were multiples of the read block size, or k*(500, 500).

Table 1: Total File Overhead

  Chunk Size (elements)   Meta Data Overhead (ppm)   Raw Data Overhead (ppm)
  500 by 500                     85.84                        0.00
  1000 by 1000                   23.08                        0.00
  5000 by 1000                   23.08                        0.00
  250 by 250                    253.30                        0.00
  499 by 499                     85.84                   205164.84
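The raw data column of Table 1 can be checked by hand: the library allocates whole chunks, so a 5000 by 5000 array stored in c1 by c2 chunks occupies ceil(5000/c1) * ceil(5000/c2) full chunks of file space. The small standalone program below (not part of the library) reproduces the column; the meta data column is not modeled here since it depends on the B-tree node layout.

    #include <stdio.h>

    /* Reproduce the raw data overhead column of Table 1: allocated
     * chunk storage versus the 100,000,000 bytes of useful data. */
    int main(void)
    {
        const unsigned long long N = 5000, elt = 4;    /* 5000x5000 ints */
        const unsigned long long useful = N * N * elt; /* 100,000,000    */
        const unsigned long long cdims[][2] = {
            {500, 500}, {1000, 1000}, {5000, 1000}, {250, 250}, {499, 499}};

        for (size_t k = 0; k < sizeof cdims / sizeof cdims[0]; k++) {
            unsigned long long cx = cdims[k][0], cy = cdims[k][1];
            unsigned long long nchunks =
                ((N + cx - 1) / cx) * ((N + cy - 1) / cy);
            unsigned long long stored = nchunks * cx * cy * elt;
            printf("%4llu by %-4llu: %9llu extra bytes = %10.2f ppm\n",
                   cx, cy, stored - useful,
                   1e6 * (double)(stored - useful) / (double)useful);
        }
        return 0;
    }

For 499 by 499 chunks this gives 11 by 11 chunks holding 120,516,484 bytes, i.e. the 20,516,484 extra bytes (205,164.84 ppm) discussed with Fig 6 below.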

[Figure: 500x500]
Fig 2: Chunk size is 500x500

The first half of Figure 2 shows output to the file while the second half shows input. Each dot represents a file-level I/O request, and the lines that connect the dots are only for visual clarity; the size of each request is not indicated in the graph. The output block size is four times the chunk size, which results in four file-level write requests per block for a total of 100 requests. Since file space for the chunks was allocated in output order, and the input block size is 1/4 the output block size, the input shows a staircase effect. Each input request results in one file-level read request. The downward spike at about the 60-millionth byte is probably the result of a cache miss for the B-tree, and the downward spike at the end is probably a cache flush or file boot block update.


[Figure: 1000x1000]
Fig 3: Chunk size is 1000x1000

In this test I increased the chunk size to match the output block size, and one can see from the first half of the graph that 25 file-level write requests were issued, one for each output block. The read half of the test shows that four times as much data was read as was written. This is because HDF5 must read the entire chunk for any request that falls within that chunk, which is done because (1) if the data is compressed, the entire chunk must be decompressed, and (2) the library assumes that the chunk size was chosen to optimize disk performance.
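Because whole chunks are transferred no matter how small the selection, the raw data chunk cache determines how often the same chunk is fetched twice. As a hedged sketch using today's H5Pset_cache call (the 1998 interface differed, and the numbers are only illustrative), one might size the cache to hold at least one full chunk:

    /* Size the raw data chunk cache to comfortably hold one full
     * 1000x1000 chunk of ints (4,000,000 bytes). */
    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_cache(fapl,
                 0,                /* mdc_nelmts: unused in modern HDF5 */
                 521,              /* rdcc_nslots: number of hash slots */
                 8 * 1024 * 1024,  /* rdcc_nbytes: 8 MiB of chunk cache */
                 0.75);            /* rdcc_w0: chunk preemption policy  */
    hid_t file = H5Fopen("study.h5", H5F_ACC_RDONLY, fapl);
    H5Pclose(fapl);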


[Figure: 5000x1000]
Fig 4: Chunk size is 5000x1000

Increasing the chunk size further results in even worse performance, since both the read and write halves of the test re-read and re-write vast amounts of data. This shows that one should be careful that chunk sizes are not much larger than the typical partial I/O request.


[Figure: 250x250]
Fig 5: Chunk size is 250x250

If the chunk size is decreased, then the amount of data transferred between the disk and the library is optimal in the absence of caching, but the amount of meta data required to describe the chunk locations increases to roughly 253 parts per million (Table 1). One can also see that the final downward spike contains more file-level write requests, as the meta data is flushed to disk just before the file is closed.


[Figure: 499x499]
Fig 6: Chunk size is 499x499

This test shows the result of choosing a chunk size which is close to, but not equal to, the I/O block size. Because the total size of the array isn't a multiple of the chunk size, the library allocates an extra zone of chunks around the top and right edges of the array which are only partially filled. This results in 20,516,484 extra bytes of storage, a 20% increase in the total raw data storage size, although the amount of meta data overhead is the same as for the 500 by 500 test. In addition, the mismatch causes entire chunks to be read in order to update a few elements along the edge of a chunk, which results in a 3.6-fold increase in the amount of data transferred.
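That amplification can be modeled roughly by counting touched chunks, assuming every touched chunk is read in full and nothing stays cached between requests. The standalone sketch below (all names illustrative) prints about 3.98; the measured 3.6-fold figure is a little lower, presumably because the chunk cache absorbs some of the re-reads.

    #include <stdio.h>

    /* Count whole-chunk reads for 100 reads of 500x500 elements from a
     * 5000x5000 int array stored in 499x499 chunks, assuming each touched
     * chunk is read in full and never stays cached between requests. */
    int main(void)
    {
        const long N = 5000, blk = 500, c = 499, elt = 4;
        long long bytes_read = 0;

        for (long i = 0; i < N; i += blk)
            for (long j = 0; j < N; j += blk) {
                long rows = (i + blk - 1) / c - i / c + 1; /* chunk rows hit */
                long cols = (j + blk - 1) / c - j / c + 1; /* chunk cols hit */
                bytes_read += (long long)rows * cols * c * c * elt;
            }

        printf("%.2f-fold the useful data\n",
               (double)bytes_read / ((double)N * N * elt));
        return 0;
    }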


Robb Matzke
Last modified: Fri Jan 30 23:51:31 EST 1998

diff --git a/doc/html/study_1000x1000.gif b/doc/html/study_1000x1000.gif
deleted file mode 100644
index b7d5a83..0000000
Binary files a/doc/html/study_1000x1000.gif and /dev/null differ
diff --git a/doc/html/study_250x250.gif b/doc/html/study_250x250.gif
deleted file mode 100644
index fe35f39..0000000
Binary files a/doc/html/study_250x250.gif and /dev/null differ
diff --git a/doc/html/study_499x499.gif b/doc/html/study_499x499.gif
deleted file mode 100644
index 0d2038b..0000000
Binary files a/doc/html/study_499x499.gif and /dev/null differ
diff --git a/doc/html/study_5000x1000.gif b/doc/html/study_5000x1000.gif
deleted file mode 100644
index 0f3c290..0000000
Binary files a/doc/html/study_5000x1000.gif and /dev/null differ
diff --git a/doc/html/study_500x500.gif b/doc/html/study_500x500.gif
deleted file mode 100644
index 38dd7d6..0000000
Binary files a/doc/html/study_500x500.gif and /dev/null differ
diff --git a/doc/html/study_p1.gif b/doc/html/study_p1.gif
deleted file mode 100644
index 938d133..0000000
Binary files a/doc/html/study_p1.gif and /dev/null differ
diff --git a/doc/html/study_p1.obj b/doc/html/study_p1.obj
deleted file mode 100644
index 6fbf583..0000000
--- a/doc/html/study_p1.obj
+++ /dev/null
@@ -1,113 +0,0 @@
[113 deleted lines of TGIF 3.0-p5 drawing source for Fig 1, "Order that data
was written": a 5,000 x 5,000 array drawn as a 5 x 5 grid of 1,000 x 1,000
blocks numbered 1-25 in write order]
-- 
cgit v0.12