From bcb26565a732062759a2428e167e27ad3c2f6e96 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 11 Apr 2019 15:19:48 -0700 Subject: improved documentation for LZ4 dictionary compression --- README.md | 8 +++++--- doc/lz4_manual.html | 11 ++++++++--- doc/lz4frame_manual.html | 2 +- lib/lz4.h | 11 ++++++++--- lib/lz4frame.h | 20 +++++++++++++++++++- programs/README.md | 13 +++++++++---- 6 files changed, 50 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e64020d..4e139e3 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,11 @@ trading CPU time for improved compression ratio. All versions feature the same decompression speed. LZ4 is also compatible with [dictionary compression](https://github.com/facebook/zstd#the-case-for-small-data-compression), -and can ingest any input file as dictionary, -including those created by [Zstandard Dictionary Builder](https://github.com/facebook/zstd/blob/v1.3.5/programs/zstd.1.md#dictionary-builder). -(note: only the final 64KB are used). +both at [API](https://github.com/lz4/lz4/blob/v1.8.3/lib/lz4frame.h#L481) and [CLI](https://github.com/lz4/lz4/blob/v1.8.3/programs/lz4.1.md#operation-modifiers) levels. +It can ingest any input file as dictionary, though only the final 64KB are used. +This capability can be combined with the [Zstandard Dictionary Builder](https://github.com/facebook/zstd/blob/v1.3.5/programs/zstd.1.md#dictionary-builder), +in order to drastically improve compression performance on small files. + LZ4 library is provided as open-source software using BSD 2-Clause license. diff --git a/doc/lz4_manual.html b/doc/lz4_manual.html index ef1a8b5..4cd21fc 100644 --- a/doc/lz4_manual.html +++ b/doc/lz4_manual.html @@ -180,10 +180,15 @@ int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int src


int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
-

Use this function to load a static dictionary into LZ4_stream_t. - Any previous data will be forgotten, only 'dictionary' will remain in memory. +

Use this function to reference a static dictionary into LZ4_stream_t. + The dictionary must remain available during compression. + LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + The same dictionary will have to be loaded on the decompression side for successful decoding. + Dictionaries are useful for better compression of small data (KB range). + While LZ4 accepts any input as dictionary, + results are generally better when using Zstandard's Dictionary Builder. Loading a size of 0 is allowed, and is the same as reset. - @return : dictionary size, in bytes (necessarily <= 64 KB) + @return : loaded dictionary size, in bytes (necessarily <= 64 KB)


diff --git a/doc/lz4frame_manual.html b/doc/lz4frame_manual.html index d5496a1..914405f 100644 --- a/doc/lz4frame_manual.html +++ b/doc/lz4frame_manual.html @@ -343,7 +343,7 @@ LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx);
LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
 LZ4FLIB_STATIC_API void        LZ4F_freeCDict(LZ4F_CDict* CDict);
-

When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. +

When compressing multiple messages / blocks using the same dictionary, it's recommended to load it just once. LZ4_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. LZ4_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. `dictBuffer` can be released after LZ4_CDict creation, since its content is copied within CDict diff --git a/lib/lz4.h b/lib/lz4.h index b86417a..935b55f 100644 --- a/lib/lz4.h +++ b/lib/lz4.h @@ -263,10 +263,15 @@ LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); /*! LZ4_loadDict() : - * Use this function to load a static dictionary into LZ4_stream_t. - * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on the decompression side for successful decoding. + * Dictionaries are useful for better compression of small data (KB range). + * While LZ4 accepts any input as dictionary, + * results are generally better when using Zstandard's Dictionary Builder. * Loading a size of 0 is allowed, and is the same as reset. 
- * @return : dictionary size, in bytes (necessarily <= 64 KB) + * @return : loaded dictionary size, in bytes (necessarily <= 64 KB) */ LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); diff --git a/lib/lz4frame.h b/lib/lz4frame.h index 2ada8b8..5c68628 100644 --- a/lib/lz4frame.h +++ b/lib/lz4frame.h @@ -524,10 +524,28 @@ LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(unsigned); /********************************** * Bulk processing dictionary API *********************************/ + +/* A Dictionary is useful for the compression of small messages (KB range). + * It dramatically improves compression efficiency. + * + * LZ4 can ingest any input as dictionary, though only the last 64 KB are useful. + * Best results are generally achieved by using Zstandard's Dictionary Builder + * to generate a high-quality dictionary from a set of samples. + * + * Loading a dictionary has a cost, since it involves construction of tables. + * The Bulk processing dictionary API makes it possible to share this cost + * over an arbitrary number of compression jobs, even concurrently, + * markedly improving compression latency for these cases. + * + * The same dictionary will have to be used on the decompression side + * for decoding to be successful. + * To help identify the correct dictionary at decoding stage, + * the frame header allows optional embedding of a dictID field. + */ typedef struct LZ4F_CDict_s LZ4F_CDict; /*! LZ4_createCDict() : - * When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. + * When compressing multiple messages / blocks using the same dictionary, it's recommended to load it just once. * LZ4_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. * LZ4_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. 
* `dictBuffer` can be released after LZ4_CDict creation, since its content is copied within CDict */ diff --git a/programs/README.md b/programs/README.md index 2ad0449..23c944b 100644 --- a/programs/README.md +++ b/programs/README.md @@ -38,7 +38,9 @@ Arguments : -9 : High compression -d : decompression (default for .lz4 extension) -z : force compression + -D FILE: use FILE as dictionary -f : overwrite output without prompting + -k : preserve source file(s) (default) --rm : remove source file(s) after successful de/compression -h/-H : display help/long help and exit @@ -51,17 +53,20 @@ Advanced arguments : -m : multiple input files (implies automatic output filenames) -r : operate recursively on directories (sets also -m) -l : compress using Legacy format (Linux kernel compression) - -B# : Block size [4-7] (default : 7) + -B# : cut file into blocks of size # bytes [32+] + or predefined block size [4-7] (default: 7) -BD : Block dependency (improve compression ratio) + -BX : enable block checksum (default:disabled) --no-frame-crc : disable stream checksum (default:enabled) --content-size : compressed frame includes original size (default:not present) --[no-]sparse : sparse mode (default:enabled on file, disabled on stdout) +--favor-decSpeed: compressed files decompress faster, but are less compressed +--fast[=#]: switch to ultra fast compression level (default: 1) + Benchmark arguments : -b# : benchmark file(s), using # compression level (default : 1) -e# : test all compression levels from -bX to # (default : 1) - -i# : minimum evaluation time in seconds (default : 3s) - -B# : cut file into independent blocks of size # bytes [32+] - or predefined block size [4-7] (default: 7) + -i# : minimum evaluation time in seconds (default : 3s)``` ``` #### License -- cgit v0.12