From 3430ee1bc0d90e32c47a036eb39c3b0d16ba5451 Mon Sep 17 00:00:00 2001 From: "yann.collet.73@gmail.com" Date: Sat, 25 Feb 2012 19:13:16 +0000 Subject: Added : format description file Added : new tuning parameter : LZ4_COMPRESSMIN. Thanks to Maciej Adamczyk for suggestion. changed : macro for bswap16, in order to help GCC intrinsic detection. Thank to Erik Andersen for suggestion. git-svn-id: https://lz4.googlecode.com/svn/trunk@57 650e7d94-2a16-8b24-b05c-7c0b3f6821cd --- lz4.c | 15 ++++-- lz4.h | 34 +++++++------ lz4_format_description.txt | 119 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 17 deletions(-) create mode 100644 lz4_format_description.txt diff --git a/lz4.c b/lz4.c index a131a81..cce3b17 100644 --- a/lz4.c +++ b/lz4.c @@ -45,6 +45,13 @@ // The default value (6) is recommended #define NOTCOMPRESSIBLE_CONFIRMATION 6 +// LZ4_COMPRESSMIN : +// Compression function will *fail* if it is not successful at compressing input by at least LZ4_COMPRESSMIN bytes +// Since the compression function stops working prematurely, it results in a speed gain +// The output however is unusable. Compression function result will be zero. +// Default : 0 = disabled +#define LZ4_COMPRESSMIN 0 + // BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : // This will provide a boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU. // You can set this option to 1 in situations where data will stay within closed environment @@ -92,13 +99,13 @@ #endif #ifdef _MSC_VER -#define inline __forceinline // Visual is not C99, but supports inline +#define inline __forceinline // Visual is not C99, but supports some kind of inline #endif #ifdef _MSC_VER // Visual Studio -#define bswap16(i) _byteswap_ushort(i) +#define bswap16(x) _byteswap_ushort(x) #else -#define bswap16(i) (((i)>>8) | ((i)<<8)) +#define bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) #endif @@ -429,6 +436,7 @@ _last_literals: // Encode Last Literals { int lastRun = iend - anchor; + if ((LZ4_COMPRESSMIN>0) && (((op - (BYTE*)dest) + lastRun + 1 + ((lastRun-15)/255)) > isize - LZ4_COMPRESSMIN)) return 0; if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } else *op++ = (lastRun<0) && (((op - (BYTE*)dest) + lastRun + 1 + ((lastRun-15)/255)) > isize - LZ4_COMPRESSMIN)) return 0; if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } else *op++ = (lastRun<= 255 +- 10 : (=280 - 15 - 255) ) remaining length to reach 280 + +Example 3 : A length of 15 will be represented as : +- 15 : value for the 4-bits High field +- 0 : (=15-15) yes, the zero must be output + +Following the token and optional length bytes, are the literals themselves. +They are exactly as numerous as previously decoded (length of literals). +It's possible that there are zero literal. + + +Following the literals is the match copy operation. + +It starts by the offset. +This is a 2 bytes value, in little endian format : +the lower byte is the first one in the stream. + +The offset represents the position of the match to be copied from. +1 means "current position - 1 byte". +The maximum offset value is 65535, 65536 cannot be coded. +Note that 0 is an invalid value, not used. + +Then we need to extract the match length. +For this, we use the second token field, the low 4-bits. +Value, obviously, ranges from 0 to 15. +However here, 0 means that the copy operation will be minimal. +The minimum length of a match, called minmatch, is 4. +As a consequence, a 0 value means 4 bytes, and a value of 15 means 19+ bytes. +Similar to literal length, on reaching the highest possible value (15), +we output additional bytes, one at a time, with values ranging from 0 to 255. +They are added to total to provide the final match length. +A 255 value means there is another byte to read and add. +There is no limit to the number of optional bytes that can be output this way. +(This points towards a maximum achievable compression ratio of ~250). + +With the offset and the matchlength, +the decoder can now proceed to copy the data from the already decoded buffer. +On decoding the matchlength, we reach the end of the compressed sequence, +and therefore start another one. + + +-- Parsing restrictions -- + +There are specific parsing rules to respect in order to remain compatible +with assumptions made by the decoder : +1) The last 5 bytes are always literals +2) The last match must start at least 12 bytes before end of stream +Consequently, a file with less than 13 bytes cannot be compressed. +These rules are in place to ensure that the decoder +will never read beyond the input buffer, nor write beyond the output buffer. + +Note that the last sequence is also incomplete, +and stops right after literals. + + +-- Additional notes -- + +There is no assumption nor limits to the way the compressor +searches and selects matches within the source stream. +It could be a fast scan, a multi-probe, a full search using BST, +standard hash chains or MMC, well whatever. + +Advanced parsing strategies can also be implemented, such as lazy match, +or full optimal parsing. + +All these trade-off offer distinctive speed/memory/compression advantages. +Whatever the method used by the compressor, its result will be decodable +by any LZ4 decoder if it follows the format described above. + -- cgit v0.12