From 8d76c8a44a15cc7c0c1f345ba750e44edac7abb7 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 11 Apr 2019 14:15:33 -0700 Subject: introduce LZ4_DISTANCE_MAX build macro make it possible to generate LZ4-compressed block with a controlled maximum offset (necessarily <= 65535). This could be useful for compatibility with decoders using a very limited memory budget (<64 KB). Answer #154 --- doc/lz4_manual.html | 1 + doc/lz4frame_manual.html | 61 +++++++++++++++++++++++++++++++++++++----------- lib/README.md | 7 ++++++ lib/lz4.c | 35 +++++++++++++++------------ lib/lz4hc.c | 16 ++++++------- programs/Makefile | 2 +- 6 files changed, 84 insertions(+), 38 deletions(-) diff --git a/doc/lz4_manual.html b/doc/lz4_manual.html index 1c6dba7..ef1a8b5 100644 --- a/doc/lz4_manual.html +++ b/doc/lz4_manual.html @@ -396,6 +396,7 @@ union LZ4_stream_u { Note : initialization fails if size and alignment conditions are not respected. In which case, the function will @return NULL. Note2: An LZ4_stream_t structure guarantees correct alignment and size. + Note3: Before v1.9.0, use LZ4_resetStream() instead


diff --git a/doc/lz4frame_manual.html b/doc/lz4frame_manual.html index 4277c3c..d5496a1 100644 --- a/doc/lz4frame_manual.html +++ b/doc/lz4frame_manual.html @@ -237,25 +237,58 @@ LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx);

Streaming decompression functions


 
+
size_t LZ4F_headerSize(const void* src, size_t srcSize);
+

Provide the header size of a frame starting at `src`. + `srcSize` must be >= LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH, + which is enough to decode the header length. + @return : size of frame header + or an error code, which can be tested using LZ4F_isError() + note : Frame header size is variable, but is guaranteed to be + >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes. + +


+
size_t LZ4F_getFrameInfo(LZ4F_dctx* dctx,
                                      LZ4F_frameInfo_t* frameInfoPtr,
                                      const void* srcBuffer, size_t* srcSizePtr);
 

This function extracts frame parameters (max blockSize, dictID, etc.). - Its usage is optional. - Extracted information is typically useful for allocation and dictionary. - This function works in 2 situations : - - At the beginning of a new frame, in which case - it will decode information from `srcBuffer`, starting the decoding process. - Input size must be large enough to successfully decode the entire frame header. - Frame header size is variable, but is guaranteed to be <= LZ4F_HEADER_SIZE_MAX bytes. - It's allowed to provide more input data than this minimum. - - After decoding has been started. - In which case, no input is read, frame parameters are extracted from dctx. - - If decoding has barely started, but not yet extracted information from header, + Its usage is optional: user can call LZ4F_decompress() directly. + + Extracted information will fill an existing LZ4F_frameInfo_t structure. + This can be useful for allocation and dictionary identification purposes. + + LZ4F_getFrameInfo() can work in the following situations : + + 1) At the beginning of a new frame, before any invocation of LZ4F_decompress(). + It will decode header from `srcBuffer`, + consuming the header and starting the decoding process. + + Input size must be large enough to contain the full frame header. + Frame header size can be known beforehand by LZ4F_headerSize(). + Frame header size is variable, but is guaranteed to be >= LZ4F_HEADER_SIZE_MIN bytes, + and <= LZ4F_HEADER_SIZE_MAX bytes. + Hence, blindly providing LZ4F_HEADER_SIZE_MAX bytes or more will always work. + It's allowed to provide more input data than the header size, + LZ4F_getFrameInfo() will only consume the header. + + If input size is not large enough, + aka if it's smaller than header size, + function will fail and return an error code. + + 2) After decoding has been started, + it's possible to invoke LZ4F_getFrameInfo() anytime + to extract already decoded frame parameters stored within dctx. 
+ + Note that, if decoding has barely started, + and not yet read enough information to decode the header, LZ4F_getFrameInfo() will fail. - The number of bytes consumed from srcBuffer will be updated within *srcSizePtr (necessarily <= original value). - Decompression must resume from (srcBuffer + *srcSizePtr). - @return : an hint about how many srcSize bytes LZ4F_decompress() expects for next call, + + The number of bytes consumed from srcBuffer will be updated in *srcSizePtr (necessarily <= original value). + LZ4F_getFrameInfo() only consumes bytes when decoding has not yet started, + and when decoding the header has been successful. + Decompression must then resume from (srcBuffer + *srcSizePtr). + + @return : a hint about how many srcSize bytes LZ4F_decompress() expects for next call, or an error code which can be tested using LZ4F_isError(). note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely. note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure. diff --git a/lib/README.md b/lib/README.md index be8eba0..c6daaea 100644 --- a/lib/README.md +++ b/lib/README.md @@ -52,6 +52,13 @@ The following build macro can be determined at compilation time : For example, with `gcc` : `-DLZ4_FAST_DEC_LOOP=1`, and with `make` : `CPPFLAGS+=-DLZ4_FAST_DEC_LOOP=1 make lz4`. +- `LZ4_DISTANCE_MAX` : control the maximum offset that the compressor will allow. + Set to 65535 by default, which is the maximum value supported by lz4 format. + Reducing maximum distance will reduce opportunities for LZ4 to find matches, + hence will produce a worse compression ratio. + However, a smaller max distance may allow compatibility with specific decoders using a limited memory budget. + This build macro only influences the compressed output of the compressor. + - `LZ4_DISABLE_DEPRECATE_WARNINGS` : invoking a deprecated function will make the compiler generate a warning. 
This is meant to invite users to update their source code. Should this be a problem, it's generally to make the compiler ignore these warnings, diff --git a/lib/lz4.c b/lib/lz4.c index 34587ce..ca3684f 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -395,8 +395,13 @@ static const int LZ4_minLength = (MFLIMIT+1); #define MB *(1 <<20) #define GB *(1U<<30) -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) +#ifndef LZ4_DISTANCE_MAX /* can be user - defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 +#endif + +#if (LZ4_DISTANCE_MAX > 65535) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif #define ML_BITS 4 #define ML_MASK ((1U< MAX_DISTANCE back, is faster + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, is faster * than compressing without a gap. However, compressing with * currentOffset == 0 is faster still, so we preserve that case. */ @@ -850,7 +855,7 @@ LZ4_FORCE_INLINE int LZ4_compress_generic( forwardH = LZ4_hashPosition(forwardIp, tableType); LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); - } while ( (match+MAX_DISTANCE < ip) + } while ( (match+LZ4_DISTANCE_MAX < ip) || (LZ4_read32(match) != LZ4_read32(ip)) ); } else { /* byU32, byU16 */ @@ -901,8 +906,8 @@ LZ4_FORCE_INLINE int LZ4_compress_generic( if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) continue; /* match outside of valid area */ assert(matchIndex < current); - if ((tableType != byU16) && (matchIndex+MAX_DISTANCE < current)) continue; /* too far */ - if (tableType == byU16) assert((current - matchIndex) <= MAX_DISTANCE); /* too_far presumed impossible with byU16 */ + if ((tableType != byU16) && (matchIndex+LZ4_DISTANCE_MAX < current)) continue; /* too far */ + if (tableType == byU16) assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* too_far presumed impossible with byU16 */ if (LZ4_read32(match) == LZ4_read32(ip)) { if (maybe_extMem) offset = current - matchIndex; @@ -961,11 
+966,11 @@ _next_match: /* Encode Offset */ if (maybe_extMem) { /* static test */ DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); - assert(offset <= MAX_DISTANCE && offset > 0); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); LZ4_writeLE16(op, (U16)offset); op+=2; } else { DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); - assert(ip-match <= MAX_DISTANCE); + assert(ip-match <= LZ4_DISTANCE_MAX); LZ4_writeLE16(op, (U16)(ip - match)); op+=2; } @@ -1030,7 +1035,7 @@ _next_match: match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); LZ4_putPosition(ip, cctx->hashTable, tableType, base); - if ( (match+MAX_DISTANCE >= ip) + if ( (match+LZ4_DISTANCE_MAX >= ip) && (LZ4_read32(match) == LZ4_read32(ip)) ) { token=op++; *token=0; goto _next_match; } @@ -1065,7 +1070,7 @@ _next_match: LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); assert(matchIndex < current); if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) - && ((tableType==byU16) ? 1 : (matchIndex+MAX_DISTANCE >= current)) + && ((tableType==byU16) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) && (LZ4_read32(match) == LZ4_read32(ip)) ) { token=op++; *token=0; @@ -1132,14 +1137,14 @@ int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int if (inputSize < LZ4_64Klimit) { return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > MAX_DISTANCE)) ? byPtr : byU32; + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); } } else { if (inputSize < LZ4_64Klimit) {; return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > MAX_DISTANCE)) ? byPtr : byU32; + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); } } @@ -1169,7 +1174,7 @@ int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); } } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > MAX_DISTANCE)) ? byPtr : byU32; + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; LZ4_prepareTable(ctx, srcSize, tableType); return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); } @@ -1183,7 +1188,7 @@ int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); } } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > MAX_DISTANCE)) ? byPtr : byU32; + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; LZ4_prepareTable(ctx, srcSize, tableType); return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); } @@ -1246,7 +1251,7 @@ static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, if (*srcSizePtr < LZ4_64Klimit) { return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); } else { - tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > MAX_DISTANCE)) ? byPtr : byU32; + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); } } } diff --git a/lib/lz4hc.c b/lib/lz4hc.c index 411b6cc..a6dc7a2 100644 --- a/lib/lz4hc.c +++ b/lib/lz4hc.c @@ -132,7 +132,7 @@ LZ4_FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip) while (idx < target) { U32 const h = LZ4HC_hashPtr(base+idx); size_t delta = idx - hashTable[h]; - if (delta>MAX_DISTANCE) delta = MAX_DISTANCE; + if (delta>LZ4_DISTANCE_MAX) delta = LZ4_DISTANCE_MAX; DELTANEXTU16(chainTable, idx) = (U16)delta; hashTable[h] = idx; idx++; @@ -235,7 +235,7 @@ LZ4HC_InsertAndGetWiderMatch ( const U32 dictLimit = hc4->dictLimit; const BYTE* const lowPrefixPtr = base + dictLimit; const U32 ipIndex = (U32)(ip - base); - const U32 lowestMatchIndex = (hc4->lowLimit + 64 KB > ipIndex) ? hc4->lowLimit : ipIndex - MAX_DISTANCE; + const U32 lowestMatchIndex = (hc4->lowLimit + 64 KB > ipIndex) ? 
hc4->lowLimit : ipIndex - LZ4_DISTANCE_MAX; const BYTE* const dictBase = hc4->dictBase; int const lookBackLength = (int)(ip-iLowLimit); int nbAttempts = maxNbAttempts; @@ -325,7 +325,7 @@ LZ4HC_InsertAndGetWiderMatch ( const BYTE* const matchPtr = base + matchCandidateIdx; if (LZ4_read32(matchPtr) == pattern) { /* good candidate */ size_t const forwardPatternLength = LZ4HC_countPattern(matchPtr+sizeof(pattern), iHighLimit, pattern) + sizeof(pattern); - const BYTE* const lowestMatchPtr = (lowPrefixPtr + MAX_DISTANCE >= ip) ? lowPrefixPtr : ip - MAX_DISTANCE; + const BYTE* const lowestMatchPtr = (lowPrefixPtr + LZ4_DISTANCE_MAX >= ip) ? lowPrefixPtr : ip - LZ4_DISTANCE_MAX; size_t const backLength = LZ4HC_reverseCountPattern(matchPtr, lowestMatchPtr, pattern); size_t const currentSegmentLength = backLength + forwardPatternLength; @@ -338,7 +338,7 @@ LZ4HC_InsertAndGetWiderMatch ( size_t const maxML = MIN(currentSegmentLength, srcPatternLength); if ((size_t)longest < maxML) { assert(base + matchIndex < ip); - if (ip - (base+matchIndex) > MAX_DISTANCE) break; + if (ip - (base+matchIndex) > LZ4_DISTANCE_MAX) break; assert(maxML < 2 GB); longest = (int)maxML; *matchpos = base + matchIndex; /* virtual pos, relative to ip, to retrieve offset */ @@ -359,12 +359,12 @@ LZ4HC_InsertAndGetWiderMatch ( if ( dict == usingDictCtxHc && nbAttempts - && ipIndex - lowestMatchIndex < MAX_DISTANCE) { + && ipIndex - lowestMatchIndex < LZ4_DISTANCE_MAX) { size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->base); U32 dictMatchIndex = dictCtx->hashTable[LZ4HC_hashPtr(ip)]; assert(dictEndOffset <= 1 GB); matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset; - while (ipIndex - matchIndex <= MAX_DISTANCE && nbAttempts--) { + while (ipIndex - matchIndex <= LZ4_DISTANCE_MAX && nbAttempts--) { const BYTE* const matchPtr = dictCtx->base + dictMatchIndex; if (LZ4_read32(matchPtr) == pattern) { @@ -453,7 +453,7 @@ LZ4_FORCE_INLINE int LZ4HC_encodeSequence ( *op += length; 
/* Encode Offset */ - assert( (*ip - match) <= MAX_DISTANCE ); /* note : consider providing offset as a value, rather than as a pointer difference */ + assert( (*ip - match) <= LZ4_DISTANCE_MAX ); /* note : consider providing offset as a value, rather than as a pointer difference */ LZ4_writeLE16(*op, (U16)(*ip-match)); *op += 2; /* Encode MatchLength */ @@ -1435,7 +1435,7 @@ static int LZ4HC_compress_optimal ( LZ4HC_CCtx_internal* ctx, if (ml == 1) { ip++; rPos++; continue; } /* literal; note: can end up with several literals, in which case, skip them */ rPos += ml; assert(ml >= MINMATCH); - assert((offset >= 1) && (offset <= MAX_DISTANCE)); + assert((offset >= 1) && (offset <= LZ4_DISTANCE_MAX)); opSaved = op; if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ip - offset, limit, oend) ) /* updates ip, op and anchor */ goto _dest_overflow; diff --git a/programs/Makefile b/programs/Makefile index af461fe..92fd683 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -94,7 +94,7 @@ lz4.1: lz4.1.md $(LIBVER_SRC) man: lz4.1 clean-man: - rm lz4.1 + $(RM) lz4.1 preview-man: clean-man man man ./lz4.1 -- cgit v0.12