Merge pull request #528 from lz4/complexShortcut

Faster decoding speed
author: Yann Collet <Cyan4973@users.noreply.github.com> 2018-05-03 18:35:50 (GMT)
committer: GitHub <noreply@github.com> 2018-05-03 18:35:50 (GMT)
commit: 95607a749b8bbe6f9323408ddd740ef4ff248794 (patch)
tree: 18b0d16568a2f6b2e046db16362c215d2346ab9a /lib
parent: 85be6b8f6d5a91a8834e913b5f547ded49f7f714 (diff)
parent: 2e2c9f6ff353e9f1a4d23274eb4e5b7a5f7d654d (diff)
download: lz4-95607a749b8bbe6f9323408ddd740ef4ff248794.zip
lz4-95607a749b8bbe6f9323408ddd740ef4ff248794.tar.gz
lz4-95607a749b8bbe6f9323408ddd740ef4ff248794.tar.bz2
2 files changed, 108 insertions, 49 deletions
diff --git a/lib/lz4.c b/lib/lz4.c
index 1e0d8e9..3860c51 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1398,6 +1398,9 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
     const int safeDecode = (endOnInput==endOnInputSize);
     const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
 
+    /* Set up the "end" pointers for the shortcut. */
+    const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+    const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
 
     /* Special cases */
     if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT;                      /* targetOutputSize too high => just decode everything */
@@ -1407,39 +1410,56 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
 
     /* Main Loop : decode sequences */
     while (1) {
-        size_t length;
         const BYTE* match;
         size_t offset;
 
         unsigned const token = *ip++;
+        size_t length = token >> ML_BITS; /* literal length */
 
         assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
-        /* shortcut for common case :
-         * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes).
-         * this shortcut was tested on x86 and x64, where it improves decoding speed.
-         * it has not yet been benchmarked on ARM, Power, mips, etc.
-         * NOTE: The loop begins with a read, so we must have one byte left at the end. */
-        if (endOnInput
-          && ((ip + 14 /*maxLL*/ + 2 /*offset*/ < iend)
-            & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend)
-            & (token < (15<<ML_BITS))
-            & ((token & ML_MASK) != 15) ) ) {
-            size_t const ll = token >> ML_BITS;
-            size_t const off = LZ4_readLE16(ip+ll);
-            const BYTE* const matchPtr = op + ll - off;  /* pointer underflow risk ? */
-            if ((off >= 8) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
-                size_t const ml = (token & ML_MASK) + MINMATCH;
-                memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
-                memcpy(op + 0, matchPtr + 0, 8);
-                memcpy(op + 8, matchPtr + 8, 8);
-                memcpy(op +16, matchPtr +16, 2);
-                op += ml;
+
+        /* A two-stage shortcut for the most common case:
+         * 1) If the literal length is 0..14, and there is enough space,
+         * enter the shortcut and copy 16 bytes on behalf of the literals
+         * (in the fast mode, only 8 bytes can be safely copied this way).
+         * 2) Further if the match length is 4..18, copy 18 bytes in a similar
+         * manner; but we ensure that there's enough space in the output for
+         * those 18 bytes earlier, upon entering the shortcut (in other words,
+         * there is a combined check for both stages).
+         */
+        if ( (endOnInput ? length != RUN_MASK : length <= 8)
+            /* strictly "less than" on input, to re-enter the loop with at least one byte */
+          && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
+            /* Copy the literals */
+            memcpy(op, ip, endOnInput ? 16 : 8);
+            op += length; ip += length;
+
+            /* The second stage: prepare for match copying, decode full info.
+             * If it doesn't work out, the info won't be wasted. */
+            length = token & ML_MASK; /* match length */
+            offset = LZ4_readLE16(ip); ip += 2;
+            match = op - offset;
+
+            /* Do not deal with overlapping matches. */
+            if ( (length != ML_MASK)
+              && (offset >= 8)
+              && (dict==withPrefix64k || match >= lowPrefix) ) {
+                /* Copy the match. */
+                memcpy(op + 0, match + 0, 8);
+                memcpy(op + 8, match + 8, 8);
+                memcpy(op +16, match +16, 2);
+                op += length + MINMATCH;
+                /* Both stages worked, load the next token. */
                 continue;
             }
+
+            /* The second stage didn't work out, but the info is ready.
+             * Propel it right to the point of match copying. */
+            goto _copy_match;
         }
 
         /* decode literal length */
-        if ((length=(token>>ML_BITS)) == RUN_MASK) {
+        if (length == RUN_MASK) {
             unsigned s;
             if (unlikely(endOnInput ? ip >= iend-RUN_MASK : 0)) goto _output_error;   /* overflow detection */
             do {
@@ -1473,11 +1493,14 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
         /* get offset */
         offset = LZ4_readLE16(ip); ip+=2;
         match = op - offset;
-        if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error;   /* Error : offset outside buffers */
-        LZ4_write32(op, (U32)offset);   /* costs ~1%; silence an msan warning when offset==0 */
 
         /* get matchlength */
         length = token & ML_MASK;
+
+_copy_match:
+        if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error;   /* Error : offset outside buffers */
+        LZ4_write32(op, (U32)offset);   /* costs ~1%; silence an msan warning when offset==0 */
+
         if (length == ML_MASK) {
             unsigned s;
             do {
@@ -1664,12 +1687,11 @@ int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
     return 0;
 }
 
-/*!
- * LZ4_setStreamDecode() :
- * Use this function to instruct where to find the dictionary.
- * This function is not necessary if previous data is still available where it was decoded.
- * Loading a size of 0 is allowed (same effect as no dictionary).
- * Return : 1 if OK, 0 if error
+/*! LZ4_setStreamDecode() :
+ *  Use this function to instruct where to find the dictionary.
+ *  This function is not necessary if previous data is still available where it was decoded.
+ *  Loading a size of 0 is allowed (same effect as no dictionary).
+ * @return : 1 if OK, 0 if error
  */
 int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
 {
@@ -1681,6 +1703,25 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti
     return 1;
 }
 
+/*! LZ4_decoderRingBufferSize() :
+ *  Returns the minimum size of a ring buffer used for streaming decompression
+ *  (optional scenario), so that the buffer is compatible with any source
+ *  respecting the maxBlockSize condition.
+ *  Note : in a ring buffer scenario,
+ *  blocks are presumed decompressed next to each other.
+ *  When not enough space remains for next block (remainingSize < maxBlockSize),
+ *  decoding resumes from beginning of ring buffer.
+ * @return : minimum ring buffer size,
+ *           or 0 if there is an error (invalid maxBlockSize).
+ */
+int LZ4_decoderRingBufferSize(int maxBlockSize)
+{
+    if (maxBlockSize < 0) return 0;   /* invalid : negative block size */
+    if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0;   /* invalid : larger than LZ4_MAX_INPUT_SIZE */
+    if (maxBlockSize < 16) maxBlockSize = 16;   /* values below 16 are raised to 16 before sizing */
+    return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize);   /* same formula as the static-allocation macro */
+}
+
 /*
 *_continue() :
     These decoding functions allow decompression of multiple blocks in "streaming" mode.
diff --git a/lib/lz4.h b/lib/lz4.h
index 2745260..410f480 100644
--- a/lib/lz4.h
+++ b/lib/lz4.h
@@ -293,45 +293,62 @@ LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxD
 *  Streaming Decompression Functions
 *  Bufferless synchronous API
 ************************************************/
-typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* incomplete type (defined later) */
+typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* tracking context */
 
 /*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() :
- *  creation / destruction of streaming decompression tracking structure.
- *  A tracking structure can be re-used multiple times sequentially. */
+ *  creation / destruction of streaming decompression tracking context.
+ *  A tracking context can be re-used multiple times.
+ */
 LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
 LZ4LIB_API int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
 
 /*! LZ4_setStreamDecode() :
- *  An LZ4_streamDecode_t structure can be allocated once and re-used multiple times.
+ *  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
  *  Use this function to start decompression of a new stream of blocks.
  *  A dictionary can optionnally be set. Use NULL or size 0 for a reset order.
+ *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  * @return : 1 if OK, 0 if error
  */
 LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
 
+/*! LZ4_decoderRingBufferSize() : v1.8.2
+ *  Note : in a ring buffer scenario (optional),
+ *  blocks are presumed decompressed next to each other
+ *  up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
+ *  at which stage it resumes from beginning of ring buffer.
+ *  When setting such a ring buffer for streaming decompression,
+ *  provides the minimum size of this ring buffer
+ *  to be compatible with any source respecting maxBlockSize condition.
+ * @return : minimum ring buffer size,
+ *           or 0 if there is an error (invalid maxBlockSize).
+ */
+LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize);
+#define LZ4_DECODER_RING_BUFFER_SIZE(mbs) (65536 + 14 + (mbs))  /* for static allocation; mbs presumed valid */
+
 /*! LZ4_decompress_*_continue() :
  *  These decoding functions allow decompression of consecutive blocks in "streaming" mode.
  *  A block is an unsplittable entity, it must be presented entirely to a decompression function.
- *  Decompression functions only accept one block at a time.
+ *  Decompression functions only accept one block at a time.
  *  The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded.
- *  If less than 64KB of data has been decoded all the data must be present.
+ *  If less than 64KB of data has been decoded, all the data must be present.
  *
- *  Special : if application sets a ring buffer for decompression, it must respect one of the following conditions :
+ *  Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
+ *  - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
+ *    maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
+ *    In which case, encoding and decoding buffers do not need to be synchronized.
+ *    Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
+ *  - Synchronized mode :
+ *    Decompression buffer size is _exactly_ the same as compression buffer size,
+ *    and follows exactly same update rule (block boundaries at same positions),
+ *    and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
+ *    _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
  *  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
- *    maxBlockSize is the maximum size of any single block. It is implementation dependent, and can have any value (presumed > 16 bytes).
  *    In which case, encoding and decoding buffers do not need to be synchronized,
  *    and encoding ring buffer can have any size, including small ones ( < 64 KB).
- *  - Decompression buffer size is _at least_ 64 KB + 8 bytes + maxBlockSize.
- *    In which case, encoding and decoding buffers do not need to be synchronized,
- *    and encoding ring buffer can have any size, including larger than decoding buffer.
- *  - Decompression buffer size is exactly the same as compression buffer size,
- *    and follows exactly same update rule (block boundaries at same positions).
- *    If the decoding function is provided with the exact decompressed size of each block,
- *    then decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
- *    If the decoding function only knows the compressed size,
- *    then buffer size must be a minimum of 64 KB + 8 bytes + maxBlockSize.
- *  Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
- *  and indicate where it is saved using LZ4_setStreamDecode() before decompressing next block.
+ *
+ *  Whenever these conditions are not possible,
+ *  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+ *  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
 */
 LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity);
 LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize);
@@ -341,6 +358,7 @@ LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecod
  *  These decoding functions work the same as
  *  a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
  *  They are stand-alone, and don't need an LZ4_streamDecode_t structure.
+ *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  */
 LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize);
 LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize);
author	Yann Collet <Cyan4973@users.noreply.github.com>	2018-05-03 18:35:50 (GMT)
committer	GitHub <noreply@github.com>	2018-05-03 18:35:50 (GMT)
commit	95607a749b8bbe6f9323408ddd740ef4ff248794 (patch)
tree	18b0d16568a2f6b2e046db16362c215d2346ab9a /lib
parent	85be6b8f6d5a91a8834e913b5f547ded49f7f714 (diff)
parent	2e2c9f6ff353e9f1a4d23274eb4e5b7a5f7d654d (diff)
download	lz4-95607a749b8bbe6f9323408ddd740ef4ff248794.zip lz4-95607a749b8bbe6f9323408ddd740ef4ff248794.tar.gz lz4-95607a749b8bbe6f9323408ddd740ef4ff248794.tar.bz2