11 files changed, 347 insertions, 221 deletions
diff --git a/NEWS b/NEWS
index 90cafc6..0139e61 100644
--- a/NEWS
+++ b/NEWS
@@ -1,13 +1,13 @@
 v1.8.2
 perf: *much* faster dictionary compression on small files, by @felixhandte
+perf: improved decompression speed and binary size, by Alexey Tourbin (@svpv)
 perf: slightly faster HC compression and decompression speed
 perf: very small compression ratio improvement
-perf: improved decompression binary size and speed, by Alexey Tourbin (@svpv)
 fix : compression compatible with low memory addresses (< 0xFFFF)
 fix : decompression segfault when provided with NULL input, by @terrelln
 cli : new command --favor-decSpeed
 cli : benchmark mode more accurate for small inputs
-fullbench : can measure _destSize() variants, by @felixhandte
+fullbench : can bench _destSize() variants, by @felixhandte
 doc : clarified block format parsing restrictions, by Alexey Tourbin (@svpv)
 
 v1.8.1
diff --git a/README.md b/README.md
index 596fdac..406792a 100644
--- a/README.md
+++ b/README.md
@@ -43,33 +43,32 @@ Benchmarks
 -------------------------
 
 The benchmark uses [lzbench], from @inikep
-compiled with GCC v6.2.0 on Linux 64-bits.
-The reference system uses a Core i7-3930K CPU @ 4.5GHz.
+compiled with GCC v7.3.0 on Linux 64-bits (Debian 4.15.17-1).
+The reference system uses a Core i7-6700K CPU @ 4.0GHz.
 Benchmark evaluates the compression of reference [Silesia Corpus]
 in single-thread mode.
 
 [lzbench]: https://github.com/inikep/lzbench
 [Silesia Corpus]: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
 
-|  Compressor            | Ratio   | Compression | Decompression |
-|  ----------            | -----   | ----------- | ------------- |
-|  memcpy                |  1.000  | 7300 MB/s   |   7300 MB/s   |
-|**LZ4 fast 8  (v1.7.3)**|  1.799  |**911 MB/s** | **3360 MB/s** |
-|**LZ4 default (v1.7.3)**|**2.101**|**625 MB/s** | **3220 MB/s** |
-|  LZO 2.09              |  2.108  |  620 MB/s   |    845 MB/s   |
-|  QuickLZ 1.5.0         |  2.238  |  510 MB/s   |    600 MB/s   |
-|  Snappy 1.1.3          |  2.091  |  450 MB/s   |   1550 MB/s   |
-|  LZF v3.6              |  2.073  |  365 MB/s   |    820 MB/s   |
-|  [Zstandard] 1.1.1 -1  |  2.876  |  330 MB/s   |    930 MB/s   |
-|  [Zstandard] 1.1.1 -3  |  3.164  |  200 MB/s   |    810 MB/s   |
-| [zlib] deflate 1.2.8 -1|  2.730  |  100 MB/s   |    370 MB/s   |
-|**LZ4 HC -9 (v1.7.3)**  |**2.720**|   34 MB/s   | **3240 MB/s** |
-| [zlib] deflate 1.2.8 -6|  3.099  |   33 MB/s   |    390 MB/s   |
+|  Compressor             | Ratio   | Compression | Decompression |
+|  ----------             | -----   | ----------- | ------------- |
+|  memcpy                 |  1.000  |13100 MB/s   |  13100 MB/s   |
+|**LZ4 default (v1.8.2)** |**2.101**|**730 MB/s** | **3900 MB/s** |
+|  LZO 2.09               |  2.108  |  630 MB/s   |    800 MB/s   |
+|  QuickLZ 1.5.0          |  2.238  |  530 MB/s   |    720 MB/s   |
+|  Snappy 1.1.4           |  2.091  |  525 MB/s   |   1750 MB/s   |
+|  [Zstandard] 1.3.4 -1   |  2.877  |  470 MB/s   |   1380 MB/s   |
+|  LZF v3.6               |  2.073  |  380 MB/s   |    840 MB/s   |
+| [zlib] deflate 1.2.11 -1|  2.730  |  100 MB/s   |    380 MB/s   |
+|**LZ4 HC -9 (v1.8.2)**   |**2.721**|   40 MB/s   | **3920 MB/s** |
+| [zlib] deflate 1.2.11 -6|  3.099  |   34 MB/s   |    410 MB/s   |
 
 [zlib]: http://www.zlib.net/
 [Zstandard]: http://www.zstd.net/
 
-LZ4 is also compatible and well optimized for x32 mode, for which it provides an additional +10% speed performance.
+LZ4 is also compatible and well optimized for x32 mode,
+for which it provides some additional speed performance.
 
 
 Installation
diff --git a/doc/lz4_manual.html b/doc/lz4_manual.html
index ddd2724..e079db2 100644
--- a/doc/lz4_manual.html
+++ b/doc/lz4_manual.html
@@ -206,38 +206,59 @@ int           LZ4_freeStream (LZ4_stream_t* streamPtr);
 
 <pre><b>LZ4_streamDecode_t* LZ4_createStreamDecode(void);
 int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
-</b><p>  creation / destruction of streaming decompression tracking structure.
-  A tracking structure can be re-used multiple times sequentially. 
+</b><p>  creation / destruction of streaming decompression tracking context.
+  A tracking context can be re-used multiple times.
+ 
 </p></pre><BR>
 
 <pre><b>int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
-</b><p>  An LZ4_streamDecode_t structure can be allocated once and re-used multiple times.
+</b><p>  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
   Use this function to start decompression of a new stream of blocks.
   A dictionary can optionnally be set. Use NULL or size 0 for a reset order.
+  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  @return : 1 if OK, 0 if error
  
 </p></pre><BR>
 
+<pre><b>int LZ4_decoderRingBufferSize(int maxBlockSize);
+#define LZ4_DECODER_RING_BUFFER_SIZE(mbs) (65536 + 14 + (mbs))  </b>/* for static allocation; mbs presumed valid */<b>
+</b><p>  Note : in a ring buffer scenario (optional),
+  blocks are presumed decompressed next to each other
+  up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
+  at which stage it resumes from beginning of ring buffer.
+  When setting such a ring buffer for streaming decompression,
+  provides the minimum size of this ring buffer
+  to be compatible with any source respecting maxBlockSize condition.
+ @return : minimum ring buffer size,
+           or 0 if there is an error (invalid maxBlockSize).
+ 
+</p></pre><BR>
+
 <pre><b>int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity);
 int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize);
 </b><p>  These decoding functions allow decompression of consecutive blocks in "streaming" mode.
   A block is an unsplittable entity, it must be presented entirely to a decompression function.
-  Decompression functions only accept one block at a time.
+  Decompression functions only accepts one block at a time.
   The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded.
-  If less than 64KB of data has been decoded all the data must be present.
-
-  Special : if application sets a ring buffer for decompression, it must respect one of the following conditions :
-  - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
-    In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
-  - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
-    maxBlockSize is implementation dependent. It's the maximum size of any single block.
+  If less than 64KB of data has been decoded, all the data must be present.
+
+  Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
+  - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
+    maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
+    In which case, encoding and decoding buffers do not need to be synchronized.
+    Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
+  - Synchronized mode :
+    Decompression buffer size is _exactly_ the same as compression buffer size,
+    and follows exactly same update rule (block boundaries at same positions),
+    and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
+    _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
+  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
     In which case, encoding and decoding buffers do not need to be synchronized,
     and encoding ring buffer can have any size, including small ones ( < 64 KB).
-  - _At least_ 64 KB + 8 bytes + maxBlockSize.
-    In which case, encoding and decoding buffers do not need to be synchronized,
-    and encoding ring buffer can have any size, including larger than decoding buffer.
-  Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
-  and indicate where it is saved using LZ4_setStreamDecode() before decompressing next block.
+
+  Whenever these conditions are not possible,
+  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
 </p></pre><BR>
 
 <pre><b>int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize);
@@ -245,6 +266,7 @@ int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize,
 </b><p>  These decoding functions work the same as
   a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
   They are stand-alone, and don't need an LZ4_streamDecode_t structure.
+  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  
 </p></pre><BR>
 
diff --git a/lib/lz4.c b/lib/lz4.c
index 1e0d8e9..3860c51 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1398,6 +1398,9 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
     const int safeDecode = (endOnInput==endOnInputSize);
     const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
 
+    /* Set up the "end" pointers for the shortcut. */
+    const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+    const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
 
     /* Special cases */
     if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT;                      /* targetOutputSize too high => just decode everything */
@@ -1407,39 +1410,56 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
 
     /* Main Loop : decode sequences */
     while (1) {
-        size_t length;
         const BYTE* match;
         size_t offset;
 
         unsigned const token = *ip++;
+        size_t length = token >> ML_BITS; /* literal length */
 
         assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
-        /* shortcut for common case :
-         * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes).
-         * this shortcut was tested on x86 and x64, where it improves decoding speed.
-         * it has not yet been benchmarked on ARM, Power, mips, etc.
-         * NOTE: The loop begins with a read, so we must have one byte left at the end. */
-        if (endOnInput
-          && ((ip + 14 /*maxLL*/ + 2 /*offset*/ < iend)
-            & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend)
-            & (token < (15<<ML_BITS))
-            & ((token & ML_MASK) != 15) ) ) {
-            size_t const ll = token >> ML_BITS;
-            size_t const off = LZ4_readLE16(ip+ll);
-            const BYTE* const matchPtr = op + ll - off;  /* pointer underflow risk ? */
-            if ((off >= 8) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
-                size_t const ml = (token & ML_MASK) + MINMATCH;
-                memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
-                memcpy(op + 0, matchPtr + 0, 8);
-                memcpy(op + 8, matchPtr + 8, 8);
-                memcpy(op +16, matchPtr +16, 2);
-                op += ml;
+
+        /* A two-stage shortcut for the most common case:
+         * 1) If the literal length is 0..14, and there is enough space,
+         * enter the shortcut and copy 16 bytes on behalf of the literals
+         * (in the fast mode, only 8 bytes can be safely copied this way).
+         * 2) Further if the match length is 4..18, copy 18 bytes in a similar
+         * manner; but we ensure that there's enough space in the output for
+         * those 18 bytes earlier, upon entering the shortcut (in other words,
+         * there is a combined check for both stages).
+         */
+        if ( (endOnInput ? length != RUN_MASK : length <= 8)
+            /* strictly "less than" on input, to re-enter the loop with at least one byte */
+          && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
+            /* Copy the literals */
+            memcpy(op, ip, endOnInput ? 16 : 8);
+            op += length; ip += length;
+
+            /* The second stage: prepare for match copying, decode full info.
+             * If it doesn't work out, the info won't be wasted. */
+            length = token & ML_MASK; /* match length */
+            offset = LZ4_readLE16(ip); ip += 2;
+            match = op - offset;
+
+            /* Do not deal with overlapping matches. */
+            if ( (length != ML_MASK)
+              && (offset >= 8)
+              && (dict==withPrefix64k || match >= lowPrefix) ) {
+                /* Copy the match. */
+                memcpy(op + 0, match + 0, 8);
+                memcpy(op + 8, match + 8, 8);
+                memcpy(op +16, match +16, 2);
+                op += length + MINMATCH;
+                /* Both stages worked, load the next token. */
                 continue;
             }
+
+            /* The second stage didn't work out, but the info is ready.
+             * Propel it right to the point of match copying. */
+            goto _copy_match;
         }
 
         /* decode literal length */
-        if ((length=(token>>ML_BITS)) == RUN_MASK) {
+        if (length == RUN_MASK) {
             unsigned s;
             if (unlikely(endOnInput ? ip >= iend-RUN_MASK : 0)) goto _output_error;   /* overflow detection */
             do {
@@ -1473,11 +1493,14 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
         /* get offset */
         offset = LZ4_readLE16(ip); ip+=2;
         match = op - offset;
-        if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error;   /* Error : offset outside buffers */
-        LZ4_write32(op, (U32)offset);   /* costs ~1%; silence an msan warning when offset==0 */
 
         /* get matchlength */
         length = token & ML_MASK;
+
+_copy_match:
+        if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error;   /* Error : offset outside buffers */
+        LZ4_write32(op, (U32)offset);   /* costs ~1%; silence an msan warning when offset==0 */
+
         if (length == ML_MASK) {
             unsigned s;
             do {
@@ -1664,12 +1687,11 @@ int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
     return 0;
 }
 
-/*!
- * LZ4_setStreamDecode() :
- * Use this function to instruct where to find the dictionary.
- * This function is not necessary if previous data is still available where it was decoded.
- * Loading a size of 0 is allowed (same effect as no dictionary).
- * Return : 1 if OK, 0 if error
+/*! LZ4_setStreamDecode() :
+ *  Use this function to instruct where to find the dictionary.
+ *  This function is not necessary if previous data is still available where it was decoded.
+ *  Loading a size of 0 is allowed (same effect as no dictionary).
+ * @return : 1 if OK, 0 if error
  */
 int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
 {
@@ -1681,6 +1703,25 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti
     return 1;
 }
 
+/*! LZ4_decoderRingBufferSize() :
+ *  when setting a ring buffer for streaming decompression (optional scenario),
+ *  provides the minimum size of this ring buffer
+ *  to be compatible with any source respecting maxBlockSize condition.
+ *  Note : in a ring buffer scenario,
+ *  blocks are presumed decompressed next to each other.
+ *  When not enough space remains for next block (remainingSize < maxBlockSize),
+ *  decoding resumes from beginning of ring buffer.
+ * @return : minimum ring buffer size,
+ *           or 0 if there is an error (invalid maxBlockSize).
+ */
+int LZ4_decoderRingBufferSize(int maxBlockSize)
+{
+    if (maxBlockSize < 0) return 0;
+    if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0;
+    if (maxBlockSize < 16) maxBlockSize = 16;
+    return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize);
+}
+
 /*
 *_continue() :
     These decoding functions allow decompression of multiple blocks in "streaming" mode.
diff --git a/lib/lz4.h b/lib/lz4.h
index d8238f5..7d13122 100644
--- a/lib/lz4.h
+++ b/lib/lz4.h
@@ -293,41 +293,62 @@ LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxD
 *  Streaming Decompression Functions
 *  Bufferless synchronous API
 ************************************************/
-typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* incomplete type (defined later) */
+typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* tracking context */
 
 /*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() :
- *  creation / destruction of streaming decompression tracking structure.
- *  A tracking structure can be re-used multiple times sequentially. */
+ *  creation / destruction of streaming decompression tracking context.
+ *  A tracking context can be re-used multiple times.
+ */
 LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
 LZ4LIB_API int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
 
 /*! LZ4_setStreamDecode() :
- *  An LZ4_streamDecode_t structure can be allocated once and re-used multiple times.
+ *  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
  *  Use this function to start decompression of a new stream of blocks.
  *  A dictionary can optionnally be set. Use NULL or size 0 for a reset order.
+ *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  * @return : 1 if OK, 0 if error
  */
 LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
 
+/*! LZ4_decoderRingBufferSize() : v1.8.2
+ *  Note : in a ring buffer scenario (optional),
+ *  blocks are presumed decompressed next to each other
+ *  up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
+ *  at which stage it resumes from beginning of ring buffer.
+ *  When setting such a ring buffer for streaming decompression,
+ *  provides the minimum size of this ring buffer
+ *  to be compatible with any source respecting maxBlockSize condition.
+ * @return : minimum ring buffer size,
+ *           or 0 if there is an error (invalid maxBlockSize).
+ */
+LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize);
+#define LZ4_DECODER_RING_BUFFER_SIZE(mbs) (65536 + 14 + (mbs))  /* for static allocation; mbs presumed valid */
+
 /*! LZ4_decompress_*_continue() :
  *  These decoding functions allow decompression of consecutive blocks in "streaming" mode.
  *  A block is an unsplittable entity, it must be presented entirely to a decompression function.
- *  Decompression functions only accept one block at a time.
+ *  Decompression functions only accepts one block at a time.
  *  The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded.
- *  If less than 64KB of data has been decoded all the data must be present.
+ *  If less than 64KB of data has been decoded, all the data must be present.
  *
- *  Special : if application sets a ring buffer for decompression, it must respect one of the following conditions :
- *  - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
- *    In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
- *  - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
- *    maxBlockSize is implementation dependent. It's the maximum size of any single block.
+ *  Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
+ *  - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
+ *    maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
+ *    In which case, encoding and decoding buffers do not need to be synchronized.
+ *    Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
+ *  - Synchronized mode :
+ *    Decompression buffer size is _exactly_ the same as compression buffer size,
+ *    and follows exactly same update rule (block boundaries at same positions),
+ *    and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
+ *    _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
+ *  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
  *    In which case, encoding and decoding buffers do not need to be synchronized,
  *    and encoding ring buffer can have any size, including small ones ( < 64 KB).
- *  - _At least_ 64 KB + 8 bytes + maxBlockSize.
- *    In which case, encoding and decoding buffers do not need to be synchronized,
- *    and encoding ring buffer can have any size, including larger than decoding buffer.
- *  Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
- *  and indicate where it is saved using LZ4_setStreamDecode() before decompressing next block.
+ *
+ *  Whenever these conditions are not possible,
+ *  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+ *  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
 */
 LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity);
 LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize);
@@ -337,6 +358,7 @@ LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecod
  *  These decoding functions work the same as
  *  a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
  *  They are stand-alone, and don't need an LZ4_streamDecode_t structure.
+ *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
  */
 LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize);
 LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize);
diff --git a/lib/lz4frame.c b/lib/lz4frame.c
index 91e5a43..aa7889b 100644
--- a/lib/lz4frame.c
+++ b/lib/lz4frame.c
@@ -96,6 +96,19 @@ You can contact the author at :
 
 #define LZ4F_STATIC_ASSERT(c)    { enum { LZ4F_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 
+#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) && !defined(DEBUGLOG)
+#  include <stdio.h>
+static int g_debuglog_enable = 1;
+#  define DEBUGLOG(l, ...) {                                  \
+                if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) {  \
+                    fprintf(stderr, __FILE__ ": ");           \
+                    fprintf(stderr, __VA_ARGS__);             \
+                    fprintf(stderr, " \n");                   \
+            }   }
+#else
+#  define DEBUGLOG(l, ...)      {}    /* disabled */
+#endif
+
 
 /*-************************************
 *  Basic Types
@@ -408,6 +421,7 @@ size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity,
     LZ4_stream_t lz4ctx;
     LZ4F_cctx_t *cctxPtr = &cctx;
 
+    DEBUGLOG(4, "LZ4F_compressFrame");
     MEM_INIT(&cctx, 0, sizeof(cctx));
     cctx.version = LZ4F_VERSION;
     cctx.maxBufferSize = 5 MB;   /* mess with real buffer size to prevent dynamic allocation; works only because autoflush==1 & stableSrc==1 */
@@ -1207,24 +1221,31 @@ LZ4F_errorCode_t LZ4F_getFrameInfo(LZ4F_dctx* dctx, LZ4F_frameInfo_t* frameInfoP
 
 /* LZ4F_updateDict() :
  * only used for LZ4F_blockLinked mode */
-static void LZ4F_updateDict(LZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize, const BYTE* dstPtr0, unsigned withinTmp)
+static void LZ4F_updateDict(LZ4F_dctx* dctx,
+                      const BYTE* dstPtr, size_t dstSize, const BYTE* dstBufferStart,
+                      unsigned withinTmp)
 {
     if (dctx->dictSize==0)
         dctx->dict = (const BYTE*)dstPtr;   /* priority to dictionary continuity */
 
-    if (dctx->dict + dctx->dictSize == dstPtr) {  /* dictionary continuity */
+    if (dctx->dict + dctx->dictSize == dstPtr) {  /* dictionary continuity, directly within dstBuffer */
         dctx->dictSize += dstSize;
         return;
     }
 
-    if (dstPtr - dstPtr0 + dstSize >= 64 KB) {  /* dstBuffer large enough to become dictionary */
-        dctx->dict = (const BYTE*)dstPtr0;
-        dctx->dictSize = dstPtr - dstPtr0 + dstSize;
+    if (dstPtr - dstBufferStart + dstSize >= 64 KB) {  /* history in dstBuffer becomes large enough to become dictionary */
+        dctx->dict = (const BYTE*)dstBufferStart;
+        dctx->dictSize = dstPtr - dstBufferStart + dstSize;
         return;
     }
 
-    if ((withinTmp) && (dctx->dict == dctx->tmpOutBuffer)) {
-        /* assumption : dctx->dict + dctx->dictSize == dctx->tmpOut + dctx->tmpOutStart */
+    assert(dstSize < 64 KB);   /* if dstSize >= 64 KB, dictionary would be set into dstBuffer directly */
+
+    /* dstBuffer does not contain whole useful history (64 KB), so it must be saved within tmpOut */
+
+    if ((withinTmp) && (dctx->dict == dctx->tmpOutBuffer)) {   /* continue history within tmpOutBuffer */
+        /* withinTmp expectation : content of [dstPtr,dstSize] is same as [dict+dictSize,dstSize], so we just extend it */
+        assert(dctx->dict + dctx->dictSize == dctx->tmpOut + dctx->tmpOutStart);
         dctx->dictSize += dstSize;
         return;
     }
@@ -1245,7 +1266,7 @@ static void LZ4F_updateDict(LZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize,
 
     if (dctx->dict == dctx->tmpOutBuffer) {    /* copy dst into tmp to complete dict */
         if (dctx->dictSize + dstSize > dctx->maxBufferSize) {  /* tmp buffer not large enough */
-            size_t const preserveSize = 64 KB - dstSize;   /* note : dstSize < 64 KB */
+            size_t const preserveSize = 64 KB - dstSize;
             memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize);
             dctx->dictSize = preserveSize;
         }
@@ -1255,7 +1276,7 @@ static void LZ4F_updateDict(LZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize,
     }
 
     /* join dict & dest into tmp */
-    {   size_t preserveSize = 64 KB - dstSize;   /* note : dstSize < 64 KB */
+    {   size_t preserveSize = 64 KB - dstSize;
         if (preserveSize > dctx->dictSize) preserveSize = dctx->dictSize;
         memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize);
         memcpy(dctx->tmpOutBuffer + preserveSize, dstPtr, dstSize);
@@ -1322,7 +1343,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
             }
             dctx->tmpInSize = 0;
             if (srcEnd-srcPtr == 0) return minFHSize;   /* 0-size input */
-            dctx->tmpInTarget = minFHSize;   /* minimum to attempt decode */
+            dctx->tmpInTarget = minFHSize;   /* minimum size to decode header */
             dctx->dStage = dstage_storeFrameHeader;
             /* fall-through */
 
@@ -1479,8 +1500,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                     U32 const calcCRC = XXH32_digest(&dctx->blockChecksum);
                     if (readCRC != calcCRC)
                         return err0r(LZ4F_ERROR_blockChecksum_invalid);
-                }
-            }
+            }   }
             dctx->dStage = dstage_getBlockHeader;  /* new block */
             break;
 
@@ -1521,13 +1541,13 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
             }   }
 
             if ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) {
-                const char *dict = (const char *)dctx->dict;
+                const char* dict = (const char*)dctx->dict;
                 size_t dictSize = dctx->dictSize;
                 int decodedSize;
                 if (dict && dictSize > 1 GB) {
                     /* the dictSize param is an int, avoid truncation / sign issues */
-                    dict += dictSize - 1 GB;
-                    dictSize = 1 GB;
+                    dict += dictSize - 64 KB;
+                    dictSize = 64 KB;
                 }
                 /* enough capacity in `dst` to decompress directly there */
                 decodedSize = LZ4_decompress_safe_usingDict(
@@ -1561,18 +1581,16 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 } else {  /* dict not within tmp */
                     size_t const reservedDictSpace = MIN(dctx->dictSize, 64 KB);
                     dctx->tmpOut = dctx->tmpOutBuffer + reservedDictSpace;
-                }
-            }
+            }   }
 
             /* Decode block */
-            {
-                const char *dict = (const char *)dctx->dict;
+            {   const char* dict = (const char*)dctx->dict;
                 size_t dictSize = dctx->dictSize;
                 int decodedSize;
                 if (dict && dictSize > 1 GB) {
                     /* the dictSize param is an int, avoid truncation / sign issues */
-                    dict += dictSize - 1 GB;
-                    dictSize = 1 GB;
+                    dict += dictSize - 64 KB;
+                    dictSize = 64 KB;
                 }
                 decodedSize = LZ4_decompress_safe_usingDict(
                         (const char*)selectedIn, (char*)dctx->tmpOut,
@@ -1595,8 +1613,8 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 memcpy(dstPtr, dctx->tmpOut + dctx->tmpOutStart, sizeToCopy);
 
                 /* dictionary management */
-                if (dctx->frameInfo.blockMode==LZ4F_blockLinked)
-                    LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 1);
+                if (dctx->frameInfo.blockMode == LZ4F_blockLinked)
+                    LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 1 /*withinTmp*/);
 
                 dctx->tmpOutStart += sizeToCopy;
                 dstPtr += sizeToCopy;
@@ -1605,8 +1623,9 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                     dctx->dStage = dstage_getBlockHeader;  /* get next block */
                     break;
                 }
+                /* could not flush everything : stop there, just request a block header */
+                doAnotherStage = 0;
                 nextSrcSizeHint = BHSize;
-                doAnotherStage = 0;   /* still some data to flush */
                 break;
             }
 
@@ -1643,7 +1662,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 selectedIn = dctx->tmpIn;
             }   /* if (dctx->dStage == dstage_storeSuffix) */
 
-        /* case dstage_checkSuffix: */   /* no direct call, avoid scan-build warning */
+        /* case dstage_checkSuffix: */   /* no direct entry, avoid initialization risks */
             {   U32 const readCRC = LZ4F_readLE32(selectedIn);
                 U32 const resultCRC = XXH32_digest(&(dctx->xxh));
                 if (readCRC != resultCRC)
@@ -1667,8 +1686,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
 
             if (dctx->dStage == dstage_storeSFrameSize)
         case dstage_storeSFrameSize:
-            {
-                size_t const sizeToCopy = MIN(dctx->tmpInTarget - dctx->tmpInSize,
+            {   size_t const sizeToCopy = MIN(dctx->tmpInTarget - dctx->tmpInSize,
                                              (size_t)(srcEnd - srcPtr) );
                 memcpy(dctx->header + dctx->tmpInSize, srcPtr, sizeToCopy);
                 srcPtr += sizeToCopy;
@@ -1682,7 +1700,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 selectedIn = dctx->header + 4;
             }   /* if (dctx->dStage == dstage_storeSFrameSize) */
 
-        /* case dstage_decodeSFrameSize: */   /* no direct access */
+        /* case dstage_decodeSFrameSize: */   /* no direct entry */
             {   size_t const SFrameSize = LZ4F_readLE32(selectedIn);
                 dctx->frameInfo.contentSize = SFrameSize;
                 dctx->tmpInTarget = SFrameSize;
@@ -1701,7 +1719,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 LZ4F_resetDecompressionContext(dctx);
                 break;
             }
-        }
+        }   /* switch (dctx->dStage) */
     }   /* while (doAnotherStage) */
 
     /* preserve history within tmp whenever necessary */
diff --git a/lib/lz4hc.c b/lib/lz4hc.c
index 948d66d..0859ea6 100644
--- a/lib/lz4hc.c
+++ b/lib/lz4hc.c
@@ -138,6 +138,7 @@ int LZ4HC_countBack(const BYTE* const ip, const BYTE* const match,
 {
     int back = 0;
     int const min = (int)MAX(iMin - ip, mMin - match);
+    assert(min <= 0);
     assert(ip >= iMin); assert((size_t)(ip-iMin) < (1U<<31));
     assert(match >= mMin); assert((size_t)(match - mMin) < (1U<<31));
     while ( (back > min)
@@ -222,9 +223,9 @@ LZ4HC_InsertAndGetWiderMatch (
     const U32 dictLimit = hc4->dictLimit;
     const BYTE* const lowPrefixPtr = base + dictLimit;
     const U32 ipIndex = (U32)(ip - base);
-    const U32 lowLimit = (hc4->lowLimit + 64 KB > ipIndex) ? hc4->lowLimit : ipIndex - MAX_DISTANCE;
+    const U32 lowestMatchIndex = (hc4->lowLimit + 64 KB > ipIndex) ? hc4->lowLimit : ipIndex - MAX_DISTANCE;
     const BYTE* const dictBase = hc4->dictBase;
-    int const delta = (int)(ip-iLowLimit);
+    int const lookBackLength = (int)(ip-iLowLimit);
     int nbAttempts = maxNbAttempts;
     U32 const pattern = LZ4_read32(ip);
     U32 matchIndex;
@@ -236,10 +237,10 @@ LZ4HC_InsertAndGetWiderMatch (
     /* First Match */
     LZ4HC_Insert(hc4, ip);
     matchIndex = HashTable[LZ4HC_hashPtr(ip)];
-    DEBUGLOG(7, "First match at index %u / %u (lowLimit)",
-                matchIndex, lowLimit);
+    DEBUGLOG(7, "First match at index %u / %u (lowestMatchIndex)",
+                matchIndex, lowestMatchIndex);
 
-    while ((matchIndex>=lowLimit) && (nbAttempts)) {
+    while ((matchIndex>=lowestMatchIndex) && (nbAttempts)) {
         DEBUGLOG(7, "remaining attempts : %i", nbAttempts);
         nbAttempts--;
         assert(matchIndex < ipIndex);
@@ -247,34 +248,35 @@ LZ4HC_InsertAndGetWiderMatch (
             /* do nothing */
         } else if (matchIndex >= dictLimit) {
             const BYTE* const matchPtr = base + matchIndex;
+            assert(matchPtr >= lowPrefixPtr);
+            assert(matchPtr < ip);
             assert(longest >= 1);
-            if (LZ4_read16(iLowLimit + longest - 1) == LZ4_read16(matchPtr - delta + longest - 1)) {
+            if (LZ4_read16(iLowLimit + longest - 1) == LZ4_read16(matchPtr - lookBackLength + longest - 1)) {
                 if (LZ4_read32(matchPtr) == pattern) {
                     int mlt = MINMATCH + LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit);
-                    int const back = delta ? LZ4HC_countBack(ip, matchPtr, iLowLimit, lowPrefixPtr) : 0;
+                    int const back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, lowPrefixPtr) : 0;
                     mlt -= back;
-
                     if (mlt > longest) {
                         longest = mlt;
                         *matchpos = matchPtr+back;
                         *startpos = ip+back;
-                }   }
-            }
+            }   }   }
         } else {   /* matchIndex < dictLimit */
             const BYTE* const matchPtr = dictBase + matchIndex;
             if (LZ4_read32(matchPtr) == pattern) {
+                const BYTE* const dictStart = dictBase + hc4->lowLimit;
                 int mlt;
                 int back = 0;
                 const BYTE* vLimit = ip + (dictLimit - matchIndex);
                 if (vLimit > iHighLimit) vLimit = iHighLimit;
                 mlt = LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH;
                 if ((ip+mlt == vLimit) && (vLimit < iHighLimit))
-                    mlt += LZ4_count(ip+mlt, base+dictLimit, iHighLimit);
-                back = delta ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictBase+lowLimit) : 0;
+                    mlt += LZ4_count(ip+mlt, lowPrefixPtr, iHighLimit);
+                back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictStart) : 0;
                 mlt -= back;
                 if (mlt > longest) {
                     longest = mlt;
-                    *matchpos = base + matchIndex + back;
+                    *matchpos = base + matchIndex + back;   /* virtual pos, relative to ip, to retrieve offset */
                     *startpos = ip + back;
         }   }   }
 
@@ -306,13 +308,13 @@ LZ4HC_InsertAndGetWiderMatch (
                             matchIndex -= (U32)backLength;   /* let's go to farthest segment position, will find a match of length currentSegmentLength + maybe some back */
                         }
         }   }   }   }
-    }  /* while ((matchIndex>=lowLimit) && (nbAttempts)) */
+    }  /* while ((matchIndex>=lowestMatchIndex) && (nbAttempts)) */
 
-    if (dict == usingDictCtx && nbAttempts && ipIndex - lowLimit < MAX_DISTANCE) {
+    if (dict == usingDictCtx && nbAttempts && ipIndex - lowestMatchIndex < MAX_DISTANCE) {
         size_t const dictEndOffset = dictCtx->end - dictCtx->base;
         assert(dictEndOffset <= 1 GB);
         dictMatchIndex = dictCtx->hashTable[LZ4HC_hashPtr(ip)];
-        matchIndex = dictMatchIndex + lowLimit - (U32)dictEndOffset;
+        matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset;
         while (ipIndex - matchIndex <= MAX_DISTANCE && nbAttempts--) {
             const BYTE* const matchPtr = dictCtx->base + dictMatchIndex;
 
@@ -322,7 +324,7 @@ LZ4HC_InsertAndGetWiderMatch (
                 const BYTE* vLimit = ip + (dictEndOffset - dictMatchIndex);
                 if (vLimit > iHighLimit) vLimit = iHighLimit;
                 mlt = LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH;
-                back = delta ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->base + dictCtx->dictLimit) : 0;
+                back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->base + dictCtx->dictLimit) : 0;
                 mlt -= back;
                 if (mlt > longest) {
                     longest = mlt;
@@ -459,14 +461,14 @@ LZ4_FORCE_INLINE int LZ4HC_compress_hashChain (
     BYTE* op = (BYTE*) dest;
     BYTE* oend = op + maxOutputSize;
 
-    int   ml, ml2, ml3, ml0;
+    int   ml0, ml, ml2, ml3;
+    const BYTE* start0;
+    const BYTE* ref0;
     const BYTE* ref = NULL;
     const BYTE* start2 = NULL;
     const BYTE* ref2 = NULL;
     const BYTE* start3 = NULL;
     const BYTE* ref3 = NULL;
-    const BYTE* start0;
-    const BYTE* ref0;
 
     /* init */
     *srcSizePtr = 0;
@@ -479,31 +481,27 @@ LZ4_FORCE_INLINE int LZ4HC_compress_hashChain (
         if (ml<MINMATCH) { ip++; continue; }
 
         /* saved, in case we would skip too much */
-        start0 = ip;
-        ref0 = ref;
-        ml0 = ml;
+        start0 = ip; ref0 = ref; ml0 = ml;
 
 _Search2:
-        if (ip+ml <= mflimit)
+        if (ip+ml <= mflimit) {
             ml2 = LZ4HC_InsertAndGetWiderMatch(ctx,
                             ip + ml - 2, ip + 0, matchlimit, ml, &ref2, &start2,
                             maxNbAttempts, patternAnalysis, dict, favorCompressionRatio);
-        else
+        } else {
             ml2 = ml;
+        }
 
-        if (ml2 == ml) { /* No better match */
+        if (ml2 == ml) { /* No better match => encode ML1 */
             optr = op;
             if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) goto _dest_overflow;
             continue;
         }
 
-        if (start0 < ip) {
-            if (start2 < ip + ml0) {  /* empirical */
-                ip = start0;
-                ref = ref0;
-                ml = ml0;
-            }
-        }
+        if (start0 < ip) {   /* first match was skipped at least once */
+            if (start2 < ip + ml0) {  /* squeezing ML1 between ML0(original ML1) and ML2 */
+                ip = start0; ref = ref0; ml = ml0;  /* restore initial ML1 */
+        }   }
 
         /* Here, start0==ip */
         if ((start2 - ip) < 3) {  /* First Match too small : removed */
@@ -531,14 +529,15 @@ _Search3:
         }
         /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */
 
-        if (start2 + ml2 <= mflimit)
+        if (start2 + ml2 <= mflimit) {
             ml3 = LZ4HC_InsertAndGetWiderMatch(ctx,
                             start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3,
                             maxNbAttempts, patternAnalysis, dict, favorCompressionRatio);
-        else
+        } else {
             ml3 = ml2;
+        }
 
-        if (ml3 == ml2) {  /* No better match : 2 sequences to encode */
+        if (ml3 == ml2) {  /* No better match => encode ML1 and ML2 */
             /* ip & ref are known; Now for ml */
             if (start2 < ip+ml)  ml = (int)(start2 - ip);
             /* Now, encode 2 sequences */
@@ -583,11 +582,12 @@ _Search3:
         }
 
         /*
-        * OK, now we have 3 ascending matches; let's write at least the first one
-        * ip & ref are known; Now for ml
+        * OK, now we have 3 ascending matches;
+        * let's write the first one ML1.
+        * ip & ref are known; Now decide ml.
         */
         if (start2 < ip+ml) {
-            if ((start2 - ip) < (int)ML_MASK) {
+            if ((start2 - ip) < OPTIMAL_ML) {
                 int correction;
                 if (ml > OPTIMAL_ML) ml = OPTIMAL_ML;
                 if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH;
@@ -604,14 +604,13 @@ _Search3:
         optr = op;
         if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) goto _dest_overflow;
 
-        ip = start2;
-        ref = ref2;
-        ml = ml2;
+        /* ML2 becomes ML1 */
+        ip = start2; ref = ref2; ml = ml2;
 
-        start2 = start3;
-        ref2 = ref3;
-        ml2 = ml3;
+        /* ML3 becomes ML2 */
+        start2 = start3; ref2 = ref3; ml2 = ml3;
 
+        /* let's find a new ML3 */
         goto _Search3;
     }
 
@@ -682,19 +681,19 @@ LZ4_FORCE_INLINE int LZ4HC_compress_generic_internal (
         U32 targetLength;
     } cParams_t;
     static const cParams_t clTable[LZ4HC_CLEVEL_MAX+1] = {
-        { lz4hc,    2, 16 },  /* 0, unused */
-        { lz4hc,    2, 16 },  /* 1, unused */
-        { lz4hc,    2, 16 },  /* 2, unused */
-        { lz4hc,    4, 16 },  /* 3 */
-        { lz4hc,    8, 16 },  /* 4 */
-        { lz4hc,   16, 16 },  /* 5 */
-        { lz4hc,   32, 16 },  /* 6 */
-        { lz4hc,   64, 16 },  /* 7 */
-        { lz4hc,  128, 16 },  /* 8 */
-        { lz4hc,  256, 16 },  /* 9 */
-        { lz4opt,  96, 64 },  /*10==LZ4HC_CLEVEL_OPT_MIN*/
-        { lz4opt, 512,128 },  /*11 */
-        { lz4opt,8192, LZ4_OPT_NUM },  /* 12==LZ4HC_CLEVEL_MAX */
+        { lz4hc,     2, 16 },  /* 0, unused */
+        { lz4hc,     2, 16 },  /* 1, unused */
+        { lz4hc,     2, 16 },  /* 2, unused */
+        { lz4hc,     4, 16 },  /* 3 */
+        { lz4hc,     8, 16 },  /* 4 */
+        { lz4hc,    16, 16 },  /* 5 */
+        { lz4hc,    32, 16 },  /* 6 */
+        { lz4hc,    64, 16 },  /* 7 */
+        { lz4hc,   128, 16 },  /* 8 */
+        { lz4hc,   256, 16 },  /* 9 */
+        { lz4opt,   96, 64 },  /*10==LZ4HC_CLEVEL_OPT_MIN*/
+        { lz4opt,  512,128 },  /*11 */
+        { lz4opt,16384,LZ4_OPT_NUM },  /* 12==LZ4HC_CLEVEL_MAX */
     };
 
     DEBUGLOG(4, "LZ4HC_compress_generic(%p, %p, %d)", ctx, src, *srcSizePtr);
@@ -705,8 +704,6 @@ LZ4_FORCE_INLINE int LZ4HC_compress_generic_internal (
     ctx->end += *srcSizePtr;
     if (cLevel < 1) cLevel = LZ4HC_CLEVEL_DEFAULT;   /* note : convention is different from lz4frame, maybe something to review */
     cLevel = MIN(LZ4HC_CLEVEL_MAX, cLevel);
-    assert(cLevel >= 0);
-    assert(cLevel <= LZ4HC_CLEVEL_MAX);
     {   cParams_t const cParam = clTable[cLevel];
         HCfavor_e const favor = ctx->favorDecSpeed ? favorDecompressionSpeed : favorCompressionRatio;
         if (cParam.strat == lz4hc)
@@ -905,7 +902,8 @@ void LZ4_attach_HC_dictionary(LZ4_streamHC_t *working_stream, const LZ4_streamHC
 static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock)
 {
     DEBUGLOG(4, "LZ4HC_setExternalDict(%p, %p)", ctxPtr, newBlock);
-    if (ctxPtr->end >= ctxPtr->base + 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3);   /* Referencing remaining dictionary content */
+    if (ctxPtr->end >= ctxPtr->base + ctxPtr->dictLimit + 4)
+        LZ4HC_Insert (ctxPtr, ctxPtr->end-3);   /* Referencing remaining dictionary content */
 
     /* Only one memory segment for extDict, so any previous extDict is lost at this stage */
     ctxPtr->lowLimit  = ctxPtr->dictLimit;
diff --git a/programs/util.h b/programs/util.h
index ef6ca77..d74db0d 100644
--- a/programs/util.h
+++ b/programs/util.h
@@ -194,7 +194,7 @@ extern "C" {
         return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom);
     }
 
-#elif (PLATFORM_POSIX_VERSION >= 200112L) && (defined __UCLIBC__ || ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) || __GLIBC__ > 2))
+#elif (PLATFORM_POSIX_VERSION >= 200112L) && (defined __UCLIBC__ || (defined(__GLIBC__) && ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) || __GLIBC__ > 2) ) )
 
     #include <time.h>
     typedef struct timespec UTIL_time_t;
diff --git a/tests/fullbench.c b/tests/fullbench.c
index ee2966f..c06e230 100644
--- a/tests/fullbench.c
+++ b/tests/fullbench.c
@@ -267,13 +267,20 @@ static int local_LZ4_decompress_fast(const char* in, char* out, int inSize, int
     return outSize;
 }
 
-static int local_LZ4_decompress_fast_usingDict(const char* in, char* out, int inSize, int outSize)
+static int local_LZ4_decompress_fast_usingDict_prefix(const char* in, char* out, int inSize, int outSize)
 {
     (void)inSize;
     LZ4_decompress_fast_usingDict(in, out, outSize, out - 65536, 65536);
     return outSize;
 }
 
+static int local_LZ4_decompress_fast_usingExtDict(const char* in, char* out, int inSize, int outSize)
+{
+    (void)inSize;
+    LZ4_decompress_fast_usingDict(in, out, outSize, out - 65536, 65535);
+    return outSize;
+}
+
 static int local_LZ4_decompress_safe_usingDict(const char* in, char* out, int inSize, int outSize)
 {
     (void)inSize;
@@ -460,7 +467,7 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
                 double averageTime;
                 clock_t clockTime;
 
-                PROGRESS("%1i- %-28.28s :%9i ->\r", loopNb, compressorName, (int)benchedSize);
+                PROGRESS("%2i-%-34.34s :%10i ->\r", loopNb, compressorName, (int)benchedSize);
                 { size_t i; for (i=0; i<benchedSize; i++) compressed_buff[i]=(char)i; }     /* warming up memory */
 
                 nb_loops = 0;
@@ -471,7 +478,8 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
                     if (initFunction!=NULL) initFunction();
                     for (chunkNb=0; chunkNb<nbChunks; chunkNb++) {
                         chunkP[chunkNb].compressedSize = compressionFunction(chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origSize);
-                        if (chunkP[chunkNb].compressedSize==0) DISPLAY("ERROR ! %s() = 0 !! \n", compressorName), exit(1);
+                        if (chunkP[chunkNb].compressedSize==0)
+                            DISPLAY("ERROR ! %s() = 0 !! \n", compressorName), exit(1);
                     }
                     nb_loops++;
                 }
@@ -482,13 +490,13 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
                 if (averageTime < bestTime) bestTime = averageTime;
                 cSize=0; for (chunkNb=0; chunkNb<nbChunks; chunkNb++) cSize += chunkP[chunkNb].compressedSize;
                 ratio = (double)cSize/(double)benchedSize*100.;
-                PROGRESS("%1i- %-28.28s :%9i ->%9i (%5.2f%%),%7.1f MB/s\r", loopNb, compressorName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000000);
+                PROGRESS("%2i-%-34.34s :%10i ->%9i (%5.2f%%),%7.1f MB/s\r", loopNb, compressorName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000000);
             }
 
             if (ratio<100.)
-                DISPLAY("%2i-%-28.28s :%9i ->%9i (%5.2f%%),%7.1f MB/s\n", cAlgNb, compressorName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000000);
+                DISPLAY("%2i-%-34.34s :%10i ->%9i (%5.2f%%),%7.1f MB/s\n", cAlgNb, compressorName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 1000000);
             else
-                DISPLAY("%2i-%-28.28s :%9i ->%9i (%5.1f%%),%7.1f MB/s\n", cAlgNb, compressorName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 100000);
+                DISPLAY("%2i-%-34.34s :%10i ->%9i (%5.1f%%),%7.1f MB/s\n", cAlgNb, compressorName, (int)benchedSize, (int)cSize, ratio, (double)benchedSize / bestTime / 100000);
         }
 
         /* Prepare layout for decompression */
@@ -509,11 +517,12 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
         }
         for (chunkNb=0; chunkNb<nbChunks; chunkNb++) {
             chunkP[chunkNb].compressedSize = LZ4_compress_default(chunkP[chunkNb].origBuffer, chunkP[chunkNb].compressedBuffer, chunkP[chunkNb].origSize, maxCompressedChunkSize);
-            if (chunkP[chunkNb].compressedSize==0) DISPLAY("ERROR ! %s() = 0 !! \n", "LZ4_compress"), exit(1);
+            if (chunkP[chunkNb].compressedSize==0)
+                DISPLAY("ERROR ! %s() = 0 !! \n", "LZ4_compress"), exit(1);
         }
 
         /* Decompression Algorithms */
-        for (dAlgNb=0; (dAlgNb <= NB_DECOMPRESSION_ALGORITHMS) && (g_decompressionTest); dAlgNb++) {
+        for (dAlgNb=0; (dAlgNb <= NB_DECOMPRESSION_ALGORITHMS) && g_decompressionTest; dAlgNb++) {
             const char* dName;
             int (*decompressionFunction)(const char*, char*, int, int);
             double bestTime = 100000000.;
@@ -524,7 +533,8 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
             {
             case 0: DISPLAY("Decompression functions : \n"); continue;
             case 1: decompressionFunction = local_LZ4_decompress_fast; dName = "LZ4_decompress_fast"; break;
-            case 3: decompressionFunction = local_LZ4_decompress_fast_usingDict; dName = "LZ4_decompress_fast_usingDict"; break;
+            case 2: decompressionFunction = local_LZ4_decompress_fast_usingDict_prefix; dName = "LZ4_decompress_fast_usingDict(prefix)"; break;
+            case 3: decompressionFunction = local_LZ4_decompress_fast_usingExtDict; dName = "LZ4_decompress_fast_using(Ext)Dict"; break;
             case 4: decompressionFunction = LZ4_decompress_safe; dName = "LZ4_decompress_safe"; break;
             case 6: decompressionFunction = local_LZ4_decompress_safe_usingDict; dName = "LZ4_decompress_safe_usingDict"; break;
             case 7: decompressionFunction = local_LZ4_decompress_safe_partial; dName = "LZ4_decompress_safe_partial"; break;
@@ -555,7 +565,7 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
                 clock_t clockTime;
                 U32 crcDecoded;
 
-                PROGRESS("%1i- %-29.29s :%10i ->\r", loopNb, dName, (int)benchedSize);
+                PROGRESS("%2i-%-34.34s :%10i ->\r", loopNb, dName, (int)benchedSize);
 
                 nb_loops = 0;
                 clockTime = clock();
@@ -574,14 +584,14 @@ int fullSpeedBench(const char** fileNamesTable, int nbFiles)
                 averageTime = (double)clockTime / nb_loops / CLOCKS_PER_SEC;
                 if (averageTime < bestTime) bestTime = averageTime;
 
-                PROGRESS("%1i- %-29.29s :%10i -> %7.1f MB/s\r", loopNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000000);
+                PROGRESS("%2i-%-34.34s :%10i -> %7.1f MB/s\r", loopNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000000);
 
                 /* CRC Checking */
                 crcDecoded = XXH32(orig_buff, (int)benchedSize, 0);
                 if (crcOriginal!=crcDecoded) { DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", inFileName, (unsigned)crcOriginal, (unsigned)crcDecoded); exit(1); }
             }
 
-            DISPLAY("%2i-%-29.29s :%10i -> %7.1f MB/s\n", dAlgNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000000);
+            DISPLAY("%2i-%-34.34s :%10i -> %7.1f MB/s\n", dAlgNb, dName, (int)benchedSize, (double)benchedSize / bestTime / 1000000);
         }
       }
       free(orig_buff);
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index 1fbda8a..5dd75b3 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -47,6 +47,7 @@
 #include <stdio.h>      /* fgets, sscanf */
 #include <string.h>     /* strcmp */
 #include <time.h>       /* clock_t, clock, CLOCKS_PER_SEC */
+#include <assert.h>
 #define LZ4_STATIC_LINKING_ONLY
 #define LZ4_HC_STATIC_LINKING_ONLY
 #include "lz4hc.h"
@@ -953,6 +954,7 @@ static void FUZ_unitTests(int compressionLevel)
     const unsigned cycleNb= 0;
     char testInput[testInputSize];
     char testCompressed[testCompressedSize];
+    size_t const testVerifySize = testInputSize;
     char testVerify[testInputSize];
     char ringBuffer[ringBufferSize];
     U32 randState = 1;
@@ -1197,19 +1199,28 @@ static void FUZ_unitTests(int compressionLevel)
             }
         }
 
-        /* small decoder-side ring buffer test */
+        /* Ring buffer test : Non synchronized decoder */
+        /* This test uses minimum amount of memory required to setup a decoding ring buffer
+         * while being unsynchronized with encoder
+         * (no assumption done on how the data is encoded, it just follows LZ4 format specification).
+         * This size is documented in lz4.h, and is LZ4_decoderRingBufferSize(maxBlockSize).
+         */
         {   XXH64_state_t xxhOrig;
             XXH64_state_t xxhNewSafe, xxhNewFast;
             LZ4_streamDecode_t decodeStateSafe, decodeStateFast;
-            const U32 maxMessageSizeLog = 12;
-            const U32 maxMessageSizeMask = (1<<maxMessageSizeLog) - 1;
-            U32 messageSize;
+            const int maxMessageSizeLog = 12;
+            const int maxMessageSize = 1 << maxMessageSizeLog;
+            const int maxMessageSizeMask = maxMessageSize - 1;
+            int messageSize;
             U32 totalMessageSize = 0;
-            U32 iNext = 0;
-            U32 dNext = 0;
-            const U32 dBufferSize = 64 KB;
+            const int dBufferSize = LZ4_decoderRingBufferSize(maxMessageSize);
+            char* const ringBufferSafe = testVerify;
+            char* const ringBufferFast = testVerify + dBufferSize + 1;   /* used by LZ4_decompress_fast_continue */
+            int iNext = 0;
+            int dNext = 0;
             int compressedSize;
 
+            assert((size_t)(dBufferSize + 1 + dBufferSize) < testVerifySize);   /* space used by ringBufferSafe and ringBufferFast */
             XXH64_reset(&xxhOrig, 0);
             XXH64_reset(&xxhNewSafe, 0);
             XXH64_reset(&xxhNewFast, 0);
@@ -1217,38 +1228,39 @@ static void FUZ_unitTests(int compressionLevel)
             LZ4_setStreamDecode(&decodeStateSafe, NULL, 0);
             LZ4_setStreamDecode(&decodeStateFast, NULL, 0);
 
-#define BSIZE1 65537
-#define BSIZE2 16435
+#define BSIZE1 (dBufferSize - (maxMessageSize-1))
 
             /* first block */
-            messageSize = BSIZE1;
+            messageSize = BSIZE1;   /* note : we cheat a bit here, in theory no message should be > maxMessageSize. We just want to fill the decoding ring buffer once. */
             XXH64_update(&xxhOrig, testInput + iNext, messageSize);
             crcOrig = XXH64_digest(&xxhOrig);
 
             compressedSize = LZ4_compress_HC_continue(&sHC, testInput + iNext, testCompressed, messageSize, testCompressedSize-ringBufferSize);
             FUZ_CHECKTEST(compressedSize==0, "LZ4_compress_HC_continue() compression failed");
 
-            result = LZ4_decompress_safe_continue(&decodeStateSafe, testCompressed, testVerify + dNext, compressedSize, messageSize);
-            FUZ_CHECKTEST(result!=(int)messageSize, "64K D.ringBuffer : LZ4_decompress_safe_continue() test failed");
+            result = LZ4_decompress_safe_continue(&decodeStateSafe, testCompressed, ringBufferSafe + dNext, compressedSize, messageSize);
+            FUZ_CHECKTEST(result!=messageSize, "64K D.ringBuffer : LZ4_decompress_safe_continue() test failed");
 
-            XXH64_update(&xxhNewSafe, testVerify + dNext, messageSize);
+            XXH64_update(&xxhNewSafe, ringBufferSafe + dNext, messageSize);
             { U64 const crcNew = XXH64_digest(&xxhNewSafe);
               FUZ_CHECKTEST(crcOrig!=crcNew, "LZ4_decompress_safe_continue() decompression corruption"); }
 
-            result = LZ4_decompress_fast_continue(&decodeStateFast, testCompressed, testVerify + dNext, messageSize);
+            result = LZ4_decompress_fast_continue(&decodeStateFast, testCompressed, ringBufferFast + dNext, messageSize);
             FUZ_CHECKTEST(result!=compressedSize, "64K D.ringBuffer : LZ4_decompress_fast_continue() test failed");
 
-            XXH64_update(&xxhNewFast, testVerify + dNext, messageSize);
+            XXH64_update(&xxhNewFast, ringBufferFast + dNext, messageSize);
             { U64 const crcNew = XXH64_digest(&xxhNewFast);
               FUZ_CHECKTEST(crcOrig!=crcNew, "LZ4_decompress_fast_continue() decompression corruption"); }
 
-            /* prepare next message */
+            /* prepare second message */
             dNext += messageSize;
             totalMessageSize += messageSize;
-            messageSize = BSIZE2;
-            iNext = 132000;
-            memcpy(testInput + iNext, testInput + 8, messageSize);
-            if (dNext > dBufferSize) dNext = 0;
+            messageSize = maxMessageSize;
+            iNext = BSIZE1+1;
+            assert(BSIZE1 >= 65535);
+            memcpy(testInput + iNext, testInput + (BSIZE1-65535), messageSize);  /* will generate a match at max distance == 65535 */
+            FUZ_CHECKTEST(dNext+messageSize <= dBufferSize, "Ring buffer test : second message should require restarting from beginning");
+            dNext = 0;
 
             while (totalMessageSize < 9 MB) {
                 XXH64_update(&xxhOrig, testInput + iNext, messageSize);
@@ -1256,32 +1268,36 @@ static void FUZ_unitTests(int compressionLevel)
 
                 compressedSize = LZ4_compress_HC_continue(&sHC, testInput + iNext, testCompressed, messageSize, testCompressedSize-ringBufferSize);
                 FUZ_CHECKTEST(compressedSize==0, "LZ4_compress_HC_continue() compression failed");
-
-#if 1           /* Because the ring buffer is small, decompression overwrites part of the output which
-                 * is first used as dictionary.  Hence only one decompression function can be tested. */
-                result = LZ4_decompress_safe_continue(&decodeStateSafe, testCompressed, testVerify + dNext, compressedSize, messageSize);
-                FUZ_CHECKTEST(result!=(int)messageSize, "64K D.ringBuffer : LZ4_decompress_safe_continue() test failed");
-                XXH64_update(&xxhNewSafe, testVerify + dNext, messageSize);
+                DISPLAYLEVEL(5, "compressed %i bytes to %i bytes \n", messageSize, compressedSize);
+
+                /* test LZ4_decompress_safe_continue */
+                assert(dNext < dBufferSize);
+                assert(dBufferSize - dNext >= maxMessageSize);
+                result = LZ4_decompress_safe_continue(&decodeStateSafe,
+                                                      testCompressed, ringBufferSafe + dNext,
+                                                      compressedSize, dBufferSize - dNext);   /* works without knowing messageSize, under assumption that messageSize <= maxMessageSize */
+                FUZ_CHECKTEST(result!=messageSize, "D.ringBuffer : LZ4_decompress_safe_continue() test failed");
+                XXH64_update(&xxhNewSafe, ringBufferSafe + dNext, messageSize);
                 {   U64 const crcNew = XXH64_digest(&xxhNewSafe);
-                    if (crcOrig != crcNew) FUZ_findDiff(testInput + iNext, testVerify + dNext);
-                    FUZ_CHECKTEST(crcOrig!=crcNew, "LZ4_decompress_safe_continue() decompression corruption during small decoder-side ring buffer test");
+                    if (crcOrig != crcNew) FUZ_findDiff(testInput + iNext, ringBufferSafe + dNext);
+                    FUZ_CHECKTEST(crcOrig!=crcNew, "LZ4_decompress_safe_continue() decompression corruption during D.ringBuffer test");
                 }
-#else
-                result = LZ4_decompress_fast_continue(&decodeStateFast, testCompressed, testVerify + dNext, messageSize);
-                FUZ_CHECKTEST(result!=compressedSize, "64K D.ringBuffer : LZ4_decompress_fast_continue() test failed");
 
-                XXH64_update(&xxhNewFast, testVerify + dNext, messageSize);
+                /* test LZ4_decompress_fast_continue in its own buffer ringBufferFast */
+                result = LZ4_decompress_fast_continue(&decodeStateFast, testCompressed, ringBufferFast + dNext, messageSize);
+                FUZ_CHECKTEST(result!=compressedSize, "D.ringBuffer : LZ4_decompress_fast_continue() test failed");
+                XXH64_update(&xxhNewFast, ringBufferFast + dNext, messageSize);
                 {   U64 const crcNew = XXH64_digest(&xxhNewFast);
-                    if (crcOrig != crcNew) FUZ_findDiff(testInput + iNext, testVerify + dNext);
-                    FUZ_CHECKTEST(crcOrig!=crcNew, "LZ4_decompress_fast_continue() decompression corruption during small decoder-side ring buffer test");
+                    if (crcOrig != crcNew) FUZ_findDiff(testInput + iNext, ringBufferFast + dNext);
+                    FUZ_CHECKTEST(crcOrig!=crcNew, "LZ4_decompress_fast_continue() decompression corruption during D.ringBuffer test");
                 }
-#endif
+
                 /* prepare next message */
                 dNext += messageSize;
                 totalMessageSize += messageSize;
                 messageSize = (FUZ_rand(&randState) & maxMessageSizeMask) + 1;
                 iNext = (FUZ_rand(&randState) & 65535);
-                if (dNext > dBufferSize) dNext = 0;
+                if (dNext + maxMessageSize > dBufferSize) dNext = 0;
             }
         }
     }
diff --git a/visual/.gitignore b/visual/.gitignore
index dea92fc..276f8f5 100644
--- a/visual/.gitignore
+++ b/visual/.gitignore
@@ -6,5 +6,5 @@
 *.sdf
 *.suo
 *.user
-
+ver*/
 VS2010/bin/