From a31b7058cb97e4393da55e78a77a1c6f0c9ae038 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 25 Oct 2017 10:10:53 +0200
Subject: small modification of lz4 decoder to shortcut common case (short
 branch).

---
 lib/lz4.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/lz4.c b/lib/lz4.c
index 179408d..e0a961f 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1180,6 +1180,22 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
 
         /* get literal length */
         unsigned const token = *ip++;
+
+        if (1)
+        if (ip + 14 + 2 <= iend)
+        if ((token < 15*16) & ((token & 0xF) <= 12)) {
+            size_t const ll = token >> ML_BITS;
+            size_t const off = LZ4_readLE16(ip+ll);   /* check input validity */
+            if (off >= 16) {
+                size_t const ml = (token & 0xF) + MINMATCH;
+                DEBUGLOG(2, "rest:%u, ll:%2u, ml:%2u, off:%u",
+                    (U32)(oend-op), (U32)ll, (U32)ml, (U32)off);
+                memcpy(op, ip, 16); op += ll; ip += ll + 2 /* offset */;
+                memcpy(op, op - off, 16); op += ml;
+                continue;
+            }
+        }
+
         if ((length=(token>>ML_BITS)) == RUN_MASK) {
             unsigned s;
             do {
-- 
cgit v0.12


From e0914ff70c4a8b94d900deeea44c7cd1e9ac4a07 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 30 Oct 2017 16:07:15 -0700
Subject: more complete shortcut - passes tests

---
 lib/lz4.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index e0a961f..10f8d55 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1146,7 +1146,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
                  int partialDecoding,    /* full, partial */
                  int targetOutputSize,   /* only used if partialDecoding==partial */
                  int dict,               /* noDict, withPrefix64k, usingExtDict */
-                 const BYTE* const lowPrefix,  /* == dst when no prefix */
+                 const BYTE* const lowPrefix,  /* always <= dst, == dst when no prefix */
                  const BYTE* const dictStart,  /* only if dict==usingExtDict */
                  const size_t dictSize         /* note : = 0 if noDict */
                  )
@@ -1168,7 +1168,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
 
 
     /* Special cases */
-    if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT;                        /* targetOutputSize too high => decode everything */
+    if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT;                      /* targetOutputSize too high => just decode everything */
     if ((endOnInput) && (unlikely(outputSize==0))) return ((srcSize==1) && (*ip==0)) ? 0 : -1;  /* Empty output buffer */
     if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
 
@@ -1178,24 +1178,25 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
         const BYTE* match;
         size_t offset;
 
-        /* get literal length */
         unsigned const token = *ip++;
 
-        if (1)
-        if (ip + 14 + 2 <= iend)
-        if ((token < 15*16) & ((token & 0xF) <= 12)) {
+        /* shortcut for common case */
+        /* this shortcut was tested on x86 and x64, where it improves decoding speed.
+         * it has not yet been benchmarked on ARM, Power, mips, etc. */
+        if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend))
+          & ((token < 15*16) & ((token & 0xF) <= 12)) ) {
             size_t const ll = token >> ML_BITS;
-            size_t const off = LZ4_readLE16(ip+ll);   /* check input validity */
-            if (off >= 16) {
+            size_t const off = LZ4_readLE16(ip+ll);
+            const BYTE* const matchPtr = op + ll - off;  /* pointer underflow risk ? */
+            if ((off >= 16) & (matchPtr >= lowPrefix)) {
                 size_t const ml = (token & 0xF) + MINMATCH;
-                DEBUGLOG(2, "rest:%u, ll:%2u, ml:%2u, off:%u",
-                    (U32)(oend-op), (U32)ll, (U32)ml, (U32)off);
-                memcpy(op, ip, 16); op += ll; ip += ll + 2 /* offset */;
-                memcpy(op, op - off, 16); op += ml;
+                memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
+                memcpy(op, matchPtr, 16); op += ml;
                 continue;
             }
         }
 
+        /* decode literal length */
         if ((length=(token>>ML_BITS)) == RUN_MASK) {
             unsigned s;
             do {
-- 
cgit v0.12


From 3f173052aef1d0f1503bd986cc871e59ef56cadd Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 31 Oct 2017 11:49:57 -0700
Subject: added comments, as suggested by @terrelln

---
 lib/lz4.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index 10f8d55..b036d98 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1180,15 +1180,16 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
 
         unsigned const token = *ip++;
 
-        /* shortcut for common case */
-        /* this shortcut was tested on x86 and x64, where it improves decoding speed.
+        /* shortcut for common case :
+         * in most circumstances, we expect to decode small matches (<= 16 bytes) separated by few literals (<= 14 bytes).
+         * this shortcut was tested on x86 and x64, where it improves decoding speed.
          * it has not yet been benchmarked on ARM, Power, mips, etc. */
         if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend))
-          & ((token < 15*16) & ((token & 0xF) <= 12)) ) {
+          & ((token < (15<<ML_BITS)) & ((token & 0xF) <= 12)) ) {
             size_t const ll = token >> ML_BITS;
             size_t const off = LZ4_readLE16(ip+ll);
             const BYTE* const matchPtr = op + ll - off;  /* pointer underflow risk ? */
-            if ((off >= 16) & (matchPtr >= lowPrefix)) {
+            if ((off >= 16) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
                 size_t const ml = (token & 0xF) + MINMATCH;
                 memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
                 memcpy(op, matchPtr, 16); op += ml;
-- 
cgit v0.12


From ace334a4c94927d97933888b343a2f435bbbc7fa Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 31 Oct 2017 12:22:15 -0700
Subject: minor : coding style : use ML_MASK constant

---
 lib/lz4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index b036d98..34d8c40 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1185,12 +1185,12 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
          * this shortcut was tested on x86 and x64, where it improves decoding speed.
          * it has not yet been benchmarked on ARM, Power, mips, etc. */
         if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend))
-          & ((token < (15<<ML_BITS)) & ((token & 0xF) <= 12)) ) {
+          & ((token < (15<<ML_BITS)) & ((token & ML_MASK) <= 12)) ) {
             size_t const ll = token >> ML_BITS;
             size_t const off = LZ4_readLE16(ip+ll);
             const BYTE* const matchPtr = op + ll - off;  /* pointer underflow risk ? */
             if ((off >= 16) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
-                size_t const ml = (token & 0xF) + MINMATCH;
+                size_t const ml = (token & ML_MASK) + MINMATCH;
                 memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
                 memcpy(op, matchPtr, 16); op += ml;
                 continue;
-- 
cgit v0.12


From 9378f76e4186e7b4f33a79451b22afd5b6344c8d Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 31 Oct 2017 14:20:25 -0700
Subject: extended shortcut match length to 18

---
 lib/lz4.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index 34d8c40..1504790 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1181,18 +1181,19 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
         unsigned const token = *ip++;
 
         /* shortcut for common case :
-         * in most circumstances, we expect to decode small matches (<= 16 bytes) separated by few literals (<= 14 bytes).
+         * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes).
          * this shortcut was tested on x86 and x64, where it improves decoding speed.
          * it has not yet been benchmarked on ARM, Power, mips, etc. */
-        if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend))
-          & ((token < (15<<ML_BITS)) & ((token & ML_MASK) <= 12)) ) {
+        if (((ip + 14 /*maxLL*/ + 2 /*offset*/ <= iend)
+          & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend))
+          & ((token < (15<<ML_BITS)) & ((token & ML_MASK) != 15)) ) {
             size_t const ll = token >> ML_BITS;
             size_t const off = LZ4_readLE16(ip+ll);
             const BYTE* const matchPtr = op + ll - off;  /* pointer underflow risk ? */
-            if ((off >= 16) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
+            if ((off >= 18) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
                 size_t const ml = (token & ML_MASK) + MINMATCH;
                 memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
-                memcpy(op, matchPtr, 16); op += ml;
+                memcpy(op, matchPtr, 18); op += ml;
                 continue;
             }
         }
-- 
cgit v0.12


From a5731d6b266170f7bf0893c833af680d929f5616 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 31 Oct 2017 15:51:56 -0700
Subject: minor change, to help store forwarding

This helps store forwarding in a marginal case (offset==4).
---
 lib/lz4.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index 1504790..5efcbc0 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1160,8 +1160,8 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
     BYTE* oexit = op + targetOutputSize;
 
     const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
-    const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};
-    const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+    const unsigned inc32table[8] = {0, 1, 2,  1,  0,  4, 4, 4};
+    const int      dec64table[8] = {0, 0, 0, -1, -4,  1, 2, 3};
 
     const int safeDecode = (endOnInput==endOnInputSize);
     const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
@@ -1276,14 +1276,13 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
         /* copy match within block */
         cpy = op + length;
         if (unlikely(offset<8)) {
-            const int dec64 = dec64table[offset];
             op[0] = match[0];
             op[1] = match[1];
             op[2] = match[2];
             op[3] = match[3];
-            match += dec32table[offset];
+            match += inc32table[offset];
             memcpy(op+4, match, 4);
-            match -= dec64;
+            match -= dec64table[offset];
         } else { LZ4_copy8(op, match); match+=8; }
         op += 8;
 
@@ -1300,7 +1299,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
             LZ4_copy8(op, match);
             if (length>16) LZ4_wildCopy(op+8, match+8, cpy);
         }
-        op=cpy;   /* correction */
+        op = cpy;   /* correction */
     }
 
     /* end of decoding */
-- 
cgit v0.12