From d51f0466289d9a021291e736b463cf8de7bd60bd Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Mon, 6 Nov 2017 15:42:50 -0800
Subject: 2-stages LZ4_count

separate first branch from the rest of the compare loop
to get dedicated prediction.

measured a 3-4% compression speed improvement.
---
 lib/lz4.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index 64a2e82..ff6496c 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -407,7 +407,15 @@ static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLi
 {
     const BYTE* const pStart = pIn;
 
-    while (likely(pIn<pInLimit-(STEPSIZE-1))) {
+    if (likely(pIn < pInLimit-(STEPSIZE-1))) {
+        reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+        if (!diff) {
+            pIn+=STEPSIZE; pMatch+=STEPSIZE;
+        } else {
+            return LZ4_NbCommonBytes(diff);
+    }   }
+
+    while (likely(pIn < pInLimit-(STEPSIZE-1))) {
         reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
         if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
         pIn += LZ4_NbCommonBytes(diff);
--
cgit v0.12

From: Yann Collet
Date: Mon, 6 Nov 2017 17:29:27 -0800
Subject: added LZ4_FORCEINLINE to counter gcc regression

as recommended by @terrelln
---
 lib/lz4.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index ff6496c..6157285 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -403,7 +403,8 @@ static unsigned LZ4_NbCommonBytes (register reg_t val)
 }
 
 #define STEPSIZE sizeof(reg_t)
-static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
+LZ4_FORCE_INLINE
+unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
 {
     const BYTE* const pStart = pIn;
 
--
cgit v0.12
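
Note on the first patch: the old code entered one generic compare loop, so a single
branch had to predict both "the first word of the match usually agrees" and "the loop
eventually exits". Peeling the first comparison into its own if gives each case its
own branch-predictor entry, which is where the measured 3-4% comes from. Below is a
minimal standalone sketch of the same 2-stage pattern, not the lz4 code itself:
count_common(), read8() and STEP are hypothetical stand-ins for LZ4_count(),
LZ4_read_ARCH() and STEPSIZE, and it assumes GCC/Clang builtins on a little-endian
64-bit target, where __builtin_ctzll(diff)/8 gives the number of bytes that still match.

/* 2-stage match-length counter, sketch only (see assumptions above). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STEP sizeof(uint64_t)

static uint64_t read8(const uint8_t* p)
{
    uint64_t v;
    memcpy(&v, p, sizeof v);          /* unaligned-safe 8-byte read */
    return v;
}

static unsigned count_common(const uint8_t* in, const uint8_t* match,
                             const uint8_t* limit)
{
    const uint8_t* const start = in;

    /* Stage 1: the first comparison gets a dedicated branch, trained
     * separately from the loop-exit branch of the main loop below. */
    if (in < limit - (STEP - 1)) {
        uint64_t const diff = read8(match) ^ read8(in);
        if (!diff) { in += STEP; match += STEP; }
        else return (unsigned)(__builtin_ctzll(diff) >> 3);
    }

    /* Stage 2: the regular word-at-a-time compare loop. */
    while (in < limit - (STEP - 1)) {
        uint64_t const diff = read8(match) ^ read8(in);
        if (!diff) { in += STEP; match += STEP; continue; }
        in += __builtin_ctzll(diff) >> 3;
        return (unsigned)(in - start);
    }

    /* Tail: finish byte by byte near the limit. */
    while (in < limit && *match == *in) { in++; match++; }
    return (unsigned)(in - start);
}

int main(void)
{
    const uint8_t a[] = "abcdefghijklmnop";
    const uint8_t b[] = "abcdefghXjklmnop";
    printf("%u\n", count_common(a, b, a + sizeof a - 1));  /* prints 8 */
    return 0;
}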
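
Note on the second patch: its commit message cites a gcc regression, presumably some
GCC versions declining to inline the now-larger LZ4_count() and giving back part of
the stage-1 win; LZ4_FORCE_INLINE overrides that heuristic. The macro's real
definition lives elsewhere in the lz4 sources and is not shown in this patch. As a
rough sketch of how such a macro is commonly written (MY_FORCE_INLINE and add_one
are hypothetical names, not the verbatim lz4 definition):

#include <stdio.h>

#if defined(_MSC_VER)
#  define MY_FORCE_INLINE static __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#  define MY_FORCE_INLINE static inline __attribute__((always_inline))
#else
#  define MY_FORCE_INLINE static inline   /* best effort elsewhere */
#endif

/* always_inline bypasses the compiler's size heuristics, the usual
 * tool when a hot helper grows past the inlining threshold. */
MY_FORCE_INLINE unsigned add_one(unsigned x) { return x + 1; }

int main(void) { printf("%u\n", add_one(41)); return 0; }

The trade-off is the standard one: forced inlining buys back call overhead and keeps
the hot loop's branches local, at the cost of some code-size growth at every call site.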