From a31b7058cb97e4393da55e78a77a1c6f0c9ae038 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 25 Oct 2017 10:10:53 +0200 Subject: small modification of lz4 decoder to shortcut common case (short branch). --- lib/lz4.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/lz4.c b/lib/lz4.c index 179408d..e0a961f 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1180,6 +1180,22 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( /* get literal length */ unsigned const token = *ip++; + + if (1) + if (ip + 14 + 2 <= iend) + if ((token < 15*16) & ((token & 0xF) <= 12)) { + size_t const ll = token >> ML_BITS; + size_t const off = LZ4_readLE16(ip+ll); /* check input validity */ + if (off >= 16) { + size_t const ml = (token & 0xF) + MINMATCH; + DEBUGLOG(2, "rest:%u, ll:%2u, ml:%2u, off:%u", + (U32)(oend-op), (U32)ll, (U32)ml, (U32)off); + memcpy(op, ip, 16); op += ll; ip += ll + 2 /* offset */; + memcpy(op, op - off, 16); op += ml; + continue; + } + } + if ((length=(token>>ML_BITS)) == RUN_MASK) { unsigned s; do { -- cgit v0.12 From e0914ff70c4a8b94d900deeea44c7cd1e9ac4a07 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 30 Oct 2017 16:07:15 -0700 Subject: more complete shortcut - passes tests --- lib/lz4.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index e0a961f..10f8d55 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1146,7 +1146,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( int partialDecoding, /* full, partial */ int targetOutputSize, /* only used if partialDecoding==partial */ int dict, /* noDict, withPrefix64k, usingExtDict */ - const BYTE* const lowPrefix, /* == dst when no prefix */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ const BYTE* const dictStart, /* only if dict==usingExtDict */ const size_t dictSize /* note : = 0 if noDict */ ) @@ -1168,7 +1168,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( /* Special cases */ - if ((partialDecoding) 
&& (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => just decode everything */ if ((endOnInput) && (unlikely(outputSize==0))) return ((srcSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); @@ -1178,24 +1178,25 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( const BYTE* match; size_t offset; - /* get literal length */ unsigned const token = *ip++; - if (1) - if (ip + 14 + 2 <= iend) - if ((token < 15*16) & ((token & 0xF) <= 12)) { + /* shortcut for common case */ + /* this shortcut was tested on x86 and x64, where it improves decoding speed. + * it has not yet been benchmarked on ARM, Power, mips, etc. */ + if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend)) + & ((token < 15*16) & ((token & 0xF) <= 12)) ) { size_t const ll = token >> ML_BITS; - size_t const off = LZ4_readLE16(ip+ll); /* check input validity */ - if (off >= 16) { + size_t const off = LZ4_readLE16(ip+ll); + const BYTE* const matchPtr = op + ll - off; /* pointer underflow risk ? 
*/ + if ((off >= 16) & (matchPtr >= lowPrefix)) { size_t const ml = (token & 0xF) + MINMATCH; - DEBUGLOG(2, "rest:%u, ll:%2u, ml:%2u, off:%u", - (U32)(oend-op), (U32)ll, (U32)ml, (U32)off); - memcpy(op, ip, 16); op += ll; ip += ll + 2 /* offset */; - memcpy(op, op - off, 16); op += ml; + memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/; + memcpy(op, matchPtr, 16); op += ml; continue; } } + /* decode literal length */ if ((length=(token>>ML_BITS)) == RUN_MASK) { unsigned s; do { -- cgit v0.12 From 3f173052aef1d0f1503bd986cc871e59ef56cadd Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 31 Oct 2017 11:49:57 -0700 Subject: added comments, as suggested by @terrelln --- lib/lz4.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index 10f8d55..b036d98 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1180,15 +1180,16 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( unsigned const token = *ip++; - /* shortcut for common case */ - /* this shortcut was tested on x86 and x64, where it improves decoding speed. + /* shortcut for common case : + * in most circumstances, we expect to decode small matches (<= 16 bytes) separated by few literals (<= 14 bytes). + * this shortcut was tested on x86 and x64, where it improves decoding speed. * it has not yet been benchmarked on ARM, Power, mips, etc. */ if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend)) - & ((token < 15*16) & ((token & 0xF) <= 12)) ) { + & ((token < (15<<ML_BITS)) & ((token & 0xF) <= 12)) ) { size_t const ll = token >> ML_BITS; size_t const off = LZ4_readLE16(ip+ll); const BYTE* const matchPtr = op + ll - off; /* pointer underflow risk ? 
*/ - if ((off >= 16) & (matchPtr >= lowPrefix)) { + if ((off >= 16) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) { size_t const ml = (token & 0xF) + MINMATCH; memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/; memcpy(op, matchPtr, 16); op += ml; -- cgit v0.12 From ace334a4c94927d97933888b343a2f435bbbc7fa Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 31 Oct 2017 12:22:15 -0700 Subject: minor : coding style : use ML_MASK constant --- lib/lz4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index b036d98..34d8c40 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1185,12 +1185,12 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( * this shortcut was tested on x86 and x64, where it improves decoding speed. * it has not yet been benchmarked on ARM, Power, mips, etc. */ if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend)) - & ((token < (15<<ML_BITS)) & ((token & 0xF) <= 12)) ) { + & ((token < (15<<ML_BITS)) & ((token & ML_MASK) <= 12)) ) { size_t const ll = token >> ML_BITS; size_t const off = LZ4_readLE16(ip+ll); const BYTE* const matchPtr = op + ll - off; /* pointer underflow risk ? */ if ((off >= 16) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) { - size_t const ml = (token & 0xF) + MINMATCH; + size_t const ml = (token & ML_MASK) + MINMATCH; memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/; memcpy(op, matchPtr, 16); op += ml; continue; -- cgit v0.12 From 9378f76e4186e7b4f33a79451b22afd5b6344c8d Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 31 Oct 2017 14:20:25 -0700 Subject: extended shortcut match length to 18 --- lib/lz4.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index 34d8c40..1504790 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1181,18 +1181,19 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( unsigned const token = *ip++; /* shortcut for common case : - * in most circumstances, we expect to decode small matches (<= 16 bytes) separated by few literals (<= 14 bytes). 
+ * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes). * this shortcut was tested on x86 and x64, where it improves decoding speed. * it has not yet been benchmarked on ARM, Power, mips, etc. */ - if (((ip + 14 + 2 <= iend) & (op + 14 + 16 <= oend)) - & ((token < (15<<ML_BITS)) & ((token & ML_MASK) <= 12)) ) { + if (((ip + 14 /*maxLL*/ + 2 /*offset*/ <= iend) + & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend)) + & ((token < (15<<ML_BITS)) & ((token & ML_MASK) != 15)) ) { size_t const ll = token >> ML_BITS; size_t const off = LZ4_readLE16(ip+ll); const BYTE* const matchPtr = op + ll - off; /* pointer underflow risk ? */ - if ((off >= 16) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) { + if ((off >= 18) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) { size_t const ml = (token & ML_MASK) + MINMATCH; memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/; - memcpy(op, matchPtr, 16); op += ml; + memcpy(op, matchPtr, 18); op += ml; continue; } } -- cgit v0.12 From a5731d6b266170f7bf0893c833af680d929f5616 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 31 Oct 2017 15:51:56 -0700 Subject: minor change, to help store forwarding in a marginal case (offset==4) --- lib/lz4.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index 1504790..5efcbc0 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -1160,8 +1160,8 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( BYTE* oexit = op + targetOutputSize; const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; - const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; - const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; + const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; + const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; const int safeDecode = (endOnInput==endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); @@ -1276,14 +1276,13 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( /* copy match within block */ cpy = op + length; if (unlikely(offset<8)) { - const int dec64 = dec64table[offset]; op[0] = match[0]; op[1] = match[1]; op[2] = match[2]; op[3] = match[3]; - 
match += dec32table[offset]; + match += inc32table[offset]; memcpy(op+4, match, 4); - match -= dec64; + match -= dec64table[offset]; } else { LZ4_copy8(op, match); match+=8; } op += 8; @@ -1300,7 +1299,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic( LZ4_copy8(op, match); if (length>16) LZ4_wildCopy(op+8, match+8, cpy); } - op=cpy; /* correction */ + op = cpy; /* correction */ } /* end of decoding */ -- cgit v0.12