summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorYann Collet <Cyan4973@users.noreply.github.com>2018-04-30 22:32:37 (GMT)
committerGitHub <noreply@github.com>2018-04-30 22:32:37 (GMT)
commit90374271c273cbc51b5aa848463ec25dde8d1ede (patch)
treee2b21fd6cb93ac4d0dda336a1c5ff87893846568 /lib
parentc32e0319a5a8319599f0f3d1a510431bdaafbdb1 (diff)
parent45f8603aae389d34c689d3ff7427b314071ccd2c (diff)
downloadlz4-90374271c273cbc51b5aa848463ec25dde8d1ede.zip
lz4-90374271c273cbc51b5aa848463ec25dde8d1ede.tar.gz
lz4-90374271c273cbc51b5aa848463ec25dde8d1ede.tar.bz2
Merge pull request #527 from svpv/fastDec
lz4.c: two-stage shortcut for LZ4_decompress_generic
Diffstat (limited to 'lib')
-rw-r--r--lib/lz4.c107
1 files changed, 82 insertions, 25 deletions
diff --git a/lib/lz4.c b/lib/lz4.c
index 1e0d8e9..c6f0426 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1398,6 +1398,9 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
const int safeDecode = (endOnInput==endOnInputSize);
const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+ /* Set up the "end" pointers for the shortcut. */
+ const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+ const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
/* Special cases */
if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => just decode everything */
@@ -1407,39 +1410,90 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
/* Main Loop : decode sequences */
while (1) {
- size_t length;
const BYTE* match;
size_t offset;
unsigned const token = *ip++;
+ size_t length = token >> ML_BITS; /* literal length */
assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
- /* shortcut for common case :
- * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes).
- * this shortcut was tested on x86 and x64, where it improves decoding speed.
- * it has not yet been benchmarked on ARM, Power, mips, etc.
- * NOTE: The loop begins with a read, so we must have one byte left at the end. */
- if (endOnInput
- && ((ip + 14 /*maxLL*/ + 2 /*offset*/ < iend)
- & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend)
- & (token < (15<<ML_BITS))
- & ((token & ML_MASK) != 15) ) ) {
- size_t const ll = token >> ML_BITS;
- size_t const off = LZ4_readLE16(ip+ll);
- const BYTE* const matchPtr = op + ll - off; /* pointer underflow risk ? */
- if ((off >= 8) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix)) {
- size_t const ml = (token & ML_MASK) + MINMATCH;
- memcpy(op, ip, 16); op += ll; ip += ll + 2 /*offset*/;
- memcpy(op + 0, matchPtr + 0, 8);
- memcpy(op + 8, matchPtr + 8, 8);
- memcpy(op +16, matchPtr +16, 2);
- op += ml;
- continue;
+
+ /* A two-stage shortcut for the most common case:
+ * 1) If the literal length is 0..14, and there is enough space,
+ * enter the shortcut and copy 16 bytes on behalf of the literals
+ * (in the fast mode, only 8 bytes can be safely copied this way).
+ * 2) Further if the match length is 4..18, copy 18 bytes in a similar
+ * manner; but we ensure that there's enough space in the output for
+ * those 18 bytes earlier, upon entering the shortcut (in other words,
+ * there is a combined check for both stages).
+ */
+ if ((endOnInput ? length != RUN_MASK : length <= 8) &&
+ /* strictly "less than" on input, to re-enter the loop with at least one byte */
+ likely((endOnInput ? ip < shortiend : 1) && (op <= shortoend)))
+ {
+ /* Can we copy the literals with a single memcpy invocation? Sometimes we can't
+ * copy 16 bytes, because they can clobber the dictionary in the ring buffer. */
+ if (!endOnInput /* only 8 bytes */ || /* nothing to clobber */ dict != usingExtDict) {
+ /* Copy the literals. */
+ memcpy(op, ip, endOnInput ? 16 : 8);
+ op += length; ip += length;
+
+ /* The second stage: prepare for match copying, decode full info.
+ * It if doesn't work out, the info won't be wasted. */
+ length = token & ML_MASK; /* match length */
+ offset = LZ4_readLE16(ip); ip += 2;
+ match = op - offset;
+
+ /* Do not deal with overlapping matches. */
+ if ((length != 15) && (offset >= 8) &&
+ (dict==withPrefix64k || match >= lowPrefix))
+ {
+ /* Copy the match. */
+ memcpy(op + 0, match + 0, 8);
+ memcpy(op + 8, match + 8, 8);
+ memcpy(op +16, match +16, 2);
+ op += length + MINMATCH;
+ /* Both stages worked, load the next token. */
+ continue;
+ }
+ } else {
+ /* Save the literal length, can't copy 16 bytes just yet. */
+ size_t ll = length;
+
+ /* Prepare for the second satge. */
+ length = token & ML_MASK;
+ offset = LZ4_readLE16(ip+ll);
+ match = op + ll - offset;
+
+ if ((length != 15) && (offset >= 8) &&
+ (dict==withPrefix64k || match >= lowPrefix))
+ {
+ /* Copy the literals. */
+ memcpy(op, ip, 16);
+ op += ll; ip += ll + 2;
+ /* Copy the match. */
+ memcpy(op + 0, match + 0, 8);
+ memcpy(op + 8, match + 8, 8);
+ memcpy(op +16, match +16, 2);
+ op += length + MINMATCH;
+ /* Both stages worked, load the next token. */
+ continue;
+ }
+
+ /* So we took the literlas, but the second stage didn't work. */
+ memcpy(op, ip, 8);
+ if (ll > 8)
+ memcpy(op + 8, ip + 8, 8);
+ op += ll; ip += ll + 2;
}
+
+ /* The second stage didn't work out, but the info is ready.
+ * Propel it right to the point of match copying. */
+ goto _copy_match;
}
/* decode literal length */
- if ((length=(token>>ML_BITS)) == RUN_MASK) {
+ if (length == RUN_MASK) {
unsigned s;
if (unlikely(endOnInput ? ip >= iend-RUN_MASK : 0)) goto _output_error; /* overflow detection */
do {
@@ -1473,11 +1527,14 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
/* get offset */
offset = LZ4_readLE16(ip); ip+=2;
match = op - offset;
- if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */
- LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */
/* get matchlength */
length = token & ML_MASK;
+
+_copy_match:
+ if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */
+ LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */
+
if (length == ML_MASK) {
unsigned s;
do {