author    Dave Watson <davejwatson@fb.com>    2019-01-28 17:36:25 (GMT)
committer Dave Watson <davejwatson@fb.com>    2019-02-08 21:57:23 (GMT)
commit    75fb878a901900d0addd6950b4c9e4ab2bd3d1f2 (patch)
tree      d993051d095149efebe90ce4d02650708d4654c1 /lib
parent    faac110e20bef7c5715c068294d1a6950450fad6 (diff)
decompress_generic: Add fastpath for small offsets
For small offsets of size 1, 2, 4 and 8, we can set a single uint64_t and then use it to do a memset() variation. In particular, this makes the somewhat-common RLE (offset 1) case about 2-4x faster than the previous implementation: we not only avoid the load blocked by the store, we avoid the loads entirely.
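As a standalone sketch of the technique (hypothetical helper name, not code from this patch): the 8-byte pattern is built once from the first `offset` bytes, after which the expansion loop issues only stores.

    #include <stdint.h>
    #include <string.h>

    /* Sketch: expand an `offset`-byte repeating pattern starting at src
     * into [dst, dstEnd), 8 bytes at a time. Like the patch's fastpath,
     * it may overwrite up to 7 bytes past dstEnd and only handles
     * offset 1, 2, or 4. */
    static void pattern_stamp(uint8_t* dst, const uint8_t* src,
                              uint8_t* dstEnd, size_t offset)
    {
        uint8_t v[8];
        if (offset == 1) {
            memset(v, *src, 8);      /* AAAAAAAA */
        } else if (offset == 2) {
            memcpy(v, src, 2);       /* AB...... */
            memcpy(v + 2, v, 2);     /* ABAB.... */
            memcpy(v + 4, v, 4);     /* ABABABAB */
        } else {                     /* offset == 4 */
            memcpy(v, src, 4);       /* ABCD.... */
            memcpy(v + 4, v, 4);     /* ABCDABCD */
        }
        /* src (which overlaps the output region in the LZ4 match case)
         * is never read again: the loop below is stores only. */
        do { memcpy(dst, v, 8); dst += 8; } while (dst < dstEnd);
    }

Because the compiler can typically keep v in a single 64-bit register, the loop that previously did overlapping load+store pairs, each load stalled on the preceding store, becomes a pure store loop.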
Diffstat (limited to 'lib')
-rw-r--r--  lib/lz4.c  78
1 file changed, 59 insertions, 19 deletions
diff --git a/lib/lz4.c b/lib/lz4.c
index 993d746..066d535 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -297,6 +297,30 @@ void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
    do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}
+static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
+static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
+
+
+LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+void LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) {
+    if (offset < 8) {
+        dstPtr[0] = srcPtr[0];
+        dstPtr[1] = srcPtr[1];
+        dstPtr[2] = srcPtr[2];
+        dstPtr[3] = srcPtr[3];
+        srcPtr += inc32table[offset];
+        memcpy(dstPtr+4, srcPtr, 4);
+        srcPtr -= dec64table[offset];
+        dstPtr += 8;
+    } else {
+        memcpy(dstPtr, srcPtr, 8);
+        dstPtr += 8;
+        srcPtr += 8;
+    }
+
+    LZ4_wildCopy(dstPtr, srcPtr, dstEnd);
+}
+
/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd */
LZ4_FORCE_O2_INLINE_GCC_PPC64LE
void LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
@@ -307,6 +331,40 @@ void LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
    do { memcpy(d,s,16); memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
}
+LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+void LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) {
+    BYTE v[8];
+    switch(offset) {
+    case 1:
+        memset(v, *srcPtr, 8);
+        goto copy_loop;
+    case 2:
+        memcpy(v, srcPtr, 2);
+        memcpy(&v[2], srcPtr, 2);
+        memcpy(&v[4], &v[0], 4);
+        goto copy_loop;
+    case 4:
+        memcpy(v, srcPtr, 4);
+        memcpy(&v[4], srcPtr, 4);
+        goto copy_loop;
+    case 3:
+    case 5:
+    case 6:
+    case 7:
+    case 8:
+    default:
+        LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+        return;
+    }
+
+ copy_loop:
+    memcpy(dstPtr, v, 8);
+    dstPtr += 8;
+    while (dstPtr < dstEnd) {
+        memcpy(dstPtr, v, 8);
+        dstPtr += 8;
+    }
+}
/*-************************************
* Common Constants
@@ -1505,8 +1563,6 @@ LZ4_decompress_generic(
    BYTE* cpy;
    const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize;
-   const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
-   const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
    const int safeDecode = (endOnInput==endOnInputSize);
    const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
@@ -1643,23 +1699,7 @@ LZ4_decompress_generic(
        /* partialDecoding : may not respect endBlock parsing restrictions */
        assert(op<=oend);
        if (unlikely(offset<16)) {
-           if (offset < 8) {
-               op[0] = match[0];
-               op[1] = match[1];
-               op[2] = match[2];
-               op[3] = match[3];
-               match += inc32table[offset];
-               memcpy(op+4, match, 4);
-               match -= dec64table[offset];
-               op += 8;
-           } else {
-               memcpy(op, match, 8);
-               op += 8;
-               match += 8;
-           }
-
-           memcpy(op, match, 8);
-           if (length > 16) LZ4_wildCopy(op+8, match+8, cpy);
+           LZ4_memcpy_using_offset(op, match, cpy, offset);
        } else {
            LZ4_wildCopy32(op, match, cpy);
        }
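For the offsets the switch leaves to LZ4_memcpy_using_offset_base (3, 5, 6, 7), the two small tables do the heavy lifting: inc32table repositions match so the second 4-byte memcpy continues the pattern, and dec64table then rewinds match so that op - match becomes a multiple of the offset that is at least 8, which is exactly what LZ4_wildCopy's 8-byte copies require. A walk-through for offset 3 (pattern A B C), annotating the existing logic rather than adding new code:

    /* match = op - 3                          memory: ... A B C | op
     * op[0..3] = match[0..3], byte by byte -> op: A B C A
     * match += inc32table[3]  (== 1)       -> match = op - 2
     * memcpy(op+4, match, 4)               -> op: A B C A B C A B
     * match -= dec64table[3]  (== -1)      -> match = op - 1
     * op += 8
     *
     * Now op - match == 9, a multiple of 3 that is >= 8, so
     * LZ4_wildCopy(op, match, dstEnd) can finish with plain 8-byte
     * memcpys without ever reading a byte it has not yet written. */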