From e72897402df0b93978d9471979e6d5c62d278b2a Mon Sep 17 00:00:00 2001 From: aqrit Date: Tue, 11 Aug 2020 21:14:09 -0400 Subject: rejigger bit counting intrinsics Fix lz4/lz4#867 Optimize software fallback routines. Delete some faulty (and dead?) MSVC big endian code. --- lib/lz4.c | 91 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index 1cd7322..5d0a2d2 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -487,75 +487,76 @@ LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const si **************************************/ static unsigned LZ4_NbCommonBytes (reg_t val) { + assert(val != 0); if (LZ4_isLittleEndian()) { - if (sizeof(val)==8) { -# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + if (sizeof(val) == 8) { +# if defined(_MSC_VER) && (_MSC_VER >= 1800) && defined(_M_AMD64) && !defined(LZ4_FORCE_SW_BITCOUNT) + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (int)(r>>3); -# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) return (unsigned)__builtin_ctzll((U64)val) >> 3; # else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); # endif } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; - _BitScanForward( &r, (U32)val ); - return (int)(r>>3); -# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) return (unsigned)__builtin_ctz((U32)val) >> 3; # else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; # endif } } else /* Big Endian CPU */ { - if (sizeof(val)==8) { /* 64-bits */ -# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + if (sizeof(val)==8) { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) return (unsigned)__builtin_clzll((U64)val) >> 3; # else - static const U32 by32 = sizeof(val)*4; /* 
32 on 64 bits (goal), 16 on 32 bits. - Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. - Note that this code path is never triggered in 32-bits mode. */ - unsigned r; - if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + const U64 mask = 0x0101010101010101ULL; + val = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[((U64)(val * 0x0080402010080402ULL)) >> 57]; # endif } else /* 32 bits */ { -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) return (unsigned)__builtin_clz((U32)val) >> 3; # else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; + val >>= 8; + val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | + (val + 0x00FF0000)) >> 24; + return (unsigned)val ^ 3; # endif } } } + #define STEPSIZE sizeof(reg_t) LZ4_FORCE_INLINE unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) -- cgit v0.12 From e45defa8bdfe711e3e6c800970516038ca6d7143 Mon Sep 17 00:00:00 2001 From: aqrit Date: Mon, 17 Aug 2020 17:53:07 -0400 Subject: silence warning MSVC debug mode complains --- lib/lz4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/lz4.c b/lib/lz4.c index 5d0a2d2..98fc657 100644 --- a/lib/lz4.c +++ b/lib/lz4.c @@ -538,8 +538,8 @@ static unsigned LZ4_NbCommonBytes (reg_t val) 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, }; const U64 mask = 0x0101010101010101ULL; - val = (((val >> 8) - mask) | val) & mask; - return ctz7_tab[((U64)(val * 0x0080402010080402ULL)) >> 57]; + U64 t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; # endif } else /* 32 bits */ { # if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ -- cgit v0.12 From 5243173b23b4deff16a5685ebfcf493e896a756e Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 25 Aug 2020 22:17:29 -0700 Subject: added documentation about LZ4_FORCE_SW_BITCOUNT Also : added memory-frugal software byte count for big endian 64-bit cpus. Disabled by default. --- lib/README.md | 20 ++++++++++++++------ lib/lz4.c | 21 +++++++++++++++++++-- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/lib/README.md b/lib/README.md index 707d777..3653c81 100644 --- a/lib/README.md +++ b/lib/README.md @@ -46,11 +46,11 @@ and `LZ4F_PUBLISH_STATIC_FUNCTIONS`. #### Build macros -The following build macro can be selected at compilation time : +The following build macro can be selected to adjust source code behavior at compilation time : -- `LZ4_FAST_DEC_LOOP` : this triggers the optimized decompression loop. 
- This loops works great on x86/x64 cpus, and is automatically enabled on this platform.
- It's possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor.
+- `LZ4_FAST_DEC_LOOP` : this triggers a speed-optimized decompression loop, which performs better on modern cpus.
+ This loop works great on x86, x64 and aarch64 cpus, and is automatically enabled for them.
+ It's also possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor.
For example, with `gcc` : `-DLZ4_FAST_DEC_LOOP=1`,
and with `make` : `CPPFLAGS+=-DLZ4_FAST_DEC_LOOP=1 make lz4`.
@@ -66,9 +66,17 @@ The following build macro can be selected at compilation time :
Should this be a problem, it's generally possible to make the compiler ignore these warnings,
for example with `-Wno-deprecated-declarations` on `gcc`,
or `_CRT_SECURE_NO_WARNINGS` for Visual Studio.
- Another method is to define `LZ4_DISABLE_DEPRECATE_WARNINGS`
+ Another project-specific method is to define `LZ4_DISABLE_DEPRECATE_WARNINGS`
before including the LZ4 header files.
+- `LZ4_FORCE_SW_BITCOUNT` : by default, the compression algorithm tries to determine match lengths
+ by using bitcount instructions, generally implemented as fast single instructions on many cpus.
+ In case the target cpu doesn't support them, the compiler intrinsic doesn't work, or its performance is poor,
+ it's possible to use an optimized software path instead.
+ This is achieved by defining this build macro.
+ In most cases, it's not expected to be necessary,
+ but it can be legitimately considered for less common platforms.
+
#### Amalgamation
@@ -103,7 +111,7 @@ The compiled executable will require LZ4 DLL which is available at `dll\liblz4.d
#### Miscellaneous
-Other files present in the directory are not source code. There are :
+Other files present in the directory are not source code. They are :
- `LICENSE` : contains the BSD license text
- `Makefile` : `make` script to compile and install lz4 library (static and dynamic)
diff --git a/lib/lz4.c b/lib/lz4.c
index e10b58e..0ca7b21 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -88,6 +88,7 @@
* Define this parameter if your target system or compiler does not support hardware bit count
*/
#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */
+# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */
# define LZ4_FORCE_SW_BITCOUNT
#endif
@@ -527,6 +528,9 @@ static unsigned LZ4_NbCommonBytes (reg_t val)
!defined(LZ4_FORCE_SW_BITCOUNT)
return (unsigned)__builtin_clzll((U64)val) >> 3;
# else
+#if 1
+ /* this method is probably faster,
+ * but adds a 128-byte lookup table */
static const unsigned char ctz7_tab[128] = {
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
- const U64 mask = 0x0101010101010101ULL;
- U64 t = (((val >> 8) - mask) | val) & mask;
+ U64 const mask = 0x0101010101010101ULL;
+ U64 const t = (((val >> 8) - mask) | val) & mask;
return ctz7_tab[(t * 0x0080402010080402ULL) >> 57];
+#else
+ /* this method doesn't consume extra memory, unlike the previous one,
+ * but it contains several branches
+ * that may end up slowing execution */
+ static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits.
+ Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. 
+ Note that this code path is never triggered in 32-bits mode. */ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +#endif # endif } else /* 32 bits */ { # if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ -- cgit v0.12
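For readers who want to see the new software byte count in isolation, below is a minimal standalone sketch (an editor's illustration, not part of the patches above). It exercises the little-endian 64-bit fallback introduced in the first patch, checking the multiplication trick against a naive per-byte loop; the helper names `ref_nb_common_bytes` and `sw_nb_common_bytes` and the small test in `main` are illustrative only and are not LZ4 API.

    /* Editor's sketch, not part of the patches: verifies the little-endian
     * 64-bit software fallback added in the first patch against a naive
     * reference. Helper names are illustrative, not LZ4 API. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* naive reference: number of trailing zero bytes of a non-zero word */
    static unsigned ref_nb_common_bytes(uint64_t val)
    {
        unsigned n = 0;
        while ((val & 0xFF) == 0) { val >>= 8; n++; }
        return n;
    }

    /* trick from the patch: val ^ (val - 1) sets every bit up to and
     * including the lowest set bit; masking with (m - 1) keeps one marker
     * bit per fully zeroed low byte; multiplying by m = 0x0101..01 sums
     * those markers into the top byte, which the final shift extracts. */
    static unsigned sw_nb_common_bytes(uint64_t val)
    {
        const uint64_t m = 0x0101010101010101ULL;
        assert(val != 0);
        val ^= val - 1;
        return (unsigned)(((val & (m - 1)) * m) >> 56);
    }

    int main(void)
    {
        uint64_t v;
        for (v = 1; v != 0; v <<= 1)   /* every single-bit pattern */
            assert(sw_nb_common_bytes(v) == ref_nb_common_bytes(v));
        assert(sw_nb_common_bytes(0x1200000000000000ULL) == 7);
        assert(sw_nb_common_bytes(0x0000000000ABCD00ULL) == 1);
        printf("software byte-count fallback matches reference\n");
        return 0;
    }

Compiling with `LZ4_FORCE_SW_BITCOUNT` defined (see the README note above) makes lz4.c take the equivalent path instead of the `__builtin_ctzll` / `_tzcnt_u64` intrinsics.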