author     Yann Collet <cyan@fb.com>  2020-08-26 05:17:29 (GMT)
committer  Yann Collet <cyan@fb.com>  2020-08-26 05:17:29 (GMT)
commit     5243173b23b4deff16a5685ebfcf493e896a756e (patch)
tree       e0f9c444b6ea3a1145c8b7652769c4968f690700
parent     ee0a3cfa0c719871fcdc19bd847483828fb7a07d (diff)
added documentation about LZ4_FORCE_SW_BITCOUNT
Also : added a memory-frugal software byte count for big-endian 64-bit cpus. Disabled by default.
-rw-r--r--  lib/README.md  20
-rw-r--r--  lib/lz4.c      21
2 files changed, 33 insertions, 8 deletions
diff --git a/lib/README.md b/lib/README.md
index 707d777..3653c81 100644
--- a/lib/README.md
+++ b/lib/README.md
@@ -46,11 +46,11 @@ and `LZ4F_PUBLISH_STATIC_FUNCTIONS`.
#### Build macros
-The following build macro can be selected at compilation time :
+The following build macros can be selected to adjust source code behavior at compilation time :
-- `LZ4_FAST_DEC_LOOP` : this triggers the optimized decompression loop.
- This loops works great on x86/x64 cpus, and is automatically enabled on this platform.
- It's possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor.
+- `LZ4_FAST_DEC_LOOP` : this triggers a speed-optimized decompression loop that is most effective on modern cpus.
+ This loop works great on x86, x64 and aarch64 cpus, and is automatically enabled for them.
+ It's also possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor.
For example, with `gcc` : `-DLZ4_FAST_DEC_LOOP=1`,
and with `make` : `CPPFLAGS+=-DLZ4_FAST_DEC_LOOP=1 make lz4`.
@@ -66,9 +66,17 @@ The following build macro can be selected at compilation time :
Should this be a problem, it's generally possible to make the compiler ignore these warnings,
for example with `-Wno-deprecated-declarations` on `gcc`,
or `_CRT_SECURE_NO_WARNINGS` for Visual Studio.
- Another method is to define `LZ4_DISABLE_DEPRECATE_WARNINGS`
+ Another project-specific method is to define `LZ4_DISABLE_DEPRECATE_WARNINGS`
before including the LZ4 header files.
+- `LZ4_FORCE_SW_BITCOUNT` : by default, the compression algorithm tries to determine match lengths
+ by using bitcount instructions, generally implemented as fast single instructions on many cpus.
+ In case the target cpu doesn't support these instructions, or the compiler intrinsic doesn't work or features bad performance,
+ it's possible to use an optimized software path instead.
+ This is achieved by setting this build macro.
+ In most cases, it's not expected to be necessary,
+ but it can be legitimately considered for less common platforms.
+
#### Amalgamation
@@ -103,7 +111,7 @@ The compiled executable will require LZ4 DLL which is available at `dll\liblz4.d
#### Miscellaneous
-Other files present in the directory are not source code. There are :
+Other files present in the directory are not source code. They are :
- `LICENSE` : contains the BSD license text
- `Makefile` : `make` script to compile and install lz4 library (static and dynamic)
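As a rough illustration of what the `LZ4_FORCE_SW_BITCOUNT` documentation added above describes, here is a minimal standalone sketch (not lz4's actual code; `count_trailing_zero_bytes` is a hypothetical helper invented for this example) of how such a build macro typically switches between a hardware bit-count intrinsic and a portable software fallback:

#include <stdint.h>
#include <stdio.h>

/* hypothetical helper : number of low-order zero bytes of a non-zero value,
 * i.e. a bit count turned into a byte count (ctz(v) >> 3) */
static unsigned count_trailing_zero_bytes(uint64_t v)
{
#if (defined(__GNUC__) || defined(__clang__)) && !defined(LZ4_FORCE_SW_BITCOUNT)
    return (unsigned)__builtin_ctzll(v) >> 3;      /* hardware path : single instruction on most cpus */
#else
    unsigned n = 0;                                 /* software path : portable, no intrinsic needed */
    while (n < 7 && (v & 0xFF) == 0) { n++; v >>= 8; }
    return n;
#endif
}

int main(void)
{
    printf("%u\n", count_trailing_zero_bytes(0x0000000000FF0000ULL));   /* prints 2 */
    return 0;
}

Compiling this with `gcc -DLZ4_FORCE_SW_BITCOUNT` forces the software branch, in the same way the README already demonstrates `-DLZ4_FAST_DEC_LOOP=1` for the decompression loop.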
diff --git a/lib/lz4.c b/lib/lz4.c
index e10b58e..0ca7b21 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -88,6 +88,7 @@
* Define this parameter if your target system or compiler does not support hardware bit count
*/
#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */
+# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */
# define LZ4_FORCE_SW_BITCOUNT
#endif
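A side note on the `/* avoid double def */` guard above, sketched here rather than taken from the patch: a command-line `-DLZ4_FORCE_SW_BITCOUNT` defines the macro with the value `1`, while the in-file `#define LZ4_FORCE_SW_BITCOUNT` gives it an empty body, and redefining a macro with a different body draws a compiler warning. Undefining it first keeps the macro set for WinCE builds without that warning:

/* sketch : try compiling with and without -DLZ4_FORCE_SW_BITCOUNT on the command line */
#if defined(_MSC_VER) && defined(_WIN32_WCE)
#  undef  LZ4_FORCE_SW_BITCOUNT     /* drops any command-line definition such as `1` */
#  define LZ4_FORCE_SW_BITCOUNT     /* re-defines it with an empty body, warning-free */
#endif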
@@ -527,6 +528,9 @@ static unsigned LZ4_NbCommonBytes (reg_t val)
!defined(LZ4_FORCE_SW_BITCOUNT)
return (unsigned)__builtin_clzll((U64)val) >> 3;
# else
+#if 1
+ /* this method is probably faster,
+ * but adds a 128-byte lookup table */
static const unsigned char ctz7_tab[128] = {
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
@@ -537,9 +541,22 @@ static unsigned LZ4_NbCommonBytes (reg_t val)
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
- const U64 mask = 0x0101010101010101ULL;
- U64 t = (((val >> 8) - mask) | val) & mask;
+ U64 const mask = 0x0101010101010101ULL;
+ U64 const t = (((val >> 8) - mask) | val) & mask;
return ctz7_tab[(t * 0x0080402010080402ULL) >> 57];
+#else
+ /* this method doesn't consume memory space like the previous one,
+ * but it contains several branches,
+ * which may end up slowing execution down */
+ static const U32 by32 = sizeof(val)*4; /* 32 on 64-bit (goal), 16 on 32-bit.
+ Just to avoid a static analyzer complaining about a shift by 32 on a 32-bit target.
+ Note that this code path is never triggered in 32-bit mode. */
+ unsigned r;
+ if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; }
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+ r += (!val);
+ return r;
+#endif
# endif
} else /* 32 bits */ {
# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \
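The two software byte-count strategies added above compute the same quantity. For readers who want to check that, here is a small standalone harness (not part of the patch; every function name is invented for the sketch) comparing the table-based path and the new branch-based path against a naive byte scan, assuming 64-bit values and, as in `LZ4_NbCommonBytes`, a non-zero input:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* reference : count zero bytes from the most significant end,
 * which is what the big-endian 64-bit branch is meant to return */
static unsigned leading_zero_bytes_ref(uint64_t v)
{
    unsigned n = 0;
    while (n < 8 && ((v >> (56 - 8*n)) & 0xFF) == 0) n++;
    return n;
}

/* method 1 : table-based path (the "#if 1" branch).
 * ctz7_tab[x] is simply the number of trailing zero bits of the 7-bit
 * index x, with ctz7_tab[0] = 7; it is rebuilt at runtime here to keep
 * the sketch short. */
static unsigned char ctz7_tab[128];

static void init_ctz7_tab(void)
{
    int i;
    ctz7_tab[0] = 7;
    for (i = 1; i < 128; i++) {
        unsigned c = 0;
        while (((i >> c) & 1) == 0) c++;
        ctz7_tab[i] = (unsigned char)c;
    }
}

static unsigned leading_zero_bytes_tab(uint64_t val)
{
    uint64_t const mask = 0x0101010101010101ULL;
    uint64_t const t = (((val >> 8) - mask) | val) & mask;
    return ctz7_tab[(t * 0x0080402010080402ULL) >> 57];
}

/* method 2 : branch-based path (the "#else" branch), memory-frugal but
 * with data-dependent branches; sizeof(val)*4 is written as a plain 32
 * since this sketch is 64-bit only */
static unsigned leading_zero_bytes_branch(uint64_t val)
{
    unsigned r;
    if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
    if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
    r += (!val);
    return r;
}

int main(void)
{
    int i;
    init_ctz7_tab();
    srand(0);
    for (i = 0; i < 1000000; i++) {
        uint64_t v = ((uint64_t)rand() << 48) ^ ((uint64_t)rand() << 24) ^ (uint64_t)rand();
        v &= ~0ULL >> (8 * (rand() % 8));   /* vary the number of leading zero bytes */
        if (v == 0) continue;               /* both methods require val != 0 */
        unsigned const ref = leading_zero_bytes_ref(v);
        if (leading_zero_bytes_tab(v) != ref || leading_zero_bytes_branch(v) != ref) {
            printf("mismatch on %016llx\n", (unsigned long long)v);
            return 1;
        }
    }
    printf("both software paths agree with the byte-scan reference\n");
    return 0;
}

The trade-off the comments describe is visible here: the table path spends 128 bytes of read-only data to stay branch-free after the multiply, while the branch path spends no memory but takes data-dependent branches, which is why the patch leaves it behind `#else`, disabled by default.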