summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorYann Collet <cyan@fb.com>2017-11-01 00:55:01 (GMT)
committerYann Collet <cyan@fb.com>2017-11-01 00:55:01 (GMT)
commit15b0d229c1d4a040d63f814c31a2a025145f29c7 (patch)
treeae7d9e0b837a2625be7ad75df65fdf364bbc93e5 /lib
parent931c5c20d0d87e706e3dd9bf57e089500455c987 (diff)
parent07c91d9dba3a2b5593f262185d34d45e6c25c7fc (diff)
downloadlz4-15b0d229c1d4a040d63f814c31a2a025145f29c7.zip
lz4-15b0d229c1d4a040d63f814c31a2a025145f29c7.tar.gz
lz4-15b0d229c1d4a040d63f814c31a2a025145f29c7.tar.bz2
Merge branch 'dev' into btopt
Diffstat (limited to 'lib')
-rw-r--r--lib/lz4.c35
-rw-r--r--lib/lz4.h18
2 files changed, 45 insertions, 8 deletions
diff --git a/lib/lz4.c b/lib/lz4.c
index 016e5c7..e3e9172 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -117,6 +117,28 @@
# endif /* _MSC_VER */
#endif /* LZ4_FORCE_INLINE */
+/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+ * Gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy,
+ * together with a simple 8-byte copy loop as a fall-back path.
+ * However, this optimization hurts the decompression speed by >30%,
+ * because the execution does not go to the optimized loop
+ * for typical compressible data, and all of the preamble checks
+ * before going to the fall-back path become useless overhead.
+ * This optimization happens only with the -O3 flag, and -O2 generates
+ * a simple 8-byte copy loop.
+ * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy
+ * functions are annotated with __attribute__((optimize("O2"))),
+ * and also LZ4_wildCopy is forcibly inlined, so that the O2 attribute
+ * of LZ4_wildCopy does not affect the compression speed.
+ */
+#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__)
+# define LZ4_FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
+# define LZ4_FORCE_O2_INLINE_GCC_PPC64LE __attribute__((optimize("O2"))) LZ4_FORCE_INLINE
+#else
+# define LZ4_FORCE_O2_GCC_PPC64LE
+# define LZ4_FORCE_O2_INLINE_GCC_PPC64LE static
+#endif
+
#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
# define expect(expr,value) (__builtin_expect ((expr),(value)) )
#else
@@ -253,7 +275,8 @@ static void LZ4_copy8(void* dst, const void* src)
}
/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
-static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
+LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
{
BYTE* d = (BYTE*)dstPtr;
const BYTE* s = (const BYTE*)srcPtr;
@@ -1120,6 +1143,7 @@ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
* Note that it is important for performance that this function really get inlined,
* in order to remove useless branches during compilation optimization.
*/
+LZ4_FORCE_O2_GCC_PPC64LE
LZ4_FORCE_INLINE int LZ4_decompress_generic(
const char* const src,
char* const dst,
@@ -1280,16 +1304,19 @@ _output_error:
}
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
{
return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
}
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
{
return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
}
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
{
return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
@@ -1335,6 +1362,7 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti
If it's not possible, save the relevant part of decoded data into a safe buffer,
and indicate where it stands using LZ4_setStreamDecode()
*/
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
{
LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse;
@@ -1361,6 +1389,7 @@ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const ch
return result;
}
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
{
LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse;
@@ -1395,6 +1424,7 @@ Advanced decoding functions :
the dictionary must be explicitly provided within parameters
*/
+LZ4_FORCE_O2_GCC_PPC64LE
LZ4_FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
{
if (dictSize==0)
@@ -1407,17 +1437,20 @@ LZ4_FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char*
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
}
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize);
}
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize);
}
/* debug function */
+LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
diff --git a/lib/lz4.h b/lib/lz4.h
index d284d63..509d942 100644
--- a/lib/lz4.h
+++ b/lib/lz4.h
@@ -72,20 +72,24 @@ extern "C" {
/*
* LZ4_DLL_EXPORT :
* Enable exporting of functions when building a Windows DLL
-* LZ4LIB_API :
+* LZ4LIB_VISIBILITY :
* Control library symbols visibility.
*/
+#ifndef LZ4LIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define LZ4LIB_VISIBILITY
+# endif
+#endif
#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
-# define LZ4LIB_API __declspec(dllexport)
+# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY
#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
-# define LZ4LIB_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
-#elif defined(__GNUC__) && (__GNUC__ >= 4)
-# define LZ4LIB_API __attribute__ ((__visibility__ ("default")))
+# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
#else
-# define LZ4LIB_API
+# define LZ4LIB_API LZ4LIB_VISIBILITY
#endif
-
/*------ Version ------*/
#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */
#define LZ4_VERSION_MINOR 8 /* for new (non-breaking) interface capabilities */