From b3b207c4db9378f3eb3518c0778ff03a9b5cdbb3 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 23 Nov 2014 00:46:15 +0100 Subject: New endian & alignment code --- NEWS | 10 +- lz4.c | 420 +++++++++++++++++++++++++++++++++------------------------------ lz4.h | 12 +- lz4hc.h | 6 +- xxhash.c | 72 +++++------ 5 files changed, 275 insertions(+), 245 deletions(-) diff --git a/NEWS b/NEWS index 7edee54..763fa5a 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,12 @@ +r125: +Changed : endian and alignment code +Fixed : some alignment warnings under clang + r124: -Fix : LZ4F_compressBound() using NULL preferencesPtr -Updated : xxHash, to r37 +New : LZ4 HC streaming mode +Fixed : LZ4F_compressBound() using null preferencesPtr +Updated : xxHash to r38 +Updated library number, to 1.4.0 r123: Added : experimental lz4frame API, thanks to Takayuki Matsuoka and Christopher Jackson for testings diff --git a/lz4.c b/lz4.c index 198b581..579a4fc 100644 --- a/lz4.c +++ b/lz4.c @@ -31,63 +31,37 @@ - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ + /************************************** Tuning parameters **************************************/ /* * HEAPMODE : * Select how default compression functions will allocate memory for their hash table, - * in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). */ #define HEAPMODE 0 +/* + * CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS : + * You can force the code to use unaligned memory access if you know your CPU can handle it. + */ +/* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */ + /************************************** CPU Feature Detection **************************************/ -/* 32 or 64 bits ? */ -#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ - || defined(__64BIT__) || defined(__mips64) \ - || defined(__powerpc64__) || defined(__powerpc64le__) \ - || defined(__ppc64__) || defined(__ppc64le__) \ - || defined(__PPC64__) || defined(__PPC64LE__) \ - || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) \ - || defined(__s390x__) ) /* Detects 64 bits mode */ -# define LZ4_ARCH64 1 -#else -# define LZ4_ARCH64 0 -#endif -#define LZ4_32BITS (sizeof(void*)==4) -#define LZ4_64BITS (sizeof(void*)==8) - /* - * Little Endian or Big Endian ? - * Overwrite the #define below if you know your architecture endianess + * Unaligned memory access detection */ -#include /* Apparently required to detect endianess */ -#if defined (__GLIBC__) -# include -# if (__BYTE_ORDER == __BIG_ENDIAN) -# define LZ4_BIG_ENDIAN 1 -# endif -#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) -# define LZ4_BIG_ENDIAN 1 -#elif defined(__sparc) || defined(__sparc__) \ - || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ - || defined(__hpux) || defined(__hppa) \ - || defined(_MIPSEB) || defined(__s390__) -# define LZ4_BIG_ENDIAN 1 +#if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \ + || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(__i386__) || defined(__x86_64__) \ + || defined(_M_IX86) || defined(_M_X64) +# define LZ4_UNALIGNED_ACCESS 1 #else -/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */ -#endif - -/* - * Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
- * For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property - * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance - */ -#if defined(__ARM_FEATURE_UNALIGNED) -# define LZ4_FORCE_UNALIGNED_ACCESS 1 +# define LZ4_UNALIGNED_ACCESS 0 #endif /* Define this parameter if your target system or compiler does not support hardware bit count */ @@ -95,18 +69,9 @@ # define LZ4_FORCE_SW_BITCOUNT #endif -/* - * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : - * This option may provide a small boost to performance for some big endian cpu, although probably modest. - * You may set this option to 1 if data will remain within closed environment. - * This option is useless on Little_Endian CPU (such as x86) - */ - -/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ - /************************************** - Compiler Options + Compiler Options **************************************/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ /* "restrict" is a known keyword */ @@ -117,13 +82,6 @@ #ifdef _MSC_VER /* Visual Studio */ # define FORCE_INLINE static __forceinline # include /* For Visual 2005 */ -# if LZ4_ARCH64 /* 64-bits */ -# pragma intrinsic(_BitScanForward64) /* For Visual 2005 */ -# pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */ -# else /* 32-bits */ -# pragma intrinsic(_BitScanForward) /* For Visual 2005 */ -# pragma intrinsic(_BitScanReverse) /* For Visual 2005 */ -# endif # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #else # ifdef __GNUC__ @@ -133,12 +91,6 @@ # endif #endif -#ifdef _MSC_VER /* Visual Studio */ -# define lz4_bswap16(x) _byteswap_ushort(x) -#else -# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) -#endif - #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) #if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) @@ -185,37 +137,167 @@ typedef unsigned long long U64; #endif -#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) -# define _PACKED __attribute__ ((packed)) -#else -# define _PACKED -#endif -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(1) -# else -# pragma pack(push, 1) -# endif -#endif +/************************************** + Reading and writing into memory +**************************************/ +static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } -typedef struct { U16 v; } _PACKED U16_S; -typedef struct { U32 v; } _PACKED U32_S; -typedef struct { U64 v; } _PACKED U64_S; -typedef struct {size_t v;} _PACKED size_t_S; +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(0) -# else -# pragma pack(pop) -# endif -#endif +static U16 LZ4_readLE16(const void* memPtr) +{ + if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) + return *(U16*)memPtr; + { + const BYTE* p = memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) + { + *(U16*)memPtr = value; + return; + } + { + BYTE* p = memPtr; + p[0] = (BYTE) value; + p[1] = 
(BYTE)(value>>8); + } +} + +static U32 LZ4_readLE32(const void* memPtr) +{ + if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) + return *(U32*)memPtr; + { + const BYTE* p = memPtr; + U32 result = (U32)((U32)p[0] + (p[1]<<8) + (p[2]<<16) + ((U32)p[3]<<24)); + return result; + } +} + +/* +static void LZ4_writeLE32(void* memPtr, U32 value) +{ + BYTE* p = memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + p[2] = (BYTE)(value>>16); + p[3] = (BYTE)(value>>24); +} +*/ + +static void LZ4_copy4(void* dstPtr, const void* srcPtr) +{ + if (LZ4_UNALIGNED_ACCESS) + { + *(U32*)dstPtr = *(U32*)srcPtr; + return; + } + { + BYTE* d = dstPtr; + const BYTE* s = srcPtr; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + } +} + +static U64 LZ4_readLE64(const void* memPtr) +{ + if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) + return *(U64*)memPtr; + { + const BYTE* p = memPtr; + return (U64)((U64)p[0] + (p[1]<<8) + (p[2]<<16) + ((U64)p[3]<<24) + + (((U64)p[4])<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56)); + } +} + +/* +static void LZ4_writeLE64(void* memPtr, U64 value) +{ + BYTE* p = memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + p[2] = (BYTE)(value>>16); + p[3] = (BYTE)(value>>24); + p[4] = (BYTE)(value>>32); + p[5] = (BYTE)(value>>40); + p[6] = (BYTE)(value>>48); + p[7] = (BYTE)(value>>56); +} +*/ + +static void LZ4_copy8(void* dstPtr, const void* srcPtr) +{ + if (LZ4_UNALIGNED_ACCESS) + { + if (LZ4_64bits()) + *(U64*)dstPtr = *(U64*)srcPtr; + else + ((U32*)dstPtr)[0] = ((U32*)srcPtr)[0], + ((U32*)dstPtr)[1] = ((U32*)srcPtr)[1]; + return; + } + { + BYTE* d = dstPtr; + const BYTE* s = srcPtr; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + } +} -#define A16(x) (((U16_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) -#define AARCH(x) (((size_t_S *)(x))->v) +#define STEPSIZE sizeof(size_t) + +static size_t LZ4_readLE_ARCH(const void* p) +{ + if (LZ4_64bits()) + return (size_t)LZ4_readLE64(p); + else + return (size_t)LZ4_readLE32(p); +} + +/* +static void LZ4_writeLE_ARCH(void* p, size_t value) +{ + if (LZ4_64BITS) + LZ4_writeLE64(p, (U64)value); + else + LZ4_writeLE32(p, (U32)value); +} + +static void LZ4_copyARCH(void* dstPtr, const void* srcPtr) +{ + if (LZ4_64BITS) + LZ4_copy8(dstPtr, srcPtr); + else + LZ4_copy4(dstPtr, srcPtr); +} +*/ + +#if !defined(__GNUC__) +# define LZ4_WILDCOPY(d,s,e) { do { LZ4_copy8(d,s); d+=8; s+=8; } while (d=e; */ +#else +# define LZ4_WILDCOPY64(d,s,e) { do { LZ4_copy8(d,s); d+=8; s+=8; } while (d=e; */ +# define LZ4_WILDCOPY32(d,s,e) { if (likely(e-d <= 8)) { LZ4_copy8(d,s); d+=8; s+=8; } else do { LZ4_copy8(d,s); d+=8; s+=8; } while (d=e; */ -#else -# define LZ4_WILDCOPY(d,s,e) { if (likely(e-d <= 8)) LZ4_COPY8(d,s) else do { LZ4_COPY8(d,s) } while (d>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); -# else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; - _BitScanForward64( &r, val ); + _BitScanForward64( &r, (U64)val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return 
(__builtin_ctzll(val) >> 3); + return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif -# endif -} - -#else - -static int LZ4_NbCommonBytes (register U32 val) -{ -# if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -# else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif -# else + } + /* 32 bits */ + { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; - _BitScanForward( &r, val ); + _BitScanForward( &r, (U32)val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); + return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif -# endif + } } -#endif - - -/******************************** - Compression functions -********************************/ -int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } -int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } -static int LZ4_hashSequence(U32 sequence, tableType_t tableType) +static U32 LZ4_hashSequence(U32 sequence, tableType_t tableType) { if (tableType == byU16) return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); @@ -380,15 +403,15 @@ static int LZ4_hashSequence(U32 sequence, tableType_t tableType) return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); } -static int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } +static U32 LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(LZ4_readLE32(p), tableType); } static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) { switch (tableType) { - case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } + case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } } } @@ -417,13 +440,13 @@ static unsigned LZ4_count(const BYTE* pIn, const BYTE* pRef, const BYTE* pInLimi while (likely(pIn=lowRefLimit) : 1) && (ref+MAX_DISTANCE>=ip) - && (A32(ref+refDelta)==A32(ip)) ) + && (LZ4_readLE32(ref+refDelta)==LZ4_readLE32(ip)) ) { token=op++; *token=0; goto _next_match; } /* Prepare next loop */ @@ -646,16 +669,16 @@ _last_literals: int LZ4_compress(const char* source, char* dest, int inputSize) { #if (HEAPMODE) - void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U32, 4); /* Aligned on 
4-bytes boundaries */ + void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U64, 8); /* Aligned on 8-bytes boundaries */ #else - U32 ctx[LZ4_STREAMSIZE_U32] = {0}; /* Ensure data is aligned on 4-bytes boundaries */ + U64 ctx[LZ4_STREAMSIZE_U64] = {0}; /* Ensure data is aligned on 8-bytes boundaries */ #endif int result; if (inputSize < (int)LZ4_64KLIMIT) result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue); else - result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue); + result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue); #if (HEAPMODE) FREEMEM(ctx); @@ -666,16 +689,16 @@ int LZ4_compress(const char* source, char* dest, int inputSize) int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { #if (HEAPMODE) - void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U32, 4); /* Aligned on 4-bytes boundaries */ + void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U64, 4); /* Aligned on 8-bytes boundaries */ #else - U32 ctx[LZ4_STREAMSIZE_U32] = {0}; /* Ensure data is aligned on 4-bytes boundaries */ + U64 ctx[LZ4_STREAMSIZE_U64] = {0}; /* Ensure data is aligned on 8-bytes boundaries */ #endif int result; if (inputSize < (int)LZ4_64KLIMIT) result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue); else - result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue); + result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue); #if (HEAPMODE) FREEMEM(ctx); @@ -700,7 +723,7 @@ void LZ4_resetStream (LZ4_stream_t* LZ4_stream) LZ4_stream_t* LZ4_createStream(void) { - LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(4, LZ4_STREAMSIZE_U32); + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ LZ4_resetStream(lz4s); return lz4s; @@ -959,11 +982,12 @@ FORCE_INLINE int LZ4_decompress_generic( LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; /* get offset */ - LZ4_READ_LITTLEENDIAN_16(match,cpy,ip); ip+=2; + match = cpy - LZ4_readLE16(ip); ip+=2; if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ /* get matchlength */ - if ((length=(token&ML_MASK)) == ML_MASK) + length = token & ML_MASK; + if (length == ML_MASK) { unsigned s; do @@ -1020,9 +1044,9 @@ FORCE_INLINE int LZ4_decompress_generic( op[2] = match[2]; op[3] = match[3]; match += dec32table[op-match]; - A32(op+4) = A32(match); + LZ4_copy4(op+4, match); op += 8; match -= dec64; - } else { LZ4_COPY8(op,match); } + } else { LZ4_copy8(op, match); op+=8; match+=8; } if (unlikely(cpy>oend-12)) { @@ -1079,7 +1103,7 @@ typedef struct */ LZ4_streamDecode_t* LZ4_createStreamDecode(void) { - LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(sizeof(U32), LZ4_STREAMDECODESIZE_U32); + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(sizeof(U64), LZ4_STREAMDECODESIZE_U64); return lz4s; } @@ -1241,7 +1265,7 @@ int LZ4_resetStreamState(void* state, const char* inputBuffer) void* LZ4_create (const char* inputBuffer) { - void* lz4ds = ALLOCATOR(4, LZ4_STREAMSIZE_U32); + void* lz4ds = 
ALLOCATOR(8, LZ4_STREAMSIZE_U64); LZ4_init ((LZ4_stream_t_internal*)lz4ds, (const BYTE*)inputBuffer); return lz4ds; } @@ -1267,7 +1291,7 @@ int LZ4_compress_withState (void* state, const char* source, char* dest, int inp if (inputSize < (int)LZ4_64KLIMIT) return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue); else - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue); + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue); } int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize) @@ -1278,7 +1302,7 @@ int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* if (inputSize < (int)LZ4_64KLIMIT) return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue); else - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue); + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue); } /* Obsolete streaming decompression functions */ diff --git a/lz4.h b/lz4.h index 0350f6a..f995b05 100644 --- a/lz4.h +++ b/lz4.h @@ -172,14 +172,14 @@ int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedS Experimental Streaming Compression Functions ***********************************************/ -#define LZ4_STREAMSIZE_U32 ((1 << (LZ4_MEMORY_USAGE-2)) + 8) -#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U32 * sizeof(unsigned int)) +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long)) /* * LZ4_stream_t * information structure to track an LZ4 stream. * important : init this structure content before first use ! */ -typedef struct { unsigned int table[LZ4_STREAMSIZE_U32]; } LZ4_stream_t; +typedef struct { unsigned long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; /* * LZ4_resetStream @@ -234,14 +234,14 @@ int LZ4_saveDict (LZ4_stream_t* LZ4_streamPtr, char* safeBuffer, int dictSize); Experimental Streaming Decompression Functions ************************************************/ -#define LZ4_STREAMDECODESIZE_U32 8 -#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U32 * sizeof(unsigned int)) +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) /* * LZ4_streamDecode_t * information structure to track an LZ4 stream. * important : init this structure content using LZ4_setStreamDecode or memset() before first use ! 
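A quick size check (not part of the patch; it assumes the default LZ4_MEMORY_USAGE of 14 from lz4.h) shows that switching the state tables from unsigned int to unsigned long long changes only the alignment guarantee, not the byte size of any structure:

    LZ4_STREAMSIZE       : old ((1<<(14-2))+8) * 4 = 4104 * 4 = 16416 bytes
                           new ((1<<(14-3))+4) * 8 = 2052 * 8 = 16416 bytes
    LZ4_STREAMDECODESIZE : old 8 * 4 = 32 bytes            new 4 * 8 = 32 bytes
    LZ4_STREAMHCSIZE     : old 65548 * 4 = 262192 bytes    new 32774 * 8 = 262192 bytes  (lz4hc.h, further below)

The structures therefore keep the same size while becoming naturally 8-byte aligned, which is what the new U64-based ALLOCATOR calls in lz4.c rely on.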
*/ -typedef struct { unsigned int table[LZ4_STREAMDECODESIZE_U32]; } LZ4_streamDecode_t; +typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; /* * If you prefer dynamic allocation methods, diff --git a/lz4hc.h b/lz4hc.h index 23ad7c1..26ba6d3 100644 --- a/lz4hc.h +++ b/lz4hc.h @@ -105,9 +105,9 @@ They just use the externally allocated memory for state instead of allocating th /************************************** Experimental Streaming Functions **************************************/ -#define LZ4_STREAMHCSIZE_U32 65548 -#define LZ4_STREAMHCSIZE (LZ4_STREAMHCSIZE_U32 * sizeof(unsigned int)) -typedef struct { unsigned int table[LZ4_STREAMHCSIZE_U32]; } LZ4_streamHC_t; +#define LZ4_STREAMHCSIZE_U64 32774 +#define LZ4_STREAMHCSIZE (LZ4_STREAMHCSIZE_U64 * sizeof(unsigned long long)) +typedef struct { unsigned long long table[LZ4_STREAMHCSIZE_U64]; } LZ4_streamHC_t; /* This structure allows static allocation of LZ4 HC streaming state. diff --git a/xxhash.c b/xxhash.c index e6c2f31..24a64b5 100644 --- a/xxhash.c +++ b/xxhash.c @@ -84,11 +84,11 @@ You can contact the author at : // Modify the local functions below should you wish to use some other memory routines // for malloc(), free() #include -FORCE_INLINE void* XXH_malloc(size_t s) { return malloc(s); } -FORCE_INLINE void XXH_free (void* p) { free(p); } +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free (void* p) { free(p); } // for memcpy() #include -FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) +static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } @@ -221,28 +221,28 @@ static const int one = 1; //**************************** typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; -FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align) +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); else - return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr); + return endian==XXH_littleEndian ? *(U32*)ptr : XXH_swap32(*(U32*)ptr); } -FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) +FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } -FORCE_INLINE U64 XXH_readLE64_align(const U64* ptr, XXH_endianess endian, XXH_alignment align) +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); else - return endian==XXH_littleEndian ? *ptr : XXH_swap64(*ptr); + return endian==XXH_littleEndian ? 
*(U64*)ptr : XXH_swap64(*(U64*)ptr); } -FORCE_INLINE U64 XXH_readLE64(const U64* ptr, XXH_endianess endian) +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) { return XXH_readLE64_align(ptr, endian, XXH_unaligned); } @@ -256,7 +256,7 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; U32 h32; -#define XXH_get32bits(p) XXH_readLE32_align((const U32*)p, endian, align) +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (p==NULL) @@ -361,7 +361,7 @@ FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; U64 h64; -#define XXH_get64bits(p) XXH_readLE64_align((const U64*)p, endian, align) +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (p==NULL) @@ -509,8 +509,8 @@ typedef struct U32 v2; U32 v3; U32 v4; + U32 mem32[4]; /* defined as U32 for alignment */ U32 memsize; - char memory[16]; } XXH_istate32_t; typedef struct @@ -521,8 +521,8 @@ typedef struct U64 v2; U64 v3; U64 v4; + U64 mem64[4]; /* defined as U64 for alignment */ U32 memsize; - char memory[32]; } XXH_istate64_t; @@ -592,16 +592,16 @@ FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const v if (state->memsize + len < 16) // fill in tmp buffer { - XXH_memcpy(state->memory + state->memsize, input, len); + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); state->memsize += (U32)len; return XXH_OK; } if (state->memsize) // some data left from previous update { - XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize); + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); { - const U32* p32 = (const U32*)state->memory; + const U32* p32 = state->mem32; state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; @@ -633,19 +633,19 @@ FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const v do { - v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v1 += XXH_readLE32(p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; - v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v2 += XXH_readLE32(p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; - v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v3 += XXH_readLE32(p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; - v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v4 += XXH_readLE32(p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; @@ -660,7 +660,7 @@ FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const v if (p < bEnd) { - XXH_memcpy(state->memory, p, bEnd-p); + XXH_memcpy(state->mem32, p, bEnd-p); state->memsize = (int)(bEnd-p); } @@ -682,8 +682,8 @@ XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t l FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian) { XXH_istate32_t* state = (XXH_istate32_t*) state_in; - const BYTE * p = (const BYTE*)state->memory; - BYTE* bEnd = (BYTE*)state->memory + state->memsize; + const BYTE * p = (const BYTE*)state->mem32; + BYTE* bEnd = (BYTE*)(state->mem32) + state->memsize; U32 h32; if (state->total_len >= 16) @@ -699,7 +699,7 @@ FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, 
XXH_endiane while (p+4<=bEnd) { - h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3; + h32 += XXH_readLE32(p, endian) * PRIME32_3; h32 = XXH_rotl32(h32, 17) * PRIME32_4; p+=4; } @@ -746,16 +746,16 @@ FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const v if (state->memsize + len < 32) // fill in tmp buffer { - XXH_memcpy(state->memory + state->memsize, input, len); + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); state->memsize += (U32)len; return XXH_OK; } if (state->memsize) // some data left from previous update { - XXH_memcpy(state->memory + state->memsize, input, 32-state->memsize); + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); { - const U64* p64 = (const U64*)state->memory; + const U64* p64 = state->mem64; state->v1 += XXH_readLE64(p64, endian) * PRIME64_2; state->v1 = XXH_rotl64(state->v1, 31); state->v1 *= PRIME64_1; @@ -787,19 +787,19 @@ FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const v do { - v1 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v1 += XXH_readLE64(p, endian) * PRIME64_2; v1 = XXH_rotl64(v1, 31); v1 *= PRIME64_1; p+=8; - v2 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v2 += XXH_readLE64(p, endian) * PRIME64_2; v2 = XXH_rotl64(v2, 31); v2 *= PRIME64_1; p+=8; - v3 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v3 += XXH_readLE64(p, endian) * PRIME64_2; v3 = XXH_rotl64(v3, 31); v3 *= PRIME64_1; p+=8; - v4 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v4 += XXH_readLE64(p, endian) * PRIME64_2; v4 = XXH_rotl64(v4, 31); v4 *= PRIME64_1; p+=8; @@ -814,7 +814,7 @@ FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const v if (p < bEnd) { - XXH_memcpy(state->memory, p, bEnd-p); + XXH_memcpy(state->mem64, p, bEnd-p); state->memsize = (int)(bEnd-p); } @@ -836,8 +836,8 @@ XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t l FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian) { XXH_istate64_t * state = (XXH_istate64_t *) state_in; - const BYTE * p = (const BYTE*)state->memory; - BYTE* bEnd = (BYTE*)state->memory + state->memsize; + const BYTE * p = (const BYTE*)state->mem64; + BYTE* bEnd = (BYTE*)state->mem64 + state->memsize; U64 h64; if (state->total_len >= 32) @@ -882,7 +882,7 @@ FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endiane while (p+8<=bEnd) { - U64 k1 = XXH_readLE64((const U64*)p, endian); + U64 k1 = XXH_readLE64(p, endian); k1 *= PRIME64_2; k1 = XXH_rotl64(k1,31); k1 *= PRIME64_1; @@ -893,7 +893,7 @@ FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endiane if (p+4<=bEnd) { - h64 ^= (U64)(XXH_readLE32((const U32*)p, endian)) * PRIME64_1; + h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1; h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p+=4; } -- cgit v0.12 From 2ad37dbe58eb3709e5dee16ebcf345bd53680f85 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 23 Nov 2014 01:14:04 +0100 Subject: Corrected lz4io --- lz4.c | 11 +++++++++-- lz4.h | 2 +- programs/lz4cli.c | 2 +- programs/lz4io.c | 2 +- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lz4.c b/lz4.c index 579a4fc..7e454fd 100644 --- a/lz4.c +++ b/lz4.c @@ -44,7 +44,7 @@ /* * CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS : - * You can force the code to use unaligned memory access if you know your CPU can handle it. + * You can force the code to use unaligned memory access if you know your CPU can handle it efficiently. 
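 * For example (illustration only, not wording from the patch; it assumes a
 * gcc/clang-style driver, but any compiler able to define a macro on its
 * command line works the same way), the option can be enabled without
 * editing the source :
 *     cc -O3 -DCPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS=1 -c lz4.c
 * The detection below only tests defined(), so the value assigned does not matter.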
*/ /* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */ @@ -58,7 +58,9 @@ #if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \ || defined(__ARM_FEATURE_UNALIGNED) \ || defined(__i386__) || defined(__x86_64__) \ - || defined(_M_IX86) || defined(_M_X64) + || defined(_M_IX86) || defined(_M_X64) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_8__) \ + || (defined(_M_ARM) && (_M_ARM >= 7)) # define LZ4_UNALIGNED_ACCESS 1 #else # define LZ4_UNALIGNED_ACCESS 0 @@ -153,6 +155,7 @@ static U16 LZ4_readLE16(const void* memPtr) { if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) return *(U16*)memPtr; + else { const BYTE* p = memPtr; return (U16)((U16)p[0] + (p[1]<<8)); @@ -166,6 +169,7 @@ static void LZ4_writeLE16(void* memPtr, U16 value) *(U16*)memPtr = value; return; } + else { BYTE* p = memPtr; p[0] = (BYTE) value; @@ -202,6 +206,7 @@ static void LZ4_copy4(void* dstPtr, const void* srcPtr) *(U32*)dstPtr = *(U32*)srcPtr; return; } + else { BYTE* d = dstPtr; const BYTE* s = srcPtr; @@ -216,6 +221,7 @@ static U64 LZ4_readLE64(const void* memPtr) { if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) return *(U64*)memPtr; + else { const BYTE* p = memPtr; return (U64)((U64)p[0] + (p[1]<<8) + (p[2]<<16) + ((U64)p[3]<<24) + @@ -249,6 +255,7 @@ static void LZ4_copy8(void* dstPtr, const void* srcPtr) ((U32*)dstPtr)[1] = ((U32*)srcPtr)[1]; return; } + else { BYTE* d = dstPtr; const BYTE* s = srcPtr; diff --git a/lz4.h b/lz4.h index f995b05..c9ed49f 100644 --- a/lz4.h +++ b/lz4.h @@ -48,7 +48,7 @@ extern "C" { **************************************/ #define LZ4_VERSION_MAJOR 1 /* for major interface/format changes */ #define LZ4_VERSION_MINOR 4 /* for minor interface/format changes */ -#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ +#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ #define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) int LZ4_versionNumber (void); diff --git a/programs/lz4cli.c b/programs/lz4cli.c index 2d612e7..9a6e5bc 100644 --- a/programs/lz4cli.c +++ b/programs/lz4cli.c @@ -113,7 +113,7 @@ //**************************** #define COMPRESSOR_NAME "LZ4 Compression CLI" #ifndef LZ4_VERSION -# define LZ4_VERSION "r122" +# define LZ4_VERSION "r125" #endif #define AUTHOR "Yann Collet" #define WELCOME_MESSAGE "*** %s %i-bits %s, by %s (%s) ***\n", COMPRESSOR_NAME, (int)(sizeof(void*)*8), LZ4_VERSION, AUTHOR, __DATE__ diff --git a/programs/lz4io.c b/programs/lz4io.c index 3a84866..afaa59f 100644 --- a/programs/lz4io.c +++ b/programs/lz4io.c @@ -357,7 +357,7 @@ int LZ4IO_compressFilename_Legacy(char* input_filename, char* output_filename, i static void* LZ4IO_LZ4_createStream (const char* inputBuffer) { (void)inputBuffer; - return calloc(4, LZ4_STREAMSIZE_U32); + return calloc(8, LZ4_STREAMSIZE_U64); } static int LZ4IO_LZ4_compress_limitedOutput_continue (void* ctx, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel) -- cgit v0.12 From 86fa21b3019d7e9983704661966a64f5aa1456bf Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 23 Nov 2014 18:36:04 +0100 Subject: Added : $(EXT) within install for cross-compilation support --- Makefile | 9 +-------- lz4.c | 5 ++++- programs/Makefile | 14 +++++++------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 586ac5e..c2bbd57 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ # ################################################################ # Version numbers 
-VERSION=124 +VERSION=125 export RELEASE=r$(VERSION) LIBVER_MAJOR=`sed -n '/define LZ4_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < lz4.h` LIBVER_MINOR=`sed -n '/define LZ4_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < lz4.h` @@ -50,13 +50,6 @@ PRGDIR = programs DISTRIBNAME=lz4-$(RELEASE).tar.gz -# Define *.exe as extension for Windows systems -ifneq (,$(filter Windows%,$(OS))) -EXT =.exe -else -EXT = -endif - # OS X linker doesn't support -soname, and use different extension # see : https://developer.apple.com/library/mac/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html ifeq ($(shell uname), Darwin) diff --git a/lz4.c b/lz4.c index 7e454fd..8e81cc2 100644 --- a/lz4.c +++ b/lz4.c @@ -44,7 +44,10 @@ /* * CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS : - * You can force the code to use unaligned memory access if you know your CPU can handle it efficiently. + * You can force the code to use unaligned memory access, should you know your CPU can handle it efficiently. + * If it effectively results in better speed (up to 50% improvement can be expected) + * please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c) + * so that an automatic detection macro can be added to mainline. */ /* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */ diff --git a/programs/Makefile b/programs/Makefile index 1d7e17f..fcfb32c 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -30,7 +30,7 @@ # fullbench32: Same as fullbench, but forced to compile in 32-bits mode # ########################################################################## -RELEASE=r124 +RELEASE=r125 DESTDIR?= PREFIX ?= /usr @@ -113,9 +113,9 @@ ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU)) install: lz4 lz4c @echo Installing binaries @install -d -m 755 $(DESTDIR)$(BINDIR)/ $(DESTDIR)$(MANDIR)/ - @install -m 755 lz4 $(DESTDIR)$(BINDIR)/lz4 - @ln -sf lz4 $(DESTDIR)$(BINDIR)/lz4cat - @install -m 755 lz4c $(DESTDIR)$(BINDIR)/lz4c + @install -m 755 lz4$(EXT) $(DESTDIR)$(BINDIR)/lz4$(EXT) + @ln -sf lz4$(EXT) $(DESTDIR)$(BINDIR)/lz4cat + @install -m 755 lz4c$(EXT) $(DESTDIR)$(BINDIR)/lz4c$(EXT) @echo Installing man pages @install -m 644 lz4.1 $(DESTDIR)$(MANDIR)/lz4.1 @install -m 644 lz4c.1 $(DESTDIR)$(MANDIR)/lz4c.1 @@ -124,12 +124,12 @@ install: lz4 lz4c uninstall: rm -f $(DESTDIR)$(BINDIR)/lz4cat - [ -x $(DESTDIR)$(BINDIR)/lz4 ] && rm -f $(DESTDIR)$(BINDIR)/lz4 - [ -x $(DESTDIR)$(BINDIR)/lz4c ] && rm -f $(DESTDIR)$(BINDIR)/lz4c + [ -x $(DESTDIR)$(BINDIR)/lz4$(EXT) ] && rm -f $(DESTDIR)$(BINDIR)/lz4$(EXT) + [ -x $(DESTDIR)$(BINDIR)/lz4c$(EXT) ] && rm -f $(DESTDIR)$(BINDIR)/lz4c$(EXT) [ -f $(DESTDIR)$(MANDIR)/lz4.1 ] && rm -f $(DESTDIR)$(MANDIR)/lz4.1 [ -f $(DESTDIR)$(MANDIR)/lz4c.1 ] && rm -f $(DESTDIR)$(MANDIR)/lz4c.1 [ -f $(DESTDIR)$(MANDIR)/lz4cat.1 ] && rm -f $(DESTDIR)$(MANDIR)/lz4cat.1 - @echo lz4 successfully uninstalled + @echo lz4 programs successfully uninstalled test: test-lz4 test-lz4c test-frametest test-fullbench test-fuzzer test-mem -- cgit v0.12 From 2d98faf238516ecd6588f9202f8ae546c0b798e4 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 27 Nov 2014 22:44:36 +0100 Subject: Improved decoding speed --- lz4.c | 181 ++++++++++++++++++++++----------------------------- lz4.h | 4 +- programs/fullbench.c | 20 +++++- 3 files changed, 98 insertions(+), 107 deletions(-) diff --git a/lz4.c b/lz4.c index 8e81cc2..b3f1b2d 100644 --- a/lz4.c +++ b/lz4.c @@ -56,7 +56,9 @@ CPU Feature Detection 
**************************************/ /* - * Unaligned memory access detection + * Automated efficient unaligned memory access detection + * Based on known hardware architectures + * This list will be updated thanks to Open Source community feedbacks */ #if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \ || defined(__ARM_FEATURE_UNALIGNED) \ @@ -146,6 +148,8 @@ /************************************** Reading and writing into memory **************************************/ +#define STEPSIZE sizeof(size_t) + static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } static unsigned LZ4_isLittleEndian(void) @@ -180,43 +184,44 @@ static void LZ4_writeLE16(void* memPtr, U16 value) } } -static U32 LZ4_readLE32(const void* memPtr) + +static U32 LZ4_read16(const void* memPtr) { - if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) - return *(U32*)memPtr; + if (LZ4_UNALIGNED_ACCESS) { - const BYTE* p = memPtr; - U32 result = (U32)((U32)p[0] + (p[1]<<8) + (p[2]<<16) + ((U32)p[3]<<24)); - return result; + return *(U16*)memPtr; + } + else + { + U16 val16; + memcpy(&val16, memPtr, 2); + return val16; } } -/* -static void LZ4_writeLE32(void* memPtr, U32 value) -{ - BYTE* p = memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - p[2] = (BYTE)(value>>16); - p[3] = (BYTE)(value>>24); -} -*/ - -static void LZ4_copy4(void* dstPtr, const void* srcPtr) +static U32 LZ4_read32(const void* memPtr) { if (LZ4_UNALIGNED_ACCESS) { - *(U32*)dstPtr = *(U32*)srcPtr; - return; + return *(U32*)memPtr; } else { - BYTE* d = dstPtr; - const BYTE* s = srcPtr; - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - d[3] = s[3]; + U32 val32; + memcpy(&val32, memPtr, 4); + return val32; + } +} + + +static U32 LZ4_readLE32(const void* memPtr) +{ + if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) + return *(U32*)memPtr; + { + const BYTE* p = memPtr; + U32 result = (U32)((U32)p[0] + (p[1]<<8) + (p[2]<<16) + ((U32)p[3]<<24)); + return result; } } @@ -232,20 +237,24 @@ static U64 LZ4_readLE64(const void* memPtr) } } -/* -static void LZ4_writeLE64(void* memPtr, U64 value) +static size_t LZ4_readLE_ARCH(const void* p) +{ + if (LZ4_64bits()) + return (size_t)LZ4_readLE64(p); + else + return (size_t)LZ4_readLE32(p); +} + + +static void LZ4_copy4(void* dstPtr, const void* srcPtr) { - BYTE* p = memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - p[2] = (BYTE)(value>>16); - p[3] = (BYTE)(value>>24); - p[4] = (BYTE)(value>>32); - p[5] = (BYTE)(value>>40); - p[6] = (BYTE)(value>>48); - p[7] = (BYTE)(value>>56); + if (LZ4_UNALIGNED_ACCESS) + { + *(U32*)dstPtr = *(U32*)srcPtr; + return; + } + memcpy(dstPtr, srcPtr, 4); } -*/ static void LZ4_copy8(void* dstPtr, const void* srcPtr) { @@ -258,57 +267,18 @@ static void LZ4_copy8(void* dstPtr, const void* srcPtr) ((U32*)dstPtr)[1] = ((U32*)srcPtr)[1]; return; } - else - { - BYTE* d = dstPtr; - const BYTE* s = srcPtr; - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - d[3] = s[3]; - d[4] = s[4]; - d[5] = s[5]; - d[6] = s[6]; - d[7] = s[7]; - } + memcpy(dstPtr, srcPtr, 8); } -#define STEPSIZE sizeof(size_t) - -static size_t LZ4_readLE_ARCH(const void* p) +/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) { - if (LZ4_64bits()) - return (size_t)LZ4_readLE64(p); - else - return (size_t)LZ4_readLE32(p); + BYTE* d = dstPtr; + const BYTE* s = srcPtr; + BYTE* e = dstEnd; + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d=e; */ -#else -# define LZ4_WILDCOPY64(d,s,e) { do { LZ4_copy8(d,s); 
d+=8; s+=8; } while (d=e; */ -# define LZ4_WILDCOPY32(d,s,e) { if (likely(e-d <= 8)) { LZ4_copy8(d,s); d+=8; s+=8; } else do { LZ4_copy8(d,s); d+=8; s+=8; } while (d> ((MINMATCH*8)-LZ4_HASHLOG)); } -static U32 LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(LZ4_readLE32(p), tableType); } +static U32 LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(LZ4_read32(p), tableType); } static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) { @@ -444,20 +414,20 @@ static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t t return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); } -static unsigned LZ4_count(const BYTE* pIn, const BYTE* pRef, const BYTE* pInLimit) +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) { const BYTE* const pStart = pIn; while (likely(pIn=lowRefLimit) : 1) && (ref+MAX_DISTANCE>=ip) - && (LZ4_readLE32(ref+refDelta)==LZ4_readLE32(ip)) ) + && (LZ4_read32(ref+refDelta)==LZ4_read32(ip)) ) { token=op++; *token=0; goto _next_match; } /* Prepare next loop */ @@ -699,7 +669,7 @@ int LZ4_compress(const char* source, char* dest, int inputSize) int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { #if (HEAPMODE) - void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U64, 4); /* Aligned on 8-bytes boundaries */ + void* ctx = ALLOCATOR(LZ4_STREAMSIZE_U64, 8); /* Aligned on 8-bytes boundaries */ #else U64 ctx[LZ4_STREAMSIZE_U64] = {0}; /* Ensure data is aligned on 8-bytes boundaries */ #endif @@ -989,7 +959,9 @@ FORCE_INLINE int LZ4_decompress_generic( op += length; break; /* Necessarily EOF, due to parsing restrictions */ } - LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + //LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; /* get offset */ match = cpy - LZ4_readLE16(ip); ip+=2; @@ -1060,11 +1032,14 @@ FORCE_INLINE int LZ4_decompress_generic( if (unlikely(cpy>oend-12)) { - if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last 5 bytes must be literals */ - if (op oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ + if (op < oend-8) LZ4_wildCopy(op, match, oend-8); + match += oend-8 - op; + op = oend-8; + while (op chunkSize) { chunkP[i].origSize = chunkSize; remaining -= chunkSize; } else { chunkP[i].origSize = (int)remaining; remaining = 0; } + chunkP[i].compressedBuffer = out; out += maxCompressedChunkSize; + chunkP[i].compressedSize = 0; + } + } for (chunkNb=0; chunkNb Date: Fri, 28 Nov 2014 02:48:21 +0100 Subject: Fixed decompression bug --- lz4.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lz4.c b/lz4.c index b3f1b2d..2a6f038 100644 --- a/lz4.c +++ b/lz4.c @@ -1033,9 +1033,12 @@ FORCE_INLINE int LZ4_decompress_generic( if (unlikely(cpy>oend-12)) { if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ - if (op < oend-8) LZ4_wildCopy(op, match, oend-8); - match += oend-8 - op; - op = oend-8; + if (op < oend-8) + { + LZ4_wildCopy(op, match, oend-8); + match += oend-8 - op; + op = oend-8; + } while (op Date: Sat, 29 Nov 2014 16:41:28 +0100 Subject: Fixed : decompression issue on 32-bits CPU without unaligned memory access --- examples/Makefile | 2 +- lz4.c | 36 +++++++++++++++++++++++++++--------- lz4.h | 29 ++++++++--------------------- programs/lz4cli.c | 15 +-------------- 4 
files changed, 37 insertions(+), 45 deletions(-) diff --git a/examples/Makefile b/examples/Makefile index 4474f59..df24ea9 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -32,7 +32,7 @@ CC := $(CC) CFLAGS ?= -O3 -CFLAGS += -std=c99 -Wall -Wextra -Wundef -Wshadow -Wstrict-prototypes -Wno-missing-braces # Wno-missing-braces required due to GCC <4.8.3 bug +CFLAGS += -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Wstrict-prototypes -Wno-missing-braces # Wno-missing-braces required due to GCC <4.8.3 bug FLAGS = -I.. $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) TESTFILE= Makefile diff --git a/lz4.c b/lz4.c index 2a6f038..f2a8120 100644 --- a/lz4.c +++ b/lz4.c @@ -44,10 +44,26 @@ /* * CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS : - * You can force the code to use unaligned memory access, should you know your CPU can handle it efficiently. - * If it effectively results in better speed (up to 50% improvement can be expected) + * By default, the source code expects the compiler to correctly optimize + * 4-bytes and 8-bytes read on architectures able to handle it efficiently. + * This is not always the case. In some circumstances (ARM notably), + * the compiler will issue cautious code even when target is able to correctly handle unaligned memory accesses. + * + * You can force the compiler to use unaligned memory access by uncommenting the line below. + * One of the below scenarios will happen : + * 1 - Your target CPU correctly handle unaligned access, and was not well optimized by compiler (good case). + * You will witness large performance improvements (+50% and up). + * Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c) + * The goal is to automatically detect such situations by adding your target CPU within an exception list. + * 2 - Your target CPU correctly handle unaligned access, and was already correctly optimized by compiler + * No change will be experienced. + * 3 - Your target CPU inefficiently handle unaligned access. + * You will experience a performance loss. Comment back the line. + * 4 - Your target CPU does not handle unaligned access. + * Program will crash. + * If it effectively results in better speed (case 1) * please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c) - * so that an automatic detection macro can be added to mainline. + * so that an automatic detection macro can be added for future versions of the library. 
*/ /* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */ @@ -58,7 +74,7 @@ /* * Automated efficient unaligned memory access detection * Based on known hardware architectures - * This list will be updated thanks to Open Source community feedbacks + * This list will be updated thanks to feedbacks */ #if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \ || defined(__ARM_FEATURE_UNALIGNED) \ @@ -71,7 +87,10 @@ # define LZ4_UNALIGNED_ACCESS 0 #endif -/* Define this parameter if your target system or compiler does not support hardware bit count */ +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ #if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ # define LZ4_FORCE_SW_BITCOUNT #endif @@ -88,7 +107,7 @@ #ifdef _MSC_VER /* Visual Studio */ # define FORCE_INLINE static __forceinline -# include /* For Visual 2005 */ +# include # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #else # ifdef __GNUC__ @@ -961,7 +980,6 @@ FORCE_INLINE int LZ4_decompress_generic( } LZ4_wildCopy(op, ip, cpy); ip += length; op = cpy; - //LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; /* get offset */ match = cpy - LZ4_readLE16(ip); ip+=2; @@ -1018,7 +1036,7 @@ FORCE_INLINE int LZ4_decompress_generic( /* copy repeated sequence */ cpy = op + length; - if (unlikely((op-match)<(int)STEPSIZE)) + if (unlikely((op-match)<8)) { const size_t dec64 = dec64table[op-match]; op[0] = match[0]; @@ -1036,7 +1054,7 @@ FORCE_INLINE int LZ4_decompress_generic( if (op < oend-8) { LZ4_wildCopy(op, match, oend-8); - match += oend-8 - op; + match += (oend-8) - op; op = oend-8; } while (op on unix @@ -128,15 +124,6 @@ //************************************** -// Architecture Macros -//************************************** -static const int one = 1; -#define CPU_LITTLE_ENDIAN (*(char*)(&one)) -#define CPU_BIG_ENDIAN (!CPU_LITTLE_ENDIAN) -#define LITTLE_ENDIAN_32(i) (CPU_LITTLE_ENDIAN?(i):swap32(i)) - - -//************************************** // Macros //************************************** #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) @@ -462,7 +449,7 @@ int main(int argc, char** argv) } DISPLAYLEVEL(3, WELCOME_MESSAGE); - DISPLAYLEVEL(4, "Blocks size : %i KB\n", blockSize>>10); + if (!decode) DISPLAYLEVEL(4, "Blocks size : %i KB\n", blockSize>>10); // No input filename ==> use stdin if(!input_filename) { input_filename=stdinmark; } -- cgit v0.12 From 33dca250ee0afa3aec58bf3bf1a34b09bebf3fb5 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 29 Nov 2014 17:12:26 +0100 Subject: minor : fixed warning under clang --- lz4hc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lz4hc.c b/lz4hc.c index 8f6f25b..3dbdf1d 100644 --- a/lz4hc.c +++ b/lz4hc.c @@ -50,7 +50,7 @@ Memory routines /************************************** -CPU Feature Detection + CPU Feature Detection **************************************/ /* 32 or 64 bits ? 
*/ #if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ @@ -102,7 +102,7 @@ CPU Feature Detection /************************************** -Compiler Options + Compiler Options **************************************/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ /* "restrict" is a known keyword */ @@ -138,14 +138,14 @@ Compiler Options /************************************** -Includes + Includes **************************************/ -#include "lz4hc.h" #include "lz4.h" +#include "lz4hc.h" /************************************** -Basic Types + Basic Types **************************************/ #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ # include @@ -248,9 +248,12 @@ Architecture-specific macros **************************************/ typedef struct { - U32 hashTable[HASHTABLESIZE]; + union { + U64 alignedOn8Bytes; /* force 8-bytes alignment on 32-bits systems */ + U32 hashTable[HASHTABLESIZE]; + }; U16 chainTable[MAXD]; - const BYTE* end; /* next block here to keep current prefix as prefix */ + const BYTE* end; /* next block here to continue on current prefix */ const BYTE* base; /* All index relative to this position */ const BYTE* dictBase; /* alternate base for extDict */ U32 dictLimit; /* below that point, need extDict */ -- cgit v0.12 From 6658c49a971e0363bfa17de418b4512bfe12f0c5 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 29 Nov 2014 17:44:33 +0100 Subject: Improved compression speed on big endian CPU --- lz4.c | 124 +++++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 73 insertions(+), 51 deletions(-) diff --git a/lz4.c b/lz4.c index f2a8120..fbe73bd 100644 --- a/lz4.c +++ b/lz4.c @@ -55,15 +55,15 @@ * You will witness large performance improvements (+50% and up). * Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c) * The goal is to automatically detect such situations by adding your target CPU within an exception list. - * 2 - Your target CPU correctly handle unaligned access, and was already correctly optimized by compiler + * 2 - Your target CPU correctly handle unaligned access, and was already already optimized by compiler * No change will be experienced. * 3 - Your target CPU inefficiently handle unaligned access. * You will experience a performance loss. Comment back the line. * 4 - Your target CPU does not handle unaligned access. * Program will crash. - * If it effectively results in better speed (case 1) + * If uncommenting results in better performance (case 1) * please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c) - * so that an automatic detection macro can be added for future versions of the library. + * An automatic detection macro will be added to match your case within future versions of the library. 
*/ /* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */ @@ -177,6 +177,7 @@ static unsigned LZ4_isLittleEndian(void) return one.c[0]; } + static U16 LZ4_readLE16(const void* memPtr) { if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) @@ -204,12 +205,10 @@ static void LZ4_writeLE16(void* memPtr, U16 value) } -static U32 LZ4_read16(const void* memPtr) +static U16 LZ4_read16(const void* memPtr) { if (LZ4_UNALIGNED_ACCESS) - { return *(U16*)memPtr; - } else { U16 val16; @@ -221,9 +220,7 @@ static U32 LZ4_read16(const void* memPtr) static U32 LZ4_read32(const void* memPtr) { if (LZ4_UNALIGNED_ACCESS) - { return *(U32*)memPtr; - } else { U32 val32; @@ -232,36 +229,24 @@ static U32 LZ4_read32(const void* memPtr) } } - -static U32 LZ4_readLE32(const void* memPtr) -{ - if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) - return *(U32*)memPtr; - { - const BYTE* p = memPtr; - U32 result = (U32)((U32)p[0] + (p[1]<<8) + (p[2]<<16) + ((U32)p[3]<<24)); - return result; - } -} - -static U64 LZ4_readLE64(const void* memPtr) +static U64 LZ4_read64(const void* memPtr) { - if ((LZ4_UNALIGNED_ACCESS) && (LZ4_isLittleEndian())) + if (LZ4_UNALIGNED_ACCESS) return *(U64*)memPtr; else { - const BYTE* p = memPtr; - return (U64)((U64)p[0] + (p[1]<<8) + (p[2]<<16) + ((U64)p[3]<<24) + - (((U64)p[4])<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56)); + U64 val64; + memcpy(&val64, memPtr, 8); + return val64; } } -static size_t LZ4_readLE_ARCH(const void* p) +static size_t LZ4_read_ARCH(const void* p) { if (LZ4_64bits()) - return (size_t)LZ4_readLE64(p); + return (size_t)LZ4_read64(p); else - return (size_t)LZ4_readLE32(p); + return (size_t)LZ4_read32(p); } @@ -365,31 +350,68 @@ int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } ********************************/ static unsigned LZ4_NbCommonBytes (register size_t val) { - if (LZ4_64bits()) + if (LZ4_isLittleEndian()) { -# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 
1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } } - /* 32 bits */ + else /* Big Endian CPU */ { -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward( &r, (U32)val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } } @@ -439,7 +461,7 @@ static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLi while (likely(pIn Date: Sat, 29 Nov 2014 20:19:39 +0100 Subject: Updated lz4hc : re-use most shared elements from lz4 (endianess / align / bus detection routines) --- lz4.c | 131 ++++++++++++--------- lz4hc.c | 312 ++++++--------------------------------------------- programs/frametest.c | 2 +- 3 files changed, 108 insertions(+), 337 deletions(-) diff --git a/lz4.c b/lz4.c index fbe73bd..9439b36 100644 --- a/lz4.c +++ b/lz4.c @@ -285,12 +285,8 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) /************************************** - Constants + Common Constants **************************************/ -#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) -#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) -#define HASH_SIZE_U32 (1 << LZ4_HASHLOG) - #define MINMATCH 4 #define COPYLENGTH 8 @@ -298,13 +294,10 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) #define MFLIMIT (COPYLENGTH+MINMATCH) static const int LZ4_minLength = (MFLIMIT+1); -#define KB *(1U<<10) -#define MB *(1U<<20) +#define KB *(1 <<10) +#define MB *(1 <<20) #define GB *(1U<<30) -#define LZ4_64KLIMIT ((64 KB) + (MFLIMIT-1)) -#define SKIPSTRENGTH 6 /* Increasing this value will make the compression run slower on incompressible data */ - #define MAXD_LOG 16 #define MAX_DISTANCE ((1 << MAXD_LOG) - 1) @@ -315,38 +308,13 @@ static const int LZ4_minLength = (MFLIMIT+1); /************************************** - Structures and local types -**************************************/ -typedef struct { - U32 hashTable[HASH_SIZE_U32]; - U32 currentOffset; - U32 initCheck; - const BYTE* dictionary; - const BYTE* bufferStart; - U32 dictSize; -} LZ4_stream_t_internal; - -typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; -typedef enum { byPtr, byU32, byU16 } tableType_t; - -typedef enum { noDict = 0, withPrefix64k, 
Date: Sat, 29 Nov 2014 20:19:39 +0100
Subject: Updated lz4hc : re-use most shared elements from lz4 (endianess / align / bus detection routines)

---
 lz4.c                | 131 ++++++++++++---------
 lz4hc.c              | 312 ++++++---------------------------------------------
 programs/frametest.c |   2 +-
 3 files changed, 108 insertions(+), 337 deletions(-)

diff --git a/lz4.c b/lz4.c
index fbe73bd..9439b36 100644
--- a/lz4.c
+++ b/lz4.c
@@ -285,12 +285,8 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 
 /**************************************
-   Constants
+   Common Constants
 **************************************/
-#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
-#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
-#define HASH_SIZE_U32 (1 << LZ4_HASHLOG)
-
 #define MINMATCH 4
 
 #define COPYLENGTH 8
@@ -298,13 +294,10 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 #define MFLIMIT (COPYLENGTH+MINMATCH)
 static const int LZ4_minLength = (MFLIMIT+1);
 
-#define KB *(1U<<10)
-#define MB *(1U<<20)
+#define KB *(1 <<10)
+#define MB *(1 <<20)
 #define GB *(1U<<30)
 
-#define LZ4_64KLIMIT ((64 KB) + (MFLIMIT-1))
-#define SKIPSTRENGTH 6   /* Increasing this value will make the compression run slower on incompressible data */
-
 #define MAXD_LOG 16
 #define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
 
@@ -315,38 +308,13 @@ static const int LZ4_minLength = (MFLIMIT+1);
 
 /**************************************
-   Structures and local types
-**************************************/
-typedef struct {
-    U32 hashTable[HASH_SIZE_U32];
-    U32 currentOffset;
-    U32 initCheck;
-    const BYTE* dictionary;
-    const BYTE* bufferStart;
-    U32 dictSize;
-} LZ4_stream_t_internal;
-
-typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive;
-typedef enum { byPtr, byU32, byU16 } tableType_t;
-
-typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
-typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
-
-typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
-typedef enum { full = 0, partial = 1 } earlyEnd_directive;
-
-
-/**************************************
-   Utils
+   Common Utils
 **************************************/
 #define LZ4_STATIC_ASSERT(c)    { enum { LZ4_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 
-int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
-int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
-
 
 /********************************
-   Compression functions
+   Common functions
 ********************************/
 static unsigned LZ4_NbCommonBytes (register size_t val)
 {
@@ -415,6 +383,73 @@ static unsigned LZ4_NbCommonBytes (register size_t val)
     }
 }
 
+static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
+{
+    const BYTE* const pStart = pIn;
+
+    while (likely(pIn
 /* calloc, free */
-#define ALLOCATOR(s)  calloc(1,s)
-#define FREEMEM       free
-#include      /* memset, memcpy */
-#define MEM_INIT      memset
-
-
-/**************************************
-   CPU Feature Detection
-**************************************/
-/* 32 or 64 bits ? */
-#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
-  || defined(__64BIT__) || defined(__mips64) \
-  || defined(__powerpc64__) || defined(__powerpc64le__) \
-  || defined(__ppc64__) || defined(__ppc64le__) \
-  || defined(__PPC64__) || defined(__PPC64LE__) \
-  || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) \
-  || defined(__s390x__) )   /* Detects 64 bits mode */
-#  define LZ4_ARCH64 1
-#else
-#  define LZ4_ARCH64 0
-#endif
-
-/*
-* Little Endian or Big Endian ?
-* Overwrite the #define below if you know your architecture endianess
-*/
-#include    /* Apparently required to detect endianess */
-#if defined (__GLIBC__)
-#  include
-#  if (__BYTE_ORDER == __BIG_ENDIAN)
-#    define LZ4_BIG_ENDIAN 1
-#  endif
-#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
-#  define LZ4_BIG_ENDIAN 1
-#elif defined(__sparc) || defined(__sparc__) \
-  || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
-  || defined(__hpux) || defined(__hppa) \
-  || defined(_MIPSEB) || defined(__s390__)
-#  define LZ4_BIG_ENDIAN 1
-#else
-/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */
-#endif
-
-/*
-* Unaligned memory access is automatically enabled for "common" CPU, such as x86.
-* For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected
-* If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance
-*/
-#if defined(__ARM_FEATURE_UNALIGNED)
-#  define LZ4_FORCE_UNALIGNED_ACCESS 1
-#endif
-
-/* Define this parameter if your target system or compiler does not support hardware bit count */
-#if defined(_MSC_VER) && defined(_WIN32_WCE)   /* Visual Studio for Windows CE does not support Hardware bit count */
-#  define LZ4_FORCE_SW_BITCOUNT
-#endif
-
-
-/**************************************
-   Compiler Options
-**************************************/
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
-/* "restrict" is a known keyword */
-#else
-#  define restrict  /* Disable restrict */
-#endif
-
-#ifdef _MSC_VER    /* Visual Studio */
-#  define FORCE_INLINE static __forceinline
-#  include                    /* For Visual 2005 */
-#  if LZ4_ARCH64    /* 64-bits */
-#    pragma intrinsic(_BitScanForward64)   /* For Visual 2005 */
-#    pragma intrinsic(_BitScanReverse64)   /* For Visual 2005 */
-#  else             /* 32-bits */
-#    pragma intrinsic(_BitScanForward)     /* For Visual 2005 */
-#    pragma intrinsic(_BitScanReverse)     /* For Visual 2005 */
-#  endif
-#  pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
-#  pragma warning(disable : 4701)   /* disable: C4701: potentially uninitialized local variable used */
-#else
-#  ifdef __GNUC__
-#    define FORCE_INLINE static inline __attribute__((always_inline))
-#  else
-#    define FORCE_INLINE static inline
-#  endif
-#endif
-
-#ifdef _MSC_VER   /* Visual Studio */
-#  define lz4_bswap16(x) _byteswap_ushort(x)
-#else
-#  define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)))
-#endif
-
-
 
 /**************************************
    Includes
 **************************************/
 #include "lz4.h"
@@ -145,106 +47,40 @@ Memory routines
 
 /**************************************
-   Basic Types
+   Local Compiler Options
 **************************************/
-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
-# include
-typedef  uint8_t BYTE;
-typedef uint16_t U16;
-typedef uint32_t U32;
-typedef  int32_t S32;
-typedef uint64_t U64;
-#else
-typedef unsigned char       BYTE;
-typedef unsigned short      U16;
-typedef unsigned int        U32;
-typedef   signed int        S32;
-typedef unsigned long long  U64;
-#endif
-
-#if defined(__GNUC__)  && !defined(LZ4_FORCE_UNALIGNED_ACCESS)
-#  define _PACKED __attribute__ ((packed))
-#else
-#  define _PACKED
+#if defined(__GNUC__)
+#  pragma GCC diagnostic ignored "-Wunused-function"
 #endif
-
-#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
-#  ifdef __IBMC__
-#    pragma pack(1)
-#  else
-#    pragma pack(push, 1)
-#  endif
+#if defined (__clang__)
+#  pragma clang diagnostic ignored "-Wunused-function"
 #endif
 
-typedef struct _U16_S { U16 v; } _PACKED U16_S;
-typedef struct _U32_S { U32 v; } _PACKED U32_S;
-typedef struct _U64_S { U64 v; } _PACKED U64_S;
-
-#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
-#  pragma pack(pop)
-#endif
-
-#define A64(x) (((U64_S *)(x))->v)
-#define A32(x) (((U32_S *)(x))->v)
-#define A16(x) (((U16_S *)(x))->v)
 
+/**************************************
+   Common LZ4 definition
+**************************************/
+#define LZ4_COMMONDEFS_ONLY
+#include "lz4.c"
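The two added lines just above are how lz4hc.c now re-uses the endianness, alignment and bit-scan helpers instead of keeping private copies: it defines a guard macro and then includes lz4.c directly, so only the shared static helpers are compiled into this translation unit. A simplified illustration of the idea (file names and guard layout below are hypothetical, assuming lz4.c brackets its non-shared part behind the same macro; the real macro in the patch is LZ4_COMMONDEFS_ONLY):

    /* common.c : shared helpers first, full implementation behind the guard */
    static unsigned isLittleEndian(void)
    {
        const union { unsigned u; unsigned char c[4]; } one = { 1 };
        return one.c[0];
    }

    #ifndef COMMONDEFS_ONLY
    /* ...the rest of the compressor would live here, compiled only when
       common.c is built as its own translation unit... */
    #endif

    /* a consumer .c file would then do:
         #define COMMONDEFS_ONLY
         #include "common.c"
       and pick up only the shared helpers above. */
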
 
 /**************************************
-   Constants
+   Local Constants
 **************************************/
-#define MINMATCH 4
-
 #define DICTIONARY_LOGSIZE 16
#define MAXD (1<> ((MINMATCH*8)-HASH_LOG))
 #define DELTANEXT(p)        chainTable[(size_t)(p) & MAXD_MASK]
 #define GETNEXT(p)          ((p) - (size_t)DELTANEXT(p))
 
-static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(A32(ptr)); }
-
-/**************************************
-Private functions
-**************************************/
-#if LZ4_ARCH64
+static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); }
 
-FORCE_INLINE int LZ4_NbCommonBytes (register U64 val)
-{
-#if defined(LZ4_BIG_ENDIAN)
-# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    unsigned long r = 0;
-    _BitScanReverse64( &r, val );
-    return (int)(r>>3);
-# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    return (__builtin_clzll(val) >> 3);
-# else
-    int r;
-    if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
-    if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-    r += (!val);
-    return r;
-# endif
-#else
-# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    unsigned long r = 0;
-    _BitScanForward64( &r, val );
-    return (int)(r>>3);
-# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    return (__builtin_ctzll(val) >> 3);
-# else
-    static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
-    return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58];
-# endif
-#endif
-}
-
-#else
-
-FORCE_INLINE int LZ4_NbCommonBytes (register U32 val)
-{
-#if defined(LZ4_BIG_ENDIAN)
-# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    unsigned long r;
-    _BitScanReverse( &r, val );
-    return (int)(r>>3);
-# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    return (__builtin_clz(val) >> 3);
-# else
-    int r;
-    if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-    r += (!val);
-    return r;
-# endif
-#else
-# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    unsigned long r;
-    _BitScanForward( &r, val );
-    return (int)(r>>3);
-# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-    return (__builtin_ctz(val) >> 3);
-# else
-    static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
-    return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-# endif
-#endif
-}
-
-#endif
 
+/**************************************
+   HC Compression
+**************************************/
 static void LZ4HC_init (LZ4HC_Data_Structure* hc4, const BYTE* base)
 {
     MEM_INIT((void*)hc4->hashTable, 0, sizeof(hc4->hashTable));
@@ -397,24 +166,6 @@ static void LZ4HC_setExternalDict(LZ4HC_Data_Structure* ctxPtr, const BYTE* newB
 }
 
-static size_t LZ4HC_CommonLength (const BYTE* p1, const BYTE* p2, const BYTE* const p1Limit)
-{
-    const BYTE* const p1Start = p1;
-
-    while (p1 <= p1Limit - STEPSIZE)
-    {
-        size_t diff = AARCH(p2) ^ AARCH(p1);
-        if (!diff) { p1+=STEPSIZE; p2+=STEPSIZE; continue; }
-        p1 += LZ4_NbCommonBytes(diff);
-        return (p1 - p1Start);
-    }
-    if (LZ4_ARCH64) if ((p1<(p1Limit-3)) && (A32(p2) == A32(p1))) { p1+=4; p2+=4; }
-    if ((p1<(p1Limit-1)) && (A16(p2) == A16(p1))) { p1+=2; p2+=2; }
-    if ((p1
 ml) { ml = mlt; *matchpos = match; } }
         }
         else
         {
             match = dictBase + matchIndex;
-            if (LZ4_read32(match) == LZ4_read32(ip))
             {
                 size_t mlt;
                 const BYTE* vLimit = ip + (dictLimit - matchIndex);
                 if (vLimit > iLimit) vLimit = iLimit;
-                mlt = LZ4HC_CommonLength(ip+MINMATCH, match+MINMATCH, vLimit) + MINMATCH;
+                mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, vLimit) + MINMATCH;
                 if ((ip+mlt == vLimit) && (vLimit < iLimit))
-                    mlt += LZ4HC_CommonLength(ip+mlt, base+dictLimit, iLimit);
+                    mlt += LZ4_count(ip+mlt, base+dictLimit, iLimit);
                 if (mlt > ml) { ml = mlt; *matchpos = base + matchIndex; }   // virtual matchpos
             }
         }
@@ -502,11 +253,11 @@ FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch (
             {
                 match = base + matchIndex;
                 if (*(iLowLimit + longest) == *(match - delta + longest))
-                    if (A32(match) == A32(ip))
+                    if (LZ4_read32(match) == LZ4_read32(ip))
                     {
                         const BYTE* startt = ip;
                         const BYTE* tmpMatch = match;
-                        const BYTE* const matchEnd = ip + MINMATCH + LZ4HC_CommonLength(ip+MINMATCH, match+MINMATCH, iHighLimit);
+                        const BYTE* const matchEnd = ip + MINMATCH + LZ4_count(ip+MINMATCH, match+MINMATCH, iHighLimit);
 
                         while ((startt>iLowLimit) && (tmpMatch > iLowLimit) && (startt[-1] == tmpMatch[-1]))
                             {startt--; tmpMatch--;}
@@ -521,15 +272,15 @@ FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch (
             else
             {
                 match = dictBase + matchIndex;
-                if (A32(match) == A32(ip))
+                if (LZ4_read32(match) == LZ4_read32(ip))
                 {
                     size_t mlt;
                     int back=0;
                     const BYTE* vLimit = ip + (dictLimit - matchIndex);
                     if (vLimit > iHighLimit) vLimit = iHighLimit;
-                    mlt = LZ4HC_CommonLength(ip+MINMATCH, match+MINMATCH, vLimit) + MINMATCH;
+                    mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, vLimit) + MINMATCH;
                     if ((ip+mlt == vLimit) && (vLimit < iHighLimit))
-                        mlt += LZ4HC_CommonLength(ip+mlt, base+dictLimit, iHighLimit);
+                        mlt += LZ4_count(ip+mlt, base+dictLimit, iHighLimit);
                     while ((ip+back > iLowLimit) && (matchIndex+back > lowLimit) && (ip[back-1] == match[back-1])) back--;
                     mlt -= back;
                     if ((int)mlt > longest) { longest = (int)mlt; *matchpos = base + matchIndex + back; *startpos = ip+back; }
@@ -568,10 +319,11 @@ FORCE_INLINE int LZ4HC_encodeSequence (
     else *token = (BYTE)(length<
Date: Sun, 30 Nov 2014 12:58:00 +0100
Subject: Fixed : some minor Visual warnings

---
 lz4.c             | 2 +-
 lz4hc.c           | 4 ++++
 programs/Makefile | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/lz4.c b/lz4.c
index 9439b36..fb84955 100644
--- a/lz4.c
+++ b/lz4.c
@@ -369,7 +369,7 @@ static unsigned LZ4_NbCommonBytes (register size_t val)
         {
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
             unsigned long r = 0;
-            _BitScanReverse( &r, val );
+            _BitScanReverse( &r, (unsigned long)val );
             return (unsigned)(r>>3);
# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
             return (__builtin_clz(val) >> 3);
diff --git a/lz4hc.c b/lz4hc.c
index a798cab..89f0db0 100644
--- a/lz4hc.c
+++ b/lz4hc.c
@@ -57,6 +57,10 @@ You can contact the author at :
 #  pragma clang diagnostic ignored "-Wunused-function"
 #endif
 
+#if defined(_MSC_VER)    /* Visual Studio */
+#  pragma warning(disable : 4201)   /* disable: C4201: unnamed struct/union */
+#endif
+
 
 /**************************************
    Common LZ4 definition
diff --git a/programs/Makefile b/programs/Makefile
index fcfb32c..8a3ed95 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -193,7 +193,7 @@ test-mem: lz4 datagen fuzzer frametest
 	./datagen -g16MB > tmp
 	valgrind --leak-check=yes ./lz4 -9 -B5D -f tmp /dev/null
 	./datagen -g256MB > tmp
-	valgrind --leak-check=yes ./lz4 -B4D -f tmp /dev/null
+	valgrind --leak-check=yes ./lz4 -B4D -f -vq tmp /dev/null
 	rm tmp
 	valgrind --leak-check=yes ./fuzzer -i50 -t0
 	valgrind --leak-check=yes ./frametest -i100
-- 
cgit v0.12
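A closing note on the common thread of these three patches: the compressed format stays little-endian on every host because multi-byte fields are written and read byte by byte, while plain loads are only used where the CPU and byte order allow it. A hypothetical sanity check of that round-trip property, not part of the patches (helper names chosen here for illustration, mirroring LZ4_writeLE16/LZ4_readLE16):

    #include <stdio.h>

    /* byte-oriented little-endian store/load: identical output on LE and BE hosts */
    static void writeLE16(unsigned char* p, unsigned short v)
    {
        p[0] = (unsigned char)(v & 0xFF);
        p[1] = (unsigned char)(v >> 8);
    }
    static unsigned short readLE16(const unsigned char* p)
    {
        return (unsigned short)(p[0] | (p[1] << 8));
    }

    int main(void)
    {
        unsigned char buf[2];
        writeLE16(buf, 0x1234);
        printf("bytes: %02X %02X, value: %04X\n",
               (unsigned)buf[0], (unsigned)buf[1], (unsigned)readLE16(buf));
        /* expected on any host: bytes: 34 12, value: 1234 */
        return 0;
    }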