From ecbce64ac93a81784c23c0cb8fe9ae8ede0866f9 Mon Sep 17 00:00:00 2001 From: "yann.collet.73@gmail.com" Date: Mon, 20 Feb 2012 20:53:49 +0000 Subject: Corrected : default to Software Bit Count for GCC earlier than 3.4. Thanks to Gray. Corrected : ARM compilation under Visual Studio : now generates cautious code, like GCC, to ensure compatibility with strict-align CPUs. Thanks to Joe WoodBury. Minor : bench.c : Hash verification changed to MurmurHash3A Minor : bench.c : selectable block size git-svn-id: https://lz4.googlecode.com/svn/trunk@56 650e7d94-2a16-8b24-b05c-7c0b3f6821cd --- bench.c | 103 ++++++++++++++++++++++++++++++++++++++++++++------------------ bench.h | 3 ++ lz4.c | 78 +++++++++++++++++++++-------------------------- lz4demo.c | 17 ++++++----- 4 files changed, 120 insertions(+), 81 deletions(-) diff --git a/bench.c b/bench.c index 220df60..8d86bbc 100644 --- a/bench.c +++ b/bench.c @@ -33,6 +33,11 @@ #define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #endif +// GCC on does not support _rotl outside of Windows +#if defined(__GNUC__) +#define _rotl(x,r) ((x << r) | (x >> (32 - r))) +#endif + //************************************** // Includes @@ -72,8 +77,7 @@ #define KNUTH 2654435761U #define MAX_MEM (1984<<20) -#define CHUNKSIZE (8<<20) -#define MAX_NB_CHUNKS ((MAX_MEM / CHUNKSIZE) + 1) +#define DEFAULT_CHUNKSIZE (8<<20) //************************************** @@ -102,6 +106,17 @@ struct compressionParameters +//************************************** +// Private Parameters +//************************************** +static int chunkSize = DEFAULT_CHUNKSIZE; + +void BMK_SetBlocksize(int bsize) +{ + chunkSize = bsize; + DISPLAY("Using Block Size of %i KB... ", chunkSize>>10); +} + //********************************************************* // Private functions //********************************************************* @@ -128,28 +143,53 @@ static int BMK_GetMilliSpan( int nTimeStart ) } -static U32 BMK_checksum(char* buff, U32 length) +static U32 BMK_checksum_MMH3A (char* buff, U32 length) { - BYTE* p = (BYTE*)buff; - BYTE* bEnd = p + length; - BYTE* limit = bEnd - 3; - U32 idx = 1; - U32 crc = KNUTH; - - while (p> 2; + + U32 h1 = KNUTH; + U32 c1 = 0xcc9e2d51; + U32 c2 = 0x1b873593; + + const U32* blocks = (const U32*)(data + nblocks*4); + int i; + + for(i = -nblocks; i; i++) + { + U32 k1 = blocks[i]; + + k1 *= c1; + k1 = _rotl(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = _rotl(h1,13); + h1 = h1*5+0xe6546b64; + } + + { + const BYTE* tail = (const BYTE*)(data + nblocks*4); + U32 k1 = 0; + + switch(length & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = _rotl(k1,15); k1 *= c2; h1 ^= k1; + }; + } + + h1 ^= length; + h1 ^= h1 >> 16; + h1 *= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *= 0xc2b2ae35; + h1 ^= h1 >> 16; + + return h1; +} static size_t BMK_findMaxMem(U64 requiredMem) @@ -203,7 +243,7 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles) size_t readSize; char* in_buff; char* out_buff; int out_buff_size; - struct chunkParameters chunkP[MAX_NB_CHUNKS]; + struct chunkParameters* chunkP; U32 crcc, crcd; struct compressionParameters compP; @@ -239,12 +279,14 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles) } // Alloc + chunkP = (struct chunkParameters*) malloc(((benchedsize / chunkSize)+1) * sizeof(struct chunkParameters)); in_buff = malloc((size_t )benchedsize); - nbChunks = (benchedsize / CHUNKSIZE) + 1; - maxCChunkSize = LZ4_compressBound(CHUNKSIZE); + nbChunks = (benchedsize / chunkSize) + 1; + maxCChunkSize = LZ4_compressBound(chunkSize); out_buff_size = nbChunks * maxCChunkSize; out_buff = malloc((size_t )out_buff_size); + if(!in_buff || !out_buff) { DISPLAY("\nError: not enough memory!\n"); @@ -263,8 +305,8 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles) for (i=0; i CHUNKSIZE) { chunkP[i].inputSize = CHUNKSIZE; remaining -= CHUNKSIZE; } else { chunkP[i].inputSize = remaining; remaining = 0; } + chunkP[i].inputBuffer = in; in += chunkSize; + if ((int)remaining > chunkSize) { chunkP[i].inputSize = chunkSize; remaining -= chunkSize; } else { chunkP[i].inputSize = remaining; remaining = 0; } chunkP[i].outputBuffer = out; out += maxCChunkSize; chunkP[i].outputSize = 0; } @@ -284,7 +326,7 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles) } // Calculating input Checksum - crcc = BMK_checksum(in_buff, benchedsize); + crcc = BMK_checksum_MMH3A(in_buff, benchedsize); // Bench @@ -336,7 +378,7 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles) DISPLAY("%1i-%-14.14s : %9i -> %9i (%5.2f%%), %6.1f MB/s , %6.1f MB/s\r", loopNb, infilename, (int)benchedsize, (int)cSize, (double)cSize/(double)benchedsize*100., (double)benchedsize / fastestC / 1000., (double)benchedsize / fastestD / 1000.); // CRC Checking - crcd = BMK_checksum(in_buff, benchedsize); + crcd = BMK_checksum_MMH3A(in_buff, benchedsize); if (crcc!=crcd) { DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", infilename, (unsigned)crcc, (unsigned)crcd); break; } } @@ -349,6 +391,7 @@ int BMK_benchFile(char** fileNamesTable, int nbFiles) free(in_buff); free(out_buff); + free(chunkP); } if (nbFiles > 1) diff --git a/bench.h b/bench.h index 967caea..547a1bc 100644 --- a/bench.h +++ b/bench.h @@ -29,6 +29,9 @@ extern "C" { int BMK_benchFile(char** fileNamesTable, int nbFiles) ; +// Parameters +void BMK_SetBlocksize(int bsize); + #if defined (__cplusplus) diff --git a/lz4.c b/lz4.c index 2c3f1c4..a131a81 100644 --- a/lz4.c +++ b/lz4.c @@ -70,15 +70,15 @@ // Little Endian assumed. PDP Endian and other very rare endian format are unsupported. #endif -// Unaligned memory access ? -// This feature is automatically enabled for "common" CPU, such as x86. -// For others CPU, you may want to force this option manually to improve performance if your target CPU supports unaligned memory access +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected +// If you know your target CPU supports unaligned memory access, you may want to force this option manually to improve performance #if defined(__ARM_FEATURE_UNALIGNED) #define LZ4_FORCE_UNALIGNED_ACCESS 1 #endif -// Uncomment this parameter if your target system does not support hardware bit count -//#define _FORCE_SW_BITCOUNT +// Uncomment this parameter if your target system or compiler does not support hardware bit count +//#define LZ4_FORCE_SW_BITCOUNT @@ -95,12 +95,6 @@ #define inline __forceinline // Visual is not C99, but supports inline #endif -#if (defined(__GNUC__) && (!defined(LZ4_FORCE_UNALIGNED_ACCESS))) -#define _PACKED __attribute__ ((packed)) -#else -#define _PACKED -#endif - #ifdef _MSC_VER // Visual Studio #define bswap16(i) _byteswap_ushort(i) #else @@ -134,11 +128,32 @@ #define U64 uint64_t #endif +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack(push, 1) +#endif + +typedef struct _U16_S { U16 v; } U16_S; +typedef struct _U32_S { U32 v; } U32_S; +typedef struct _U64_S { U64 v; } U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack(pop) +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + //************************************** // Constants //************************************** #define MINMATCH 4 + +#define HASH_LOG COMPRESSIONLEVEL +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + #define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION>2?NOTCOMPRESSIBLE_CONFIRMATION:2) #define STACKLIMIT 13 #define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()). @@ -150,10 +165,6 @@ #define MAXD_LOG 16 #define MAX_DISTANCE ((1 << MAXD_LOG) - 1) -#define HASH_LOG COMPRESSIONLEVEL -#define HASHTABLESIZE (1 << HASH_LOG) -#define HASH_MASK (HASHTABLESIZE - 1) - #define ML_BITS 4 #define ML_MASK ((1U<v) -#define A32(x) (((U32_S *)(x))->v) -#define A16(x) (((U16_S *)(x))->v) - //************************************** // Macros @@ -237,11 +229,11 @@ typedef struct _U16_S inline static int LZ4_NbCommonBytes (register U64 val) { #if defined(LZ4_BIG_ENDIAN) - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse64( &r, val ); return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) + #elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clzll(val) >> 3); #else int r; @@ -251,11 +243,11 @@ inline static int LZ4_NbCommonBytes (register U64 val) return r; #endif #else - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward64( &r, val ); return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) + #elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll(val) >> 3); #else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; @@ -269,11 +261,11 @@ inline static int LZ4_NbCommonBytes (register U64 val) inline static int LZ4_NbCommonBytes (register U32 val) { #if defined(LZ4_BIG_ENDIAN) - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse( &r, val ); return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) + #elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clz(val) >> 3); #else int r; @@ -282,15 +274,15 @@ inline static int LZ4_NbCommonBytes (register U32 val) return r; #endif #else - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward( &r, val ); return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) + #elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctz(val) >> 3); #else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -val) * 0x077CB531U)) >> 27]; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; #endif #endif } diff --git a/lz4demo.c b/lz4demo.c index 056e597..3905cfe 100644 --- a/lz4demo.c +++ b/lz4demo.c @@ -298,7 +298,8 @@ int main(int argc, char** argv) int i, compression=1, // default action if no argument decode=0, - bench=0; + bench=0, + filenamesStart=2; char* input_filename=0; char* output_filename=0; #ifdef _WIN32 @@ -316,16 +317,13 @@ int main(int argc, char** argv) for(i=1; i Error if(!input_filename) { badusage(); return 1; } - if (bench) return BMK_benchFile(argv+2, argc-2); + if (bench) return BMK_benchFile(argv+filenamesStart, argc-filenamesStart); // No output filename if (!output_filename) { badusage(); return 1; } -- cgit v0.12