author      yann.collet.73@gmail.com <yann.collet.73@gmail.com@650e7d94-2a16-8b24-b05c-7c0b3f6821cd>  2012-01-10 20:03:01 (GMT)
committer   yann.collet.73@gmail.com <yann.collet.73@gmail.com@650e7d94-2a16-8b24-b05c-7c0b3f6821cd>  2012-01-10 20:03:01 (GMT)
commit      99b3294fc2e5785f645f03c1bf04b84c73297d0c (patch)
tree        bf4fa8f18ea232e94a8823750a5c94283304c09e
parent      4fa4221c6e18cc2352fbaed7fa7970489ce8eb2c (diff)
download    lz4-99b3294fc2e5785f645f03c1bf04b84c73297d0c.zip
            lz4-99b3294fc2e5785f645f03c1bf04b84c73297d0c.tar.gz
            lz4-99b3294fc2e5785f645f03c1bf04b84c73297d0c.tar.bz2
Improved compression speed for big-endian CPU
git-svn-id: https://lz4.googlecode.com/svn/trunk@48 650e7d94-2a16-8b24-b05c-7c0b3f6821cd
-rw-r--r--  Makefile |   4
-rw-r--r--  lz4.c    | 185
2 files changed, 75 insertions, 114 deletions
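
The lz4.c changes below replace the byte-by-byte match counter previously used on big-endian targets with a dedicated LZ4_NbCommonBytes_BigEndian path built on a leading-zero count, and unify the 32-bit and 64-bit copy macros behind a single COPYSTEP. A minimal standalone sketch of the counting idea follows; the names nb_common_bytes and match_length are illustrative and not part of lz4.c:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Count matching leading bytes from a 32-bit XOR difference (diff != 0). */
static int nb_common_bytes(uint32_t diff)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    return __builtin_ctz(diff) >> 3;   /* first differing byte sits in the low bits */
#else
    return __builtin_clz(diff) >> 3;   /* first differing byte sits in the high bits */
#endif
}

/* Length of the common prefix of a and b (at most max), 4 bytes at a time. */
static size_t match_length(const unsigned char* a, const unsigned char* b, size_t max)
{
    size_t n = 0;
    while (n + 4 <= max)
    {
        uint32_t va, vb, diff;
        memcpy(&va, a + n, 4);
        memcpy(&vb, b + n, 4);
        diff = va ^ vb;
        if (diff) return n + nb_common_bytes(diff);
        n += 4;
    }
    while (n < max && a[n] == b[n]) n++;
    return n;
}

On a little-endian CPU the byte at the lowest address lands in the least significant bits of the loaded word, so a trailing-zero count locates the first mismatch; on a big-endian CPU it lands in the most significant bits, hence the leading-zero count.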
diff --git a/Makefile b/Makefile
index da77c9d..84da2c0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,10 @@
all: lz4demo64 lz4demo32
lz4demo64: lz4.c lz4.h bench.c lz4demo.c
- gcc -g -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo64.exe
+ gcc -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo64.exe
lz4demo32: lz4.c lz4.h bench.c lz4demo.c
- gcc -m32 -g -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo32.exe
+ gcc -m32 -O3 -I. -std=c99 -Wall -W -Wno-implicit-function-declaration lz4.c bench.c lz4demo.c -o lz4demo32.exe
clean:
rm -f core *.o lz4demo32.exe lz4demo64.exe
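
The Makefile hunk above only drops the -g debug flag from both demo builds. The lz4.c hunks below lean on "wild" copies: data moves in fixed COPYSTEP-sized words, and a copy loop may write a few bytes past the requested end, which is why margins such as COPYLENGTH and LASTLITERALS exist. A minimal sketch of that pattern, assuming an 8-byte step; wildcopy is an illustrative name mirroring the do-while semantics of the LZ4_WILDCOPY macro:

#include <string.h>

#define COPYSTEP 8   /* lz4.c uses 8 on 64-bit targets and 4 on 32-bit ones */

/* Copy at least (e - d) bytes from s to d, COPYSTEP bytes per iteration.
   May write up to COPYSTEP-1 bytes past e, so the caller must leave that
   much slack after the destination end. */
static void wildcopy(unsigned char* d, const unsigned char* s, const unsigned char* e)
{
    do { memcpy(d, s, COPYSTEP); d += COPYSTEP; s += COPYSTEP; } while (d < e);
}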
diff --git a/lz4.c b/lz4.c
index a5f34e6..5ad13ee 100644
--- a/lz4.c
+++ b/lz4.c
@@ -95,7 +95,6 @@
#define SKIPSTRENGTH 6
#define STACKLIMIT 13
#define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()).
-#define COPYTOKEN 4
#define COPYLENGTH 8
#define LASTLITERALS 5
#define MFLIMIT (COPYLENGTH+MINMATCH)
@@ -142,44 +141,47 @@ typedef struct _U16_S
//**************************************
-// Macros
-//**************************************
-#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-HASH_LOG))
-#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
-#define LZ4_COPYPACKET32(s,d) A32(d) = A32(s); d+=4; s+=4; A32(d) = A32(s); d+=4; s+=4;
-#define LZ4_COPYPACKET64(s,d) A64(d) = A64(s); d+=8; s+=8;
-#define LZ4_WILDCOPY32(s,d,e) do { LZ4_COPYPACKET32(s,d) } while (d<e);
-#define LZ4_WILDCOPY64(s,d,e) do { LZ4_COPYPACKET64(s,d) } while (d<e);
-
-
-//**************************************
// Architecture-specific macros
//**************************************
#if ARCH64 // 64-bit
-#define LZ4_WILDCOPY LZ4_WILDCOPY64
-#define LZ4_BLINDCOPY(s,d,l) { BYTE* e=d+l; LZ4_WILDCOPY(s,d,e); d=e; }
+#define COPYSTEP 8
+#define LZ4_COPYSTEP(s,d) A64(d) = A64(s); d+=8; s+=8;
+#define LZ4_COPYPACKET(s,d) LZ4_COPYSTEP(s,d)
#else // 32-bit
-#define LZ4_WILDCOPY LZ4_WILDCOPY32
-#define LZ4_BLINDCOPY(s,d,l) { BYTE* e=d+l; LZ4_WILDCOPY(s,d,e); d=e; }
+#define COPYSTEP 4
+#define LZ4_COPYSTEP(s,d) A32(d) = A32(s); d+=4; s+=4;
+#define LZ4_COPYPACKET(s,d) LZ4_COPYSTEP(s,d); LZ4_COPYSTEP(s,d);
#endif
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = s - A16(p); }
#define LZ4_WRITE_LITTLEENDIAN_16(p,v) { A16(p) = v; p+=2; }
+#define LZ4_NbCommonBytes LZ4_NbCommonBytes_LittleEndian
#else // Big Endian
#define LZ4_READ_LITTLEENDIAN_16(d,s,p) { int delta = p[0]; delta += p[1] << 8; d = s - delta; }
#define LZ4_WRITE_LITTLEENDIAN_16(p,v) { int delta = v; *p++ = delta; *op++ = delta>>8; }
+#define LZ4_NbCommonBytes LZ4_NbCommonBytes_BigEndian
#endif
+
+//**************************************
+// Macros
+//**************************************
+#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-HASH_LOG))
+#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d<e);
+#define LZ4_BLINDCOPY(s,d,l) { BYTE* e=d+l; LZ4_WILDCOPY(s,d,e); d=e; }
+
+
//****************************
// Private functions
//****************************
-inline static int LZ4_NbCommonBytes_LittleEndian( register U32 val )
+inline static int LZ4_NbCommonBytes_LittleEndian (register U32 val)
{
#if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT)
- unsigned long b = 0;
- _BitScanForward( &b, val );
- return (int)(b>>3);
+ unsigned long r = 0;
+ _BitScanForward( &r, val );
+ return (int)(r>>3);
#elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT)
return (__builtin_ctz(val) >> 3);
#else
@@ -188,6 +190,22 @@ inline static int LZ4_NbCommonBytes_LittleEndian( register U32 val )
#endif
}
+inline static int LZ4_NbCommonBytes_BigEndian (register U32 val)
+{
+ #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse( &r, val );
+ return (int)(r>>3);
+ #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+ #else
+ int r;
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+ r += (!val);
+ return r;
+ #endif
+}
+
//******************************
// Public Compression functions
@@ -281,15 +299,9 @@ _next_match:
anchor = ip;
while (ip<matchlimit-3)
{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
U32 diff = A32(ref) ^ A32(ip);
if (!diff) { ip+=4; ref+=4; continue; }
- ip += LZ4_NbCommonBytes_LittleEndian(diff);
-#else
- if (A32(ref) == A32(ip)) { ip+=4; ref+=4; continue; }
- if (A16(ref) == A16(ip)) { ip+=2; ref+=2; }
- if (*ref == *ip) ip++;
-#endif
+ ip += LZ4_NbCommonBytes(diff);
goto _endCount;
}
if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }
@@ -427,15 +439,9 @@ _next_match:
anchor = ip;
while (ip<matchlimit-3)
{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
U32 diff = A32(ref) ^ A32(ip);
if (!diff) { ip+=4; ref+=4; continue; }
- ip += LZ4_NbCommonBytes_LittleEndian(diff);
-#else
- if (A32(ref) == A32(ip)) { ip+=4; ref+=4; continue; }
- if (A16(ref) == A16(ip)) { ip+=2; ref+=2; }
- if (*ref == *ip) ip++;
-#endif
+ ip += LZ4_NbCommonBytes(diff);
goto _endCount;
}
if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }
@@ -501,14 +507,13 @@ int LZ4_compress(char* source,
//****************************
-// Decompression CODE
+// Decompression functions
//****************************
// Note : The decoding functions LZ4_uncompress() and LZ4_uncompress_unknownOutputSize()
// are safe against "buffer overflow" attack type.
-// They will *never* write nor read outside of the provided input and output buffer :
-// they both check this condition *before* writing anything.
-// A corrupted packet will trigger an error result, a negative int, indicating the position of the error within input stream.
+// They will never write nor read outside of the provided input and output buffers.
+// A corrupted input will produce an error result, a negative int, indicating the position of the error within input stream.
int LZ4_uncompress(char* source,
char* dest,
@@ -525,10 +530,7 @@ int LZ4_uncompress(char* source,
BYTE token;
int len, length;
- size_t dec1[] ={0, 3, 2, 3, 0, 0, 0, 0};
-#if ARCH64
- size_t dec2[]={0, 4, 4, 3, 4, 5, 6, 7};
-#endif
+ size_t dec[] ={0, 3, 2, 3, 0, 0, 0, 0};
// Main Loop
@@ -557,53 +559,34 @@ int LZ4_uncompress(char* source,
if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; }
// copy repeated sequence
-#if ARCH64
- if (op-ref<8)
+ if (op-ref<COPYSTEP)
{
- int tmp = dec2[op-ref];
- *op++ = *ref++;
- *op++ = *ref++;
- *op++ = *ref++;
- *op++ = *ref++;
- ref -= dec1[op-ref];
- A32(op)=A32(ref); op += 4; ref += 4;
- ref -= tmp;
- } else { LZ4_COPYPACKET64(ref,op); }
- cpy = op + length - 4;
- if (cpy > oend-COPYLENGTH)
- {
- if (cpy > oend) goto _output_error;
- LZ4_WILDCOPY64(ref, op, (oend-COPYLENGTH));
- while(op<cpy) *op++=*ref++;
- op=cpy;
- if (op == oend) break; // Check EOF (should never happen, since last 5 bytes are supposed to be literals)
- continue;
- }
- LZ4_WILDCOPY64(ref, op, cpy);
- op=cpy; // correction
+#if ARCH64
+ size_t dec2table[]={0, 4, 4, 3, 4, 5, 6, 7};
+ size_t dec2 = dec2table[op-ref];
#else
- if (op-ref<COPYTOKEN)
- {
+ int dec2 = 0;
+#endif
*op++ = *ref++;
*op++ = *ref++;
*op++ = *ref++;
*op++ = *ref++;
- ref -= dec1[op-ref];
- A32(op)=A32(ref);
- } else { A32(op)=A32(ref); op+=4; ref+=4; }
- cpy = op + length;
+ ref -= dec[op-ref];
+ A32(op)=A32(ref); op += COPYSTEP-4; ref += COPYSTEP-4;
+ ref -= dec2;
+ } else { LZ4_COPYSTEP(ref,op); }
+ cpy = op + length - (COPYSTEP-4);
if (cpy>oend-COPYLENGTH)
{
if (cpy > oend) goto _output_error;
- LZ4_WILDCOPY32(ref, op, (oend-COPYLENGTH));
+ LZ4_WILDCOPY(ref, op, (oend-COPYLENGTH));
while(op<cpy) *op++=*ref++;
op=cpy;
if (op == oend) break; // Check EOF (should never happen, since last 5 bytes are supposed to be literals)
continue;
}
- LZ4_WILDCOPY32(ref, op, cpy);
+ LZ4_WILDCOPY(ref, op, cpy);
op=cpy; // correction
-#endif
}
// end of decoding
@@ -633,10 +616,7 @@ int LZ4_uncompress_unknownOutputSize(
BYTE token;
int len, length;
- size_t dec1[] ={0, 3, 2, 3, 0, 0, 0, 0};
-#if ARCH64
- size_t dec2[]={0, 4, 4, 3, 4, 5, 6, 7};
-#endif
+ size_t dec[] ={0, 3, 2, 3, 0, 0, 0, 0};
// Main Loop
@@ -660,59 +640,40 @@ int LZ4_uncompress_unknownOutputSize(
// get offset
LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
- if (ref < (BYTE* const)dest) goto _output_error;
+ if (ref < (BYTE* const)dest) goto _output_error;
+
// get matchlength
if ((length=(token&ML_MASK)) == ML_MASK) { for (;(len=*ip++)==255;length+=255){} length += len; }
// copy repeated sequence
-#if ARCH64
- if (op-ref<8)
+ if (op-ref<COPYSTEP)
{
- int tmp = dec2[op-ref];
- *op++ = *ref++;
- *op++ = *ref++;
- *op++ = *ref++;
- *op++ = *ref++;
- ref -= dec1[op-ref];
- A32(op)=A32(ref); op += 4; ref += 4;
- ref -= tmp;
- } else { LZ4_COPYPACKET64(ref,op); }
- cpy = op + length - 4;
- if (cpy > oend-COPYLENGTH)
- {
- if (cpy > oend) goto _output_error;
- LZ4_WILDCOPY64(ref, op, (oend-COPYLENGTH));
- while(op<cpy) *op++=*ref++;
- op=cpy;
- if (op == oend) break; // Check EOF (should never happen, since last 5 bytes are supposed to be literals)
- continue;
- }
- LZ4_WILDCOPY64(ref, op, cpy);
- op=cpy; // correction
+#if ARCH64
+ size_t dec2table[]={0, 4, 4, 3, 4, 5, 6, 7};
+ size_t dec2 = dec2table[op-ref];
#else
- if (op-ref<COPYTOKEN)
- {
+ int dec2 = 0;
+#endif
*op++ = *ref++;
*op++ = *ref++;
*op++ = *ref++;
*op++ = *ref++;
- ref -= dec1[op-ref];
- A32(op)=A32(ref);
- } else { A32(op)=A32(ref); op+=4; ref+=4; }
- cpy = op + length;
+ ref -= dec[op-ref];
+ A32(op)=A32(ref); op += COPYSTEP-4; ref += COPYSTEP-4;
+ ref -= dec2;
+ } else { LZ4_COPYSTEP(ref,op); }
+ cpy = op + length - (COPYSTEP-4);
if (cpy>oend-COPYLENGTH)
{
if (cpy > oend) goto _output_error;
- LZ4_WILDCOPY32(ref, op, (oend-COPYLENGTH));
+ LZ4_WILDCOPY(ref, op, (oend-COPYLENGTH));
while(op<cpy) *op++=*ref++;
op=cpy;
if (op == oend) break; // Check EOF (should never happen, since last 5 bytes are supposed to be literals)
continue;
}
- LZ4_WILDCOPY32(ref, op, cpy);
+ LZ4_WILDCOPY(ref, op, cpy);
op=cpy; // correction
-#endif
-
}
// end of decoding
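
In both decoders, the dec[] and dec2table[] adjustments handle matches whose offset is smaller than COPYSTEP, where source and destination overlap and a plain word copy would read bytes that have not been written yet. A simplified sketch of the underlying problem, not the table-based trick lz4.c actually uses; copy_match is an illustrative name:

#include <string.h>
#include <stddef.h>

/* Copy len bytes of a match that starts offset bytes behind op.
   With offset >= 8 the 8-byte chunks never overlap their source;
   with a smaller offset the regions overlap, so a byte loop keeps
   the repeating pattern correct. */
static void copy_match(unsigned char* op, size_t offset, size_t len)
{
    const unsigned char* ref = op - offset;
    if (offset >= 8)
    {
        while (len >= 8) { memcpy(op, ref, 8); op += 8; ref += 8; len -= 8; }
        if (len) memcpy(op, ref, len);
    }
    else
    {
        while (len--) *op++ = *ref++;   /* overlapping copy, one byte at a time */
    }
}

lz4.c keeps using word-sized copies even in the overlapping case: it copies the first four bytes individually, then steps ref back by dec[op-ref] (plus dec2 on 64-bit targets) so that subsequent wide copies still reproduce the repeating pattern.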