From 3dd34df75185f132238451ef4fdb68b83f6fb91a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 11 Aug 2020 14:03:20 -0700 Subject: added target lz4-wlib variant of lz4 linking to liblz4 dynamic library requires the dynamic library to expose static-only symbols (experimental API) Example for #888 --- .travis.yml | 4 +++- lib/README.md | 7 ++++--- lib/lz4frame.h | 6 +++--- ossfuzz/Makefile | 6 +++++- programs/.gitignore | 1 + programs/Makefile | 26 ++++++++++++++++++++------ 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index bd29630..f2da894 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,9 +33,11 @@ matrix: script: - CC=clang MOREFLAGS=-fsanitize=address make -C tests test-frametest test-fuzzer - - name: Custom LZ4_DISTANCE_MAX + - name: Custom LZ4_DISTANCE_MAX ; Build lz4-wlib (CLI linked to dynamic library) script: - MOREFLAGS=-DLZ4_DISTANCE_MAX=8000 make check + - make clean + - make -C programs lz4-wlib - name: (Precise) g++ and clang CMake test dist: precise diff --git a/lib/README.md b/lib/README.md index cba2c34..707d777 100644 --- a/lib/README.md +++ b/lib/README.md @@ -35,12 +35,13 @@ So it's necessary to include all `*.c` and `*.h` files present in `/lib`. Definitions which are not guaranteed to remain stable in future versions, are protected behind macros, such as `LZ4_STATIC_LINKING_ONLY`. -As the name implies, these definitions can only be invoked +As the name strongly implies, these definitions should only be invoked in the context of static linking ***only***. Otherwise, dependent application may fail on API or ABI break in the future. -The associated symbols are also not present in dynamic library by default. +The associated symbols are also not exposed by the dynamic library by default. Should they be nonetheless needed, it's possible to force their publication -by using build macro `LZ4_PUBLISH_STATIC_FUNCTIONS`. +by using build macros `LZ4_PUBLISH_STATIC_FUNCTIONS` +and `LZ4F_PUBLISH_STATIC_FUNCTIONS`. #### Build macros diff --git a/lib/lz4frame.h b/lib/lz4frame.h index 77d682b..c669aec 100644 --- a/lib/lz4frame.h +++ b/lib/lz4frame.h @@ -381,7 +381,7 @@ LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx); * note : Frame header size is variable, but is guaranteed to be * >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes. */ -size_t LZ4F_headerSize(const void* src, size_t srcSize); +LZ4FLIB_API size_t LZ4F_headerSize(const void* src, size_t srcSize); /*! LZ4F_getFrameInfo() : * This function extracts frame parameters (max blockSize, dictID, etc.). @@ -498,9 +498,9 @@ extern "C" { * Use at your own risk. */ #ifdef LZ4F_PUBLISH_STATIC_FUNCTIONS -#define LZ4FLIB_STATIC_API LZ4FLIB_API +# define LZ4FLIB_STATIC_API LZ4FLIB_API #else -#define LZ4FLIB_STATIC_API +# define LZ4FLIB_STATIC_API #endif diff --git a/ossfuzz/Makefile b/ossfuzz/Makefile index 7e043a1..f247405 100644 --- a/ossfuzz/Makefile +++ b/ossfuzz/Makefile @@ -47,6 +47,7 @@ FUZZERS := \ round_trip_frame_fuzzer \ decompress_frame_fuzzer +.PHONY: all all: $(FUZZERS) # Include a rule to build the static library if calling this target @@ -70,5 +71,8 @@ $(LZ4DIR)/liblz4.a: $(RM) $*_fuzzer $*_fuzzer.o standaloneengine.o .PHONY: clean -clean: compress_fuzzer_clean decompress_fuzzer_clean +clean: compress_fuzzer_clean decompress_fuzzer_clean \ + compress_frame_fuzzer_clean compress_hc_fuzzer_clean \ + decompress_frame_fuzzer_clean round_trip_frame_fuzzer_clean \ + round_trip_fuzzer_clean round_trip_hc_fuzzer_clean round_trip_stream_fuzzer_clean $(MAKE) -C $(LZ4DIR) clean diff --git a/programs/.gitignore b/programs/.gitignore index daa7f14..9ffadd9 100644 --- a/programs/.gitignore +++ b/programs/.gitignore @@ -4,6 +4,7 @@ unlz4 lz4cat lz4c lz4c32 +lz4-wlib datagen frametest frametest32 diff --git a/programs/Makefile b/programs/Makefile index 4994551..4256ae8 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -41,12 +41,13 @@ LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT)) LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT)) LIBVER := $(shell echo $(LIBVER_SCRIPT)) -SRCFILES := $(sort $(wildcard $(LZ4DIR)/*.c) $(wildcard *.c)) -OBJFILES := $(SRCFILES:.c=.o) +LIBFILES = $(wildcard $(LZ4DIR)/*.c) +SRCFILES = $(sort $(LIBFILES) $(wildcard *.c)) +OBJFILES = $(SRCFILES:.c=.o) CPPFLAGS += -I$(LZ4DIR) -DXXH_NAMESPACE=LZ4_ CFLAGS ?= -O3 -DEBUGFLAGS:=-Wall -Wextra -Wundef -Wcast-qual -Wcast-align -Wshadow \ +DEBUGFLAGS= -Wall -Wextra -Wundef -Wcast-qual -Wcast-align -Wshadow \ -Wswitch-enum -Wdeclaration-after-statement -Wstrict-prototypes \ -Wpointer-arith -Wstrict-aliasing=1 CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) @@ -82,13 +83,25 @@ lz4: $(OBJFILES) lz4-exe.o $(CC) $(FLAGS) $^ -o $@$(EXT) else lz4: $(OBJFILES) - $(CC) $(FLAGS) $^ -o $@$(EXT) + $(CC) $(FLAGS) $(OBJFILES) -o $@$(EXT) $(LDLIBS) endif - +.PHONY: lz4-release lz4-release: DEBUGFLAGS= lz4-release: lz4 +lz4-wlib: LIBFILES = +lz4-wlib: SRCFILES+= $(LZ4DIR)/xxhash.c # benchmark unit needs XXH64() +lz4-wlib: LDFLAGS += -L $(LZ4DIR) +lz4-wlib: LDLIBS = -llz4 +lz4-wlib: liblz4 $(OBJFILES) + @echo WARNING: $@ must link to an extended variant of the dynamic library which also exposes unstable symbols + $(CC) $(FLAGS) $(OBJFILES) -o $@$(EXT) $(LDLIBS) + +.PHONY:liblz4 +liblz4: + CPPFLAGS="-DLZ4F_PUBLISH_STATIC_FUNCTIONS -DLZ4_PUBLISH_STATIC_FUNCTIONS" $(MAKE) -C $(LZ4DIR) liblz4 + lz4c: lz4 $(LN_SF) lz4$(EXT) lz4c$(EXT) @@ -113,7 +126,8 @@ ifeq ($(WINBASED),yes) endif @$(MAKE) -C $(LZ4DIR) $@ > $(VOID) @$(RM) core *.o *.test tmp* \ - lz4$(EXT) lz4c$(EXT) lz4c32$(EXT) unlz4$(EXT) lz4cat$(EXT) + lz4$(EXT) lz4c$(EXT) lz4c32$(EXT) lz4-wlib$(EXT) \ + unlz4$(EXT) lz4cat$(EXT) @echo Cleaning completed -- cgit v0.12 From e9cfa49d0ddde238a7f3dbe6320f11e3d1fe58ea Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 12 Aug 2020 17:27:33 -0700 Subject: Clarifies and fix EndMark EndMark, the 4-bytes value indicating the end of frame, must be `0x00000000`. Previously, it was just mentioned as a `0-size` block. But such definition could encompass uncompressed blocks of size 0, with a header of value `0x80000000`. But the intention was to also support uncompressed empty blocks. They could be used as a keep-alive signal. Note that compressed empty blocks are already supported, it's just that they have a size 1 instead of 0 (for the `0` token). Unfortunately, the decoder implementation was also wrong, and would also interpret a `0x80000000` block header as an endMark. This issue evaded detection so far simply because this situation never happens, as LZ4Frame always issues a clean 0x00000000 value as a endMark. It also does not flush empty blocks. This is fixed in this PR. The decoder can now deal with empty uncompressed blocks, and do not confuse them with EndMark. The specification is also clarified. Finally, FrameTest is updated to randomly insert empty blocks during fuzzing. --- doc/lz4_Frame_format.md | 47 +++++++++++++++++++++++++++++------------------ lib/lz4frame.c | 8 +++++--- tests/frametest.c | 30 +++++++++++++++++++++--------- 3 files changed, 55 insertions(+), 30 deletions(-) diff --git a/doc/lz4_Frame_format.md b/doc/lz4_Frame_format.md index a0514e0..4548698 100644 --- a/doc/lz4_Frame_format.md +++ b/doc/lz4_Frame_format.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -1.6.1 (30/01/2018) +1.6.2 (12/08/2020) Introduction @@ -75,7 +75,7 @@ __Frame Descriptor__ 3 to 15 Bytes, to be detailed in its own paragraph, as it is the most important part of the spec. -The combined __Magic Number__ and __Frame Descriptor__ fields are sometimes +The combined _Magic_Number_ and _Frame_Descriptor_ fields are sometimes called ___LZ4 Frame Header___. Its size varies between 7 and 19 bytes. __Data Blocks__ @@ -85,14 +85,13 @@ That’s where compressed data is stored. __EndMark__ -The flow of blocks ends when the last data block has a size of “0”. -The size is expressed as a 32-bits value. +The flow of blocks ends when the last data block is announced with +an invalid size of “0”, expressed as a 32-bits value (exactly `0x00000000`). __Content Checksum__ -Content Checksum verify that the full content has been decoded correctly. -The content checksum is the result -of [xxh32() hash function](https://github.com/Cyan4973/xxHash) +_Content_Checksum_ verify that the full content has been decoded correctly. +The content checksum is the result of [xxHash-32 algorithm] digesting the original (decoded) data as input, and a seed of zero. Content checksum is only present when its associated flag is set in the frame descriptor. @@ -101,7 +100,7 @@ that all blocks were fully transmitted in the correct order and without error, and also that the encoding/decoding process itself generated no distortion. Its usage is recommended. -The combined __EndMark__ and __Content Checksum__ fields might sometimes be +The combined _EndMark_ and _Content_Checksum_ fields might sometimes be referred to as ___LZ4 Frame Footer___. Its size varies between 4 and 8 bytes. __Frame Concatenation__ @@ -261,16 +260,24 @@ __Block Size__ This field uses 4-bytes, format is little-endian. -The highest bit is “1” if data in the block is uncompressed. +If highest bit is set (`1`), the block is uncompressed. -The highest bit is “0” if data in the block is compressed by LZ4. +If highest bit is not set (`0`), the block is LZ4-compressed, +using the [LZ4 block format specification](https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md). -All other bits give the size, in bytes, of the following data block. +All other bits give the size, in bytes, of the data section. The size does not include the block checksum if present. -Block Size shall never be larger than Block Maximum Size. -Such a thing could potentially happen for non-compressible sources. -In such a case, such data block shall be passed using uncompressed format. +_Block_Size_ shall never be larger than _Block_Maximum_Size_. +Such an outcome could potentially happen for non-compressible sources. +In such a case, such data block must be passed using uncompressed format. + +A value of `0x00000000` is invalid, and signifies an _EndMark_ instead. +Note that this is different from a value of `0x80000000` (highest bit set), +which is an uncompressed block of size 0 (empty), +which is valid, and therefore doesn't end a frame. +Note that, if _Block_checksum_ is enabled, +even an empty block must be followed by a 32-bit block checksum. __Data__ @@ -279,20 +286,22 @@ It might be compressed or not, depending on previous field indications. When compressed, the data must respect the [LZ4 block format specification](https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md). -Note that the block is not necessarily full. -Uncompressed size of data can be any size, up to "Block Maximum Size”, +Note that a block is not necessarily full. +Uncompressed size of data can be any size __up to__ _Block_Maximum_Size_, so it may contain less data than the maximum block size. __Block checksum__ Only present if the associated flag is set. This is a 4-bytes checksum value, in little endian format, -calculated by using the xxHash-32 algorithm on the raw (undecoded) data block, +calculated by using the [xxHash-32 algorithm] on the __raw__ (undecoded) data block, and a seed of zero. The intention is to detect data corruption (storage or transmission errors) before decoding. -Block checksum is cumulative with Content checksum. +_Block_checksum_ can be cumulative with _Content_checksum_. + +[xxHash-32 algorithm]: https://github.com/Cyan4973/xxHash/blob/release/doc/xxhash_spec.md Skippable Frames @@ -389,6 +398,8 @@ and trigger an error if it does not fit within acceptable range. Version changes --------------- +1.6.2 : clarifies specification of _EndMark_ + 1.6.1 : introduced terms "LZ4 Frame Header" and "LZ4 Frame Footer" 1.6.0 : restored Dictionary ID field in Frame header diff --git a/lib/lz4frame.c b/lib/lz4frame.c index 5d716ea..e11f1c8 100644 --- a/lib/lz4frame.c +++ b/lib/lz4frame.c @@ -1483,14 +1483,16 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx, } /* if (dctx->dStage == dstage_storeBlockHeader) */ /* decode block header */ - { size_t const nextCBlockSize = LZ4F_readLE32(selectedIn) & 0x7FFFFFFFU; + { U32 const blockHeader = LZ4F_readLE32(selectedIn); + size_t const nextCBlockSize = blockHeader & 0x7FFFFFFFU; size_t const crcSize = dctx->frameInfo.blockChecksumFlag * BFSize; - if (nextCBlockSize==0) { /* frameEnd signal, no more block */ + if (blockHeader==0) { /* frameEnd signal, no more block */ dctx->dStage = dstage_getSuffix; break; } - if (nextCBlockSize > dctx->maxBlockSize) + if (nextCBlockSize > dctx->maxBlockSize) { return err0r(LZ4F_ERROR_maxBlockSize_invalid); + } if (LZ4F_readLE32(selectedIn) & LZ4F_BLOCKUNCOMPRESSED_FLAG) { /* next block is uncompressed */ dctx->tmpInTarget = nextCBlockSize; diff --git a/tests/frametest.c b/tests/frametest.c index f891530..236a98c 100644 --- a/tests/frametest.c +++ b/tests/frametest.c @@ -995,13 +995,13 @@ int fuzzerTests(U32 seed, unsigned nbTests, unsigned startTest, double compressi BYTE* op = (BYTE*)compressedBuffer; BYTE* const oend = op + (neverFlush ? LZ4F_compressFrameBound(srcSize, prefsPtr) : compressedBufferSize); /* when flushes are possible, can't guarantee a max compressed size */ unsigned const maxBits = FUZ_highbit((U32)srcSize); - size_t cSegmentSize; LZ4F_compressOptions_t cOptions; memset(&cOptions, 0, sizeof(cOptions)); - cSegmentSize = LZ4F_compressBegin(cCtx, op, (size_t)(oend-op), prefsPtr); - CHECK(LZ4F_isError(cSegmentSize), "Compression header failed (error %i)", - (int)cSegmentSize); - op += cSegmentSize; + { size_t const fhSize = LZ4F_compressBegin(cCtx, op, (size_t)(oend-op), prefsPtr); + CHECK(LZ4F_isError(fhSize), "Compression header failed (error %i)", + (int)fhSize); + op += fhSize; + } while (ip < iend) { unsigned const nbBitsSeg = FUZ_rand(&randState) % maxBits; size_t const sampleMax = (FUZ_rand(&randState) & ((1<frameInfo.blockChecksumFlag) { + U32 const bc32 = XXH32(op, 0, 0); + op[0] = (BYTE)bc32; /* little endian format */ + op[1] = (BYTE)(bc32>>8); + op[2] = (BYTE)(bc32>>16); + op[3] = (BYTE)(bc32>>24); + op += 4; + } } } } + } /* while (ip=oend, "LZ4F_compressFrameBound overflow"); { size_t const dstEndSafeSize = LZ4F_compressBound(0, prefsPtr); int const tooSmallDstEnd = ((FUZ_rand(&randState) & 31) == 3); @@ -1086,8 +1098,8 @@ int fuzzerTests(U32 seed, unsigned nbTests, unsigned startTest, double compressi DISPLAYLEVEL(6, "noisy decompression \n"); test_lz4f_decompression(compressedBuffer, cSize, srcStart, srcSize, crcOrig, &randState, dCtxNoise, seed, testNb); /* note : we don't analyze result here : it probably failed, which is expected. - * We just check for potential out-of-bound reads and writes. */ - LZ4F_resetDecompressionContext(dCtxNoise); /* context must be reset after an error */ + * The sole purpose is to catch potential out-of-bound reads and writes. */ + LZ4F_resetDecompressionContext(dCtxNoise); /* context must be reset after an error */ #endif } /* for ( ; (testNb < nbTests) ; ) */ -- cgit v0.12 From 5ab7d22fa5622ab0a02bc627e6ec8742a8e3707c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 14 Aug 2020 15:03:03 -0700 Subject: clarify endMark definition --- doc/lz4_Frame_format.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/lz4_Frame_format.md b/doc/lz4_Frame_format.md index 4548698..e7cbdbf 100644 --- a/doc/lz4_Frame_format.md +++ b/doc/lz4_Frame_format.md @@ -85,8 +85,8 @@ That’s where compressed data is stored. __EndMark__ -The flow of blocks ends when the last data block is announced with -an invalid size of “0”, expressed as a 32-bits value (exactly `0x00000000`). +The flow of blocks ends when the last data block is followed by +the 32-bit value `0x00000000`. __Content Checksum__ @@ -260,9 +260,9 @@ __Block Size__ This field uses 4-bytes, format is little-endian. -If highest bit is set (`1`), the block is uncompressed. +If the highest bit is set (`1`), the block is uncompressed. -If highest bit is not set (`0`), the block is LZ4-compressed, +If the highest bit is not set (`0`), the block is LZ4-compressed, using the [LZ4 block format specification](https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md). All other bits give the size, in bytes, of the data section. -- cgit v0.12