Merge pull request #910 from lz4/extraInput

Fix issue #783
author: Yann Collet <Cyan4973@users.noreply.github.com> 2020-08-27 18:00:28 (GMT)
committer: GitHub <noreply@github.com> 2020-08-27 18:00:28 (GMT)
commit: 440c8461d71a79ee927ce93077b58ad22d894d28 (patch)
tree: 6e352872dcbe8543766d33612d12468c1bd8dd04
parent: b73cd37baba01229fb67a7aaae9e95fcffd09059 (diff)
parent: 8b75d403d86eaf9786da89a49ca02444916c462e (diff)
download: lz4-440c8461d71a79ee927ce93077b58ad22d894d28.zip
lz4-440c8461d71a79ee927ce93077b58ad22d894d28.tar.gz
lz4-440c8461d71a79ee927ce93077b58ad22d894d28.tar.bz2
6 files changed, 128 insertions, 49 deletions
diff --git a/.travis.yml b/.travis.yml
index 1474fad..6074f08 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,9 +10,7 @@ matrix:
       script:
         - make   # test library build
         - make clean
-        - make -C tests test-lz4 MOREFLAGS='-Werror -Wconversion -Wno-sign-conversion' | tee # test scenario where `stdout` is not the console
-        - make clean
-        - CFLAGS=-m32 make -C tests test-lz4-contentSize
+        - make test MOREFLAGS='-Werror -Wconversion -Wno-sign-conversion' | tee # test scenario where `stdout` is not the console
 
     # Container-based 12.04 LTS Server Edition 64 bit (doesn't support 32-bit includes)
     - name: (Precise) benchmark test
diff --git a/lib/lz4.c b/lib/lz4.c
index 0ca7b21..0628eac 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -1813,7 +1813,8 @@ LZ4_decompress_generic(
             if ((dict==usingExtDict) && (match < lowPrefix)) {
                 if (unlikely(op+length > oend-LASTLITERALS)) {
                     if (partialDecoding) {
-                        length = MIN(length, (size_t)(oend-op));  /* reach end of buffer */
+                        DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd");
+                        length = MIN(length, (size_t)(oend-op));
                     } else {
                         goto _output_error;  /* end-of-block condition violated */
                 }   }
@@ -1921,29 +1922,34 @@ LZ4_decompress_generic(
               || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) )
             {
                 /* We've either hit the input parsing restriction or the output parsing restriction.
-                 * If we've hit the input parsing condition then this must be the last sequence.
-                 * If we've hit the output parsing condition then we are either using partialDecoding
-                 * or we've hit the output parsing condition.
+                 * In the normal scenario, decoding a full block, it must be the last sequence,
+                 * otherwise it's an error (invalid input or dimensions).
+                 * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow.
                  */
                 if (partialDecoding) {
                     /* Since we are partial decoding we may be in this block because of the output parsing
                      * restriction, which is not valid since the output buffer is allowed to be undersized.
                      */
                     assert(endOnInput);
-                    /* If we're in this block because of the input parsing condition, then we must be on the
-                     * last sequence (or invalid), so we must check that we exactly consume the input.
+                    DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end");
+                    DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length);
+                    DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op));
+                    DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip));
+                    /* Finishing in the middle of a literals segment,
+                     * due to lack of input.
                      */
-                    if ((ip+length>iend-(2+1+LASTLITERALS)) && (ip+length != iend)) { goto _output_error; }
-                    assert(ip+length <= iend);
-                    /* We are finishing in the middle of a literals segment.
-                     * Break after the copy.
+                    if (ip+length > iend) {
+                        length = (size_t)(iend-ip);
+                        cpy = op + length;
+                    }
+                    /* Finishing in the middle of a literals segment,
+                     * due to lack of output space.
                      */
                     if (cpy > oend) {
                         cpy = oend;
                         assert(op<=oend);
                         length = (size_t)(oend-op);
                     }
-                    assert(ip+length <= iend);
                 } else {
                     /* We must be on the last sequence because of the parsing limitations so check
                      * that we exactly regenerate the original size (must be exact when !endOnInput).
@@ -1954,14 +1960,15 @@ LZ4_decompress_generic(
                       */
                     if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) { goto _output_error; }
                 }
-                memmove(op, ip, length);  /* supports overlapping memory regions, which only matters for in-place decompression scenarios */
+                memmove(op, ip, length);  /* supports overlapping memory regions; only matters for in-place decompression scenarios */
                 ip += length;
                 op += length;
-                /* Necessarily EOF when !partialDecoding. When partialDecoding
-                 * it is EOF if we've either filled the output buffer or hit
-                 * the input parsing restriction.
+                /* Necessarily EOF when !partialDecoding.
+                 * When partialDecoding, it is EOF if we've either
+                 * filled the output buffer or
+                 * cannot proceed with reading the offset of the following match.
                  */
-                if (!partialDecoding || (cpy == oend) || (ip == iend)) {
+                if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) {
                     break;
                 }
             } else {
diff --git a/lib/lz4.h b/lib/lz4.h
index 5209c10..5d2475c 100644
--- a/lib/lz4.h
+++ b/lib/lz4.h
@@ -221,25 +221,35 @@ LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePt
  *  Decompress an LZ4 compressed block, of size 'srcSize' at position 'src',
  *  into destination buffer 'dst' of size 'dstCapacity'.
  *  Up to 'targetOutputSize' bytes will be decoded.
- *  The function stops decoding on reaching this objective,
- *  which can boost performance when only the beginning of a block is required.
+ *  The function stops decoding on reaching this objective.
+ *  This can be useful to boost performance
+ *  whenever only the beginning of a block is required.
  *
- * @return : the number of bytes decoded in `dst` (necessarily <= dstCapacity)
+ * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize)
  *           If source stream is detected malformed, function returns a negative result.
  *
- *  Note : @return can be < targetOutputSize, if compressed block contains less data.
+ *  Note 1 : @return can be < targetOutputSize, if compressed block contains less data.
  *
- *  Note 2 : this function features 2 parameters, targetOutputSize and dstCapacity,
- *           and expects targetOutputSize <= dstCapacity.
- *           It effectively stops decoding on reaching targetOutputSize,
+ *  Note 2 : targetOutputSize must be <= dstCapacity
+ *
+ *  Note 3 : this function effectively stops decoding on reaching targetOutputSize,
  *           so dstCapacity is kind of redundant.
- *           This is because in a previous version of this function,
- *           decoding operation would not "break" a sequence in the middle.
- *           As a consequence, there was no guarantee that decoding would stop at exactly targetOutputSize,
+ *           The parameter remains because, in older versions of this function,
+ *           the decoding operation would always write complete sequences.
+ *           Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize,
  *           it could write more bytes, though only up to dstCapacity.
  *           Some "margin" used to be required for this operation to work properly.
- *           This is no longer necessary.
- *           The function nonetheless keeps its signature, in an effort to not break API.
+ *           Thankfully, this is no longer necessary.
+ *           The function nonetheless keeps the same signature, in an effort to preserve API compatibility.
+ *
+ *  Note 4 : If srcSize is the exact size of the block,
+ *           then targetOutputSize can be any value,
+ *           including larger than the block's decompressed size.
+ *           The function will generate, at most, the block's decompressed size.
+ *
+ *  Note 5 : If srcSize is _larger_ than the block's compressed size,
+ *           then targetOutputSize **MUST** be <= the block's decompressed size.
+ *           Otherwise, *silent corruption will occur*.
  */
 LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity);
 
diff --git a/tests/Makefile b/tests/Makefile
index 866ff5d..5a6ec59 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -55,7 +55,7 @@ NB_LOOPS     ?= -i1
 
 default: all
 
-all: fullbench fuzzer frametest roundTripTest datagen checkFrame
+all: fullbench fuzzer frametest roundTripTest datagen checkFrame decompress-partial
 
 all32: CFLAGS+=-m32
 all32: all
@@ -104,6 +104,9 @@ datagen : $(PRGDIR)/datagen.c datagencli.c
 checkFrame : lz4frame.o lz4.o lz4hc.o xxhash.o checkFrame.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)
 
+decompress-partial: lz4.o decompress-partial.c
+	$(CC) $(FLAGS) $^ -o $@$(EXT)
+
 clean:
 	@$(MAKE) -C $(LZ4DIR) $@ > $(VOID)
 	@$(MAKE) -C $(PRGDIR) $@ > $(VOID)
@@ -114,7 +117,8 @@ clean:
         frametest$(EXT) frametest32$(EXT) \
         fasttest$(EXT) roundTripTest$(EXT) \
         datagen$(EXT) checkTag$(EXT) \
-        frameTest$(EXT) lz4_all.c
+        frameTest$(EXT) decompress-partial$(EXT) \
+		lz4_all.c
 	@$(RM) -rf $(TESTDIR)
 	@echo Cleaning completed
 
@@ -158,7 +162,7 @@ list:
 check: test-lz4-essentials
 
 .PHONY: test
-test: test-lz4 test-lz4c test-frametest test-fullbench test-fuzzer test-install test-amalgamation listTest
+test: test-lz4 test-lz4c test-frametest test-fullbench test-fuzzer test-install test-amalgamation listTest test-decompress-partial
 
 .PHONY: test32
 test32: CFLAGS+=-m32
@@ -401,8 +405,8 @@ test-lz4-dict: lz4 datagen
 
 test-lz4-hugefile: lz4 datagen
 	@echo "\n ---- test huge files compression/decompression ----"
-	$(DATAGEN) -g6GB   | $(LZ4) -vB5D  | $(LZ4) -qt
-	$(DATAGEN) -g5GB   | $(LZ4) -v4BD  | $(LZ4) -qt
+	./datagen -g6GB    | $(LZ4) -vB5D  | $(LZ4) -qt
+	./datagen -g4500MB | $(LZ4) -v3BD | $(LZ4) -qt
 	# test large file size [2-4] GB
 	@$(DATAGEN) -g3G -P100 | $(LZ4) -vv | $(LZ4) --decompress --force --sparse - tmphf1
 	@ls -ls tmphf1
@@ -530,4 +534,8 @@ test-mem: lz4 datagen fuzzer frametest fullbench
 test-mem32: lz4c32 datagen
 # unfortunately, valgrind doesn't seem to work with non-native binary...
 
+test-decompress-partial : decompress-partial
+	@echo "\n ---- test decompress-partial ----"
+	./decompress-partial$(EXT)
+
 endif
diff --git a/tests/decompress-partial.c b/tests/decompress-partial.c
new file mode 100644
index 0000000..4e124b7
--- /dev/null
+++ b/tests/decompress-partial.c
@@ -0,0 +1,49 @@
+#include "stdio.h"
+#include "string.h"
+#include "lz4.h"
+
+const char source[] =
+  "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod\n"
+  "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim\n"
+  "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea\n"
+  "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate\n"
+  "velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat\n"
+  "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id\n"
+  "est laborum.\n"
+  "\n"
+  "Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium\n"
+  "doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore\n"
+  "veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim\n"
+  "ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia\n"
+  "consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque\n"
+  "porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur,\n"
+  "adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore\n"
+  "et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis\n"
+  "nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid\n"
+  "ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea\n"
+  "voluptate velit esse quam nihil molestiae consequatur, vel illum qui\n"
+  "dolorem eum fugiat quo voluptas nulla pariatur?\n";
+
+#define BUFFER_SIZE 2048
+
+int main(void)
+{
+  int srcLen = (int)strlen(source);
+  char cmpBuffer[BUFFER_SIZE];
+  char outBuffer[BUFFER_SIZE];
+  int cmpSize;
+  int i;
+  
+  cmpSize = LZ4_compress_default(source, cmpBuffer, srcLen, BUFFER_SIZE);
+
+  for (i = cmpSize; i < cmpSize + 10; ++i) {
+    int result = LZ4_decompress_safe_partial(cmpBuffer, outBuffer, i, srcLen, BUFFER_SIZE);
+    if ((result < 0) || (result != srcLen) || memcmp(source, outBuffer, srcLen)) {
+      printf("test decompress-partial error \n");
+      return -1;
+    }
+  }
+  
+  printf("test decompress-partial OK \n");
+  return 0;
+}
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index 3d7456a..4658d79 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -618,13 +618,16 @@ static int FUZ_test(U32 seed, U32 nbCycles, const U32 startCycle, const double c
 
         /* Test partial decoding => must work */
         FUZ_DISPLAYTEST("test LZ4_decompress_safe_partial");
-        {   size_t const missingBytes = FUZ_rand(&randState) % (unsigned)blockSize;
-            int const targetSize = (int)((size_t)blockSize - missingBytes);
+        {   size_t const missingOutBytes = FUZ_rand(&randState) % (unsigned)blockSize;
+            int const targetSize = (int)((size_t)blockSize - missingOutBytes);
+            size_t const extraneousInBytes = FUZ_rand(&randState) % 2;
+            int const inCSize = (int)((size_t)compressedSize + extraneousInBytes);
             char const sentinel = decodedBuffer[targetSize] = block[targetSize] ^ 0x5A;
-            int const decResult = LZ4_decompress_safe_partial(compressedBuffer, decodedBuffer, compressedSize, targetSize, blockSize);
+            int const decResult = LZ4_decompress_safe_partial(compressedBuffer, decodedBuffer, inCSize, targetSize, blockSize);
             FUZ_CHECKTEST(decResult<0, "LZ4_decompress_safe_partial failed despite valid input data (error:%i)", decResult);
             FUZ_CHECKTEST(decResult != targetSize, "LZ4_decompress_safe_partial did not regenerated required amount of data (%i < %i <= %i)", decResult, targetSize, blockSize);
             FUZ_CHECKTEST(decodedBuffer[targetSize] != sentinel, "LZ4_decompress_safe_partial overwrite beyond requested size (though %i <= %i <= %i)", decResult, targetSize, blockSize);
+            FUZ_CHECKTEST(memcmp(block, decodedBuffer, (size_t)targetSize), "LZ4_decompress_safe_partial: corruption detected in regenerated data");
         }
 
         /* Test Compression with limited output size */
@@ -856,12 +859,12 @@ static int FUZ_test(U32 seed, U32 nbCycles, const U32 startCycle, const double c
         FUZ_CHECKTEST(decodedBuffer[blockSize-1], "LZ4_decompress_safe_usingDict overrun specified output buffer size");
 
         FUZ_DISPLAYTEST("LZ4_decompress_safe_usingDict with a too small output buffer");
-        {   U32 const missingBytes = (FUZ_rand(&randState) & 0xF) + 2;
-            if ((U32)blockSize > missingBytes) {
-                decodedBuffer[(U32)blockSize-missingBytes] = 0;
+        {   int const missingBytes = (FUZ_rand(&randState) & 0xF) + 2;
+            if (blockSize > missingBytes) {
+                decodedBuffer[blockSize-missingBytes] = 0;
                 ret = LZ4_decompress_safe_usingDict(compressedBuffer, decodedBuffer, blockContinueCompressedSize, blockSize-missingBytes, dict, dictSize);
-                FUZ_CHECKTEST(ret>=0, "LZ4_decompress_safe_usingDict should have failed : output buffer too small (-%u byte)", missingBytes);
-                FUZ_CHECKTEST(decodedBuffer[blockSize-missingBytes], "LZ4_decompress_safe_usingDict overrun specified output buffer size (-%u byte) (blockSize=%i)", missingBytes, blockSize);
+                FUZ_CHECKTEST(ret>=0, "LZ4_decompress_safe_usingDict should have failed : output buffer too small (-%i byte)", missingBytes);
+                FUZ_CHECKTEST(decodedBuffer[blockSize-missingBytes], "LZ4_decompress_safe_usingDict overrun specified output buffer size (-%i byte) (blockSize=%i)", missingBytes, blockSize);
         }   }
 
         /* Compress HC using External dictionary */
@@ -948,7 +951,7 @@ static int FUZ_test(U32 seed, U32 nbCycles, const U32 startCycle, const double c
 
         /* Compress HC continue destSize */
         FUZ_DISPLAYTEST();
-        {   int const availableSpace = (int)(FUZ_rand(&randState) % blockSize) + 5;
+        {   int const availableSpace = (int)(FUZ_rand(&randState) % (U32)blockSize) + 5;
             int consumedSize = blockSize;
             FUZ_DISPLAYTEST();
             LZ4_loadDictHC(LZ4dictHC, dict, dictSize);
@@ -974,10 +977,14 @@ static int FUZ_test(U32 seed, U32 nbCycles, const U32 startCycle, const double c
 
         /* ***** End of tests *** */
         /* Fill stats */
-        bytes += blockSize;
-        cbytes += compressedSize;
-        hcbytes += HCcompressedSize;
-        ccbytes += blockContinueCompressedSize;
+        assert(blockSize >= 0);
+        bytes += (unsigned)blockSize;
+        assert(compressedSize >= 0);
+        cbytes += (unsigned)compressedSize;
+        assert(HCcompressedSize >= 0);
+        hcbytes += (unsigned)HCcompressedSize;
+        assert(blockContinueCompressedSize >= 0);
+        ccbytes += (unsigned)blockContinueCompressedSize;
     }
 
     if (nbCycles<=1) nbCycles = cycleNb;   /* end by time */
author	Yann Collet <Cyan4973@users.noreply.github.com>	2020-08-27 18:00:28 (GMT)
committer	GitHub <noreply@github.com>	2020-08-27 18:00:28 (GMT)
commit	440c8461d71a79ee927ce93077b58ad22d894d28 (patch)
tree	6e352872dcbe8543766d33612d12468c1bd8dd04
parent	b73cd37baba01229fb67a7aaae9e95fcffd09059 (diff)
parent	8b75d403d86eaf9786da89a49ca02444916c462e (diff)
download	lz4-440c8461d71a79ee927ce93077b58ad22d894d28.zip lz4-440c8461d71a79ee927ce93077b58ad22d894d28.tar.gz lz4-440c8461d71a79ee927ce93077b58ad22d894d28.tar.bz2