From 6b0c39b839b8343da195252a8c46e6d93138f3b8 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 10 Mar 2015 00:12:51 +0100 Subject: Updated datagen (can create sparse files) --- Makefile | 2 +- programs/Makefile | 5 +- programs/datagen.c | 327 ++++++++++++++++++++------------------------------ programs/datagen.h | 40 ++++++ programs/datagencli.c | 190 +++++++++++++++++++++++++++++ 5 files changed, 366 insertions(+), 198 deletions(-) create mode 100644 programs/datagen.h create mode 100644 programs/datagencli.c diff --git a/Makefile b/Makefile index 5662cb4..5c7c69e 100644 --- a/Makefile +++ b/Makefile @@ -50,7 +50,7 @@ TEXT = $(LZ4DIR)/lz4.c $(LZ4DIR)/lz4.h $(LZ4DIR)/lz4hc.c $(LZ4DIR)/lz4hc.h \ Makefile lz4_block_format.txt LZ4_Frame_Format.html NEWS README.md \ cmake_unofficial/CMakeLists.txt \ $(PRGDIR)/fullbench.c $(PRGDIR)/lz4cli.c \ - $(PRGDIR)/datagen.c $(PRGDIR)/fuzzer.c \ + $(PRGDIR)/datagen.c $(PRGDIR)/datagen.h $(PRGDIR)/datagencli.c $(PRGDIR)/fuzzer.c \ $(PRGDIR)/lz4io.c $(PRGDIR)/lz4io.h \ $(PRGDIR)/bench.c $(PRGDIR)/bench.h \ $(PRGDIR)/lz4.1 $(PRGDIR)/lz4c.1 $(PRGDIR)/lz4cat.1 \ diff --git a/programs/Makefile b/programs/Makefile index 1070f40..2c883d0 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -92,7 +92,7 @@ frametest: $(LZ4DIR)/lz4frame.c $(LZ4DIR)/lz4.c $(LZ4DIR)/lz4hc.c $(LZ4DIR)/xxha frametest32: $(LZ4DIR)/lz4frame.c $(LZ4DIR)/lz4.c $(LZ4DIR)/lz4hc.c $(LZ4DIR)/xxhash.c frametest.c $(CC) -m32 $(FLAGS) $^ -o $@$(EXT) -datagen : datagen.c +datagen : datagen.c datagencli.c $(CC) $(FLAGS) $^ -o $@$(EXT) @@ -142,7 +142,7 @@ test-travis: $(TRAVIS_TARGET) test-lz4: lz4 datagen ./datagen -g16KB | ./lz4 -9 | ./lz4 -vdq > $(VOID) ./datagen | ./lz4 | ./lz4 -vdq > $(VOID) - ./datagen -g6M -p100 | ./lz4 -9BD | ./lz4 -vdq > $(VOID) + ./datagen -g6M -P100 | ./lz4 -9BD | ./lz4 -vdq > $(VOID) ./datagen -g17M | ./lz4 -9v | ./lz4 -vdq > $(VOID) ./datagen -g256MB | ./lz4 -vqB4D | ./lz4 -vdq > $(VOID) ./datagen -g6GB | ./lz4 -vqB5D | ./lz4 -vdq > $(VOID) @@ -197,6 +197,7 @@ test-frametest32: frametest32 ./frametest32 test-mem: lz4 datagen fuzzer frametest + valgrind --leak-check=yes ./datagen -g50M > /dev/null ./datagen -g16KB > tmp valgrind --leak-check=yes ./lz4 -9 -BD -f tmp /dev/null ./datagen -g16MB > tmp diff --git a/programs/datagen.c b/programs/datagen.c index 0f07477..2a10b81 100644 --- a/programs/datagen.c +++ b/programs/datagen.c @@ -19,26 +19,20 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. You can contact the author at : - - LZ4 source repository : http://code.google.com/p/lz4 - - LZ4 source mirror : https://github.com/Cyan4973/lz4 - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c + - ZSTD source repository : https://github.com/Cyan4973/zstd + - Public forum : https://groups.google.com/forum/#!forum/lz4c */ /************************************** - Remove Visual warning messages +* Includes **************************************/ -#define _CRT_SECURE_NO_WARNINGS // fgets +#include /* malloc */ +#include /* FILE, fwrite */ +#include /* memcpy */ /************************************** - Includes -**************************************/ -#include // fgets, sscanf -#include // strcmp - - -/************************************** - Basic Types +* Basic Types **************************************/ #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ # include @@ -57,230 +51,173 @@ /************************************** - Constants +* OS-specific Includes **************************************/ -#ifndef LZ4_VERSION -# define LZ4_VERSION "r125" +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) +# include /* _O_BINARY */ +# include /* _setmode, _isatty */ +# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY) +#else +# define SET_BINARY_MODE(file) #endif -#define KB *(1 <<10) -#define MB *(1 <<20) -#define GB *(1U<<30) - -#define CDG_SIZE_DEFAULT (64 KB) -#define CDG_SEED_DEFAULT 0 -#define CDG_COMPRESSIBILITY_DEFAULT 50 -#define PRIME1 2654435761U -#define PRIME2 2246822519U - /************************************** - Macros +* Constants **************************************/ -#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) -#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } - +#define KB *(1 <<10) -/************************************** - Local Parameters -**************************************/ -static unsigned no_prompt = 0; -static char* programName; -static unsigned displayLevel = 2; +#define PRIME1 2654435761U +#define PRIME2 2246822519U /********************************************************* - functions +* Local Functions *********************************************************/ - -#define CDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) -static unsigned int CDG_rand(U32* src) +#define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) +static unsigned int RDG_rand(U32* src) { U32 rand32 = *src; rand32 *= PRIME1; - rand32 += PRIME2; - rand32 = CDG_rotl32(rand32, 13); + rand32 ^= PRIME2; + rand32 = RDG_rotl32(rand32, 13); *src = rand32; return rand32; } -#define CDG_RAND15BITS ((CDG_rand(seed) >> 3) & 32767) -#define CDG_RANDLENGTH ( ((CDG_rand(seed) >> 7) & 3) ? (CDG_rand(seed) % 14) : (CDG_rand(seed) & 511) + 15) -#define CDG_RANDCHAR (((CDG_rand(seed) >> 9) & 63) + '0') -static void CDG_generate(U64 size, U32* seed, double proba) +#define LTSIZE 8192 +#define LTMASK (LTSIZE-1) +static void* RDG_createLiteralDistrib(double ld) { - BYTE fullbuff[32 KB + 128 KB + 1]; - BYTE* buff = fullbuff + 32 KB; - U64 total=0; - U32 P32 = (U32)(32768 * proba); - U32 pos=1; - U32 genBlockSize = 128 KB; + BYTE* lt = malloc(LTSIZE); + U32 i = 0; + BYTE character = '0'; + BYTE firstChar = '('; + BYTE lastChar = '}'; - // Build initial prefix - fullbuff[0] = CDG_RANDCHAR; - while (pos<32 KB) + if (ld==0.0) { - // Select : Literal (char) or Match (within 32K) - if (CDG_RAND15BITS < P32) + character = 0; + firstChar = 0; + lastChar =255; + } + while (i LTSIZE) weight = LTSIZE-i; + end = i + weight; + while (i < end) lt[i++] = character; + character++; + if (character > lastChar) character = firstChar; + } + return lt; +} + +static char RDG_genChar(U32* seed, const void* ltctx) +{ + const BYTE* lt = ltctx; + U32 id = RDG_rand(seed) & LTMASK; + return lt[id]; +} + +#define RDG_DICTSIZE (32 KB) +#define RDG_RAND15BITS ((RDG_rand(seed) >> 3) & 32767) +#define RDG_RANDLENGTH ( ((RDG_rand(seed) >> 7) & 7) ? (RDG_rand(seed) & 15) : (RDG_rand(seed) & 511) + 15) +void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double matchProba, void* litTable, unsigned* seedPtr) +{ + BYTE* buffPtr = (BYTE*)buffer; + const U32 matchProba32 = (U32)(32768 * matchProba); + size_t pos = prefixSize; + void* ldctx = litTable; + U32* seed = seedPtr; + + /* special case */ + while (matchProba >= 1.0) + { + size_t size0 = RDG_rand(seed) & 3; + size0 = 1U << (16 + size0 * 2); + size0 += RDG_rand(seed) & (size0-1); /* because size0 is power of 2*/ + if (buffSize < pos + size0) + { + memset(buffPtr+pos, 0, buffSize-pos); + return; + } + memset(buffPtr+pos, 0, size0); + pos += size0; + buffPtr[pos-1] = RDG_genChar(seed, ldctx); + } + + /* init */ + if (pos==0) buffPtr[0] = RDG_genChar(seed, ldctx), pos=1; + + /* Generate compressible data */ + while (pos < buffSize) + { + /* Select : Literal (char) or Match (within 32K) */ + if (RDG_RAND15BITS < matchProba32) { - // Copy (within 64K) + /* Copy (within 32K) */ + int match; U32 d; - int ref; - int length = CDG_RANDLENGTH + 4; - U32 offset = CDG_RAND15BITS + 1; + int length = RDG_RANDLENGTH + 4; + U32 offset = RDG_RAND15BITS + 1; if (offset > pos) offset = pos; - ref = pos - offset; + match = pos - offset; d = pos + length; - while (pos < d) fullbuff[pos++] = fullbuff[ref++]; + if (d > buffSize) d = buffSize; + while (pos < d) buffPtr[pos++] = buffPtr[match++]; } else { - // Literal (noise) - U32 d = pos + CDG_RANDLENGTH; - while (pos < d) fullbuff[pos++] = CDG_RANDCHAR; - } - } - - // Generate compressible data - pos = 0; - while (total < size) - { - if (size-total < 128 KB) genBlockSize = (U32)(size-total); - total += genBlockSize; - buff[genBlockSize] = 0; - pos = 0; - while (pos genBlockSize ) length = genBlockSize - pos; - ref = pos - offset; - d = pos + length; - while (pos < d) buff[pos++] = buff[ref++]; - } - else - { - // Literal (noise) - U32 d; - int length = CDG_RANDLENGTH; - if (pos + length > genBlockSize) length = genBlockSize - pos; - d = pos + length; - while (pos < d) buff[pos++] = CDG_RANDCHAR; - } + /* Literal (noise) */ + size_t d; + size_t length = RDG_RANDLENGTH; + d = pos + length; + if (d > buffSize) d = buffSize; + while (pos < d) buffPtr[pos++] = RDG_genChar(seed, ldctx); } - // output datagen - pos=0; - for (;pos+512<=genBlockSize;pos+=512) - printf("%512.512s", buff+pos); - for (;pos='0') && (*argument<='9')) - { - size *= 10; - size += *argument - '0'; - argument++; - } - if (*argument=='K') { size <<= 10; argument++; } - if (*argument=='M') { size <<= 20; argument++; } - if (*argument=='G') { size <<= 30; argument++; } - if (*argument=='B') { argument++; } - break; - case 's': - argument++; - seed=0; - while ((*argument>='0') && (*argument<='9')) - { - seed *= 10; - seed += *argument - '0'; - argument++; - } - break; - case 'p': - argument++; - proba=0; - while ((*argument>='0') && (*argument<='9')) - { - proba *= 10; - proba += *argument - '0'; - argument++; - } - if (proba<0) proba=0; - if (proba>100) proba=100; - break; - case 'v': - displayLevel = 4; - argument++; - break; - default: ; - } - } + /* Generate dict */ + RDG_genBlock(buff, RDG_DICTSIZE, 0, matchProba, ldctx, &seed); - } + /* Generate compressible data */ + while (total < size) + { + RDG_genBlock(buff, RDG_DICTSIZE+RDG_BLOCKSIZE, RDG_DICTSIZE, matchProba, ldctx, &seed); + if (size-total < RDG_BLOCKSIZE) genBlockSize = (size_t)(size-total); + total += genBlockSize; + fwrite(buff, 1, genBlockSize, stdout); + /* update dict */ + memcpy(buff, buff + RDG_BLOCKSIZE, RDG_DICTSIZE); } - // Get Seed - DISPLAYLEVEL(4, "Data Generator %s \n", LZ4_VERSION); - DISPLAYLEVEL(3, "Seed = %u \n", seed); - if (proba!=CDG_COMPRESSIBILITY_DEFAULT) DISPLAYLEVEL(3, "Compressibility : %i%%\n", proba); - - CDG_generate(size, &seed, ((double)proba) / 100); - - return 0; + free(ldctx); } diff --git a/programs/datagen.h b/programs/datagen.h new file mode 100644 index 0000000..631d146 --- /dev/null +++ b/programs/datagen.h @@ -0,0 +1,40 @@ +/* + datagen.h - compressible data generator header + Copyright (C) Yann Collet 2012-2015 + + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + You can contact the author at : + - ZSTD source repository : https://github.com/Cyan4973/zstd + - Public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + +#include /* size_t */ + +void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed); +void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed); +/* RDG_genOut + Generate 'size' bytes of compressible data into stdout. + Compressibility can be controlled using 'matchProba'. + 'LitProba' is optional, and affect variability of bytes. If litProba==0.0, default value is used. + Generated data can be selected using 'seed'. + If (matchProba, litProba and seed) are equal, the function always generate the same content. + + RDG_genBuffer + Same as RDG_genOut, but generate data into provided buffer +*/ diff --git a/programs/datagencli.c b/programs/datagencli.c new file mode 100644 index 0000000..801e198 --- /dev/null +++ b/programs/datagencli.c @@ -0,0 +1,190 @@ +/* + datagencli.c + compressible data command line generator + Copyright (C) Yann Collet 2012-2015 + + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + You can contact the author at : + - ZSTD source repository : https://github.com/Cyan4973/zstd + - Public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +/************************************** +* Includes +**************************************/ +#include /* fprintf, stderr */ +#include "datagen.h" /* RDG_generate */ + + +/************************************** +* Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/************************************** +* Constants +**************************************/ +#ifndef ZSTD_VERSION +# define ZSTD_VERSION "r1" +#endif + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define SIZE_DEFAULT (64 KB) +#define SEED_DEFAULT 0 +#define COMPRESSIBILITY_DEFAULT 50 + + +/************************************** +* Macros +**************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static unsigned displayLevel = 2; + + +/********************************************************* +* Command line +*********************************************************/ +static int usage(char* programName) +{ + DISPLAY( "Compressible data generator\n"); + DISPLAY( "Usage :\n"); + DISPLAY( " %s [size] [args]\n", programName); + DISPLAY( "\n"); + DISPLAY( "Arguments :\n"); + DISPLAY( " -g# : generate # data (default:%i)\n", SIZE_DEFAULT); + DISPLAY( " -s# : Select seed (default:%i)\n", SEED_DEFAULT); + DISPLAY( " -P# : Select compressibility in %% (default:%i%%)\n", COMPRESSIBILITY_DEFAULT); + DISPLAY( " -h : display help and exit\n"); + return 0; +} + + +int main(int argc, char** argv) +{ + int argNb; + double proba = (double)COMPRESSIBILITY_DEFAULT / 100; + double litProba = 0.0; + U64 size = SIZE_DEFAULT; + U32 seed = SEED_DEFAULT; + char* programName; + + /* Check command line */ + programName = argv[0]; + for(argNb=1; argNb='0') && (*argument<='9')) + { + size *= 10; + size += *argument - '0'; + argument++; + } + if (*argument=='K') { size <<= 10; argument++; } + if (*argument=='M') { size <<= 20; argument++; } + if (*argument=='G') { size <<= 30; argument++; } + if (*argument=='B') { argument++; } + break; + case 's': + argument++; + seed=0; + while ((*argument>='0') && (*argument<='9')) + { + seed *= 10; + seed += *argument - '0'; + argument++; + } + break; + case 'P': + argument++; + proba=0.0; + while ((*argument>='0') && (*argument<='9')) + { + proba *= 10; + proba += *argument - '0'; + argument++; + } + if (proba>100.) proba=100.; + proba /= 100.; + break; + case 'L': /* hidden argument : Literal distribution probability */ + argument++; + litProba=0.; + while ((*argument>='0') && (*argument<='9')) + { + litProba *= 10; + litProba += *argument - '0'; + argument++; + } + if (litProba>100.) litProba=100.; + litProba /= 100.; + break; + case 'v': + displayLevel = 4; + argument++; + break; + default: + return usage(programName); + } + } + + } + } + + DISPLAYLEVEL(4, "Data Generator %s \n", ZSTD_VERSION); + DISPLAYLEVEL(3, "Seed = %u \n", seed); + if (proba!=COMPRESSIBILITY_DEFAULT) DISPLAYLEVEL(3, "Compressibility : %i%%\n", (U32)(proba*100)); + + RDG_genOut(size, proba, litProba, seed); + DISPLAYLEVEL(1, "\n"); + + return 0; +} -- cgit v0.12