summaryrefslogtreecommitdiffstats
path: root/programs/datagen.c
diff options
context:
space:
mode:
authorYann Collet <yann.collet.73@gmail.com>2015-03-15 00:34:05 (GMT)
committerYann Collet <yann.collet.73@gmail.com>2015-03-15 00:34:05 (GMT)
commita18fb4392a9f249528a4071a5583e17668d92872 (patch)
tree56cf1740e2843b81960bbcdf3adc1c8a768f3583 /programs/datagen.c
parent74a6b14f291c1041a6b36dfb2722268b703c0e96 (diff)
parent248b761f55b92a34eacbfe2ac5d6f1b94e69d7bf (diff)
downloadlz4-a18fb4392a9f249528a4071a5583e17668d92872.zip
lz4-a18fb4392a9f249528a4071a5583e17668d92872.tar.gz
lz4-a18fb4392a9f249528a4071a5583e17668d92872.tar.bz2
Merge pull request #58 from Cyan4973/sparseFile
Sparse file
Diffstat (limited to 'programs/datagen.c')
-rwxr-xr-x[-rw-r--r--]programs/datagen.c329
1 files changed, 133 insertions, 196 deletions
diff --git a/programs/datagen.c b/programs/datagen.c
index 0f07477..bccb21e 100644..100755
--- a/programs/datagen.c
+++ b/programs/datagen.c
@@ -19,26 +19,20 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You can contact the author at :
- - LZ4 source repository : http://code.google.com/p/lz4
- - LZ4 source mirror : https://github.com/Cyan4973/lz4
- - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+ - ZSTD source repository : https://github.com/Cyan4973/zstd
+ - Public forum : https://groups.google.com/forum/#!forum/lz4c
*/
/**************************************
- Remove Visual warning messages
+* Includes
**************************************/
-#define _CRT_SECURE_NO_WARNINGS // fgets
+#include <stdlib.h> /* malloc */
+#include <stdio.h> /* FILE, fwrite */
+#include <string.h> /* memcpy */
/**************************************
- Includes
-**************************************/
-#include <stdio.h> // fgets, sscanf
-#include <string.h> // strcmp
-
-
-/**************************************
- Basic Types
+* Basic Types
**************************************/
#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */
# include <stdint.h>
@@ -57,230 +51,173 @@
/**************************************
- Constants
+* OS-specific Includes
**************************************/
-#ifndef LZ4_VERSION
-# define LZ4_VERSION "r125"
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
+# include <fcntl.h> /* _O_BINARY */
+# include <io.h> /* _setmode, _isatty */
+# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
+#else
+# define SET_BINARY_MODE(file)
#endif
-#define KB *(1 <<10)
-#define MB *(1 <<20)
-#define GB *(1U<<30)
-
-#define CDG_SIZE_DEFAULT (64 KB)
-#define CDG_SEED_DEFAULT 0
-#define CDG_COMPRESSIBILITY_DEFAULT 50
-#define PRIME1 2654435761U
-#define PRIME2 2246822519U
-
/**************************************
- Macros
+* Constants
**************************************/
-#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
-#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-
+#define KB *(1 <<10)
-/**************************************
- Local Parameters
-**************************************/
-static unsigned no_prompt = 0;
-static char* programName;
-static unsigned displayLevel = 2;
+#define PRIME1 2654435761U
+#define PRIME2 2246822519U
/*********************************************************
- functions
+* Local Functions
*********************************************************/
-
-#define CDG_rotl32(x,r) ((x << r) | (x >> (32 - r)))
-static unsigned int CDG_rand(U32* src)
+#define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static unsigned int RDG_rand(U32* src)
{
U32 rand32 = *src;
rand32 *= PRIME1;
- rand32 += PRIME2;
- rand32 = CDG_rotl32(rand32, 13);
+ rand32 ^= PRIME2;
+ rand32 = RDG_rotl32(rand32, 13);
*src = rand32;
return rand32;
}
-#define CDG_RAND15BITS ((CDG_rand(seed) >> 3) & 32767)
-#define CDG_RANDLENGTH ( ((CDG_rand(seed) >> 7) & 3) ? (CDG_rand(seed) % 14) : (CDG_rand(seed) & 511) + 15)
-#define CDG_RANDCHAR (((CDG_rand(seed) >> 9) & 63) + '0')
-static void CDG_generate(U64 size, U32* seed, double proba)
+#define LTSIZE 8192
+#define LTMASK (LTSIZE-1)
+static void* RDG_createLiteralDistrib(double ld)
{
- BYTE fullbuff[32 KB + 128 KB + 1];
- BYTE* buff = fullbuff + 32 KB;
- U64 total=0;
- U32 P32 = (U32)(32768 * proba);
- U32 pos=1;
- U32 genBlockSize = 128 KB;
+ BYTE* lt = (BYTE*)malloc(LTSIZE);
+ U32 i = 0;
+ BYTE character = '0';
+ BYTE firstChar = '(';
+ BYTE lastChar = '}';
- // Build initial prefix
- fullbuff[0] = CDG_RANDCHAR;
- while (pos<32 KB)
+ if (ld==0.0)
{
- // Select : Literal (char) or Match (within 32K)
- if (CDG_RAND15BITS < P32)
- {
- // Copy (within 64K)
- U32 d;
- int ref;
- int length = CDG_RANDLENGTH + 4;
- U32 offset = CDG_RAND15BITS + 1;
- if (offset > pos) offset = pos;
- ref = pos - offset;
- d = pos + length;
- while (pos < d) fullbuff[pos++] = fullbuff[ref++];
- }
- else
+ character = 0;
+ firstChar = 0;
+ lastChar =255;
+ }
+ while (i<LTSIZE)
+ {
+ U32 weight = (U32)((double)(LTSIZE - i) * ld) + 1;
+ U32 end;
+ if (weight + i > LTSIZE) weight = LTSIZE-i;
+ end = i + weight;
+ while (i < end) lt[i++] = character;
+ character++;
+ if (character > lastChar) character = firstChar;
+ }
+ return lt;
+}
+
+static char RDG_genChar(U32* seed, const void* ltctx)
+{
+ const BYTE* lt = (const BYTE*)ltctx;
+ U32 id = RDG_rand(seed) & LTMASK;
+ return lt[id];
+}
+
+#define RDG_DICTSIZE (32 KB)
+#define RDG_RAND15BITS ((RDG_rand(seed) >> 3) & 32767)
+#define RDG_RANDLENGTH ( ((RDG_rand(seed) >> 7) & 7) ? (RDG_rand(seed) & 15) : (RDG_rand(seed) & 511) + 15)
+void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double matchProba, void* litTable, unsigned* seedPtr)
+{
+ BYTE* buffPtr = (BYTE*)buffer;
+ const U32 matchProba32 = (U32)(32768 * matchProba);
+ size_t pos = prefixSize;
+ void* ldctx = litTable;
+ U32* seed = seedPtr;
+
+ /* special case */
+ while (matchProba >= 1.0)
+ {
+ size_t size0 = RDG_rand(seed) & 3;
+ size0 = (size_t)1 << (16 + size0 * 2);
+ size0 += RDG_rand(seed) & (size0-1); /* because size0 is power of 2*/
+ if (buffSize < pos + size0)
{
- // Literal (noise)
- U32 d = pos + CDG_RANDLENGTH;
- while (pos < d) fullbuff[pos++] = CDG_RANDCHAR;
+ memset(buffPtr+pos, 0, buffSize-pos);
+ return;
}
+ memset(buffPtr+pos, 0, size0);
+ pos += size0;
+ buffPtr[pos-1] = RDG_genChar(seed, ldctx);
}
- // Generate compressible data
- pos = 0;
- while (total < size)
+ /* init */
+ if (pos==0) buffPtr[0] = RDG_genChar(seed, ldctx), pos=1;
+
+ /* Generate compressible data */
+ while (pos < buffSize)
{
- if (size-total < 128 KB) genBlockSize = (U32)(size-total);
- total += genBlockSize;
- buff[genBlockSize] = 0;
- pos = 0;
- while (pos<genBlockSize)
+ /* Select : Literal (char) or Match (within 32K) */
+ if (RDG_RAND15BITS < matchProba32)
+ {
+ /* Copy (within 32K) */
+ size_t match;
+ size_t d;
+ int length = RDG_RANDLENGTH + 4;
+ U32 offset = RDG_RAND15BITS + 1;
+ if (offset > pos) offset = (U32)pos;
+ match = pos - offset;
+ d = pos + length;
+ if (d > buffSize) d = buffSize;
+ while (pos < d) buffPtr[pos++] = buffPtr[match++];
+ }
+ else
{
- // Select : Literal (char) or Match (within 32K)
- if (CDG_RAND15BITS < P32)
- {
- // Copy (within 64K)
- int ref;
- U32 d;
- int length = CDG_RANDLENGTH + 4;
- U32 offset = CDG_RAND15BITS + 1;
- if (pos + length > genBlockSize ) length = genBlockSize - pos;
- ref = pos - offset;
- d = pos + length;
- while (pos < d) buff[pos++] = buff[ref++];
- }
- else
- {
- // Literal (noise)
- U32 d;
- int length = CDG_RANDLENGTH;
- if (pos + length > genBlockSize) length = genBlockSize - pos;
- d = pos + length;
- while (pos < d) buff[pos++] = CDG_RANDCHAR;
- }
+ /* Literal (noise) */
+ size_t d;
+ size_t length = RDG_RANDLENGTH;
+ d = pos + length;
+ if (d > buffSize) d = buffSize;
+ while (pos < d) buffPtr[pos++] = RDG_genChar(seed, ldctx);
}
- // output datagen
- pos=0;
- for (;pos+512<=genBlockSize;pos+=512)
- printf("%512.512s", buff+pos);
- for (;pos<genBlockSize;pos++) printf("%c", buff[pos]);
- // Regenerate prefix
- memcpy(fullbuff, buff + 96 KB, 32 KB);
}
}
-int CDG_usage(void)
+void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed)
{
- DISPLAY( "Compressible data generator\n");
- DISPLAY( "Usage :\n");
- DISPLAY( " %s [size] [args]\n", programName);
- DISPLAY( "\n");
- DISPLAY( "Arguments :\n");
- DISPLAY( " -g# : generate # data (default:%i)\n", CDG_SIZE_DEFAULT);
- DISPLAY( " -s# : Select seed (default:%i)\n", CDG_SEED_DEFAULT);
- DISPLAY( " -p# : Select compressibility in %% (default:%i%%)\n", CDG_COMPRESSIBILITY_DEFAULT);
- DISPLAY( " -h : display help and exit\n");
- return 0;
+ void* ldctx;
+ if (litProba==0.0) litProba = matchProba / 4.5;
+ ldctx = RDG_createLiteralDistrib(litProba);
+ RDG_genBlock(buffer, size, 0, matchProba, ldctx, &seed);
+ free(ldctx);
}
-int main(int argc, char** argv)
+#define RDG_BLOCKSIZE (128 KB)
+void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed)
{
- int argNb;
- int proba = CDG_COMPRESSIBILITY_DEFAULT;
- U64 size = CDG_SIZE_DEFAULT;
- U32 seed = CDG_SEED_DEFAULT;
-
- // Check command line
- programName = argv[0];
- for(argNb=1; argNb<argc; argNb++)
- {
- char* argument = argv[argNb];
-
- if(!argument) continue; // Protection if argument empty
+ BYTE buff[RDG_DICTSIZE + RDG_BLOCKSIZE];
+ U64 total = 0;
+ size_t genBlockSize = RDG_BLOCKSIZE;
+ void* ldctx;
- // Decode command (note : aggregated commands are allowed)
- if (*argument=='-')
- {
- if (!strcmp(argument, "--no-prompt")) { no_prompt=1; continue; }
+ /* init */
+ if (litProba==0.0) litProba = matchProba / 4.5;
+ ldctx = RDG_createLiteralDistrib(litProba);
+ SET_BINARY_MODE(stdout);
- argument++;
- while (*argument!=0)
- {
- switch(*argument)
- {
- case 'h':
- return CDG_usage();
- case 'g':
- argument++;
- size=0;
- while ((*argument>='0') && (*argument<='9'))
- {
- size *= 10;
- size += *argument - '0';
- argument++;
- }
- if (*argument=='K') { size <<= 10; argument++; }
- if (*argument=='M') { size <<= 20; argument++; }
- if (*argument=='G') { size <<= 30; argument++; }
- if (*argument=='B') { argument++; }
- break;
- case 's':
- argument++;
- seed=0;
- while ((*argument>='0') && (*argument<='9'))
- {
- seed *= 10;
- seed += *argument - '0';
- argument++;
- }
- break;
- case 'p':
- argument++;
- proba=0;
- while ((*argument>='0') && (*argument<='9'))
- {
- proba *= 10;
- proba += *argument - '0';
- argument++;
- }
- if (proba<0) proba=0;
- if (proba>100) proba=100;
- break;
- case 'v':
- displayLevel = 4;
- argument++;
- break;
- default: ;
- }
- }
+ /* Generate dict */
+ RDG_genBlock(buff, RDG_DICTSIZE, 0, matchProba, ldctx, &seed);
- }
+ /* Generate compressible data */
+ while (total < size)
+ {
+ RDG_genBlock(buff, RDG_DICTSIZE+RDG_BLOCKSIZE, RDG_DICTSIZE, matchProba, ldctx, &seed);
+ if (size-total < RDG_BLOCKSIZE) genBlockSize = (size_t)(size-total);
+ total += genBlockSize;
+ fwrite(buff, 1, genBlockSize, stdout);
+ /* update dict */
+ memcpy(buff, buff + RDG_BLOCKSIZE, RDG_DICTSIZE);
}
- // Get Seed
- DISPLAYLEVEL(4, "Data Generator %s \n", LZ4_VERSION);
- DISPLAYLEVEL(3, "Seed = %u \n", seed);
- if (proba!=CDG_COMPRESSIBILITY_DEFAULT) DISPLAYLEVEL(3, "Compressibility : %i%%\n", proba);
-
- CDG_generate(size, &seed, ((double)proba) / 100);
-
- return 0;
+ free(ldctx);
}