diff options
author | Cristian Adam <cristian.adam@qt.io> | 2020-09-23 16:49:33 (GMT) |
---|---|---|
committer | Cristian Adam <cristian.adam@qt.io> | 2020-09-23 16:49:33 (GMT) |
commit | 0b3e9259dd4f36000ded4240dcd39f60d5b600d7 (patch) | |
tree | 54ef4f3f66eb0971af4e56a9529887df0c7fed5b /Utilities/cmzstd/lib/dictBuilder | |
parent | 145730c74630f09e0cb7a0167059ec7ee470d245 (diff) | |
parent | 4676ad8c32d279c159895372f2bd15769197bf09 (diff) | |
download | CMake-0b3e9259dd4f36000ded4240dcd39f60d5b600d7.zip CMake-0b3e9259dd4f36000ded4240dcd39f60d5b600d7.tar.gz CMake-0b3e9259dd4f36000ded4240dcd39f60d5b600d7.tar.bz2 |
Merge branch 'upstream-zstd'
# By zstd upstream
* upstream-zstd:
zstd 2020-05-21 (b706286a)
Diffstat (limited to 'Utilities/cmzstd/lib/dictBuilder')
-rw-r--r-- | Utilities/cmzstd/lib/dictBuilder/cover.c | 285 | ||||
-rw-r--r-- | Utilities/cmzstd/lib/dictBuilder/cover.h | 88 | ||||
-rw-r--r-- | Utilities/cmzstd/lib/dictBuilder/fastcover.c | 141 | ||||
-rw-r--r-- | Utilities/cmzstd/lib/dictBuilder/zdict.c | 62 | ||||
-rw-r--r-- | Utilities/cmzstd/lib/dictBuilder/zdict.h | 108 |
5 files changed, 502 insertions, 182 deletions
diff --git a/Utilities/cmzstd/lib/dictBuilder/cover.c b/Utilities/cmzstd/lib/dictBuilder/cover.c index b55bfb5..da54ef1 100644 --- a/Utilities/cmzstd/lib/dictBuilder/cover.c +++ b/Utilities/cmzstd/lib/dictBuilder/cover.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -26,11 +26,11 @@ #include <string.h> /* memset */ #include <time.h> /* clock */ -#include "mem.h" /* read */ -#include "pool.h" -#include "threading.h" +#include "../common/mem.h" /* read */ +#include "../common/pool.h" +#include "../common/threading.h" #include "cover.h" -#include "zstd_internal.h" /* includes zstd.h */ +#include "../common/zstd_internal.h" /* includes zstd.h */ #ifndef ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY #endif @@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group, * * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) * - * Once the dmer d is in the dictionay we set F(d) = 0. + * Once the dmer d is in the dictionary we set F(d) = 0. */ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs, COVER_map_t *activeDmers, U32 begin, @@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs, U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer); activeSegment.begin += 1; *delDmerOcc -= 1; - /* If this is the last occurence of the dmer, subtract its score */ + /* If this is the last occurrence of the dmer, subtract its score */ if (*delDmerOcc == 0) { COVER_map_remove(activeDmers, delDmer); activeSegment.score -= freqs[delDmer]; @@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) { * Prepare a context for dictionary building. * The context is only dependent on the parameter `d` and can used multiple * times. - * Returns 1 on success or zero on error. + * Returns 0 on success or error code on error. * The context must be destroyed with `COVER_ctx_destroy()`. */ -static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, +static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, unsigned d, double splitPoint) { const BYTE *const samples = (const BYTE *)samplesBuffer; @@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); - return 0; + return ERROR(srcSize_wrong); } /* Check if there are at least 5 training samples */ if (nbTrainSamples < 5) { DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); - return 0; + return ERROR(srcSize_wrong); } /* Check if there's testing sample */ if (nbTestSamples < 1) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); - return 0; + return ERROR(srcSize_wrong); } /* Zero the context */ memset(ctx, 0, sizeof(*ctx)); @@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) { DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n"); COVER_ctx_destroy(ctx); - return 0; + return ERROR(memory_allocation); } ctx->freqs = NULL; ctx->d = d; @@ -624,7 +624,40 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group); ctx->freqs = ctx->suffix; ctx->suffix = NULL; - return 1; + return 0; +} + +void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel) +{ + const double ratio = (double)nbDmers / maxDictSize; + if (ratio >= 10) { + return; + } + LOCALDISPLAYLEVEL(displayLevel, 1, + "WARNING: The maximum dictionary size %u is too large " + "compared to the source size %u! " + "size(source)/size(dictionary) = %f, but it should be >= " + "10! This may lead to a subpar dictionary! We recommend " + "training on sources at least 10x, and preferably 100x " + "the size of the dictionary! \n", (U32)maxDictSize, + (U32)nbDmers, ratio); +} + +COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, + U32 nbDmers, U32 k, U32 passes) +{ + const U32 minEpochSize = k * 10; + COVER_epoch_info_t epochs; + epochs.num = MAX(1, maxDictSize / k / passes); + epochs.size = nbDmers / epochs.num; + if (epochs.size >= minEpochSize) { + assert(epochs.size * epochs.num <= nbDmers); + return epochs; + } + epochs.size = MIN(minEpochSize, nbDmers); + epochs.num = nbDmers / epochs.size; + assert(epochs.size * epochs.num <= nbDmers); + return epochs; } /** @@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, ZDICT_cover_params_t parameters) { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; - /* Divide the data up into epochs of equal size. - * We will select at least one segment from each epoch. - */ - const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4)); - const unsigned epochSize = (U32)(ctx->suffixSize / epochs); + /* Divide the data into epochs. We will select one segment from each epoch. */ + const COVER_epoch_info_t epochs = COVER_computeEpochs( + (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4); + const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3)); + size_t zeroScoreRun = 0; size_t epoch; DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", - epochs, epochSize); + (U32)epochs.num, (U32)epochs.size); /* Loop through the epochs until there are no more segments or the dictionary * is full. */ - for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { - const U32 epochBegin = (U32)(epoch * epochSize); - const U32 epochEnd = epochBegin + epochSize; + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) { + const U32 epochBegin = (U32)(epoch * epochs.size); + const U32 epochEnd = epochBegin + epochs.size; size_t segmentSize; /* Select a segment */ COVER_segment_t segment = COVER_selectSegment( ctx, freqs, activeDmers, epochBegin, epochEnd, parameters); - /* If the segment covers no dmers, then we are out of content */ + /* If the segment covers no dmers, then we are out of content. + * There may be new content in other epochs, for continue for some time. + */ if (segment.score == 0) { - break; + if (++zeroScoreRun >= maxZeroScoreRun) { + break; + } + continue; } + zeroScoreRun = 0; /* Trim the segment if necessary and if it is too small then we are done */ segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); if (segmentSize < parameters.d) { @@ -690,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( /* Checks */ if (!COVER_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "Cover parameters incorrect\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (nbSamples == 0) { DISPLAYLEVEL(1, "Cover must have at least one input file\n"); - return ERROR(GENERIC); + return ERROR(srcSize_wrong); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", @@ -702,14 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( return ERROR(dstSize_tooSmall); } /* Initialize context and activeDmers */ - if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, - parameters.d, parameters.splitPoint)) { - return ERROR(GENERIC); + { + size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + parameters.d, parameters.splitPoint); + if (ZSTD_isError(initVal)) { + return initVal; + } } + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel); if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); COVER_ctx_destroy(&ctx); - return ERROR(GENERIC); + return ERROR(memory_allocation); } DISPLAYLEVEL(2, "Building dictionary\n"); @@ -770,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, cctx, dst, dstCapacity, samples + offsets[i], samplesSizes[i], cdict); if (ZSTD_isError(size)) { - totalCompressedSize = ERROR(GENERIC); + totalCompressedSize = size; goto _compressCleanup; } totalCompressedSize += size; @@ -846,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) { * Decrements liveJobs and signals any waiting threads if liveJobs == 0. * If this dictionary is the best so far save it and its parameters. */ -void COVER_best_finish(COVER_best_t *best, size_t compressedSize, - ZDICT_cover_params_t parameters, void *dict, - size_t dictSize) { +void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, + COVER_dictSelection_t selection) { + void* dict = selection.dictContent; + size_t compressedSize = selection.totalCompressedSize; + size_t dictSize = selection.dictSize; if (!best) { return; } @@ -874,10 +919,12 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize, } } /* Save the dictionary, parameters, and size */ - memcpy(best->dict, dict, dictSize); - best->dictSize = dictSize; - best->parameters = parameters; - best->compressedSize = compressedSize; + if (dict) { + memcpy(best->dict, dict, dictSize); + best->dictSize = dictSize; + best->parameters = parameters; + best->compressedSize = compressedSize; + } } if (liveJobs == 0) { ZSTD_pthread_cond_broadcast(&best->cond); @@ -886,6 +933,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize, } } +COVER_dictSelection_t COVER_dictSelectionError(size_t error) { + COVER_dictSelection_t selection = { NULL, 0, error }; + return selection; +} + +unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) { + return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent); +} + +void COVER_dictSelectionFree(COVER_dictSelection_t selection){ + free(selection.dictContent); +} + +COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, + size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, + size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) { + + size_t largestDict = 0; + size_t largestCompressed = 0; + BYTE* customDictContentEnd = customDictContent + dictContentSize; + + BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize); + BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize); + double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00; + + if (!largestDictbuffer || !candidateDictBuffer) { + free(largestDictbuffer); + free(candidateDictBuffer); + return COVER_dictSelectionError(dictContentSize); + } + + /* Initial dictionary size and compressed size */ + memcpy(largestDictbuffer, customDictContent, dictContentSize); + dictContentSize = ZDICT_finalizeDictionary( + largestDictbuffer, dictContentSize, customDictContent, dictContentSize, + samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams); + + if (ZDICT_isError(dictContentSize)) { + free(largestDictbuffer); + free(candidateDictBuffer); + return COVER_dictSelectionError(dictContentSize); + } + + totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes, + samplesBuffer, offsets, + nbCheckSamples, nbSamples, + largestDictbuffer, dictContentSize); + + if (ZSTD_isError(totalCompressedSize)) { + free(largestDictbuffer); + free(candidateDictBuffer); + return COVER_dictSelectionError(totalCompressedSize); + } + + if (params.shrinkDict == 0) { + COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize }; + free(candidateDictBuffer); + return selection; + } + + largestDict = dictContentSize; + largestCompressed = totalCompressedSize; + dictContentSize = ZDICT_DICTSIZE_MIN; + + /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */ + while (dictContentSize < largestDict) { + memcpy(candidateDictBuffer, largestDictbuffer, largestDict); + dictContentSize = ZDICT_finalizeDictionary( + candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize, + samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams); + + if (ZDICT_isError(dictContentSize)) { + free(largestDictbuffer); + free(candidateDictBuffer); + return COVER_dictSelectionError(dictContentSize); + + } + + totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes, + samplesBuffer, offsets, + nbCheckSamples, nbSamples, + candidateDictBuffer, dictContentSize); + + if (ZSTD_isError(totalCompressedSize)) { + free(largestDictbuffer); + free(candidateDictBuffer); + return COVER_dictSelectionError(totalCompressedSize); + } + + if (totalCompressedSize <= largestCompressed * regressionTolerance) { + COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize }; + free(largestDictbuffer); + return selection; + } + dictContentSize *= 2; + } + dictContentSize = largestDict; + totalCompressedSize = largestCompressed; + { + COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize }; + free(candidateDictBuffer); + return selection; + } +} + /** * Parameters for COVER_tryParameters(). */ @@ -911,6 +1063,7 @@ static void COVER_tryParameters(void *opaque) { /* Allocate space for hash table, dict, and freqs */ COVER_map_t activeDmers; BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); @@ -926,29 +1079,21 @@ static void COVER_tryParameters(void *opaque) { { const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, dictBufferCapacity, parameters); - dictBufferCapacity = ZDICT_finalizeDictionary( - dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, - parameters.zParams); - if (ZDICT_isError(dictBufferCapacity)) { - DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, + totalCompressedSize); + + if (COVER_dictSelectionIsError(selection)) { + DISPLAYLEVEL(1, "Failed to select dictionary\n"); goto _cleanup; } } - /* Check total compressed size */ - totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes, - ctx->samples, ctx->offsets, - ctx->nbTrainSamples, ctx->nbSamples, - dict, dictBufferCapacity); - _cleanup: - COVER_best_finish(data->best, totalCompressedSize, parameters, dict, - dictBufferCapacity); + free(dict); + COVER_best_finish(data->best, parameters, selection); free(data); COVER_map_destroy(&activeDmers); - if (dict) { - free(dict); - } + COVER_dictSelectionFree(selection); if (freqs) { free(freqs); } @@ -970,6 +1115,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); const unsigned kIterations = (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); + const unsigned shrinkDict = 0; /* Local variables */ const int displayLevel = parameters->zParams.notificationLevel; unsigned iteration = 1; @@ -977,19 +1123,20 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( unsigned k; COVER_best_t best; POOL_ctx *pool = NULL; + int warned = 0; /* Checks */ if (splitPoint <= 0 || splitPoint > 1) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (nbSamples == 0) { DISPLAYLEVEL(1, "Cover must have at least one input file\n"); - return ERROR(GENERIC); + return ERROR(srcSize_wrong); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", @@ -1013,11 +1160,18 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* Initialize the context for this value of d */ COVER_ctx_t ctx; LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); - if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) { - LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); - COVER_best_destroy(&best); - POOL_free(pool); - return ERROR(GENERIC); + { + const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint); + if (ZSTD_isError(initVal)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + COVER_best_destroy(&best); + POOL_free(pool); + return initVal; + } + } + if (!warned) { + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel); + warned = 1; } /* Loop through k reusing the same context */ for (k = kMinK; k <= kMaxK; k += kStepSize) { @@ -1030,7 +1184,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( COVER_best_destroy(&best); COVER_ctx_destroy(&ctx); POOL_free(pool); - return ERROR(GENERIC); + return ERROR(memory_allocation); } data->ctx = &ctx; data->best = &best; @@ -1040,6 +1194,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( data->parameters.d = d; data->parameters.splitPoint = splitPoint; data->parameters.steps = kSteps; + data->parameters.shrinkDict = shrinkDict; data->parameters.zParams.notificationLevel = g_displayLevel; /* Check the parameters */ if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) { diff --git a/Utilities/cmzstd/lib/dictBuilder/cover.h b/Utilities/cmzstd/lib/dictBuilder/cover.h index 82e2e1c..f2aa0e3 100644 --- a/Utilities/cmzstd/lib/dictBuilder/cover.h +++ b/Utilities/cmzstd/lib/dictBuilder/cover.h @@ -1,11 +1,21 @@ +/* + * Copyright (c) 2017-2020, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + #include <stdio.h> /* fprintf */ #include <stdlib.h> /* malloc, free, qsort */ #include <string.h> /* memset */ #include <time.h> /* clock */ -#include "mem.h" /* read */ -#include "pool.h" -#include "threading.h" -#include "zstd_internal.h" /* includes zstd.h */ +#include "../common/mem.h" /* read */ +#include "../common/pool.h" +#include "../common/threading.h" +#include "../common/zstd_internal.h" /* includes zstd.h */ #ifndef ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY #endif @@ -39,6 +49,44 @@ typedef struct { } COVER_segment_t; /** + *Number of epochs and size of each epoch. + */ +typedef struct { + U32 num; + U32 size; +} COVER_epoch_info_t; + +/** + * Struct used for the dictionary selection function. + */ +typedef struct COVER_dictSelection { + BYTE* dictContent; + size_t dictSize; + size_t totalCompressedSize; +} COVER_dictSelection_t; + +/** + * Computes the number of epochs and the size of each epoch. + * We will make sure that each epoch gets at least 10 * k bytes. + * + * The COVER algorithms divide the data up into epochs of equal size and + * select one segment from each epoch. + * + * @param maxDictSize The maximum allowed dictionary size. + * @param nbDmers The number of dmers we are training on. + * @param k The parameter k (segment size). + * @param passes The target number of passes over the dmer corpus. + * More passes means a better dictionary. + */ +COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, + U32 k, U32 passes); + +/** + * Warns the user when their corpus is too small. + */ +void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); + +/** * Checks total compressed size of a dictionary */ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, @@ -78,6 +126,32 @@ void COVER_best_start(COVER_best_t *best); * Decrements liveJobs and signals any waiting threads if liveJobs == 0. * If this dictionary is the best so far save it and its parameters. */ -void COVER_best_finish(COVER_best_t *best, size_t compressedSize, - ZDICT_cover_params_t parameters, void *dict, - size_t dictSize); +void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, + COVER_dictSelection_t selection); +/** + * Error function for COVER_selectDict function. Checks if the return + * value is an error. + */ +unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); + + /** + * Error function for COVER_selectDict function. Returns a struct where + * return.totalCompressedSize is a ZSTD error. + */ +COVER_dictSelection_t COVER_dictSelectionError(size_t error); + +/** + * Always call after selectDict is called to free up used memory from + * newly created dictionary. + */ +void COVER_dictSelectionFree(COVER_dictSelection_t selection); + +/** + * Called to finalize the dictionary and select one based on whether or not + * the shrink-dict flag was enabled. If enabled the dictionary used is the + * smallest dictionary within a specified regression of the compressed size + * from the largest dictionary. + */ + COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, + size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, + size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); diff --git a/Utilities/cmzstd/lib/dictBuilder/fastcover.c b/Utilities/cmzstd/lib/dictBuilder/fastcover.c index c289c06..485c333 100644 --- a/Utilities/cmzstd/lib/dictBuilder/fastcover.c +++ b/Utilities/cmzstd/lib/dictBuilder/fastcover.c @@ -1,3 +1,13 @@ +/* + * Copyright (c) 2018-2020, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + /*-************************************* * Dependencies ***************************************/ @@ -6,11 +16,11 @@ #include <string.h> /* memset */ #include <time.h> /* clock */ -#include "mem.h" /* read */ -#include "pool.h" -#include "threading.h" +#include "../common/mem.h" /* read */ +#include "../common/pool.h" +#include "../common/threading.h" #include "cover.h" -#include "zstd_internal.h" /* includes zstd.h */ +#include "../common/zstd_internal.h" /* includes zstd.h */ #ifndef ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY #endif @@ -132,7 +142,7 @@ typedef struct { * * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) * - * Once the dmer with hash value d is in the dictionay we set F(d) = 0. + * Once the dmer with hash value d is in the dictionary we set F(d) = 0. */ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, U32 *freqs, U32 begin, U32 end, @@ -161,7 +171,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, /* Get hash value of current dmer */ const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d); - /* Add frequency of this index to score if this is the first occurence of index in active segment */ + /* Add frequency of this index to score if this is the first occurrence of index in active segment */ if (segmentFreqs[idx] == 0) { activeSegment.score += freqs[idx]; } @@ -287,10 +297,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx) * Prepare a context for dictionary building. * The context is only dependent on the parameter `d` and can used multiple * times. - * Returns 1 on success or zero on error. + * Returns 0 on success or error code on error. * The context must be destroyed with `FASTCOVER_ctx_destroy()`. */ -static int +static size_t FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, @@ -310,19 +320,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) { DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20)); - return 0; + return ERROR(srcSize_wrong); } /* Check if there are at least 5 training samples */ if (nbTrainSamples < 5) { DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples); - return 0; + return ERROR(srcSize_wrong); } /* Check if there's testing sample */ if (nbTestSamples < 1) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples); - return 0; + return ERROR(srcSize_wrong); } /* Zero the context */ @@ -347,7 +357,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, if (ctx->offsets == NULL) { DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n"); FASTCOVER_ctx_destroy(ctx); - return 0; + return ERROR(memory_allocation); } /* Fill offsets from the samplesSizes */ @@ -364,13 +374,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, if (ctx->freqs == NULL) { DISPLAYLEVEL(1, "Failed to allocate frequency table \n"); FASTCOVER_ctx_destroy(ctx); - return 0; + return ERROR(memory_allocation); } DISPLAYLEVEL(2, "Computing frequencies\n"); FASTCOVER_computeFrequency(ctx->freqs, ctx); - return 1; + return 0; } @@ -386,29 +396,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx, { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; - /* Divide the data up into epochs of equal size. - * We will select at least one segment from each epoch. - */ - const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k)); - const unsigned epochSize = (U32)(ctx->nbDmers / epochs); + /* Divide the data into epochs. We will select one segment from each epoch. */ + const COVER_epoch_info_t epochs = COVER_computeEpochs( + (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1); + const size_t maxZeroScoreRun = 10; + size_t zeroScoreRun = 0; size_t epoch; DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", - epochs, epochSize); + (U32)epochs.num, (U32)epochs.size); /* Loop through the epochs until there are no more segments or the dictionary * is full. */ - for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { - const U32 epochBegin = (U32)(epoch * epochSize); - const U32 epochEnd = epochBegin + epochSize; + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) { + const U32 epochBegin = (U32)(epoch * epochs.size); + const U32 epochEnd = epochBegin + epochs.size; size_t segmentSize; /* Select a segment */ COVER_segment_t segment = FASTCOVER_selectSegment( ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs); - /* If the segment covers no dmers, then we are out of content */ + /* If the segment covers no dmers, then we are out of content. + * There may be new content in other epochs, for continue for some time. + */ if (segment.score == 0) { - break; + if (++zeroScoreRun >= maxZeroScoreRun) { + break; + } + continue; } + zeroScoreRun = 0; /* Trim the segment if necessary and if it is too small then we are done */ segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); @@ -429,7 +445,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx, return tail; } - /** * Parameters for FASTCOVER_tryParameters(). */ @@ -458,6 +473,7 @@ static void FASTCOVER_tryParameters(void *opaque) U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16)); /* Allocate space for hash table, dict, and freqs */ BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); if (!segmentFreqs || !dict || !freqs) { DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); @@ -467,27 +483,24 @@ static void FASTCOVER_tryParameters(void *opaque) memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32)); /* Build the dictionary */ { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity, - parameters, segmentFreqs); + parameters, segmentFreqs); + const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100); - dictBufferCapacity = ZDICT_finalizeDictionary( - dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams); - if (ZDICT_isError(dictBufferCapacity)) { - DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, + totalCompressedSize); + + if (COVER_dictSelectionIsError(selection)) { + DISPLAYLEVEL(1, "Failed to select dictionary\n"); goto _cleanup; } } - /* Check total compressed size */ - totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes, - ctx->samples, ctx->offsets, - ctx->nbTrainSamples, ctx->nbSamples, - dict, dictBufferCapacity); _cleanup: - COVER_best_finish(data->best, totalCompressedSize, parameters, dict, - dictBufferCapacity); + free(dict); + COVER_best_finish(data->best, parameters, selection); free(data); free(segmentFreqs); - free(dict); + COVER_dictSelectionFree(selection); free(freqs); } @@ -502,6 +515,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams, coverParams->nbThreads = fastCoverParams.nbThreads; coverParams->splitPoint = fastCoverParams.splitPoint; coverParams->zParams = fastCoverParams.zParams; + coverParams->shrinkDict = fastCoverParams.shrinkDict; } @@ -518,6 +532,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams, fastCoverParams->f = f; fastCoverParams->accel = accel; fastCoverParams->zParams = coverParams.zParams; + fastCoverParams->shrinkDict = coverParams.shrinkDict; } @@ -544,11 +559,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f, parameters.accel)) { DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (nbSamples == 0) { DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); - return ERROR(GENERIC); + return ERROR(srcSize_wrong); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", @@ -558,12 +573,16 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, /* Assign corresponding FASTCOVER_accel_t to accelParams*/ accelParams = FASTCOVER_defaultAccelParameters[parameters.accel]; /* Initialize context */ - if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + { + size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, coverParams.d, parameters.splitPoint, parameters.f, - accelParams)) { - DISPLAYLEVEL(1, "Failed to initialize context\n"); - return ERROR(GENERIC); + accelParams); + if (ZSTD_isError(initVal)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + return initVal; + } } + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel); /* Build the dictionary */ DISPLAYLEVEL(2, "Building dictionary\n"); { @@ -609,6 +628,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover( (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f; const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel; + const unsigned shrinkDict = 0; /* Local variables */ const int displayLevel = parameters->zParams.notificationLevel; unsigned iteration = 1; @@ -616,22 +636,23 @@ ZDICT_optimizeTrainFromBuffer_fastCover( unsigned k; COVER_best_t best; POOL_ctx *pool = NULL; + int warned = 0; /* Checks */ if (splitPoint <= 0 || splitPoint > 1) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n"); - return ERROR(GENERIC); + return ERROR(parameter_outOfBound); } if (nbSamples == 0) { LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n"); - return ERROR(GENERIC); + return ERROR(srcSize_wrong); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n", @@ -658,11 +679,18 @@ ZDICT_optimizeTrainFromBuffer_fastCover( /* Initialize the context for this value of d */ FASTCOVER_ctx_t ctx; LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); - if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) { - LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); - COVER_best_destroy(&best); - POOL_free(pool); - return ERROR(GENERIC); + { + size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams); + if (ZSTD_isError(initVal)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + COVER_best_destroy(&best); + POOL_free(pool); + return initVal; + } + } + if (!warned) { + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel); + warned = 1; } /* Loop through k reusing the same context */ for (k = kMinK; k <= kMaxK; k += kStepSize) { @@ -675,7 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover( COVER_best_destroy(&best); FASTCOVER_ctx_destroy(&ctx); POOL_free(pool); - return ERROR(GENERIC); + return ERROR(memory_allocation); } data->ctx = &ctx; data->best = &best; @@ -685,6 +713,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover( data->parameters.d = d; data->parameters.splitPoint = splitPoint; data->parameters.steps = kSteps; + data->parameters.shrinkDict = shrinkDict; data->parameters.zParams.notificationLevel = g_displayLevel; /* Check the parameters */ if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, diff --git a/Utilities/cmzstd/lib/dictBuilder/zdict.c b/Utilities/cmzstd/lib/dictBuilder/zdict.c index c753da0..6d0b042 100644 --- a/Utilities/cmzstd/lib/dictBuilder/zdict.c +++ b/Utilities/cmzstd/lib/dictBuilder/zdict.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -37,17 +37,18 @@ #include <stdio.h> /* fprintf, fopen, ftello64 */ #include <time.h> /* clock */ -#include "mem.h" /* read */ -#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ +#include "../common/mem.h" /* read */ +#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */ #define HUF_STATIC_LINKING_ONLY -#include "huf.h" /* HUF_buildCTable, HUF_writeCTable */ -#include "zstd_internal.h" /* includes zstd.h */ -#include "xxhash.h" /* XXH64 */ +#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */ +#include "../common/zstd_internal.h" /* includes zstd.h */ +#include "../common/xxhash.h" /* XXH64 */ #include "divsufsort.h" #ifndef ZDICT_STATIC_LINKING_ONLY # define ZDICT_STATIC_LINKING_ONLY #endif #include "zdict.h" +#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */ /*-************************************* @@ -99,6 +100,29 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) return MEM_readLE32((const char*)dictBuffer + 4); } +size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize) +{ + size_t headerSize; + if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted); + + { unsigned offcodeMaxValue = MaxOff; + ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); + U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE); + short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short)); + if (!bs || !wksp || !offcodeNCount) { + headerSize = ERROR(memory_allocation); + } else { + ZSTD_reset_compressedBlockState(bs); + headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize); + } + + free(bs); + free(wksp); + free(offcodeNCount); + } + + return headerSize; +} /*-******************************************************** * Dictionary training functions @@ -571,7 +595,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length) unsigned const prime1 = 2654435761U; unsigned const prime2 = 2246822519U; unsigned acc = prime1; - size_t p=0;; + size_t p=0; for (p=0; p<length; p++) { acc *= prime2; ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); @@ -588,12 +612,12 @@ typedef struct #define MAXREPOFFSET 1024 -static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, +static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params, unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets, const void* src, size_t srcSize, U32 notificationLevel) { - size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); + size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog); size_t cSize; if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ @@ -731,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* collect stats on all samples */ for (u=0; u<nbFiles; u++) { - ZDICT_countEStats(esr, params, + ZDICT_countEStats(esr, ¶ms, countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, (const char*)srcBuffer + pos, fileSizes[u], notificationLevel); @@ -741,7 +765,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* analyze, build stats, starting with literals */ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); if (HUF_isError(maxNbBits)) { - eSize = ERROR(GENERIC); + eSize = maxNbBits; DISPLAYLEVEL(1, " HUF_buildCTable error \n"); goto _cleanup; } @@ -764,7 +788,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u]; errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax); if (FSE_isError(errorCode)) { - eSize = ERROR(GENERIC); + eSize = errorCode; DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n"); goto _cleanup; } @@ -773,7 +797,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML); if (FSE_isError(errorCode)) { - eSize = ERROR(GENERIC); + eSize = errorCode; DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n"); goto _cleanup; } @@ -782,7 +806,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL); if (FSE_isError(errorCode)) { - eSize = ERROR(GENERIC); + eSize = errorCode; DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n"); goto _cleanup; } @@ -791,7 +815,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* write result to buffer */ { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog); if (HUF_isError(hhSize)) { - eSize = ERROR(GENERIC); + eSize = hhSize; DISPLAYLEVEL(1, "HUF_writeCTable error \n"); goto _cleanup; } @@ -802,7 +826,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); if (FSE_isError(ohSize)) { - eSize = ERROR(GENERIC); + eSize = ohSize; DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n"); goto _cleanup; } @@ -813,7 +837,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog); if (FSE_isError(mhSize)) { - eSize = ERROR(GENERIC); + eSize = mhSize; DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n"); goto _cleanup; } @@ -824,7 +848,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog); if (FSE_isError(lhSize)) { - eSize = ERROR(GENERIC); + eSize = lhSize; DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n"); goto _cleanup; } @@ -834,7 +858,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, } if (maxDstSize<12) { - eSize = ERROR(GENERIC); + eSize = ERROR(dstSize_tooSmall); DISPLAYLEVEL(1, "not enough space to write RepOffsets \n"); goto _cleanup; } diff --git a/Utilities/cmzstd/lib/dictBuilder/zdict.h b/Utilities/cmzstd/lib/dictBuilder/zdict.h index d57d59f..ff2e77f 100644 --- a/Utilities/cmzstd/lib/dictBuilder/zdict.h +++ b/Utilities/cmzstd/lib/dictBuilder/zdict.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -46,7 +46,12 @@ extern "C" { * The resulting dictionary will be saved into `dictBuffer`. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * or an error code, which can be tested with ZDICT_isError(). - * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. + * Note: Dictionary training will fail if there are not enough samples to construct a + * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit). + * If dictionary training fails, you should use zstd without a dictionary, as the dictionary + * would've been ineffective anyways. If you believe your samples would benefit from a dictionary + * please open an issue with details, and we can look into it. + * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. * In general, it's recommended to provide a few thousands samples, though this can vary a lot. @@ -56,9 +61,57 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); +typedef struct { + int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */ + unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ + unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value) */ +} ZDICT_params_t; + +/*! ZDICT_finalizeDictionary(): + * Given a custom content as a basis for dictionary, and a set of samples, + * finalize dictionary by adding headers and statistics according to the zstd + * dictionary format. + * + * Samples must be stored concatenated in a flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each + * sample in order. The samples are used to construct the statistics, so they + * should be representative of what you will compress with this dictionary. + * + * The compression level can be set in `parameters`. You should pass the + * compression level you expect to use in production. The statistics for each + * compression level differ, so tuning the dictionary for the compression level + * can help quite a bit. + * + * You can set an explicit dictionary ID in `parameters`, or allow us to pick + * a random dictionary ID for you, but we can't guarantee no collisions. + * + * The dstDictBuffer and the dictContent may overlap, and the content will be + * appended to the end of the header. If the header + the content doesn't fit in + * maxDictSize the beginning of the content is truncated to make room, since it + * is presumed that the most profitable content is at the end of the dictionary, + * since that is the cheapest to reference. + * + * `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes. + * `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN). + * + * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`), + * or an error code, which can be tested by ZDICT_isError(). + * Note: ZDICT_finalizeDictionary() will push notifications into stderr if + * instructed to, using notificationLevel>0. + * NOTE: This function currently may fail in several edge cases including: + * * Not enough samples + * * Samples are uncompressible + * * Samples are all exactly the same + */ +ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize, + const void* dictContent, size_t dictContentSize, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t parameters); + /*====== Helper functions ======*/ ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */ +ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */ ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); @@ -73,11 +126,8 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); * Use them only in association with static linking. * ==================================================================================== */ -typedef struct { - int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */ - unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */ -} ZDICT_params_t; +#define ZDICT_CONTENTSIZE_MIN 128 +#define ZDICT_DICTSIZE_MIN 256 /*! ZDICT_cover_params_t: * k and d are the only required parameters. @@ -89,6 +139,8 @@ typedef struct { unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */ + unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */ + unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */ ZDICT_params_t zParams; } ZDICT_cover_params_t; @@ -100,6 +152,9 @@ typedef struct { unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */ unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */ + unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */ + unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */ + ZDICT_params_t zParams; } ZDICT_fastCover_params_t; @@ -110,6 +165,7 @@ typedef struct { * The resulting dictionary will be saved into `dictBuffer`. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * or an error code, which can be tested with ZDICT_isError(). + * See ZDICT_trainFromBuffer() for details on failure modes. * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. @@ -133,8 +189,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000]. * * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError(). - * On success `*parameters` contains the parameters selected. + * or an error code, which can be tested with ZDICT_isError(). + * On success `*parameters` contains the parameters selected. + * See ZDICT_trainFromBuffer() for details on failure modes. * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. */ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( @@ -151,7 +208,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( * The resulting dictionary will be saved into `dictBuffer`. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * or an error code, which can be tested with ZDICT_isError(). - * Note: ZDICT_trainFromBuffer_fastCover() requires about 1 bytes of memory for each input byte and additionally another 6 * 2^f bytes of memory . + * See ZDICT_trainFromBuffer() for details on failure modes. + * Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. * In general, it's recommended to provide a few thousands samples, though this can vary a lot. @@ -175,37 +233,16 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, * If accel is zero, default value of 1 is used. * * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError(). - * On success `*parameters` contains the parameters selected. - * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory for each thread. + * or an error code, which can be tested with ZDICT_isError(). + * On success `*parameters` contains the parameters selected. + * See ZDICT_trainFromBuffer() for details on failure modes. + * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread. */ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t* parameters); -/*! ZDICT_finalizeDictionary(): - * Given a custom content as a basis for dictionary, and a set of samples, - * finalize dictionary by adding headers and statistics. - * - * Samples must be stored concatenated in a flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. - * - * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes. - * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes. - * - * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), - * or an error code, which can be tested by ZDICT_isError(). - * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. - * Note 2: dictBuffer and dictContent can overlap - */ -#define ZDICT_CONTENTSIZE_MIN 128 -#define ZDICT_DICTSIZE_MIN 256 -ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, - const void* dictContent, size_t dictContentSize, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_params_t parameters); - typedef struct { unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ ZDICT_params_t zParams; @@ -219,6 +256,7 @@ typedef struct { * `parameters` is optional and can be provided with values set to 0 to mean "default". * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * or an error code, which can be tested with ZDICT_isError(). + * See ZDICT_trainFromBuffer() for details on failure modes. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. * In general, it's recommended to provide a few thousands samples, though this can vary a lot. |