/**
 * Compresses a range of samples with the given dictionary and sums the results.
 *
 * The range is [nbTrainSamples, nbSamples) when parameters.splitPoint < 1.0 and
 * [0, nbSamples) otherwise. The running total starts at dictBufferCapacity, so the
 * dictionary size itself is part of the returned value.
 *
 * Returns the total, or an error code: ZSTD_error_GENERIC when the scratch buffer,
 * the compression context or the compression dictionary could not be created, or
 * the value returned by ZSTD_compress_usingCDict for the first sample that fails.
 */
public static nuint COVER_checkTotalCompressedSize(ZDICT_cover_params_t parameters, nuint *samplesSizes, byte *samples, nuint *offsets, nuint nbTrainSamples, nuint nbSamples, byte *dict, nuint dictBufferCapacity)
{
    nuint firstSample = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
    nuint result = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_GENERIC));

    // Size the scratch output buffer for the largest sample that will be compressed.
    nuint largestSample = 0;
    for (nuint n = firstSample; n < nbSamples; ++n)
    {
        if (samplesSizes[n] > largestSample)
        {
            largestSample = samplesSizes[n];
        }
    }

    nuint scratchCapacity = ZSTD_compressBound(largestSample);
    void* scratch = malloc(scratchCapacity);
    ZSTD_CCtx_s* cctx = ZSTD_createCCtx();
    ZSTD_CDict_s* cdict = ZSTD_createCDict((void*)dict, dictBufferCapacity, parameters.zParams.compressionLevel);

    if (scratch != null && cctx != null && cdict != null)
    {
        result = dictBufferCapacity;
        for (nuint n = firstSample; n < nbSamples; ++n)
        {
            nuint size = ZSTD_compress_usingCDict(cctx, scratch, scratchCapacity, (void*)(samples + offsets[n]), samplesSizes[n], cdict);
            if (ERR_isError(size) != 0)
            {
                // Propagate the compressor's error code instead of a partial sum.
                result = size;
                break;
            }

            result += size;
        }
    }

    // Cleanup runs on every path, including the early-failure ones.
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    if (scratch != null)
    {
        free(scratch);
    }

    return result;
}
/**
 * Writes a cover parameter set, plus the two fastCover-specific values f and accel,
 * into *fastCoverParams. Only the fields assigned below are touched; any other
 * field of *fastCoverParams keeps its current value.
 */
private static void FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams, ZDICT_fastCover_params_t *fastCoverParams, uint f, uint accel)
{
    ref ZDICT_fastCover_params_t target = ref *fastCoverParams;

    // Values supplied directly by the caller.
    target.f = f;
    target.accel = accel;

    // Values carried over unchanged from the cover parameters.
    target.k = coverParams.k;
    target.d = coverParams.d;
    target.steps = coverParams.steps;
    target.nbThreads = coverParams.nbThreads;
    target.splitPoint = coverParams.splitPoint;
    target.shrinkDict = coverParams.shrinkDict;
    target.zParams = coverParams.zParams;
}
/**
 * Records the outcome of one finished job in *best.
 *
 * Does nothing when best is null. Otherwise best->liveJobs is decremented, and if
 * selection.totalCompressedSize is smaller than best->compressedSize the selected
 * dictionary, its size, the parameters and the compressed size are stored in *best.
 * best->dict is reallocated when it is missing or smaller than the new dictionary;
 * if that allocation fails, best->compressedSize is set to ZSTD_error_GENERIC and
 * best->dictSize to 0.
 *
 * This function performs no locking and signals nothing when liveJobs reaches 0.
 */
public static void COVER_best_finish(COVER_best_s *best, ZDICT_cover_params_t parameters, COVER_dictSelection selection)
{
    if (best == null)
    {
        return;
    }

    --best->liveJobs;

    nuint newCompressedSize = selection.totalCompressedSize;
    if (newCompressedSize >= best->compressedSize)
    {
        // Not an improvement: keep the current best.
        return;
    }

    nuint newDictSize = selection.dictSize;
    bool needsBiggerBuffer = best->dict == null || best->dictSize < newDictSize;
    if (needsBiggerBuffer)
    {
        if (best->dict != null)
        {
            free(best->dict);
        }

        best->dict = malloc(newDictSize);
        if (best->dict == null)
        {
            best->compressedSize = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_GENERIC));
            best->dictSize = 0;
            return;
        }
    }

    void* newDict = (void*)selection.dictContent;
    if (newDict == null)
    {
        // Nothing to copy; the stored size and parameters stay as they were.
        return;
    }

    memcpy(best->dict, newDict, newDictSize);
    best->dictSize = newDictSize;
    best->parameters = parameters;
    best->compressedSize = newCompressedSize;
}
/**
 * Validates a fastCover parameter set.
 *
 * Returns 1 when every condition below holds and 0 otherwise:
 *   - k and d are non-zero
 *   - d is 6 or 8
 *   - d <= k <= maxDictSize
 *   - 0 < f <= 31
 *   - 0 < splitPoint <= 1 (a NaN split point is rejected)
 *   - 0 < accel <= 10
 */
private static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters, nuint maxDictSize, uint f, uint accel)
{
    if (parameters.d == 0 || parameters.k == 0)
    {
        return 0;
    }

    if (parameters.d != 6 && parameters.d != 8)
    {
        return 0;
    }

    if (parameters.k > maxDictSize)
    {
        return 0;
    }

    if (parameters.d > parameters.k)
    {
        return 0;
    }

    if (f > 31 || f == 0)
    {
        return 0;
    }

    // Written as a negated in-range test on purpose: every comparison with NaN is
    // false, so "splitPoint <= 0 || splitPoint > 1" would let NaN through as valid.
    if (!(parameters.splitPoint > 0 && parameters.splitPoint <= 1))
    {
        return 0;
    }

    if (accel > 10 || accel == 0)
    {
        return 0;
    }

    return 1;
}
/**
 * Builds a dictionary for one parameter set and reports the result to data->best
 * through COVER_best_finish.
 *
 * opaque must point to a FASTCOVER_tryParameters_data_s; this function frees it
 * before returning, along with every buffer it allocates itself and the selection
 * it obtained. If any working buffer cannot be allocated, the error selection
 * created up front (ZSTD_error_GENERIC) is what gets reported.
 */
private static void FASTCOVER_tryParameters(void *opaque)
{
    FASTCOVER_tryParameters_data_s* job = (FASTCOVER_tryParameters_data_s*)opaque;
    FASTCOVER_ctx_t* ctx = job->ctx;
    ZDICT_cover_params_t parameters = job->parameters;
    nuint dictBufferCapacity = job->dictBufferCapacity;
    nuint genericError = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_GENERIC));

    // Both frequency tables hold 2^f entries.
    ulong tableEntries = (ulong)1 << (int)ctx->f;
    nuint freqsBytes = (nuint)(tableEntries * (nuint)sizeof(uint));

    ushort* segmentFreqs = (ushort*)calloc((nuint)tableEntries, (nuint)2);
    byte* dict = (byte*)malloc(dictBufferCapacity);
    COVER_dictSelection selection = COVER_dictSelectionError(genericError);
    uint* freqs = (uint*)malloc(freqsBytes);

    if (segmentFreqs != null && dict != null && freqs != null)
    {
        // Work on a private copy: building the dictionary modifies the frequencies.
        memcpy((void*)freqs, (void*)ctx->freqs, freqsBytes);

        nuint tail = FASTCOVER_buildDictionary(ctx, freqs, (void*)dict, dictBufferCapacity, parameters, segmentFreqs);
        uint nbFinalizeSamples = (uint)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);

        // Whatever comes back, error or not, is handed to COVER_best_finish below.
        selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail, ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, genericError);
    }

    // job->best must be read before job itself is released.
    free((void*)dict);
    COVER_best_finish(job->best, parameters, selection);
    free((void*)job);
    free((void*)segmentFreqs);
    COVER_dictSelectionFree(selection);
    free((void*)freqs);
}
/**
 * Finalizes the dictionary content and picks the dictionary to report.
 *
 * The full-size dictionary is finalized and scored first. When @params.shrinkDict
 * is 0 that dictionary is returned. Otherwise candidates built from the last
 * 256, 512, ... bytes of the content are tried in turn, and the first one whose
 * total compressed size is at most largestCompressed * (1 + shrinkDictMaxRegression / 100)
 * is returned; if none qualifies, the full-size dictionary is returned.
 *
 * On success the returned selection's dictContent points at one of the two buffers
 * allocated here, and the other buffer has been freed. On failure both buffers are
 * freed and the result comes from COVER_dictSelectionError with the failing error code.
 *
 * The incoming totalCompressedSize value is not read; the parameter is only used
 * as working storage.
 */
public static COVER_dictSelection COVER_selectDict(byte *customDictContent, nuint dictBufferCapacity, nuint dictContentSize, byte *samplesBuffer, nuint *samplesSizes, uint nbFinalizeSamples, nuint nbCheckSamples, nuint nbSamples, ZDICT_cover_params_t @params, nuint *offsets, nuint totalCompressedSize)
{
    byte* customDictContentEnd = customDictContent + dictContentSize;
    byte* largestDictbuffer = (byte*)malloc(dictBufferCapacity);
    byte* candidateDictBuffer = (byte*)malloc(dictBufferCapacity);
    double regressionTolerance = ((double)(@params.shrinkDictMaxRegression) / 100.0) + 1.00;

    if (largestDictbuffer == null || candidateDictBuffer == null)
    {
        // Report a real error code. dictContentSize is a plain byte count, so passing
        // it here would put a non-error value into the selection's size field.
        return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer, unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_GENERIC)));
    }

    // Finalize and score the dictionary built from the whole content.
    memcpy((void*)largestDictbuffer, (void*)customDictContent, dictContentSize);
    dictContentSize = ZDICT_finalizeDictionary((void*)largestDictbuffer, dictBufferCapacity, (void*)customDictContent, dictContentSize, (void*)samplesBuffer, samplesSizes, nbFinalizeSamples, @params.zParams);
    if (ZDICT_isError(dictContentSize) != 0)
    {
        return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer, dictContentSize);
    }

    totalCompressedSize = COVER_checkTotalCompressedSize(@params, samplesSizes, samplesBuffer, offsets, nbCheckSamples, nbSamples, largestDictbuffer, dictContentSize);
    if (ERR_isError(totalCompressedSize) != 0)
    {
        return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer, totalCompressedSize);
    }

    if (@params.shrinkDict == 0)
    {
        free((void*)candidateDictBuffer);
        return new COVER_dictSelection
        {
            dictContent = largestDictbuffer,
            dictSize = dictContentSize,
            totalCompressedSize = totalCompressedSize,
        };
    }

    nuint largestDict = dictContentSize;
    nuint largestCompressed = totalCompressedSize;

    // Try progressively larger tails of the content, starting at 256 bytes.
    dictContentSize = 256;
    while (dictContentSize < largestDict)
    {
        memcpy((void*)candidateDictBuffer, (void*)largestDictbuffer, largestDict);
        dictContentSize = ZDICT_finalizeDictionary((void*)candidateDictBuffer, dictBufferCapacity, (void*)(customDictContentEnd - dictContentSize), dictContentSize, (void*)samplesBuffer, samplesSizes, nbFinalizeSamples, @params.zParams);
        if (ZDICT_isError(dictContentSize) != 0)
        {
            return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer, dictContentSize);
        }

        totalCompressedSize = COVER_checkTotalCompressedSize(@params, samplesSizes, samplesBuffer, offsets, nbCheckSamples, nbSamples, candidateDictBuffer, dictContentSize);
        if (ERR_isError(totalCompressedSize) != 0)
        {
            return COVER_selectDictFail(largestDictbuffer, candidateDictBuffer, totalCompressedSize);
        }

        if (totalCompressedSize <= largestCompressed * regressionTolerance)
        {
            free((void*)largestDictbuffer);
            return new COVER_dictSelection
            {
                dictContent = candidateDictBuffer,
                dictSize = dictContentSize,
                totalCompressedSize = totalCompressedSize,
            };
        }

        dictContentSize *= 2;
    }

    // No smaller candidate was acceptable: fall back to the full-size dictionary.
    free((void*)candidateDictBuffer);
    return new COVER_dictSelection
    {
        dictContent = largestDictbuffer,
        dictSize = largestDict,
        totalCompressedSize = largestCompressed,
    };
}

/**
 * Frees both working buffers of COVER_selectDict (either may be null) and returns
 * the error selection built from errorCode.
 */
private static COVER_dictSelection COVER_selectDictFail(byte *largestDictbuffer, byte *candidateDictBuffer, nuint errorCode)
{
    free((void*)largestDictbuffer);
    free((void*)candidateDictBuffer);
    return COVER_dictSelectionError(errorCode);
}
/*-*************************************
 *  Helper functions
 ***************************************/
/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of all dmers with hash value d.
 * Let S_i be hash value of the dmer at position i of segment S which has length k.
 *
 *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * A hash value that occurs several times inside the window is counted once.
 * Before returning, freqs is set to 0 for the hash value of every position in
 * the returned segment, and every segmentFreqs increment made here has been
 * undone by a matching decrement.
 */
private static COVER_segment_t FASTCOVER_selectSegment(FASTCOVER_ctx_t *ctx, uint *freqs, uint begin, uint end, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
{
    uint d = parameters.d;
    uint hashBits = ctx->f;
    uint dmersInK = parameters.k - d + 1;
    byte* samples = ctx->samples;

    COVER_segment_t best = new COVER_segment_t { begin = 0, end = 0, score = 0 };
    COVER_segment_t window = new COVER_segment_t { begin = begin, end = begin, score = 0 };

    // Slide a window of at most dmersInK positions across [begin, end).
    while (window.end < end)
    {
        nuint incoming = FASTCOVER_hashPtrToIndex((void*)(samples + window.end), hashBits, d);

        // Only the first occurrence of a hash value inside the window adds to the score.
        if (segmentFreqs[incoming] == 0)
        {
            window.score += freqs[incoming];
        }

        ++window.end;
        ++segmentFreqs[incoming];

        // Window grew one past its limit: drop the oldest position.
        if (window.end - window.begin == dmersInK + 1)
        {
            nuint outgoing = FASTCOVER_hashPtrToIndex((void*)(samples + window.begin), hashBits, d);
            --segmentFreqs[outgoing];

            // The last occurrence of that hash value left the window.
            if (segmentFreqs[outgoing] == 0)
            {
                window.score -= freqs[outgoing];
            }

            ++window.begin;
        }

        if (window.score > best.score)
        {
            best = window;
        }
    }

    // Undo the counts for the positions still inside the window.
    for (; window.begin < end; ++window.begin)
    {
        nuint outgoing = FASTCOVER_hashPtrToIndex((void*)(samples + window.begin), hashBits, d);
        --segmentFreqs[outgoing];
    }

    // Zero the frequency of every hash value covered by the chosen segment.
    for (uint pos = best.begin; pos != best.end; ++pos)
    {
        nuint covered = FASTCOVER_hashPtrToIndex((void*)(samples + pos), hashBits, d);
        freqs[covered] = 0;
    }

    return best;
}
/**
 * Given the prepared context, builds the dictionary content.
 *
 * Segments chosen by FASTCOVER_selectSegment are copied into dictBuffer from the
 * back towards the front, cycling through the epochs. The loop stops when the
 * buffer is full, after 10 consecutive zero-score segments, or when the next
 * (possibly trimmed) segment would be shorter than parameters.d bytes.
 *
 * Returns the offset of the first written byte: the content occupies
 * [tail, dictBufferCapacity) of dictBuffer.
 */
private static nuint FASTCOVER_buildDictionary(FASTCOVER_ctx_t *ctx, uint *freqs, void *dictBuffer, nuint dictBufferCapacity, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
{
    byte* output = (byte*)dictBuffer;
    nuint tail = dictBufferCapacity;
    COVER_epoch_info_t epochs = COVER_computeEpochs((uint)dictBufferCapacity, (uint)ctx->nbDmers, parameters.k, 1);
    nuint zeroScoreLimit = 10;
    nuint zeroScoreStreak = 0;

    for (nuint epoch = 0; tail > 0; epoch = (nuint)((epoch + 1) % epochs.num))
    {
        uint epochBegin = (uint)(epoch * epochs.size);
        uint epochEnd = epochBegin + epochs.size;
        COVER_segment_t segment = FASTCOVER_selectSegment(ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);

        if (segment.score == 0)
        {
            // This epoch had nothing to offer; give up after too many in a row.
            ++zeroScoreStreak;
            if (zeroScoreStreak >= zeroScoreLimit)
            {
                break;
            }

            continue;
        }

        zeroScoreStreak = 0;

        // Trim the segment to the space that is left in the buffer.
        nuint wanted = segment.end - segment.begin + parameters.d - 1;
        nuint segmentSize = wanted < tail ? wanted : tail;
        if (segmentSize < parameters.d)
        {
            break;
        }

        tail -= segmentSize;
        memcpy((void*)(output + tail), (void*)(ctx->samples + segment.begin), segmentSize);
    }

    return tail;
}