/**
 * Releases the arrays held by a context that was set up with `FASTCOVER_ctx_init()`.
 * Frees `ctx->freqs` and `ctx->offsets` and resets both fields to null.
 * A null `ctx` is accepted and ignored.
 */
private static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx)
{
    if (ctx != null)
    {
        free((void *)ctx->freqs);
        ctx->freqs = null;

        free((void *)ctx->offsets);
        ctx->offsets = null;
    }
}
/**
 * Builds the dictionary content from an already prepared context.
 *
 * Epochs are visited round-robin; one segment is selected per visit and copied
 * into `dictBuffer`, which is filled from its end towards its start.
 * Returns `tail`, the offset of the first written byte: the dictionary content
 * occupies `dictBuffer[tail .. dictBufferCapacity)`.
 */
private static nuint FASTCOVER_buildDictionary(FASTCOVER_ctx_t *ctx, uint *freqs, void *dictBuffer, nuint dictBufferCapacity, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
{
    byte *dict = (byte *)dictBuffer;
    nuint tail = dictBufferCapacity;
    COVER_epoch_info_t epochs = COVER_computeEpochs((uint)dictBufferCapacity, (uint)ctx->nbDmers, parameters.k, 1);
    nuint maxZeroScoreRun = 10;
    nuint zeroScoreRun = 0;
    nuint epoch = 0;

    /* Keep cycling through the epochs until the buffer is full or we give up. */
    while (tail > 0)
    {
        uint epochBegin = (uint)(epoch * epochs.size);
        uint epochEnd = epochBegin + epochs.size;
        COVER_segment_t segment = FASTCOVER_selectSegment(ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);

        if (segment.score != 0)
        {
            zeroScoreRun = 0;

            /* Bytes spanned by the segment's dmers, trimmed to the remaining space. */
            var wanted = segment.end - segment.begin + parameters.d - 1;
            nuint segmentSize = wanted < tail ? wanted : tail;
            if (segmentSize < parameters.d)
            {
                break;
            }

            tail -= segmentSize;
            memcpy((void *)(dict + tail), (void *)(ctx->samples + segment.begin), segmentSize);
        }
        else
        {
            /* A zero-score segment: stop after maxZeroScoreRun of them in a row. */
            zeroScoreRun++;
            if (zeroScoreRun >= maxZeroScoreRun)
            {
                break;
            }
        }

        epoch = (nuint)((epoch + 1) % epochs.num);
    }

    return tail;
}
/**
 * Tries one set of parameters and reports the outcome through `COVER_best_finish()`.
 * Per the original note, this function is thread safe if zstd is compiled with
 * multithreaded support.
 * `opaque` is an *OWNING* pointer to a `FASTCOVER_tryParameters_data_s`; it is
 * freed here before returning, which is what lets the call be handed to a thread.
 */
private static void FASTCOVER_tryParameters(void *opaque)
{
    FASTCOVER_tryParameters_data_s *data = (FASTCOVER_tryParameters_data_s *)opaque;
    FASTCOVER_ctx_t *ctx = data->ctx;
    ZDICT_cover_params_t parameters = data->parameters;
    nuint dictBufferCapacity = data->dictBufferCapacity;
    nuint genericError = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_GENERIC));
    nuint totalCompressedSize = genericError;

    /* Number of entries (2^f) in the frequency tables. */
    ulong freqTableLen = (ulong)1 << (int)ctx->f;

    ushort *segmentFreqs = (ushort *)calloc((nuint)freqTableLen, (nuint)2);
    byte *dict = (byte *)malloc(dictBufferCapacity);
    /* Start from an error selection so an early exit still reports a failure. */
    COVER_dictSelection selection = COVER_dictSelectionError(genericError);
    uint *freqs = (uint *)malloc((nuint)(freqTableLen * (nuint)4));

    if (segmentFreqs != null && dict != null && freqs != null)
    {
        /* Work on a private copy of the frequencies; building the dictionary modifies it. */
        memcpy((void *)freqs, (void *)ctx->freqs, (nuint)(freqTableLen * (nuint)sizeof(uint)));

        nuint tail = FASTCOVER_buildDictionary(ctx, freqs, (void *)dict, dictBufferCapacity, parameters, segmentFreqs);
        uint nbFinalizeSamples = (uint)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);

        selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail, ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, totalCompressedSize);
        if (COVER_dictSelectionIsError(selection) != 0)
        {
            goto _cleanup;
        }
    }

_cleanup:
    free((void *)dict);
    COVER_best_finish(data->best, parameters, selection);
    free((void *)data);
    free((void *)segmentFreqs);
    COVER_dictSelectionFree(selection);
    free((void *)freqs);
}
/**
 * Calculates the frequency of the hash value of each dmer in the training
 * samples of `ctx->samples`, accumulating the counts into `freqs`.
 */
private static void FASTCOVER_computeFrequency(uint *freqs, FASTCOVER_ctx_t *ctx)
{
    uint f = ctx->f;
    uint d = ctx->d;
    uint skip = ctx->accelParams.skip;

    /* A position is only hashed while at least max(d, 8) bytes remain in its sample. */
    uint readLength = 8;
    if (d > readLength)
    {
        readLength = d;
    }

    assert(ctx->nbTrainSamples >= 5);
    assert(ctx->nbTrainSamples <= ctx->nbSamples);

    for (nuint sample = 0; sample < ctx->nbTrainSamples; sample++)
    {
        nuint pos = ctx->offsets[sample];
        nuint sampleEnd = ctx->offsets[sample + 1];

        /* Advance by skip + 1 positions after every hashed dmer. */
        for (; pos + readLength <= sampleEnd; pos = pos + skip + 1)
        {
            nuint dmerIndex = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + pos), f, d);
            freqs[dmerIndex]++;
        }
    }
}
/*-*************************************
 *  Helper functions
 ***************************************/
/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of all dmers with hash value d.
 * Let S_i be hash value of the dmer at position i of segment S which has length k.
 *
 *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
 *
 * `segmentFreqs` is scratch space counting how often each hash value occurs in
 * the current window; every increment made here is undone before returning.
 */
private static COVER_segment_t FASTCOVER_selectSegment(FASTCOVER_ctx_t *ctx, uint *freqs, uint begin, uint end, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
{
    uint k = parameters.k;
    uint d = parameters.d;
    uint f = ctx->f;
    uint dmersInK = k - d + 1;

    COVER_segment_t bestSegment = new COVER_segment_t { begin = 0, end = 0, score = 0 };
    /* The sliding window starts empty at the beginning of the epoch. */
    COVER_segment_t activeSegment = new COVER_segment_t { begin = begin, end = begin, score = 0 };

    /* Slide the window across the epoch, remembering the highest-scoring position. */
    while (activeSegment.end < end)
    {
        nuint addIndex = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + activeSegment.end), f, d);

        /* A hash value only contributes to the score the first time it enters the window. */
        if (segmentFreqs[addIndex] == 0)
        {
            activeSegment.score += freqs[addIndex];
        }
        activeSegment.end++;
        segmentFreqs[addIndex]++;

        /* Window grew past dmersInK entries: drop its first position. */
        if (activeSegment.end - activeSegment.begin == dmersInK + 1)
        {
            nuint dropIndex = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + activeSegment.begin), f, d);
            segmentFreqs[dropIndex]--;
            /* The last occurrence of this hash value left the window. */
            if (segmentFreqs[dropIndex] == 0)
            {
                activeSegment.score -= freqs[dropIndex];
            }
            activeSegment.begin++;
        }

        if (activeSegment.score > bestSegment.score)
        {
            bestSegment = activeSegment;
        }
    }

    /* Undo the counts of the positions still inside the window. */
    for (; activeSegment.begin < end; activeSegment.begin++)
    {
        nuint dropIndex = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + activeSegment.begin), f, d);
        segmentFreqs[dropIndex]--;
    }

    /* Zero the frequency of every hash value covered by the chosen segment. */
    uint pos = bestSegment.begin;
    while (pos != bestSegment.end)
    {
        nuint covered = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + pos), f, d);
        freqs[covered] = 0;
        ++pos;
    }

    return bestSegment;
}
/**
 * Prepares a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 0 on success or an error code on error:
 *  - srcSize_wrong when the total sample size is out of range, when there are
 *    fewer than 5 training samples, or when there is no test sample;
 *  - memory_allocation when one of the internal arrays cannot be allocated.
 * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
 */
private static nuint FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, void *samplesBuffer, nuint *samplesSizes, uint nbSamples, uint d, double splitPoint, uint f, FASTCOVER_accel_t accelParams)
{
    byte *samples = (byte *)samplesBuffer;
    nuint totalSamplesSize = COVER_sum(samplesSizes, nbSamples);

    nuint srcSizeWrong = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_srcSize_wrong));
    nuint allocationFailed = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_memory_allocation));

    /* Accepted range for the total size: [max(d, 8), maxSamplesSize). */
    nuint minSamplesSize = d > (nuint)sizeof(ulong) ? d : (nuint)sizeof(ulong);
    nuint maxSamplesSize = (nuint)((nuint)(sizeof(nuint)) == 8 ? (unchecked ((uint)(-1))) : ((uint)(1) * (1U << 30)));

    /* Split the samples into a training set and a testing set.
     * With splitPoint >= 1.0 every sample is used for both. */
    uint nbTrainSamples;
    uint nbTestSamples;
    nuint trainingSamplesSize;
    nuint testSamplesSize; /* computed as in the original; not read again in this function */
    if (splitPoint < 1.0)
    {
        nbTrainSamples = (uint)((double)(nbSamples) * splitPoint);
        nbTestSamples = nbSamples - nbTrainSamples;
        trainingSamplesSize = COVER_sum(samplesSizes, nbTrainSamples);
        testSamplesSize = COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples);
    }
    else
    {
        nbTrainSamples = nbSamples;
        nbTestSamples = nbSamples;
        trainingSamplesSize = totalSamplesSize;
        testSamplesSize = totalSamplesSize;
    }

    if (totalSamplesSize < minSamplesSize || totalSamplesSize >= maxSamplesSize)
    {
        return srcSizeWrong;
    }

    /* Need at least 5 training samples and at least 1 testing sample. */
    if (nbTrainSamples < 5 || nbTestSamples < 1)
    {
        return srcSizeWrong;
    }

    /* Start from a zeroed context so a failed init can be destroyed safely. */
    memset((void *)ctx, 0, (nuint)(sizeof(FASTCOVER_ctx_t)));
    ctx->samples = samples;
    ctx->samplesSizes = samplesSizes;
    ctx->nbSamples = nbSamples;
    ctx->nbTrainSamples = nbTrainSamples;
    ctx->nbTestSamples = nbTestSamples;
    ctx->nbDmers = trainingSamplesSize - minSamplesSize + 1;
    ctx->d = d;
    ctx->f = f;
    ctx->accelParams = accelParams;

    /* offsets[i] is the start of sample i; offsets[nbSamples] is the total size. */
    ctx->offsets = (nuint *)calloc((nbSamples + 1), (nuint)(sizeof(nuint)));
    if (ctx->offsets == null)
    {
        FASTCOVER_ctx_destroy(ctx);
        return allocationFailed;
    }

    ctx->offsets[0] = 0;
    assert(nbSamples >= 5);
    for (uint next = 1; next <= nbSamples; ++next)
    {
        ctx->offsets[next] = ctx->offsets[next - 1] + samplesSizes[next - 1];
    }

    /* Frequency table with 2^f entries, zero-initialized. */
    ctx->freqs = (uint *)calloc((nuint)((ulong)(1) << (int)f), (nuint)(sizeof(uint)));
    if (ctx->freqs == null)
    {
        FASTCOVER_ctx_destroy(ctx);
        return allocationFailed;
    }

    FASTCOVER_computeFrequency(ctx->freqs, ctx);
    return 0;
}