/**
 * Builds the dictionary from an already prepared context.
 *
 * Segments are picked one epoch at a time, cycling through the epochs in
 * round-robin order, and are copied into dictBuffer starting from its end
 * and growing towards its beginning.
 *
 * Returns the offset inside dictBuffer at which the written content starts
 * (equivalently, the number of bytes left unused at the front of the buffer).
 */
private static nuint FASTCOVER_buildDictionary(FASTCOVER_ctx_t *ctx, uint *freqs, void *dictBuffer, nuint dictBufferCapacity, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
{
    byte *output = (byte *)(dictBuffer);
    /* Position of the first byte written so far; nothing is written yet. */
    nuint writePos = dictBufferCapacity;
    COVER_epoch_info_t layout = COVER_computeEpochs((uint)(dictBufferCapacity), (uint)(ctx->nbDmers), parameters.k, 1);
    /* Give up after this many consecutive epochs yield a zero-score segment. */
    nuint emptyRunLimit = 10;
    nuint emptyRun = 0;

    /* Keep cycling over the epochs until the buffer is full or content runs out. */
    for (nuint epochIdx = 0; writePos > 0; epochIdx = (nuint)((epochIdx + 1) % layout.num))
    {
        uint rangeStart = (uint)(epochIdx * layout.size);
        uint rangeStop = rangeStart + layout.size;
        COVER_segment_t picked = FASTCOVER_selectSegment(ctx, freqs, rangeStart, rangeStop, parameters, segmentFreqs);

        if (picked.score == 0)
        {
            /* Nothing useful in this epoch; other epochs may still contribute. */
            emptyRun += 1;
            if (emptyRun >= emptyRunLimit)
            {
                break;
            }
            continue;
        }
        emptyRun = 0;

        /* Clamp the segment to the remaining room; stop if less than one dmer fits. */
        nuint copyLen = picked.end - picked.begin + parameters.d - 1;
        if (copyLen >= writePos)
        {
            copyLen = writePos;
        }
        if (copyLen < parameters.d)
        {
            break;
        }

        /* Fill from the back: each new segment lands just before the previous one. */
        writePos -= copyLen;
        memcpy((void *)(output + writePos), (void *)(ctx->samples + picked.begin), copyLen);
    }

    return writePos;
}
/*-*************************************
 * Helper functions
 ***************************************/
/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of all dmers with hash value d.
 * Let S_i be the hash value of the dmer at position i of segment S, which has length k.
 *
 *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * A hash value that occurs several times inside the same segment is counted once.
 * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
 *
 * The range [begin, end) is scanned with a sliding window. segmentFreqs is used as
 * scratch space to count hash occurrences inside the window; every increment made
 * here is undone before returning.
 */
private static COVER_segment_t FASTCOVER_selectSegment(FASTCOVER_ctx_t *ctx, uint *freqs, uint begin, uint end, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
{
    uint hashF = ctx->f;
    uint dmerLen = parameters.d;
    uint segmentLen = parameters.k;
    uint dmersPerSegment = segmentLen - dmerLen + 1;
    /* Window width that signals one position too many. */
    uint oversizedSpan = dmersPerSegment + 1;

    /* Best candidate seen so far; starts out empty with a zero score. */
    COVER_segment_t best;
    best.begin = 0;
    best.end = 0;
    best.score = 0;

    /* Sliding window, initially empty and anchored at the start of the epoch. */
    COVER_segment_t window = new COVER_segment_t
    {
        begin = begin,
        end = begin,
        score = 0,
    };

    while (window.end < end)
    {
        /* Bring the dmer at the window's right edge into the window. */
        nuint incoming = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + window.end), hashF, dmerLen);
        if (segmentFreqs[incoming] == 0)
        {
            /* First occurrence of this hash in the window: it contributes to the score. */
            window.score += freqs[incoming];
        }
        segmentFreqs[incoming] += 1;
        window.end += 1;

        /* Window grew past the allowed size: evict the dmer at the left edge. */
        if (window.end - window.begin == oversizedSpan)
        {
            nuint outgoing = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + window.begin), hashF, dmerLen);
            segmentFreqs[outgoing] -= 1;
            if (segmentFreqs[outgoing] == 0)
            {
                /* Last occurrence left the window: withdraw its contribution. */
                window.score -= freqs[outgoing];
            }
            window.begin += 1;
        }

        /* Strictly better only, so the earliest of equally scored windows wins. */
        if (window.score > best.score)
        {
            best = window;
        }
    }

    /* Undo the occurrence counts of the positions still inside the window. */
    for (uint leftover = window.begin; leftover < end; ++leftover)
    {
        nuint staleIndex = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + leftover), hashF, dmerLen);
        segmentFreqs[staleIndex] -= 1;
    }

    /* Zero the frequency of every hash covered by the chosen segment. */
    uint covered = best.begin;
    while (covered != best.end)
    {
        nuint coveredIndex = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + covered), hashF, dmerLen);
        freqs[coveredIndex] = 0;
        ++covered;
    }

    return best;
}