示例#1
0
        /**
         * Releases the tables owned by a context set up by `FASTCOVER_ctx_init()`.
         * A null context is ignored. The `freqs` and `offsets` fields are reset
         * to null once their buffers have been freed.
         */
        private static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx)
        {
            if (ctx != null)
            {
                // Free both tables, then clear the pointers that referred to them.
                free((void *)ctx->freqs);
                free((void *)ctx->offsets);
                ctx->freqs   = null;
                ctx->offsets = null;
            }
        }
示例#2
0
        /**
         * Builds the dictionary content from the prepared context.
         *
         * One segment is selected per epoch and copied into `dictBuffer`, which
         * is filled from its end towards its start. The loop stops when the
         * buffer is full, after 10 consecutive zero-score segments, or when the
         * segment (clamped to the remaining space) is shorter than `parameters.d`.
         *
         * Returns the offset inside `dictBuffer` at which the written content
         * starts (`dictBufferCapacity` when nothing was copied).
         */
        private static nuint FASTCOVER_buildDictionary(FASTCOVER_ctx_t *ctx, uint *freqs, void *dictBuffer, nuint dictBufferCapacity, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
        {
            byte *output = (byte *)dictBuffer;
            nuint remaining = dictBufferCapacity;
            COVER_epoch_info_t epochs = COVER_computeEpochs((uint)dictBufferCapacity, (uint)ctx->nbDmers, parameters.k, 1);
            nuint zeroScoreLimit  = 10;
            nuint zeroScoreStreak = 0;

            // Visit the epochs round-robin while there is still room in the buffer.
            for (nuint epochIndex = 0; remaining > 0; epochIndex = (nuint)((epochIndex + 1) % epochs.num))
            {
                uint firstPos = (uint)(epochIndex * epochs.size);
                uint lastPos  = firstPos + epochs.size;
                COVER_segment_t picked = FASTCOVER_selectSegment(ctx, freqs, firstPos, lastPos, parameters, segmentFreqs);

                if (picked.score == 0)
                {
                    // Nothing useful in this epoch; give up after too many in a row.
                    zeroScoreStreak++;
                    if (zeroScoreStreak >= zeroScoreLimit)
                    {
                        break;
                    }

                    continue;
                }

                zeroScoreStreak = 0;

                // Bytes covered by the segment, clamped to the space that is left.
                nuint wanted   = picked.end - picked.begin + parameters.d - 1;
                nuint copySize = wanted < remaining ? wanted : remaining;
                if (copySize < parameters.d)
                {
                    break;
                }

                // Write just below the previously copied segment.
                remaining -= copySize;
                memcpy((void *)(output + remaining), (void *)(ctx->samples + picked.begin), copySize);
            }

            return remaining;
        }
示例#3
0
        /**
         * Tries a set of parameters and updates the COVER_best_t with the results.
         * This function is thread safe if zstd is compiled with multithreaded support.
         * It takes its parameters as an *OWNING* opaque pointer to support threading:
         * `opaque` is freed before the function returns.
         */
        private static void FASTCOVER_tryParameters(void *opaque)
        {
            FASTCOVER_tryParameters_data_s *job = (FASTCOVER_tryParameters_data_s *)opaque;
            FASTCOVER_ctx_t *ctx = job->ctx;
            ZDICT_cover_params_t parameters = job->parameters;
            nuint dictBufferCapacity = job->dictBufferCapacity;
            nuint genericError = unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_GENERIC));
            nuint totalCompressedSize = genericError;

            // Number of entries in each per-hash table: 2^f.
            ulong tableEntries = (ulong)1 << (int)ctx->f;

            ushort *segmentFreqs = (ushort *)calloc((nuint)tableEntries, (nuint)sizeof(ushort));
            byte *dict = (byte *)malloc(dictBufferCapacity);
            COVER_dictSelection selection = COVER_dictSelectionError(genericError);
            uint *freqs = (uint *)malloc((nuint)(tableEntries * (nuint)sizeof(uint)));

            bool allocationFailed = segmentFreqs == null || dict == null || freqs == null;
            if (allocationFailed)
            {
                // Report the pre-built error selection.
                goto _cleanup;
            }

            // Work on a private copy of the frequencies: building a dictionary modifies them.
            memcpy((void *)freqs, (void *)ctx->freqs, (nuint)(tableEntries * (nuint)sizeof(uint)));

            {
                nuint dictStart = FASTCOVER_buildDictionary(ctx, freqs, (void *)dict, dictBufferCapacity, parameters, segmentFreqs);
                uint finalizeCount = (uint)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);

                selection = COVER_selectDict(dict + dictStart, dictBufferCapacity, dictBufferCapacity - dictStart, ctx->samples, ctx->samplesSizes, finalizeCount, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, totalCompressedSize);
                if (COVER_dictSelectionIsError(selection) != 0)
                {
                    goto _cleanup;
                }
            }

_cleanup:
            // The selection (error or not) is always handed to COVER_best_finish
            // before the job data and the scratch buffers are released.
            free((void *)dict);
            COVER_best_finish(job->best, parameters, selection);
            free((void *)job);
            free((void *)segmentFreqs);
            COVER_dictSelectionFree(selection);
            free((void *)freqs);
        }
示例#4
0
        /**
         * Counts, for every training sample in `ctx`, how often each dmer hash
         * value occurs and accumulates the counts into `freqs`.
         *
         * A position is only hashed when at least `max(d, 8)` bytes remain in its
         * sample, and positions advance by `accelParams.skip + 1`.
         */
        private static void FASTCOVER_computeFrequency(uint *freqs, FASTCOVER_ctx_t *ctx)
        {
            uint hashBits    = ctx->f;
            uint dmerSize    = ctx->d;
            uint stride      = ctx->accelParams.skip;
            uint bytesNeeded = dmerSize > 8 ? dmerSize : 8;

            assert(ctx->nbTrainSamples >= 5);
            assert(ctx->nbTrainSamples <= ctx->nbSamples);

            for (nuint sample = 0; sample < ctx->nbTrainSamples; sample++)
            {
                // offsets[sample] .. offsets[sample + 1] delimits the current sample.
                nuint sampleEnd = ctx->offsets[sample + 1];

                for (nuint pos = ctx->offsets[sample]; pos + bytesNeeded <= sampleEnd; pos = pos + stride + 1)
                {
                    nuint slot = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + pos), hashBits, dmerSize);

                    freqs[slot]++;
                }
            }
        }
示例#5
0
        /*-*************************************
        *  Helper functions
        ***************************************/
        /**
         * Selects the best segment in the epoch [begin, end).
         * Segments are scored according to the function:
         *
         * Let F(d) be the frequency of all dmers with hash value d.
         * Let S_i be hash value of the dmer at position i of segment S which has length k.
         *
         *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
         *
         * A hash value that occurs several times inside the window is only
         * counted once. Once the dmer with hash value d is in the dictionary
         * we set F(d) = 0.
         *
         * `segmentFreqs` holds the per-hash occurrence counts of the sliding
         * window; every increment made here is undone before returning.
         */
        private static COVER_segment_t FASTCOVER_selectSegment(FASTCOVER_ctx_t *ctx, uint *freqs, uint begin, uint end, ZDICT_cover_params_t parameters, ushort *segmentFreqs)
        {
            uint dmerSize    = parameters.d;
            uint hashBits    = ctx->f;
            uint windowDmers = parameters.k - dmerSize + 1;
            COVER_segment_t best   = new COVER_segment_t { begin = 0, end = 0, score = 0 };
            COVER_segment_t window = new COVER_segment_t { begin = begin, end = begin, score = 0 };

            // Slide the window across the epoch, remembering the highest score seen.
            while (window.end < end)
            {
                nuint incoming = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + window.end), hashBits, dmerSize);

                // Only the first occurrence of a hash inside the window adds to the score.
                if (segmentFreqs[incoming] == 0)
                {
                    window.score += freqs[incoming];
                }

                window.end++;
                segmentFreqs[incoming]++;

                // Window grew past its dmer capacity: drop its first position.
                if (window.end - window.begin == windowDmers + 1)
                {
                    nuint outgoing = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + window.begin), hashBits, dmerSize);

                    segmentFreqs[outgoing]--;
                    // The last occurrence leaving the window removes its contribution.
                    if (segmentFreqs[outgoing] == 0)
                    {
                        window.score -= freqs[outgoing];
                    }

                    window.begin++;
                }

                if (window.score > best.score)
                {
                    best = window;
                }
            }

            // Undo the counts of the positions still inside the window.
            while (window.begin < end)
            {
                nuint leftover = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + window.begin), hashBits, dmerSize);

                segmentFreqs[leftover]--;
                window.begin++;
            }

            // Zero the frequency of every hash covered by the chosen segment.
            for (uint covered = best.begin; covered != best.end; ++covered)
            {
                nuint slot = FASTCOVER_hashPtrToIndex((void *)(ctx->samples + covered), hashBits, dmerSize);

                freqs[slot] = 0;
            }

            return best;
        }
示例#6
0
        /**
         * Prepare a context for dictionary building.
         * The context is only dependent on the parameter `d` and can be used
         * multiple times.
         *
         * When `splitPoint` is below 1.0 the first `nbSamples * splitPoint`
         * samples form the training set and the remaining ones the test set;
         * otherwise every sample is used for both.
         *
         * Returns 0 on success or an error code on error:
         *  - `ZSTD_error_srcSize_wrong` when the total samples size is out of
         *    range, when the training set holds fewer than `max(d, 8)` bytes,
         *    when there are fewer than 5 training samples, or when there is no
         *    test sample;
         *  - `ZSTD_error_memory_allocation` when a table cannot be allocated.
         * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
         */
        private static nuint FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, void *samplesBuffer, nuint *samplesSizes, uint nbSamples, uint d, double splitPoint, uint f, FASTCOVER_accel_t accelParams)
        {
            byte *samples             = (byte *)(samplesBuffer);
            nuint totalSamplesSize    = COVER_sum(samplesSizes, nbSamples);
            uint  nbTrainSamples      = splitPoint < 1.0 ? (uint)((double)(nbSamples) * splitPoint) : nbSamples;
            uint  nbTestSamples       = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
            nuint trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
            // max(d, sizeof(ulong)): lower bound required of the sample data.
            nuint minSamplesSize      = ((d) > ((nuint)(sizeof(ulong))) ? (d) : ((nuint)(sizeof(ulong))));

            if (totalSamplesSize < minSamplesSize || totalSamplesSize >= (nuint)((nuint)(sizeof(nuint)) == 8 ? (unchecked ((uint)(-1))) : ((uint)(1) * (1U << 30))))
            {
                return(unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_srcSize_wrong)));
            }

            // nbDmers below is trainingSamplesSize - minSamplesSize + 1 in unsigned
            // arithmetic. With splitPoint < 1.0 the training set can be smaller than
            // minSamplesSize even though the total is not; reject it instead of
            // letting the subtraction wrap around to a huge dmer count.
            if (trainingSamplesSize < minSamplesSize)
            {
                return(unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_srcSize_wrong)));
            }

            if (nbTrainSamples < 5)
            {
                return(unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_srcSize_wrong)));
            }

            if (nbTestSamples < 1)
            {
                return(unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_srcSize_wrong)));
            }

            // Start from a zeroed context so that destroying it on a failure path
            // only sees the pointers assigned below.
            memset((void *)ctx, 0, (nuint)(sizeof(FASTCOVER_ctx_t)));
            ctx->samples        = samples;
            ctx->samplesSizes   = samplesSizes;
            ctx->nbSamples      = nbSamples;
            ctx->nbTrainSamples = nbTrainSamples;
            ctx->nbTestSamples  = nbTestSamples;
            ctx->nbDmers        = trainingSamplesSize - minSamplesSize + 1;
            ctx->d           = d;
            ctx->f           = f;
            ctx->accelParams = accelParams;

            // offsets[i] is the start of sample i; offsets[nbSamples] is the total size.
            ctx->offsets     = (nuint *)(calloc((nbSamples + 1), (nuint)(sizeof(nuint))));
            if (ctx->offsets == null)
            {
                FASTCOVER_ctx_destroy(ctx);
                return(unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_memory_allocation)));
            }


            {
                uint i;

                ctx->offsets[0] = 0;
                assert(nbSamples >= 5);
                for (i = 1; i <= nbSamples; ++i)
                {
                    ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
                }
            }

            // Frequency table with 2^f zero-initialized entries.
            ctx->freqs = (uint *)(calloc((nuint)((ulong)(1) << (int)f), (nuint)(sizeof(uint))));
            if (ctx->freqs == null)
            {
                FASTCOVER_ctx_destroy(ctx);
                return(unchecked ((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_memory_allocation)));
            }

            FASTCOVER_computeFrequency(ctx->freqs, ctx);
            return(0);
        }