Beispiel #1
0
        // Checks that Sampler.Eighth keeps roughly one eighth of the rows it is given,
        // both on a full selector and when applied again to an already-sampled selector.
        public void Sampler_Basics()
        {
            // Constant seed, so the pseudo-random sequence does not vary between runs
            Random        random     = new Random(8);
            ArraySelector everything = ArraySelector.All(10240);

            // First pass: sample the full selector and compare against Count / 8
            // (0.2f is presumably the allowed relative error - confirm against AssertClose)
            int[]         firstBuffer = null;
            ArraySelector oneEighth   = Sampler.Eighth(everything, random, ref firstBuffer);
            AssertClose(everything.Count / 8, oneEighth.Count, 0.2f);

            // Second pass: sample the sample, using a separate buffer
            int[]         secondBuffer   = null;
            ArraySelector oneSixtyFourth = Sampler.Eighth(oneEighth, random, ref secondBuffer);
            AssertClose(oneEighth.Count / 8, oneSixtyFourth.Count, 0.2f);
        }
        // Benchmarks two chained Sampler.Eighth passes over a 10,240 row selector.
        public void Sample()
        {
            Random        random     = new Random(8);
            ArraySelector everything = ArraySelector.All(10240);

            // Buffers are declared outside the measured lambda and handed to Sampler.Eighth by ref on every call
            int[] firstBuffer  = null;
            int[] secondBuffer = null;

            using (Benchmarker benchmarker = new Benchmarker($"Sampler.Eighth", DefaultMeasureMilliseconds))
            {
                benchmarker.Measure("Sampler.Eighth", everything.Count, () =>
                {
                    // Sample once, then sample the result again
                    ArraySelector firstPass  = Sampler.Eighth(everything, random, ref firstBuffer);
                    ArraySelector secondPass = Sampler.Eighth(firstPass, random, ref secondBuffer);

                    // The lambda returns the size of the final sample
                    return secondPass.Count;
                });
            }
        }
Beispiel #3
0
        /// <summary>
        ///  Builds the GroupBy Dictionary which backs Peek.
        /// </summary>
        /// <remarks>
        ///  Peek reports each distinct common value along with the approximate percentage of rows containing it.
        ///  When many rows match, a sample is sufficient, because common values will also appear in the sample.
        ///  The number of matching rows is not known up front, however.
        ///  So a Dictionary is built for all rows and for each successive 1/8 sample (1/8, 1/64, 1/512 of rows).
        ///  Once a smaller sample has collected more than RequiredSampleSize rows, the next larger set is dropped.
        ///  This way the query runs only once, the reported sample is large enough, and no giant Dictionary is kept.
        /// </remarks>
        /// <param name="cancellationToken">CancellationToken to request early stop</param>
        private void BuildDictionary(CancellationToken cancellationToken)
        {
            // An EnumColumn key is handled by a dedicated path
            if (_column.IsEnumColumn())
            {
                BuildSingleEnumColumnDictionary(cancellationToken);
                return;
            }

            // Source of randomness used to pick sampled rows
            Random random = new Random();

            // One Dictionary, one CountAggregator and one sampler buffer per sample level
            GroupByDictionary[] sampleDictionaries = new GroupByDictionary[SampleCount];
            CountAggregator[]   sampleCounts       = new CountAggregator[SampleCount];
            int[][]             samplerBuffers     = new int[SampleCount][];
            for (int level = 0; level < SampleCount; ++level)
            {
                sampleDictionaries[level] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
                sampleCounts[level]       = new CountAggregator();
            }

            // Getter which returns the key column values for the current batch
            Func<XArray> getValues = _column.CurrentGetter();

            // Index of the sample level which would be reported if the source ended now
            int reportedLevel = 0;

            XArray[] keyArrays = new XArray[1];

            // Pull batches until the source reports no more rows
            while (_source.Next(XTableExtensions.DefaultBatchSize, cancellationToken) != 0)
            {
                // Level 0 sees every row of the batch
                keyArrays[0] = getValues();

                // Walk from the full batch down through successive 1/8 samples
                for (int level = 0; level < SampleCount; ++level)
                {
                    // Levels below the reported one were already discarded; skip aggregating them
                    if (level >= reportedLevel)
                    {
                        GroupByDictionary dictionary = sampleDictionaries[level];
                        CountAggregator   aggregator = sampleCounts[level];

                        // Find (or create) the bucket for each row, then count the rows per bucket
                        XArray bucketIndices = dictionary.FindOrAdd(keyArrays);
                        aggregator.Add(bucketIndices, dictionary.Count);

                        // When the level just after the reported one has enough rows, retire the reported one
                        if (reportedLevel == level - 1 && aggregator.TotalRowCount > RequiredSampleSize)
                        {
                            // If every row was unique, stop early and leave the outputs unset (zero rows)
                            if (ShouldStopEarly(sampleDictionaries[reportedLevel], sampleCounts[reportedLevel]))
                            {
                                return;
                            }

                            // Release the larger set and report from this level onward
                            sampleDictionaries[reportedLevel] = null;
                            sampleCounts[reportedLevel]       = null;
                            reportedLevel++;
                        }
                    }

                    // Narrow the rows to ~1/8 for the next level; this runs even for skipped levels
                    // so that every level keeps receiving its own fraction of each batch
                    if (level < SampleCount - 1)
                    {
                        ArraySelector eighth = Sampler.Eighth(keyArrays[0].Selector, random, ref samplerBuffers[level]);
                        keyArrays[0] = keyArrays[0].Reselect(eighth);
                    }
                }
            }

            // Source exhausted: publish the distinct values and counts of the reported level.
            // The last argument is true only when the unsampled (level 0) set is the one reported.
            GroupByDictionary finalDictionary = sampleDictionaries[reportedLevel];
            CountAggregator   finalCounts     = sampleCounts[reportedLevel];
            PostSortAndFilter(finalDictionary.DistinctKeys()[0], finalCounts.Values, finalCounts.TotalRowCount, reportedLevel == 0);
        }