예제 #1
0
        /// <summary>
        ///  Decide whether collection should stop because the given sample has nothing worth reporting.
        /// </summary>
        /// <param name="dictionary">GroupByDictionary holding the distinct values found so far</param>
        /// <param name="counter">CountAggregator holding the per-value counts and the total row count</param>
        /// <returns>
        ///  True if every row had a distinct value, or if no value's count reaches
        ///  TotalRowCount * MinimumPercentageToReport; false as soon as one value does.
        /// </returns>
        private bool ShouldStopEarly(GroupByDictionary dictionary, CountAggregator counter)
        {
            // As many distinct values as rows means nothing repeated; stop
            bool everyValueUnique = (dictionary.Count == counter.TotalRowCount);
            if (everyValueUnique)
            {
                return true;
            }

            // Smallest count (truncated to an int) a value needs in order to be reported
            XArray perValueCounts      = counter.Values;
            int[]  rawCounts           = (int[])perValueCounts.Array;
            int    minimumRowsToReport = (int)(counter.TotalRowCount * MinimumPercentageToReport);

            // A single value at or above the minimum is enough to keep going
            int position = 0;
            while (position < perValueCounts.Count)
            {
                if (rawCounts[perValueCounts.Index(position)] >= minimumRowsToReport)
                {
                    return false;
                }

                position++;
            }

            // Not all unique, but no value is common enough to report; stop
            return true;
        }
예제 #2
0
        /// <summary>
        ///  Construct a GroupBy over a source table, grouping by the given key columns
        ///  and computing the given aggregators.
        /// </summary>
        /// <param name="source">Table to read rows from</param>
        /// <param name="keyColumns">Columns whose values identify each group</param>
        /// <param name="aggregators">Aggregators which each contribute one output column</param>
        /// <exception cref="ArgumentNullException">
        ///  Thrown if source, keyColumns, or aggregators is null.
        /// </exception>
        public GroupBy(IXTable source, IList <IXColumn> keyColumns, IList <IAggregator> aggregators)
        {
            // Validate every argument before touching any field, so a bad call
            // fails with a named argument rather than a NullReferenceException
            // after the instance is already partly initialized.
            if (source == null)
            {
                throw new ArgumentNullException("source");
            }

            if (keyColumns == null)
            {
                throw new ArgumentNullException("keyColumns");
            }

            if (aggregators == null)
            {
                throw new ArgumentNullException("aggregators");
            }

            _source      = source;
            _keyColumns  = keyColumns.ToArray();
            _aggregators = aggregators.ToArray();

            // Build a typed dictionary to handle the rank and key column types
            _dictionary = new GroupByDictionary(keyColumns.Select((col) => col.ColumnDetails).ToArray());

            // Build a DeferredArrayColumn for each key, followed by one for each aggregator
            int keyCount        = keyColumns.Count;
            int aggregatorCount = aggregators.Count;

            _columns = new DeferredArrayColumn[keyCount + aggregatorCount];
            for (int i = 0; i < keyCount; ++i)
            {
                _columns[i] = new DeferredArrayColumn(keyColumns[i].ColumnDetails);
            }

            // Aggregator columns are placed after all of the key columns
            for (int i = 0; i < aggregatorCount; ++i)
            {
                _columns[keyCount + i] = new DeferredArrayColumn(_aggregators[i].ColumnDetails);
            }
        }
예제 #3
0
        /// <summary>
        ///  Build a GroupBy Dictionary for Peek.
        /// </summary>
        /// <remarks>
        ///  Peek identifies each distinct common value and the approximate percentage of rows with it.
        ///  If we have many matching rows, we can sample - the sample will have any common values in it.
        ///  However, we don't know how many matches we have in advance.
        ///  Therefore, we build a Dictionary of all rows, 1/8 of rows, 1/64 of rows, and 1/512 of rows.
        ///  As soon as a given sample has enough samples to be statistically valid, we stop collecting the larger subsets.
        ///  This strategy allows us to run the overall query only once, end up with a large enough sample, and avoid building giant Dictionaries.
        ///
        ///  When a larger sample is retired, ShouldStopEarly is consulted for it; if that returns true,
        ///  this method returns immediately without calling PostSortAndFilter.
        ///  Otherwise, once the source is exhausted, the smallest-index sample still being collected
        ///  is passed to PostSortAndFilter.
        /// </remarks>
        /// <param name="cancellationToken">CancellationToken to request early stop</param>
        private void BuildDictionary(CancellationToken cancellationToken)
        {
            // Short-circuit path if there's one key column and it's an EnumColumn
            if (_column.IsEnumColumn())
            {
                BuildSingleEnumColumnDictionary(cancellationToken);
                return;
            }

            // Build a Random instance to sample rows (no explicit seed is supplied)
            Random r = new Random();

            // Build a Dictionary and CountAggregator for each sample.
            // remapArrays holds one int[] per sample, passed by ref to Sampler.Eighth below
            // (presumably a reusable scratch buffer - TODO confirm against Sampler).
            GroupByDictionary[] dictionaries = new GroupByDictionary[SampleCount];
            CountAggregator[]   counts       = new CountAggregator[SampleCount];
            int[][]             remapArrays  = new int[SampleCount][];
            for (int i = 0; i < SampleCount; ++i)
            {
                dictionaries[i] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
                counts[i]       = new CountAggregator();
            }

            // Retrieve the column getter
            Func <XArray> columnGetter = _column.CurrentGetter();

            // Track which sample we'll currently report.
            // Index 0 is the unsampled set; samples below this index have been retired (set to null).
            int currentSample = 0;

            // Single-element array because FindOrAdd takes an XArray[] and there is one key column here
            XArray[] arrays = new XArray[1];
            int      count;

            // 'count' is only used to detect the end of the source (Next returning zero)
            while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
            {
                // Get the column values
                arrays[0] = columnGetter();

                // Build the GroupBy count for all rows and successive 1/8 samples
                for (int i = 0; i < SampleCount; ++i)
                {
                    // Add these to the GroupBy dictionary for sample i, unless that sample was already retired
                    if (i >= currentSample)
                    {
                        // Choose buckets for each row
                        XArray indicesForRows = dictionaries[i].FindOrAdd(arrays);

                        // Identify the bucket for each row and aggregate them
                        counts[i].Add(indicesForRows, dictionaries[i].Count);

                        // If this sample now has enough values, stop collecting bigger row sets.
                        // Only the sample immediately after the current one (i == currentSample + 1) is checked,
                        // so at most one sample is retired per pass of the inner loop.
                        if (currentSample == i - 1 && counts[i].TotalRowCount > RequiredSampleSize)
                        {
                            // Before dropping the larger sample, ask ShouldStopEarly about it:
                            // if every row was unique (or nothing was common enough to report),
                            // stop early and don't set outputs (zero rows)
                            if (ShouldStopEarly(dictionaries[currentSample], counts[currentSample]))
                            {
                                return;
                            }

                            // Release the larger sample and make sample i the one to report
                            dictionaries[currentSample] = null;
                            counts[currentSample]       = null;
                            currentSample++;
                        }
                    }

                    // Each successive dictionary has ~1/8 of the rows of the previous one.
                    // This runs even for retired samples (it is outside the 'if' above), so arrays[0]
                    // is narrowed once per level before each later sample sees it.
                    if (i < SampleCount - 1)
                    {
                        ArraySelector sample = Sampler.Eighth(arrays[0].Selector, r, ref remapArrays[i]);
                        arrays[0] = arrays[0].Reselect(sample);
                    }
                }
            }

            // Once the loop is done, get the distinct values and aggregation results.
            // The last argument is true only when the unsampled set (index 0) is being reported
            // (NOTE(review): presumably tells PostSortAndFilter whether counts are exact - confirm).
            PostSortAndFilter(dictionaries[currentSample].DistinctKeys()[0], counts[currentSample].Values, counts[currentSample].TotalRowCount, currentSample == 0);
        }