/// <summary>
///  Decide whether Peek can stop scanning additional rows for this sample.
///  Stops when every value was unique, or when no value is common enough to be reported.
/// </summary>
/// <param name="dictionary">Dictionary of distinct values collected so far</param>
/// <param name="counter">Aggregator with per-value row counts and the total row count</param>
/// <returns>True to stop scanning; false if at least one value is frequent enough to report</returns>
private bool ShouldStopEarly(GroupByDictionary dictionary, CountAggregator counter)
{
    // All values distinct so far => nothing will be common; stop
    if (dictionary.Count == counter.TotalRowCount) return true;

    // A value must appear in at least this many rows to be worth reporting
    XArray valueCounts = counter.Values;
    int[] rawCounts = (int[])valueCounts.Array;
    int minimumToReport = (int)(counter.TotalRowCount * MinimumPercentageToReport);

    // Keep scanning only if some value has already crossed the reporting threshold
    bool anyReportable = false;
    for (int index = 0; index < valueCounts.Count && !anyReportable; ++index)
    {
        anyReportable = rawCounts[valueCounts.Index(index)] >= minimumToReport;
    }

    return !anyReportable;
}
/// <summary>
///  Build a GroupBy stage over the given source, grouping by the key columns
///  and computing each aggregator per distinct key combination.
/// </summary>
/// <param name="source">Table to read rows from</param>
/// <param name="keyColumns">Columns whose distinct value combinations define the groups</param>
/// <param name="aggregators">Aggregations to compute per group</param>
/// <exception cref="ArgumentNullException">If any argument is null</exception>
public GroupBy(IXTable source, IList<IXColumn> keyColumns, IList<IAggregator> aggregators)
{
    // Validate every argument up front so callers get ArgumentNullException
    // rather than a NullReferenceException from the ToArray()/Select() calls below
    if (source == null) throw new ArgumentNullException(nameof(source));
    if (keyColumns == null) throw new ArgumentNullException(nameof(keyColumns));
    if (aggregators == null) throw new ArgumentNullException(nameof(aggregators));

    _source = source;
    _keyColumns = keyColumns.ToArray();
    _aggregators = aggregators.ToArray();

    // Build a typed dictionary to handle the rank and key column types
    _dictionary = new GroupByDictionary(keyColumns.Select((col) => col.ColumnDetails).ToArray());

    // Build a DeferredArrayColumn for each key and for each aggregator output;
    // key columns come first, aggregator columns follow in order
    _columns = new DeferredArrayColumn[keyColumns.Count + aggregators.Count];
    for (int i = 0; i < keyColumns.Count; ++i)
    {
        _columns[i] = new DeferredArrayColumn(keyColumns[i].ColumnDetails);
    }

    for (int i = 0; i < aggregators.Count; ++i)
    {
        _columns[keyColumns.Count + i] = new DeferredArrayColumn(_aggregators[i].ColumnDetails);
    }
}
/// <summary>
/// Build a GroupBy Dictionary for Peek.
/// </summary>
/// <remarks>
/// Peek identifies each distinct common value and the approximate percentage of rows with it.
/// If we have many matching rows, we can sample - the sample will have any common values in it.
/// However, we don't know how many matches we have in advance.
/// Therefore, we build a Dictionary of all rows, 1/8 of rows, 1/64 of rows, and 1/512 of rows.
/// As soon as a given sample has enough samples to be statistically valid, we stop collecting the larger subsets.
/// This strategy allows us to run the overall query only once, end up with a large enough sample, and avoid building giant Dictionaries.
/// </remarks>
/// <param name="cancellationToken">CancellationToken to request early stop</param>
private void BuildDictionary(CancellationToken cancellationToken)
{
    // Short-circuit path if there's one key column and it's an EnumColumn
    if (_column.IsEnumColumn())
    {
        BuildSingleEnumColumnDictionary(cancellationToken);
        return;
    }

    // Build a Random instance to sample rows
    // NOTE(review): unseeded, so the sampled subsets (and therefore approximate
    // percentages) can vary slightly between runs - confirm that's acceptable for Peek
    Random r = new Random();

    // Build a Dictionary and CountAggregator for each sample.
    // Slot 0 sees every row; each later slot sees ~1/8 of the rows the previous slot saw
    // (the Sampler.Eighth call at the bottom of the loop shrinks arrays[0] between slots).
    GroupByDictionary[] dictionaries = new GroupByDictionary[SampleCount];
    CountAggregator[] counts = new CountAggregator[SampleCount];
    int[][] remapArrays = new int[SampleCount][];
    for (int i = 0; i < SampleCount; ++i)
    {
        dictionaries[i] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
        counts[i] = new CountAggregator();
    }

    // Retrieve the column getter (single key column for Peek)
    Func<XArray> columnGetter = _column.CurrentGetter();

    // Track which sample we'll currently report: the largest sample still being collected.
    // Samples smaller than currentSample have been retired (set to null below).
    int currentSample = 0;

    XArray[] arrays = new XArray[1];
    int count;
    while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
    {
        // Get the column values
        arrays[0] = columnGetter();

        // Build the GroupBy count for all rows and successive 1/8 samples
        for (int i = 0; i < SampleCount; ++i)
        {
            // Add these to the Join Dictionary
            if (i >= currentSample)
            {
                // Choose buckets for each row
                XArray indicesForRows = dictionaries[i].FindOrAdd(arrays);

                // Identify the bucket for each row and aggregate them
                counts[i].Add(indicesForRows, dictionaries[i].Count);

                // If this sample now has enough values, stop collecting bigger row sets.
                // Only the sample immediately after the current one can retire it,
                // so currentSample advances at most one slot per batch iteration.
                if (currentSample == i - 1 && counts[i].TotalRowCount > RequiredSampleSize)
                {
                    // If every row was unique, stop early and don't set outputs (zero rows)
                    if (ShouldStopEarly(dictionaries[currentSample], counts[currentSample]))
                    {
                        return;
                    }

                    // Release the larger sample's memory; report from the smaller samples from now on
                    dictionaries[currentSample] = null;
                    counts[currentSample] = null;
                    currentSample++;
                }
            }

            // Each successive dictionary has ~1/8 of the rows of the previous one
            if (i < SampleCount - 1)
            {
                ArraySelector sample = Sampler.Eighth(arrays[0].Selector, r, ref remapArrays[i]);
                arrays[0] = arrays[0].Reselect(sample);
            }
        }
    }

    // Once the loop is done, get the distinct values and aggregation results.
    // currentSample == 0 means all rows were counted (no sampling occurred);
    // presumably PostSortAndFilter uses this to report exact vs. approximate counts - confirm there.
    PostSortAndFilter(dictionaries[currentSample].DistinctKeys()[0], counts[currentSample].Values, counts[currentSample].TotalRowCount, currentSample == 0);
}