public void Sampler_Basics()
{
    Random r = new Random(8);
    ArraySelector all = ArraySelector.All(10240);

    // Each Eighth call should keep roughly 1/8 of the rows (within 20%)
    int[] eighthArray = null;
    ArraySelector eighth = Sampler.Eighth(all, r, ref eighthArray);
    AssertClose(all.Count / 8, eighth.Count, 0.2f);

    // Sampling an eighth of an eighth yields roughly 1/64 of the original rows
    int[] sixtyfourthArray = null;
    ArraySelector sixtyfourth = Sampler.Eighth(eighth, r, ref sixtyfourthArray);
    AssertClose(eighth.Count / 8, sixtyfourth.Count, 0.2f);
}
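// A minimal sketch of the contract the test above appears to rely on: Sampler.Eighth keeps
// each selected row with probability 1/8 and reuses the caller's 'ref' buffer across calls.
// EighthSketch and its return type are hypothetical stand-ins, not the real Sampler API,
// which returns an ArraySelector over the source rows.
static class EighthSketch
{
    public static ArraySegment<int> Eighth(int count, Random r, ref int[] buffer)
    {
        // Grow the reusable buffer only when the incoming row count exceeds it
        if (buffer == null || buffer.Length < count) buffer = new int[count];

        int kept = 0;
        for (int i = 0; i < count; ++i)
        {
            // Keep a row when three random bits are all zero (probability 1/8)
            if ((r.Next() & 7) == 0) buffer[kept++] = i;
        }

        // The first 'kept' entries of 'buffer' are the surviving row indices
        return new ArraySegment<int>(buffer, 0, kept);
    }
}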
public void Sample()
{
    Random r = new Random(8);
    ArraySelector all = ArraySelector.All(10240);
    int[] eighthArray = null;
    int[] sixtyfourthArray = null;

    using (Benchmarker b = new Benchmarker("Sampler.Eighth", DefaultMeasureMilliseconds))
    {
        b.Measure("Sampler.Eighth", all.Count, () =>
        {
            ArraySelector eighth = Sampler.Eighth(all, r, ref eighthArray);
            ArraySelector sixtyfourth = Sampler.Eighth(eighth, r, ref sixtyfourthArray);
            return sixtyfourth.Count;
        });
    }
}
/// <summary>
///  Build a GroupBy Dictionary for Peek.
/// </summary>
/// <remarks>
///  Peek identifies each distinct common value and the approximate percentage of rows with it.
///  If we have many matching rows, we can sample - the sample will still contain any common values.
///  However, we don't know how many matches we have in advance.
///  Therefore, we build a Dictionary of all rows, 1/8 of rows, 1/64 of rows, and 1/512 of rows.
///  As soon as a given sample has enough rows to be statistically valid, we stop collecting the larger subsets.
///  This strategy allows us to run the overall query only once, end up with a large enough sample, and avoid building giant Dictionaries.
/// </remarks>
/// <param name="cancellationToken">CancellationToken to request an early stop</param>
private void BuildDictionary(CancellationToken cancellationToken)
{
    // Short-circuit path if there's one key column and it's an EnumColumn
    if (_column.IsEnumColumn())
    {
        BuildSingleEnumColumnDictionary(cancellationToken);
        return;
    }

    // Build a Random instance to sample rows
    Random r = new Random();

    // Build a Dictionary and CountAggregator for each sample
    GroupByDictionary[] dictionaries = new GroupByDictionary[SampleCount];
    CountAggregator[] counts = new CountAggregator[SampleCount];
    int[][] remapArrays = new int[SampleCount][];
    for (int i = 0; i < SampleCount; ++i)
    {
        dictionaries[i] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
        counts[i] = new CountAggregator();
    }

    // Retrieve the column getter
    Func<XArray> columnGetter = _column.CurrentGetter();

    // Track which sample we'll currently report
    int currentSample = 0;

    XArray[] arrays = new XArray[1];
    int count;
    while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
    {
        // Get the column values
        arrays[0] = columnGetter();

        // Build the GroupBy count for all rows and successive 1/8 samples
        for (int i = 0; i < SampleCount; ++i)
        {
            // Add these to the Join Dictionary
            if (i >= currentSample)
            {
                // Choose buckets for each row
                XArray indicesForRows = dictionaries[i].FindOrAdd(arrays);

                // Identify the bucket for each row and aggregate them
                counts[i].Add(indicesForRows, dictionaries[i].Count);

                // If this sample now has enough values, stop collecting bigger row sets
                if (currentSample == i - 1 && counts[i].TotalRowCount > RequiredSampleSize)
                {
                    // If every row was unique, stop early and don't set outputs (zero rows)
                    if (ShouldStopEarly(dictionaries[currentSample], counts[currentSample])) return;

                    dictionaries[currentSample] = null;
                    counts[currentSample] = null;
                    currentSample++;
                }
            }

            // Each successive dictionary has ~1/8 of the rows of the previous one
            if (i < SampleCount - 1)
            {
                ArraySelector sample = Sampler.Eighth(arrays[0].Selector, r, ref remapArrays[i]);
                arrays[0] = arrays[0].Reselect(sample);
            }
        }
    }

    // Once the loop is done, get the distinct values and aggregation results
    PostSortAndFilter(dictionaries[currentSample].DistinctKeys()[0], counts[currentSample].Values, counts[currentSample].TotalRowCount, currentSample == 0);
}
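// A toy walkthrough of the cascading-sample math described in the remarks above. The four
// tiers (all rows, 1/8, 1/64, 1/512) come from the remarks; CascadeMathSketch, the row count,
// and the threshold passed in are illustrative placeholders, not the class's actual constants.
static class CascadeMathSketch
{
    // e.g. CascadeMathSketch.Print(10_000_000, 10_000);
    public static void Print(long totalRows, long requiredSampleSize)
    {
        for (int tier = 0; tier < 4; ++tier)
        {
            // Tier i sees roughly totalRows / 8^i rows (shift right by 3 bits per tier)
            long expected = totalRows >> (3 * tier);
            Console.WriteLine($"Tier {tier}: ~{expected:N0} rows; large enough: {expected > requiredSampleSize}");
        }

        // BuildDictionary ends up reporting roughly the smallest tier that still exceeds the
        // threshold, discarding each larger tier as soon as the next smaller one fills up.
    }
}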