/// <summary>
/// Samples histograms of the column at several candidate bucket sizes and returns the
/// candidate with the smallest snapped bucket size, preferring the lowest suppressed
/// count when several candidates share that size.
/// </summary>
/// <returns>The selected <see cref="Result"/>.</returns>
protected override async Task<Result> Explore()
{
    var columnStats = await statsResultProvider.ResultAsync;
    var isIntegerColumn = ctx.ColumnType == DValueType.Integer;

    // Candidate bucket sizes are derived from the value range and the number of rows.
    var candidateSizes = BucketUtils.EstimateBucketResolutions(
        columnStats.Count,
        columnStats.Min,
        columnStats.Max,
        ValuesPerBucketTarget,
        isIntegerColumn: isIntegerColumn);

    var histogramQuery = new SingleColumnHistogram(ctx.Table, ctx.Column, candidateSizes);
    var queryResult = await conn.Exec(histogramQuery);

    var histograms = Histogram.FromQueryRows(queryResult.Rows);

    // One value-count summary per raw bucket size found in the query rows.
    var countsPerBucketSize = queryResult.Rows.GroupBy(
        queryRow => queryRow.BucketSize,
        (rawSize, rowsOfSize) => (BucketSize: new BucketSize(rawSize), Rows: ValueCounts.Compute(rowsOfSize)));

    // Pair each summary with the histogram that has the same snapped bucket size.
    var candidates =
        from counted in countsPerBucketSize
        join histogram in histograms
            on counted.BucketSize.SnappedSize equals histogram.BucketSize.SnappedSize
        select new Result(counted.Rows, histogram);

    var rankedCandidates =
        from candidate in candidates
        orderby candidate.BucketSize.SnappedSize, candidate.ValueCounts.SuppressedCount
        select candidate;

    return rankedCandidates.First();
}
/// <summary>
/// Builds a result from the distinct values of a column and their counts.
/// </summary>
/// <param name="distinctRows">
/// Distinct values with their counts. Rows that have a value are read as 64-bit integers.
/// </param>
internal Result(IEnumerable<ValueWithCount<JsonElement>> distinctRows)
{
    // Snapshot once: the rows feed both the value counts and the distribution, and the
    // incoming sequence may be lazily evaluated.
    var rows = distinctRows.ToList();

    ValueCounts = ValueCounts.Compute(rows);

    Distribution = rows
        .Where(r => r.HasValue)
        // Sort on the same 64-bit reading that is stored below. Sorting on GetInt32()
        // throws for any value outside the Int32 range even though it fits in a long.
        .OrderBy(r => r.Value.GetInt64())
        .Select(r => (r.Value.GetInt64(), r.Count))
        .ToList();
}
/// <summary>
/// Samples histograms of the column at several candidate bucket sizes and pairs each
/// histogram with the value counts of the rows that share its snapped bucket size.
/// </summary>
/// <returns>
/// One <see cref="HistogramWithCounts"/> per matched bucket size, or <c>null</c> when the
/// statistics are unavailable or no usable (distinct, non-equal) bounds can be determined.
/// </returns>
protected async override Task<List<HistogramWithCounts>?> Explore()
{
    var stats = await statsResultProvider.ResultAsync;
    if (stats == null)
    {
        return null;
    }

    var (minBound, maxBound) = (stats.Min, stats.Max);
    if (!minBound.HasValue || !maxBound.HasValue)
    {
        // The statistics are missing a bound: fall back to the extremes of the distinct values.
        var distincts = await distinctValuesProvider.ResultAsync;
        if (distincts == null || distincts.ValueCounts.NonSuppressedNonNullCount == 0)
        {
            return null;
        }

        // Materialized so that Min() and Max() do not each re-run the projection.
        var values = distincts.DistinctRows
            .Where(row => row.HasValue)
            .Select(row => row.Value.GetDouble())
            .ToList();
        minBound ??= values.Min();
        maxBound ??= values.Max();
    }

    if (!minBound.HasValue || !maxBound.HasValue || minBound == maxBound)
    {
        // The column name is passed as a template argument; previously the placeholder
        // had no matching argument, so the column was never identified in the log.
        Logger.LogWarning(
            "Unable to calculate suitable bounds for numerical column {Column}.",
            Context.Column);
        return null;
    }

    var bucketsToSample = BucketUtils.EstimateBucketResolutions(
        stats.Count,
        (double)minBound,
        (double)maxBound,
        ValuesPerBucketTarget,
        isIntegerColumn: Context.ColumnInfo.Type == DValueType.Integer);

    var histogramQ = await Context.Exec(new SingleColumnHistogram(bucketsToSample));

    var histograms = Histogram.FromQueryRows(histogramQ.Rows);

    // One value-count summary per raw bucket size found in the query rows.
    var valueCounts = histogramQ.Rows
        .GroupBy(
            row => row.BucketSize,
            (bs, rows) => (BucketSize: new BucketSize(bs), Rows: ValueCounts.Compute(rows)));

    // Pair each summary with the histogram that has the same snapped bucket size.
    return valueCounts
        .Join(
            histograms,
            v => v.BucketSize.SnappedSize,
            h => h.BucketSize.SnappedSize,
            (v, h) => new HistogramWithCounts(v.Rows, h))
        .ToList();
}
/// <summary>
/// Lazily yields each label group of <paramref name="queryResult"/> together with its
/// value counts, stopping at the first group whose suppressed-row ratio exceeds
/// <c>SuppressedRatioThreshold</c>. That group and all later ones are not returned.
/// </summary>
/// <param name="queryResult">The grouping-sets rows to group by label.</param>
/// <returns>Pairs of value counts and the group they were computed from.</returns>
private static IEnumerable<(ValueCounts, IGrouping<string, GroupingSetsResult<DateTime>>)> ProcessLinearBuckets(
    IEnumerable<GroupingSetsResult<DateTime>> queryResult)
{
    var acceptedBuckets = TimeUtilities.GroupByLabel(queryResult)
        .Select(bucket => (Counts: ValueCounts.Compute(bucket), Bucket: bucket))
        // Written as a negated "greater than" so the cut-off matches a plain
        // "if (ratio > threshold) break" for every ratio value.
        .TakeWhile(entry => !(entry.Counts.SuppressedRowRatio > SuppressedRatioThreshold));

    foreach (var (bucketCounts, bucket) in acceptedBuckets)
    {
        yield return (bucketCounts, bucket);
    }
}
/// <summary>
/// Checks that the "lastname" column of the "cards" table is not classified as e-mail:
/// after the <see cref="TextColumnTrim"/> query (both ends, e-mail address characters),
/// the rows that are null or exactly "@" must not account for the total count.
/// </summary>
/// <returns>A task the test runner can await, so failures and exceptions are observed.</returns>
public async System.Threading.Tasks.Task TestEmailNegative()
{
    // Returns Task rather than void: an async void method cannot be awaited by the runner.
    using var queryScope = testFixture.SimpleQueryTestScope(
        TestDataSource,
        VcrSharp.Cassette.GenerateVcrFilename(this));

    var result = await queryScope.QueryRows(
        new TextColumnTrim("cards", "lastname", TextColumnTrimType.Both, Constants.EmailAddressChars));

    var counts = ValueCounts.Compute(result);

    // The column counts as e-mail only when every counted row is null or a lone "@".
    var isEmail = counts.TotalCount == result
        .Where(r => r.IsNull || r.Value == "@")
        .Sum(r => r.Count);

    Assert.False(isEmail);
}
/// <summary>
/// Builds a result from the distinct values of a column and their counts.
/// </summary>
/// <param name="distinctRows">Distinct values with their counts.</param>
public Result(IEnumerable<ValueWithCount<JsonElement>> distinctRows)
{
    // Snapshot the rows once. The incoming sequence may be lazily evaluated, and it is
    // read both here (for the value counts) and later through DistinctRows.
    var materializedRows = new List<ValueWithCount<JsonElement>>(distinctRows);

    DistinctRows = materializedRows;
    ValueCounts = ValueCounts.Compute(materializedRows);
}
/// <summary>
/// Builds a result from the distinct values of a column, their counts, and an optional
/// distribution of the number of decimals.
/// </summary>
/// <param name="distinctRows">Distinct values with their counts; copied into a list.</param>
/// <param name="decimalsCountDistribution">
/// Distribution of decimals counts, or <c>null</c> when none is available.
/// </param>
public Result(
    IEnumerable<ValueWithCount<JsonElement>> distinctRows,
    NumericDistribution? decimalsCountDistribution)
{
    DecimalsCountDistribution = decimalsCountDistribution;

    // The value counts are computed from the stored snapshot, not from the incoming sequence.
    DistinctRows = distinctRows.ToList();
    ValueCounts = ValueCounts.Compute(DistinctRows);
}