/// <summary>
/// Picks the best histogram for the column: awaits the column statistics, estimates candidate
/// bucket resolutions, runs a single histogram query over them, and pairs every histogram with
/// the value counts computed for the same bucket size.
/// </summary>
/// <returns>
/// The candidate with the smallest snapped bucket size; ties are broken by the lowest
/// suppressed count. <c>First()</c> throws if the join produces no candidates.
/// </returns>
protected async override Task <Result> Explore()
        {
            var columnStats = await statsResultProvider.ResultAsync;

            var resolutions = BucketUtils.EstimateBucketResolutions(
                columnStats.Count,
                columnStats.Min,
                columnStats.Max,
                ValuesPerBucketTarget,
                isIntegerColumn: ctx.ColumnType == DValueType.Integer);

            var query = new SingleColumnHistogram(ctx.Table, ctx.Column, resolutions);
            var queryResult = await conn.Exec(query);

            var histogramsBySize = Histogram.FromQueryRows(queryResult.Rows);

            // One value-count summary per distinct bucket size returned by the query.
            var countsBySize = queryResult.Rows.GroupBy(
                row => row.BucketSize,
                (size, rowsOfSize) => (Size: new BucketSize(size), Counts: ValueCounts.Compute(rowsOfSize)));

            // Pair each summary with the histogram that has the same snapped bucket size.
            var candidates =
                from counted in countsBySize
                join histogram in histogramsBySize
                    on counted.Size.SnappedSize equals histogram.BucketSize.SnappedSize
                select new Result(counted.Counts, histogram);

            var ranked =
                from candidate in candidates
                orderby candidate.BucketSize.SnappedSize, candidate.ValueCounts.SuppressedCount
                select candidate;

            return ranked.First();
        }
 /// <summary>
 /// Builds the result from the distinct rows of a column: computes the overall value counts and
 /// an ascending list of (value, count) pairs for the rows that carry a value.
 /// </summary>
 /// <param name="distinctRows">Distinct values of the column together with their counts.</param>
 internal Result(IEnumerable <ValueWithCount <JsonElement> > distinctRows)
 {
     // Materialize once: the rows are needed both for the counts and for the distribution,
     // and the incoming sequence may be lazy.
     var rows = distinctRows.ToList();

     ValueCounts  = ValueCounts.Compute(rows);

     // Sort on the same 64-bit value that is stored. Sorting via GetInt32() threw a
     // FormatException for any value outside the Int32 range.
     Distribution = rows
                    .Where(r => r.HasValue)
                    .OrderBy(r => r.Value.GetInt64())
                    .Select(r => (r.Value.GetInt64(), r.Count))
                    .ToList();
 }
Exemple #3
0
        /// <summary>
        /// Computes histograms for a numerical column at several bucket resolutions and pairs
        /// each histogram with the value counts of the rows that share its bucket size.
        /// </summary>
        /// <remarks>
        /// The bounds come from the column statistics. When either bound is missing, it is
        /// filled in from the minimum/maximum of the non-null distinct values.
        /// </remarks>
        /// <returns>
        /// One <c>HistogramWithCounts</c> per matched bucket size, or <c>null</c> when the
        /// statistics or distinct values are unavailable, or when no usable (distinct, non-null)
        /// bounds could be determined.
        /// </returns>
        protected async override Task <List <HistogramWithCounts>?> Explore()
        {
            var stats = await statsResultProvider.ResultAsync;

            if (stats == null)
            {
                return null;
            }

            var (minBound, maxBound) = (stats.Min, stats.Max);
            if (!minBound.HasValue || !maxBound.HasValue)
            {
                var distincts = await distinctValuesProvider.ResultAsync;
                if (distincts == null || distincts.ValueCounts.NonSuppressedNonNullCount == 0)
                {
                    return null;
                }

                // Materialize once so Min() and Max() do not re-run the projection.
                var values = distincts.DistinctRows
                             .Where(row => row.HasValue)
                             .Select(row => row.Value.GetDouble())
                             .ToList();

                // Min()/Max() throw on an empty sequence; with no values the bounds stay
                // unset and the check below returns null with a warning instead.
                if (values.Count > 0)
                {
                    minBound ??= values.Min();
                    maxBound ??= values.Max();
                }
            }

            if (!minBound.HasValue || !maxBound.HasValue || minBound == maxBound)
            {
                // Pass the column as a structured-logging argument; the previous message
                // contained the placeholder text verbatim and never logged the column name.
                Logger.LogWarning(
                    "Unable to calculate suitable bounds for numerical column {Column}.",
                    Context.Column);

                return null;
            }

            var bucketsToSample = BucketUtils.EstimateBucketResolutions(
                stats.Count,
                minBound.Value,
                maxBound.Value,
                ValuesPerBucketTarget,
                isIntegerColumn: Context.ColumnInfo.Type == DValueType.Integer);

            var histogramQ = await Context.Exec(new SingleColumnHistogram(bucketsToSample));

            var histograms = Histogram.FromQueryRows(histogramQ.Rows);

            // One value-count summary per distinct bucket size returned by the query.
            var valueCounts = histogramQ.Rows
                              .GroupBy(
                row => row.BucketSize,
                (bs, rows) => (BucketSize: new BucketSize(bs), Rows: ValueCounts.Compute(rows)));

            return valueCounts
                   .Join(
                       histograms,
                       v => v.BucketSize.SnappedSize,
                       h => h.BucketSize.SnappedSize,
                       (v, h) => new HistogramWithCounts(v.Rows, h))
                   .ToList();
        }
Exemple #4
0
        /// <summary>
        /// Lazily walks the label groups produced by <c>TimeUtilities.GroupByLabel</c> and yields
        /// each group together with its computed value counts.
        /// </summary>
        /// <remarks>
        /// Iteration ends at the first group whose suppressed-row ratio exceeds
        /// <c>SuppressedRatioThreshold</c>; that group and all later ones are not yielded.
        /// </remarks>
        /// <param name="queryResult">Grouping-sets query rows to be grouped by label.</param>
        private static IEnumerable <(ValueCounts, IGrouping <string, GroupingSetsResult <DateTime> >)> ProcessLinearBuckets(
            IEnumerable <GroupingSetsResult <DateTime> > queryResult)
        {
            foreach (var labelGroup in TimeUtilities.GroupByLabel(queryResult))
            {
                var groupCounts = ValueCounts.Compute(labelGroup);
                var tooSuppressed = groupCounts.SuppressedRowRatio > SuppressedRatioThreshold;

                if (tooSuppressed)
                {
                    yield break;
                }

                yield return (groupCounts, labelGroup);
            }
        }
        /// <summary>
        /// Verifies that the <c>cards.lastname</c> column is not classified as an e-mail column:
        /// after trimming e-mail address characters from both ends, the remaining values must
        /// not all be null or a lone "@".
        /// </summary>
        /// <remarks>
        /// Returns a task rather than <c>async void</c> so the test runner can await the test
        /// and observe any exception it throws.
        /// </remarks>
        public async System.Threading.Tasks.Task TestEmailNegative()
        {
            using var queryScope = testFixture.SimpleQueryTestScope(
                      TestDataSource,
                      VcrSharp.Cassette.GenerateVcrFilename(this));

            var result = await queryScope.QueryRows(
                new TextColumnTrim("cards", "lastname", TextColumnTrimType.Both, Constants.EmailAddressChars));

            var counts = ValueCounts.Compute(result);

            // The column counts as e-mail only when every row is either null or reduced to "@".
            var isEmail = counts.TotalCount == result
                          .Where(r => r.IsNull || r.Value == "@")
                          .Sum(r => r.Count);

            Assert.False(isEmail);
        }
 /// <summary>
 /// Stores the distinct rows of a column and the value counts computed from them.
 /// </summary>
 /// <param name="distinctRows">Distinct values of the column together with their counts.</param>
 public Result(IEnumerable <ValueWithCount <JsonElement> > distinctRows)
 {
     // Materialize once: a lazy sequence would otherwise be re-evaluated by the count
     // computation and again by every later reader of DistinctRows.
     var rows = new List <ValueWithCount <JsonElement> >(distinctRows);

     DistinctRows = rows;
     ValueCounts  = ValueCounts.Compute(rows);
 }
Exemple #7
0
 /// <summary>
 /// Captures the outcome of a distinct-values exploration: a materialized copy of the rows,
 /// the value counts computed from that copy, and the supplied decimals-count distribution.
 /// </summary>
 /// <param name="distinctRows">Distinct values of the column together with their counts.</param>
 /// <param name="decimalsCountDistribution">Distribution of decimals counts; may be null.</param>
 public Result(
     IEnumerable <ValueWithCount <JsonElement> > distinctRows,
     NumericDistribution? decimalsCountDistribution)
 {
     DecimalsCountDistribution = decimalsCountDistribution;

     // Snapshot the rows first so the counts are computed over the stored list.
     DistinctRows = distinctRows.ToList();
     ValueCounts = ValueCounts.Compute(DistinctRows);
 }