/// <summary>
/// Samples histograms of the column at several estimated bucket resolutions and
/// returns the candidate that sorts first by snapped bucket size, then by
/// suppressed count.
/// </summary>
/// <returns>The first-ranked <see cref="Result"/>.</returns>
protected async override Task <Result> Explore()
        {
            var columnStats = await statsResultProvider.ResultAsync;

            // Derive the candidate bucket sizes from the column's count and value range.
            var candidateSizes = BucketUtils.EstimateBucketResolutions(
                columnStats.Count,
                columnStats.Min,
                columnStats.Max,
                ValuesPerBucketTarget,
                isIntegerColumn: ctx.ColumnType == DValueType.Integer);

            var queryResult = await conn.Exec(new SingleColumnHistogram(ctx.Table, ctx.Column, candidateSizes));

            var histogramsBySize = Histogram.FromQueryRows(queryResult.Rows);

            // One ValueCounts summary per bucket size present in the query rows.
            var countsBySize = queryResult.Rows.GroupBy(
                row => row.BucketSize,
                (size, sizeRows) => (BucketSize: new BucketSize(size), Rows: ValueCounts.Compute(sizeRows)));

            // Pair every summary with the histogram of the same snapped size, then rank.
            var ranked =
                from counts in countsBySize
                join histogram in histogramsBySize
                    on counts.BucketSize.SnappedSize equals histogram.BucketSize.SnappedSize
                select new Result(counts.Rows, histogram) into candidate
                orderby candidate.BucketSize.SnappedSize, candidate.ValueCounts.SuppressedCount
                select candidate;

            return ranked.First();
        }
 /// <summary>
 /// Builds the result from the distinct values of an integer column.
 /// </summary>
 /// <param name="distinctRows">Distinct values together with their counts.</param>
 internal Result(IEnumerable <ValueWithCount <JsonElement> > distinctRows)
 {
     // Materialize once: the rows are consumed both for the counts and for the
     // distribution, and the incoming sequence may be a lazy query.
     var rows = distinctRows.ToList();

     ValueCounts  = ValueCounts.Compute(rows);

     // Sort by the same 64-bit reading that is projected below. Sorting via
     // GetInt32() throws FormatException for values outside the Int32 range
     // even though the distribution itself stores 64-bit values.
     Distribution = rows
                    .Where(r => r.HasValue)
                    .OrderBy(r => r.Value.GetInt64())
                    .Select(r => (r.Value.GetInt64(), r.Count))
                    .ToList();
 }
Beispiel #3
0
        /// <summary>
        /// Samples histograms of the numeric column at several estimated bucket
        /// resolutions and pairs each histogram with the value counts of its rows.
        /// </summary>
        /// <returns>
        /// One <see cref="HistogramWithCounts"/> per bucket size present in the query
        /// result, or <c>null</c> when no statistics are available or no usable
        /// (distinct, non-equal) bounds can be determined.
        /// </returns>
        protected async override Task <List <HistogramWithCounts>?> Explore()
        {
            var stats = await statsResultProvider.ResultAsync;

            if (stats == null)
            {
                return null;
            }

            var (minBound, maxBound) = (stats.Min, stats.Max);
            if (!minBound.HasValue || !maxBound.HasValue)
            {
                // The stats did not provide both bounds; fall back to the distinct values.
                var distincts = await distinctValuesProvider.ResultAsync;
                if (distincts == null || distincts.ValueCounts.NonSuppressedNonNullCount == 0)
                {
                    return null;
                }

                // Materialize once so the rows are not re-enumerated for Min and Max.
                var values = distincts.DistinctRows
                             .Where(row => row.HasValue)
                             .Select(row => row.Value.GetDouble())
                             .ToList();

                // Min()/Max() throw on an empty sequence; leave the bounds unset instead
                // so the check below reports the problem and returns null.
                if (values.Count > 0)
                {
                    minBound ??= values.Min();
                    maxBound ??= values.Max();
                }
            }

            if (!minBound.HasValue || !maxBound.HasValue || minBound == maxBound)
            {
                // Pass the column as a structured-logging argument: the previous message
                // contained the literal placeholder "{Context.Column}" with no argument,
                // so the column name never appeared in the log.
                Logger.LogWarning("Unable to calculate suitable bounds for numerical column {Column}.", Context.Column);

                return null;
            }

            var bucketsToSample = BucketUtils.EstimateBucketResolutions(
                stats.Count,
                (double)minBound,
                (double)maxBound,
                ValuesPerBucketTarget,
                isIntegerColumn: Context.ColumnInfo.Type == DValueType.Integer);

            var histogramQ = await Context.Exec(new SingleColumnHistogram(bucketsToSample));

            var histograms = Histogram.FromQueryRows(histogramQ.Rows);

            // One ValueCounts summary per bucket size present in the query rows.
            var valueCounts = histogramQ.Rows
                              .GroupBy(
                row => row.BucketSize,
                (bs, rows) => (BucketSize: new BucketSize(bs), Rows: ValueCounts.Compute(rows)));

            // Pair each summary with the histogram of the same snapped bucket size.
            return valueCounts
                   .Join(
                       histograms,
                       v => v.BucketSize.SnappedSize,
                       h => h.BucketSize.SnappedSize,
                       (v, h) => new HistogramWithCounts(v.Rows, h))
                   .ToList();
        }
Beispiel #4
0
        /// <summary>
        /// Lazily yields each label group of <paramref name="queryResult"/> together
        /// with its computed value counts, stopping at the first group whose
        /// suppressed-row ratio exceeds <c>SuppressedRatioThreshold</c>.
        /// </summary>
        /// <param name="queryResult">The grouping-sets rows to process.</param>
        /// <returns>Pairs of value counts and the group they were computed from.</returns>
        private static IEnumerable <(ValueCounts, IGrouping <string, GroupingSetsResult <DateTime> >)> ProcessLinearBuckets(
            IEnumerable <GroupingSetsResult <DateTime> > queryResult)
        {
            foreach (var labelGroup in TimeUtilities.GroupByLabel(queryResult))
            {
                var groupCounts = ValueCounts.Compute(labelGroup);

                // The first group over the threshold ends the sequence; later groups are not examined.
                if (groupCounts.SuppressedRowRatio > SuppressedRatioThreshold)
                {
                    yield break;
                }

                yield return (groupCounts, labelGroup);
            }
        }
        /// <summary>
        /// Verifies that trimming e-mail address characters from the "lastname" column
        /// of the "cards" table does not classify the column as e-mail addresses.
        /// </summary>
        /// <remarks>
        /// Returns a task rather than <c>async void</c> so the caller can await
        /// completion and observe assertion failures and exceptions.
        /// </remarks>
        public async System.Threading.Tasks.Task TestEmailNegative()
        {
            using var queryScope = testFixture.SimpleQueryTestScope(
                      TestDataSource,
                      VcrSharp.Cassette.GenerateVcrFilename(this));

            var result = await queryScope.QueryRows(
                new TextColumnTrim("cards", "lastname", TextColumnTrimType.Both, Constants.EmailAddressChars));

            var counts = ValueCounts.Compute(result);

            // The column counts as e-mail only if every row is null or trims down to a bare "@".
            var isEmail = counts.TotalCount == result
                          .Where(r => r.IsNull || r.Value == "@")
                          .Sum(r => r.Count);

            Assert.False(isEmail);
        }
 /// <summary>
 /// Creates a result holding the distinct rows and the value counts computed from them.
 /// </summary>
 /// <param name="distinctRows">Distinct values together with their counts.</param>
 public Result(IEnumerable <ValueWithCount <JsonElement> > distinctRows)
 {
     // Snapshot the rows once. The incoming sequence may be a lazy query, and it is
     // consumed here for the counts and again by every reader of DistinctRows.
     IEnumerable <ValueWithCount <JsonElement> > materializedRows =
         new List <ValueWithCount <JsonElement> >(distinctRows);

     DistinctRows = materializedRows;
     ValueCounts  = ValueCounts.Compute(materializedRows);
 }
Beispiel #7
0
        /// <summary>
        /// Builds a data item by consuming markup from <paramref name="sr"/>. Tags listed in
        /// <c>tagsToProcess</c> are tracked on a stack; text found under the header tag becomes
        /// the header, text found under the data tag is added as a value, and a table tag
        /// opened while inside the data tag is parsed recursively into a child item.
        /// </summary>
        /// <param name="sr">Reader positioned at the markup for this item; it is advanced as the item is parsed.</param>
        /// <param name="parentDocument">Document this item belongs to; also passed to nested child items.</param>
        /// <param name="parentDataItem">Item that contains this one, stored as-is.</param>
        public ParseDataItem(StreamReader sr, ParseDocument parentDocument, ParseDataItem parentDataItem)
        {
            ParentDocument = parentDocument;
            ParentDataItem = parentDataItem;

            // nextTag carries a tag that was already read from the stream while scanning text,
            // so the following loop iteration processes it instead of reading a new character.
            string         nextTag = string.Empty;
            // Number of child items added since the last value was recorded.
            int            associatedChildCount = 0;
            // Currently open tags (only those contained in tagsToProcess are pushed).
            Stack <string> tagStack             = new Stack <string>();

            do
            {
                // NOTE(review): Peek() returns -1 at end of stream; "> 0" additionally stops on a
                // NUL character - confirm that is intended.
                while (sr.Peek() > 0)
                {
                    char firstChar;
                    if (string.IsNullOrEmpty(nextTag))
                    {
                        // No pending tag: read the next character, skipping control characters.
                        firstChar = (char)sr.Read();
                        while (sr.Peek() > 0 && char.IsControl(firstChar))
                        {
                            firstChar = (char)sr.Read();
                        }
                    }
                    else
                    {
                        firstChar = nextTag[0];
                    }

                    // For a pending closing tag (starts with '/'), extract its bare tag name into toTest.
                    string toTest     = string.Empty;
                    string toTestNode = firstChar.Equals('/') ? nextTag : string.Empty;
                    if (!string.IsNullOrEmpty(toTestNode))
                    {
                        if (toTestNode.Contains(" ") || toTestNode.StartsWith("br"))
                        {
                            toTest = toTestNode.Substring(toTestNode.StartsWith("/") ? 1 : 0, toTestNode.StartsWith("/") ? toTestNode.IndexOf(" ") - 1 : toTestNode.IndexOf(" ")).Trim();
                        }
                        else
                        {
                            toTest = toTestNode.Substring(toTestNode.StartsWith("/") ? 1 : 0, toTestNode.StartsWith("/") ? toTestNode.IndexOf(">") - 1 : toTestNode.IndexOf(">")).Trim();
                        }
                    }
                    // Tag branch: taken for an opening '<', or for a '/' that satisfies one of the
                    // stack/header conditions below.
                    // NOTE(review): tagStack.Peek() throws InvalidOperationException on an empty
                    // stack; this looks reachable if a '/' arrives before any tag was pushed - confirm.
                    if (firstChar.Equals('<') ||
                        ((firstChar.Equals('/') && tagStack.Peek() != DATA_TAG) ||
                         (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && HasData && (!string.IsNullOrEmpty(toTest) && !tagsToProcess.Contains(toTest))) ||
                         (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && HasData && reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim())) ||
                         (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && !reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim()))))
                    {
                        // Take the tag text from the stream, or from the pending tag with its leading '<' removed.
                        string tagNode = string.IsNullOrEmpty(nextTag) ? StreamReaderExtensions.ReadUntil(sr, '>') : nextTag.StartsWith("<") ? nextTag.Substring(1) : nextTag;
                        nextTag = string.Empty;

                        bool   isOpenNode = !(firstChar.Equals('/') || tagNode.StartsWith("/"));
                        string tag        = string.Empty;
                        // Reduce the tag text to the bare tag name (without '/', attributes or '>').
                        // NOTE(review): for text that starts with "br" but contains no space,
                        // IndexOf(" ") is -1 and Substring would throw - confirm such input cannot occur.
                        if (tagNode.Contains(" ") || tagNode.StartsWith("br"))
                        {
                            tag = tagNode.Substring(tagNode.StartsWith("/") ? 1 : 0, tagNode.StartsWith("/") ? tagNode.IndexOf(" ") - 1 : tagNode.IndexOf(" ")).Trim();
                        }
                        else
                        {
                            tag = tagNode.Substring(tagNode.StartsWith("/") ? 1 : 0, tagNode.StartsWith("/") ? tagNode.IndexOf(">") - 1 : tagNode.IndexOf(">")).Trim();
                        }

                        if (tagStack.Any() && tagStack.Peek().Trim().ToLower() == DATA_TAG && tag.Trim().ToLower() == TABLE_TAG)
                        {
                            // A table opened inside a data cell is parsed recursively from the same
                            // reader; the child is kept only if it ended up holding data.
                            if (isOpenNode)
                            {
                                ParseDataItem newChild = new ParseDataItem(sr, ParentDocument, this);
                                if (newChild.HasData)
                                {
                                    AddChild(newChild);
                                    associatedChildCount++;
                                }
                            }
                        }
                        else
                        {
                            if (tagsToProcess.Contains(tag))
                            {
                                if (isOpenNode)
                                {
                                    tagStack.Push(tag);
                                }
                                else
                                {
                                    // Closing tag: pop only when it matches the tag on top of the stack.
                                    if (tagStack.Peek().Equals(tag))
                                    {
                                        tagStack.Pop();
                                    }
                                    if (tag.Equals(DATA_TAG))
                                    {
                                        // End of a data cell: attribute the children collected so far
                                        // to the last recorded value, then reset the counter.
                                        if (HTMLDecodedValues != null && HTMLDecodedValues.Any())
                                        {
                                            ValueCounts.Last().AssociatedChildCount = associatedChildCount;
                                        }
                                        associatedChildCount = 0;
                                    }
                                }
                            }
                            // All tracked tags are closed: leave the read loop; the outer loop
                            // condition then ends parsing of this item.
                            if (!tagStack.Any())
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // Text branch: only processed while at least one tracked tag is open.
                        if (tagStack != null && tagStack.Count > 0)
                        {
                            bool done          = false;
                            int  dataLoopCount = 0;
                            do
                            {
                                // "control" is the text that would result if firstChar were directly
                                // followed by '<'; it is empty when firstChar is a digit, 'X' or 'Y',
                                // so such single characters are not filtered out by the check below.
                                string control = Int32.TryParse(firstChar.ToString(), out int temp) || firstChar.ToString().ToUpper().Equals("X") ||
                                                 firstChar.ToString().ToUpper().Equals("Y") ? string.Empty : firstChar + "<";
                                // firstChar was already consumed, so it is prepended on the first pass only.
                                string text = dataLoopCount == 0 ? firstChar + StreamReaderExtensions.ReadUntil(sr, '<') : StreamReaderExtensions.ReadUntil(sr, '<');
                                if (!text.Equals(control) && !text.StartsWith("br /") && !text.Equals("<"))
                                {
                                    // Drop the last character (presumably the '<' delimiter returned by
                                    // ReadUntil - confirm) and strip control characters.
                                    text = new string(text.Substring(0, text.Length - 1).Where(c => !char.IsControl(c)).ToArray());
                                    if (!string.IsNullOrEmpty(text))
                                    {
                                        if (tagStack.Peek().Trim().ToLower() == HEADER_TAG)
                                        {
                                            HTMLDecodedHeader = text;
                                        }
                                        else if (tagStack.Peek().Trim().ToLower() == DATA_TAG)
                                        {
                                            // Before adding a new value, attribute the children collected
                                            // so far to the previous value and reset the counter.
                                            if (HTMLDecodedValues != null && HTMLDecodedValues.Any())
                                            {
                                                ValueCounts.Last().AssociatedChildCount = associatedChildCount;
                                            }
                                            associatedChildCount = 0;
                                            AddValue(text);
                                        }
                                    }
                                    // Read the tag that follows the text; a line-break tag means more
                                    // text of the same cell follows, so keep looping in that case.
                                    nextTag = StreamReaderExtensions.ReadUntil(sr, '>');
                                    if (!nextTag.StartsWith("br /") && !nextTag.StartsWith("br/"))
                                    {
                                        done = true;
                                    }
                                    dataLoopCount++;
                                }
                                else
                                {
                                    // No usable text: remember the following tag for the next iteration.
                                    nextTag = StreamReaderExtensions.ReadUntil(sr, '>');

                                    // If it is a tracked tag name, restore the leading '<' so the
                                    // next iteration handles it as an opening tag.
                                    string tempTag = nextTag;
                                    tempTag = tempTag.Replace(">", "");
                                    if (tagsToProcess.Contains(tempTag))
                                    {
                                        nextTag = "<" + nextTag;
                                    }
                                    done = true;
                                }
                            } while (!done);
                        }
                    }
                }
            } while (tagStack.Any() && sr.Peek() > 0);
        }
Beispiel #8
0
 /// <summary>
 /// Creates a result from the distinct rows and an optional distribution of decimal counts.
 /// </summary>
 /// <param name="distinctRows">Distinct values with their counts; copied into a list.</param>
 /// <param name="decimalsCountDistribution">Distribution of decimal counts, or <c>null</c>.</param>
 public Result(IEnumerable <ValueWithCount <JsonElement> > distinctRows, NumericDistribution? decimalsCountDistribution)
 {
     this.DecimalsCountDistribution = decimalsCountDistribution;

     // Snapshot the rows first; the counts are then computed from the stored snapshot.
     this.DistinctRows = distinctRows.ToList();
     this.ValueCounts = ValueCounts.Compute(this.DistinctRows);
 }
Beispiel #9
0
 /// <summary>
 /// Tells whether the suppressed-count ratio of <paramref name="counts"/> exceeds
 /// <c>SuppressedRatioThreshold</c>.
 /// </summary>
 /// <param name="counts">The value counts to inspect.</param>
 /// <returns><c>true</c> when the ratio is strictly greater than the threshold.</returns>
 internal static bool TooManySuppressedValues(ValueCounts counts)
 {
     var suppressedRatio = counts.SuppressedCountRatio;
     return suppressedRatio > SuppressedRatioThreshold;
 }
Beispiel #10
0
 /// <summary>
 /// Pairs a histogram with the value counts computed for it.
 /// </summary>
 /// <param name="valueCounts">Value counts stored in <c>ValueCounts</c>.</param>
 /// <param name="histogram">Histogram stored in <c>Histogram</c>.</param>
 internal HistogramWithCounts(ValueCounts valueCounts, Histogram histogram)
 {
     (ValueCounts, Histogram) = (valueCounts, histogram);
 }
 /// <summary>
 /// Creates a result from precomputed value counts and their histogram.
 /// </summary>
 /// <param name="valueCounts">Value counts stored in <c>ValueCounts</c>.</param>
 /// <param name="histogram">Histogram stored in <c>Histogram</c>.</param>
 internal Result(ValueCounts valueCounts, Histogram histogram)
 {
     this.ValueCounts = valueCounts;
     this.Histogram = histogram;
 }