/// <summary>
/// Samples histograms for the column at several candidate bucket sizes and picks one of them.
/// </summary>
/// <returns>
/// The candidate with the smallest snapped bucket size; candidates sharing that size are ranked by
/// ascending suppressed count.
/// </returns>
protected async override Task<Result> Explore()
{
    var columnStats = await statsResultProvider.ResultAsync;

    // Candidate bucket sizes derived from the column statistics.
    var candidateSizes = BucketUtils.EstimateBucketResolutions(
        columnStats.Count,
        columnStats.Min,
        columnStats.Max,
        ValuesPerBucketTarget,
        isIntegerColumn: ctx.ColumnType == DValueType.Integer);

    var queryResult = await conn.Exec(new SingleColumnHistogram(ctx.Table, ctx.Column, candidateSizes));
    var histograms = Histogram.FromQueryRows(queryResult.Rows);

    // One ValueCounts summary per bucket size returned by the query.
    var countsPerBucketSize = queryResult.Rows.GroupBy(
        row => row.BucketSize,
        (size, rowsOfSize) => (BucketSize: new BucketSize(size), Rows: ValueCounts.Compute(rowsOfSize)));

    // Pair every summary with the histogram that has the same snapped bucket size.
    var candidates =
        from counted in countsPerBucketSize
        join histogram in histograms
            on counted.BucketSize.SnappedSize equals histogram.BucketSize.SnappedSize
        select new Result(counted.Rows, histogram);

    var ranked = candidates
        .OrderBy(candidate => candidate.BucketSize.SnappedSize)
        .ThenBy(candidate => candidate.ValueCounts.SuppressedCount);

    return ranked.First();
}
/// <summary>
/// Builds a result from distinct integer values and their counts.
/// </summary>
/// <param name="distinctRows">Distinct values with counts; rows without a value are left out of the distribution.</param>
internal Result(IEnumerable<ValueWithCount<JsonElement>> distinctRows)
{
    // Materialise once: the rows are consumed both for the counts and for the distribution.
    var rows = distinctRows.ToList();

    ValueCounts = ValueCounts.Compute(rows);

    // Read each value as a 64-bit integer exactly once and sort on that same number.
    // Sorting on GetInt32() threw for values outside the Int32 range even though the
    // distribution itself stores 64-bit values.
    Distribution = rows
        .Where(r => r.HasValue)
        .Select(r => (r.Value.GetInt64(), r.Count))
        .OrderBy(entry => entry.Item1)
        .ToList();
}
/// <summary>
/// Samples histograms for a numeric column at several candidate bucket sizes.
/// </summary>
/// <returns>
/// One <c>HistogramWithCounts</c> per bucket size for which both value counts and a histogram exist,
/// or <c>null</c> when the statistics are unavailable or no usable min/max bounds can be determined.
/// </returns>
protected async override Task<List<HistogramWithCounts>?> Explore()
{
    var stats = await statsResultProvider.ResultAsync;
    if (stats == null)
    {
        return null;
    }

    var (minBound, maxBound) = (stats.Min, stats.Max);
    if (!minBound.HasValue || !maxBound.HasValue)
    {
        // The statistics did not provide both bounds: fall back to the distinct values.
        var distincts = await distinctValuesProvider.ResultAsync;
        if (distincts == null || distincts.ValueCounts.NonSuppressedNonNullCount == 0)
        {
            return null;
        }

        var values = distincts.DistinctRows
            .Where(row => row.HasValue)
            .Select(row => row.Value.GetDouble());
        minBound ??= values.Min();
        maxBound ??= values.Max();
    }

    if (!minBound.HasValue || !maxBound.HasValue || minBound == maxBound)
    {
        // Pass the column as a template argument; the previous message contained the placeholder
        // text but no argument, so the column name never reached the log.
        Logger.LogWarning("Unable to calculate suitable bounds for numerical column {Column}.", Context.Column);
        return null;
    }

    var bucketsToSample = BucketUtils.EstimateBucketResolutions(
        stats.Count,
        (double)minBound,
        (double)maxBound,
        ValuesPerBucketTarget,
        isIntegerColumn: Context.ColumnInfo.Type == DValueType.Integer);

    var histogramQ = await Context.Exec(new SingleColumnHistogram(bucketsToSample));
    var histograms = Histogram.FromQueryRows(histogramQ.Rows);

    // One ValueCounts summary per bucket size returned by the query.
    var valueCounts = histogramQ.Rows
        .GroupBy(
            row => row.BucketSize,
            (bs, rows) => (BucketSize: new BucketSize(bs), Rows: ValueCounts.Compute(rows)));

    // Pair every summary with the histogram that has the same snapped bucket size.
    return valueCounts
        .Join(
            histograms,
            v => v.BucketSize.SnappedSize,
            h => h.BucketSize.SnappedSize,
            (v, h) => new HistogramWithCounts(v.Rows, h))
        .ToList();
}
/// <summary>
/// Lazily walks the label groups of <paramref name="queryResult"/> and yields each group together with
/// its value counts, stopping at the first group whose suppressed row ratio exceeds
/// <c>SuppressedRatioThreshold</c>.
/// </summary>
/// <param name="queryResult">Grouping-sets rows to be grouped by label.</param>
/// <returns>Pairs of (value counts, group) for the groups preceding the first over-threshold group.</returns>
private static IEnumerable<(ValueCounts, IGrouping<string, GroupingSetsResult<DateTime>>)> ProcessLinearBuckets(
    IEnumerable<GroupingSetsResult<DateTime>> queryResult)
{
    foreach (var labelledGroup in TimeUtilities.GroupByLabel(queryResult))
    {
        var groupCounts = ValueCounts.Compute(labelledGroup);
        var overThreshold = groupCounts.SuppressedRowRatio > SuppressedRatioThreshold;

        if (overThreshold)
        {
            // This group and every group after it are dropped.
            yield break;
        }

        yield return (groupCounts, labelledGroup);
    }
}
// Checks that trimming e-mail address characters from cards.lastname does not leave only
// null or "@" values, i.e. the column is not classified as an e-mail column.
// Returns Task instead of void so the test runner can await the test and observe failures.
public async System.Threading.Tasks.Task TestEmailNegative()
{
    using var queryScope = testFixture.SimpleQueryTestScope(
        TestDataSource,
        VcrSharp.Cassette.GenerateVcrFilename(this));

    var result = await queryScope.QueryRows(
        new TextColumnTrim("cards", "lastname", TextColumnTrimType.Both, Constants.EmailAddressChars));

    var counts = ValueCounts.Compute(result);

    // The column would count as e-mail only if every row were either null or exactly "@".
    var isEmail = counts.TotalCount == result
        .Where(r => r.IsNull || r.Value == "@")
        .Sum(r => r.Count);

    Assert.False(isEmail);
}
/// <summary>
/// Builds a result holding the distinct rows and the value counts computed from them.
/// </summary>
/// <param name="distinctRows">Distinct values with their counts.</param>
public Result(IEnumerable<ValueWithCount<JsonElement>> distinctRows)
{
    // Copy the rows once so that a deferred sequence is not re-evaluated by Compute below
    // and again by every later reader of DistinctRows.
    var rows = new List<ValueWithCount<JsonElement>>(distinctRows);

    DistinctRows = rows;
    ValueCounts = ValueCounts.Compute(rows);
}
/// <summary>
/// Builds a data item by reading markup from <paramref name="sr"/> until the stack of open tags
/// tracked by this constructor becomes empty or the reader has nothing left to peek.
/// </summary>
/// <param name="sr">Reader to consume; nested items created here read from this same reader.</param>
/// <param name="parentDocument">Stored in <c>ParentDocument</c> and handed on to nested items.</param>
/// <param name="parentDataItem">Stored in <c>ParentDataItem</c>.</param>
/// <remarks>
/// Summary of the loop below:
/// - Control characters between tokens are skipped.
/// - Tags listed in <c>tagsToProcess</c> are pushed on open and popped on a matching close.
/// - An opening TABLE_TAG met while the top of the stack is DATA_TAG constructs a nested ParseDataItem
///   from the same reader; it is added via AddChild only when it reports HasData.
/// - Text read while the top of the stack is HEADER_TAG is assigned to HTMLDecodedHeader;
///   text read while it is DATA_TAG is passed to AddValue.
/// - Before a new value is added, and when DATA_TAG closes, the last entry of the ValueCounts collection
///   receives the number of children added since the previous value, and that counter is reset.
/// - A tag starting with "br /" or "br/" after text keeps the text loop running instead of ending it.
/// </remarks>
// NOTE(review): `sr.Peek() > 0` also stops on a NUL character (0), not only at end of stream (-1) - confirm intended.
// NOTE(review): the main condition calls `tagStack.Peek()` whenever firstChar is '/' without checking that the
// stack is non-empty; Stack<T>.Peek throws on an empty stack - confirm input can never reach that state.
public ParseDataItem(StreamReader sr, ParseDocument parentDocument, ParseDataItem parentDataItem) { ParentDocument = parentDocument; ParentDataItem = parentDataItem; string nextTag = string.Empty; int associatedChildCount = 0; Stack <string> tagStack = new Stack <string>(); do { while (sr.Peek() > 0) { char firstChar; if (string.IsNullOrEmpty(nextTag)) { firstChar = (char)sr.Read(); while (sr.Peek() > 0 && char.IsControl(firstChar)) { firstChar = (char)sr.Read(); } } else { firstChar = nextTag[0]; } string toTest = string.Empty; string toTestNode = firstChar.Equals('/') ? nextTag : string.Empty; if (!string.IsNullOrEmpty(toTestNode)) { if (toTestNode.Contains(" ") || toTestNode.StartsWith("br")) { toTest = toTestNode.Substring(toTestNode.StartsWith("/") ? 1 : 0, toTestNode.StartsWith("/") ? toTestNode.IndexOf(" ") - 1 : toTestNode.IndexOf(" ")).Trim(); } else { toTest = toTestNode.Substring(toTestNode.StartsWith("/") ? 1 : 0, toTestNode.StartsWith("/") ? toTestNode.IndexOf(">") - 1 : toTestNode.IndexOf(">")).Trim(); } } if (firstChar.Equals('<') || ((firstChar.Equals('/') && tagStack.Peek() != DATA_TAG) || (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && HasData && (!string.IsNullOrEmpty(toTest) && !tagsToProcess.Contains(toTest))) || (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && HasData && reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim())) || (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && !reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim())))) { string tagNode = string.IsNullOrEmpty(nextTag) ? StreamReaderExtensions.ReadUntil(sr, '>') : nextTag.StartsWith("<") ? nextTag.Substring(1) : nextTag; nextTag = string.Empty; bool isOpenNode = !(firstChar.Equals('/') || tagNode.StartsWith("/")); string tag = string.Empty; if (tagNode.Contains(" ") || tagNode.StartsWith("br")) { tag = tagNode.Substring(tagNode.StartsWith("/") ? 1 : 0, tagNode.StartsWith("/") ? 
tagNode.IndexOf(" ") - 1 : tagNode.IndexOf(" ")).Trim(); } else { tag = tagNode.Substring(tagNode.StartsWith("/") ? 1 : 0, tagNode.StartsWith("/") ? tagNode.IndexOf(">") - 1 : tagNode.IndexOf(">")).Trim(); } if (tagStack.Any() && tagStack.Peek().Trim().ToLower() == DATA_TAG && tag.Trim().ToLower() == TABLE_TAG) { if (isOpenNode) { ParseDataItem newChild = new ParseDataItem(sr, ParentDocument, this); if (newChild.HasData) { AddChild(newChild); associatedChildCount++; } } } else { if (tagsToProcess.Contains(tag)) { if (isOpenNode) { tagStack.Push(tag); } else { if (tagStack.Peek().Equals(tag)) { tagStack.Pop(); } if (tag.Equals(DATA_TAG)) { if (HTMLDecodedValues != null && HTMLDecodedValues.Any()) { ValueCounts.Last().AssociatedChildCount = associatedChildCount; } associatedChildCount = 0; } } } if (!tagStack.Any()) { break; } } } else { if (tagStack != null && tagStack.Count > 0) { bool done = false; int dataLoopCount = 0; do { string control = Int32.TryParse(firstChar.ToString(), out int temp) || firstChar.ToString().ToUpper().Equals("X") || firstChar.ToString().ToUpper().Equals("Y") ? string.Empty : firstChar + "<"; string text = dataLoopCount == 0 ? 
firstChar + StreamReaderExtensions.ReadUntil(sr, '<') : StreamReaderExtensions.ReadUntil(sr, '<'); if (!text.Equals(control) && !text.StartsWith("br /") && !text.Equals("<")) { text = new string(text.Substring(0, text.Length - 1).Where(c => !char.IsControl(c)).ToArray()); if (!string.IsNullOrEmpty(text)) { if (tagStack.Peek().Trim().ToLower() == HEADER_TAG) { HTMLDecodedHeader = text; } else if (tagStack.Peek().Trim().ToLower() == DATA_TAG) { if (HTMLDecodedValues != null && HTMLDecodedValues.Any()) { ValueCounts.Last().AssociatedChildCount = associatedChildCount; } associatedChildCount = 0; AddValue(text); } } nextTag = StreamReaderExtensions.ReadUntil(sr, '>'); if (!nextTag.StartsWith("br /") && !nextTag.StartsWith("br/")) { done = true; } dataLoopCount++; } else { nextTag = StreamReaderExtensions.ReadUntil(sr, '>'); string tempTag = nextTag; tempTag = tempTag.Replace(">", ""); if (tagsToProcess.Contains(tempTag)) { nextTag = "<" + nextTag; } done = true; } } while (!done); } } } } while (tagStack.Any() && sr.Peek() > 0); }
/// <summary>
/// Builds a result from the distinct rows and an optional distribution of decimals counts.
/// </summary>
/// <param name="distinctRows">Distinct values with their counts; copied into a list on construction.</param>
/// <param name="decimalsCountDistribution">Distribution stored as-is; may be <c>null</c>.</param>
public Result(
    IEnumerable<ValueWithCount<JsonElement>> distinctRows,
    NumericDistribution? decimalsCountDistribution)
{
    DecimalsCountDistribution = decimalsCountDistribution;

    // The counts are computed from the stored copy rather than from the incoming sequence.
    DistinctRows = distinctRows.ToList();
    ValueCounts = ValueCounts.Compute(DistinctRows);
}
/// <summary>
/// Tells whether the suppressed count ratio of <paramref name="counts"/> is above <c>SuppressedRatioThreshold</c>.
/// </summary>
/// <param name="counts">Value counts to inspect.</param>
/// <returns><c>true</c> when the ratio is strictly greater than the threshold.</returns>
internal static bool TooManySuppressedValues(ValueCounts counts)
{
    var suppressedRatio = counts.SuppressedCountRatio;
    return suppressedRatio > SuppressedRatioThreshold;
}
/// <summary>
/// Pairs a histogram with the value counts that belong to it.
/// </summary>
/// <param name="valueCounts">Stored in <c>ValueCounts</c>.</param>
/// <param name="histogram">Stored in <c>Histogram</c>.</param>
internal HistogramWithCounts(ValueCounts valueCounts, Histogram histogram)
    => (ValueCounts, Histogram) = (valueCounts, histogram);
/// <summary>
/// Builds a result from precomputed value counts and the histogram they describe.
/// </summary>
/// <param name="valueCounts">Stored in <c>ValueCounts</c>.</param>
/// <param name="histogram">Stored in <c>Histogram</c>.</param>
internal Result(ValueCounts valueCounts, Histogram histogram)
    => (ValueCounts, Histogram) = (valueCounts, histogram);