public static TryCatch<AggregateValue> TryCreate(
    AggregateOperator aggregateOperator,
    string continuationToken)
{
    TryCatch<IAggregator> tryCreateAggregator;
    switch (aggregateOperator)
    {
        case AggregateOperator.Average:
            tryCreateAggregator = AverageAggregator.TryCreate(continuationToken);
            break;

        case AggregateOperator.Count:
            tryCreateAggregator = CountAggregator.TryCreate(continuationToken);
            break;

        case AggregateOperator.Max:
            tryCreateAggregator = MinMaxAggregator.TryCreateMaxAggregator(continuationToken);
            break;

        case AggregateOperator.Min:
            tryCreateAggregator = MinMaxAggregator.TryCreateMinAggregator(continuationToken);
            break;

        case AggregateOperator.Sum:
            tryCreateAggregator = SumAggregator.TryCreate(continuationToken);
            break;

        default:
            throw new ArgumentException($"Unknown {nameof(AggregateOperator)}: {aggregateOperator}.");
    }

    return tryCreateAggregator.Try<AggregateValue>((aggregator) => new AggregateAggregateValue(aggregator));
}
public static AggregateAggregateValue Create(
    AggregateOperator aggregateOperator,
    string continuationToken)
{
    IAggregator aggregator;
    switch (aggregateOperator)
    {
        case AggregateOperator.Average:
            aggregator = AverageAggregator.Create(continuationToken);
            break;

        case AggregateOperator.Count:
            aggregator = CountAggregator.Create(continuationToken);
            break;

        case AggregateOperator.Max:
            aggregator = MinMaxAggregator.CreateMaxAggregator(continuationToken);
            break;

        case AggregateOperator.Min:
            aggregator = MinMaxAggregator.CreateMinAggregator(continuationToken);
            break;

        case AggregateOperator.Sum:
            aggregator = SumAggregator.Create(continuationToken);
            break;

        default:
            throw new ArgumentException($"Unknown {nameof(AggregateOperator)}: {aggregateOperator}.");
    }

    return new AggregateAggregateValue(aggregator);
}
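// Usage sketch (not from the source): resuming a Sum aggregation from a previously
// saved continuation token via the TryCreate factory above. The Succeeded/Result
// member names on TryCatch<T> are assumptions for illustration; only the Try<T>
// method is confirmed by these snippets.
public static void ResumeSumAggregation(string savedToken)
{
    TryCatch<AggregateValue> tryAggregate = AggregateAggregateValue.TryCreate(
        AggregateOperator.Sum,
        continuationToken: savedToken);

    if (tryAggregate.Succeeded)                                // assumed member name
    {
        AggregateValue aggregateValue = tryAggregate.Result;   // assumed member name
        // ... feed result pages into the aggregate value ...
    }
    else
    {
        // A malformed token surfaces here as a captured failure rather than a throw,
        // which is the difference between TryCreate and the throwing Create overload.
    }
}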
private bool ShouldStopEarly(GroupByDictionary dictionary, CountAggregator counter)
{
    // If every value was unique so far, stop
    if (dictionary.Count == counter.TotalRowCount)
    {
        return true;
    }

    // If any value had enough rows to report, keep going
    XArray counts = counter.Values;
    int[] countsArray = (int[])counts.Array;
    int threshold = (int)(counter.TotalRowCount * MinimumPercentageToReport);

    for (int i = 0; i < counts.Count; ++i)
    {
        int count = countsArray[counts.Index(i)];
        if (count >= threshold)
        {
            return false;
        }
    }

    // Otherwise, stop (not all unique, but no values in > 0.5% of rows)
    return true;
}
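// Worked example (not from the source): the early-stop threshold with an assumed
// MinimumPercentageToReport of 0.005, the "0.5% of rows" the closing comment refers to.
// With 200,000 rows counted so far:
//     threshold = (int)(200_000 * 0.005) = 1,000
// Any bucket holding >= 1,000 rows keeps the scan alive; if no bucket reaches the
// threshold (and not every value was unique), the distribution is too flat to report
// and ShouldStopEarly returns true.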
public void Constructor_WithFilterAndAggregator_PropertiesAreSet()
{
    var filter = new SelectorFilter("test", "value");
    var agg = new CountAggregator("test");
    var aggregator = new FilteredAggregator(filter, agg);

    Assert.That(aggregator.Filter, Is.EqualTo(filter));
    Assert.That(aggregator.Aggregator, Is.EqualTo(agg));
}
public void CountAggregator()
{
    var cnt = new CountAggregator();

    // A count aggregator counts pushes; the pushed value (5) is ignored,
    // so ten pushes yield a count of 10.
    for (int i = 0; i < 10; i++)
    {
        cnt.Push(5, null);
    }

    Assert.Equal(10, Convert.ToInt32(cnt.Value));
}
private static void AddStatsMetric(CountAggregator aggregator, string statName, double value)
{
    var statsMetric = new StatsMetric
    {
        MetricType = MetricType.Count,
        StatName = statName,
        NumericValue = value,
    };
    aggregator.OnNewValue(ref statsMetric);
}
public StatsRouter(
    Serializers serializers,
    BufferBuilder bufferBuilder,
    Aggregators optionalAggregators)
{
    _serializers = serializers;
    _bufferBuilder = bufferBuilder;

    if (optionalAggregators != null)
    {
        _optionalCountAggregator = optionalAggregators.OptionalCount;
        _optionalGaugeAggregator = optionalAggregators.OptionalGauge;
        _optionalSetAggregator = optionalAggregators.OptionalSet;
    }
}
public void OnNewValue()
{
    var handler = new BufferBuilderHandlerMock();
    var aggregator = new CountAggregator(MetricAggregatorParametersFactory.Create(handler.Object));

    AddStatsMetric(aggregator, "s1", 1);
    AddStatsMetric(aggregator, "s1", 2);
    AddStatsMetric(aggregator, "s2", 2);
    aggregator.TryFlush(force: true);
    Assert.AreEqual("s1:3|c,s2:2|c", handler.Value);

    AddStatsMetric(aggregator, "s3", 1);
    aggregator.TryFlush(force: true);
    Assert.AreEqual("s3:1|c", handler.Value);
}
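// Note (not from the source): the expected strings are StatsD counter datagrams in the
// name:value|c format. The two "s1" values (1 and 2) are summed into a single "s1:3|c"
// entry before flushing, and each TryFlush(force: true) drains the aggregated state,
// which is why the second flush emits only "s3:1|c".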
private void BuildSingleEnumColumnDictionary(CancellationToken cancellationToken)
{
    XArray values = _keyColumns[0].ValuesGetter()();
    Func<XArray> indicesGetter = _keyColumns[0].IndicesCurrentGetter();

    // Find or construct an aggregator which can track which enum values ended up with any rows in the result
    IFoundIndicesTracker tracker = (IFoundIndicesTracker)_aggregators.FirstOrDefault((agg) => agg is IFoundIndicesTracker);
    bool trackerFound = (tracker != null);
    if (!trackerFound)
    {
        tracker = new CountAggregator();
    }

    int count;
    while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
    {
        // Aggregate each row directly on the row index (already a small zero-based value)
        XArray indices = indicesGetter();
        for (int i = 0; i < _aggregators.Length; ++i)
        {
            _aggregators[i].Add(indices, values.Count);
        }

        if (!trackerFound)
        {
            tracker.Add(indices, values.Count);
        }
    }

    // Figure out which rows had matches
    ArraySelector foundValuesSelector = tracker.FoundIndices;

    // Store the distinct count now that we know it
    _distinctCount = foundValuesSelector.Count;

    // Once the loop is done, get the distinct values and aggregation results
    _columns[0].SetValues(values.Reselect(foundValuesSelector));
    for (int i = 0; i < _aggregators.Length; ++i)
    {
        _columns[i + 1].SetValues(_aggregators[i].Values.Reselect(foundValuesSelector));
    }
}
private void BuildSingleEnumColumnDictionary(CancellationToken cancellationToken)
{
    // Build a CountAggregator for the enum GroupBy
    CountAggregator counts = new CountAggregator();
    XArray values = _column.ValuesGetter()();
    Func<XArray> indicesGetter = _column.IndicesCurrentGetter();

    int count;
    while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
    {
        // Aggregate each row directly on the row index (already a small zero-based value)
        XArray indices = indicesGetter();
        counts.Add(indices, values.Count);
    }

    // Once the loop is done, get the distinct values and aggregation results
    PostSortAndFilter(values, counts.Values, counts.TotalRowCount, true);
}
/// <summary>
/// Returns the feature selection scores for each slot of each column.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="input">The input dataview.</param>
/// <param name="columns">The columns for which to compute the feature selection scores.</param>
/// <param name="colSizes">Outputs an array containing the vector sizes of the input columns.</param>
/// <returns>A list of scores.</returns>
public static long[][] Train(IHostEnvironment env, IDataView input, string[] columns, out int[] colSizes)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(input, nameof(input));
    env.CheckParam(Utils.Size(columns) > 0, nameof(columns));

    var schema = input.Schema;
    var size = columns.Length;
    var activeInput = new bool[schema.ColumnCount];
    var colSrcs = new int[size];
    var colTypes = new ColumnType[size];
    colSizes = new int[size];
    for (int i = 0; i < size; i++)
    {
        int colSrc;
        var colName = columns[i];
        if (!schema.TryGetColumnIndex(colName, out colSrc))
        {
            throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Source column '{0}' not found", colName);
        }

        var colType = schema.GetColumnType(colSrc);
        if (colType.IsVector && !colType.IsKnownSizeVector)
        {
            throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Variable length column '{0}' is not allowed", colName);
        }

        activeInput[colSrc] = true;
        colSrcs[i] = colSrc;
        colTypes[i] = colType;
        colSizes[i] = colType.ValueCount;
    }

    var aggregators = new CountAggregator[size];
    long rowCur = 0;
    double rowCount = input.GetRowCount(true) ?? double.NaN;
    using (var pch = env.StartProgressChannel("Aggregating counts"))
    using (var cursor = input.GetRowCursor(col => activeInput[col]))
    {
        var header = new ProgressHeader(new[] { "rows" });
        pch.SetHeader(header, e => { e.SetProgress(0, rowCur, rowCount); });
        for (int i = 0; i < size; i++)
        {
            if (colTypes[i].IsVector)
            {
                aggregators[i] = GetVecAggregator(cursor, colTypes[i], colSrcs[i]);
            }
            else
            {
                aggregators[i] = GetOneAggregator(cursor, colTypes[i], colSrcs[i]);
            }
        }

        while (cursor.MoveNext())
        {
            for (int i = 0; i < size; i++)
            {
                aggregators[i].ProcessValue();
            }
            rowCur++;
        }
        pch.Checkpoint(rowCur);
    }

    return aggregators.Select(a => a.Count).ToArray();
}
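// Usage sketch (not from the source): invoking Train, assuming it is exposed on
// CountFeatureSelectionTransform as the snippet's Arguments references suggest.
// "Features" is an illustrative column name; env and dataView come from the host app.
int[] colSizes;
long[][] scores = CountFeatureSelectionTransform.Train(
    env,                      // IHostEnvironment
    dataView,                 // IDataView with a known-size vector column
    new[] { "Features" },     // columns to score
    out colSizes);
// scores[0][j] is the count for slot j of "Features"; colSizes[0] is that column's vector size.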
public void Constructor_TypeIsCorrect()
{
    var aggregator = new CountAggregator("test");

    Assert.That(aggregator.Type, Is.EqualTo("count"));
}
public void Constructor_WithName_NameIsSet()
{
    var aggregator = new CountAggregator("test");

    Assert.That(aggregator.Name, Is.EqualTo("test"));
}
/// <summary>
/// Build a GroupBy Dictionary for Peek.
/// </summary>
/// <remarks>
/// Peek identifies each distinct common value and the approximate percentage of rows with it.
/// If we have many matching rows, we can sample - any common values will still appear in the sample.
/// However, we don't know how many matches we have in advance.
/// Therefore, we build a Dictionary of all rows, 1/8 of rows, 1/64 of rows, and 1/512 of rows.
/// As soon as a given sample has enough rows to be statistically valid, we stop collecting the larger subsets.
/// This strategy allows us to run the overall query only once, end up with a large enough sample, and avoid building giant Dictionaries.
/// </remarks>
/// <param name="cancellationToken">CancellationToken to request early stop</param>
private void BuildDictionary(CancellationToken cancellationToken)
{
    // Short-circuit path if there's one key column and it's an EnumColumn
    if (_column.IsEnumColumn())
    {
        BuildSingleEnumColumnDictionary(cancellationToken);
        return;
    }

    // Build a Random instance to sample rows
    Random r = new Random();

    // Build a Dictionary and CountAggregator for each sample
    GroupByDictionary[] dictionaries = new GroupByDictionary[SampleCount];
    CountAggregator[] counts = new CountAggregator[SampleCount];
    int[][] remapArrays = new int[SampleCount][];
    for (int i = 0; i < SampleCount; ++i)
    {
        dictionaries[i] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
        counts[i] = new CountAggregator();
    }

    // Retrieve the column getter
    Func<XArray> columnGetter = _column.CurrentGetter();

    // Track which sample we'll currently report
    int currentSample = 0;

    XArray[] arrays = new XArray[1];
    int count;
    while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
    {
        // Get the column values
        arrays[0] = columnGetter();

        // Build the GroupBy count for all rows and successive 1/8 samples
        for (int i = 0; i < SampleCount; ++i)
        {
            // Add these to the Join Dictionary
            if (i >= currentSample)
            {
                // Choose buckets for each row
                XArray indicesForRows = dictionaries[i].FindOrAdd(arrays);

                // Identify the bucket for each row and aggregate them
                counts[i].Add(indicesForRows, dictionaries[i].Count);

                // If this sample now has enough values, stop collecting bigger row sets
                if (currentSample == i - 1 && counts[i].TotalRowCount > RequiredSampleSize)
                {
                    // If every row was unique, stop early and don't set outputs (zero rows)
                    if (ShouldStopEarly(dictionaries[currentSample], counts[currentSample]))
                    {
                        return;
                    }

                    dictionaries[currentSample] = null;
                    counts[currentSample] = null;
                    currentSample++;
                }
            }

            // Each successive dictionary has ~1/8 of the rows of the previous one
            if (i < SampleCount - 1)
            {
                ArraySelector sample = Sampler.Eighth(arrays[0].Selector, r, ref remapArrays[i]);
                arrays[0] = arrays[0].Reselect(sample);
            }
        }
    }

    // Once the loop is done, get the distinct values and aggregation results
    PostSortAndFilter(dictionaries[currentSample].DistinctKeys()[0], counts[currentSample].Values, counts[currentSample].TotalRowCount, currentSample == 0);
}
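// Worked sketch (not from the source): expected rows reaching each sample tier for a
// 10M-row source, given the all / 1-in-8 / 1-in-64 / 1-in-512 cascade described in the
// remarks above. RequiredSampleSize's actual value is not shown in these snippets; once
// any smaller tier crosses it, the larger tiers are dropped, so the biggest dictionaries
// die early and memory stays bounded.
long sourceRows = 10_000_000;
for (int tier = 0; tier < 4; ++tier)
{
    // Right-shifting by 3 bits per tier divides by 8^tier:
    // tier 0: 10,000,000   tier 1: 1,250,000   tier 2: 156,250   tier 3: 19,531
    Console.WriteLine($"tier {tier}: ~{sourceRows >> (3 * tier):N0} rows");
}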