Ejemplo n.º 1
0
                /// <summary>
                /// Attempts to build an aggregate value backed by the aggregator that matches the given operator.
                /// </summary>
                /// <param name="aggregateOperator">The aggregate operator that selects which aggregator factory is called.</param>
                /// <param name="continuationToken">The continuation token forwarded unchanged to the aggregator's TryCreate factory.</param>
                /// <returns>The factory's TryCatch result, mapped through Try to an AggregateAggregateValue wrapping the aggregator.</returns>
                /// <exception cref="ArgumentException">Thrown when the operator is not one of the handled values.</exception>
                public static TryCatch<AggregateValue> TryCreate(
                    AggregateOperator aggregateOperator,
                    string continuationToken)
                {
                    TryCatch<IAggregator> aggregatorResult;

                    // Each supported operator delegates to its own aggregator factory.
                    switch (aggregateOperator)
                    {
                        case AggregateOperator.Average:
                            aggregatorResult = AverageAggregator.TryCreate(continuationToken);
                            break;

                        case AggregateOperator.Count:
                            aggregatorResult = CountAggregator.TryCreate(continuationToken);
                            break;

                        case AggregateOperator.Max:
                            aggregatorResult = MinMaxAggregator.TryCreateMaxAggregator(continuationToken);
                            break;

                        case AggregateOperator.Min:
                            aggregatorResult = MinMaxAggregator.TryCreateMinAggregator(continuationToken);
                            break;

                        case AggregateOperator.Sum:
                            aggregatorResult = SumAggregator.TryCreate(continuationToken);
                            break;

                        default:
                            throw new ArgumentException($"Unknown {nameof(AggregateOperator)}: {aggregateOperator}.");
                    }

                    // Wrap the created aggregator; the mapping is applied through TryCatch.Try.
                    return aggregatorResult.Try<AggregateValue>(created => new AggregateAggregateValue(created));
                }
                /// <summary>
                /// Builds an aggregate value backed by the aggregator that matches the given operator.
                /// </summary>
                /// <param name="aggregateOperator">The aggregate operator that selects which aggregator factory is called.</param>
                /// <param name="continuationToken">The continuation token forwarded unchanged to the aggregator's Create factory.</param>
                /// <returns>A new AggregateAggregateValue wrapping the created aggregator.</returns>
                /// <exception cref="ArgumentException">Thrown when the operator is not one of the handled values.</exception>
                public static AggregateAggregateValue Create(
                    AggregateOperator aggregateOperator,
                    string continuationToken)
                {
                    IAggregator selected;

                    // Each supported operator delegates to its own aggregator factory.
                    switch (aggregateOperator)
                    {
                        case AggregateOperator.Average:
                            selected = AverageAggregator.Create(continuationToken);
                            break;

                        case AggregateOperator.Count:
                            selected = CountAggregator.Create(continuationToken);
                            break;

                        case AggregateOperator.Max:
                            selected = MinMaxAggregator.CreateMaxAggregator(continuationToken);
                            break;

                        case AggregateOperator.Min:
                            selected = MinMaxAggregator.CreateMinAggregator(continuationToken);
                            break;

                        case AggregateOperator.Sum:
                            selected = SumAggregator.Create(continuationToken);
                            break;

                        default:
                            throw new ArgumentException($"Unknown {nameof(AggregateOperator)}: {aggregateOperator}.");
                    }

                    return new AggregateAggregateValue(selected);
                }
Ejemplo n.º 3
0
        /// <summary>
        ///  Decides whether collection should stop because the given sample has nothing worth reporting.
        /// </summary>
        /// <param name="dictionary">Dictionary whose Count is compared against the total row count.</param>
        /// <param name="counter">Aggregator supplying the per-value counts and the total row count.</param>
        /// <returns>
        ///  True when the dictionary count equals the total row count, or when no per-value count reaches
        ///  the threshold (TotalRowCount * MinimumPercentageToReport, truncated to int); false otherwise.
        /// </returns>
        private bool ShouldStopEarly(GroupByDictionary dictionary, CountAggregator counter)
        {
            // As many distinct entries as rows: every value was unique so far, so stop
            if (dictionary.Count == counter.TotalRowCount)
            {
                return true;
            }

            XArray valueCounts = counter.Values;
            int[] rawCounts = (int[])valueCounts.Array;
            int minimumRows = (int)(counter.TotalRowCount * MinimumPercentageToReport);

            // A single value at or above the threshold is enough to keep going
            for (int i = 0; i < valueCounts.Count; ++i)
            {
                if (rawCounts[valueCounts.Index(i)] >= minimumRows)
                {
                    return false;
                }
            }

            // Not all unique, but nothing frequent enough to report: stop
            return true;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Verifies that FilteredAggregator exposes the filter and the aggregator it was constructed with.
        /// </summary>
        public void Constructor_WithName_NameIsSet()
        {
            // Arrange
            var selectorFilter = new SelectorFilter("test", "value");
            var countAggregator = new CountAggregator("test");

            // Act
            var filtered = new FilteredAggregator(selectorFilter, countAggregator);

            // Assert
            Assert.That(filtered.Filter, Is.EqualTo(selectorFilter));
            Assert.That(filtered.Aggregator, Is.EqualTo(countAggregator));
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Pushes ten values into a CountAggregator and checks that its Value converts to 10.
        /// </summary>
        public void CountAggregator()
        {
            var counter = new CountAggregator();

            // Push the same value ten times; the assertion below expects the number of pushes
            int pushes = 0;
            while (pushes < 10)
            {
                counter.Push(5, null);
                pushes++;
            }

            Assert.Equal(10, Convert.ToInt32(counter.Value));
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds a Count-type StatsMetric with the given name and value and hands it to the aggregator.
        /// </summary>
        /// <param name="aggregator">Aggregator that receives the metric through OnNewValue.</param>
        /// <param name="statName">Value assigned to the metric's StatName.</param>
        /// <param name="value">Value assigned to the metric's NumericValue.</param>
        private static void AddStatsMetric(CountAggregator aggregator, string statName, double value)
        {
            var countMetric = new StatsMetric
            {
                MetricType = MetricType.Count,
                StatName = statName,
                NumericValue = value,
            };

            // OnNewValue takes the metric by reference
            aggregator.OnNewValue(ref countMetric);
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Stores the serializers and buffer builder and, when aggregators are supplied, their optional
 /// count, gauge and set aggregators.
 /// </summary>
 /// <param name="serializers">Serializers kept by the router.</param>
 /// <param name="bufferBuilder">Buffer builder kept by the router.</param>
 /// <param name="optionalAggregators">Aggregators to copy from; may be null, in which case the aggregator fields are not assigned.</param>
 public StatsRouter(
     Serializers serializers,
     BufferBuilder bufferBuilder,
     Aggregators optionalAggregators)
 {
     _serializers = serializers;
     _bufferBuilder = bufferBuilder;

     // No aggregators supplied: leave the optional aggregator fields untouched
     if (optionalAggregators == null)
     {
         return;
     }

     _optionalCountAggregator = optionalAggregators.OptionalCount;
     _optionalGaugeAggregator = optionalAggregators.OptionalGauge;
     _optionalSetAggregator = optionalAggregators.OptionalSet;
 }
        /// <summary>
        /// Checks that count metrics with the same stat name are summed on flush, and that a
        /// second flush only reports metrics added after the first one.
        /// </summary>
        public void OnNewValue()
        {
            var bufferHandler = new BufferBuilderHandlerMock();
            var counts = new CountAggregator(MetricAggregatorParametersFactory.Create(bufferHandler.Object));

            // First batch: "s1" is reported twice and is expected as a single summed entry
            AddStatsMetric(counts, "s1", 1);
            AddStatsMetric(counts, "s1", 2);
            AddStatsMetric(counts, "s2", 2);
            counts.TryFlush(force: true);
            Assert.AreEqual("s1:3|c,s2:2|c", bufferHandler.Value);

            // Second batch: only the metric added after the previous flush is expected
            AddStatsMetric(counts, "s3", 1);
            counts.TryFlush(force: true);
            Assert.AreEqual("s3:1|c", bufferHandler.Value);
        }
Ejemplo n.º 9
0
        /// <summary>
        ///  Builds the grouped output for the single key column (_keyColumns[0]) by aggregating directly
        ///  on the column's row indices, then keeps only the values the tracker reports as found.
        /// </summary>
        /// <param name="cancellationToken">Token passed to the source on every batch request.</param>
        private void BuildSingleEnumColumnDictionary(CancellationToken cancellationToken)
        {
            XArray enumValues = _keyColumns[0].ValuesGetter()();
            Func<XArray> getRowIndices = _keyColumns[0].IndicesCurrentGetter();

            // Reuse an existing aggregator that can report found indices; otherwise track them with a dedicated CountAggregator
            IFoundIndicesTracker tracker = (IFoundIndicesTracker)_aggregators.FirstOrDefault((agg) => agg is IFoundIndicesTracker);
            bool ownsTracker = (tracker == null);

            if (ownsTracker)
            {
                tracker = new CountAggregator();
            }

            // Consume the source batch by batch until it returns zero rows
            while (_source.Next(XTableExtensions.DefaultBatchSize, cancellationToken) != 0)
            {
                // The row indices are used directly as the aggregation buckets
                XArray rowIndices = getRowIndices();

                foreach (var aggregator in _aggregators)
                {
                    aggregator.Add(rowIndices, enumValues.Count);
                }

                // A dedicated tracker is not part of _aggregators, so it must be fed separately
                if (ownsTracker)
                {
                    tracker.Add(rowIndices, enumValues.Count);
                }
            }

            // Selector over the indices that were found while aggregating
            ArraySelector foundSelector = tracker.FoundIndices;

            // The distinct count is now known
            _distinctCount = foundSelector.Count;

            // Column 0 gets the found values; each following column gets the matching aggregator results
            _columns[0].SetValues(enumValues.Reselect(foundSelector));
            for (int i = 0; i < _aggregators.Length; ++i)
            {
                _columns[i + 1].SetValues(_aggregators[i].Values.Reselect(foundSelector));
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        ///  Counts rows per value for the column by aggregating directly on its row indices,
        ///  then passes the values and counts to PostSortAndFilter.
        /// </summary>
        /// <param name="cancellationToken">Token passed to the source on every batch request.</param>
        private void BuildSingleEnumColumnDictionary(CancellationToken cancellationToken)
        {
            // One CountAggregator accumulates the per-value row counts
            CountAggregator rowCounts = new CountAggregator();

            XArray enumValues = _column.ValuesGetter()();
            Func<XArray> getRowIndices = _column.IndicesCurrentGetter();

            // Consume the source batch by batch until it returns zero rows
            while (_source.Next(XTableExtensions.DefaultBatchSize, cancellationToken) != 0)
            {
                // The row indices are used directly as the aggregation buckets
                XArray rowIndices = getRowIndices();
                rowCounts.Add(rowIndices, enumValues.Count);
            }

            // Hand the values, their counts and the total row count over for sorting and filtering
            PostSortAndFilter(enumValues, rowCounts.Values, rowCounts.TotalRowCount, true);
        }
        /// <summary>
        /// Makes one pass over the input and collects, for every requested column, the counts produced by its aggregator.
        /// </summary>
        /// <param name="env">The host environment.</param>
        /// <param name="input">The input dataview.</param>
        /// <param name="columns">Names of the columns to aggregate; must contain at least one entry.</param>
        /// <param name="colSizes">Receives, per requested column, the ValueCount of that column's type.</param>
        /// <returns>One array per requested column, taken from the Count of that column's aggregator.</returns>
        public static long[][] Train(IHostEnvironment env, IDataView input, string[] columns, out int[] colSizes)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(input, nameof(input));
            env.CheckParam(Utils.Size(columns) > 0, nameof(columns));

            var inputSchema = input.Schema;
            var columnCount = columns.Length;
            var isActive = new bool[inputSchema.ColumnCount];
            var sourceIndices = new int[columnCount];
            var columnTypes = new ColumnType[columnCount];

            colSizes = new int[columnCount];

            // Resolve every requested column and reject missing or variable-length ones
            for (int c = 0; c < columnCount; c++)
            {
                var name = columns[c];
                int sourceIndex;
                if (!inputSchema.TryGetColumnIndex(name, out sourceIndex))
                {
                    throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Source column '{0}' not found", name);
                }

                var type = inputSchema.GetColumnType(sourceIndex);
                if (type.IsVector && !type.IsKnownSizeVector)
                {
                    throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Variable length column '{0}' is not allowed", name);
                }

                isActive[sourceIndex] = true;
                sourceIndices[c] = sourceIndex;
                columnTypes[c] = type;
                colSizes[c] = type.ValueCount;
            }

            var columnAggregators = new CountAggregator[columnCount];
            long rowsSeen = 0;
            double totalRows = input.GetRowCount(true) ?? double.NaN;

            using (var progress = env.StartProgressChannel("Aggregating counts"))
            using (var rowCursor = input.GetRowCursor(col => isActive[col]))
            {
                // The progress callback captures rowsSeen, so it reports the current value whenever it is invoked
                var progressHeader = new ProgressHeader(new[] { "rows" });
                progress.SetHeader(progressHeader, e => { e.SetProgress(0, rowsSeen, totalRows); });

                // Bind one aggregator per column to the cursor, picking the vector or scalar flavor
                for (int c = 0; c < columnCount; c++)
                {
                    if (columnTypes[c].IsVector)
                    {
                        columnAggregators[c] = GetVecAggregator(rowCursor, columnTypes[c], sourceIndices[c]);
                    }
                    else
                    {
                        columnAggregators[c] = GetOneAggregator(rowCursor, columnTypes[c], sourceIndices[c]);
                    }
                }

                // Every aggregator processes the current value once per row
                while (rowCursor.MoveNext())
                {
                    for (int c = 0; c < columnCount; c++)
                    {
                        columnAggregators[c].ProcessValue();
                    }

                    rowsSeen++;
                }

                progress.Checkpoint(rowsSeen);
            }

            return columnAggregators.Select(a => a.Count).ToArray();
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Verifies that a CountAggregator reports "count" as its Type.
        /// </summary>
        public void Constructor_TypeIsCorrect()
        {
            // Arrange / Act
            var counter = new CountAggregator("test");

            // Assert
            Assert.That(counter.Type, Is.EqualTo("count"));
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Verifies that the name passed to the CountAggregator constructor is exposed through Name.
        /// </summary>
        public void Constructor_WithName_NameIsSet()
        {
            // Arrange / Act
            var counter = new CountAggregator("test");

            // Assert
            Assert.That(counter.Name, Is.EqualTo("test"));
        }
Ejemplo n.º 14
0
        /// <summary>
        ///  Build a GroupBy Dictionary for Peek.
        /// </summary>
        /// <remarks>
        ///  Peek identifies each distinct common value and the approximate percentage of rows with it.
        ///  If we have many matching rows, we can sample - the sample will have any common values in it.
        ///  However, we don't know how many matches we have in advance.
        ///  Therefore, we build a Dictionary of all rows, 1/8 of rows, 1/64 of rows, and 1/512 of rows.
        ///  As soon as a given sample has enough samples to be statistically valid, we stop collecting the larger subsets.
        ///  This strategy allows us to run the overall query only once, end up with a large enough sample, and avoid building giant Dictionaries.
        ///  If ShouldStopEarly returns true for the sample about to be dropped, the method returns
        ///  without calling PostSortAndFilter.
        /// </remarks>
        /// <param name="cancellationToken">CancellationToken to request early stop</param>
        private void BuildDictionary(CancellationToken cancellationToken)
        {
            // Short-circuit path if there's one key column and it's an EnumColumn
            if (_column.IsEnumColumn())
            {
                BuildSingleEnumColumnDictionary(cancellationToken);
                return;
            }

            // Build a Random instance to sample rows
            Random r = new Random();

            // Build a Dictionary and CountAggregator for each sample
            // (remapArrays entries start out null and are passed by ref to the sampler below)
            GroupByDictionary[] dictionaries = new GroupByDictionary[SampleCount];
            CountAggregator[]   counts       = new CountAggregator[SampleCount];
            int[][]             remapArrays  = new int[SampleCount][];
            for (int i = 0; i < SampleCount; ++i)
            {
                dictionaries[i] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
                counts[i]       = new CountAggregator();
            }

            // Retrieve the column getter
            Func <XArray> columnGetter = _column.CurrentGetter();

            // Track which sample we'll currently report (index into dictionaries/counts; only moves forward)
            int currentSample = 0;

            // Single-element array because FindOrAdd takes an array of key columns
            XArray[] arrays = new XArray[1];
            int      count;

            // Pull batches until the source returns zero rows
            while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
            {
                // Get the column values
                arrays[0] = columnGetter();

                // Build the GroupBy count for all rows and successive 1/8 samples
                for (int i = 0; i < SampleCount; ++i)
                {
                    // Only samples at or after the current one are still collected;
                    // earlier (larger) samples have already been dropped
                    if (i >= currentSample)
                    {
                        // Choose buckets for each row
                        XArray indicesForRows = dictionaries[i].FindOrAdd(arrays);

                        // Identify the bucket for each row and aggregate them
                        counts[i].Add(indicesForRows, dictionaries[i].Count);

                        // If this sample now has enough values, stop collecting bigger row sets
                        // (only checked for the sample immediately after the current one)
                        if (currentSample == i - 1 && counts[i].TotalRowCount > RequiredSampleSize)
                        {
                            // If ShouldStopEarly says the sample being dropped has nothing to report
                            // (e.g. every row was unique), stop early and don't set outputs (zero rows)
                            if (ShouldStopEarly(dictionaries[currentSample], counts[currentSample]))
                            {
                                return;
                            }

                            // Drop the references to the larger sample and report the next one from now on
                            dictionaries[currentSample] = null;
                            counts[currentSample]       = null;
                            currentSample++;
                        }
                    }

                    // Each successive dictionary has ~1/8 of the rows of the previous one.
                    // This runs even for dropped samples, so later samples are always drawn through the same chain of reselects.
                    if (i < SampleCount - 1)
                    {
                        ArraySelector sample = Sampler.Eighth(arrays[0].Selector, r, ref remapArrays[i]);
                        arrays[0] = arrays[0].Reselect(sample);
                    }
                }
            }

            // Once the loop is done, get the distinct values and aggregation results.
            // The last argument is true only when the reported sample is index 0 (the one built from all rows).
            PostSortAndFilter(dictionaries[currentSample].DistinctKeys()[0], counts[currentSample].Values, counts[currentSample].TotalRowCount, currentSample == 0);
        }