Пример #1
0
        /// <summary>
        /// Starts a new <see cref="MultiModelPipeline"/> with <paramref name="estimator"/> as its first
        /// stage and then appends <paramref name="estimators"/> to it.
        /// </summary>
        /// <param name="estimator">The estimator to place first in the pipeline.</param>
        /// <param name="estimators">The sweepable estimators appended after it.</param>
        /// <returns>The resulting <see cref="MultiModelPipeline"/>.</returns>
        public static MultiModelPipeline Append(this IEstimator<ITransformer> estimator, params SweepableEstimator[] estimators)
        {
            // Wrap the estimator in a SweepableEstimator whose factory ignores its arguments and
            // always hands back the same instance; it is given a freshly created search space.
            var wrapped = new SweepableEstimator((context, parameter) => estimator, new SearchSpace.SearchSpace());

            return new MultiModelPipeline()
                .Append(wrapped)
                .Append(estimators);
        }
Пример #2
0
        /// <summary>
        /// Creates a proposer with a fixed table of per-estimator starting costs and a random
        /// generator seeded from <paramref name="settings"/>.
        /// </summary>
        /// <param name="settings">Experiment settings; its <c>Seed</c> is used for the random generator, falling back to 0 when it is null.</param>
        public PipelineProposer(AutoMLExperimentSettings settings)
        {
            _multiModelPipeline = null;
            _rand = new Random(settings.Seed ?? 0);

            // These costs initialize eci when the search starts: the smaller the number, the less
            // cost the trainer is assumed to use at start, and the more likely it is to be picked.
            _estimatorCost = new Dictionary<EstimatorType, double>
            {
                [EstimatorType.LightGbmRegression] = 0.788,
                [EstimatorType.FastTreeRegression] = 0.382,
                [EstimatorType.FastForestRegression] = 0.374,
                [EstimatorType.SdcaRegression] = 0.566,
                [EstimatorType.FastTreeTweedieRegression] = 0.401,
                [EstimatorType.LbfgsPoissonRegressionRegression] = 4.73,
                [EstimatorType.FastForestOva] = 4.283,
                [EstimatorType.FastTreeOva] = 3.701,
                [EstimatorType.LightGbmMulti] = 4.765,
                [EstimatorType.SdcaMaximumEntropyMulti] = 10.129,
                [EstimatorType.SdcaLogisticRegressionOva] = 13.16,
                [EstimatorType.LbfgsMaximumEntropyMulti] = 7.980,
                [EstimatorType.LbfgsLogisticRegressionOva] = 11.513,
                [EstimatorType.LightGbmBinary] = 4.765,
                [EstimatorType.FastTreeBinary] = 3.701,
                [EstimatorType.FastForestBinary] = 4.283,
                [EstimatorType.SdcaLogisticRegressionBinary] = 13.16,
                [EstimatorType.LbfgsLogisticRegressionBinary] = 11.513,
                [EstimatorType.ForecastBySsa] = 1,
                [EstimatorType.ImageClassificationMulti] = 1,
                [EstimatorType.MatrixFactorization] = 1,
            };
        }
Пример #3
0
        /// <summary>
        /// Starts a new <see cref="MultiModelPipeline"/> with <paramref name="estimator"/> and then
        /// appends <paramref name="estimators"/> to it.
        /// </summary>
        /// <param name="estimator">The sweepable estimator to place first.</param>
        /// <param name="estimators">The sweepable estimators appended after it.</param>
        /// <returns>The resulting <see cref="MultiModelPipeline"/>.</returns>
        public static MultiModelPipeline Append(this SweepableEstimator estimator, params SweepableEstimator[] estimators)
        {
            MultiModelPipeline seeded = new MultiModelPipeline().Append(estimator);
            MultiModelPipeline combined = seeded.Append(estimators);

            return combined;
        }
        /// <summary>
        /// Runs the binary classification experiment on <paramref name="trainData"/> and returns all
        /// run details together with the best run.
        /// </summary>
        /// <param name="trainData">The data to train on. It is cross-validated or split into train/test sets depending on its row count.</param>
        /// <param name="columnInformation">Column information; its <c>LabelColumnName</c> is used as the label.</param>
        /// <param name="preFeaturizer">Optional estimator placed at the front of the pipeline.</param>
        /// <param name="progressHandler">Optional handler that receives a run detail each time a trial completes.</param>
        /// <returns>The experiment result built from the monitor's run details and best run.</returns>
        public override ExperimentResult<BinaryClassificationMetrics> Execute(IDataView trainData, ColumnInformation columnInformation, IEstimator<ITransformer> preFeaturizer = null, IProgress<RunDetail<BinaryClassificationMetrics>> progressHandler = null)
        {
            var label = columnInformation.LabelColumnName;

            _experiment.SetEvaluateMetric(Settings.OptimizingMetric, label);
            _experiment.SetTrainingTimeInSeconds(Settings.MaxExperimentTimeInSeconds);

            // Cross val threshold for # of dataset rows --
            // If dataset has < threshold # of rows, use cross val.
            // Else, run experiment using train-validate split.
            const int crossValRowCountThreshold = 15000;
            var rowCount = DatasetDimensionsUtil.CountRows(trainData, crossValRowCountThreshold);

            // TODO
            // split cross validation result according to sample key as well.
            if (rowCount < crossValRowCountThreshold)
            {
                const int numCrossValFolds = 10;
                _experiment.SetDataset(trainData, numCrossValFolds);
            }
            else
            {
                var splitData = Context.Data.TrainTestSplit(trainData);
                _experiment.SetDataset(splitData.TrainSet, splitData.TestSet);
            }

            // Pipeline order: optional pre-featurizer, automatic featurizer, then the trainers.
            MultiModelPipeline pipeline = new MultiModelPipeline();

            if (preFeaturizer != null)
            {
                pipeline = pipeline.Append(preFeaturizer);
            }

            pipeline = pipeline.Append(Context.Auto().Featurizer(trainData, columnInformation, Features))
                       .Append(Context.Auto().BinaryClassification(label, Features));
            _experiment.SetPipeline(pipeline);

            var monitor = new BinaryClassificationTrialResultMonitor();

            monitor.OnTrialCompleted += (o, e) =>
            {
                var detail = ToRunDetail(e);
                progressHandler?.Report(detail);
            };

            _experiment.SetMonitor(monitor);
            _experiment.Run();

            // Materialize once: a bare Select is deferred, so every enumeration of the result's
            // run details would call ToRunDetail again for each trial.
            var runDetails = monitor.RunDetails.Select(e => ToRunDetail(e)).ToList();
            var bestRun = ToRunDetail(monitor.BestRun);

            return new ExperimentResult<BinaryClassificationMetrics>(runDetails, bestRun);
        }
Пример #5
0
        /// <summary>
        /// Copies every estimator of <paramref name="pipeline"/> into a new
        /// <see cref="MultiModelPipeline"/>, in order, and then appends <paramref name="estimators"/>.
        /// </summary>
        /// <param name="pipeline">The pipeline whose estimators come first.</param>
        /// <param name="estimators">The sweepable estimators appended after them.</param>
        /// <returns>The resulting <see cref="MultiModelPipeline"/>.</returns>
        public static MultiModelPipeline Append(this SweepableEstimatorPipeline pipeline, params SweepableEstimator[] estimators)
        {
            var result = new MultiModelPipeline();

            foreach (var stage in pipeline.Estimators)
            {
                result = result.Append(stage);
            }

            result = result.Append(estimators);

            return result;
        }
Пример #6
0
        /// <summary>
        /// Converts <paramref name="pipeline"/> into a <see cref="MultiModelPipeline"/> by appending
        /// its estimators one by one, then passes the result to the <see cref="MultiModelPipeline"/> overload.
        /// </summary>
        /// <param name="pipeline">The pipeline whose estimators are copied.</param>
        /// <returns>This experiment instance.</returns>
        public AutoMLExperiment SetPipeline(SweepableEstimatorPipeline pipeline)
        {
            var multiModelPipeline = new MultiModelPipeline();

            foreach (var estimator in pipeline.Estimators)
            {
                multiModelPipeline = multiModelPipeline.Append(estimator);
            }

            this.SetPipeline(multiModelPipeline);

            return this;
        }
Пример #7
0
        /// <summary>
        /// Builds one featurization pipeline from <paramref name="columnInformation"/>. Column purposes are inferred for
        /// <paramref name="data"/>; numeric, categorical and text columns are featurized in place with
        /// <see cref="NumericFeaturizer(string[], string[])"/>, <see cref="CatalogFeaturizer(string[], string[])"/> and
        /// <see cref="TextFeaturizer(string, string)"/>, and are then concatenated into a single output column.
        /// </summary>
        /// <param name="data">input data.</param>
        /// <param name="columnInformation">column information.</param>
        /// <param name="outputColumnName">output feature column.</param>
        /// <returns>A <see cref="MultiModelPipeline"/> for featurization.</returns>
        public MultiModelPipeline Featurizer(IDataView data, ColumnInformation columnInformation, string outputColumnName = "Features")
        {
            Contracts.CheckValue(data, nameof(data));
            Contracts.CheckValue(columnInformation, nameof(columnInformation));

            var purposes = PurposeInference.InferPurposes(this._context, data, columnInformation);

            // Resolve the schema names of the columns for each purpose handled here.
            var textColumns = purposes
                .Where(p => p.Purpose == ColumnPurpose.TextFeature)
                .Select(p => data.Schema[p.ColumnIndex].Name)
                .ToArray();
            var numericColumns = purposes
                .Where(p => p.Purpose == ColumnPurpose.NumericFeature)
                .Select(p => data.Schema[p.ColumnIndex].Name)
                .ToArray();
            var catalogColumns = purposes
                .Where(p => p.Purpose == ColumnPurpose.CategoricalFeature)
                .Select(p => data.Schema[p.ColumnIndex].Name)
                .ToArray();

            var featurizer = new MultiModelPipeline();

            // Each featurizer writes back to the same column names it reads from.
            if (numericColumns.Length > 0)
            {
                featurizer = featurizer.Append(this.NumericFeaturizer(numericColumns, numericColumns));
            }

            if (catalogColumns.Length > 0)
            {
                featurizer = featurizer.Append(this.CatalogFeaturizer(catalogColumns, catalogColumns));
            }

            foreach (var column in textColumns)
            {
                featurizer = featurizer.Append(this.TextFeaturizer(column, column));
            }

            // Concatenation order is text, numeric, then categorical columns.
            var concatOption = new ConcatOption
            {
                InputColumnNames = textColumns.Concat(numericColumns).Concat(catalogColumns).ToArray(),
                OutputColumnName = outputColumnName,
            };

            // Nothing to concatenate: return the pipeline without a concatenate stage.
            if (concatOption.InputColumnNames.Length <= 0)
            {
                return featurizer;
            }

            return featurizer.Append(SweepableEstimatorFactory.CreateConcatenate(concatOption));
        }
        /// <summary>
        /// Runs the binary classification experiment on <paramref name="trainData"/> using
        /// <paramref name="numberOfCVFolds"/>-fold cross validation and returns all run details
        /// together with the best run.
        /// </summary>
        /// <param name="trainData">The data to cross-validate on.</param>
        /// <param name="numberOfCVFolds">Number of cross-validation folds.</param>
        /// <param name="columnInformation">Column information; when null a default <see cref="ColumnInformation"/> is used.</param>
        /// <param name="preFeaturizer">Optional estimator placed at the front of the pipeline.</param>
        /// <param name="progressHandler">Optional handler that receives a run detail each time a trial completes.</param>
        /// <returns>The cross-validation experiment result built from the monitor's run details and best run.</returns>
        public override CrossValidationExperimentResult<BinaryClassificationMetrics> Execute(IDataView trainData, uint numberOfCVFolds, ColumnInformation columnInformation = null, IEstimator<ITransformer> preFeaturizer = null, IProgress<CrossValidationRunDetail<BinaryClassificationMetrics>> progressHandler = null)
        {
            // The parameter defaults to null, so fall back to a default instance instead of
            // dereferencing null on the next line.
            columnInformation = columnInformation ?? new ColumnInformation();

            var label = columnInformation.LabelColumnName;

            _experiment.SetEvaluateMetric(Settings.OptimizingMetric, label);
            _experiment.SetTrainingTimeInSeconds(Settings.MaxExperimentTimeInSeconds);
            _experiment.SetDataset(trainData, (int)numberOfCVFolds);

            // Pipeline order: optional pre-featurizer, automatic featurizer, then the trainers.
            MultiModelPipeline pipeline = new MultiModelPipeline();

            if (preFeaturizer != null)
            {
                pipeline = pipeline.Append(preFeaturizer);
            }

            // The featurizer must write the same column the trainer reads, so both use Features
            // (as the train/validate overload does) rather than a separate string literal.
            pipeline = pipeline.Append(Context.Auto().Featurizer(trainData, columnInformation, Features))
                       .Append(Context.Auto().BinaryClassification(label, featureColumnName: Features));

            _experiment.SetPipeline(pipeline);

            var monitor = new BinaryClassificationTrialResultMonitor();

            monitor.OnTrialCompleted += (o, e) =>
            {
                // Named differently from the method-level runDetails local: reusing that name
                // inside the lambda is a scope conflict (CS0136).
                var trialDetail = ToCrossValidationRunDetail(e);

                progressHandler?.Report(trialDetail);
            };

            _experiment.SetMonitor(monitor);
            _experiment.Run();

            // Materialize once: a bare Select is deferred, so every enumeration of the result's
            // run details would call ToCrossValidationRunDetail again for each trial.
            var runDetails = monitor.RunDetails.Select(e => ToCrossValidationRunDetail(e)).ToList();
            var bestResult = ToCrossValidationRunDetail(monitor.BestRun);

            return new CrossValidationExperimentResult<BinaryClassificationMetrics>(runDetails, bestResult);
        }
Пример #9
0
        /// <summary>
        /// Chooses the pipeline schema for the next trial and stores it, together with the matching
        /// sweepable estimator pipeline, on <paramref name="settings"/>.
        /// </summary>
        /// <remarks>
        /// On the first call (when eci has not been initialized) the pipeline with the lowest
        /// estimated cost is chosen. On later calls a pipeline id is sampled at random from the
        /// probabilities produced by applying <c>ArrayMath.Inverse</c> and then
        /// <c>ArrayMath.Normalize</c> to the eci values.
        /// </remarks>
        /// <param name="settings">The trial settings to fill in; its experiment settings supply the pipeline.</param>
        /// <returns>The same <paramref name="settings"/> instance with <c>Schema</c> and <c>Pipeline</c> set.</returns>
        public TrialSettings Propose(TrialSettings settings)
        {
            _multiModelPipeline = settings.ExperimentSettings.Pipeline;
            _learnerInitialCost = _multiModelPipeline.PipelineIds.ToDictionary(kv => kv, kv => GetEstimatedCostForPipeline(kv, _multiModelPipeline));
            var pipelineIds = _multiModelPipeline.PipelineIds;

            if (_eci == null)
            {
                // initialize eci with the estimated cost and always start from pipeline which has lowest cost.
                _eci = pipelineIds.ToDictionary(kv => kv, kv => GetEstimatedCostForPipeline(kv, _multiModelPipeline));
                settings.Schema = _eci.OrderBy(kv => kv.Value).First().Key;
            }
            else
            {
                var probabilities = pipelineIds.Select(id => _eci[id]).ToArray();
                probabilities = ArrayMath.Inverse(probabilities);
                probabilities = ArrayMath.Normalize(probabilities);

                // sample: walk the cumulative distribution until it passes the random draw.
                var randdouble = _rand.NextDouble();
                var sum = 0.0;
                // selected pipeline id index
                int i;

                for (i = 0; i != pipelineIds.Length; ++i)
                {
                    sum += probabilities[i];
                    if (sum > randdouble)
                    {
                        break;
                    }
                }

                // Floating-point rounding can leave the cumulative sum at or below the draw, in
                // which case the loop runs off the end; fall back to the last pipeline instead of
                // indexing out of range.
                if (i == pipelineIds.Length)
                {
                    i = pipelineIds.Length - 1;
                }

                settings.Schema = pipelineIds[i];
            }

            settings.Pipeline = _multiModelPipeline.BuildSweepableEstimatorPipeline(settings.Schema);
            return settings;
        }
Пример #10
0
        /// <summary>
        /// Returns the starting cost for the pipeline described by the expression <paramref name="kv"/>.
        /// </summary>
        /// <remarks>
        /// The string-valued entities of the expression, excluding "Nil", are used as keys into
        /// <paramref name="multiModelPipeline"/>'s estimators. The cost of the first estimator whose
        /// type has an entry in the cost table is returned; if none has one, the result is 1.
        /// </remarks>
        /// <param name="kv">The pipeline expression to parse.</param>
        /// <param name="multiModelPipeline">The pipeline that maps estimator keys to estimators.</param>
        /// <returns>The estimated cost.</returns>
        private double GetEstimatedCostForPipeline(string kv, MultiModelPipeline multiModelPipeline)
        {
            var estimatorKeys = Entity.FromExpression(kv)
                .ValueEntities()
                .OfType<StringEntity>()
                .Where(key => key.Value != "Nil");

            foreach (var key in estimatorKeys)
            {
                var estimatorType = multiModelPipeline.Estimators[key.Value].EstimatorType;

                // First estimator with a known cost decides the cost of the whole pipeline.
                if (_estimatorCost.TryGetValue(estimatorType, out var knownCost))
                {
                    return knownCost;
                }
            }

            // No estimator in this pipeline has an entry in the cost table.
            return 1;
        }
Пример #11
0
 /// <summary>
 /// Stores <paramref name="pipeline"/> as the pipeline in this experiment's settings.
 /// </summary>
 /// <param name="pipeline">The pipeline to store.</param>
 /// <returns>This experiment instance.</returns>
 public AutoMLExperiment SetPipeline(MultiModelPipeline pipeline)
 {
     this._settings.Pipeline = pipeline;

     return this;
 }