public void AddEvaluated(PipelinePattern pipeline)
            {
                if (pipeline.PerformanceSummary == null)
                {
                    throw new Exception("Candidate pipeline missing run summary.");
                }
                var d = pipeline.PerformanceSummary.MetricValue;

                while (_sortedSampledElements.ContainsKey(d))
                {
                    d += 1e-3;
                }
                _sortedSampledElements.Add(d, pipeline);
                _history.Add(pipeline);

                using (var ch = _host.Start("Suggested Pipeline"))
                {
                    ch.Info($"PipelineSweeper Iteration Number : {_history.Count}");
                    ch.Info($"PipelineSweeper Pipeline Id : {pipeline.UniqueId}");

                    foreach (var transform in pipeline.Transforms)
                    {
                        ch.Info($"PipelineSweeper Transform : {transform.Transform}");
                    }

                    ch.Info($"PipelineSweeper Learner : {pipeline.Learner}");
                    ch.Info($"PipelineSweeper Train Metric Value : {pipeline.PerformanceSummary.TrainingMetricValue}");
                    ch.Info($"PipelineSweeper Test Metric Value : {pipeline.PerformanceSummary.MetricValue}");
                }
            }
        public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine, string trainDataPath,
                                                   string schemaDefinitionFile, out string schemaDefinition, int numTransformLevels, int batchSize, SupportedMetric metric,
                                                   out PipelinePattern bestPipeline, int numOfSampleRows, ITerminator terminator, MacroUtils.TrainerKinds trainerKind)
        {
            Contracts.CheckValue(env, nameof(env));

            // REVIEW: Should be able to infer schema by itself, without having to
            // infer recipes. Look into this.
            // Set loader settings through inference
            RecipeInference.InferRecipesFromData(env, trainDataPath, schemaDefinitionFile,
                                                 out var _, out schemaDefinition, out var _, true);

#pragma warning disable 0618
            var data = ImportTextData.ImportText(env, new ImportTextData.Input
            {
                InputFile    = new SimpleFileHandle(env, trainDataPath, false, false),
                CustomSchema = schemaDefinition
            }).Data;
#pragma warning restore 0618
            var splitOutput = TrainTestSplit.Split(env, new TrainTestSplit.Input {
                Data = data, Fraction = 0.8f
            });
            AutoMlMlState amls = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind,
                                                   splitOutput.TrainData.Take(numOfSampleRows), splitOutput.TestData.Take(numOfSampleRows));
            bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows);
            return(amls);
        }
Exemple #3
0
        private void SampleHyperparameters(RecipeInference.SuggestedRecipe.SuggestedLearner learner, PipelinePattern[] history)
        {
            // If first time optimizing hyperparams, create new hyperparameter sweeper.
            if (!_hyperSweepers.ContainsKey(learner.LearnerName))
            {
                var sps = AutoMlUtils.ConvertToComponentFactories(learner.PipelineNode.SweepParams);
                if (sps.Length > 0)
                {
                    _hyperSweepers[learner.LearnerName] = new KdoSweeper(Env,
                                                                         new KdoSweeper.Arguments
                    {
                        SweptParameters         = sps,
                        NumberInitialPopulation = Math.Max(_remainingThirdStageTrials, 2)
                    });
                }
                else
                {
                    _hyperSweepers[learner.LearnerName] = new FalseSweeper();
                }
            }
            var sweeper      = _hyperSweepers[learner.LearnerName];
            var historyToUse = history.Where(p => p.Learner.LearnerName == learner.LearnerName).ToArray();

            if (_currentStage == (int)Stages.Third)
            {
                _remainingThirdStageTrials--;
                historyToUse = new PipelinePattern[0];
                if (_remainingThirdStageTrials < 1)
                {
                    _currentStage++;
                }
            }
            SampleHyperparameters(learner, sweeper, IsMaximizingMetric, historyToUse);
        }
        /// <summary>
        /// The InferPipelines methods are just public portals to the internal function that handle different
        /// types of data being passed in: training IDataView, path to training file, or train and test files.
        /// </summary>
        public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine,
                                                   IDataView trainData, IDataView testData, int numTransformLevels, int batchSize, SupportedMetric metric,
                                                   out PipelinePattern bestPipeline, ITerminator terminator, MacroUtils.TrainerKinds trainerKind)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(trainData, nameof(trainData));
            env.CheckValue(testData, nameof(testData));

            int           numOfRows = (int)(trainData.GetRowCount(false) ?? 1000);
            AutoMlMlState amls      = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind, trainData, testData);

            bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfRows);
            return(amls);
        }
            public void AddEvaluated(PipelinePattern pipeline)
            {
                if (pipeline.PerformanceSummary == null)
                {
                    throw new Exception("Candidate pipeline missing run summary.");
                }
                var d = pipeline.PerformanceSummary.MetricValue;

                while (_sortedSampledElements.ContainsKey(d))
                {
                    d += 1e-3;
                }
                _sortedSampledElements.Add(d, pipeline);
                _history.Add(pipeline);
            }
 public AutoMlMlState(IHostEnvironment env, SupportedMetric metric, IPipelineOptimizer autoMlEngine,
                      ITerminator terminator, MacroUtils.TrainerKinds trainerKind, IDataView trainData = null, IDataView testData = null,
                      string[] requestedLearners = null)
 {
     Contracts.CheckValue(env, nameof(env));
     _sortedSampledElements =
         metric.IsMaximizing ? new SortedList <double, PipelinePattern>(new ReversedComparer <double>()) :
         new SortedList <double, PipelinePattern>();
     _history           = new List <PipelinePattern>();
     _env               = env;
     _host              = _env.Register("AutoMlState");
     _trainData         = trainData;
     _testData          = testData;
     _terminator        = terminator;
     _requestedLearners = requestedLearners;
     AutoMlEngine       = autoMlEngine;
     BatchCandidates    = new PipelinePattern[] { };
     Metric             = metric;
     TrainerKind        = trainerKind;
 }
        public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine, IDataView data, int numTransformLevels,
                                                   int batchSize, SupportedMetric metric, out PipelinePattern bestPipeline, int numOfSampleRows,
                                                   ITerminator terminator, MacroUtils.TrainerKinds trainerKind)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(data, nameof(data));

            var splitOutput = TrainTestSplit.Split(env, new TrainTestSplit.Input {
                Data = data, Fraction = 0.8f
            });
            AutoMlMlState amls = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind,
                                                   splitOutput.TrainData.Take(numOfSampleRows), splitOutput.TestData.Take(numOfSampleRows));

            bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows);
            return(amls);
        }
 public void ClearEvaluatedPipelines()
 {
     _sortedSampledElements.Clear();
     BatchCandidates = new PipelinePattern[0];
 }
            private void ProcessPipeline(Sweeper.Algorithms.SweeperProbabilityUtils utils, Stopwatch stopwatch, PipelinePattern candidate, int numOfTrainingRows)
            {
                // Create a randomized numer of rows to do train/test with.
                int randomizedNumberOfRows =
                    (int)Math.Floor(utils.NormalRVs(1, numOfTrainingRows, (double)numOfTrainingRows / 10).First());

                if (randomizedNumberOfRows > numOfTrainingRows)
                {
                    randomizedNumberOfRows = numOfTrainingRows - (randomizedNumberOfRows - numOfTrainingRows);
                }

                // Run pipeline, and time how long it takes
                stopwatch.Restart();
                candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
                                                 _testData, Metric, TrainerKind, out var testMetricVal, out var trainMetricVal);
                stopwatch.Stop();

                // Handle key collisions on sorted list
                while (_sortedSampledElements.ContainsKey(testMetricVal))
                {
                    testMetricVal += 1e-10;
                }

                // Save performance score
                candidate.PerformanceSummary =
                    new RunSummary(testMetricVal, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds, trainMetricVal);
                _sortedSampledElements.Add(candidate.PerformanceSummary.MetricValue, candidate);
                _history.Add(candidate);
            }
        private PipelinePattern[] GetRandomPipelines(int numOfPipelines)
        {
            Host.Check(AvailableLearners.All(l => l.PipelineNode != null));
            Host.Check(AvailableTransforms.All(t => t.PipelineNode != null));
            int atomicGroupLimit = AvailableTransforms.Select(t => t.AtomicGroupId)
                                   .DefaultIfEmpty(-1).Max() + 1;
            var pipelines  = new List <PipelinePattern>();
            int collisions = 0;
            int totalCount = 0;

            while (pipelines.Count < numOfPipelines)
            {
                // Generate random bitmask (set of transform atomic group IDs)
                long transformsBitMask = Host.Rand.Next((int)Math.Pow(2, atomicGroupLimit));

                // Include all "always on" transforms, such as autolabel.
                transformsBitMask |= AutoMlUtils.IncludeMandatoryTransforms(AvailableTransforms.ToList());

                // Get actual learner and transforms for pipeline
                var selectedLearner    = AvailableLearners[Host.Rand.Next(AvailableLearners.Length)];
                var selectedTransforms = AvailableTransforms.Where(t =>
                                                                   AutoMlUtils.AtomicGroupPresent(transformsBitMask, t.AtomicGroupId)).ToList();

                // Randomly change transform sweepable hyperparameter settings
                selectedTransforms.ForEach(t => RandomlyPerturbSweepableHyperparameters(t.PipelineNode));

                // Randomly change learner sweepable hyperparameter settings
                RandomlyPerturbSweepableHyperparameters(selectedLearner.PipelineNode);

                // Always include features concat transform
                selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData,
                                                                              DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, DataRoles));

                // Compute hash key for checking if we've already seen this pipeline.
                // However, if we keep missing, don't want to get stuck in infinite loop.
                // Try for a good number of times (for example, numOfPipelines * 4), then just add
                // all generated pipelines to get us out of rut.
                string hashKey = GetHashKey(transformsBitMask, selectedLearner);
                if (collisions < numOfPipelines * 4 && VisitedPipelines.Contains(hashKey))
                {
                    collisions++;
                    continue;
                }

                VisitedPipelines.Add(hashKey);
                collisions = 0;
                totalCount++;

                // Keep pipeline if valid
                var pipeline = new PipelinePattern(selectedTransforms.ToArray(), selectedLearner, "", Env);
                if (!TransformsMaskValidity.ContainsKey(transformsBitMask))
                {
                    TransformsMaskValidity.Add(transformsBitMask, PipelineVerifier(pipeline, transformsBitMask));
                }
                if (TransformsMaskValidity[transformsBitMask])
                {
                    pipelines.Add(pipeline);
                }

                // Only invalid pipelines available, stuck in loop.
                // Break out and return no pipelines.
                if (totalCount > numOfPipelines * 10)
                {
                    break;
                }
            }

            return(pipelines.ToArray());
        }