/// <summary>
/// Records an evaluated pipeline in the metric-sorted candidate list and the run history,
/// then logs a summary of the run through the host channel.
/// </summary>
/// <param name="pipeline">Candidate pipeline; must already carry a performance summary.</param>
/// <exception cref="InvalidOperationException">Thrown when the pipeline has no run summary.</exception>
public void AddEvaluated(PipelinePattern pipeline)
{
    if (pipeline.PerformanceSummary == null)
        throw new InvalidOperationException("Candidate pipeline missing run summary.");

    // Sorted-list keys must be unique; nudge the metric value until it no longer collides.
    // NOTE(review): this uses a 1e-3 step while ProcessPipeline uses 1e-10 for the same
    // purpose — confirm whether the difference is intentional.
    var d = pipeline.PerformanceSummary.MetricValue;
    while (_sortedSampledElements.ContainsKey(d))
        d += 1e-3;
    _sortedSampledElements.Add(d, pipeline);
    _history.Add(pipeline);

    using (var ch = _host.Start("Suggested Pipeline"))
    {
        ch.Info($"PipelineSweeper Iteration Number : {_history.Count}");
        ch.Info($"PipelineSweeper Pipeline Id : {pipeline.UniqueId}");
        foreach (var transform in pipeline.Transforms)
            ch.Info($"PipelineSweeper Transform : {transform.Transform}");
        ch.Info($"PipelineSweeper Learner : {pipeline.Learner}");
        ch.Info($"PipelineSweeper Train Metric Value : {pipeline.PerformanceSummary.TrainingMetricValue}");
        ch.Info($"PipelineSweeper Test Metric Value : {pipeline.PerformanceSummary.MetricValue}");
    }
}
/// <summary>
/// Infers pipelines from a training file on disk: infers the loader schema via recipe
/// inference, imports the text data, splits it 80/20 into train/test, and runs the sweep.
/// </summary>
public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine,
    string trainDataPath, string schemaDefinitionFile, out string schemaDefinition, int numTransformLevels,
    int batchSize, SupportedMetric metric, out PipelinePattern bestPipeline, int numOfSampleRows,
    ITerminator terminator, MacroUtils.TrainerKinds trainerKind)
{
    Contracts.CheckValue(env, nameof(env));

    // REVIEW: Should be able to infer schema by itself, without having to
    // infer recipes. Look into this.
    // Derive the loader settings from the data through recipe inference.
    RecipeInference.InferRecipesFromData(env, trainDataPath, schemaDefinitionFile,
        out var _, out schemaDefinition, out var _, true);
#pragma warning disable 0618
    var loadedData = ImportTextData.ImportText(env, new ImportTextData.Input
    {
        InputFile = new SimpleFileHandle(env, trainDataPath, false, false),
        CustomSchema = schemaDefinition
    }).Data;
#pragma warning restore 0618
    var split = TrainTestSplit.Split(env, new TrainTestSplit.Input { Data = loadedData, Fraction = 0.8f });
    var state = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind,
        split.TrainData.Take(numOfSampleRows), split.TestData.Take(numOfSampleRows));
    bestPipeline = state.InferPipelines(numTransformLevels, batchSize, numOfSampleRows);
    return state;
}
/// <summary>
/// Samples a new set of hyperparameters for the given learner, lazily creating a
/// per-learner sweeper on first use and filtering history to runs of the same learner.
/// </summary>
/// <param name="learner">Learner whose sweepable parameters should be sampled.</param>
/// <param name="history">All previously evaluated pipelines; only matching-learner entries are used.</param>
private void SampleHyperparameters(RecipeInference.SuggestedRecipe.SuggestedLearner learner, PipelinePattern[] history)
{
    // If first time optimizing hyperparams for this learner, create its sweeper.
    // Single TryGetValue lookup replaces the prior ContainsKey + indexer pattern.
    if (!_hyperSweepers.TryGetValue(learner.LearnerName, out var sweeper))
    {
        var sps = AutoMlUtils.ConvertToComponentFactories(learner.PipelineNode.SweepParams);
        if (sps.Length > 0)
        {
            sweeper = new KdoSweeper(Env,
                new KdoSweeper.Arguments
                {
                    SweptParameters = sps,
                    NumberInitialPopulation = Math.Max(_remainingThirdStageTrials, 2)
                });
        }
        else
        {
            // Nothing to sweep for this learner.
            sweeper = new FalseSweeper();
        }
        _hyperSweepers[learner.LearnerName] = sweeper;
    }

    var historyToUse = history.Where(p => p.Learner.LearnerName == learner.LearnerName).ToArray();
    if (_currentStage == (int)Stages.Third)
    {
        // During the third stage, ignore history and count down the remaining trials;
        // advance to the next stage once they are exhausted.
        _remainingThirdStageTrials--;
        historyToUse = Array.Empty<PipelinePattern>();
        if (_remainingThirdStageTrials < 1)
            _currentStage++;
    }
    SampleHyperparameters(learner, sweeper, IsMaximizingMetric, historyToUse);
}
/// <summary>
/// The InferPipelines methods are just public portals to the internal function that handle different
/// types of data being passed in: training IDataView, path to training file, or train and test files.
/// </summary>
public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine,
    IDataView trainData, IDataView testData, int numTransformLevels, int batchSize,
    SupportedMetric metric, out PipelinePattern bestPipeline, ITerminator terminator,
    MacroUtils.TrainerKinds trainerKind)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(trainData, nameof(trainData));
    env.CheckValue(testData, nameof(testData));

    // Fall back to 1000 rows when the lazy row count is unavailable.
    var rowCount = (int)(trainData.GetRowCount(false) ?? 1000);
    var state = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind, trainData, testData);
    bestPipeline = state.InferPipelines(numTransformLevels, batchSize, rowCount);
    return state;
}
/// <summary>
/// Records an evaluated pipeline in the metric-sorted candidate list and the run history.
/// </summary>
/// <param name="pipeline">Candidate pipeline; must already carry a performance summary.</param>
/// <exception cref="InvalidOperationException">Thrown when the pipeline has no run summary.</exception>
public void AddEvaluated(PipelinePattern pipeline)
{
    if (pipeline.PerformanceSummary == null)
        throw new InvalidOperationException("Candidate pipeline missing run summary.");

    // Sorted-list keys must be unique; nudge the metric value until it no longer collides.
    var d = pipeline.PerformanceSummary.MetricValue;
    while (_sortedSampledElements.ContainsKey(d))
        d += 1e-3;
    _sortedSampledElements.Add(d, pipeline);
    _history.Add(pipeline);
}
/// <summary>
/// Initializes the AutoML sweep state: the metric-sorted candidate list, run history,
/// host registration, data views, and the optimization engine.
/// </summary>
public AutoMlMlState(IHostEnvironment env, SupportedMetric metric, IPipelineOptimizer autoMlEngine,
    ITerminator terminator, MacroUtils.TrainerKinds trainerKind, IDataView trainData = null,
    IDataView testData = null, string[] requestedLearners = null)
{
    Contracts.CheckValue(env, nameof(env));

    // Maximizing metrics sort descending so the best candidate sits at the front.
    if (metric.IsMaximizing)
        _sortedSampledElements = new SortedList<double, PipelinePattern>(new ReversedComparer<double>());
    else
        _sortedSampledElements = new SortedList<double, PipelinePattern>();
    _history = new List<PipelinePattern>();
    _env = env;
    _host = _env.Register("AutoMlState");
    _trainData = trainData;
    _testData = testData;
    _terminator = terminator;
    _requestedLearners = requestedLearners;
    AutoMlEngine = autoMlEngine;
    BatchCandidates = new PipelinePattern[0];
    Metric = metric;
    TrainerKind = trainerKind;
}
/// <summary>
/// Infers pipelines from a single IDataView by splitting it 80/20 into train/test sets
/// and sweeping over a sample of each side.
/// </summary>
public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine,
    IDataView data, int numTransformLevels, int batchSize, SupportedMetric metric,
    out PipelinePattern bestPipeline, int numOfSampleRows, ITerminator terminator,
    MacroUtils.TrainerKinds trainerKind)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(data, nameof(data));

    var split = TrainTestSplit.Split(env, new TrainTestSplit.Input { Data = data, Fraction = 0.8f });
    var state = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind,
        split.TrainData.Take(numOfSampleRows), split.TestData.Take(numOfSampleRows));
    bestPipeline = state.InferPipelines(numTransformLevels, batchSize, numOfSampleRows);
    return state;
}
/// <summary>
/// Discards all sampled candidates and pending batch candidates.
/// NOTE(review): the run history (_history) is not cleared here — confirm that is intended.
/// </summary>
public void ClearEvaluatedPipelines()
{
    _sortedSampledElements.Clear();
    BatchCandidates = new PipelinePattern[] { };
}
/// <summary>
/// Runs one candidate pipeline on a randomized subsample of the training data, times the
/// run, and records the scored result in the sorted candidate list and run history.
/// </summary>
/// <param name="utils">Source of normally distributed random draws for the subsample size.</param>
/// <param name="stopwatch">Reused stopwatch for timing the train/test experiment.</param>
/// <param name="candidate">Pipeline to run; its PerformanceSummary is populated here.</param>
/// <param name="numOfTrainingRows">Total available training rows (mean of the size distribution).</param>
private void ProcessPipeline(Sweeper.Algorithms.SweeperProbabilityUtils utils, Stopwatch stopwatch,
    PipelinePattern candidate, int numOfTrainingRows)
{
    // Create a randomized number of rows to do train/test with: draw from
    // N(numOfTrainingRows, (numOfTrainingRows/10)^2), reflecting draws above the
    // mean back below it so the count never exceeds the available rows.
    int randomizedNumberOfRows =
        (int)Math.Floor(utils.NormalRVs(1, numOfTrainingRows, (double)numOfTrainingRows / 10).First());
    if (randomizedNumberOfRows > numOfTrainingRows)
        randomizedNumberOfRows = numOfTrainingRows - (randomizedNumberOfRows - numOfTrainingRows);
    // Guard: an extreme low-tail draw (or a tiny dataset) could yield a non-positive
    // count; clamp so we always train on at least one row.
    randomizedNumberOfRows = Math.Max(1, randomizedNumberOfRows);

    // Run pipeline, and time how long it takes.
    stopwatch.Restart();
    candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
        _testData, Metric, TrainerKind, out var testMetricVal, out var trainMetricVal);
    stopwatch.Stop();

    // Handle key collisions on sorted list: keys must be unique, so nudge by an epsilon.
    while (_sortedSampledElements.ContainsKey(testMetricVal))
        testMetricVal += 1e-10;

    // Save performance score.
    candidate.PerformanceSummary =
        new RunSummary(testMetricVal, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds, trainMetricVal);
    _sortedSampledElements.Add(candidate.PerformanceSummary.MetricValue, candidate);
    _history.Add(candidate);
}
/// <summary>
/// Generates up to <paramref name="numOfPipelines"/> random, previously unseen, valid
/// pipelines by sampling a learner and a random subset of transform atomic groups, then
/// randomly perturbing their sweepable hyperparameters.
/// </summary>
/// <param name="numOfPipelines">Target number of pipelines to generate.</param>
/// <returns>The generated pipelines; may be fewer than requested if only invalid masks remain.</returns>
private PipelinePattern[] GetRandomPipelines(int numOfPipelines)
{
    Host.Check(AvailableLearners.All(l => l.PipelineNode != null));
    Host.Check(AvailableTransforms.All(t => t.PipelineNode != null));
    int atomicGroupLimit = AvailableTransforms.Select(t => t.AtomicGroupId)
        .DefaultIfEmpty(-1).Max() + 1;
    var pipelines = new List<PipelinePattern>();
    int collisions = 0;
    int totalCount = 0;

    while (pipelines.Count < numOfPipelines)
    {
        // Generate random bitmask (set of transform atomic group IDs). Exact integer
        // shift replaces (int)Math.Pow(2, ...); assumes atomicGroupLimit < 31.
        long transformsBitMask = Host.Rand.Next(1 << atomicGroupLimit);

        // Include all "always on" transforms, such as autolabel.
        transformsBitMask |= AutoMlUtils.IncludeMandatoryTransforms(AvailableTransforms.ToList());

        // Get actual learner and transforms for pipeline.
        var selectedLearner = AvailableLearners[Host.Rand.Next(AvailableLearners.Length)];
        var selectedTransforms = AvailableTransforms.Where(t =>
            AutoMlUtils.AtomicGroupPresent(transformsBitMask, t.AtomicGroupId)).ToList();

        // Randomly change transform sweepable hyperparameter settings.
        selectedTransforms.ForEach(t => RandomlyPerturbSweepableHyperparameters(t.PipelineNode));

        // Randomly change learner sweepable hyperparameter settings.
        RandomlyPerturbSweepableHyperparameters(selectedLearner.PipelineNode);

        // Always include features concat transform.
        selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData,
            DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, DataRoles));

        // Compute hash key for checking if we've already seen this pipeline.
        // However, if we keep missing, don't want to get stuck in infinite loop.
        // Try for a good number of times (for example, numOfPipelines * 4), then just add
        // all generated pipelines to get us out of rut.
        string hashKey = GetHashKey(transformsBitMask, selectedLearner);
        if (collisions < numOfPipelines * 4 && VisitedPipelines.Contains(hashKey))
        {
            collisions++;
            continue;
        }
        VisitedPipelines.Add(hashKey);
        collisions = 0;
        totalCount++;

        // Keep pipeline if valid. Single TryGetValue lookup replaces the prior
        // ContainsKey + Add + indexer triple lookup.
        var pipeline = new PipelinePattern(selectedTransforms.ToArray(), selectedLearner, "", Env);
        if (!TransformsMaskValidity.TryGetValue(transformsBitMask, out var isValid))
        {
            isValid = PipelineVerifier(pipeline, transformsBitMask);
            TransformsMaskValidity.Add(transformsBitMask, isValid);
        }
        if (isValid)
            pipelines.Add(pipeline);

        // Only invalid pipelines available, stuck in loop.
        // Break out and return no pipelines.
        if (totalCount > numOfPipelines * 10)
            break;
    }
    return pipelines.ToArray();
}