Example #1
0
        /// <summary>
        /// Computes the quality metrics for the multi-class classification PredictionModel
        /// using the specified data set.
        /// </summary>
        /// <param name="model">
        /// The trained multi-class classification PredictionModel to be evaluated.
        /// </param>
        /// <param name="testData">
        /// The test data that will be predicted and used to evaluate the model.
        /// </param>
        /// <returns>
        /// A ClassificationMetrics instance that describes how well the model performed against the test data.
        /// </returns>
        public ClassificationMetrics Evaluate(PredictionModel model, ILearningPipelineLoader testData)
        {
            var env = new MLContext();

            env.CheckValue(model, nameof(model));
            env.CheckValue(testData, nameof(testData));

            Experiment experiment = env.CreateExperiment();

            // The loader must produce a data step so its output can feed the scorer.
            ILearningPipelineStep loaderStep = testData.ApplyStep(previousStep: null, experiment);
            if (!(loaderStep is ILearningPipelineDataStep dataStep))
            {
                throw env.Except($"The {nameof(ILearningPipelineLoader)} did not return a {nameof(ILearningPipelineDataStep)} from ApplyStep.");
            }

            // Score the test data with the trained model, then evaluate the scored data.
            var scorer = new DatasetTransformScorer { Data = dataStep.Data };
            DatasetTransformScorer.Output scorerOutput = experiment.Add(scorer);

            Data = scorerOutput.ScoredData;
            Output evaluatorOutput = experiment.Add(this);

            experiment.Compile();
            experiment.SetInput(scorer.TransformModel, model.PredictorModel);
            testData.SetInput(env, experiment);
            experiment.Run();

            IDataView overallMetrics = experiment.GetOutput(evaluatorOutput.OverallMetrics);
            if (overallMetrics == null)
            {
                throw env.Except($"Could not find OverallMetrics in the results returned in {nameof(ClassificationEvaluator)} Evaluate.");
            }

            IDataView confusionMatrix = experiment.GetOutput(evaluatorOutput.ConfusionMatrix);
            if (confusionMatrix == null)
            {
                throw env.Except($"Could not find ConfusionMatrix in the results returned in {nameof(ClassificationEvaluator)} Evaluate.");
            }

            var metrics = ClassificationMetrics.FromMetrics(env, overallMetrics, confusionMatrix);
            if (metrics.Count != 1)
            {
                throw env.Except($"Exactly one metric set was expected but found {metrics.Count} metrics");
            }

            return metrics[0];
        }
Example #2
0
        public DataAndModel <ITransformModel> Add(Experiment experiment)
        {
            // When a sub-trainer is configured, wire its predictor model into the
            // featurizer-style entry point before adding the entry point itself.
            if (_entryPointObj is CommonInputs.IFeaturizerInput featurizerInput && _subTrainerObj != null)
            {
                featurizerInput.PredictorModel = experiment.Add(_subTrainerObj).PredictorModel;
            }

            var result = experiment.Add(_entryPointObj);
            return new DataAndModel <ITransformModel>(result.OutputData, result.Model);
        }
Example #3
0
        /// <summary>
        /// Builds and runs an experiment that trains an SDCA regressor on the KC house price
        /// data set found at <paramref name="dataPath"/> and returns the resulting scoring
        /// transform model.
        /// </summary>
        private static ITransformModel CreateKcHousePricePredictorModel(string dataPath)
        {
            const string dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,";

            Experiment experiment = s_environment.CreateExperiment();

            // Load the raw text data.
            var loader = new Data.TextLoader { CustomSchema = dataSchema };
            Data.TextLoader.Output loaded = experiment.Add(loader);

            // Gather the numeric columns into one vector column.
            var numericConcat = new Transforms.ColumnConcatenator { Data = loaded.Data };
            numericConcat.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15");
            Transforms.ColumnConcatenator.Output numericOut = experiment.Add(numericConcat);

            // Gather the categorical columns into one vector column.
            var categoryConcat = new Transforms.ColumnConcatenator { Data = numericOut.OutputData };
            categoryConcat.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode");
            Transforms.ColumnConcatenator.Output categoryOut = experiment.Add(categoryConcat);

            // One-hot encode the categorical feature vector.
            var oneHot = new Transforms.CategoricalOneHotVectorizer { Data = categoryOut.OutputData };
            oneHot.AddColumn("CategoryFeatures");
            Transforms.CategoricalOneHotVectorizer.Output oneHotOut = experiment.Add(oneHot);

            // Combine numeric and encoded categorical features into the final feature vector.
            var featureConcat = new Transforms.ColumnConcatenator { Data = oneHotOut.OutputData };
            featureConcat.AddColumn("Features", "NumericalFeatures", "CategoryFeatures");
            Transforms.ColumnConcatenator.Output featureOut = experiment.Add(featureConcat);

            // Train the regressor; a single thread keeps the result deterministic.
            var trainer = new Trainers.StochasticDualCoordinateAscentRegressor
            {
                TrainingData = featureOut.OutputData,
                NumThreads   = 1
            };
            Trainers.StochasticDualCoordinateAscentRegressor.Output trained = experiment.Add(trainer);

            // Stitch the transforms and the predictor into a single model, then wrap it in a scorer.
            var combiner = new Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar <ITransformModel>(numericOut.Model, categoryOut.Model, oneHotOut.Model, featureOut.Model),
                PredictorModel  = trained.PredictorModel
            };
            Transforms.ManyHeterogeneousModelCombiner.Output combined = experiment.Add(combiner);

            var scorer = new Transforms.Scorer { PredictorModel = combined.PredictorModel };
            var scorerOutput = experiment.Add(scorer);

            experiment.Compile();
            experiment.SetInput(loader.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false));
            experiment.Run();

            return experiment.GetOutput(scorerOutput.ScoringTransform);
        }
        /// <summary>
        /// Wraps this pipeline's entry point graph in a TrainTestEvaluator macro and adds it
        /// to <paramref name="experiment"/> (a fresh experiment is created when none is given).
        /// </summary>
        public Legacy.Models.TrainTestEvaluator.Output AddAsTrainTest(Var <IDataView> trainData, Var <IDataView> testData,
                                                                      MacroUtils.TrainerKinds trainerKind, Experiment experiment = null, bool includeTrainingMetrics = false)
        {
            if (experiment == null)
            {
                experiment = _env.CreateExperiment();
            }

            var graph = ToEntryPointGraph(experiment);

            // The macro feeds its data into the first node of the subgraph.
            var pipelineInput = new Var <IDataView>
            {
                VarName = graph.GetSubgraphFirstNodeDataVarName(_env)
            };

            // TrainTestMacro
            var trainTest = new Legacy.Models.TrainTestEvaluator
            {
                Nodes          = graph.Graph,
                TransformModel = null,
                Inputs         =
                {
                    Data = pipelineInput
                },
                Outputs =
                {
                    PredictorModel = graph.ModelOutput
                },
                TrainingData           = trainData,
                TestingData            = testData,
                Kind                   = MacroUtils.TrainerKindApiValue <Legacy.Models.MacroUtilsTrainerKinds>(trainerKind),
                PipelineId             = UniqueId.ToString("N"),
                IncludeTrainingMetrics = includeTrainingMetrics
            };

            return experiment.Add(trainTest);
        }
        /// <summary>
        /// Runs the pipeline sweeper over the adult census data set without transform
        /// inference and verifies that it completes the requested number of iterations
        /// and that the best pipeline reaches a reasonable AUC on this data.
        /// </summary>
        public void PipelineSweeperNoTransforms()
        {
            // Set up inputs for experiment
            string       pathData        = GetDataPath("adult.train");
            string       pathDataTest    = GetDataPath("adult.test");
            const int    numOfSampleRows = 1000;
            const string schema          = "sep=, col=Features:R4:0,2,4,10-12 col=Label:R4:14 header=+";

            var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false);

#pragma warning disable 0618
            var datasetTrain = ImportTextData.ImportText(Env,
                                                         new ImportTextData.Input {
                InputFile = inputFileTrain, CustomSchema = schema
            }).Data.Take(numOfSampleRows);
            var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false);
            var datasetTest   = ImportTextData.ImportText(Env,
                                                          new ImportTextData.Input {
                InputFile = inputFileTest, CustomSchema = schema
            }).Data.Take(numOfSampleRows);
#pragma warning restore 0618
            const int       batchSize          = 5;
            const int       numIterations      = 20;
            const int       numTransformLevels = 2;
            SupportedMetric metric             = PipelineSweeperSupportedMetrics.GetSupportedMetric(PipelineSweeperSupportedMetrics.Metrics.Auc);

            // Using the simple, uniform random sampling (with replacement) engine
            PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(Env);

            // Create search object
            var amls = new AutoInference.AutoMlMlState(Env, metric, autoMlEngine, new IterationTerminator(numIterations),
                                                       MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, datasetTrain, datasetTest);

            // Infer search space
            amls.InferSearchSpace(numTransformLevels);

            // Create macro object
            var pipelineSweepInput = new Microsoft.ML.Legacy.Models.PipelineSweeper()
            {
                BatchSize = batchSize,
            };

            var exp    = new Experiment(Env);
            var output = exp.Add(pipelineSweepInput);
            exp.Compile();
            exp.SetInput(pipelineSweepInput.TrainingData, datasetTrain);
            exp.SetInput(pipelineSweepInput.TestingData, datasetTest);
            exp.SetInput(pipelineSweepInput.State, amls);
            exp.SetInput(pipelineSweepInput.CandidateOutputs, new IDataView[0]);
            exp.Run();

            // Make sure you get back an AutoMlState, and that it ran for correct number of iterations
            // with at least minimal performance values (i.e., best should have AUC better than 0.8 on this dataset).
            AutoInference.AutoMlMlState amlsOut = (AutoInference.AutoMlMlState)exp.GetOutput(output.State);
            Assert.NotNull(amlsOut);
            // xUnit convention (xUnit2000): expected value first, actual second.
            Assert.Equal(numIterations, amlsOut.GetAllEvaluatedPipelines().Length);
            Assert.True(amlsOut.GetBestPipeline().PerformanceSummary.MetricValue > 0.8);
        }
            public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
            {
                // A data loader is always the first step in a pipeline.
                Contracts.Assert(previousStep == null);

                _dataViewEntryPoint = new Data.DataViewReference();
                var added = experiment.Add(_dataViewEntryPoint);
                return new CollectionDataSourcePipelineStep(added.Data);
            }
Example #7
0
        /// <summary>
        /// Computes the quality metrics for the PredictionModel using the specified data set.
        /// </summary>
        /// <param name="model">
        /// The trained PredictionModel to be evaluated.
        /// </param>
        /// <param name="testData">
        /// The test data that will be predicted and used to evaluate the model.
        /// </param>
        /// <returns>
        /// A RegressionMetrics instance that describes how well the model performed against the test data.
        /// </returns>
        public RegressionMetrics Evaluate(PredictionModel model, ILearningPipelineLoader testData)
        {
            using (var environment = new TlcEnvironment())
            {
                environment.CheckValue(model, nameof(model));
                environment.CheckValue(testData, nameof(testData));

                Experiment experiment = environment.CreateExperiment();

                // The loader must produce a data step so its output can feed the scorer.
                ILearningPipelineStep loaderStep = testData.ApplyStep(previousStep: null, experiment);
                if (!(loaderStep is ILearningPipelineDataStep dataStep))
                {
                    throw environment.Except($"The {nameof(ILearningPipelineLoader)} did not return a {nameof(ILearningPipelineDataStep)} from ApplyStep.");
                }

                // Score the test data with the trained model, then evaluate the scored data.
                var scorer = new DatasetTransformScorer { Data = dataStep.Data };
                DatasetTransformScorer.Output scorerOutput = experiment.Add(scorer);

                Data = scorerOutput.ScoredData;
                Output evaluatorOutput = experiment.Add(this);

                experiment.Compile();
                experiment.SetInput(scorer.TransformModel, model.PredictorModel);
                testData.SetInput(environment, experiment);
                experiment.Run();

                IDataView overallMetrics = experiment.GetOutput(evaluatorOutput.OverallMetrics);
                if (overallMetrics == null)
                {
                    throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(RegressionEvaluator)} Evaluate.");
                }

                return RegressionMetrics.FromOverallMetrics(environment, overallMetrics);
            }
        }
Example #8
0
        /// <summary>
        /// <see href="https://onnx.ai/">ONNX</see> is an intermediate representation format
        /// for machine learning models. It is used to make models portable such that you can
        /// train a model using a toolkit and run it in another toolkit's runtime, for example,
        /// you can create a model using ML.NET, export it to an ONNX-ML model file,
        /// then load and run that ONNX-ML model in Windows ML, on a UWP Windows 10 app.
        ///
        /// This API converts an ML.NET model to ONNX-ML format by inspecting the transform pipeline
        /// from the end, checking for components that know how to save themselves as ONNX.
        /// The first item in the transform pipeline that does not know how to save itself
        /// as ONNX, is considered the "input" to the ONNX pipeline. (Ideally this would be the
        /// original loader itself, but this may not be possible if the user used unsavable
        /// transforms in defining the pipe.) All the columns in the source that are a type the
        /// ONNX knows how to deal with will be tracked. Intermediate transformations of the
        /// data appearing as new columns will appear in the output block of the ONNX, with names
        /// derived from the corresponding column names. The ONNX JSON will be serialized to a
        /// path defined through the Json option.
        ///
        /// This API supports the following arguments:
        /// <see cref="Onnx"/> indicates the file to write the ONNX protocol buffer file to. This is optional.
        /// <see cref="Json"/> indicates the file to write the JSON representation of the ONNX model. This is optional.
        /// <see cref="Name"/> indicates the name property in the ONNX model. If left unspecified, it will
        /// be the extension-less name of the file specified in the onnx indicates the protocol buffer file
        /// to write the ONNX representation to.
        /// <see cref="Domain"/> indicates the domain name of the model. ONNX uses reverse domain name space indicators.
        /// For example com.microsoft.cognitiveservices. This is a required field.
        /// <see cref="InputsToDrop"/> is a string array of input column names to omit from the input mapping.
        /// A common scenario might be to drop the label column, for instance, since it may not be practically
        /// useful for the pipeline. Note that any columns depending on these naturally cannot be saved.
        /// <see cref="OutputsToDrop"/> is similar, except for the output schema. Note that the pipeline handler
        /// is currently not intelligent enough to drop intermediate calculations that produce this value: this will
        /// merely omit that value from the actual output.
        ///
        /// Transforms that can be exported to ONNX
        /// 1. Concat
        /// 2. KeyToVector
        /// 3. NAReplace
        /// 4. Normalize
        /// 5. Term
        /// 6. Categorical
        ///
        /// Learners that can be exported to ONNX
        /// 1. FastTree
        /// 2. LightGBM
        /// 3. Logistic Regression
        ///
        /// See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.Tests/OnnxTests.cs"/>
        /// for an example on how to train a model and then convert that model to ONNX.
        /// </summary>
        /// <param name="model">Model that needs to be converted to ONNX format.</param>
        public void Convert(PredictionModel model)
        {
            using (var environment = new TlcEnvironment())
            {
                environment.CheckValue(model, nameof(model));

                // Build and run a one-node experiment whose only entry point is this
                // converter; running it performs the conversion as a side effect.
                Experiment experiment = environment.CreateExperiment();
                experiment.Add(this);
                experiment.Compile();
                // Feed the trained predictor model in as the converter's input.
                experiment.SetInput(Model, model.PredictorModel);
                experiment.Run();
            }
        }
Example #9
0
            public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
            {
                using (var env = new TlcEnvironment())
                {
                    // The trainer runs inside a subgraph that the OVA macro expands per class.
                    var subgraph = env.CreateExperiment();
                    subgraph.Add(_trainer);

                    var ova = new OneVersusAll();
                    if (previousStep != null)
                    {
                        var dataStep = previousStep as ILearningPipelineDataStep;
                        if (dataStep == null)
                        {
                            throw new InvalidOperationException($"{ nameof(OneVersusAll)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
                        }

                        _data                = dataStep.Data;
                        ova.TrainingData     = dataStep.Data;
                        ova.UseProbabilities = _useProbabilities;
                        ova.Nodes            = subgraph;
                    }

                    Output output = experiment.Add(ova);
                    return new OvaPipelineStep(output);
                }
            }
        /// <summary>
        /// Expands the train-test macro: rewires the supplied training subgraph to read the
        /// training data and emit the predictor model, adds scoring and evaluation nodes for
        /// the test set (and, optionally, the training set), and returns the combined node
        /// list as the macro output.
        /// </summary>
        /// <param name="env">The host environment used for node validation and error reporting.</param>
        /// <param name="input">The macro arguments, including the subgraph and data/model bindings.</param>
        /// <param name="node">The entry point node representing this macro invocation.</param>
        /// <returns>A macro output whose Nodes are all the expanded entry point nodes.</returns>
        public static CommonOutputs.MacroOutput <Output> TrainTest(
            IHostEnvironment env,
            Arguments input,
            EntryPointNode node)
        {
            // Create default pipeline ID if one not given.
            input.PipelineId = input.PipelineId ?? Guid.NewGuid().ToString("N");

            // Parse the subgraph.
            var subGraphRunContext = new RunContext(env);
            var subGraphNodes      = EntryPointNode.ValidateNodes(env, subGraphRunContext, input.Nodes, node.Catalog);

            // Change the subgraph to use the training data as input.
            var             varName = input.Inputs.Data.VarName;
            VariableBinding transformModelVarName = null;

            if (input.TransformModel != null)
            {
                transformModelVarName = node.GetInputVariable(nameof(input.TransformModel));
            }

            if (!subGraphRunContext.TryGetVariable(varName, out var dataVariable))
            {
                throw env.Except($"Invalid variable name '{varName}'.");
            }
            var trainingVar = node.GetInputVariable(nameof(input.TrainingData));

            foreach (var subGraphNode in subGraphNodes)
            {
                subGraphNode.RenameInputVariable(dataVariable.Name, trainingVar);
            }
            subGraphRunContext.RemoveVariable(dataVariable);

            // Change the subgraph to use the model variable as output.
            varName = input.Outputs.Model.VarName;
            if (!subGraphRunContext.TryGetVariable(varName, out dataVariable))
            {
                throw env.Except($"Invalid variable name '{varName}'.");
            }
            string outputVarName = node.GetOutputVariableName(nameof(Output.PredictorModel));

            foreach (var subGraphNode in subGraphNodes)
            {
                subGraphNode.RenameOutputVariable(dataVariable.Name, outputVarName);
            }
            subGraphRunContext.RemoveVariable(dataVariable);

            // Move the variables from the subcontext to the main context.
            node.Context.AddContextVariables(subGraphRunContext);

            // Change all the subgraph nodes to use the main context.
            foreach (var subGraphNode in subGraphNodes)
            {
                subGraphNode.SetContext(node.Context);
            }

            // Testing using test data set
            var testingVar = node.GetInputVariable(nameof(input.TestingData));
            var exp        = new Experiment(env);

            // Combine the predictor model with any potential transform model passed from the outer graph.
            if (transformModelVarName != null && transformModelVarName.VariableName != null)
            {
                var modelCombine = new ML.Transforms.TwoHeterogeneousModelCombiner
                {
                    TransformModel = { VarName = transformModelVarName.VariableName },
                    PredictorModel = { VarName = outputVarName }
                };

                var modelCombineOutput = exp.Add(modelCombine);
                outputVarName = modelCombineOutput.PredictorModel.VarName;
            }

            // Add the scoring node for testing.
            var scoreNode = new ML.Transforms.DatasetScorer
            {
                Data           = { VarName = testingVar.ToJson() },
                PredictorModel = { VarName = outputVarName }
            };
            var scoreNodeOutput = exp.Add(scoreNode);

            subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));

            // Do not double-add previous nodes.
            exp.Reset();

            // REVIEW: we need to extract the proper label column name here to pass to the evaluators.
            // This is where you would add code to do it.
            var settings = new MacroUtils.EvaluatorSettings
            {
                LabelColumn = DefaultColumnNames.Label
            };

            string outVariableName;

            if (input.IncludeTrainingMetrics)
            {
                // Add the scoring node for training.
                var scoreNodeTraining = new ML.Transforms.DatasetScorer
                {
                    Data           = { VarName = trainingVar.ToJson() },
                    PredictorModel = { VarName = outputVarName }
                };
                var scoreNodeTrainingOutput = exp.Add(scoreNodeTraining);
                subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));

                // Do not double-add previous nodes.
                exp.Reset();

                // Add the evaluator node for training.
                var evalInputOutputTraining = MacroUtils.GetEvaluatorInputOutput(input.Kind, settings);
                var evalNodeTraining        = evalInputOutputTraining.Item1;
                var evalOutputTraining      = evalInputOutputTraining.Item2;
                evalNodeTraining.Data.VarName = scoreNodeTrainingOutput.ScoredData.VarName;

                // Bind any evaluator outputs the caller mapped for the training metrics.
                if (node.OutputMap.TryGetValue(nameof(Output.TrainingWarnings), out outVariableName))
                {
                    evalOutputTraining.Warnings.VarName = outVariableName;
                }
                if (node.OutputMap.TryGetValue(nameof(Output.TrainingOverallMetrics), out outVariableName))
                {
                    evalOutputTraining.OverallMetrics.VarName = outVariableName;
                }
                if (node.OutputMap.TryGetValue(nameof(Output.TrainingPerInstanceMetrics), out outVariableName))
                {
                    evalOutputTraining.PerInstanceMetrics.VarName = outVariableName;
                }
                // The confusion matrix only exists for classification evaluators.
                if (node.OutputMap.TryGetValue(nameof(Output.TrainingConfusionMatrix), out outVariableName) &&
                    evalOutputTraining is CommonOutputs.IClassificationEvaluatorOutput eoTraining)
                {
                    eoTraining.ConfusionMatrix.VarName = outVariableName;
                }

                exp.Add(evalNodeTraining, evalOutputTraining);
                subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));
            }

            // Do not double-add previous nodes.
            exp.Reset();

            // Add the evaluator node for testing.
            var evalInputOutput = MacroUtils.GetEvaluatorInputOutput(input.Kind, settings);
            var evalNode        = evalInputOutput.Item1;
            var evalOutput      = evalInputOutput.Item2;

            evalNode.Data.VarName = scoreNodeOutput.ScoredData.VarName;

            // Bind any evaluator outputs the caller mapped for the test metrics.
            if (node.OutputMap.TryGetValue(nameof(Output.Warnings), out outVariableName))
            {
                evalOutput.Warnings.VarName = outVariableName;
            }
            if (node.OutputMap.TryGetValue(nameof(Output.OverallMetrics), out outVariableName))
            {
                evalOutput.OverallMetrics.VarName = outVariableName;
            }
            if (node.OutputMap.TryGetValue(nameof(Output.PerInstanceMetrics), out outVariableName))
            {
                evalOutput.PerInstanceMetrics.VarName = outVariableName;
            }
            // The confusion matrix only exists for classification evaluators.
            if (node.OutputMap.TryGetValue(nameof(Output.ConfusionMatrix), out outVariableName) &&
                evalOutput is CommonOutputs.IClassificationEvaluatorOutput eo)
            {
                eo.ConfusionMatrix.VarName = outVariableName;
            }

            exp.Add(evalNode, evalOutput);
            subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));

            // Marks as an atomic unit that can be run in
            // a distributed fashion.
            foreach (var subGraphNode in subGraphNodes)
            {
                subGraphNode.StageId = input.PipelineId;
            }

            return(new CommonOutputs.MacroOutput <Output>()
            {
                Nodes = subGraphNodes
            });
        }
        public static CommonOutputs.MacroOutput <Output> CrossValidate(
            IHostEnvironment env,
            Arguments input,
            EntryPointNode node)
        {
            env.CheckValue(input, nameof(input));

            // This will be the final resulting list of nodes that is returned from the macro.
            var subGraphNodes = new List <EntryPointNode>();

            //the input transform model
            VariableBinding transformModelVarName = null;

            if (input.TransformModel != null)
            {
                transformModelVarName = node.GetInputVariable(nameof(input.TransformModel));
            }

            // Split the input data into folds.
            var exp     = new Experiment(env);
            var cvSplit = new Models.CrossValidatorDatasetSplitter();

            cvSplit.Data.VarName         = node.GetInputVariable("Data").ToJson();
            cvSplit.NumFolds             = input.NumFolds;
            cvSplit.StratificationColumn = input.StratificationColumn;
            var cvSplitOutput = exp.Add(cvSplit);

            subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));

            var predModelVars           = new Var <IPredictorModel> [input.NumFolds];
            var transformModelVars      = new Var <ITransformModel> [input.NumFolds];
            var inputTransformModelVars = new Var <IPredictorModel> [input.NumFolds];
            var warningsVars            = new Var <IDataView> [input.NumFolds];
            var overallMetricsVars      = new Var <IDataView> [input.NumFolds];
            var instanceMetricsVars     = new Var <IDataView> [input.NumFolds];
            var confusionMatrixVars     = new Var <IDataView> [input.NumFolds];

            // Instantiate the subgraph for each fold.
            for (int k = 0; k < input.NumFolds; k++)
            {
                // Parse the nodes in input.Nodes into a temporary run context.
                var context = new RunContext(env);
                var graph   = EntryPointNode.ValidateNodes(env, context, input.Nodes, node.Catalog);

                // Rename all the variables such that they don't conflict with the ones in the outer run context.
                var mapping = new Dictionary <string, string>();
                foreach (var entryPointNode in graph)
                {
                    entryPointNode.RenameAllVariables(mapping);
                }

                // Instantiate a TrainTest entry point for this fold.
                var args = new TrainTestMacro.Arguments
                {
                    Nodes          = new JArray(graph.Select(n => n.ToJson()).ToArray()),
                    TransformModel = null,
                    LabelColumn    = input.LabelColumn,
                    GroupColumn    = input.GroupColumn,
                    WeightColumn   = input.WeightColumn
                };

                if (transformModelVarName != null)
                {
                    args.TransformModel = new Var <ITransformModel> {
                        VarName = transformModelVarName.VariableName
                    }
                }
                ;

                args.Inputs.Data = new Var <IDataView>
                {
                    VarName = mapping[input.Inputs.Data.VarName]
                };

                if (input.Outputs.PredictorModel != null && mapping.ContainsKey(input.Outputs.PredictorModel.VarName))
                {
                    args.Outputs.PredictorModel = new Var <IPredictorModel>
                    {
                        VarName = mapping[input.Outputs.PredictorModel.VarName]
                    };
                }
                else
                {
                    args.Outputs.PredictorModel = null;
                }

                if (input.Outputs.TransformModel != null && mapping.ContainsKey(input.Outputs.TransformModel.VarName))
                {
                    args.Outputs.TransformModel = new Var <ITransformModel>
                    {
                        VarName = mapping[input.Outputs.TransformModel.VarName]
                    };
                }
                else
                {
                    args.Outputs.TransformModel = null;
                }

                // Set train/test trainer kind to match.
                args.Kind = input.Kind;

                // Set the input bindings for the TrainTest entry point.
                var inputBindingMap = new Dictionary <string, List <ParameterBinding> >();
                var inputMap        = new Dictionary <ParameterBinding, VariableBinding>();
                var trainingData    = new SimpleParameterBinding(nameof(args.TrainingData));
                inputBindingMap.Add(nameof(args.TrainingData), new List <ParameterBinding> {
                    trainingData
                });
                inputMap.Add(trainingData, new ArrayIndexVariableBinding(cvSplitOutput.TrainData.VarName, k));
                var testingData = new SimpleParameterBinding(nameof(args.TestingData));
                inputBindingMap.Add(nameof(args.TestingData), new List <ParameterBinding> {
                    testingData
                });
                inputMap.Add(testingData, new ArrayIndexVariableBinding(cvSplitOutput.TestData.VarName, k));
                var outputMap         = new Dictionary <string, string>();
                var transformModelVar = new Var <ITransformModel>();
                var predModelVar      = new Var <IPredictorModel>();
                if (input.Outputs.PredictorModel == null)
                {
                    outputMap.Add(nameof(TrainTestMacro.Output.TransformModel), transformModelVar.VarName);
                    transformModelVars[k] = transformModelVar;
                    ML.Transforms.ModelCombiner.Output modelCombineOutput = null;
                    if (transformModelVarName != null && transformModelVarName.VariableName != null)
                    {
                        var modelCombine = new ML.Transforms.ModelCombiner
                        {
                            Models = new ArrayVar <ITransformModel>(
                                new Var <ITransformModel>[] {
                                new Var <ITransformModel> {
                                    VarName = transformModelVarName.VariableName
                                },
                                transformModelVar
                            }
                                )
                        };

                        exp.Reset();
                        modelCombineOutput = exp.Add(modelCombine);
                        subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));
                        transformModelVars[k] = modelCombineOutput.OutputModel;
                    }
                }
                else
                {
                    outputMap.Add(nameof(TrainTestMacro.Output.PredictorModel), predModelVar.VarName);
                    predModelVars[k] = predModelVar;
                    ML.Transforms.TwoHeterogeneousModelCombiner.Output modelCombineOutput = null;
                    if (transformModelVarName != null && transformModelVarName.VariableName != null)
                    {
                        var modelCombine = new ML.Transforms.TwoHeterogeneousModelCombiner
                        {
                            TransformModel = { VarName = transformModelVarName.VariableName },
                            PredictorModel = predModelVar
                        };

                        exp.Reset();
                        modelCombineOutput = exp.Add(modelCombine);
                        subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));
                        predModelVars[k] = modelCombineOutput.PredictorModel;
                    }
                }

                var warningVar = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestMacro.Output.Warnings), warningVar.VarName);
                warningsVars[k] = warningVar;
                var overallMetric = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestMacro.Output.OverallMetrics), overallMetric.VarName);
                overallMetricsVars[k] = overallMetric;
                var instanceMetric = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestMacro.Output.PerInstanceMetrics), instanceMetric.VarName);
                instanceMetricsVars[k] = instanceMetric;
                var confusionMatrix = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestMacro.Output.ConfusionMatrix), confusionMatrix.VarName);
                confusionMatrixVars[k] = confusionMatrix;
                const string trainTestEvaluatorMacroEntryPoint = "Models.TrainTestEvaluator";
                subGraphNodes.Add(EntryPointNode.Create(env, trainTestEvaluatorMacroEntryPoint, args, node.Catalog, node.Context, inputBindingMap, inputMap, outputMap));
            }

            exp.Reset();

            // Convert predictors from all folds into an array of predictors.

            if (input.Outputs.PredictorModel == null)
            {
                var outModels = new ML.Data.TransformModelArrayConverter
                {
                    TransformModel = new ArrayVar <ITransformModel>(transformModelVars)
                };
                var outModelsOutput = new ML.Data.TransformModelArrayConverter.Output();
                outModelsOutput.OutputModel.VarName = node.GetOutputVariableName(nameof(Output.TransformModel));
                exp.Add(outModels, outModelsOutput);
            }
            else
            {
                var outModels = new ML.Data.PredictorModelArrayConverter
                {
                    Model = new ArrayVar <IPredictorModel>(predModelVars)
                };
                var outModelsOutput = new ML.Data.PredictorModelArrayConverter.Output();
                outModelsOutput.OutputModel.VarName = node.GetOutputVariableName(nameof(Output.PredictorModel));
                exp.Add(outModels, outModelsOutput);
            }

            // Convert warnings data views from all folds into an array of data views.
            var warnings = new ML.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(warningsVars)
            };
            var warningsOutput = new ML.Data.IDataViewArrayConverter.Output();

            exp.Add(warnings, warningsOutput);

            // Convert overall metrics data views from all folds into an array of data views.
            var overallMetrics = new ML.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(overallMetricsVars)
            };
            var overallMetricsOutput = new ML.Data.IDataViewArrayConverter.Output();

            exp.Add(overallMetrics, overallMetricsOutput);

            // Convert per instance data views from all folds into an array of data views.
            var instanceMetrics = new ML.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(instanceMetricsVars)
            };
            var instanceMetricsOutput = new ML.Data.IDataViewArrayConverter.Output();

            exp.Add(instanceMetrics, instanceMetricsOutput);

            ML.Data.IDataViewArrayConverter.Output confusionMatricesOutput = null;
            if (input.Kind == MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer ||
                input.Kind == MacroUtils.TrainerKinds.SignatureMultiClassClassifierTrainer)
            {
                // Convert confusion matrix data views from all folds into an array of data views.
                var confusionMatrices = new ML.Data.IDataViewArrayConverter
                {
                    Data = new ArrayVar <IDataView>(confusionMatrixVars)
                };
                confusionMatricesOutput = new ML.Data.IDataViewArrayConverter.Output();
                exp.Add(confusionMatrices, confusionMatricesOutput);
            }

            var combineArgs = new CombineMetricsInput();

            combineArgs.Kind         = input.Kind;
            combineArgs.LabelColumn  = input.LabelColumn;
            combineArgs.WeightColumn = input.WeightColumn;
            combineArgs.GroupColumn  = input.GroupColumn;

            // Set the input bindings for the CombineMetrics entry point.
            var combineInputBindingMap = new Dictionary <string, List <ParameterBinding> >();
            var combineInputMap        = new Dictionary <ParameterBinding, VariableBinding>();
            var overallArray           = new SimpleParameterBinding(nameof(combineArgs.OverallMetrics));

            combineInputBindingMap.Add(nameof(combineArgs.OverallMetrics), new List <ParameterBinding> {
                overallArray
            });
            combineInputMap.Add(overallArray, new SimpleVariableBinding(overallMetricsOutput.OutputData.VarName));
            var combinePerInstArray = new SimpleParameterBinding(nameof(combineArgs.PerInstanceMetrics));

            combineInputBindingMap.Add(nameof(combineArgs.PerInstanceMetrics), new List <ParameterBinding> {
                combinePerInstArray
            });
            combineInputMap.Add(combinePerInstArray, new SimpleVariableBinding(instanceMetricsOutput.OutputData.VarName));
            if (confusionMatricesOutput != null)
            {
                var combineConfArray = new SimpleParameterBinding(nameof(combineArgs.ConfusionMatrix));
                combineInputBindingMap.Add(nameof(combineArgs.ConfusionMatrix), new List <ParameterBinding> {
                    combineConfArray
                });
                combineInputMap.Add(combineConfArray, new SimpleVariableBinding(confusionMatricesOutput.OutputData.VarName));
            }

            var combineOutputMap  = new Dictionary <string, string>();
            var combineWarningVar = new Var <IDataView>();

            combineWarningVar.VarName = node.GetOutputVariableName(nameof(Output.Warnings));
            combineOutputMap.Add(nameof(Output.Warnings), combineWarningVar.VarName);
            var combineOverallMetric = new Var <IDataView>();

            combineOverallMetric.VarName = node.GetOutputVariableName(nameof(Output.OverallMetrics));
            combineOutputMap.Add(nameof(Output.OverallMetrics), combineOverallMetric.VarName);
            var combineInstanceMetric = new Var <IDataView>();

            combineInstanceMetric.VarName = node.GetOutputVariableName(nameof(Output.PerInstanceMetrics));
            combineOutputMap.Add(nameof(Output.PerInstanceMetrics), combineInstanceMetric.VarName);
            if (confusionMatricesOutput != null)
            {
                var combineConfusionMatrix = new Var <IDataView>();
                combineConfusionMatrix.VarName = node.GetOutputVariableName(nameof(Output.ConfusionMatrix));
                combineOutputMap.Add(nameof(TrainTestMacro.Output.ConfusionMatrix), combineConfusionMatrix.VarName);
            }
            subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes(), node.Catalog));
            subGraphNodes.Add(EntryPointNode.Create(env, "Models.CrossValidationResultsCombiner", combineArgs, node.Catalog, node.Context, combineInputBindingMap, combineInputMap, combineOutputMap));
            return(new CommonOutputs.MacroOutput <Output>()
            {
                Nodes = subGraphNodes
            });
        }
Beispiel #12
0
        /// <summary>
        /// Appends the wrapped entry point to <paramref name="experiment"/> and pairs the
        /// entry point's training-data input with the predictor model output produced by
        /// the experiment.
        /// </summary>
        /// <param name="experiment">The experiment the entry point is added to.</param>
        /// <returns>
        /// A <see cref="DataAndModel{T}"/> holding the entry point's training data and the
        /// resulting predictor model variable.
        /// </returns>
        public DataAndModel <IPredictorModel> Add(Experiment experiment)
        {
            var entryPointOutput = experiment.Add(_entryPointObj);
            var trainingData     = _entryPointObj.TrainingData;

            return(new DataAndModel <IPredictorModel>(trainingData, entryPointOutput.PredictorModel));
        }
Beispiel #13
0
        /// <summary>
        /// Expands the binary-classification cross-validation macro into a graph of entry
        /// point nodes: the input data is split into <c>NumFolds</c> folds, one
        /// "Models.TrainTestBinaryEvaluator" sub-graph is instantiated per fold (with all
        /// inner variables renamed so folds do not collide), and the per-fold outputs
        /// (predictor models, warnings, overall metrics, per-instance metrics and
        /// confusion matrices) are gathered into array variables bound to this macro's
        /// declared outputs.
        /// </summary>
        /// <param name="env">Host environment used to build experiments and validate nodes.</param>
        /// <param name="input">Macro arguments: fold count, stratification column and the inner pipeline nodes.</param>
        /// <param name="node">The macro's own entry point node; supplies input/output variable names and the run context.</param>
        /// <returns>The complete list of expanded entry point nodes wrapped in a MacroOutput.</returns>
        public static CommonOutputs.MacroOutput <Output> CrossValidateBinary(
            IHostEnvironment env,
            Arguments input,
            EntryPointNode node)
        {
            // This will be the final resulting list of nodes that is returned from the macro.
            var subGraphNodes = new List <EntryPointNode>();

            // Split the input data into folds.
            var exp     = new Experiment(env);
            var cvSplit = new Legacy.Models.CrossValidatorDatasetSplitter();

            // NOTE(review): the split's data input is set from ToJson() of the input
            // variable, while other call sites in this file use .VariableName — confirm
            // this is the serialization the splitter expects.
            cvSplit.Data.VarName         = node.GetInputVariable("Data").ToJson();
            cvSplit.NumFolds             = input.NumFolds;
            cvSplit.StratificationColumn = input.StratificationColumn;
            var cvSplitOutput = exp.Add(cvSplit);

            subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes()));

            // One variable slot per fold for each per-fold output of the TrainTest sub-graph.
            var predModelVars       = new Var <PredictorModel> [input.NumFolds];
            var warningsVars        = new Var <IDataView> [input.NumFolds];
            var overallMetricsVars  = new Var <IDataView> [input.NumFolds];
            var instanceMetricsVars = new Var <IDataView> [input.NumFolds];
            var confusionMatrixVars = new Var <IDataView> [input.NumFolds];

            // Instantiate the subgraph for each fold.
            for (int k = 0; k < input.NumFolds; k++)
            {
                // Parse the nodes in input.Nodes into a temporary run context.
                var context = new RunContext(env);
                var graph   = EntryPointNode.ValidateNodes(env, context, input.Nodes);

                // Rename all the variables such that they don't conflict with the ones in the outer run context.
                var mapping = new Dictionary <string, string>();
                foreach (var entryPointNode in graph)
                {
                    entryPointNode.RenameAllVariables(mapping);
                }

                // Instantiate a TrainTest entry point for this fold.
                var args = new TrainTestBinaryMacro.Arguments
                {
                    Nodes = new JArray(graph.Select(n => n.ToJson()).ToArray())
                };
                // Wire the fold's (renamed) data input and model output into the TrainTest arguments.
                args.Inputs.Data = new Var <IDataView>
                {
                    VarName = mapping[input.Inputs.Data.VarName]
                };
                args.Outputs.Model = new Var <PredictorModel>
                {
                    VarName = mapping[input.Outputs.Model.VarName]
                };

                // Set the input bindings for the TrainTest entry point.
                var inputBindingMap = new Dictionary <string, List <ParameterBinding> >();
                var inputMap        = new Dictionary <ParameterBinding, VariableBinding>();
                var trainingData    = new SimpleParameterBinding(nameof(args.TrainingData));
                inputBindingMap.Add(nameof(args.TrainingData), new List <ParameterBinding> {
                    trainingData
                });
                // Bind the training/testing inputs to the k-th element of the CV split outputs.
                inputMap.Add(trainingData, new ArrayIndexVariableBinding(cvSplitOutput.TrainData.VarName, k));
                var testingData = new SimpleParameterBinding(nameof(args.TestingData));
                inputBindingMap.Add(nameof(args.TestingData), new List <ParameterBinding> {
                    testingData
                });
                inputMap.Add(testingData, new ArrayIndexVariableBinding(cvSplitOutput.TestData.VarName, k));
                // Map each TrainTest output to a fresh variable and remember it for this fold.
                var outputMap    = new Dictionary <string, string>();
                var predModelVar = new Var <PredictorModel>();
                outputMap.Add(nameof(TrainTestBinaryMacro.Output.PredictorModel), predModelVar.VarName);
                predModelVars[k] = predModelVar;
                var warningVar = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestBinaryMacro.Output.Warnings), warningVar.VarName);
                warningsVars[k] = warningVar;
                var overallMetric = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestBinaryMacro.Output.OverallMetrics), overallMetric.VarName);
                overallMetricsVars[k] = overallMetric;
                var instanceMetric = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestBinaryMacro.Output.PerInstanceMetrics), instanceMetric.VarName);
                instanceMetricsVars[k] = instanceMetric;
                var confusionMatrix = new Var <IDataView>();
                outputMap.Add(nameof(TrainTestBinaryMacro.Output.ConfusionMatrix), confusionMatrix.VarName);
                confusionMatrixVars[k] = confusionMatrix;
                subGraphNodes.Add(EntryPointNode.Create(env, "Models.TrainTestBinaryEvaluator", args, node.Context, inputBindingMap, inputMap, outputMap));
            }

            // Reuse the experiment to build the converters that gather per-fold outputs into arrays.
            exp.Reset();

            // Convert the per-fold predictor models into an array bound to this macro's PredictorModel output.
            var outModels = new Legacy.Data.PredictorModelArrayConverter
            {
                Model = new ArrayVar <PredictorModel>(predModelVars)
            };
            var outModelsOutput = new Legacy.Data.PredictorModelArrayConverter.Output();

            outModelsOutput.OutputModel.VarName = node.GetOutputVariableName(nameof(Output.PredictorModel));
            exp.Add(outModels, outModelsOutput);

            // Convert the per-fold warnings data views into an array bound to the Warnings output.
            var warnings = new Legacy.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(warningsVars)
            };
            var warningsOutput = new Legacy.Data.IDataViewArrayConverter.Output();

            warningsOutput.OutputData.VarName = node.GetOutputVariableName(nameof(Output.Warnings));
            exp.Add(warnings, warningsOutput);

            // Convert the per-fold overall metrics into an array bound to the OverallMetrics output.
            var overallMetrics = new Legacy.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(overallMetricsVars)
            };
            var overallMetricsOutput = new Legacy.Data.IDataViewArrayConverter.Output();

            overallMetricsOutput.OutputData.VarName = node.GetOutputVariableName(nameof(Output.OverallMetrics));
            exp.Add(overallMetrics, overallMetricsOutput);

            // Convert the per-fold per-instance metrics into an array bound to the PerInstanceMetrics output.
            var instanceMetrics = new Legacy.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(instanceMetricsVars)
            };
            var instanceMetricsOutput = new Legacy.Data.IDataViewArrayConverter.Output();

            instanceMetricsOutput.OutputData.VarName = node.GetOutputVariableName(nameof(Output.PerInstanceMetrics));
            exp.Add(instanceMetrics, instanceMetricsOutput);

            // Convert the per-fold confusion matrices into an array bound to the ConfusionMatrix output.
            var confusionMatrices = new Legacy.Data.IDataViewArrayConverter
            {
                Data = new ArrayVar <IDataView>(confusionMatrixVars)
            };
            var confusionMatricesOutput = new Legacy.Data.IDataViewArrayConverter.Output();

            confusionMatricesOutput.OutputData.VarName = node.GetOutputVariableName(nameof(Output.ConfusionMatrix));
            exp.Add(confusionMatrices, confusionMatricesOutput);

            // Validate the converter nodes and append them to the expanded graph.
            subGraphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes()));

            return(new CommonOutputs.MacroOutput <Output>()
            {
                Nodes = subGraphNodes
            });
        }
        /// <summary>
        /// Performs one expansion step of the AutoML pipeline-sweeping macro. Each call:
        /// (1) creates the AutoML state from <c>StateArguments</c> if none exists,
        /// (2) harvests performance summaries for the previous batch of candidate
        /// pipelines from the run context, (3) requests the next batch of candidates and
        /// emits a train/test sub-graph per candidate, and (4) appends a recursive
        /// PipelineSweeper node so the following batch runs after these complete. When no
        /// candidates remain, it instead emits a SweepResultExtractor node and terminates
        /// the recursion.
        /// </summary>
        /// <param name="env">Host environment used for argument checks and experiment construction.</param>
        /// <param name="input">Macro arguments: batch size, training/testing data, and the AutoML state (or arguments to create one).</param>
        /// <param name="node">The macro's entry point node; supplies variable mappings and the run context.</param>
        /// <returns>The entry point nodes produced by this expansion step.</returns>
        public static CommonOutputs.MacroOutput <Output> PipelineSweep(
            IHostEnvironment env,
            Arguments input,
            EntryPointNode node)
        {
            env.Check(input.StateArguments != null || input.State is AutoInference.AutoMlMlState,
                      "Must have a valid AutoML State, or pass arguments to create one.");
            env.Check(input.BatchSize > 0, "Batch size must be > 0.");

            // Get the user-defined column roles (if any)
            var dataRoles = GetDataRoles(env, input);

            // If no current state, create object and set data.
            if (input.State == null)
            {
                input.State = input.StateArguments?.CreateComponent(env);

                if (input.State is AutoInference.AutoMlMlState inState)
                {
                    inState.SetTrainTestData(input.TrainingData, input.TestingData);
                }
                else
                {
                    throw env.Except($"Incompatible type. Expecting type {typeof(AutoInference.AutoMlMlState)}, received type {input.State?.GetType()}.");
                }

                // Register the newly created state as an input variable on the context
                // so subsequent (recursive) macro invocations can reference it.
                var result = node.AddNewVariable("State", input.State);
                node.Context.AddInputVariable(result.Item2, typeof(IMlState));
            }
            var autoMlState = (AutoInference.AutoMlMlState)input.State;

            // The indicators are just so the macro knows those pipelines need to
            // be run before performing next expansion. If we add them as inputs
            // to the next iteration, the next iteration cannot run until they have
            // their values set. Thus, indicators are needed.
            var pipelineIndicators = new List <Var <IDataView> >();

            var expNodes = new List <EntryPointNode>();

            // Keep versions of the training and testing var names
            var training = new Var <IDataView> {
                VarName = node.GetInputVariable("TrainingData").VariableName
            };
            var testing = new Var <IDataView> {
                VarName = node.GetInputVariable("TestingData").VariableName
            };
            var amlsVarObj =
                new Var <IMlState>()
            {
                VarName = node.GetInputVariable(nameof(input.State)).VariableName
            };

            // Make sure search space is defined. If not, infer,
            // with default number of transform levels.
            if (!autoMlState.IsSearchSpaceDefined())
            {
                autoMlState.InferSearchSpace(numTransformLevels: 1, dataRoles);
            }

            // Extract performance summaries and assign to previous candidate pipelines.
            foreach (var pipeline in autoMlState.BatchCandidates)
            {
                // A candidate counts as evaluated only once BOTH its test and training
                // overall-metric variables are present in the run context.
                if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId), out var v) &&
                    node.Context.TryGetVariable(AutoMlUtils.GenerateOverallTrainingMetricVarName(pipeline.UniqueId), out var v2))
                {
                    pipeline.PerformanceSummary = AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name, (IDataView)v2.Value);
                    autoMlState.AddEvaluated(pipeline);
                }
            }

            // Resolve the variable names this node exposes for its Results / State outputs.
            node.OutputMap.TryGetValue("Results", out string outDvName);
            var outDvVar = new Var <IDataView>()
            {
                VarName = outDvName
            };

            node.OutputMap.TryGetValue("State", out string outStateName);
            var outStateVar = new Var <IMlState>()
            {
                VarName = outStateName
            };

            // Get next set of candidates.
            var candidatePipelines = autoMlState.GetNextCandidates(input.BatchSize);

            // Check if termination condition was met, i.e. no more candidates were returned.
            // If so, end expansion and add a node to extract the sweep result.
            if (candidatePipelines == null || candidatePipelines.Length == 0)
            {
                // Add a node to extract the sweep result.
                var resultSubgraph = new Experiment(env);
                var resultNode     = new Microsoft.ML.Legacy.Models.SweepResultExtractor()
                {
                    State = amlsVarObj
                };
                var resultOutput = new Legacy.Models.SweepResultExtractor.Output()
                {
                    State = outStateVar, Results = outDvVar
                };
                resultSubgraph.Add(resultNode, resultOutput);
                var resultSubgraphNodes = EntryPointNode.ValidateNodes(env, node.Context, resultSubgraph.GetNodes());
                expNodes.AddRange(resultSubgraphNodes);
                return(new CommonOutputs.MacroOutput <Output>()
                {
                    Nodes = expNodes
                });
            }

            // Prep all returned candidates
            foreach (var p in candidatePipelines)
            {
                // Add train test experiments to current graph for candidate pipeline
                var subgraph        = new Experiment(env);
                var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph, true);

                // Change variable name to reference pipeline ID in output map, context and entrypoint output.
                var uniqueName         = ExperimentUtils.GenerateOverallMetricVarName(p.UniqueId);
                var uniqueNameTraining = AutoMlUtils.GenerateOverallTrainingMetricVarName(p.UniqueId);
                var sgNode             = EntryPointNode.ValidateNodes(env, node.Context,
                                                                      new JArray(subgraph.GetNodes().Last())).Last();
                sgNode.RenameOutputVariable(trainTestOutput.OverallMetrics.VarName, uniqueName, cascadeChanges: true);
                sgNode.RenameOutputVariable(trainTestOutput.TrainingOverallMetrics.VarName, uniqueNameTraining, cascadeChanges: true);
                trainTestOutput.OverallMetrics.VarName         = uniqueName;
                trainTestOutput.TrainingOverallMetrics.VarName = uniqueNameTraining;
                expNodes.Add(sgNode);

                // Store indicators, to pass to next iteration of macro.
                pipelineIndicators.Add(trainTestOutput.OverallMetrics);
            }

            // Add recursive macro node
            var macroSubgraph = new Experiment(env);
            var macroNode     = new Legacy.Models.PipelineSweeper()
            {
                BatchSize        = input.BatchSize,
                CandidateOutputs = new ArrayVar <IDataView>(pipelineIndicators.ToArray()),
                TrainingData     = training,
                TestingData      = testing,
                State            = amlsVarObj
            };
            var output = new Legacy.Models.PipelineSweeper.Output()
            {
                Results = outDvVar, State = outStateVar
            };

            macroSubgraph.Add(macroNode, output);

            var subgraphNodes = EntryPointNode.ValidateNodes(env, node.Context, macroSubgraph.GetNodes());

            expNodes.AddRange(subgraphNodes);

            return(new CommonOutputs.MacroOutput <Output>()
            {
                Nodes = expNodes
            });
        }
Beispiel #15
0
        private static ITransformModel CreateKcHousePricePredictorModel(string dataPath)
        {
            Experiment experiment = s_environment.CreateExperiment();
            var        importData = new Legacy.Data.TextLoader(dataPath)
            {
                Arguments = new TextLoaderArguments
                {
                    Separator = new[] { ',' },
                    HasHeader = true,
                    Column    = new[]
                    {
                        new TextLoaderColumn()
                        {
                            Name   = "Id",
                            Source = new [] { new TextLoaderRange(0) },
                            Type   = Legacy.Data.DataKind.Text
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Date",
                            Source = new [] { new TextLoaderRange(1) },
                            Type   = Legacy.Data.DataKind.Text
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Label",
                            Source = new [] { new TextLoaderRange(2) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Bedrooms",
                            Source = new [] { new TextLoaderRange(3) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Bathrooms",
                            Source = new [] { new TextLoaderRange(4) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLiving",
                            Source = new [] { new TextLoaderRange(5) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLot",
                            Source = new [] { new TextLoaderRange(6) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Floors",
                            Source = new [] { new TextLoaderRange(7) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Waterfront",
                            Source = new [] { new TextLoaderRange(8) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "View",
                            Source = new [] { new TextLoaderRange(9) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Condition",
                            Source = new [] { new TextLoaderRange(10) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Grade",
                            Source = new [] { new TextLoaderRange(11) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftAbove",
                            Source = new [] { new TextLoaderRange(12) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftBasement",
                            Source = new [] { new TextLoaderRange(13) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "YearBuilt",
                            Source = new [] { new TextLoaderRange(14) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "YearRenovated",
                            Source = new [] { new TextLoaderRange(15) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Zipcode",
                            Source = new [] { new TextLoaderRange(16) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Lat",
                            Source = new [] { new TextLoaderRange(17) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Long",
                            Source = new [] { new TextLoaderRange(18) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLiving15",
                            Source = new [] { new TextLoaderRange(19) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLot15",
                            Source = new [] { new TextLoaderRange(20) },
                            Type   = Legacy.Data.DataKind.Num
                        },
                    }
                }

                //new Data.CustomTextLoader();
                // importData.CustomSchema = dataSchema;
                //
            };

            Legacy.Data.TextLoader.Output imported = experiment.Add(importData);
            var numericalConcatenate = new Legacy.Transforms.ColumnConcatenator();

            numericalConcatenate.Data = imported.Data;
            numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15");
            Legacy.Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate);

            var categoryConcatenate = new Legacy.Transforms.ColumnConcatenator();

            categoryConcatenate.Data = numericalConcatenated.OutputData;
            categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode");
            Legacy.Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate);

            var categorize = new Legacy.Transforms.CategoricalOneHotVectorizer();

            categorize.AddColumn("CategoryFeatures");
            categorize.Data = categoryConcatenated.OutputData;
            Legacy.Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize);

            var featuresConcatenate = new Legacy.Transforms.ColumnConcatenator();

            featuresConcatenate.Data = categorized.OutputData;
            featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures");
            Legacy.Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate);

            var learner = new Legacy.Trainers.StochasticDualCoordinateAscentRegressor();

            learner.TrainingData = featuresConcatenated.OutputData;
            learner.NumThreads   = 1;
            Legacy.Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner);

            var combineModels = new Legacy.Transforms.ManyHeterogeneousModelCombiner();

            combineModels.TransformModels = new ArrayVar <ITransformModel>(numericalConcatenated.Model, categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model);
            combineModels.PredictorModel  = learnerOutput.PredictorModel;
            Legacy.Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels);

            var scorer = new Legacy.Transforms.Scorer
            {
                PredictorModel = combinedModels.PredictorModel
            };

            var scorerOutput = experiment.Add(scorer);

            experiment.Compile();
            experiment.SetInput(importData.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false));
            experiment.Run();

            return(experiment.GetOutput(scorerOutput.ScoringTransform));
        }
        /// <summary>
        /// Expands the train/test binary-classification macro: rewires the caller-supplied
        /// training subgraph onto this node's training data, scores the testing data with
        /// the resulting model, and evaluates the scored output.
        /// </summary>
        /// <param name="env">Host environment used to validate nodes and report errors.</param>
        /// <param name="input">Macro arguments, including the subgraph nodes and data/model variable names.</param>
        /// <param name="node">The entry-point node whose context receives the expanded subgraph.</param>
        /// <returns>The full list of entry-point nodes produced by the macro expansion.</returns>
        public static CommonOutputs.MacroOutput <Output> TrainTestBinary(
            IHostEnvironment env,
            Arguments input,
            EntryPointNode node)
        {
            // Validate the user-supplied subgraph inside a scratch run context.
            var subContext = new RunContext(env);
            var graphNodes = EntryPointNode.ValidateNodes(env, subContext, input.Nodes);

            // Redirect the subgraph's data input to this macro's training data.
            var dataVarName = input.Inputs.Data.VarName;
            if (!subContext.TryGetVariable(dataVarName, out EntryPointVariable dataVar))
            {
                throw env.Except($"Invalid variable name '{dataVarName}'.");
            }
            var trainingVar = node.GetInputVariable("TrainingData");
            foreach (var graphNode in graphNodes)
            {
                graphNode.RenameInputVariable(dataVar.Name, trainingVar);
            }
            subContext.RemoveVariable(dataVar);

            // Redirect the subgraph's model output to this macro's predictor-model output.
            var modelVarName = input.Outputs.Model.VarName;
            if (!subContext.TryGetVariable(modelVarName, out EntryPointVariable modelVar))
            {
                throw env.Except($"Invalid variable name '{modelVarName}'.");
            }
            var outputVarName = node.GetOutputVariableName("PredictorModel");
            foreach (var graphNode in graphNodes)
            {
                graphNode.RenameOutputVariable(modelVar.Name, outputVarName);
            }
            subContext.RemoveVariable(modelVar);

            // Fold the scratch context's variables into the main context,
            // then repoint every subgraph node at that context.
            node.Context.AddContextVariables(subContext);
            foreach (var graphNode in graphNodes)
            {
                graphNode.SetContext(node.Context);
            }

            // Score the testing data using the model produced by the subgraph.
            var testingVar = node.GetInputVariable("TestingData");
            var exp        = new Experiment(env);
            var scoreNode  = new Legacy.Transforms.DatasetScorer();

            scoreNode.Data.VarName           = testingVar.ToJson();
            scoreNode.PredictorModel.VarName = outputVarName;
            var scoreNodeOutput = exp.Add(scoreNode);

            graphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes()));

            // Evaluate the scored data; map each requested evaluator output
            // back onto the macro's declared outputs.
            exp.Reset();
            var evalNode = new Legacy.Models.BinaryClassificationEvaluator();

            evalNode.Data.VarName = scoreNodeOutput.ScoredData.VarName;
            var evalOutput = new Legacy.Models.BinaryClassificationEvaluator.Output();

            if (node.OutputMap.TryGetValue("Warnings", out string mappedName))
            {
                evalOutput.Warnings.VarName = mappedName;
            }
            if (node.OutputMap.TryGetValue("OverallMetrics", out mappedName))
            {
                evalOutput.OverallMetrics.VarName = mappedName;
            }
            if (node.OutputMap.TryGetValue("PerInstanceMetrics", out mappedName))
            {
                evalOutput.PerInstanceMetrics.VarName = mappedName;
            }
            if (node.OutputMap.TryGetValue("ConfusionMatrix", out mappedName))
            {
                evalOutput.ConfusionMatrix.VarName = mappedName;
            }
            exp.Add(evalNode, evalOutput);
            graphNodes.AddRange(EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes()));

            // Tag every node with one fresh stage id so the whole expansion is grouped together.
            var stageId = Guid.NewGuid().ToString("N");

            foreach (var graphNode in graphNodes)
            {
                graphNode.StageId = stageId;
            }

            return new CommonOutputs.MacroOutput <Output>()
            {
                Nodes = graphNodes
            };
        }
        /// <summary>
        /// Builds the per-class training subgraph for one-versus-all: remaps the label
        /// column to a binary indicator for class <paramref name="k"/>, then wires the
        /// caller-supplied training subgraph to consume the remapped data.
        /// </summary>
        /// <param name="env">Host environment used to validate nodes and report errors.</param>
        /// <param name="k">The class index this binary subproblem targets.</param>
        /// <param name="label">Name of the label column to remap (used as both source and destination).</param>
        /// <param name="input">Macro arguments containing the subgraph nodes to instantiate.</param>
        /// <param name="node">The entry-point node whose context the subgraph is merged into.</param>
        /// <returns>
        /// The entry-point nodes making up this class's subgraph, paired with the variable
        /// that will hold its output predictor model.
        /// </returns>
        private static Tuple <List <EntryPointNode>, Var <IPredictorModel> > ProcessClass(IHostEnvironment env, int k, string label, Arguments input, EntryPointNode node)
        {
            var macroNodes = new List <EntryPointNode>();

            // Convert label into T,F based on k.
            var remapper = new Legacy.Transforms.LabelIndicator
            {
                ClassIndex = k,
                Column     = new[]
                {
                    new Legacy.Transforms.LabelIndicatorTransformColumn
                    {
                        ClassIndex = k,
                        Name       = label,
                        Source     = label
                    }
                },
                Data = { VarName = node.GetInputVariable(nameof(input.TrainingData)).ToJson() }
            };
            var exp             = new Experiment(env);
            var remapperOutNode = exp.Add(remapper);
            var subNodes        = EntryPointNode.ValidateNodes(env, node.Context, exp.GetNodes());

            macroNodes.AddRange(subNodes);

            // Parse the nodes in input.Nodes into a temporary run context.
            var subGraphRunContext = new RunContext(env);
            var subGraphNodes      = EntryPointNode.ValidateNodes(env, subGraphRunContext, input.Nodes);

            // Rename all the variables such that they don't conflict with the ones in the outer run context.
            var  mapping     = new Dictionary <string, string>();
            bool foundOutput = false;
            Var <IPredictorModel> predModelVar = null;

            foreach (var entryPointNode in subGraphNodes)
            {
                // Rename variables in input/output maps, and in subgraph context.
                // NOTE(review): 'mapping' accumulates across iterations, so renames from
                // earlier nodes are re-applied to the context on every pass — presumably
                // idempotent; confirm against RenameContextVariable's behavior.
                entryPointNode.RenameAllVariables(mapping);
                foreach (var kvp in mapping)
                {
                    subGraphRunContext.RenameContextVariable(kvp.Key, kvp.Value);
                }

                // Grab a hold of output model from this subgraph.
                if (entryPointNode.GetOutputVariableName("PredictorModel") is string mvn)
                {
                    predModelVar = new Var <IPredictorModel> {
                        VarName = mvn
                    };
                    foundOutput = true;
                }

                // Connect label remapper output to wherever training data was expected within the input graph.
                if (entryPointNode.GetInputVariable(nameof(input.TrainingData)) is VariableBinding vb)
                {
                    vb.Rename(remapperOutNode.OutputData.VarName);
                }

                // Change node to use the main context.
                entryPointNode.SetContext(node.Context);
            }

            // Move the variables from the subcontext to the main context.
            node.Context.AddContextVariables(subGraphRunContext);

            // Make sure we found the output variable for this model.
            // Use env.Except (not a bare System.Exception) for consistency with the
            // other error paths in this file.
            if (!foundOutput)
            {
                throw env.Except("Invalid input graph. Does not output predictor model.");
            }

            // Add training subgraph to our context.
            macroNodes.AddRange(subGraphNodes);

            return(new Tuple <List <EntryPointNode>, Var <IPredictorModel> >(macroNodes, predModelVar));
        }
        /// <summary>
        /// Expands the one-versus-all macro: instantiates the binary training subgraph once
        /// per class, then combines the resulting binary predictor models into a single
        /// multi-class predictor model via the OVA model combiner.
        /// </summary>
        /// <param name="env">Host environment used to validate nodes and report errors.</param>
        /// <param name="input">Macro arguments; <c>input.Nodes</c> must contain at least one node.</param>
        /// <param name="node">The entry-point node whose context receives the expanded subgraph.</param>
        /// <returns>The full list of entry-point nodes produced by the macro expansion.</returns>
        public static CommonOutputs.MacroOutput <Output> OneVersusAll(
            IHostEnvironment env,
            Arguments input,
            EntryPointNode node)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(input, nameof(input));
            env.Assert(input.Nodes.Count > 0);

            var numClasses    = GetNumberOfClasses(env, input, out var label);
            var predModelVars = new Var <IPredictorModel> [numClasses];

            // This will be the final resulting list of nodes that is returned from the macro.
            var macroNodes = new List <EntryPointNode>();

            // Instantiate the subgraph for each label value.
            for (int k = 0; k < numClasses; k++)
            {
                var result = ProcessClass(env, k, label, input, node);
                predModelVars[k] = result.Item2;
                macroNodes.AddRange(result.Item1);
            }

            // Use OVA model combiner to combine these models into one.
            // Takes in array of models that are binary predictor models and
            // produces single multiclass predictor model.
            var macroExperiment = new Experiment(env);
            var combinerNode    = new Legacy.Models.OvaModelCombiner
            {
                ModelArray   = new ArrayVar <IPredictorModel>(predModelVars),
                TrainingData = new Var <IDataView> {
                    VarName = node.GetInputVariable(nameof(input.TrainingData)).VariableName
                },
                Caching           = (Legacy.Models.CachingOptions)input.Caching,
                FeatureColumn     = input.FeatureColumn,
                NormalizeFeatures = (Legacy.Models.NormalizeOption)input.NormalizeFeatures,
                LabelColumn       = input.LabelColumn,
                UseProbabilities  = input.UseProbabilities
            };

            // Get output model variable.
            // Use env.Except (not a bare System.Exception) for consistency with the
            // other error paths in this file.
            if (!node.OutputMap.TryGetValue(nameof(Output.PredictorModel), out var outVariableName))
            {
                throw env.Except("Cannot find OVA model output.");
            }

            // Map macro's output back to OVA combiner (so OVA combiner will set the value on our output variable).
            var combinerOutput = new Legacy.Models.OvaModelCombiner.Output {
                PredictorModel = new Var <IPredictorModel> {
                    VarName = outVariableName
                }
            };

            // Add to experiment (must be done AFTER we assign variable name to output).
            macroExperiment.Add(combinerNode, combinerOutput);

            // Add nodes to main experiment.
            var nodes    = macroExperiment.GetNodes();
            var expNodes = EntryPointNode.ValidateNodes(env, node.Context, nodes);

            macroNodes.AddRange(expNodes);

            return(new CommonOutputs.MacroOutput <Output>()
            {
                Nodes = macroNodes
            });
        }