Example #1
0
        /// <summary>
        /// Builds a two-node experiment (a text loader feeding a min-max normalizer that is
        /// configured for the "NumericFeatures" column), runs it, and checks the number and
        /// names of the columns in the output schema.
        /// </summary>
        public void TestSimpleExperiment()
        {
            var inputFile = GetDataPath("adult.tiny.with-schema.txt");
            var mlContext = new MLContext();
            var graph = mlContext.CreateExperiment();

            // Node 1: the text loader.
            var loader = new Legacy.Data.TextLoader(inputFile);
            var loaderOutput = graph.Add(loader);

            // Node 2: the normalizer, consuming the loader's data.
            var normalizer = new Legacy.Transforms.MinMaxNormalizer { Data = loaderOutput.Data };
            normalizer.AddColumn("NumericFeatures");
            var normalizerOutput = graph.Add(normalizer);

            graph.Compile();
            graph.SetInput(loader.InputFile, new SimpleFileHandle(mlContext, inputFile, false, false));
            graph.Run();

            var resultSchema = graph.GetOutput(normalizerOutput.OutputData).Schema;

            string[] expectedNames = { "Label", "Workclass", "Categories", "NumericFeatures", "NumericFeatures" };
            Assert.Equal(5, resultSchema.Count);

            for (int column = 0; column < resultSchema.Count; column++)
            {
                Assert.Equal(expectedNames[column], resultSchema[column].Name);
            }
        }
        /// <summary>
        /// Performs cross validation on a pipeline.
        /// </summary>
        /// <typeparam name="TInput">Class type that represents input schema.</typeparam>
        /// <typeparam name="TOutput">Class type that represents prediction schema.</typeparam>
        /// <param name="pipeline">Machine learning pipeline; may contain a loader, transforms and at least one trainer.</param>
        /// <returns>List containing metrics and predictor model for each fold.</returns>
        public CrossValidationOutput <TInput, TOutput> CrossValidate <TInput, TOutput>(LearningPipeline pipeline)
            where TInput : class
            where TOutput : class, new()
        {
            var environment = new MLContext();
            {
                Experiment                     subGraph              = environment.CreateExperiment();
                ILearningPipelineStep          step                  = null;
                List <ILearningPipelineLoader> loaders               = new List <ILearningPipelineLoader>();
                List <Var <ITransformModel> >  transformModels       = new List <Var <ITransformModel> >();
                Var <ITransformModel>          lastTransformModel    = null;
                Var <IDataView>                firstPipelineDataStep = null;
                Var <IPredictorModel>          firstModel            = null;
                ILearningPipelineItem          firstTransform        = null;
                foreach (ILearningPipelineItem currentItem in pipeline)
                {
                    // Loaders are only collected here; they are not applied to the sub-graph.
                    if (currentItem is ILearningPipelineLoader loader)
                    {
                        loaders.Add(loader);
                        continue;
                    }

                    // Chain every non-loader item onto the previous step inside the sub-graph.
                    step = currentItem.ApplyStep(step, subGraph);

                    // Data steps that carry a transform model are recorded; the first such
                    // step's data variable and originating pipeline item are remembered.
                    if (step is ILearningPipelineDataStep dataStep && dataStep.Model != null)
                    {
                        transformModels.Add(dataStep.Model);
                        if (firstPipelineDataStep == null)
                        {
                            firstPipelineDataStep = dataStep.Data;
                            firstTransform        = currentItem;
                        }
                    }
Example #3
0
        /// <summary>
        /// Scores the given test data with a trained multi-class classification model and
        /// computes the quality metrics for it.
        /// </summary>
        /// <param name="model">
        /// The trained multi-class classification PredictionModel to be evaluated.
        /// </param>
        /// <param name="testData">
        /// The loader supplying the test data that is scored and evaluated.
        /// </param>
        /// <returns>
        /// The single ClassificationMetrics instance produced for the test data.
        /// </returns>
        public ClassificationMetrics Evaluate(PredictionModel model, ILearningPipelineLoader testData)
        {
            var env = new MLContext();
            env.CheckValue(model, nameof(model));
            env.CheckValue(testData, nameof(testData));

            Experiment graph = env.CreateExperiment();

            // The loader must yield a data step; anything else cannot be scored.
            ILearningPipelineStep loaderStep = testData.ApplyStep(previousStep: null, graph);
            var loadedData = loaderStep as ILearningPipelineDataStep;
            if (loadedData == null)
            {
                throw env.Except($"The {nameof(ILearningPipelineLoader)} did not return a {nameof(ILearningPipelineDataStep)} from ApplyStep.");
            }

            // Scoring node: applies the model (bound after compilation) to the test data.
            var scorer = new DatasetTransformScorer { Data = loadedData.Data };
            DatasetTransformScorer.Output scorerOutput = graph.Add(scorer);

            // This evaluator is itself the final node, fed with the scored data.
            Data = scorerOutput.ScoredData;
            Output evaluatorOutput = graph.Add(this);

            graph.Compile();
            graph.SetInput(scorer.TransformModel, model.PredictorModel);
            testData.SetInput(env, graph);
            graph.Run();

            IDataView overall = graph.GetOutput(evaluatorOutput.OverallMetrics);
            if (overall == null)
            {
                throw env.Except($"Could not find OverallMetrics in the results returned in {nameof(ClassificationEvaluator)} Evaluate.");
            }

            IDataView confusion = graph.GetOutput(evaluatorOutput.ConfusionMatrix);
            if (confusion == null)
            {
                throw env.Except($"Could not find ConfusionMatrix in the results returned in {nameof(ClassificationEvaluator)} Evaluate.");
            }

            var metricSets = ClassificationMetrics.FromMetrics(env, overall, confusion);
            if (metricSets.Count == 1)
            {
                return metricSets[0];
            }

            throw env.Except($"Exactly one metric set was expected but found {metricSets.Count} metrics");
        }
Example #4
0
        /// <summary>
        /// Loads QuotingData.csv with a header, comma separator and quoted strings allowed,
        /// then verifies that exactly three rows come back with the expected id (column 0)
        /// and text (column 1) values.
        /// </summary>
        public void CanSuccessfullyRetrieveQuotedData()
        {
            string dataPath = GetDataPath("QuotingData.csv");
            var loader = new Legacy.Data.TextLoader(dataPath).CreateFrom<QuoteInput>(useHeader: true, separator: ',', allowQuotedStrings: true, supportSparse: false);

            var environment = new MLContext();
            Experiment experiment = environment.CreateExperiment();

            // Direct cast instead of 'as': if the loader ever returns a different step type
            // the test fails right here with an InvalidCastException rather than with a
            // NullReferenceException when output.Data is read further down.
            var output = (Legacy.ILearningPipelineDataStep)loader.ApplyStep(null, experiment);

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor(a => true))
            {
                var idGetter = cursor.GetGetter<float>(0);
                var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(1);

                // Advances the cursor by one row and checks both column values.
                void AssertNextRow(float expectedId, string expectedText)
                {
                    Assert.True(cursor.MoveNext());

                    float id = 0;
                    idGetter(ref id);
                    Assert.Equal(expectedId, id);

                    var text = new ReadOnlyMemory<char>();
                    textGetter(ref text);
                    Assert.Equal(expectedText, text.ToString());
                }

                AssertNextRow(1, "This text contains comma, within quotes.");
                AssertNextRow(2, "This text contains extra punctuations and special characters.;*<>?!@#$%^&*()_+=-{}|[]:;'");
                AssertNextRow(3, "This text has no quotes");

                // No further rows are expected.
                Assert.False(cursor.MoveNext());
            }
        }
Example #5
0
        /// <summary>
        /// Applies a text loader step to a fresh experiment (without compiling or running it)
        /// and verifies that the resulting data step exposes a named data variable and no
        /// transform model.
        /// </summary>
        public void CanSuccessfullyApplyATransform()
        {
            var loader = new Legacy.Data.TextLoader("fakeFile.txt").CreateFrom<Input>();

            var environment = new MLContext();
            Experiment experiment = environment.CreateExperiment();

            // Direct cast instead of 'as': a wrong step type surfaces as an
            // InvalidCastException here instead of a NullReferenceException inside
            // the first assertion below.
            var output = (Legacy.ILearningPipelineDataStep)loader.ApplyStep(null, experiment);

            Assert.NotNull(output.Data);
            Assert.NotNull(output.Data.VarName);
            Assert.Null(output.Model);
        }
Example #6
0
        /// <summary>
        /// <a href="https://onnx.ai/">ONNX</a> is an intermediate representation format
        /// for machine learning models.
        /// </summary>
        /// <remarks>
        /// <para>It is used to make models portable such that you can
        /// train a model using a toolkit and run it in another toolkit's runtime, for example,
        /// you can create a model using ML.NET, export it to an ONNX-ML model file,
        /// then load and run that ONNX-ML model in Windows ML, on an UWP Windows 10 app.</para>
        ///
        /// <para>This API converts an ML.NET model to ONNX-ML format by inspecting the transform pipeline
        /// from the end, checking for components that know how to save themselves as ONNX.
        /// The first item in the transform pipeline that does not know how to save itself
        /// as ONNX, is considered the "input" to the ONNX pipeline. (Ideally this would be the
        /// original loader itself, but this may not be possible if the user used unsavable
        /// transforms in defining the pipe.) All the columns in the source that are a type the
        /// ONNX knows how to deal with will be tracked. Intermediate transformations of the
        /// data appearing as new columns will appear in the output block of the ONNX, with names
        /// derived from the corresponding column names. The ONNX JSON will be serialized to a
        /// path defined through the Json option.</para>
        ///
        /// <para>This API supports the following arguments:</para>
        /// <list type="bullet">
        /// <item><description><see cref="Onnx"/> indicates the file to write the ONNX protocol buffer file to. This is required.</description></item>
        /// <item><description><see cref="Json"/> indicates the file to write the JSON representation of the ONNX model. This is optional.</description></item>
        /// <item><description><see cref="Name"/> indicates the name property in the ONNX model. If left unspecified, it will
        /// be the extension-less name of the file specified in <see cref="Onnx"/>.</description></item>
        /// <item><description><see cref="Domain"/> indicates the domain name of the model. ONNX uses reverse domain name space indicators.
        /// For example com.microsoft.cognitiveservices. This is a required field.</description></item>
        /// <item><description><see cref="InputsToDrop"/> is a string array of input column names to omit from the input mapping.
        /// A common scenario might be to drop the label column, for instance, since it may not be practically
        /// useful for the pipeline. Note that any columns depending on these naturally cannot be saved.</description></item>
        /// <item><description><see cref="OutputsToDrop"/> is similar, except for the output schema. Note that the pipeline handler
        /// is currently not intelligent enough to drop intermediate calculations that produce this value: this will
        /// merely omit that value from the actual output.</description></item>
        /// </list>
        ///
        /// <para>Transforms that can be exported to ONNX</para>
        /// <list type="number">
        /// <item><description>Concat</description></item>
        /// <item><description>KeyToVector</description></item>
        /// <item><description>NAReplace</description></item>
        /// <item><description>Normalize</description></item>
        /// <item><description>Term</description></item>
        /// <item><description>Categorical</description></item>
        /// </list>
        ///
        /// <para>Learners that can be exported to ONNX</para>
        /// <list type="number">
        /// <item><description>FastTree</description></item>
        /// <item><description>LightGBM</description></item>
        /// <item><description>Logistic Regression</description></item>
        /// </list>
        ///
        /// <para>See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.Tests/OnnxTests.cs">OnnxTests.cs</a>
        /// for an example on how to train a model and then convert that model to ONNX.</para>
        /// </remarks>
        /// <param name="model">Model that needs to be converted to ONNX format.</param>
        public void Convert(PredictionModel model)
        {
            var env = new MLContext();
            env.CheckValue(model, nameof(model));

            // A single-node graph: this converter is the only entry point in it.
            Experiment graph = env.CreateExperiment();
            graph.Add(this);
            graph.Compile();

            // Bind the trained model to the converter's Model input, then execute.
            graph.SetInput(Model, model.PredictorModel);
            graph.Run();
        }
        /// <summary>
        /// Applies an in-memory collection data source to a fresh experiment (without
        /// compiling or running it) and verifies that the resulting data step exposes a
        /// named data variable and no transform model.
        /// </summary>
        public void CanSuccessfullyApplyATransform()
        {
            var rows = new List<Input>()
            {
                new Input { Number1 = 1, String1 = "1" }
            };
            var source = CollectionDataSource.Create(rows);

            var mlContext = new MLContext();
            Experiment graph = mlContext.CreateExperiment();

            var dataStep = (Legacy.ILearningPipelineDataStep)source.ApplyStep(null, graph);

            Assert.NotNull(dataStep.Data);
            Assert.NotNull(dataStep.Data.VarName);
            Assert.Null(dataStep.Model);
        }
Example #8
0
        /// <summary>
        /// Loads TrimData.csv with a header, comma separator and whitespace trimming enabled,
        /// then verifies that exactly two rows come back with the expected id (column 0)
        /// and text (column 1) values.
        /// </summary>
        public void CanSuccessfullyTrimSpaces()
        {
            string dataPath = GetDataPath("TrimData.csv");
            var loader = new Legacy.Data.TextLoader(dataPath).CreateFrom<QuoteInput>(useHeader: true, separator: ',', allowQuotedStrings: false, supportSparse: false, trimWhitespace: true);

            var environment = new MLContext();
            Experiment experiment = environment.CreateExperiment();

            // Direct cast instead of 'as': if the loader ever returns a different step type
            // the test fails right here with an InvalidCastException rather than with a
            // NullReferenceException when output.Data is read further down.
            var output = (Legacy.ILearningPipelineDataStep)loader.ApplyStep(null, experiment);

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor(a => true))
            {
                var idGetter = cursor.GetGetter<float>(0);
                var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(1);

                // Advances the cursor by one row and checks both column values.
                void AssertNextRow(float expectedId, string expectedText)
                {
                    Assert.True(cursor.MoveNext());

                    float id = 0;
                    idGetter(ref id);
                    Assert.Equal(expectedId, id);

                    var text = new ReadOnlyMemory<char>();
                    textGetter(ref text);
                    Assert.Equal(expectedText, text.ToString());
                }

                AssertNextRow(1, "There is a space at the end");
                AssertNextRow(2, "There is no space at the end");

                // No further rows are expected.
                Assert.False(cursor.MoveNext());
            }
        }
Example #9
0
        /// <summary>
        /// Runs a text loader followed by a TensorFlow scorer (frozen MNIST model) as an
        /// experiment and verifies that the output schema has three columns, the last of
        /// which is "Softmax" with a vector size of 10.
        /// </summary>
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // TensorFlow is 64-bit only
        public void TestTensorFlowEntryPoint()
        {
            var inputFile = GetDataPath("Train-Tiny-28x28.txt");
            var mlContext = new MLContext(42);
            var graph = mlContext.CreateExperiment();

            // Loader: column 0 is the label, columns 1..784 feed the model's placeholder.
            var loader = new Legacy.Data.TextLoader(inputFile);
            var labelColumn = new TextLoaderColumn { Name = "Label", Source = new[] { new TextLoaderRange(0) } };
            var pixelColumn = new TextLoaderColumn { Name = "Placeholder", Source = new[] { new TextLoaderRange(1, 784) } };
            loader.Arguments.Column = new TextLoaderColumn[] { labelColumn, pixelColumn };
            var loaderOutput = graph.Add(loader);

            var scorer = new Legacy.Transforms.TensorFlowScorer
            {
                Data = loaderOutput.Data,
                ModelLocation = "mnist_model/frozen_saved_model.pb",
                InputColumns = new[] { "Placeholder" },
                OutputColumns = new[] { "Softmax" },
            };
            var scorerOutput = graph.Add(scorer);

            graph.Compile();
            graph.SetInput(loader.InputFile, new SimpleFileHandle(mlContext, inputFile, false, false));
            graph.Run();

            var resultSchema = graph.GetOutput(scorerOutput.OutputData).Schema;

            Assert.Equal(3, resultSchema.Count);
            Assert.Equal("Softmax", resultSchema[2].Name);

            // A non-vector column type yields null here, which fails the comparison with 10.
            var softmaxType = resultSchema[2].Type as VectorType;
            Assert.Equal(10, softmaxType?.Size);
        }
        /// <summary>
        /// Train the model using the ML components in the pipeline.
        /// </summary>
        /// <typeparam name="TInput">Type of data instances the model will be trained on. It's a custom type defined by the user according to the structure of data.
        /// <para/>
        /// Please see https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet/get-started/windows for more details on input type.
        /// </typeparam>
        /// <typeparam name="TOutput">Output type. The prediction will be returned based on this type.
        /// Please see https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet/get-started/windows for more details on output type.
        /// </typeparam>
        /// <returns>PredictionModel object. This is the model object used for prediction on new instances. </returns>
        public PredictionModel <TInput, TOutput> Train <TInput, TOutput>()
            where TInput : class
            where TOutput : class, new()
        {
            var                            environment        = new MLContext(seed: _seed, conc: _conc);
            Experiment                     experiment         = environment.CreateExperiment();
            ILearningPipelineStep          step               = null;
            List <ILearningPipelineLoader> loaders            = new List <ILearningPipelineLoader>();
            List <Var <ITransformModel> >  transformModels    = new List <Var <ITransformModel> >();
            Var <ITransformModel>          lastTransformModel = null;

            foreach (ILearningPipelineItem currentItem in this)
            {
                // Loaders are remembered in addition to being applied as a step below.
                if (currentItem is ILearningPipelineLoader loader)
                {
                    loaders.Add(loader);
                }

                // Chain the item onto the previous step; data steps that carry a
                // transform model have that model recorded.
                step = currentItem.ApplyStep(step, experiment);
                if (step is ILearningPipelineDataStep dataStep && dataStep.Model != null)
                {
                    transformModels.Add(dataStep.Model);
                }
            /// <summary>
            /// Adds a one-versus-all node to <paramref name="experiment"/>. The wrapped
            /// trainer is placed in its own sub-graph, which is attached to the node only
            /// when a previous data step is supplied.
            /// </summary>
            /// <param name="previousStep">
            /// The preceding pipeline step. May be null; when non-null it must be an
            /// <see cref="ILearningPipelineDataStep"/>, whose data becomes the training data.
            /// </param>
            /// <param name="experiment">The experiment the one-versus-all node is added to.</param>
            /// <returns>A pipeline step wrapping the output of the added node.</returns>
            /// <exception cref="InvalidOperationException">
            /// <paramref name="previousStep"/> is non-null but is not a data step.
            /// </exception>
            public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
            {
                // The trainer lives in a separate graph created from its own context.
                var trainerGraph = new MLContext().CreateExperiment();
                trainerGraph.Add(_trainer);

                var ovaNode = new OneVersusAll();

                if (previousStep is ILearningPipelineDataStep inputStep)
                {
                    _data = inputStep.Data;
                    ovaNode.TrainingData = inputStep.Data;
                    ovaNode.UseProbabilities = _useProbabilities;
                    ovaNode.Nodes = trainerGraph;
                }
                else if (previousStep != null)
                {
                    throw new InvalidOperationException($"{ nameof(OneVersusAll)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
                }

                Output ovaOutput = experiment.Add(ovaNode);
                return new OvaPipelineStep(ovaOutput);
            }
Example #12
0
        /// <summary>
        /// Trains a one-versus-all model on iris.txt using an averaged perceptron binary
        /// classifier as the sub-graph learner, scores and evaluates it on the same data,
        /// and checks that the macro accuracy metric is 0.71 (to two decimal places) in
        /// the single metrics row.
        /// </summary>
        public void TestOvaMacroWithUncalibratedLearner()
        {
            var irisPath = GetDataPath(@"iris.txt");
            var mlContext = new MLContext(42);

            // Sub-graph holding the binary learner handed to the one-versus-all node.
            var learnerGraph = mlContext.CreateExperiment();
            var perceptron = new Legacy.Trainers.AveragedPerceptronBinaryClassifier { Shuffle = false };
            learnerGraph.Add(perceptron);

            // Main graph: loader -> one-versus-all -> scorer -> evaluator.
            var mainGraph = mlContext.CreateExperiment();
            var loader = new Legacy.Data.TextLoader(irisPath);
            var labelColumn = new TextLoaderColumn { Name = "Label", Source = new[] { new TextLoaderRange(0) } };
            var featuresColumn = new TextLoaderColumn { Name = "Features", Source = new[] { new TextLoaderRange(1, 4) } };
            loader.Arguments.Column = new TextLoaderColumn[] { labelColumn, featuresColumn };
            var loaderOutput = mainGraph.Add(loader);

            var ova = new Legacy.Models.OneVersusAll
            {
                TrainingData = loaderOutput.Data,
                Nodes = learnerGraph,
                UseProbabilities = true,
            };
            var ovaOutput = mainGraph.Add(ova);

            var scorer = new Legacy.Transforms.DatasetScorer
            {
                Data = loaderOutput.Data,
                PredictorModel = ovaOutput.PredictorModel
            };
            var scorerOutput = mainGraph.Add(scorer);

            var evaluator = new Legacy.Models.ClassificationEvaluator { Data = scorerOutput.ScoredData };
            var evaluatorOutput = mainGraph.Add(evaluator);

            mainGraph.Compile();
            mainGraph.SetInput(loader.InputFile, new SimpleFileHandle(mlContext, irisPath, false, false));
            mainGraph.Run();

            var metrics = mainGraph.GetOutput(evaluatorOutput.OverallMetrics);
            bool found = metrics.Schema.TryGetColumnIndex(MultiClassClassifierEvaluator.AccuracyMacro, out int accuracyColumn);
            Assert.True(found);

            using (var cursor = metrics.GetRowCursor(col => col == accuracyColumn))
            {
                var readAccuracy = cursor.GetGetter<double>(accuracyColumn);

                Assert.True(cursor.MoveNext());
                double accuracy = 0;
                readAccuracy(ref accuracy);
                Assert.Equal(0.71, accuracy, 2);

                // Exactly one metrics row is expected.
                Assert.False(cursor.MoveNext());
            }
        }
        /// <summary>
        /// Feeds a three-element in-memory collection through an experiment and verifies
        /// that the resulting data view yields exactly those three rows, in order, with the
        /// expected number (column 0) and string (column 1) values.
        /// </summary>
        public void CanSuccessfullyEnumerated()
        {
            var collection = CollectionDataSource.Create(new List<Input>()
            {
                new Input { Number1 = 1, String1 = "1" },
                new Input { Number1 = 2, String1 = "2" },
                new Input { Number1 = 3, String1 = "3" }
            });

            var environment = new MLContext();
            Experiment experiment = environment.CreateExperiment();

            // Direct cast instead of 'as': if the source ever returns a different step type
            // the test fails right here with an InvalidCastException rather than with a
            // NullReferenceException when output.Data is read further down.
            var output = (Legacy.ILearningPipelineDataStep)collection.ApplyStep(null, experiment);

            experiment.Compile();
            collection.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor(a => true))
            {
                var idGetter = cursor.GetGetter<float>(0);
                var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(1);

                // Advances the cursor by one row and checks both column values.
                void AssertNextRow(float expectedId, string expectedText)
                {
                    Assert.True(cursor.MoveNext());

                    float id = 0;
                    idGetter(ref id);
                    Assert.Equal(expectedId, id);

                    var text = new ReadOnlyMemory<char>();
                    textGetter(ref text);
                    Assert.Equal(expectedText, text.ToString());
                }

                AssertNextRow(1, "1");
                AssertNextRow(2, "2");
                AssertNextRow(3, "3");

                // No further rows are expected.
                Assert.False(cursor.MoveNext());
            }
        }
Example #14
0
        /// <summary>
        /// Builds and runs a full train/score/evaluate experiment on the adult.tiny data
        /// (one-hot encoding, feature concatenation, SDCA binary classifier with hinge
        /// loss) and checks that the reported AUC is 0.93 (to two decimal places) in the
        /// single metrics row.
        /// </summary>
        public void TestSimpleTrainExperiment()
        {
            var inputFile = GetDataPath("adult.tiny.with-schema.txt");
            var mlContext = new MLContext();
            var graph = mlContext.CreateExperiment();

            var loader = new Legacy.Data.TextLoader(inputFile);
            var loaderOutput = graph.Add(loader);

            // One-hot encode "Categories".
            var oneHot = new Legacy.Transforms.CategoricalOneHotVectorizer { Data = loaderOutput.Data };
            oneHot.AddColumn("Categories");
            var oneHotOutput = graph.Add(oneHot);

            // Concatenate "Categories" and "NumericFeatures" into "Features".
            var concat = new Legacy.Transforms.ColumnConcatenator { Data = oneHotOutput.OutputData };
            concat.AddColumn("Features", "Categories", "NumericFeatures");
            var featurized = graph.Add(concat);

            var trainer = new Legacy.Trainers.StochasticDualCoordinateAscentBinaryClassifier
            {
                TrainingData = featurized.OutputData,
                LossFunction = new HingeLossSDCAClassificationLossFunction { Margin = 1.1f },
                NumThreads = 1,
                Shuffle = false
            };
            var trainerOutput = graph.Add(trainer);

            // Score the same featurized data the model was trained on.
            var scorer = new Legacy.Transforms.DatasetScorer
            {
                Data = featurized.OutputData,
                PredictorModel = trainerOutput.PredictorModel
            };
            var scorerOutput = graph.Add(scorer);

            var evaluator = new Legacy.Models.BinaryClassificationEvaluator { Data = scorerOutput.ScoredData };
            var evaluatorOutput = graph.Add(evaluator);

            graph.Compile();
            graph.SetInput(loader.InputFile, new SimpleFileHandle(mlContext, inputFile, false, false));
            graph.Run();

            var metrics = graph.GetOutput(evaluatorOutput.OverallMetrics);
            bool found = metrics.Schema.TryGetColumnIndex("AUC", out int aucColumn);
            Assert.True(found);

            using (var cursor = metrics.GetRowCursor(col => col == aucColumn))
            {
                var readAuc = cursor.GetGetter<double>(aucColumn);

                Assert.True(cursor.MoveNext());
                double aucValue = 0;
                readAuc(ref aucValue);
                Assert.Equal(0.93, aucValue, 2);

                // Exactly one metrics row is expected.
                Assert.False(cursor.MoveNext());
            }
        }
Example #15
0
        public void TestCrossValidationMacroWithNonDefaultNames()
        {
            string dataPath = GetDataPath(@"adult.tiny.with-schema.txt");
            var    env      = new MLContext(42);
            var    subGraph = env.CreateExperiment();

            var textToKey = new Legacy.Transforms.TextToKeyConverter();

            textToKey.Column = new[] { new Legacy.Transforms.ValueToKeyMappingTransformerColumn()
                                       {
                                           Name = "Label1", Source = "Label"
                                       } };
            var textToKeyOutput = subGraph.Add(textToKey);

            var hash = new Legacy.Transforms.HashConverter();

            hash.Column = new[] { new Legacy.Transforms.HashJoiningTransformColumn()
                                  {
                                      Name = "GroupId1", Source = "Workclass"
                                  } };
            hash.Data = textToKeyOutput.OutputData;
            var hashOutput = subGraph.Add(hash);

            var learnerInput = new Legacy.Trainers.FastTreeRanker
            {
                TrainingData  = hashOutput.OutputData,
                NumThreads    = 1,
                LabelColumn   = "Label1",
                GroupIdColumn = "GroupId1"
            };
            var learnerOutput = subGraph.Add(learnerInput);

            var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar <TransformModel>(textToKeyOutput.Model, hashOutput.Model),
                PredictorModel  = learnerOutput.PredictorModel
            };
            var modelCombineOutput = subGraph.Add(modelCombine);

            var experiment  = env.CreateExperiment();
            var importInput = new Legacy.Data.TextLoader(dataPath);

            importInput.Arguments.HasHeader = true;
            importInput.Arguments.Column    = new TextLoaderColumn[]
            {
                new TextLoaderColumn {
                    Name = "Label", Source = new[] { new TextLoaderRange(0) }
                },
                new TextLoaderColumn {
                    Name = "Workclass", Source = new[] { new TextLoaderRange(1) }, Type = Legacy.Data.DataKind.Text
                },
                new TextLoaderColumn {
                    Name = "Features", Source = new[] { new TextLoaderRange(9, 14) }
                }
            };
            var importOutput = experiment.Add(importInput);

            var crossValidate = new Legacy.Models.CrossValidator
            {
                Data           = importOutput.Data,
                Nodes          = subGraph,
                TransformModel = null,
                LabelColumn    = "Label1",
                GroupColumn    = "GroupId1",
                NameColumn     = "Workclass",
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureRankerTrainer
            };

            crossValidate.Inputs.Data            = textToKey.Data;
            crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
            var crossValidateOutput = experiment.Add(crossValidate);

            experiment.Compile();
            experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
            experiment.Run();
            var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);

            var schema = data.Schema;
            var b      = schema.TryGetColumnIndex("NDCG", out int metricCol);

            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
            Assert.True(b);
            using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol))
            {
                var getter                 = cursor.GetGetter <VBuffer <double> >(metricCol);
                var foldGetter             = cursor.GetGetter <ReadOnlyMemory <char> >(foldCol);
                ReadOnlyMemory <char> fold = default;

                // Get the average.
                b = cursor.MoveNext();
                Assert.True(b);
                var avg = default(VBuffer <double>);
                getter(ref avg);
                foldGetter(ref fold);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));

                // Get the standard deviation.
                b = cursor.MoveNext();
                Assert.True(b);
                var stdev = default(VBuffer <double>);
                getter(ref stdev);
                foldGetter(ref fold);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
                var stdevValues = stdev.GetValues();
                Assert.Equal(2.462, stdevValues[0], 3);
                Assert.Equal(2.763, stdevValues[1], 3);
                Assert.Equal(3.273, stdevValues[2], 3);

                var sumBldr = new BufferBuilder <double>(R8Adder.Instance);
                sumBldr.Reset(avg.Length, true);
                var val = default(VBuffer <double>);
                for (int f = 0; f < 2; f++)
                {
                    b = cursor.MoveNext();
                    Assert.True(b);
                    getter(ref val);
                    foldGetter(ref fold);
                    sumBldr.AddFeatures(0, in val);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                }
                var sum = default(VBuffer <double>);
                sumBldr.GetResult(ref sum);

                var avgValues = avg.GetValues();
                var sumValues = sum.GetValues();
                for (int i = 0; i < avgValues.Length; i++)
                {
                    Assert.Equal(avgValues[i], sumValues[i] / 2);
                }
                b = cursor.MoveNext();
                Assert.False(b);
            }

            data = experiment.GetOutput(crossValidateOutput.PerInstanceMetrics);
            Assert.True(data.Schema.TryGetColumnIndex("Instance", out int nameCol));
            using (var cursor = data.GetRowCursor(col => col == nameCol))
            {
                var getter = cursor.GetGetter <ReadOnlyMemory <char> >(nameCol);
                while (cursor.MoveNext())
                {
                    ReadOnlyMemory <char> name = default;
                    getter(ref name);
                    Assert.Subset(new HashSet <string>()
                    {
                        "Private", "?", "Federal-gov"
                    }, new HashSet <string>()
                    {
                        name.ToString()
                    });
                    if (cursor.Position > 4)
                    {
                        break;
                    }
                }
            }
        }
Example #16
0
        /// <summary>
        /// Runs the cross-validation macro with a multi-class logistic regression sub-graph,
        /// stratified on a key column derived from "Label", and checks that the warnings output
        /// contains exactly the two expected warning messages, in order.
        /// </summary>
        public void TestCrossValidationMacroMultiClassWithWarnings()
        {
            var dataPath = GetDataPath(@"Train-Tiny-28x28.txt");
            var env      = new MLContext(42);

            // Sub-graph handed to the cross validator: a pass-through node feeding the learner.
            var subGraph       = env.CreateExperiment();
            var passThrough    = new Legacy.Transforms.NoOperation();
            var passThroughOut = subGraph.Add(passThrough);

            var trainer = new Legacy.Trainers.LogisticRegressionClassifier();
            trainer.TrainingData = passThroughOut.OutputData;
            trainer.NumThreads   = 1;
            var trainerOut = subGraph.Add(trainer);

            // Outer graph: load -> row range filter on "Label" -> map "Label" to the "Strat" key column.
            var experiment = env.CreateExperiment();
            var loader     = new Legacy.Data.TextLoader(dataPath);
            var loaderOut  = experiment.Add(loader);

            var rangeFilter = new Legacy.Transforms.RowRangeFilter
            {
                Data   = loaderOut.Data,
                Column = "Label",
                Min    = 0,
                Max    = 5
            };
            var rangeFilterOut = experiment.Add(rangeFilter);

            var toKey = new Legacy.Transforms.TextToKeyConverter
            {
                Column = new[]
                {
                    new Legacy.Transforms.ValueToKeyMappingTransformerColumn
                    {
                        Source = "Label",
                        Name   = "Strat",
                        Sort   = Legacy.Transforms.ValueToKeyMappingTransformerSortOrder.Value
                    }
                },
                Data = rangeFilterOut.OutputData
            };
            var toKeyOut = experiment.Add(toKey);

            // Wire the sub-graph's entry and exit points into the cross validator.
            var crossValidator = new Legacy.Models.CrossValidator();
            crossValidator.Data                   = toKeyOut.OutputData;
            crossValidator.Nodes                  = subGraph;
            crossValidator.Kind                   = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer;
            crossValidator.TransformModel         = null;
            crossValidator.StratificationColumn   = "Strat";
            crossValidator.Inputs.Data            = passThrough.Data;
            crossValidator.Outputs.PredictorModel = trainerOut.PredictorModel;
            var crossValidatorOut = experiment.Add(crossValidator);

            experiment.Compile();
            loader.SetInput(env, experiment);
            experiment.Run();

            var warnings = experiment.GetOutput(crossValidatorOut.Warnings);
            Assert.True(warnings.Schema.TryGetColumnIndex("WarningText", out int warningCol));

            using (var cursor = warnings.GetRowCursor(c => c == warningCol))
            {
                var readWarning           = cursor.GetGetter<ReadOnlyMemory<char>>(warningCol);
                ReadOnlyMemory<char> text = default;

                // First warning row.
                Assert.True(cursor.MoveNext());
                readWarning(ref text);
                Assert.Contains("test instances with class values not seen in the training set.", text.ToString());

                // Second warning row.
                Assert.True(cursor.MoveNext());
                readWarning(ref text);
                Assert.Contains("Detected columns of variable length: SortedScores, SortedClasses", text.ToString());

                // No further warnings are expected.
                Assert.False(cursor.MoveNext());
            }
        }
Example #17
0
        /// <summary>
        /// Runs the cross-validation macro with an SDCA binary classifier sub-graph, stratified on
        /// the "Strat" column, and verifies the layout of the overall metrics: an "Average" row, a
        /// "Standard Deviation" row, then one row per fold ("Fold 0", "Fold 1") whose AUC values
        /// average to the reported average.
        /// </summary>
        public void TestCrossValidationMacroWithStratification()
        {
            var dataPath = GetDataPath(@"breast-cancer.txt");
            var env      = new MLContext(42);

            // Sub-graph: pass-through node -> learner -> combined predictor model.
            var subGraph       = env.CreateExperiment();
            var passThrough    = new Legacy.Transforms.NoOperation();
            var passThroughOut = subGraph.Add(passThrough);

            var trainer = new Legacy.Trainers.StochasticDualCoordinateAscentBinaryClassifier();
            trainer.TrainingData = passThroughOut.OutputData;
            trainer.NumThreads   = 1;
            var trainerOut = subGraph.Add(trainer);

            var combiner = new Legacy.Transforms.ManyHeterogeneousModelCombiner();
            combiner.TransformModels = new ArrayVar<TransformModel>(passThroughOut.Model);
            combiner.PredictorModel  = trainerOut.PredictorModel;
            var combinerOut = subGraph.Add(combiner);

            // Outer graph: loader with an explicit Label / Strat / Features column layout.
            var experiment = env.CreateExperiment();
            var loader     = new Legacy.Data.TextLoader(dataPath);

            var labelColumn = new Legacy.Data.TextLoaderColumn
            {
                Name   = "Label",
                Source = new[] { new Legacy.Data.TextLoaderRange(0) }
            };
            var stratColumn = new Legacy.Data.TextLoaderColumn
            {
                Name   = "Strat",
                Source = new[] { new Legacy.Data.TextLoaderRange(1) }
            };
            var featuresColumn = new Legacy.Data.TextLoaderColumn
            {
                Name   = "Features",
                Source = new[] { new Legacy.Data.TextLoaderRange(2, 9) }
            };
            loader.Arguments.Column = new Legacy.Data.TextLoaderColumn[] { labelColumn, stratColumn, featuresColumn };
            var loaderOut = experiment.Add(loader);

            var crossValidator = new Legacy.Models.CrossValidator();
            crossValidator.Data                   = loaderOut.Data;
            crossValidator.Nodes                  = subGraph;
            crossValidator.TransformModel         = null;
            crossValidator.StratificationColumn   = "Strat";
            crossValidator.Inputs.Data            = passThrough.Data;
            crossValidator.Outputs.PredictorModel = combinerOut.PredictorModel;
            var crossValidatorOut = experiment.Add(crossValidator);

            experiment.Compile();
            experiment.SetInput(loader.InputFile, new SimpleFileHandle(env, dataPath, false, false));
            experiment.Run();

            var metrics = experiment.GetOutput(crossValidatorOut.OverallMetrics);
            Assert.True(metrics.Schema.TryGetColumnIndex("AUC", out int metricCol));
            Assert.True(metrics.Schema.TryGetColumnIndex("Fold Index", out int foldCol));

            using (var cursor = metrics.GetRowCursor(c => c == metricCol || c == foldCol))
            {
                var readMetric                = cursor.GetGetter<double>(metricCol);
                var readFold                  = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
                ReadOnlyMemory<char> foldName = default;

                // First row: the average.
                Assert.True(cursor.MoveNext());
                double average = 0;
                readMetric(ref average);
                readFold(ref foldName);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", foldName));

                // Second row: the standard deviation.
                Assert.True(cursor.MoveNext());
                double deviation = 0;
                readMetric(ref deviation);
                readFold(ref foldName);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", foldName));
                Assert.Equal(0.00488, deviation, 5);

                // Remaining rows: one per fold; their mean must equal the reported average.
                double total     = 0;
                double foldValue = 0;
                for (int foldIndex = 0; foldIndex < 2; foldIndex++)
                {
                    Assert.True(cursor.MoveNext());
                    readMetric(ref foldValue);
                    readFold(ref foldName);
                    total += foldValue;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + foldIndex, foldName));
                }
                Assert.Equal(average, total / 2);

                // Nothing may follow the per-fold rows.
                Assert.False(cursor.MoveNext());
            }
        }
Example #18
0
        /// <summary>
        /// Runs the cross-validation macro with a multi-class SDCA sub-graph and verifies three
        /// outputs: the overall metrics ("Average", "Standard Deviation", then "Fold 0"/"Fold 1"
        /// rows for micro-averaged accuracy), the confusion matrix (ten slot names "0".."9" and ten
        /// rows per fold), and an empty warnings view.
        /// </summary>
        public void TestCrossValidationMacroWithMultiClass()
        {
            var dataPath = GetDataPath(@"Train-Tiny-28x28.txt");
            var env      = new MLContext(42);

            // Sub-graph: pass-through node -> learner -> combined predictor model.
            var subGraph       = env.CreateExperiment();
            var passThrough    = new Legacy.Transforms.NoOperation();
            var passThroughOut = subGraph.Add(passThrough);

            var trainer = new Legacy.Trainers.StochasticDualCoordinateAscentClassifier();
            trainer.TrainingData = passThroughOut.OutputData;
            trainer.NumThreads   = 1;
            var trainerOut = subGraph.Add(trainer);

            var combiner = new Legacy.Transforms.ManyHeterogeneousModelCombiner();
            combiner.TransformModels = new ArrayVar<TransformModel>(passThroughOut.Model);
            combiner.PredictorModel  = trainerOut.PredictorModel;
            var combinerOut = subGraph.Add(combiner);

            // Outer graph: loader feeding the cross validator directly.
            var experiment = env.CreateExperiment();
            var loader     = new Legacy.Data.TextLoader(dataPath);
            var loaderOut  = experiment.Add(loader);

            var crossValidator = new Legacy.Models.CrossValidator();
            crossValidator.Data                   = loaderOut.Data;
            crossValidator.Nodes                  = subGraph;
            crossValidator.Kind                   = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer;
            crossValidator.TransformModel         = null;
            crossValidator.Inputs.Data            = passThrough.Data;
            crossValidator.Outputs.PredictorModel = combinerOut.PredictorModel;
            var crossValidatorOut = experiment.Add(crossValidator);

            experiment.Compile();
            loader.SetInput(env, experiment);
            experiment.Run();

            // ---- Overall metrics ----
            var metrics = experiment.GetOutput(crossValidatorOut.OverallMetrics);
            Assert.True(metrics.Schema.TryGetColumnIndex("Accuracy(micro-avg)", out int metricCol));
            Assert.True(metrics.Schema.TryGetColumnIndex("Fold Index", out int foldCol));

            using (var cursor = metrics.GetRowCursor(c => c == metricCol || c == foldCol))
            {
                var readMetric                = cursor.GetGetter<double>(metricCol);
                var readFold                  = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
                ReadOnlyMemory<char> foldName = default;

                // First row: the average.
                Assert.True(cursor.MoveNext());
                double average = 0;
                readMetric(ref average);
                readFold(ref foldName);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", foldName));

                // Second row: the standard deviation.
                Assert.True(cursor.MoveNext());
                double deviation = 0;
                readMetric(ref deviation);
                readFold(ref foldName);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", foldName));
                Assert.Equal(0.015, deviation, 3);

                // Remaining rows: one per fold; their mean must equal the reported average.
                double total     = 0;
                double foldValue = 0;
                for (int foldIndex = 0; foldIndex < 2; foldIndex++)
                {
                    Assert.True(cursor.MoveNext());
                    readMetric(ref foldValue);
                    readFold(ref foldName);
                    total += foldValue;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + foldIndex, foldName));
                }
                Assert.Equal(average, total / 2);
                Assert.False(cursor.MoveNext());
            }

            // ---- Confusion matrix ----
            var confusion = experiment.GetOutput(crossValidatorOut.ConfusionMatrix);
            Assert.True(confusion.Schema.TryGetColumnIndex("Count", out int countCol));
            Assert.True(confusion.Schema.TryGetColumnIndex("Fold Index", out int confusionFoldCol));

            // The "Count" column must carry a text vector of ten slot names.
            var slotNamesType    = confusion.Schema[countCol].Metadata.Schema[MetadataUtils.Kinds.SlotNames].Type;
            bool isTenTextVector = slotNamesType is VectorType vecType && vecType.ItemType is TextType && vecType.Size == 10;
            Assert.True(isTenTextVector);

            var slotNames = default(VBuffer<ReadOnlyMemory<char>>);
            confusion.Schema[countCol].GetSlotNames(ref slotNames);
            var slotNameValues = slotNames.GetValues();
            for (int slot = 0; slot < slotNameValues.Length; slot++)
            {
                // Slot names are expected to be the class indices rendered as text.
                Assert.True(ReadOnlyMemoryUtils.EqualsStr(slot.ToString(), slotNameValues[slot]));
            }

            using (var confusionCursor = confusion.GetRowCursor(c => true))
            {
                var readCounts    = confusionCursor.GetGetter<VBuffer<double>>(countCol);
                var readFoldIndex = confusionCursor.GetGetter<ReadOnlyMemory<char>>(confusionFoldCol);
                var counts        = default(VBuffer<double>);
                var foldIndexText = default(ReadOnlyMemory<char>);
                var expectedFold  = "Fold 0";
                int rowsInFold    = 0;

                while (confusionCursor.MoveNext())
                {
                    readCounts(ref counts);
                    readFoldIndex(ref foldIndexText);
                    rowsInFold++;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr(expectedFold, foldIndexText));

                    // After ten rows the remaining rows are expected to belong to the next fold.
                    if (rowsInFold == 10)
                    {
                        rowsInFold   = 0;
                        expectedFold = "Fold 1";
                    }
                }

                // The row count must be a whole multiple of ten.
                Assert.Equal(0, rowsInFold);
            }

            // ---- Warnings: none expected ----
            var warnings = experiment.GetOutput(crossValidatorOut.Warnings);
            using (var warningCursor = warnings.GetRowCursor(c => true))
            {
                Assert.False(warningCursor.MoveNext());
            }
        }
Example #19
0
        /// <summary>
        /// Loads "SparseData.txt" through the legacy text loader with sparse support enabled and
        /// verifies that the first five float columns of the resulting view contain exactly the
        /// three expected rows.
        /// </summary>
        public void CanSuccessfullyRetrieveSparseData()
        {
            string dataPath = GetDataPath("SparseData.txt");
            var    loader   = new Legacy.Data.TextLoader(dataPath).CreateFrom<SparseInput>(useHeader: true, allowQuotedStrings: false, supportSparse: true);

            var        environment = new MLContext();
            Experiment experiment  = environment.CreateExperiment();

            // The 'as' cast yields null when the step is not a data step; assert explicitly so the
            // failure is reported here instead of as a NullReferenceException on output.Data below.
            Legacy.ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as Legacy.ILearningPipelineDataStep;
            Assert.NotNull(output);

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            // Expected contents, one array per row, in cursor order.
            var expectedRows = new[]
            {
                new float[] { 1, 2, 3, 4, 5 },
                new float[] { 0, 0, 0, 4, 5 },
                new float[] { 0, 2, 0, 0, 0 }
            };

            using (var cursor = data.GetRowCursor((a => true)))
            {
                var getters = new ValueGetter<float>[] {
                    cursor.GetGetter<float>(0),
                    cursor.GetGetter<float>(1),
                    cursor.GetGetter<float>(2),
                    cursor.GetGetter<float>(3),
                    cursor.GetGetter<float>(4)
                };

                foreach (float[] targets in expectedRows)
                {
                    Assert.True(cursor.MoveNext());

                    for (int i = 0; i < getters.Length; i++)
                    {
                        float value = 0;
                        getters[i](ref value);
                        Assert.Equal(targets[i], value);
                    }
                }

                // The view must contain no rows beyond the expected ones.
                Assert.False(cursor.MoveNext());
            }
        }
Example #20
0
        /// <summary>
        /// Runs the cross-validation macro with a weighted Poisson regression sub-graph and verifies
        /// the overall metrics layout: for each of unweighted then weighted, an "Average" row
        /// followed by a "Standard Deviation" row; then, for each of "Fold 0" and "Fold 1", an
        /// unweighted row followed by a weighted row. The per-fold "L1(avg)" values must average to
        /// the reported averages.
        /// </summary>
        [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline
        public void TestCrossValidationMacro()
        {
            var dataPath = GetDataPath(TestDatasets.generatedRegressionDatasetmacro.trainFilename);
            var env      = new MLContext(42);
            var subGraph = env.CreateExperiment();

            // Sub-graph: pass-through -> generated "Weight1" column -> learner -> combined model.
            var nop       = new Legacy.Transforms.NoOperation();
            var nopOutput = subGraph.Add(nop);

            var generate = new Legacy.Transforms.RandomNumberGenerator();

            generate.Column = new[] { new Legacy.Transforms.GenerateNumberTransformColumn()
                                      {
                                          Name = "Weight1"
                                      } };
            generate.Data = nopOutput.OutputData;
            var generateOutput = subGraph.Add(generate);

            var learnerInput = new Legacy.Trainers.PoissonRegressor
            {
                TrainingData = generateOutput.OutputData,
                NumThreads   = 1,
                WeightColumn = "Weight1"
            };
            var learnerOutput = subGraph.Add(learnerInput);

            var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar <TransformModel>(nopOutput.Model, generateOutput.Model),
                PredictorModel  = learnerOutput.PredictorModel
            };
            var modelCombineOutput = subGraph.Add(modelCombine);

            // Outer graph: ';'-separated file with a header; column 11 is the label and
            // columns 0-10 are the features.
            var experiment  = env.CreateExperiment();
            var importInput = new Legacy.Data.TextLoader(dataPath)
            {
                Arguments = new Legacy.Data.TextLoaderArguments
                {
                    Separator = new[] { ';' },
                    HasHeader = true,
                    Column    = new[]
                    {
                        new TextLoaderColumn()
                        {
                            Name   = "Label",
                            Source = new [] { new TextLoaderRange(11) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Features",
                            Source = new [] { new TextLoaderRange(0, 10) },
                            Type   = Legacy.Data.DataKind.Num
                        }
                    }
                }
            };
            var importOutput = experiment.Add(importInput);

            var crossValidate = new Legacy.Models.CrossValidator
            {
                Data           = importOutput.Data,
                Nodes          = subGraph,
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer,
                TransformModel = null,
                WeightColumn   = "Weight1"
            };

            crossValidate.Inputs.Data            = nop.Data;
            crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
            var crossValidateOutput = experiment.Add(crossValidate);

            experiment.Compile();
            importInput.SetInput(env, experiment);
            experiment.Run();
            var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);

            var schema = data.Schema;
            var b      = schema.TryGetColumnIndex("L1(avg)", out int metricCol);

            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
            Assert.True(b);
            b = schema.TryGetColumnIndex("IsWeighted", out int isWeightedCol);
            // Fail here if the column is missing, rather than later with an unusable column index.
            Assert.True(b);
            using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol || col == isWeightedCol))
            {
                var getter                 = cursor.GetGetter <double>(metricCol);
                var foldGetter             = cursor.GetGetter <ReadOnlyMemory <char> >(foldCol);
                ReadOnlyMemory <char> fold = default;
                var    isWeightedGetter    = cursor.GetGetter <bool>(isWeightedCol);
                bool   isWeighted          = default;
                double avg                 = 0;
                double weightedAvg         = 0;

                // Summary rows: w == 0 is the unweighted pair, w == 1 the weighted pair.
                for (int w = 0; w < 2; w++)
                {
                    // Get the average.
                    b = cursor.MoveNext();
                    Assert.True(b);
                    if (w == 1)
                    {
                        getter(ref weightedAvg);
                    }
                    else
                    {
                        getter(ref avg);
                    }
                    foldGetter(ref fold);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
                    isWeightedGetter(ref isWeighted);
                    Assert.True(isWeighted == (w == 1));

                    // Get the standard deviation.
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double stdev = 0;
                    getter(ref stdev);
                    foldGetter(ref fold);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
                    if (w == 1)
                    {
                        Assert.Equal(1.585, stdev, 3);
                    }
                    else
                    {
                        Assert.Equal(1.39, stdev, 2);
                    }
                    isWeightedGetter(ref isWeighted);
                    Assert.True(isWeighted == (w == 1));
                }

                // Per-fold rows: each fold contributes an unweighted row, then a weighted row.
                double sum         = 0;
                double weightedSum = 0;
                for (int f = 0; f < 2; f++)
                {
                    for (int w = 0; w < 2; w++)
                    {
                        b = cursor.MoveNext();
                        Assert.True(b);
                        double val = 0;
                        getter(ref val);
                        foldGetter(ref fold);
                        if (w == 1)
                        {
                            weightedSum += val;
                        }
                        else
                        {
                            sum += val;
                        }
                        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                        isWeightedGetter(ref isWeighted);
                        Assert.True(isWeighted == (w == 1));
                    }
                }

                // The reported averages must equal the mean of the two per-fold values.
                Assert.Equal(weightedAvg, weightedSum / 2);
                Assert.Equal(avg, sum / 2);

                // No rows may follow the per-fold rows.
                b = cursor.MoveNext();
                Assert.False(b);
            }
        }
Example #21
0
        /// <summary>
        /// Runs the binary cross-validation macro with a one-hot / concatenate / logistic regression
        /// sub-graph and checks that the first overall-metrics view holds a single row whose AUC is
        /// 0.87 to one decimal place.
        /// </summary>
        public void TestCrossValidationBinaryMacro()
        {
            var dataPath = GetDataPath("adult.tiny.with-schema.txt");
            var env      = new MLContext();

            // Sub-graph: one-hot encode "Categories", concatenate into "Features", then train.
            var subGraph = env.CreateExperiment();

            var oneHot = new Legacy.Transforms.CategoricalOneHotVectorizer();
            oneHot.AddColumn("Categories");
            var oneHotOut = subGraph.Add(oneHot);

            var concat = new Legacy.Transforms.ColumnConcatenator();
            concat.Data = oneHotOut.OutputData;
            concat.AddColumn("Features", "Categories", "NumericFeatures");
            var concatOut = subGraph.Add(concat);

            var trainer = new Legacy.Trainers.LogisticRegressionBinaryClassifier();
            trainer.TrainingData = concatOut.OutputData;
            trainer.NumThreads   = 1;
            var trainerOut = subGraph.Add(trainer);

            var combiner = new Legacy.Transforms.ManyHeterogeneousModelCombiner();
            combiner.TransformModels = new ArrayVar<TransformModel>(oneHotOut.Model, concatOut.Model);
            combiner.PredictorModel  = trainerOut.PredictorModel;
            var combinerOut = subGraph.Add(combiner);

            // Outer graph: loader feeding the binary cross validator.
            var experiment = env.CreateExperiment();
            var loader     = new Legacy.Data.TextLoader(dataPath);
            var loaderOut  = experiment.Add(loader);

            var binaryCrossValidator = new Legacy.Models.BinaryCrossValidator();
            binaryCrossValidator.Data          = loaderOut.Data;
            binaryCrossValidator.Nodes         = subGraph;
            binaryCrossValidator.Inputs.Data   = oneHot.Data;
            binaryCrossValidator.Outputs.Model = combinerOut.PredictorModel;
            var binaryCrossValidatorOut = experiment.Add(binaryCrossValidator);

            experiment.Compile();
            loader.SetInput(env, experiment);
            experiment.Run();

            var metrics = experiment.GetOutput(binaryCrossValidatorOut.OverallMetrics[0]);
            Assert.True(metrics.Schema.TryGetColumnIndex("AUC", out int aucCol));

            using (var cursor = metrics.GetRowCursor(c => c == aucCol))
            {
                var readAuc = cursor.GetGetter<double>(aucCol);

                // Exactly one row is expected.
                Assert.True(cursor.MoveNext());
                double aucValue = 0;
                readAuc(ref aucValue);
                Assert.Equal(0.87, aucValue, 1);
                Assert.False(cursor.MoveNext());
            }
        }
Example #22
0
        private static TransformModel CreateKcHousePricePredictorModel(string dataPath)
        {
            Experiment experiment = s_environment.CreateExperiment();
            var        importData = new Legacy.Data.TextLoader(dataPath)
            {
                Arguments = new TextLoaderArguments
                {
                    Separator = new[] { ',' },
                    HasHeader = true,
                    Column    = new[]
                    {
                        new TextLoaderColumn()
                        {
                            Name   = "Id",
                            Source = new [] { new TextLoaderRange(0) },
                            Type   = Legacy.Data.DataKind.Text
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Date",
                            Source = new [] { new TextLoaderRange(1) },
                            Type   = Legacy.Data.DataKind.Text
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Label",
                            Source = new [] { new TextLoaderRange(2) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Bedrooms",
                            Source = new [] { new TextLoaderRange(3) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Bathrooms",
                            Source = new [] { new TextLoaderRange(4) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLiving",
                            Source = new [] { new TextLoaderRange(5) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLot",
                            Source = new [] { new TextLoaderRange(6) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Floors",
                            Source = new [] { new TextLoaderRange(7) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Waterfront",
                            Source = new [] { new TextLoaderRange(8) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "View",
                            Source = new [] { new TextLoaderRange(9) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Condition",
                            Source = new [] { new TextLoaderRange(10) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Grade",
                            Source = new [] { new TextLoaderRange(11) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftAbove",
                            Source = new [] { new TextLoaderRange(12) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftBasement",
                            Source = new [] { new TextLoaderRange(13) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "YearBuilt",
                            Source = new [] { new TextLoaderRange(14) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "YearRenovated",
                            Source = new [] { new TextLoaderRange(15) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Zipcode",
                            Source = new [] { new TextLoaderRange(16) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Lat",
                            Source = new [] { new TextLoaderRange(17) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Long",
                            Source = new [] { new TextLoaderRange(18) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLiving15",
                            Source = new [] { new TextLoaderRange(19) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "SqftLot15",
                            Source = new [] { new TextLoaderRange(20) },
                            Type   = Legacy.Data.DataKind.Num
                        },
                    }
                }

                // Commented-out alternative, kept for reference:
                //   new Data.CustomTextLoader();
                //   importData.CustomSchema = dataSchema;
            };

            Legacy.Data.TextLoader.Output imported = experiment.Add(importData);
            var numericalConcatenate = new Legacy.Transforms.ColumnConcatenator();

            numericalConcatenate.Data = imported.Data;
            numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15");
            Legacy.Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate);

            var categoryConcatenate = new Legacy.Transforms.ColumnConcatenator();

            categoryConcatenate.Data = numericalConcatenated.OutputData;
            categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode");
            Legacy.Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate);

            var categorize = new Legacy.Transforms.CategoricalOneHotVectorizer();

            categorize.AddColumn("CategoryFeatures");
            categorize.Data = categoryConcatenated.OutputData;
            Legacy.Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize);

            var featuresConcatenate = new Legacy.Transforms.ColumnConcatenator();

            featuresConcatenate.Data = categorized.OutputData;
            featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures");
            Legacy.Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate);

            var learner = new Legacy.Trainers.StochasticDualCoordinateAscentRegressor();

            learner.TrainingData = featuresConcatenated.OutputData;
            learner.NumThreads   = 1;
            Legacy.Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner);

            var combineModels = new Legacy.Transforms.ManyHeterogeneousModelCombiner();

            combineModels.TransformModels = new ArrayVar <TransformModel>(numericalConcatenated.Model, categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model);
            combineModels.PredictorModel  = learnerOutput.PredictorModel;
            Legacy.Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels);

            var scorer = new Legacy.Transforms.Scorer
            {
                PredictorModel = combinedModels.PredictorModel
            };

            var scorerOutput = experiment.Add(scorer);

            experiment.Compile();
            experiment.SetInput(importData.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false));
            experiment.Run();

            return(experiment.GetOutput(scorerOutput.ScoringTransform));
        }