        public void TestSimpleExperiment()
        {
            var dataPath = GetDataPath(@"adult.tiny.with-schema.txt");

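            // Create a local execution environment and an experiment graph to which loader, transform, and trainer nodes are added.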
            using (var env = new TlcEnvironment())
            {
                var experiment = env.CreateExperiment();

                var importInput  = new ML.Data.TextLoader(dataPath);
                var importOutput = experiment.Add(importInput);

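                // Min-max normalize the NumericFeatures column of the imported data.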
                var normalizeInput = new ML.Transforms.MinMaxNormalizer
                {
                    Data = importOutput.Data
                };
                normalizeInput.AddColumn("NumericFeatures");
                var normalizeOutput = experiment.Add(normalizeInput);

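                // Compile the graph, bind the input file, run it, and fetch the normalizer's output.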
                experiment.Compile();
                experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
                experiment.Run();
                var data = experiment.GetOutput(normalizeOutput.OutputData);

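                // The normalized column reuses the NumericFeatures name, so that name appears twice in the output schema (the original column is hidden but still enumerated).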
                var schema = data.Schema;
                Assert.Equal(5, schema.ColumnCount);
                var expected = new[] { "Label", "Workclass", "Categories", "NumericFeatures", "NumericFeatures" };
                for (int i = 0; i < schema.ColumnCount; i++)
                {
                    Assert.Equal(expected[i], schema.GetColumnName(i));
                }
            }
        }
Example #2
        public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
        {
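            // The text loader must be the first step of the pipeline, so no previous step is expected.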
            Contracts.Assert(previousStep == null);

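            // Configure the loader with the supplied custom schema, add it to the experiment graph, and wrap its output for the next pipeline step.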
            ImportTextInput = new Data.TextLoader();
            ImportTextInput.CustomSchema = CustomSchema;
            var importOutput = experiment.Add(ImportTextInput);

            return new TextLoaderPipelineStep(importOutput.Data);
        }
        public void TestSimpleTrainExperiment()
        {
            var dataPath = GetDataPath(@"adult.tiny.with-schema.txt");

            using (var env = new TlcEnvironment())
            {
                var experiment = env.CreateExperiment();

                var importInput  = new ML.Data.TextLoader(dataPath);
                var importOutput = experiment.Add(importInput);

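                // One-hot encode the Categories column.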
                var catInput = new ML.Transforms.CategoricalOneHotVectorizer
                {
                    Data = importOutput.Data
                };
                catInput.AddColumn("Categories");
                var catOutput = experiment.Add(catInput);

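                // Concatenate the encoded Categories column and NumericFeatures into a single Features vector.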
                var concatInput = new ML.Transforms.ColumnConcatenator
                {
                    Data = catOutput.OutputData
                };
                concatInput.AddColumn("Features", "Categories", "NumericFeatures");
                var concatOutput = experiment.Add(concatInput);

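                // Train a linear binary classifier with SDCA using a hinge loss; a single thread and no shuffling keep the result reproducible for the assertion below.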
                var sdcaInput = new ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier
                {
                    TrainingData = concatOutput.OutputData,
                    LossFunction = new HingeLossSDCAClassificationLossFunction()
                    {
                        Margin = 1.1f
                    },
                    NumThreads = 1,
                    Shuffle    = false
                };
                var sdcaOutput = experiment.Add(sdcaInput);

                var scoreInput = new ML.Transforms.DatasetScorer
                {
                    Data           = concatOutput.OutputData,
                    PredictorModel = sdcaOutput.PredictorModel
                };
                var scoreOutput = experiment.Add(scoreInput);

                var evalInput = new ML.Models.BinaryClassificationEvaluator
                {
                    Data = scoreOutput.ScoredData
                };
                var evalOutput = experiment.Add(evalInput);

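                // Compile and run the graph, then read the evaluator's overall metrics.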
                experiment.Compile();
                experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
                experiment.Run();
                var data = experiment.GetOutput(evalOutput.OverallMetrics);

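                // OverallMetrics contains a single row; read the AUC value from it.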
                var schema = data.Schema;
                var b      = schema.TryGetColumnIndex("AUC", out int aucCol);
                Assert.True(b);
                using (var cursor = data.GetRowCursor(col => col == aucCol))
                {
                    var getter = cursor.GetGetter<double>(aucCol);
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double auc = 0;
                    getter(ref auc);
                    Assert.Equal(0.93, auc, 2);
                    b = cursor.MoveNext();
                    Assert.False(b);
                }
            }
        }
        public void TestCrossValidationMacroWithStratification()
        {
            var dataPath = GetDataPath(@"breast-cancer.txt");

            using (var env = new TlcEnvironment())
            {
                var subGraph = env.CreateExperiment();

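                // Sub-graph executed on each fold: a no-op transform feeding an SDCA binary classifier.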
                var nop       = new ML.Transforms.NoOperation();
                var nopOutput = subGraph.Add(nop);

                var learnerInput = new ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier
                {
                    TrainingData = nopOutput.OutputData,
                    NumThreads   = 1
                };
                var learnerOutput = subGraph.Add(learnerInput);

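                // Combine the sub-graph's transform models and predictor into a single model the macro can emit.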
                var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner
                {
                    TransformModels = new ArrayVar<ITransformModel>(nopOutput.Model),
                    PredictorModel  = learnerOutput.PredictorModel
                };
                var modelCombineOutput = subGraph.Add(modelCombine);

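                // Outer graph: load the data, mapping column 0 to Label, column 1 to the stratification column, and columns 2-9 to Features.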
                var experiment  = env.CreateExperiment();
                var importInput = new ML.Data.TextLoader(dataPath);
                importInput.Arguments.Column = new ML.Data.TextLoaderColumn[]
                {
                    new ML.Data.TextLoaderColumn {
                        Name = "Label", Source = new[] { new ML.Data.TextLoaderRange(0) }
                    },
                    new ML.Data.TextLoaderColumn {
                        Name = "Strat", Source = new[] { new ML.Data.TextLoaderRange(1) }
                    },
                    new ML.Data.TextLoaderColumn {
                        Name = "Features", Source = new[] { new ML.Data.TextLoaderRange(2, 9) }
                    }
                };
                var importOutput = experiment.Add(importInput);

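                // Cross-validate the sub-graph, stratifying folds on the Strat column; Inputs/Outputs wire the macro to the sub-graph's entry node and its combined model output.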
                var crossValidate = new ML.Models.CrossValidator
                {
                    Data                 = importOutput.Data,
                    Nodes                = subGraph,
                    TransformModel       = null,
                    StratificationColumn = "Strat"
                };
                crossValidate.Inputs.Data   = nop.Data;
                crossValidate.Outputs.Model = modelCombineOutput.PredictorModel;
                var crossValidateOutput = experiment.Add(crossValidate);

                experiment.Compile();
                experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
                experiment.Run();
                var data = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);

                var schema = data.Schema;
                var b      = schema.TryGetColumnIndex("AUC", out int metricCol);
                Assert.True(b);
                using (var cursor = data.GetRowCursor(col => col == metricCol))
                {
                    var getter = cursor.GetGetter<double>(metricCol);
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double val = 0;
                    getter(ref val);
                    Assert.Equal(0.99, val, 2);
                    b = cursor.MoveNext();
                    Assert.False(b);
                }
            }
        }
        public void TestCrossValidationMacro()
        {
            var dataPath = GetDataPath(TestDatasets.winequality.trainFilename);

            using (var env = new TlcEnvironment())
            {
                var subGraph = env.CreateExperiment();

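                // Per-fold sub-graph: a no-op transform feeding an SDCA regressor.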
                var nop       = new ML.Transforms.NoOperation();
                var nopOutput = subGraph.Add(nop);

                var learnerInput = new ML.Trainers.StochasticDualCoordinateAscentRegressor
                {
                    TrainingData = nopOutput.OutputData,
                    NumThreads   = 1
                };
                var learnerOutput = subGraph.Add(learnerInput);

                var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner
                {
                    TransformModels = new ArrayVar<ITransformModel>(nopOutput.Model),
                    PredictorModel  = learnerOutput.PredictorModel
                };
                var modelCombineOutput = subGraph.Add(modelCombine);

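                // Load the wine quality data: ';'-separated with a header row, column 11 as the numeric label and columns 0-10 as the features.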
                var experiment  = env.CreateExperiment();
                var importInput = new ML.Data.TextLoader(dataPath)
                {
                    Arguments = new TextLoaderArguments
                    {
                        Separator = new[] { ';' },
                        HasHeader = true,
                        Column    = new[]
                        {
                            new TextLoaderColumn()
                            {
                                Name   = "Label",
                                Source = new [] { new TextLoaderRange(11) },
                                Type   = DataKind.Num
                            },

                            new TextLoaderColumn()
                            {
                                Name   = "Features",
                                Source = new [] { new TextLoaderRange(0, 10) },
                                Type   = DataKind.Num
                            }
                        }
                    }
                };
                var importOutput = experiment.Add(importInput);

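                // Cross-validate the sub-graph; Kind selects the regression trainer signature.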
                var crossValidate = new ML.Models.CrossValidator
                {
                    Data           = importOutput.Data,
                    Nodes          = subGraph,
                    Kind           = ML.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer,
                    TransformModel = null
                };
                crossValidate.Inputs.Data   = nop.Data;
                crossValidate.Outputs.Model = modelCombineOutput.PredictorModel;
                var crossValidateOutput = experiment.Add(crossValidate);

                experiment.Compile();
                experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
                experiment.Run();
                var data = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);

                var schema = data.Schema;
                var b      = schema.TryGetColumnIndex("L1(avg)", out int metricCol);
                Assert.True(b);
                using (var cursor = data.GetRowCursor(col => col == metricCol))
                {
                    var getter = cursor.GetGetter<double>(metricCol);
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double val = 0;
                    getter(ref val);
                    Assert.Equal(0.58, val, 1);
                    b = cursor.MoveNext();
                    Assert.False(b);
                }
            }
        }
        public void TestCrossValidationBinaryMacro()
        {
            var dataPath = GetDataPath(@"adult.tiny.with-schema.txt");

            using (var env = new TlcEnvironment())
            {
                var subGraph = env.CreateExperiment();

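                // Per-fold sub-graph: one-hot encode Categories, concatenate into Features, and train a logistic regression binary classifier.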
                var catInput = new ML.Transforms.CategoricalOneHotVectorizer();
                catInput.AddColumn("Categories");
                var catOutput = subGraph.Add(catInput);

                var concatInput = new ML.Transforms.ColumnConcatenator
                {
                    Data = catOutput.OutputData
                };
                concatInput.AddColumn("Features", "Categories", "NumericFeatures");
                var concatOutput = subGraph.Add(concatInput);

                var lrInput = new ML.Trainers.LogisticRegressionBinaryClassifier
                {
                    TrainingData = concatOutput.OutputData,
                    NumThreads   = 1
                };
                var lrOutput = subGraph.Add(lrInput);

                var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner
                {
                    TransformModels = new ArrayVar<ITransformModel>(catOutput.Model, concatOutput.Model),
                    PredictorModel  = lrOutput.PredictorModel
                };
                var modelCombineOutput = subGraph.Add(modelCombine);

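                // Outer graph: load the data and hand it, together with the sub-graph, to the binary cross-validation macro.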
                var experiment = env.CreateExperiment();

                var importInput  = new ML.Data.TextLoader(dataPath);
                var importOutput = experiment.Add(importInput);

                var crossValidateBinary = new ML.Models.BinaryCrossValidator
                {
                    Data  = importOutput.Data,
                    Nodes = subGraph
                };
                crossValidateBinary.Inputs.Data   = catInput.Data;
                crossValidateBinary.Outputs.Model = modelCombineOutput.PredictorModel;
                var crossValidateOutput = experiment.Add(crossValidateBinary);

                experiment.Compile();
                experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
                experiment.Run();
                var data = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);

                var schema = data.Schema;
                var b      = schema.TryGetColumnIndex("AUC", out int aucCol);
                Assert.True(b);
                using (var cursor = data.GetRowCursor(col => col == aucCol))
                {
                    var getter = cursor.GetGetter<double>(aucCol);
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double auc = 0;
                    getter(ref auc);
                    Assert.Equal(0.87, auc, 1);
                    b = cursor.MoveNext();
                    Assert.False(b);
                }
            }
        }
            // Note that we don't filter out rows with parsing issues since it's not acceptable to
            // produce a different set of rows when subsetting columns. Any parsing errors need to be
            // translated to NaN, not result in skipping the row. We should produce some diagnostics
            // to alert the user to the issues.
            private Cursor(TextLoader parent, ParseStats stats, bool[] active, LineReader reader, int srcNeeded, int cthd)
                : base(parent._host)
            {
                Ch.Assert(active == null || active.Length == parent._bindings.OutputSchema.Count);
                Ch.AssertValue(reader);
                Ch.AssertValue(stats);
                Ch.Assert(srcNeeded >= 0);
                Ch.Assert(cthd > 0);

                _total     = -1;
                _batch     = -1;
                _bindings  = parent._bindings;
                _parser    = parent._parser;
                _active    = active;
                _reader    = reader;
                _stats     = stats;
                _srcNeeded = srcNeeded;

                ParallelState state = null;

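                // Use a parallel parsing pipeline when more than one cursoring thread is requested; otherwise create a single row set and parse on the cursor's own thread.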
                if (cthd > 1)
                {
                    state = new ParallelState(this, out _rows, cthd);
                }
                else
                {
                    _rows = _parser.CreateRowSet(_stats, 1, _active);
                }

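                // Wire up getters for the active columns, then start the parallel or sequential parse enumerator; if setup throws before the parallel state is handed off, the finally block disposes it.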
                try
                {
                    _getters = new Delegate[_bindings.Infos.Length];
                    for (int i = 0; i < _getters.Length; i++)
                    {
                        if (_active != null && !_active[i])
                        {
                            continue;
                        }
                        ColumnPipe v = _rows.Pipes[i];
                        Ch.Assert(v != null);
                        _getters[i] = v.GetGetter();
                        Ch.Assert(_getters[i] != null);
                    }

                    if (state != null)
                    {
                        _ator = ParseParallel(state).GetEnumerator();
                        state = null;
                    }
                    else
                    {
                        _ator = ParseSequential().GetEnumerator();
                    }
                }
                finally
                {
                    if (state != null)
                    {
                        state.Dispose();
                    }
                }
            }