public void TestSimpleExperiment()
{
    var dataPath = GetDataPath(@"adult.tiny.with-schema.txt");
    using (var env = new TlcEnvironment())
    {
        // Build an entry-point graph: load the text file, then min-max normalize
        // the numeric features.
        var experiment = env.CreateExperiment();

        var importInput = new ML.Data.TextLoader(dataPath);
        var importOutput = experiment.Add(importInput);

        var normalizeInput = new ML.Transforms.MinMaxNormalizer
        {
            Data = importOutput.Data
        };
        normalizeInput.AddColumn("NumericFeatures");
        var normalizeOutput = experiment.Add(normalizeInput);

        experiment.Compile();
        experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
        experiment.Run();

        var data = experiment.GetOutput(normalizeOutput.OutputData);
        var schema = data.Schema;
        Assert.Equal(5, schema.ColumnCount);

        // The normalized column reuses the "NumericFeatures" name, so that name
        // appears twice in the output schema.
        var expected = new[] { "Label", "Workclass", "Categories", "NumericFeatures", "NumericFeatures" };
        for (int i = 0; i < schema.ColumnCount; i++)
        {
            Assert.Equal(expected[i], schema.GetColumnName(i));
        }
    }
}
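// ApplyStep below is the LearningPipeline-side glue for the text loader: it adds
// the loader node to the underlying entry-point Experiment and hands the node's
// output data variable to the next step, the same Data/Add pattern the
// experiments in this file build by hand.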
public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
{
    // The text loader must be the first step in the pipeline, so there is no
    // previous step to consume.
    Contracts.Assert(previousStep == null);

    ImportTextInput = new Data.TextLoader();
    ImportTextInput.CustomSchema = CustomSchema;
    var importOutput = experiment.Add(ImportTextInput);
    return new TextLoaderPipelineStep(importOutput.Data);
}
public void TestSimpleTrainExperiment()
{
    var dataPath = GetDataPath(@"adult.tiny.with-schema.txt");
    using (var env = new TlcEnvironment())
    {
        var experiment = env.CreateExperiment();

        var importInput = new ML.Data.TextLoader(dataPath);
        var importOutput = experiment.Add(importInput);

        // One-hot encode the categorical column, then assemble the "Features" vector
        // from the encoded and numeric columns.
        var catInput = new ML.Transforms.CategoricalOneHotVectorizer
        {
            Data = importOutput.Data
        };
        catInput.AddColumn("Categories");
        var catOutput = experiment.Add(catInput);

        var concatInput = new ML.Transforms.ColumnConcatenator
        {
            Data = catOutput.OutputData
        };
        concatInput.AddColumn("Features", "Categories", "NumericFeatures");
        var concatOutput = experiment.Add(concatInput);

        // Train a linear binary classifier with SDCA and a hinge loss.
        var sdcaInput = new ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier
        {
            TrainingData = concatOutput.OutputData,
            LossFunction = new HingeLossSDCAClassificationLossFunction() { Margin = 1.1f },
            NumThreads = 1,
            Shuffle = false
        };
        var sdcaOutput = experiment.Add(sdcaInput);

        // Score the training data with the learned model and evaluate the scores.
        var scoreInput = new ML.Transforms.DatasetScorer
        {
            Data = concatOutput.OutputData,
            PredictorModel = sdcaOutput.PredictorModel
        };
        var scoreOutput = experiment.Add(scoreInput);

        var evalInput = new ML.Models.BinaryClassificationEvaluator
        {
            Data = scoreOutput.ScoredData
        };
        var evalOutput = experiment.Add(evalInput);

        experiment.Compile();
        experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
        experiment.Run();

        // The overall-metrics view has exactly one row; read the AUC from it.
        var data = experiment.GetOutput(evalOutput.OverallMetrics);
        var schema = data.Schema;
        var b = schema.TryGetColumnIndex("AUC", out int aucCol);
        Assert.True(b);
        using (var cursor = data.GetRowCursor(col => col == aucCol))
        {
            var getter = cursor.GetGetter<double>(aucCol);
            b = cursor.MoveNext();
            Assert.True(b);
            double auc = 0;
            getter(ref auc);
            Assert.Equal(0.93, auc, 2);
            b = cursor.MoveNext();
            Assert.False(b);
        }
    }
}
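// The cross-validation tests below all follow the same macro pattern: a sub-graph
// describes the per-fold training pipeline, the macro's Inputs.Data is wired to the
// first transform of that sub-graph so each fold's training data can be fed in, and
// ManyHeterogeneousModelCombiner stitches the fold's transform models and predictor
// into the single PredictorModel the macro reports through Outputs.Model.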
public void TestCrossValidationMacroWithStratification()
{
    var dataPath = GetDataPath(@"breast-cancer.txt");
    using (var env = new TlcEnvironment())
    {
        var subGraph = env.CreateExperiment();

        var nop = new ML.Transforms.NoOperation();
        var nopOutput = subGraph.Add(nop);

        var learnerInput = new ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier
        {
            TrainingData = nopOutput.OutputData,
            NumThreads = 1
        };
        var learnerOutput = subGraph.Add(learnerInput);

        var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner
        {
            TransformModels = new ArrayVar<ITransformModel>(nopOutput.Model),
            PredictorModel = learnerOutput.PredictorModel
        };
        var modelCombineOutput = subGraph.Add(modelCombine);

        var experiment = env.CreateExperiment();

        var importInput = new ML.Data.TextLoader(dataPath);
        importInput.Arguments.Column = new ML.Data.TextLoaderColumn[]
        {
            new ML.Data.TextLoaderColumn { Name = "Label", Source = new[] { new ML.Data.TextLoaderRange(0) } },
            new ML.Data.TextLoaderColumn { Name = "Strat", Source = new[] { new ML.Data.TextLoaderRange(1) } },
            new ML.Data.TextLoaderColumn { Name = "Features", Source = new[] { new ML.Data.TextLoaderRange(2, 9) } }
        };
        var importOutput = experiment.Add(importInput);

        var crossValidate = new ML.Models.CrossValidator
        {
            Data = importOutput.Data,
            Nodes = subGraph,
            TransformModel = null,
            StratificationColumn = "Strat"
        };
        crossValidate.Inputs.Data = nop.Data;
        crossValidate.Outputs.Model = modelCombineOutput.PredictorModel;
        var crossValidateOutput = experiment.Add(crossValidate);

        experiment.Compile();
        experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
        experiment.Run();

        var data = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);
        var schema = data.Schema;
        var b = schema.TryGetColumnIndex("AUC", out int metricCol);
        Assert.True(b);
        using (var cursor = data.GetRowCursor(col => col == metricCol))
        {
            var getter = cursor.GetGetter<double>(metricCol);
            b = cursor.MoveNext();
            Assert.True(b);
            double val = 0;
            getter(ref val);
            Assert.Equal(0.99, val, 2);
            b = cursor.MoveNext();
            Assert.False(b);
        }
    }
}
public void TestCrossValidationMacro()
{
    var dataPath = GetDataPath(TestDatasets.winequality.trainFilename);
    using (var env = new TlcEnvironment())
    {
        var subGraph = env.CreateExperiment();

        var nop = new ML.Transforms.NoOperation();
        var nopOutput = subGraph.Add(nop);

        var learnerInput = new ML.Trainers.StochasticDualCoordinateAscentRegressor
        {
            TrainingData = nopOutput.OutputData,
            NumThreads = 1
        };
        var learnerOutput = subGraph.Add(learnerInput);

        var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner
        {
            TransformModels = new ArrayVar<ITransformModel>(nopOutput.Model),
            PredictorModel = learnerOutput.PredictorModel
        };
        var modelCombineOutput = subGraph.Add(modelCombine);

        var experiment = env.CreateExperiment();

        var importInput = new ML.Data.TextLoader(dataPath)
        {
            Arguments = new TextLoaderArguments
            {
                Separator = new[] { ';' },
                HasHeader = true,
                Column = new[]
                {
                    new TextLoaderColumn
                    {
                        Name = "Label",
                        Source = new[] { new TextLoaderRange(11) },
                        Type = DataKind.Num
                    },
                    new TextLoaderColumn
                    {
                        Name = "Features",
                        Source = new[] { new TextLoaderRange(0, 10) },
                        Type = DataKind.Num
                    }
                }
            }
        };
        var importOutput = experiment.Add(importInput);

        var crossValidate = new ML.Models.CrossValidator
        {
            Data = importOutput.Data,
            Nodes = subGraph,
            Kind = ML.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer,
            TransformModel = null
        };
        crossValidate.Inputs.Data = nop.Data;
        crossValidate.Outputs.Model = modelCombineOutput.PredictorModel;
        var crossValidateOutput = experiment.Add(crossValidate);

        experiment.Compile();
        experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
        experiment.Run();

        var data = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);
        var schema = data.Schema;
        var b = schema.TryGetColumnIndex("L1(avg)", out int metricCol);
        Assert.True(b);
        using (var cursor = data.GetRowCursor(col => col == metricCol))
        {
            var getter = cursor.GetGetter<double>(metricCol);
            b = cursor.MoveNext();
            Assert.True(b);
            double val = 0;
            getter(ref val);
            Assert.Equal(0.58, val, 1);
            b = cursor.MoveNext();
            Assert.False(b);
        }
    }
}
public void TestCrossValidationBinaryMacro()
{
    var dataPath = GetDataPath(@"adult.tiny.with-schema.txt");
    using (var env = new TlcEnvironment())
    {
        // Sub-graph: one-hot encode, concatenate features, and train logistic
        // regression. Note that catInput.Data is deliberately left unset; the
        // macro supplies each fold's data through Inputs.Data below.
        var subGraph = env.CreateExperiment();

        var catInput = new ML.Transforms.CategoricalOneHotVectorizer();
        catInput.AddColumn("Categories");
        var catOutput = subGraph.Add(catInput);

        var concatInput = new ML.Transforms.ColumnConcatenator
        {
            Data = catOutput.OutputData
        };
        concatInput.AddColumn("Features", "Categories", "NumericFeatures");
        var concatOutput = subGraph.Add(concatInput);

        var lrInput = new ML.Trainers.LogisticRegressionBinaryClassifier
        {
            TrainingData = concatOutput.OutputData,
            NumThreads = 1
        };
        var lrOutput = subGraph.Add(lrInput);

        var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner
        {
            TransformModels = new ArrayVar<ITransformModel>(catOutput.Model, concatOutput.Model),
            PredictorModel = lrOutput.PredictorModel
        };
        var modelCombineOutput = subGraph.Add(modelCombine);

        var experiment = env.CreateExperiment();

        var importInput = new ML.Data.TextLoader(dataPath);
        var importOutput = experiment.Add(importInput);

        var crossValidateBinary = new ML.Models.BinaryCrossValidator
        {
            Data = importOutput.Data,
            Nodes = subGraph
        };
        crossValidateBinary.Inputs.Data = catInput.Data;
        crossValidateBinary.Outputs.Model = modelCombineOutput.PredictorModel;
        var crossValidateOutput = experiment.Add(crossValidateBinary);

        experiment.Compile();
        experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
        experiment.Run();

        var data = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);
        var schema = data.Schema;
        var b = schema.TryGetColumnIndex("AUC", out int aucCol);
        Assert.True(b);
        using (var cursor = data.GetRowCursor(col => col == aucCol))
        {
            var getter = cursor.GetGetter<double>(aucCol);
            b = cursor.MoveNext();
            Assert.True(b);
            double auc = 0;
            getter(ref auc);
            Assert.Equal(0.87, auc, 1);
            b = cursor.MoveNext();
            Assert.False(b);
        }
    }
}
// Note that we don't filter out rows with parsing issues, since it's not acceptable to
// produce a different set of rows when subsetting columns. Any parsing errors need to be
// translated to NaN, not result in skipping the row. We should produce some diagnostics
// to alert the user to the issues. A sketch of a test for this contract appears after
// this constructor.
private Cursor(TextLoader parent, ParseStats stats, bool[] active, LineReader reader, int srcNeeded, int cthd)
    : base(parent._host)
{
    Ch.Assert(active == null || active.Length == parent._bindings.OutputSchema.Count);
    Ch.AssertValue(reader);
    Ch.AssertValue(stats);
    Ch.Assert(srcNeeded >= 0);
    Ch.Assert(cthd > 0);

    _total = -1;
    _batch = -1;
    _bindings = parent._bindings;
    _parser = parent._parser;
    _active = active;
    _reader = reader;
    _stats = stats;
    _srcNeeded = srcNeeded;

    // With more than one worker thread, rows are parsed in parallel; otherwise a
    // single-row set is parsed on the cursor's own thread.
    ParallelState state = null;
    if (cthd > 1)
    {
        state = new ParallelState(this, out _rows, cthd);
    }
    else
    {
        _rows = _parser.CreateRowSet(_stats, 1, _active);
    }

    try
    {
        // Create a getter delegate for each active output column.
        _getters = new Delegate[_bindings.Infos.Length];
        for (int i = 0; i < _getters.Length; i++)
        {
            if (_active != null && !_active[i])
            {
                continue;
            }
            ColumnPipe v = _rows.Pipes[i];
            Ch.Assert(v != null);
            _getters[i] = v.GetGetter();
            Ch.Assert(_getters[i] != null);
        }

        if (state != null)
        {
            // Ownership of the parallel state passes to the enumerator; clear the
            // local so the finally block doesn't dispose it on the success path.
            _ator = ParseParallel(state).GetEnumerator();
            state = null;
        }
        else
        {
            _ator = ParseSequential().GetEnumerator();
        }
    }
    finally
    {
        if (state != null)
        {
            state.Dispose();
        }
    }
}
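// The note above says parse failures must surface as NaN rather than dropped rows.
// A minimal sketch of a test for that contract, reusing the entry-point graph
// pattern from the tests above; the file name, its contents, and the single
// "Value" column (default numeric type assumed to be float) are hypothetical:
public void TestParseErrorBecomesNaN()
{
    // Write a tiny file whose middle row cannot be parsed as a number.
    var dataPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "parse-error.txt");
    System.IO.File.WriteAllLines(dataPath, new[] { "1.5", "not-a-number", "2.5" });
    using (var env = new TlcEnvironment())
    {
        var experiment = env.CreateExperiment();

        var importInput = new ML.Data.TextLoader(dataPath);
        importInput.Arguments.Column = new ML.Data.TextLoaderColumn[]
        {
            new ML.Data.TextLoaderColumn { Name = "Value", Source = new[] { new ML.Data.TextLoaderRange(0) } }
        };
        var importOutput = experiment.Add(importInput);

        experiment.Compile();
        experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
        experiment.Run();

        var data = experiment.GetOutput(importOutput.Data);
        var b = data.Schema.TryGetColumnIndex("Value", out int valueCol);
        Assert.True(b);
        using (var cursor = data.GetRowCursor(col => col == valueCol))
        {
            // All three rows must come back; the unparseable one as NaN.
            var getter = cursor.GetGetter<float>(valueCol);
            float value = 0;
            Assert.True(cursor.MoveNext());
            getter(ref value);
            Assert.Equal(1.5f, value);
            Assert.True(cursor.MoveNext());
            getter(ref value);
            Assert.True(float.IsNaN(value));
            Assert.True(cursor.MoveNext());
            getter(ref value);
            Assert.Equal(2.5f, value);
            Assert.False(cursor.MoveNext());
        }
    }
}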