/// <summary>
/// Loads QuotingData.csv through the legacy TextLoader with quoted strings allowed
/// and checks the ID and text of each of the three expected rows.
/// </summary>
public void CanSuccessfullyRetrieveQuotedData()
{
    string csvPath = GetDataPath("QuotingData.csv");
    var textLoader = new Legacy.Data.TextLoader(csvPath).CreateFrom<QuoteInput>(
        useHeader: true,
        separator: ',',
        allowQuotedStrings: true,
        supportSparse: false);

    var mlContext = new MLContext();
    Experiment graph = mlContext.CreateExperiment();
    Legacy.ILearningPipelineDataStep loadStep =
        textLoader.ApplyStep(null, graph) as Legacy.ILearningPipelineDataStep;

    graph.Compile();
    textLoader.SetInput(mlContext, graph);
    graph.Run();

    IDataView view = graph.GetOutput(loadStep.Data);
    Assert.NotNull(view);

    // Expected content, row by row; the cursor must yield exactly these rows.
    float[] expectedIds = { 1, 2, 3 };
    string[] expectedTexts =
    {
        "This text contains comma, within quotes.",
        "This text contains extra punctuations and special characters.;*<>?!@#$%^&*()_+=-{}|[]:;'",
        "This text has no quotes"
    };

    using (var rows = view.GetRowCursor(column => true))
    {
        var readId = rows.GetGetter<float>(0);
        var readText = rows.GetGetter<ReadOnlyMemory<char>>(1);

        for (int row = 0; row < expectedIds.Length; row++)
        {
            Assert.True(rows.MoveNext());

            float id = 0;
            readId(ref id);
            Assert.Equal(expectedIds[row], id);

            var text = new ReadOnlyMemory<char>();
            readText(ref text);
            Assert.Equal(expectedTexts[row], text.ToString());
        }

        // No rows beyond the expected ones.
        Assert.False(rows.MoveNext());
    }
}
/// <summary>
/// Loads TrimData.csv through the legacy TextLoader with <c>trimWhitespace</c> enabled
/// and checks that both rows come back with the expected ID and text.
/// </summary>
public void CanSuccessfullyTrimSpaces()
{
    string csvPath = GetDataPath("TrimData.csv");
    var textLoader = new Legacy.Data.TextLoader(csvPath).CreateFrom<QuoteInput>(
        useHeader: true,
        separator: ',',
        allowQuotedStrings: false,
        supportSparse: false,
        trimWhitespace: true);

    var mlContext = new MLContext();
    Experiment graph = mlContext.CreateExperiment();
    Legacy.ILearningPipelineDataStep loadStep =
        textLoader.ApplyStep(null, graph) as Legacy.ILearningPipelineDataStep;

    graph.Compile();
    textLoader.SetInput(mlContext, graph);
    graph.Run();

    IDataView view = graph.GetOutput(loadStep.Data);
    Assert.NotNull(view);

    // Expected content after loading; the texts carry no trailing space.
    float[] expectedIds = { 1, 2 };
    string[] expectedTexts =
    {
        "There is a space at the end",
        "There is no space at the end"
    };

    using (var rows = view.GetRowCursor(column => true))
    {
        var readId = rows.GetGetter<float>(0);
        var readText = rows.GetGetter<ReadOnlyMemory<char>>(1);

        for (int row = 0; row < expectedIds.Length; row++)
        {
            Assert.True(rows.MoveNext());

            float id = 0;
            readId(ref id);
            Assert.Equal(expectedIds[row], id);

            var text = new ReadOnlyMemory<char>();
            readText(ref text);
            Assert.Equal(expectedTexts[row], text.ToString());
        }

        // Exactly two data rows are expected.
        Assert.False(rows.MoveNext());
    }
}
/// <summary>
/// Runs the cross-validation macro with a multi-class logistic regression learner on
/// label-filtered, stratified data and checks that exactly two warnings are reported,
/// in a fixed order.
/// </summary>
public void TestCrossValidationMacroMultiClassWithWarnings()
{
    var trainPath = GetDataPath(@"Train-Tiny-28x28.txt");
    var env = new MLContext(42);

    // Sub-graph executed per fold: a no-op entry node feeding the learner.
    var subGraph = env.CreateExperiment();
    var nop = new Legacy.Transforms.NoOperation();
    var nopOutput = subGraph.Add(nop);
    var learnerOutput = subGraph.Add(new Legacy.Trainers.LogisticRegressionClassifier
    {
        TrainingData = nopOutput.OutputData,
        NumThreads = 1
    });

    // Outer graph: load -> keep rows with Label in [0, 5] -> key-encode Label as "Strat".
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(trainPath);
    var importOutput = experiment.Add(importInput);

    var rangeFilter = new Legacy.Transforms.RowRangeFilter
    {
        Data = importOutput.Data,
        Column = "Label",
        Min = 0,
        Max = 5
    };
    var rangeFilterOutput = experiment.Add(rangeFilter);

    var toKey = new Legacy.Transforms.TextToKeyConverter
    {
        Column = new[]
        {
            new Legacy.Transforms.ValueToKeyMappingTransformerColumn()
            {
                Source = "Label",
                Name = "Strat",
                Sort = Legacy.Transforms.ValueToKeyMappingTransformerSortOrder.Value
            }
        },
        Data = rangeFilterOutput.OutputData
    };
    var toKeyOutput = experiment.Add(toKey);

    var crossValidate = new Legacy.Models.CrossValidator
    {
        Data = toKeyOutput.OutputData,
        Nodes = subGraph,
        Kind = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer,
        TransformModel = null,
        StratificationColumn = "Strat"
    };
    crossValidate.Inputs.Data = nop.Data;
    crossValidate.Outputs.PredictorModel = learnerOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);

    experiment.Compile();
    importInput.SetInput(env, experiment);
    experiment.Run();

    var warnings = experiment.GetOutput(crossValidateOutput.Warnings);
    Assert.True(warnings.Schema.TryGetColumnIndex("WarningText", out int warningCol));

    // Fragments that must appear in the warnings, in this order.
    string[] expectedFragments =
    {
        "test instances with class values not seen in the training set.",
        "Detected columns of variable length: SortedScores, SortedClasses"
    };

    using (var rows = warnings.GetRowCursor(col => col == warningCol))
    {
        var readWarning = rows.GetGetter<ReadOnlyMemory<char>>(warningCol);
        var warningText = default(ReadOnlyMemory<char>);

        foreach (string fragment in expectedFragments)
        {
            Assert.True(rows.MoveNext());
            readWarning(ref warningText);
            Assert.Contains(fragment, warningText.ToString());
        }

        // No further warnings are expected.
        Assert.False(rows.MoveNext());
    }
}
/// <summary>
/// Runs the cross-validation macro with a multi-class SDCA learner and verifies the
/// overall metrics rows (average, standard deviation, two folds), the confusion matrix
/// slot names and per-fold row grouping, and that no warnings are produced.
/// </summary>
public void TestCrossValidationMacroWithMultiClass()
{
    var trainPath = GetDataPath(@"Train-Tiny-28x28.txt");
    var env = new MLContext(42);

    // Sub-graph executed per fold: no-op entry node -> SDCA learner -> combined model.
    var subGraph = env.CreateExperiment();
    var nop = new Legacy.Transforms.NoOperation();
    var nopOutput = subGraph.Add(nop);

    var learner = new Legacy.Trainers.StochasticDualCoordinateAscentClassifier
    {
        TrainingData = nopOutput.OutputData,
        NumThreads = 1
    };
    var learnerOutput = subGraph.Add(learner);

    var combiner = new Legacy.Transforms.ManyHeterogeneousModelCombiner
    {
        TransformModels = new ArrayVar<TransformModel>(nopOutput.Model),
        PredictorModel = learnerOutput.PredictorModel
    };
    var combinerOutput = subGraph.Add(combiner);

    // Outer graph: load the data and hand it to the cross-validator.
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(trainPath);
    var importOutput = experiment.Add(importInput);

    var crossValidate = new Legacy.Models.CrossValidator
    {
        Data = importOutput.Data,
        Nodes = subGraph,
        Kind = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer,
        TransformModel = null
    };
    crossValidate.Inputs.Data = nop.Data;
    crossValidate.Outputs.PredictorModel = combinerOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);

    experiment.Compile();
    importInput.SetInput(env, experiment);
    experiment.Run();

    // ---- Overall metrics ----
    var metrics = experiment.GetOutput(crossValidateOutput.OverallMetrics);
    var metricsSchema = metrics.Schema;
    Assert.True(metricsSchema.TryGetColumnIndex("Accuracy(micro-avg)", out int metricCol));
    Assert.True(metricsSchema.TryGetColumnIndex("Fold Index", out int foldCol));

    using (var rows = metrics.GetRowCursor(col => col == metricCol || col == foldCol))
    {
        var readMetric = rows.GetGetter<double>(metricCol);
        var readFold = rows.GetGetter<ReadOnlyMemory<char>>(foldCol);
        ReadOnlyMemory<char> foldLabel = default;

        // First row: the average.
        Assert.True(rows.MoveNext());
        double average = 0;
        readMetric(ref average);
        readFold(ref foldLabel);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", foldLabel));

        // Second row: the standard deviation.
        Assert.True(rows.MoveNext());
        double stdev = 0;
        readMetric(ref stdev);
        readFold(ref foldLabel);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", foldLabel));
        Assert.Equal(0.015, stdev, 3);

        // Remaining rows: one per fold; their mean must equal the reported average.
        double total = 0;
        double foldMetric = 0;
        for (int fold = 0; fold < 2; fold++)
        {
            Assert.True(rows.MoveNext());
            readMetric(ref foldMetric);
            readFold(ref foldLabel);
            total += foldMetric;
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + fold, foldLabel));
        }

        Assert.Equal(average, total / 2);
        Assert.False(rows.MoveNext());
    }

    // ---- Confusion matrix ----
    var confusion = experiment.GetOutput(crossValidateOutput.ConfusionMatrix);
    var confusionSchema = confusion.Schema;
    Assert.True(confusionSchema.TryGetColumnIndex("Count", out int countCol));
    Assert.True(confusionSchema.TryGetColumnIndex("Fold Index", out int confusionFoldCol));

    // Slot names of "Count" must be a text vector of size 10 holding "0".."9".
    var slotNamesType = confusionSchema[countCol].Metadata.Schema[MetadataUtils.Kinds.SlotNames].Type;
    Assert.True(slotNamesType is VectorType vecType && vecType.ItemType is TextType && vecType.Size == 10);

    var slotNames = default(VBuffer<ReadOnlyMemory<char>>);
    confusionSchema[countCol].GetSlotNames(ref slotNames);
    var slotNameValues = slotNames.GetValues();
    for (int slot = 0; slot < slotNameValues.Length; slot++)
    {
        Assert.True(ReadOnlyMemoryUtils.EqualsStr(slot.ToString(), slotNameValues[slot]));
    }

    using (var rows = confusion.GetRowCursor(col => true))
    {
        var readCounts = rows.GetGetter<VBuffer<double>>(countCol);
        var readFold = rows.GetGetter<ReadOnlyMemory<char>>(confusionFoldCol);
        var counts = default(VBuffer<double>);
        var foldLabel = default(ReadOnlyMemory<char>);

        // Rows come in groups of 10 per fold; after each full group the counter resets
        // and the expected label becomes "Fold 1".
        int rowsInGroup = 0;
        var expectedFold = "Fold 0";
        while (rows.MoveNext())
        {
            readCounts(ref counts);
            readFold(ref foldLabel);
            rowsInGroup++;
            Assert.True(ReadOnlyMemoryUtils.EqualsStr(expectedFold, foldLabel));

            if (rowsInGroup == 10)
            {
                rowsInGroup = 0;
                expectedFold = "Fold 1";
            }
        }

        // The cursor must end on a group boundary.
        Assert.Equal(0, rowsInGroup);
    }

    // ---- Warnings ----
    var warnings = experiment.GetOutput(crossValidateOutput.Warnings);
    using (var rows = warnings.GetRowCursor(col => true))
    {
        Assert.False(rows.MoveNext());
    }
}
/// <summary>
/// Runs the cross-validation macro with a weighted Poisson regressor and verifies the
/// "L1(avg)" overall metrics: for both the unweighted and weighted variants it checks
/// the "Average" and "Standard Deviation" rows, then the per-fold rows for two folds,
/// and finally that each average equals the mean of its fold values.
/// </summary>
[ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline
public void TestCrossValidationMacro()
{
    var dataPath = GetDataPath(TestDatasets.generatedRegressionDatasetmacro.trainFilename);
    var env = new MLContext(42);

    // Sub-graph executed per fold: no-op entry node -> generated "Weight1" column ->
    // Poisson regressor using that column as weight -> combined model.
    var subGraph = env.CreateExperiment();

    var nop = new Legacy.Transforms.NoOperation();
    var nopOutput = subGraph.Add(nop);

    var generate = new Legacy.Transforms.RandomNumberGenerator();
    generate.Column = new[] { new Legacy.Transforms.GenerateNumberTransformColumn() { Name = "Weight1" } };
    generate.Data = nopOutput.OutputData;
    var generateOutput = subGraph.Add(generate);

    var learnerInput = new Legacy.Trainers.PoissonRegressor
    {
        TrainingData = generateOutput.OutputData,
        NumThreads = 1,
        WeightColumn = "Weight1"
    };
    var learnerOutput = subGraph.Add(learnerInput);

    var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
    {
        TransformModels = new ArrayVar<TransformModel>(nopOutput.Model, generateOutput.Model),
        PredictorModel = learnerOutput.PredictorModel
    };
    var modelCombineOutput = subGraph.Add(modelCombine);

    // Outer graph: ';'-separated file with header; column 11 is the label and
    // columns 0-10 are the features.
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(dataPath)
    {
        Arguments = new Legacy.Data.TextLoaderArguments
        {
            Separator = new[] { ';' },
            HasHeader = true,
            Column = new[]
            {
                new TextLoaderColumn()
                {
                    Name = "Label",
                    Source = new [] { new TextLoaderRange(11) },
                    Type = Legacy.Data.DataKind.Num
                },
                new TextLoaderColumn()
                {
                    Name = "Features",
                    Source = new [] { new TextLoaderRange(0, 10) },
                    Type = Legacy.Data.DataKind.Num
                }
            }
        }
    };
    var importOutput = experiment.Add(importInput);

    var crossValidate = new Legacy.Models.CrossValidator
    {
        Data = importOutput.Data,
        Nodes = subGraph,
        Kind = Legacy.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer,
        TransformModel = null,
        WeightColumn = "Weight1"
    };
    crossValidate.Inputs.Data = nop.Data;
    crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);

    experiment.Compile();
    importInput.SetInput(env, experiment);
    experiment.Run();

    var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);
    var schema = data.Schema;
    var b = schema.TryGetColumnIndex("L1(avg)", out int metricCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("IsWeighted", out int isWeightedCol);
    // Fail here with a clear message if the column is missing, instead of silently
    // continuing with a default index and failing later on an unrelated getter.
    Assert.True(b);

    using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol || col == isWeightedCol))
    {
        var getter = cursor.GetGetter<double>(metricCol);
        var foldGetter = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
        ReadOnlyMemory<char> fold = default;
        var isWeightedGetter = cursor.GetGetter<bool>(isWeightedCol);
        bool isWeighted = default;
        double avg = 0;
        double weightedAvg = 0;

        // Summary rows: (Average, Standard Deviation) for unweighted (w == 0),
        // then the same pair for weighted (w == 1).
        for (int w = 0; w < 2; w++)
        {
            // Get the average.
            b = cursor.MoveNext();
            Assert.True(b);
            if (w == 1)
            {
                getter(ref weightedAvg);
            }
            else
            {
                getter(ref avg);
            }
            foldGetter(ref fold);
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
            isWeightedGetter(ref isWeighted);
            Assert.True(isWeighted == (w == 1));

            // Get the standard deviation.
            b = cursor.MoveNext();
            Assert.True(b);
            double stdev = 0;
            getter(ref stdev);
            foldGetter(ref fold);
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
            if (w == 1)
            {
                Assert.Equal(1.585, stdev, 3);
            }
            else
            {
                Assert.Equal(1.39, stdev, 2);
            }
            isWeightedGetter(ref isWeighted);
            Assert.True(isWeighted == (w == 1));
        }

        // Per-fold rows: for each fold, the unweighted row followed by the weighted row.
        double sum = 0;
        double weightedSum = 0;
        for (int f = 0; f < 2; f++)
        {
            for (int w = 0; w < 2; w++)
            {
                b = cursor.MoveNext();
                Assert.True(b);
                double val = 0;
                getter(ref val);
                foldGetter(ref fold);
                if (w == 1)
                {
                    weightedSum += val;
                }
                else
                {
                    sum += val;
                }
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                isWeightedGetter(ref isWeighted);
                Assert.True(isWeighted == (w == 1));
            }
        }

        // Each reported average must equal the mean of its two fold values.
        Assert.Equal(weightedAvg, weightedSum / 2);
        Assert.Equal(avg, sum / 2);

        // No rows beyond the summary and per-fold rows.
        b = cursor.MoveNext();
        Assert.False(b);
    }
}
/// <summary>
/// Runs the binary cross-validation macro over a one-hot + concat + logistic regression
/// sub-graph and checks that the first overall-metrics view holds a single row whose
/// AUC is 0.87 to one decimal place.
/// </summary>
public void TestCrossValidationBinaryMacro()
{
    var adultPath = GetDataPath("adult.tiny.with-schema.txt");
    using (var env = new ConsoleEnvironment())
    {
        // Sub-graph executed per fold: one-hot "Categories" -> concatenate into
        // "Features" -> binary logistic regression -> combined model.
        var subGraph = env.CreateExperiment();

        var oneHot = new Legacy.Transforms.CategoricalOneHotVectorizer();
        oneHot.AddColumn("Categories");
        var oneHotOutput = subGraph.Add(oneHot);

        var concat = new Legacy.Transforms.ColumnConcatenator { Data = oneHotOutput.OutputData };
        concat.AddColumn("Features", "Categories", "NumericFeatures");
        var concatOutput = subGraph.Add(concat);

        var trainer = new Legacy.Trainers.LogisticRegressionBinaryClassifier
        {
            TrainingData = concatOutput.OutputData,
            NumThreads = 1
        };
        var trainerOutput = subGraph.Add(trainer);

        var combiner = new Legacy.Transforms.ManyHeterogeneousModelCombiner
        {
            TransformModels = new ArrayVar<ITransformModel>(oneHotOutput.Model, concatOutput.Model),
            PredictorModel = trainerOutput.PredictorModel
        };
        var combinerOutput = subGraph.Add(combiner);

        // Outer graph: load the data and hand it to the binary cross-validator.
        var experiment = env.CreateExperiment();
        var importInput = new Legacy.Data.TextLoader(adultPath);
        var importOutput = experiment.Add(importInput);

        var crossValidateBinary = new Legacy.Models.BinaryCrossValidator
        {
            Data = importOutput.Data,
            Nodes = subGraph
        };
        crossValidateBinary.Inputs.Data = oneHot.Data;
        crossValidateBinary.Outputs.Model = combinerOutput.PredictorModel;
        var crossValidateOutput = experiment.Add(crossValidateBinary);

        experiment.Compile();
        importInput.SetInput(env, experiment);
        experiment.Run();

        var metrics = experiment.GetOutput(crossValidateOutput.OverallMetrics[0]);
        Assert.True(metrics.Schema.TryGetColumnIndex("AUC", out int aucCol));

        using (var rows = metrics.GetRowCursor(col => col == aucCol))
        {
            var readAuc = rows.GetGetter<double>(aucCol);

            Assert.True(rows.MoveNext());
            double auc = 0;
            readAuc(ref auc);
            Assert.Equal(0.87, auc, 1);

            // Only one metrics row is expected in this view.
            Assert.False(rows.MoveNext());
        }
    }
}
/// <summary>
/// Loads SparseData.txt through the legacy TextLoader with sparse support enabled and
/// checks the five float columns of each of the three expected rows.
/// </summary>
public void CanSuccessfullyRetrieveSparseData()
{
    string sparsePath = GetDataPath("SparseData.txt");
    var textLoader = new Legacy.Data.TextLoader(sparsePath).CreateFrom<SparseInput>(
        useHeader: true,
        allowQuotedStrings: false,
        supportSparse: true);

    var mlContext = new MLContext();
    Experiment graph = mlContext.CreateExperiment();
    Legacy.ILearningPipelineDataStep loadStep =
        textLoader.ApplyStep(null, graph) as Legacy.ILearningPipelineDataStep;

    graph.Compile();
    textLoader.SetInput(mlContext, graph);
    graph.Run();

    IDataView view = graph.GetOutput(loadStep.Data);
    Assert.NotNull(view);

    // Expected values per row, one entry per column 0..4.
    float[][] expectedRows =
    {
        new float[] { 1, 2, 3, 4, 5 },
        new float[] { 0, 0, 0, 4, 5 },
        new float[] { 0, 2, 0, 0, 0 }
    };

    using (var rows = view.GetRowCursor(column => true))
    {
        // One getter per column, created before the cursor is advanced.
        var readers = new ValueGetter<float>[5];
        for (int column = 0; column < readers.Length; column++)
        {
            readers[column] = rows.GetGetter<float>(column);
        }

        foreach (float[] expected in expectedRows)
        {
            Assert.True(rows.MoveNext());
            for (int column = 0; column < readers.Length; column++)
            {
                float actual = 0;
                readers[column](ref actual);
                Assert.Equal(expected[column], actual);
            }
        }

        // No rows beyond the expected ones.
        Assert.False(rows.MoveNext());
    }
}