Ejemplo n.º 1
0
        // Reads "QuotingData.csv" through the legacy TextLoader with quoted strings allowed and
        // verifies that the resulting data view holds exactly three rows with the expected
        // ID (column 0) and text (column 1) values.
        public void CanSuccessfullyRetrieveQuotedData()
        {
            string dataPath = GetDataPath("QuotingData.csv");
            var    loader   = new Legacy.Data.TextLoader(dataPath).CreateFrom<QuoteInput>(useHeader: true, separator: ',', allowQuotedStrings: true, supportSparse: false);

            var        environment = new MLContext();
            Experiment experiment  = environment.CreateExperiment();

            // "as" yields null when the applied step is not a data step; assert it here so the
            // failure is reported at its cause rather than as a NullReferenceException on output.Data.
            Legacy.ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as Legacy.ILearningPipelineDataStep;
            Assert.NotNull(output);

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor(col => true))
            {
                var idGetter   = cursor.GetGetter<float>(0);
                var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(1);

                // Advances the cursor by one row and checks both column values of that row.
                void AssertNextRow(float expectedId, string expectedText)
                {
                    Assert.True(cursor.MoveNext());

                    float id = 0;
                    idGetter(ref id);
                    Assert.Equal(expectedId, id);

                    ReadOnlyMemory<char> text = new ReadOnlyMemory<char>();
                    textGetter(ref text);
                    Assert.Equal(expectedText, text.ToString());
                }

                AssertNextRow(1, "This text contains comma, within quotes.");
                AssertNextRow(2, "This text contains extra punctuations and special characters.;*<>?!@#$%^&*()_+=-{}|[]:;'");
                AssertNextRow(3, "This text has no quotes");

                // No further rows are expected after the third one.
                Assert.False(cursor.MoveNext());
            }
        }
Ejemplo n.º 2
0
        // Reads "TrimData.csv" through the legacy TextLoader with trimWhitespace enabled and
        // verifies that the resulting data view holds exactly two rows whose text values
        // carry no trailing space.
        public void CanSuccessfullyTrimSpaces()
        {
            string dataPath = GetDataPath("TrimData.csv");
            var    loader   = new Legacy.Data.TextLoader(dataPath).CreateFrom<QuoteInput>(useHeader: true, separator: ',', allowQuotedStrings: false, supportSparse: false, trimWhitespace: true);

            var        environment = new MLContext();
            Experiment experiment  = environment.CreateExperiment();

            // "as" yields null when the applied step is not a data step; assert it here so the
            // failure is reported at its cause rather than as a NullReferenceException on output.Data.
            Legacy.ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as Legacy.ILearningPipelineDataStep;
            Assert.NotNull(output);

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor(col => true))
            {
                var idGetter   = cursor.GetGetter<float>(0);
                var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(1);

                // Advances the cursor by one row and checks both column values of that row.
                void AssertNextRow(float expectedId, string expectedText)
                {
                    Assert.True(cursor.MoveNext());

                    float id = 0;
                    idGetter(ref id);
                    Assert.Equal(expectedId, id);

                    ReadOnlyMemory<char> text = new ReadOnlyMemory<char>();
                    textGetter(ref text);
                    Assert.Equal(expectedText, text.ToString());
                }

                AssertNextRow(1, "There is a space at the end");
                AssertNextRow(2, "There is no space at the end");

                // No further rows are expected after the second one.
                Assert.False(cursor.MoveNext());
            }
        }
Ejemplo n.º 3
0
        // Runs the cross-validation macro with a multi-class logistic regression learner on
        // "Train-Tiny-28x28.txt" (rows filtered on "Label" and stratified by a "Strat" key column)
        // and checks that exactly two specific warnings are reported.
        public void TestCrossValidationMacroMultiClassWithWarnings()
        {
            var trainFile = GetDataPath(@"Train-Tiny-28x28.txt");
            var mlContext = new MLContext(42);

            // Sub-graph executed per fold: a no-op entry node feeding the learner.
            var foldGraph    = mlContext.CreateExperiment();
            var entryNode    = new Legacy.Transforms.NoOperation();
            var entryOutput  = foldGraph.Add(entryNode);
            var trainerNode  = new Legacy.Trainers.LogisticRegressionClassifier
            {
                TrainingData = entryOutput.OutputData,
                NumThreads   = 1
            };
            var trainerOutput = foldGraph.Add(trainerNode);

            // Outer graph: load -> row range filter on "Label" -> key-map "Label" into "Strat".
            var mainGraph    = mlContext.CreateExperiment();
            var loaderNode   = new Legacy.Data.TextLoader(trainFile);
            var loaderOutput = mainGraph.Add(loaderNode);

            var rangeFilter = new Legacy.Transforms.RowRangeFilter
            {
                Data   = loaderOutput.Data,
                Column = "Label",
                Min    = 0,
                Max    = 5
            };
            var rangeFilterOutput = mainGraph.Add(rangeFilter);

            var keyMapper = new Legacy.Transforms.TextToKeyConverter
            {
                Column = new[]
                {
                    new Legacy.Transforms.ValueToKeyMappingTransformerColumn()
                    {
                        Source = "Label",
                        Name   = "Strat",
                        Sort   = Legacy.Transforms.ValueToKeyMappingTransformerSortOrder.Value
                    }
                },
                Data = rangeFilterOutput.OutputData
            };
            var keyMapperOutput = mainGraph.Add(keyMapper);

            var cvNode = new Legacy.Models.CrossValidator
            {
                Data                 = keyMapperOutput.OutputData,
                Nodes                = foldGraph,
                Kind                 = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer,
                TransformModel       = null,
                StratificationColumn = "Strat"
            };
            cvNode.Inputs.Data            = entryNode.Data;
            cvNode.Outputs.PredictorModel = trainerOutput.PredictorModel;
            var cvOutput = mainGraph.Add(cvNode);

            mainGraph.Compile();
            loaderNode.SetInput(mlContext, mainGraph);
            mainGraph.Run();

            var warningView = mainGraph.GetOutput(cvOutput.Warnings);

            bool hasWarningColumn = warningView.Schema.TryGetColumnIndex("WarningText", out int warningCol);
            Assert.True(hasWarningColumn);

            using (var rows = warningView.GetRowCursor(c => c == warningCol))
            {
                var readWarning = rows.GetGetter<ReadOnlyMemory<char>>(warningCol);
                ReadOnlyMemory<char> message = new ReadOnlyMemory<char>();

                // First warning row.
                Assert.True(rows.MoveNext());
                readWarning(ref message);
                Assert.Contains("test instances with class values not seen in the training set.", message.ToString());

                // Second warning row.
                Assert.True(rows.MoveNext());
                readWarning(ref message);
                Assert.Contains("Detected columns of variable length: SortedScores, SortedClasses", message.ToString());

                // Nothing else is expected.
                Assert.False(rows.MoveNext());
            }
        }
Ejemplo n.º 4
0
        // Runs the cross-validation macro with a multi-class SDCA learner on "Train-Tiny-28x28.txt"
        // and checks three outputs: the overall metrics (average, standard deviation, two folds),
        // the confusion matrix (10 slots, 10 rows per fold) and the warnings (none expected).
        public void TestCrossValidationMacroWithMultiClass()
        {
            var trainFile = GetDataPath(@"Train-Tiny-28x28.txt");
            var mlContext = new MLContext(42);

            // Sub-graph executed per fold: no-op entry -> learner -> combined model.
            var foldGraph   = mlContext.CreateExperiment();
            var entryNode   = new Legacy.Transforms.NoOperation();
            var entryOutput = foldGraph.Add(entryNode);

            var trainerNode = new Legacy.Trainers.StochasticDualCoordinateAscentClassifier
            {
                TrainingData = entryOutput.OutputData,
                NumThreads   = 1
            };
            var trainerOutput = foldGraph.Add(trainerNode);

            var combinerNode = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar<TransformModel>(entryOutput.Model),
                PredictorModel  = trainerOutput.PredictorModel
            };
            var combinerOutput = foldGraph.Add(combinerNode);

            // Outer graph: loader feeding the cross-validator.
            var mainGraph    = mlContext.CreateExperiment();
            var loaderNode   = new Legacy.Data.TextLoader(trainFile);
            var loaderOutput = mainGraph.Add(loaderNode);

            var cvNode = new Legacy.Models.CrossValidator
            {
                Data           = loaderOutput.Data,
                Nodes          = foldGraph,
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer,
                TransformModel = null
            };
            cvNode.Inputs.Data            = entryNode.Data;
            cvNode.Outputs.PredictorModel = combinerOutput.PredictorModel;
            var cvOutput = mainGraph.Add(cvNode);

            mainGraph.Compile();
            loaderNode.SetInput(mlContext, mainGraph);
            mainGraph.Run();

            // ---- Overall metrics ----
            var metricsView   = mainGraph.GetOutput(cvOutput.OverallMetrics);
            var metricsSchema = metricsView.Schema;

            bool found = metricsSchema.TryGetColumnIndex("Accuracy(micro-avg)", out int accuracyCol);
            Assert.True(found);
            found = metricsSchema.TryGetColumnIndex("Fold Index", out int metricsFoldCol);
            Assert.True(found);

            using (var rows = metricsView.GetRowCursor(c => c == accuracyCol || c == metricsFoldCol))
            {
                var readAccuracy = rows.GetGetter<double>(accuracyCol);
                var readFold     = rows.GetGetter<ReadOnlyMemory<char>>(metricsFoldCol);
                ReadOnlyMemory<char> foldLabel = default;

                // Row 1: the average across folds.
                Assert.True(rows.MoveNext());
                double average = 0;
                readAccuracy(ref average);
                readFold(ref foldLabel);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", foldLabel));

                // Row 2: the standard deviation across folds.
                Assert.True(rows.MoveNext());
                double deviation = 0;
                readAccuracy(ref deviation);
                readFold(ref foldLabel);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", foldLabel));
                Assert.Equal(0.015, deviation, 3);

                // Remaining rows: one per fold; their mean must match the reported average.
                double total     = 0;
                double foldValue = 0;
                int    foldNo    = 0;
                while (foldNo < 2)
                {
                    Assert.True(rows.MoveNext());
                    readAccuracy(ref foldValue);
                    readFold(ref foldLabel);
                    total += foldValue;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + foldNo, foldLabel));
                    foldNo++;
                }
                Assert.Equal(average, total / 2);

                Assert.False(rows.MoveNext());
            }

            // ---- Confusion matrix ----
            var confusionView   = mainGraph.GetOutput(cvOutput.ConfusionMatrix);
            var confusionSchema = confusionView.Schema;

            found = confusionSchema.TryGetColumnIndex("Count", out int countCol);
            Assert.True(found);
            found = confusionSchema.TryGetColumnIndex("Fold Index", out int confusionFoldCol);
            Assert.True(found);

            // The "Count" column must carry slot names: a text vector of size 10.
            var slotNamesType = confusionSchema[countCol].Metadata.Schema[MetadataUtils.Kinds.SlotNames].Type;
            Assert.True(slotNamesType is VectorType vectorType && vectorType.ItemType is TextType && vectorType.Size == 10);

            var slotNameBuffer = default(VBuffer<ReadOnlyMemory<char>>);
            confusionSchema[countCol].GetSlotNames(ref slotNameBuffer);
            var names = slotNameBuffer.GetValues();

            // Each slot name is expected to equal its own index rendered as text.
            for (int slot = 0; slot < names.Length; slot++)
            {
                Assert.True(ReadOnlyMemoryUtils.EqualsStr(slot.ToString(), names[slot]));
            }

            using (var rows = confusionView.GetRowCursor(c => true))
            {
                var readCounts   = rows.GetGetter<VBuffer<double>>(countCol);
                var readFold     = rows.GetGetter<ReadOnlyMemory<char>>(confusionFoldCol);
                var counts       = default(VBuffer<double>);
                var foldLabel    = default(ReadOnlyMemory<char>);
                int rowsInFold   = 0;
                var expectedFold = "Fold 0";

                while (rows.MoveNext())
                {
                    readCounts(ref counts);
                    readFold(ref foldLabel);
                    rowsInFold++;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr(expectedFold, foldLabel));

                    // After ten rows the next block of rows must belong to the second fold.
                    if (rowsInFold == 10)
                    {
                        rowsInFold   = 0;
                        expectedFold = "Fold 1";
                    }
                }

                // The counter is reset after every tenth row, so it ends at zero only when
                // the total number of rows is a multiple of ten.
                Assert.Equal(0, rowsInFold);
            }

            // ---- Warnings ----
            var warningView = mainGraph.GetOutput(cvOutput.Warnings);
            using (var rows = warningView.GetRowCursor(c => true))
            {
                Assert.False(rows.MoveNext());
            }
        }
Ejemplo n.º 5
0
        // Runs the cross-validation macro with a weighted Poisson regressor (weights come from a
        // generated "Weight1" column) and checks the "L1(avg)" overall metric: average and standard
        // deviation rows come first, each in an unweighted and a weighted variant, followed by
        // unweighted/weighted rows for each of the two folds.
        [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline
        public void TestCrossValidationMacro()
        {
            var dataPath = GetDataPath(TestDatasets.generatedRegressionDatasetmacro.trainFilename);
            var env      = new MLContext(42);
            var subGraph = env.CreateExperiment();

            // Sub-graph executed per fold: no-op entry -> random weight column -> learner -> combined model.
            var nop       = new Legacy.Transforms.NoOperation();
            var nopOutput = subGraph.Add(nop);

            var generate = new Legacy.Transforms.RandomNumberGenerator();

            generate.Column = new[] { new Legacy.Transforms.GenerateNumberTransformColumn()
                                      {
                                          Name = "Weight1"
                                      } };
            generate.Data = nopOutput.OutputData;
            var generateOutput = subGraph.Add(generate);

            var learnerInput = new Legacy.Trainers.PoissonRegressor
            {
                TrainingData = generateOutput.OutputData,
                NumThreads   = 1,
                WeightColumn = "Weight1"
            };
            var learnerOutput = subGraph.Add(learnerInput);

            var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar<TransformModel>(nopOutput.Model, generateOutput.Model),
                PredictorModel  = learnerOutput.PredictorModel
            };
            var modelCombineOutput = subGraph.Add(modelCombine);

            // Outer graph: ';'-separated file with a header; label in column 11, features in columns 0-10.
            var experiment  = env.CreateExperiment();
            var importInput = new Legacy.Data.TextLoader(dataPath)
            {
                Arguments = new Legacy.Data.TextLoaderArguments
                {
                    Separator = new[] { ';' },
                    HasHeader = true,
                    Column    = new[]
                    {
                        new TextLoaderColumn()
                        {
                            Name   = "Label",
                            Source = new [] { new TextLoaderRange(11) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Features",
                            Source = new [] { new TextLoaderRange(0, 10) },
                            Type   = Legacy.Data.DataKind.Num
                        }
                    }
                }
            };
            var importOutput = experiment.Add(importInput);

            var crossValidate = new Legacy.Models.CrossValidator
            {
                Data           = importOutput.Data,
                Nodes          = subGraph,
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer,
                TransformModel = null,
                WeightColumn   = "Weight1"
            };

            crossValidate.Inputs.Data            = nop.Data;
            crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
            var crossValidateOutput = experiment.Add(crossValidate);

            experiment.Compile();
            importInput.SetInput(env, experiment);
            experiment.Run();
            var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);

            var schema = data.Schema;
            var b      = schema.TryGetColumnIndex("L1(avg)", out int metricCol);

            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
            Assert.True(b);
            b = schema.TryGetColumnIndex("IsWeighted", out int isWeightedCol);
            // This lookup result was previously never checked; assert it like the other two so a
            // missing "IsWeighted" column fails here instead of further down when the column is read.
            Assert.True(b);
            using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol || col == isWeightedCol))
            {
                var getter                 = cursor.GetGetter<double>(metricCol);
                var foldGetter             = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
                ReadOnlyMemory<char> fold  = default;
                var    isWeightedGetter    = cursor.GetGetter<bool>(isWeightedCol);
                bool   isWeighted          = default;
                double avg                 = 0;
                double weightedAvg         = 0;

                // Summary rows: w == 0 is the unweighted pass, w == 1 the weighted pass.
                for (int w = 0; w < 2; w++)
                {
                    // Get the average.
                    b = cursor.MoveNext();
                    Assert.True(b);
                    if (w == 1)
                    {
                        getter(ref weightedAvg);
                    }
                    else
                    {
                        getter(ref avg);
                    }
                    foldGetter(ref fold);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
                    isWeightedGetter(ref isWeighted);
                    Assert.True(isWeighted == (w == 1));

                    // Get the standard deviation.
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double stdev = 0;
                    getter(ref stdev);
                    foldGetter(ref fold);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
                    if (w == 1)
                    {
                        Assert.Equal(1.585, stdev, 3);
                    }
                    else
                    {
                        Assert.Equal(1.39, stdev, 2);
                    }
                    isWeightedGetter(ref isWeighted);
                    Assert.True(isWeighted == (w == 1));
                }

                // Per-fold rows: for each fold an unweighted row followed by a weighted row.
                double sum         = 0;
                double weightedSum = 0;
                for (int f = 0; f < 2; f++)
                {
                    for (int w = 0; w < 2; w++)
                    {
                        b = cursor.MoveNext();
                        Assert.True(b);
                        double val = 0;
                        getter(ref val);
                        foldGetter(ref fold);
                        if (w == 1)
                        {
                            weightedSum += val;
                        }
                        else
                        {
                            sum += val;
                        }
                        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                        isWeightedGetter(ref isWeighted);
                        Assert.True(isWeighted == (w == 1));
                    }
                }

                // The reported averages must equal the mean of the two per-fold values.
                Assert.Equal(weightedAvg, weightedSum / 2);
                Assert.Equal(avg, sum / 2);
                b = cursor.MoveNext();
                Assert.False(b);
            }
        }
Ejemplo n.º 6
0
        // Runs the binary cross-validation macro on "adult.tiny.with-schema.txt" with a
        // one-hot + concatenate + logistic regression sub-graph and checks that the first
        // overall-metrics output contains a single row whose "AUC" value matches 0.87
        // (Assert.Equal called with a precision argument of 1).
        public void TestCrossValidationBinaryMacro()
        {
            var inputFile = GetDataPath("adult.tiny.with-schema.txt");

            using (var host = new ConsoleEnvironment())
            {
                // Sub-graph executed per fold.
                var foldGraph = host.CreateExperiment();

                var oneHotNode = new Legacy.Transforms.CategoricalOneHotVectorizer();
                oneHotNode.AddColumn("Categories");
                var oneHotOutput = foldGraph.Add(oneHotNode);

                var concatNode = new Legacy.Transforms.ColumnConcatenator
                {
                    Data = oneHotOutput.OutputData
                };
                concatNode.AddColumn("Features", "Categories", "NumericFeatures");
                var concatOutput = foldGraph.Add(concatNode);

                var trainerNode = new Legacy.Trainers.LogisticRegressionBinaryClassifier
                {
                    TrainingData = concatOutput.OutputData,
                    NumThreads   = 1
                };
                var trainerOutput = foldGraph.Add(trainerNode);

                var combinerNode = new Legacy.Transforms.ManyHeterogeneousModelCombiner
                {
                    TransformModels = new ArrayVar<ITransformModel>(oneHotOutput.Model, concatOutput.Model),
                    PredictorModel  = trainerOutput.PredictorModel
                };
                var combinerOutput = foldGraph.Add(combinerNode);

                // Outer graph: loader feeding the binary cross-validator.
                var mainGraph    = host.CreateExperiment();
                var loaderNode   = new Legacy.Data.TextLoader(inputFile);
                var loaderOutput = mainGraph.Add(loaderNode);

                var cvNode = new Legacy.Models.BinaryCrossValidator
                {
                    Data  = loaderOutput.Data,
                    Nodes = foldGraph
                };
                cvNode.Inputs.Data   = oneHotNode.Data;
                cvNode.Outputs.Model = combinerOutput.PredictorModel;
                var cvOutput = mainGraph.Add(cvNode);

                mainGraph.Compile();
                loaderNode.SetInput(host, mainGraph);
                mainGraph.Run();

                var metricsView = mainGraph.GetOutput(cvOutput.OverallMetrics[0]);

                bool hasAuc = metricsView.Schema.TryGetColumnIndex("AUC", out int aucCol);
                Assert.True(hasAuc);

                using (var rows = metricsView.GetRowCursor(c => c == aucCol))
                {
                    var readAuc = rows.GetGetter<double>(aucCol);

                    // Exactly one metrics row is expected.
                    Assert.True(rows.MoveNext());
                    double aucValue = 0;
                    readAuc(ref aucValue);
                    Assert.Equal(0.87, aucValue, 1);

                    Assert.False(rows.MoveNext());
                }
            }
        }
Ejemplo n.º 7
0
        // Reads "SparseData.txt" through the legacy TextLoader with sparse support enabled and
        // verifies that the resulting data view holds exactly three rows, comparing the first
        // five float columns of each row against the expected values.
        public void CanSuccessfullyRetrieveSparseData()
        {
            string dataPath = GetDataPath("SparseData.txt");
            var    loader   = new Legacy.Data.TextLoader(dataPath).CreateFrom<SparseInput>(useHeader: true, allowQuotedStrings: false, supportSparse: true);

            var        environment = new MLContext();
            Experiment experiment  = environment.CreateExperiment();

            // "as" yields null when the applied step is not a data step; assert it here so the
            // failure is reported at its cause rather than as a NullReferenceException on output.Data.
            Legacy.ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as Legacy.ILearningPipelineDataStep;
            Assert.NotNull(output);

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor(col => true))
            {
                var getters = new ValueGetter<float>[] {
                    cursor.GetGetter<float>(0),
                    cursor.GetGetter<float>(1),
                    cursor.GetGetter<float>(2),
                    cursor.GetGetter<float>(3),
                    cursor.GetGetter<float>(4)
                };

                // Expected column values, one array per row in file order.
                float[][] expectedRows =
                {
                    new float[] { 1, 2, 3, 4, 5 },
                    new float[] { 0, 0, 0, 4, 5 },
                    new float[] { 0, 2, 0, 0, 0 }
                };

                foreach (float[] expected in expectedRows)
                {
                    Assert.True(cursor.MoveNext());

                    for (int i = 0; i < getters.Length; i++)
                    {
                        float value = 0;
                        getters[i](ref value);
                        Assert.Equal(expected[i], value);
                    }
                }

                // No further rows are expected after the third one.
                Assert.False(cursor.MoveNext());
            }
        }