public void SetupBreastCancerPipeline()
        {
            // Builds the sample input, trains a single-threaded SDCA binary classifier on
            // "breast-cancer.txt", and stores the resulting prediction engine in _breastCancerModel.
            _breastCancerExample = new BreastCancerData()
            {
                Features = new[] { 5f, 1f, 1f, 1f, 2f, 1f, 3f, 1f, 1f }
            };

            string dataPath = BaseTestClass.GetDataPath("breast-cancer.txt");

            var env = new MLContext(seed: 1, conc: 1);

            // Column 0 holds the label; "Features" is read from TextLoader.Range(1, 9).
            var schema = new[]
            {
                new TextLoader.Column("Label", DataKind.BL, 0),
                new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(1, 9) })
            };
            var reader = new TextLoader(env, columns: schema, hasHeader: false);
            IDataView data = reader.Read(dataPath);

            // Single-threaded training with a loose convergence tolerance.
            var sdcaOptions = new SdcaBinaryTrainer.Options
            {
                NumThreads = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var trainer = env.BinaryClassification.Trainers.StochasticDualCoordinateAscent(sdcaOptions);

            var model = trainer.Fit(data);

            _breastCancerModel = model.CreatePredictionEngine<BreastCancerData, BreastCancerPrediction>(env);
        }
        public void Setup()
        {
            // Trains a sentiment model, saves it to "temp.zip", reloads it into _trainedModel,
            // and also loads TestModels/SentimentModel.zip into _trainedModelOldFormat.
            _context = new MLContext(1);

            var trainingFile = BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = _context.Data.LoadFromTextFile<SentimentData>(trainingFile, hasHeader: true);

            // Text featurization -> cache checkpoint -> single-threaded non-calibrated SDCA.
            var featurizer = _context.Transforms.Text.FeaturizeText("Features", "SentimentText");
            var sdcaOptions = new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 };
            var pipeline = featurizer
                .AppendCacheCheckpoint(_context)
                .Append(_context.BinaryClassification.Trainers.SdcaNonCalibrated(sdcaOptions));

            var model = pipeline.Fit(data);

            // Round-trip the freshly trained model through disk.
            const string modelPath = "temp.zip";
            _context.Model.Save(model, data.Schema, modelPath);
            _trainedModel = _context.Model.Load(modelPath, out var inputSchema);

            // Load the model stored alongside the tests; the schema out-value is overwritten.
            var oldFormatPath = Path.Combine("TestModels", "SentimentModel.zip");
            _trainedModelOldFormat = _context.Model.Load(oldFormatPath, out inputSchema);
        }
Exemplo n.º 3
0
        void CrossValidation()
        {
            // Runs 5-fold cross-validation of an OLS regression pipeline over the housing
            // dataset, then checks the fold count, the result types, and each fold's metrics.
            var mlContext = new MLContext(seed: 1);

            // Load the housing training file using the dataset's own loader settings.
            var loader = mlContext.Data.CreateTextLoader(
                TestDatasets.housing.GetLoaderColumns(),
                hasHeader: TestDatasets.housing.fileHasHeader,
                separatorChar: TestDatasets.housing.fileSeparator);
            var data = loader.Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));

            // Pipeline for the housing data: concatenate features, copy the label, train OLS.
            var featureColumns = new string[]
            {
                "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
                "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"
            };
            var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
                .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
                .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares());

            var cvResult = mlContext.Regression.CrossValidate(data, pipeline, numFolds: 5);

            // Structural checks on the first fold and on the number of folds.
            var firstFold = cvResult[0];
            Assert.IsType<RegressionMetrics>(firstFold.Metrics);
            Assert.IsType<TransformerChain<RegressionPredictionTransformer<OrdinaryLeastSquaresRegressionModelParameters>>>(firstFold.Model);
            Assert.True(firstFold.ScoredHoldOutSet is IDataView);
            Assert.Equal(5, cvResult.Length);

            // Every fold must report valid metrics.
            for (int fold = 0; fold < cvResult.Length; fold++)
            {
                Common.AssertMetrics(cvResult[fold].Metrics);
            }
        }
Exemplo n.º 4
0
        public void ReconfigurablePrediction()
        {
            // Trains an OLS regression on the train part of a housing-data split
            // (testFraction 0.2) and validates the metrics computed on the test part.
            var mlContext = new MLContext(seed: 789);

            // Load the housing data and split it into train and test sets.
            var loader = mlContext.Data.CreateTextLoader(
                TestDatasets.housing.GetLoaderColumns(),
                hasHeader: TestDatasets.housing.fileHasHeader,
                separatorChar: TestDatasets.housing.fileSeparator);
            var data = loader.Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
            var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.2);

            // Concatenate the feature columns, copy the label, and train with OLS.
            var featureColumns = new string[]
            {
                "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
                "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"
            };
            var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
                .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
                .Append(mlContext.Regression.Trainers.Ols());

            var model = pipeline.Fit(split.TrainSet);

            // Score the held-out set and check the resulting regression metrics.
            var scoredTest = model.Transform(split.TestSet);
            Common.AssertMetrics(mlContext.Regression.Evaluate(scoredTest));

            // Todo #2465: Allow the setting of threshold and thresholdColumn for scoring.
            // This is no longer possible in the API
            //var newModel = new BinaryPredictionTransformer<IPredictorProducing<float>>(ml, model.Model, trainData.Schema, model.FeatureColumnName, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability);
            //var newScoredTest = newModel.Transform(pipeline.Transform(testData));
            //var newMetrics = mlContext.BinaryClassification.Evaluate(scoredTest);
            // And the Threshold and ThresholdColumn properties are not settable.
            //var predictor = model.LastTransformer;
            //predictor.Threshold = 0.01; // Not possible
        }
        public void SetupSentimentPipeline()
        {
            // Builds the sample input, trains text featurization + single-threaded SDCA on the
            // wikipedia-detox sample, and stores the prediction engine in _sentimentModel.
            _sentimentExample = new SentimentData()
            {
                SentimentText = "Not a big fan of this."
            };

            string dataPath = BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv");

            var env = new MLContext(seed: 1, conc: 1);

            // Column 0 is the boolean label, column 1 the raw text; the file has a header row.
            var schema = new[]
            {
                new TextLoader.Column("Label", DataKind.BL, 0),
                new TextLoader.Column("SentimentText", DataKind.Text, 1)
            };
            var reader = new TextLoader(env, columns: schema, hasHeader: true);
            IDataView data = reader.Read(dataPath);

            var sdcaOptions = new SdcaBinaryTrainer.Options
            {
                NumThreads = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var featurizer = new TextFeaturizingEstimator(env, "Features", "SentimentText");
            var pipeline = featurizer.Append(
                env.BinaryClassification.Trainers.StochasticDualCoordinateAscent(sdcaOptions));

            var model = pipeline.Fit(data);

            _sentimentModel = model.CreatePredictionEngine<SentimentData, SentimentPrediction>(env);
        }
        public void SetupSentimentPipeline()
        {
            // Builds the sample input, trains text featurization + single-threaded
            // non-calibrated SDCA, and stores the prediction engine in _sentimentModel.
            _sentimentExample = new SentimentData()
            {
                SentimentText = "Not a big fan of this."
            };

            string dataPath = BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv");

            var mlContext = new MLContext(seed: 1, conc: 1);

            // Loader configuration: boolean label in column 0, text in column 1, header present.
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("SentimentText", DataKind.String, 1)
                },
                HasHeader = true,
            };
            var loader = new TextLoader(mlContext, options: loaderOptions);
            IDataView data = loader.Load(dataPath);

            var sdcaOptions = new SdcaNonCalibratedBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText");
            var pipeline = featurizer.Append(
                mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(sdcaOptions));

            var model = pipeline.Fit(data);

            _sentimentModel = model.CreatePredictionEngine<SentimentData, SentimentPrediction>(mlContext);
        }
        public void SetupBreastCancerPipeline()
        {
            // Builds the sample input, trains a single-threaded non-calibrated SDCA classifier
            // on "breast-cancer.txt", and stores the prediction engine in _breastCancerModel.
            _breastCancerExample = new BreastCancerData()
            {
                Features = new[] { 5f, 1f, 1f, 1f, 2f, 1f, 3f, 1f, 1f }
            };

            string dataPath = BaseTestClass.GetDataPath("breast-cancer.txt");

            var env = new MLContext(seed: 1, conc: 1);

            // Loader configuration: boolean label in column 0, "Features" from
            // TextLoader.Range(1, 9), no header row.
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(1, 9) })
                },
                HasHeader = false,
            };
            var loader = new TextLoader(env, options: loaderOptions);
            IDataView data = loader.Load(dataPath);

            var sdcaOptions = new SdcaNonCalibratedBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var trainer = env.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(sdcaOptions);

            var model = trainer.Fit(data);

            _breastCancerModel = model.CreatePredictionEngine<BreastCancerData, BreastCancerPrediction>(env);
        }
Exemplo n.º 8
0
        public void SetupIrisPipeline()
        {
            // Builds the sample input, trains column concatenation + single-threaded multiclass
            // SDCA on "iris.txt", and stores the prediction engine in _irisModel.
            _irisExample = new IrisData()
            {
                SepalLength = 3.3f,
                SepalWidth = 1.6f,
                PetalLength = 0.2f,
                PetalWidth = 5.1f,
            };

            string dataPath = BaseTestClass.GetDataPath("iris.txt");

            var env = new MLContext(seed: 1, conc: 1);

            // Label in column 0 followed by the four measurements; the file has a header row.
            var schema = new[]
            {
                new TextLoader.Column("Label", DataKind.R4, 0),
                new TextLoader.Column("SepalLength", DataKind.R4, 1),
                new TextLoader.Column("SepalWidth", DataKind.R4, 2),
                new TextLoader.Column("PetalLength", DataKind.R4, 3),
                new TextLoader.Column("PetalWidth", DataKind.R4, 4),
            };
            var reader = new TextLoader(env, columns: schema, hasHeader: true);
            IDataView data = reader.Read(dataPath);

            var featureColumns = new[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" };
            var concatenator = new ColumnConcatenatingEstimator(env, "Features", featureColumns);
            var pipeline = concatenator.Append(
                new SdcaMultiClassTrainer(
                    env,
                    "Label",
                    "Features",
                    advancedSettings: (s) =>
                    {
                        s.NumThreads = 1;
                        s.ConvergenceTolerance = 1e-2f;
                    }));

            var model = pipeline.Fit(data);

            _irisModel = model.CreatePredictionEngine<IrisData, IrisPrediction>(env);
        }
Exemplo n.º 9
0
        public void SetupTrainingSpeedTests()
        {
            // Resolves the Digits training file into _dataPath_Digits and throws
            // FileNotFoundException when that file is not present on disk.
            _dataPath_Digits = BaseTestClass.GetDataPath(TestDatasets.Digits.trainFilename);

            if (File.Exists(_dataPath_Digits))
            {
                return;
            }

            throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, _dataPath_Digits));
        }
Exemplo n.º 10
0
        public void SetupTrainingSpeedTests()
        {
            // Resolves the MSLR validation and training files and throws
            // FileNotFoundException for the first one that is missing.
            _mslrWeb10k_Validate = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.validFilename);
            _mslrWeb10k_Train = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.trainFilename);

            // Checked in this order: validation file first, then training file.
            foreach (var requiredFile in new[] { _mslrWeb10k_Validate, _mslrWeb10k_Train })
            {
                if (!File.Exists(requiredFile))
                {
                    throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, requiredFile));
                }
            }
        }
Exemplo n.º 11
0
        public void TrainWithValidationSet()
        {
            // Fits a featurization pipeline on the train split of the housing data, trains a
            // FastTree regressor with the other split passed as validation data, and checks
            // the regression metrics on both splits.
            var mlContext = new MLContext(seed: 1);

            // Load the housing data and carve out train and validation sets.
            var loader = mlContext.Data.CreateTextLoader(
                TestDatasets.housing.GetLoaderColumns(),
                hasHeader: TestDatasets.housing.fileHasHeader,
                separatorChar: TestDatasets.housing.fileSeparator);
            var data = loader.Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
            var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2);
            var trainData = dataSplit.TrainSet;
            var validData = dataSplit.TestSet;

            // Featurization only: concatenate features, copy the label, cache the result.
            var featureColumns = new string[]
            {
                "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
                "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"
            };
            var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
                .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
                .AppendCacheCheckpoint(mlContext) as IEstimator<ITransformer>;

            // Apply the fitted featurizer to both splits before training.
            var preprocessor = pipeline.Fit(trainData);
            var preprocessedTrainData = preprocessor.Transform(trainData);
            var preprocessedValidData = preprocessor.Transform(validData);

            // Two trees, with an L2-norm early-stopping metric and a generality-loss rule.
            var fastTreeOptions = new Trainers.FastTree.FastTreeRegressionTrainer.Options
            {
                NumberOfTrees = 2,
                EarlyStoppingMetric = EarlyStoppingMetric.L2Norm,
                EarlyStoppingRule = new GeneralityLossRule()
            };
            var trainedModel = mlContext.Regression.Trainers.FastTree(fastTreeOptions)
                .Fit(trainData: preprocessedTrainData, validationData: preprocessedValidData);

            // Chain the featurizer and the trained predictor into one model.
            var model = preprocessor.Append(trainedModel);

            // Score the raw splits with the combined model, then evaluate each.
            var scoredTrainData = model.Transform(trainData);
            var scoredValidData = model.Transform(validData);

            var trainMetrics = mlContext.Regression.Evaluate(scoredTrainData);
            var validMetrics = mlContext.Regression.Evaluate(scoredValidData);

            Common.AssertMetrics(trainMetrics);
            Common.AssertMetrics(validMetrics);
        }
        public void SetupIrisPipeline()
        {
            // Builds the sample input, trains concatenation + key mapping + single-threaded
            // calibrated multiclass SDCA on "iris.txt", and stores the prediction engine in
            // _irisModel.
            _irisExample = new IrisData()
            {
                SepalLength = 3.3f,
                SepalWidth = 1.6f,
                PetalLength = 0.2f,
                PetalWidth = 5.1f,
            };

            string dataPath = BaseTestClass.GetDataPath("iris.txt");

            var env = new MLContext(seed: 1);

            // Loader configuration: label in column 0 followed by the four measurements,
            // header row present.
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.Single, 0),
                    new TextLoader.Column("SepalLength", DataKind.Single, 1),
                    new TextLoader.Column("SepalWidth", DataKind.Single, 2),
                    new TextLoader.Column("PetalLength", DataKind.Single, 3),
                    new TextLoader.Column("PetalWidth", DataKind.Single, 4),
                },
                HasHeader = true,
            };
            var loader = new TextLoader(env, options: loaderOptions);
            IDataView data = loader.Load(dataPath);

            var sdcaOptions = new SdcaCalibratedMulticlassTrainer.Options
            {
                NumberOfThreads = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var featureColumns = new[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" };
            var concatenator = new ColumnConcatenatingEstimator(env, "Features", featureColumns);
            var pipeline = concatenator
                .Append(env.Transforms.Conversion.MapValueToKey("Label"))
                .Append(env.MulticlassClassification.Trainers.SdcaCalibrated(sdcaOptions));

            var model = pipeline.Fit(data);

            _irisModel = env.Model.CreatePredictionEngine<IrisData, IrisPrediction>(model);
        }
        public void SetupScoringSpeedTests()
        {
            // Verifies the WikiDetox training file exists, then runs a MAML "CV" command that
            // writes its model to WikiModel.zip next to the test assembly.
            _dataPath_Wiki = BaseTestClass.GetDataPath(TestDatasets.WikiDetox.trainFilename);

            if (!File.Exists(_dataPath_Wiki))
            {
                throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, _dataPath_Wiki));
            }

            var assemblyDirectory = Path.GetDirectoryName(typeof(MulticlassClassificationTest).Assembly.Location);
            _modelPath_Wiki = Path.Combine(assemblyDirectory, "WikiModel.zip");

            // Space-separated MAML arguments; joined below into a single command line.
            var commandParts = new[]
            {
                "CV k=5 data=" + _dataPath_Wiki,
                "loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}",
                "xf=Convert{col=logged_in type=R4}",
                "xf=CategoricalTransform{col=ns}",
                "xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}",
                "xf=Concat{col=Features:FeaturesText,logged_in,ns}",
                "tr=OVA{p=AveragedPerceptron{iter=10}}",
                "out={" + _modelPath_Wiki + "}",
            };
            string cmd = string.Join(" ", commandParts);

            var environment = EnvironmentFactory.CreateClassificationEnvironment<TextLoader, OneHotEncodingTransformer, AveragedPerceptronTrainer, LinearBinaryModelParameters>();

            cmd.ExecuteMamlCommand(environment);
        }
Exemplo n.º 14
0
        public void SetupScoringSpeedTests()
        {
            // Verifies the MSLR test/validation/training files exist, then runs a MAML
            // "TrainTest" command that writes its model to FastTreeRankingModel.zip next to
            // the test assembly.
            _mslrWeb10k_Test = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.testFilename);
            _mslrWeb10k_Validate = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.validFilename);
            _mslrWeb10k_Train = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.trainFilename);

            // Checked in this order: test, validation, training; the first missing file throws.
            foreach (var requiredFile in new[] { _mslrWeb10k_Test, _mslrWeb10k_Validate, _mslrWeb10k_Train })
            {
                if (!File.Exists(requiredFile))
                {
                    throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, requiredFile));
                }
            }

            var assemblyDirectory = Path.GetDirectoryName(typeof(RankingTest).Assembly.Location);
            _modelPath_MSLR = Path.Combine(assemblyDirectory, "FastTreeRankingModel.zip");

            // Space-separated MAML arguments; joined below into a single command line.
            var commandParts = new[]
            {
                "TrainTest test=" + _mslrWeb10k_Validate,
                "eval=RankingEvaluator{t=10}",
                "data=" + _mslrWeb10k_Train,
                "loader=TextLoader{col=Label:R4:0 col=GroupId:TX:1 col=Features:R4:2-138}",
                "xf=HashTransform{col=GroupId}",
                "xf=NAHandleTransform{col=Features}",
                "tr=FastTreeRanking{}",
                "out={" + _modelPath_MSLR + "}",
            };
            string cmd = string.Join(" ", commandParts);

            var environment = EnvironmentFactory.CreateRankingEnvironment<RankerEvaluator, TextLoader, HashingTransformer, FastTreeRankingTrainer, FastTreeRankingModelParameters>();

            cmd.ExecuteMamlCommand(environment);
        }