/// <summary>
/// Builds an SDCA binary-classification pipeline over the breast-cancer dataset
/// (legacy ML.NET API) and caches a prediction engine plus one sample input.
/// </summary>
public void SetupBreastCancerPipeline()
{
    // Sample row exercised later through the cached prediction engine.
    _breastCancerExample = new BreastCancerData()
    {
        Features = new[] { 5f, 1f, 1f, 1f, 2f, 1f, 3f, 1f, 1f }
    };

    string dataPath = BaseTestClass.GetDataPath("breast-cancer.txt");

    // Fixed seed and single-threaded concurrency keep the benchmark deterministic.
    var mlContext = new MLContext(seed: 1, conc: 1);

    var loaderColumns = new[]
    {
        new TextLoader.Column("Label", DataKind.BL, 0),
        new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(1, 9) })
    };
    var reader = new TextLoader(mlContext, columns: loaderColumns, hasHeader: false);

    IDataView trainingData = reader.Read(dataPath);

    var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
        new SdcaBinaryTrainer.Options
        {
            NumThreads = 1,
            ConvergenceTolerance = 1e-2f,
        });

    var trainedModel = trainer.Fit(trainingData);
    _breastCancerModel = trainedModel.CreatePredictionEngine<BreastCancerData, BreastCancerPrediction>(mlContext);
}
/// <summary>
/// Trains a sentiment model, round-trips it through save/load, and also loads a
/// previously serialized model in the old format for comparison benchmarks.
/// </summary>
public void Setup()
{
    _context = new MLContext(1);

    var trainingData = _context.Data.LoadFromTextFile<SentimentData>(
        BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv"), hasHeader: true);

    // Featurize the raw text, cache, then train a non-calibrated SDCA binary classifier.
    var pipeline = _context.Transforms.Text.FeaturizeText("Features", "SentimentText")
        .AppendCacheCheckpoint(_context)
        .Append(_context.BinaryClassification.Trainers.SdcaNonCalibrated(
            new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 }));

    var fittedModel = pipeline.Fit(trainingData);

    // Persist and immediately reload so the benchmark measures the deserialized model.
    var savedModelPath = "temp.zip";
    _context.Model.Save(fittedModel, trainingData.Schema, savedModelPath);
    _trainedModel = _context.Model.Load(savedModelPath, out var inputSchema);

    // Also load a checked-in model serialized with the older format.
    _trainedModelOldFormat = _context.Model.Load(Path.Combine("TestModels", "SentimentModel.zip"), out inputSchema);
}
/// <summary>
/// Runs 5-fold cross-validation of an OLS regression over the housing dataset and
/// validates the shape and quality of each fold's result.
/// </summary>
void CrossValidation()
{
    var mlContext = new MLContext(seed: 1);

    // Load the housing dataset using its canonical loader description.
    var data = mlContext.Data.CreateTextLoader(
            TestDatasets.housing.GetLoaderColumns(),
            hasHeader: TestDatasets.housing.fileHasHeader,
            separatorChar: TestDatasets.housing.fileSeparator)
        .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));

    var featureColumns = new string[]
    {
        "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver",
        "NitricOxides", "RoomsPerDwelling", "PercentPre40s", "EmploymentDistance",
        "HighwayDistance", "TaxRate", "TeacherRatio"
    };

    // Concatenate features, alias the target column as "Label", then train OLS.
    var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
        .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
        .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares());

    var cvResult = mlContext.Regression.CrossValidate(data, pipeline, numFolds: 5);

    // Each fold must produce typed metrics, a typed model, and a scored hold-out set.
    Assert.IsType<RegressionMetrics>(cvResult[0].Metrics);
    Assert.IsType<TransformerChain<RegressionPredictionTransformer<OrdinaryLeastSquaresRegressionModelParameters>>>(cvResult[0].Model);
    Assert.True(cvResult[0].ScoredHoldOutSet is IDataView);
    Assert.Equal(5, cvResult.Length);

    // Every fold's metrics must pass the shared sanity checks.
    foreach (var result in cvResult)
        Common.AssertMetrics(result.Metrics);
}
/// <summary>
/// Trains an OLS regression on a train/test split of the housing data and evaluates
/// the scored test set. The trailing commented code documents an API gap (#2465):
/// prediction thresholds can no longer be reconfigured after training.
/// </summary>
public void ReconfigurablePrediction()
{
    var mlContext = new MLContext(seed: 789);

    // Load the housing dataset and carve out 20% as a test set.
    var data = mlContext.Data.CreateTextLoader(
            TestDatasets.housing.GetLoaderColumns(),
            hasHeader: TestDatasets.housing.fileHasHeader,
            separatorChar: TestDatasets.housing.fileSeparator)
        .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
    var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.2);

    var featureColumns = new string[]
    {
        "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver",
        "NitricOxides", "RoomsPerDwelling", "PercentPre40s", "EmploymentDistance",
        "HighwayDistance", "TaxRate", "TeacherRatio"
    };

    var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
        .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
        .Append(mlContext.Regression.Trainers.Ols());

    var model = pipeline.Fit(split.TrainSet);

    var scoredTest = model.Transform(split.TestSet);
    var metrics = mlContext.Regression.Evaluate(scoredTest);

    Common.AssertMetrics(metrics);

    // Todo #2465: Allow the setting of threshold and thresholdColumn for scoring.
    // This is no longer possible in the API
    //var newModel = new BinaryPredictionTransformer<IPredictorProducing<float>>(ml, model.Model, trainData.Schema, model.FeatureColumnName, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability);
    //var newScoredTest = newModel.Transform(pipeline.Transform(testData));
    //var newMetrics = mlContext.BinaryClassification.Evaluate(scoredTest);
    // And the Threshold and ThresholdColumn properties are not settable.
    //var predictor = model.LastTransformer;
    //predictor.Threshold = 0.01; // Not possible
}
/// <summary>
/// Builds a text-featurization + SDCA sentiment pipeline (legacy ML.NET API) and
/// caches a prediction engine plus one sample input for benchmarking.
/// </summary>
public void SetupSentimentPipeline()
{
    // Sample row exercised later through the cached prediction engine.
    _sentimentExample = new SentimentData() { SentimentText = "Not a big fan of this." };

    string dataPath = BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Fixed seed and single-threaded concurrency keep the benchmark deterministic.
    var mlContext = new MLContext(seed: 1, conc: 1);

    var loaderColumns = new[]
    {
        new TextLoader.Column("Label", DataKind.BL, 0),
        new TextLoader.Column("SentimentText", DataKind.Text, 1)
    };
    var reader = new TextLoader(mlContext, columns: loaderColumns, hasHeader: true);

    IDataView trainingData = reader.Read(dataPath);

    var pipeline = new TextFeaturizingEstimator(mlContext, "Features", "SentimentText")
        .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
            new SdcaBinaryTrainer.Options
            {
                NumThreads = 1,
                ConvergenceTolerance = 1e-2f,
            }));

    var trainedModel = pipeline.Fit(trainingData);
    _sentimentModel = trainedModel.CreatePredictionEngine<SentimentData, SentimentPrediction>(mlContext);
}
/// <summary>
/// Builds a text-featurization + non-calibrated SDCA sentiment pipeline using the
/// TextLoader.Options API and caches a prediction engine plus one sample input.
/// </summary>
public void SetupSentimentPipeline()
{
    // Sample row exercised later through the cached prediction engine.
    _sentimentExample = new SentimentData() { SentimentText = "Not a big fan of this." };

    string dataPath = BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Fixed seed and single-threaded concurrency keep the benchmark deterministic.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Describe the two columns of the tab-separated detox file.
    var loaderOptions = new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.Boolean, 0),
            new TextLoader.Column("SentimentText", DataKind.String, 1)
        },
        HasHeader = true,
    };
    var loader = new TextLoader(mlContext, options: loaderOptions);

    IDataView trainingData = loader.Load(dataPath);

    var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
        .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
            new SdcaNonCalibratedBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                ConvergenceTolerance = 1e-2f,
            }));

    var trainedModel = pipeline.Fit(trainingData);
    _sentimentModel = trainedModel.CreatePredictionEngine<SentimentData, SentimentPrediction>(mlContext);
}
/// <summary>
/// Builds a non-calibrated SDCA binary-classification pipeline over the breast-cancer
/// dataset using the TextLoader.Options API and caches a prediction engine plus one
/// sample input for benchmarking.
/// </summary>
public void SetupBreastCancerPipeline()
{
    // Sample row exercised later through the cached prediction engine.
    _breastCancerExample = new BreastCancerData()
    {
        Features = new[] { 5f, 1f, 1f, 1f, 2f, 1f, 3f, 1f, 1f }
    };

    string dataPath = BaseTestClass.GetDataPath("breast-cancer.txt");

    // Fixed seed and single-threaded concurrency keep the benchmark deterministic.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Column 0 is the boolean label; columns 1-9 are the numeric features.
    var loaderOptions = new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.Boolean, 0),
            new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(1, 9) })
        },
        HasHeader = false,
    };
    var loader = new TextLoader(mlContext, options: loaderOptions);

    IDataView trainingData = loader.Load(dataPath);

    var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
        new SdcaNonCalibratedBinaryTrainer.Options
        {
            NumberOfThreads = 1,
            ConvergenceTolerance = 1e-2f,
        });

    var trainedModel = trainer.Fit(trainingData);
    _breastCancerModel = trainedModel.CreatePredictionEngine<BreastCancerData, BreastCancerPrediction>(mlContext);
}
/// <summary>
/// Builds a concatenate + SDCA multiclass pipeline over the iris dataset
/// (legacy ML.NET API) and caches a prediction engine plus one sample input.
/// </summary>
public void SetupIrisPipeline()
{
    // Sample row exercised later through the cached prediction engine.
    // NOTE(review): these measurements look permuted relative to a typical iris row
    // (petal width 5.1 is unusual) — harmless for throughput benchmarking, but confirm
    // if the predicted class ever matters.
    _irisExample = new IrisData()
    {
        SepalLength = 3.3f,
        SepalWidth = 1.6f,
        PetalLength = 0.2f,
        PetalWidth = 5.1f,
    };

    string dataPath = BaseTestClass.GetDataPath("iris.txt");

    // Fixed seed and single-threaded concurrency keep the benchmark deterministic.
    var mlContext = new MLContext(seed: 1, conc: 1);

    var loaderColumns = new[]
    {
        new TextLoader.Column("Label", DataKind.R4, 0),
        new TextLoader.Column("SepalLength", DataKind.R4, 1),
        new TextLoader.Column("SepalWidth", DataKind.R4, 2),
        new TextLoader.Column("PetalLength", DataKind.R4, 3),
        new TextLoader.Column("PetalWidth", DataKind.R4, 4),
    };
    var reader = new TextLoader(mlContext, columns: loaderColumns, hasHeader: true);

    IDataView trainingData = reader.Read(dataPath);

    var featureColumns = new[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" };
    var pipeline = new ColumnConcatenatingEstimator(mlContext, "Features", featureColumns)
        .Append(new SdcaMultiClassTrainer(mlContext, "Label", "Features", advancedSettings: (s) =>
        {
            s.NumThreads = 1;
            s.ConvergenceTolerance = 1e-2f;
        }));

    var trainedModel = pipeline.Fit(trainingData);
    _irisModel = trainedModel.CreatePredictionEngine<IrisData, IrisPrediction>(mlContext);
}
/// <summary>
/// Resolves the digits training file and fails fast when it is missing,
/// so the benchmark errors out before timing anything.
/// </summary>
public void SetupTrainingSpeedTests()
{
    _dataPath_Digits = BaseTestClass.GetDataPath(TestDatasets.Digits.trainFilename);

    if (!File.Exists(_dataPath_Digits))
    {
        throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, _dataPath_Digits));
    }
}
/// <summary>
/// Resolves the MSLR-WEB10K train/validation files and fails fast when either is
/// missing, so the benchmark errors out before timing anything.
/// </summary>
public void SetupTrainingSpeedTests()
{
    _mslrWeb10k_Validate = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.validFilename);
    _mslrWeb10k_Train = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.trainFilename);

    EnsureDatasetExists(_mslrWeb10k_Validate);
    EnsureDatasetExists(_mslrWeb10k_Train);

    // Throws with the shared "dataset not found" message for a missing file.
    void EnsureDatasetExists(string path)
    {
        if (!File.Exists(path))
        {
            throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, path));
        }
    }
}
/// <summary>
/// Trains a FastTree regressor on the housing data with an explicit validation set
/// (enabling early stopping), then evaluates the combined model on both splits.
/// </summary>
public void TrainWithValidationSet()
{
    var mlContext = new MLContext(seed: 1);

    // Load the housing dataset and split 80/20 into train and validation sets.
    var data = mlContext.Data.CreateTextLoader(
            TestDatasets.housing.GetLoaderColumns(),
            hasHeader: TestDatasets.housing.fileHasHeader,
            separatorChar: TestDatasets.housing.fileSeparator)
        .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
    var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2);
    var trainData = dataSplit.TrainSet;
    var validData = dataSplit.TestSet;

    var featureColumns = new string[]
    {
        "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver",
        "NitricOxides", "RoomsPerDwelling", "PercentPre40s", "EmploymentDistance",
        "HighwayDistance", "TaxRate", "TeacherRatio"
    };

    // Featurization-only pipeline; the trainer is fitted separately below so it can
    // receive the validation set.
    var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
        .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
        .AppendCacheCheckpoint(mlContext) as IEstimator<ITransformer>;

    // Apply the same featurization to both splits.
    var preprocessor = pipeline.Fit(trainData);
    var preprocessedTrainData = preprocessor.Transform(trainData);
    var preprocessedValidData = preprocessor.Transform(validData);

    // Train with early stopping driven by the validation set.
    var trainedModel = mlContext.Regression.Trainers.FastTree(
            new Trainers.FastTree.FastTreeRegressionTrainer.Options
            {
                NumberOfTrees = 2,
                EarlyStoppingMetric = EarlyStoppingMetric.L2Norm,
                EarlyStoppingRule = new GeneralityLossRule()
            })
        .Fit(trainData: preprocessedTrainData, validationData: preprocessedValidData);

    // Chain featurization and trainer into one end-to-end model.
    var model = preprocessor.Append(trainedModel);

    // Score both splits with the combined model and sanity-check the metrics.
    var scoredTrainData = model.Transform(trainData);
    var scoredValidData = model.Transform(validData);
    var trainMetrics = mlContext.Regression.Evaluate(scoredTrainData);
    var validMetrics = mlContext.Regression.Evaluate(scoredValidData);

    Common.AssertMetrics(trainMetrics);
    Common.AssertMetrics(validMetrics);
}
/// <summary>
/// Builds a concatenate + key-mapped SDCA calibrated multiclass pipeline over the
/// iris dataset using the TextLoader.Options API and caches a prediction engine
/// plus one sample input for benchmarking.
/// </summary>
public void SetupIrisPipeline()
{
    // Sample row exercised later through the cached prediction engine.
    // NOTE(review): these measurements look permuted relative to a typical iris row
    // (petal width 5.1 is unusual) — harmless for throughput benchmarking, but confirm
    // if the predicted class ever matters.
    _irisExample = new IrisData()
    {
        SepalLength = 3.3f,
        SepalWidth = 1.6f,
        PetalLength = 0.2f,
        PetalWidth = 5.1f,
    };

    string dataPath = BaseTestClass.GetDataPath("iris.txt");

    // Fixed seed keeps the benchmark deterministic.
    var mlContext = new MLContext(seed: 1);

    var loaderOptions = new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.Single, 0),
            new TextLoader.Column("SepalLength", DataKind.Single, 1),
            new TextLoader.Column("SepalWidth", DataKind.Single, 2),
            new TextLoader.Column("PetalLength", DataKind.Single, 3),
            new TextLoader.Column("PetalWidth", DataKind.Single, 4),
        },
        HasHeader = true,
    };
    var loader = new TextLoader(mlContext, options: loaderOptions);

    IDataView trainingData = loader.Load(dataPath);

    var featureColumns = new[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" };

    // The multiclass trainer requires a key-typed label, hence MapValueToKey.
    var pipeline = new ColumnConcatenatingEstimator(mlContext, "Features", featureColumns)
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
        .Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated(
            new SdcaCalibratedMulticlassTrainer.Options
            {
                NumberOfThreads = 1,
                ConvergenceTolerance = 1e-2f,
            }));

    var trainedModel = pipeline.Fit(trainingData);
    _irisModel = mlContext.Model.CreatePredictionEngine<IrisData, IrisPrediction>(trainedModel);
}
/// <summary>
/// Verifies the WikiDetox training file exists, then trains and serializes the
/// multiclass model via a MAML command so scoring benchmarks can load it from disk.
/// </summary>
public void SetupScoringSpeedTests()
{
    _dataPath_Wiki = BaseTestClass.GetDataPath(TestDatasets.WikiDetox.trainFilename);

    // Fail fast before any timing starts if the dataset is absent.
    if (!File.Exists(_dataPath_Wiki))
    {
        throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, _dataPath_Wiki));
    }

    // Drop the model next to the test assembly so the scoring benchmark can find it.
    _modelPath_Wiki = Path.Combine(
        Path.GetDirectoryName(typeof(MulticlassClassificationTest).Assembly.Location),
        @"WikiModel.zip");

    // MAML command: 5-fold CV, text featurization with bigrams, OVA over averaged
    // perceptron, writing the trained model to _modelPath_Wiki.
    string command = @"CV k=5 data=" + _dataPath_Wiki +
        " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4}" +
        " xf=CategoricalTransform{col=ns}" +
        " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" +
        " xf=Concat{col=Features:FeaturesText,logged_in,ns}" +
        " tr=OVA{p=AveragedPerceptron{iter=10}}" +
        " out={" + _modelPath_Wiki + "}";

    var environment = EnvironmentFactory.CreateClassificationEnvironment<
        TextLoader, OneHotEncodingTransformer, AveragedPerceptronTrainer, LinearBinaryModelParameters>();
    command.ExecuteMamlCommand(environment);
}
/// <summary>
/// Verifies the three MSLR-WEB10K files exist, then trains and serializes a
/// FastTree ranking model via a MAML command for the scoring benchmarks.
/// </summary>
public void SetupScoringSpeedTests()
{
    _mslrWeb10k_Test = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.testFilename);
    _mslrWeb10k_Validate = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.validFilename);
    _mslrWeb10k_Train = BaseTestClass.GetDataPath(TestDatasets.MSLRWeb.trainFilename);

    EnsureDatasetExists(_mslrWeb10k_Test);
    EnsureDatasetExists(_mslrWeb10k_Validate);
    EnsureDatasetExists(_mslrWeb10k_Train);

    // Drop the model next to the test assembly so the scoring benchmark can find it.
    _modelPath_MSLR = Path.Combine(
        Path.GetDirectoryName(typeof(RankingTest).Assembly.Location),
        "FastTreeRankingModel.zip");

    // MAML command: train FastTree ranking on the train split, evaluate NDCG@10 on
    // the validation split, and write the model to _modelPath_MSLR.
    string command = @"TrainTest test=" + _mslrWeb10k_Validate +
        " eval=RankingEvaluator{t=10}" +
        " data=" + _mslrWeb10k_Train +
        " loader=TextLoader{col=Label:R4:0 col=GroupId:TX:1 col=Features:R4:2-138}" +
        " xf=HashTransform{col=GroupId}" +
        " xf=NAHandleTransform{col=Features}" +
        " tr=FastTreeRanking{}" +
        " out={" + _modelPath_MSLR + "}";

    var environment = EnvironmentFactory.CreateRankingEnvironment<
        RankerEvaluator, TextLoader, HashingTransformer, FastTreeRankingTrainer, FastTreeRankingModelParameters>();
    command.ExecuteMamlCommand(environment);

    // Throws with the shared "dataset not found" message for a missing file.
    void EnsureDatasetExists(string path)
    {
        if (!File.Exists(path))
        {
            throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, path));
        }
    }
}