private Data.TextLoader PrepareTextLoaderTestData()
{
    var testDataPath = GetDataPath(SentimentTestPath);
    var testData = new Data.TextLoader(testDataPath)
    {
        Arguments = new TextLoaderArguments
        {
            Separator = new[] { '\t' },
            HasHeader = true,
            Column = new[]
            {
                new TextLoaderColumn()
                {
                    Name = "Label",
                    Source = new[] { new TextLoaderRange(0) },
                    Type = Runtime.Data.DataKind.Num
                },
                new TextLoaderColumn()
                {
                    Name = "SentimentText",
                    Source = new[] { new TextLoaderRange(1) },
                    Type = Runtime.Data.DataKind.Text
                }
            }
        }
    };
    return testData;
}
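// Illustrative sketch, not part of the original test suite: one way the
// loader returned by PrepareTextLoaderTestData can be consumed, mirroring
// the evaluation step in TrainAndPredictSentimentModelTest below. The
// `model` parameter is a placeholder for a pipeline trained elsewhere.
private void EvaluatePreparedTestData(PredictionModel<SentimentData, SentimentPrediction> model)
{
    var testData = PrepareTextLoaderTestData();
    var evaluator = new BinaryClassificationEvaluator();
    BinaryClassificationMetrics metrics = evaluator.Evaluate(model, testData);
    // Accuracy is the fraction of correctly classified test rows.
    Assert.InRange(metrics.Accuracy, 0, 1);
}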
private static ITransformModel CreateKcHousePricePredictorModel(string dataPath)
{
    var dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 " +
        "col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 " +
        "col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 " +
        "col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 " +
        "col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 " +
        "col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,";

    Experiment experiment = s_environment.CreateExperiment();

    var importData = new Data.TextLoader();
    importData.CustomSchema = dataSchema;
    Data.TextLoader.Output imported = experiment.Add(importData);

    var numericalConcatenate = new Transforms.ColumnConcatenator();
    numericalConcatenate.Data = imported.Data;
    numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove",
        "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15");
    Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate);

    var categoryConcatenate = new Transforms.ColumnConcatenator();
    categoryConcatenate.Data = numericalConcatenated.OutputData;
    categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors",
        "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode");
    Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate);

    var categorize = new Transforms.CategoricalOneHotVectorizer();
    categorize.AddColumn("CategoryFeatures");
    categorize.Data = categoryConcatenated.OutputData;
    Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize);

    var featuresConcatenate = new Transforms.ColumnConcatenator();
    featuresConcatenate.Data = categorized.OutputData;
    featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures");
    Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate);

    var learner = new Trainers.StochasticDualCoordinateAscentRegressor();
    learner.TrainingData = featuresConcatenated.OutputData;
    learner.NumThreads = 1;
    Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner);

    var combineModels = new Transforms.ManyHeterogeneousModelCombiner();
    combineModels.TransformModels = new ArrayVar<ITransformModel>(numericalConcatenated.Model,
        categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model);
    combineModels.PredictorModel = learnerOutput.PredictorModel;
    Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels);

    var scorer = new Transforms.Scorer
    {
        PredictorModel = combinedModels.PredictorModel
    };
    var scorerOutput = experiment.Add(scorer);

    experiment.Compile();
    experiment.SetInput(importData.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false));
    experiment.Run();

    return experiment.GetOutput(scorerOutput.ScoringTransform);
}
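// Note on the CustomSchema string used above: it is the TextLoader's inline
// schema syntax. Each `col=<Name>:<Type>:<Index>` entry maps a named column
// to a zero-based field index, where TX denotes text and R4 denotes a
// single-precision float; `header+` marks the file as having a header row,
// and `sep=,` sets the field separator.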
public void CanSuccessfullyRetrieveQuotedData()
{
    string dataPath = GetDataPath("QuotingData.csv");
    var loader = new Data.TextLoader(dataPath).CreateFrom<QuoteInput>(useHeader: true,
        separator: ',', allowQuotedStrings: true, supportSparse: false);

    using (var environment = new TlcEnvironment())
    {
        Experiment experiment = environment.CreateExperiment();
        ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as ILearningPipelineDataStep;

        experiment.Compile();
        loader.SetInput(environment, experiment);
        experiment.Run();

        IDataView data = experiment.GetOutput(output.Data);
        Assert.NotNull(data);

        using (var cursor = data.GetRowCursor(a => true))
        {
            var IDGetter = cursor.GetGetter<float>(0);
            var TextGetter = cursor.GetGetter<DvText>(1);

            Assert.True(cursor.MoveNext());

            float ID = 0;
            IDGetter(ref ID);
            Assert.Equal(1, ID);

            DvText Text = new DvText();
            TextGetter(ref Text);
            Assert.Equal("This text contains comma, within quotes.", Text.ToString());

            Assert.True(cursor.MoveNext());

            ID = 0;
            IDGetter(ref ID);
            Assert.Equal(2, ID);

            Text = new DvText();
            TextGetter(ref Text);
            Assert.Equal("This text contains extra punctuations and special characters.;*<>?!@#$%^&*()_+=-{}|[]:;'", Text.ToString());

            Assert.True(cursor.MoveNext());

            ID = 0;
            IDGetter(ref ID);
            Assert.Equal(3, ID);

            Text = new DvText();
            TextGetter(ref Text);
            Assert.Equal("This text has no quotes", Text.ToString());

            Assert.False(cursor.MoveNext());
        }
    }
}
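// A minimal sketch, assuming the shape of QuoteInput: the actual declaration
// lives elsewhere in the test project, but CreateFrom<QuoteInput> together
// with the two getters above (a float at index 0, a DvText at index 1)
// implies roughly this layout, using the legacy [Column] ordinal attribute.
public class QuoteInputSketch
{
    [Column("0")]
    public float ID;

    [Column("1")]
    public string Text;
}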
public void CanSuccessfullyApplyATransform()
{
    var loader = new Data.TextLoader("fakeFile.txt").CreateFrom<Input>();

    using (var environment = new TlcEnvironment())
    {
        Experiment experiment = environment.CreateExperiment();
        ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as ILearningPipelineDataStep;

        Assert.NotNull(output.Data);
        Assert.NotNull(output.Data.VarName);
        Assert.Null(output.Model);
    }
}
public void CanSuccessfullyTrimSpaces()
{
    string dataPath = GetDataPath("TrimData.csv");
    var loader = new Data.TextLoader(dataPath).CreateFrom<QuoteInput>(useHeader: true,
        separator: ',', allowQuotedStrings: false, supportSparse: false, trimWhitespace: true);

    using (var environment = new TlcEnvironment())
    {
        Experiment experiment = environment.CreateExperiment();
        ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as ILearningPipelineDataStep;

        experiment.Compile();
        loader.SetInput(environment, experiment);
        experiment.Run();

        IDataView data = experiment.GetOutput(output.Data);
        Assert.NotNull(data);

        using (var cursor = data.GetRowCursor(a => true))
        {
            var IDGetter = cursor.GetGetter<float>(0);
            var TextGetter = cursor.GetGetter<DvText>(1);

            Assert.True(cursor.MoveNext());

            float ID = 0;
            IDGetter(ref ID);
            Assert.Equal(1, ID);

            DvText Text = new DvText();
            TextGetter(ref Text);
            Assert.Equal("There is a space at the end", Text.ToString());

            Assert.True(cursor.MoveNext());

            ID = 0;
            IDGetter(ref ID);
            Assert.Equal(2, ID);

            Text = new DvText();
            TextGetter(ref Text);
            Assert.Equal("There is no space at the end", Text.ToString());

            Assert.False(cursor.MoveNext());
        }
    }
}
public void SetupPredictBenchmarks()
{
    _trainedModel = Train(_dataPath);
    // Feed one prediction through the BenchmarkDotNet consumer so the
    // warm-up call is not dead-code eliminated.
    _consumer.Consume(_trainedModel.Predict(_example));

    var testData = new Data.TextLoader(_dataPath).CreateFrom<IrisData>(useHeader: true);
    var evaluator = new ClassificationEvaluator();
    _metrics = evaluator.Evaluate(_trainedModel, testData);

    // Pre-build one batch per configured batch size, filled with copies of
    // the same example, for the batch-prediction benchmarks.
    _batches = new IrisData[_batchSizes.Length][];
    for (int i = 0; i < _batches.Length; i++)
    {
        var batch = new IrisData[_batchSizes[i]];
        _batches[i] = batch;
        for (int bi = 0; bi < batch.Length; bi++)
        {
            batch[bi] = _example;
        }
    }
}
public void Setup()
{
    s_dataPath = Program.GetDataPath("iris.txt");
    s_sentimentDataPath = Program.GetDataPath("wikipedia-detox-250-line-data.tsv");
    s_trainedModel = TrainCore();
    // Warm up the predictor once before the benchmarks run.
    IrisPrediction prediction = s_trainedModel.Predict(s_example);

    var testData = new Data.TextLoader(s_dataPath).CreateFrom<IrisData>(useHeader: true);
    var evaluator = new ClassificationEvaluator();
    s_metrics = evaluator.Evaluate(s_trainedModel, testData);

    // Pre-build one batch per configured batch size, filled with copies of
    // the same example, for the batch-prediction benchmarks.
    s_batches = new IrisData[s_batchSizes.Length][];
    for (int i = 0; i < s_batches.Length; i++)
    {
        var batch = new IrisData[s_batchSizes[i]];
        s_batches[i] = batch;
        for (int bi = 0; bi < batch.Length; bi++)
        {
            batch[bi] = s_example;
        }
    }
}
public void TrainAndPredictSentimentModelTest()
{
    string dataPath = GetDataPath(SentimentDataPath);
    var pipeline = new LearningPipeline();

    pipeline.Add(new Data.TextLoader(dataPath)
    {
        Arguments = new TextLoaderArguments
        {
            Separator = new[] { '\t' },
            HasHeader = true,
            Column = new[]
            {
                new TextLoaderColumn()
                {
                    Name = "Label",
                    Source = new[] { new TextLoaderRange(0) },
                    Type = Runtime.Data.DataKind.Num
                },
                new TextLoaderColumn()
                {
                    Name = "SentimentText",
                    Source = new[] { new TextLoaderRange(1) },
                    Type = Runtime.Data.DataKind.Text
                }
            }
        }
    });

    pipeline.Add(new TextFeaturizer("Features", "SentimentText")
    {
        KeepDiacritics = false,
        KeepPunctuations = false,
        TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
        OutputTokens = true,
        StopWordsRemover = new PredefinedStopWordsRemover(),
        VectorNormalizer = TextTransformTextNormKind.L2,
        CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
        WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
    });

    pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
    pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });

    PredictionModel<SentimentData, SentimentPrediction> model =
        pipeline.Train<SentimentData, SentimentPrediction>();

    IEnumerable<SentimentData> sentiments = new[]
    {
        new SentimentData
        {
            SentimentText = "Please refrain from adding nonsense to Wikipedia."
        },
        new SentimentData
        {
            SentimentText = "He is a CHEATER, and the article should say that."
        }
    };

    IEnumerable<SentimentPrediction> predictions = model.Predict(sentiments);
    Assert.Equal(2, predictions.Count());
    Assert.True(predictions.ElementAt(0).Sentiment.IsFalse);
    Assert.True(predictions.ElementAt(1).Sentiment.IsTrue);

    string testDataPath = GetDataPath(SentimentTestPath);
    var testData = new Data.TextLoader(testDataPath)
    {
        Arguments = new TextLoaderArguments
        {
            Separator = new[] { '\t' },
            HasHeader = true,
            Column = new[]
            {
                new TextLoaderColumn()
                {
                    Name = "Label",
                    Source = new[] { new TextLoaderRange(0) },
                    Type = Runtime.Data.DataKind.Num
                },
                new TextLoaderColumn()
                {
                    Name = "SentimentText",
                    Source = new[] { new TextLoaderRange(1) },
                    Type = Runtime.Data.DataKind.Text
                }
            }
        }
    };

    var evaluator = new BinaryClassificationEvaluator();
    BinaryClassificationMetrics metrics = evaluator.Evaluate(model, testData);

    Assert.Equal(.5556, metrics.Accuracy, 4);
    Assert.Equal(.8, metrics.Auc, 1);
    Assert.Equal(.87, metrics.Auprc, 2);
    Assert.Equal(1, metrics.Entropy, 3);
    Assert.Equal(.6923, metrics.F1Score, 4);
    Assert.Equal(.969, metrics.LogLoss, 3);
    Assert.Equal(3.083, metrics.LogLossReduction, 3);
    Assert.Equal(1, metrics.NegativePrecision, 3);
    Assert.Equal(.111, metrics.NegativeRecall, 3);
    Assert.Equal(.529, metrics.PositivePrecision, 3);
    Assert.Equal(1, metrics.PositiveRecall);

    ConfusionMatrix matrix = metrics.ConfusionMatrix;
    Assert.Equal(2, matrix.Order);
    Assert.Equal(2, matrix.ClassNames.Count);
    Assert.Equal("positive", matrix.ClassNames[0]);
    Assert.Equal("negative", matrix.ClassNames[1]);

    Assert.Equal(9, matrix[0, 0]);
    Assert.Equal(9, matrix["positive", "positive"]);
    Assert.Equal(0, matrix[0, 1]);
    Assert.Equal(0, matrix["positive", "negative"]);
    Assert.Equal(8, matrix[1, 0]);
    Assert.Equal(8, matrix["negative", "positive"]);
    Assert.Equal(1, matrix[1, 1]);
    Assert.Equal(1, matrix["negative", "negative"]);
}
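// Sanity check on the expected metrics asserted above: the confusion matrix
// sums to 18 test examples (9 + 0 + 8 + 1), of which 10 (9 true positives
// plus 1 true negative) are classified correctly, giving the asserted
// accuracy of 10 / 18 ≈ 0.5556. Likewise, positive recall is 9 / (9 + 0) = 1,
// positive precision is 9 / (9 + 8) ≈ 0.529, and the F1 score follows as
// 2 * 0.529 * 1 / (0.529 + 1) ≈ 0.6923.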
private static ITransformModel CreateKcHousePricePredictorModel(string dataPath)
{
    Experiment experiment = s_environment.CreateExperiment();
    var importData = new Data.TextLoader(dataPath)
    {
        Arguments = new TextLoaderArguments
        {
            Separator = new[] { ',' },
            HasHeader = true,
            Column = new[]
            {
                new TextLoaderColumn() { Name = "Id", Source = new[] { new TextLoaderRange(0) }, Type = Data.DataKind.Text },
                new TextLoaderColumn() { Name = "Date", Source = new[] { new TextLoaderRange(1) }, Type = Data.DataKind.Text },
                new TextLoaderColumn() { Name = "Label", Source = new[] { new TextLoaderRange(2) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Bedrooms", Source = new[] { new TextLoaderRange(3) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Bathrooms", Source = new[] { new TextLoaderRange(4) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "SqftLiving", Source = new[] { new TextLoaderRange(5) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "SqftLot", Source = new[] { new TextLoaderRange(6) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Floors", Source = new[] { new TextLoaderRange(7) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Waterfront", Source = new[] { new TextLoaderRange(8) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "View", Source = new[] { new TextLoaderRange(9) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Condition", Source = new[] { new TextLoaderRange(10) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Grade", Source = new[] { new TextLoaderRange(11) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "SqftAbove", Source = new[] { new TextLoaderRange(12) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "SqftBasement", Source = new[] { new TextLoaderRange(13) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "YearBuilt", Source = new[] { new TextLoaderRange(14) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "YearRenovated", Source = new[] { new TextLoaderRange(15) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Zipcode", Source = new[] { new TextLoaderRange(16) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Lat", Source = new[] { new TextLoaderRange(17) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "Long", Source = new[] { new TextLoaderRange(18) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "SqftLiving15", Source = new[] { new TextLoaderRange(19) }, Type = Data.DataKind.Num },
                new TextLoaderColumn() { Name = "SqftLot15", Source = new[] { new TextLoaderRange(20) }, Type = Data.DataKind.Num },
            }
        }
        //new Data.CustomTextLoader();
        // importData.CustomSchema = dataSchema;
        //
    };
    Data.TextLoader.Output imported = experiment.Add(importData);

    var numericalConcatenate = new Transforms.ColumnConcatenator();
    numericalConcatenate.Data = imported.Data;
    numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove",
        "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15");
    Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate);

    var categoryConcatenate = new Transforms.ColumnConcatenator();
    categoryConcatenate.Data = numericalConcatenated.OutputData;
    categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors",
        "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode");
    Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate);

    var categorize = new Transforms.CategoricalOneHotVectorizer();
    categorize.AddColumn("CategoryFeatures");
    categorize.Data = categoryConcatenated.OutputData;
    Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize);

    var featuresConcatenate = new Transforms.ColumnConcatenator();
    featuresConcatenate.Data = categorized.OutputData;
    featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures");
    Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate);

    var learner = new Trainers.StochasticDualCoordinateAscentRegressor();
    learner.TrainingData = featuresConcatenated.OutputData;
    learner.NumThreads = 1;
    Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner);

    var combineModels = new Transforms.ManyHeterogeneousModelCombiner();
    combineModels.TransformModels = new ArrayVar<ITransformModel>(numericalConcatenated.Model,
        categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model);
    combineModels.PredictorModel = learnerOutput.PredictorModel;
    Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels);

    var scorer = new Transforms.Scorer
    {
        PredictorModel = combinedModels.PredictorModel
    };
    var scorerOutput = experiment.Add(scorer);

    experiment.Compile();
    experiment.SetInput(importData.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false));
    experiment.Run();

    return experiment.GetOutput(scorerOutput.ScoringTransform);
}
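// The method above is the structured-arguments counterpart of the
// CustomSchema-string variant shown earlier in this section: both describe
// the same 21-column house-price schema (Id and Date as text, all remaining
// fields as R4 floats) and assemble an identical experiment graph.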
public void CanSuccessfullyRetrieveSparseData()
{
    string dataPath = GetDataPath("SparseData.txt");
    var loader = new Data.TextLoader(dataPath).CreateFrom<SparseInput>(useHeader: true,
        allowQuotedStrings: false, supportSparse: true);

    using (var environment = new TlcEnvironment())
    {
        Experiment experiment = environment.CreateExperiment();
        ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as ILearningPipelineDataStep;

        experiment.Compile();
        loader.SetInput(environment, experiment);
        experiment.Run();

        IDataView data = experiment.GetOutput(output.Data);
        Assert.NotNull(data);

        using (var cursor = data.GetRowCursor(a => true))
        {
            var getters = new ValueGetter<float>[]
            {
                cursor.GetGetter<float>(0),
                cursor.GetGetter<float>(1),
                cursor.GetGetter<float>(2),
                cursor.GetGetter<float>(3),
                cursor.GetGetter<float>(4)
            };

            // Row 1: all five values are specified explicitly.
            Assert.True(cursor.MoveNext());
            float[] targets = new float[] { 1, 2, 3, 4, 5 };
            for (int i = 0; i < getters.Length; i++)
            {
                float value = 0;
                getters[i](ref value);
                Assert.Equal(targets[i], value);
            }

            // Rows 2 and 3: columns omitted in the sparse encoding are
            // expected to read back as zero.
            Assert.True(cursor.MoveNext());
            targets = new float[] { 0, 0, 0, 4, 5 };
            for (int i = 0; i < getters.Length; i++)
            {
                float value = 0;
                getters[i](ref value);
                Assert.Equal(targets[i], value);
            }

            Assert.True(cursor.MoveNext());
            targets = new float[] { 0, 2, 0, 0, 0 };
            for (int i = 0; i < getters.Length; i++)
            {
                float value = 0;
                getters[i](ref value);
                Assert.Equal(targets[i], value);
            }

            Assert.False(cursor.MoveNext());
        }
    }
}
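// A minimal sketch of the assumptions behind the sparse test above. In the
// loader's sparse text encoding, a row stores only its non-zero entries as
// index:value pairs, and unlisted columns read back as zero, which is what
// the second and third target arrays assert. The actual SparseInput type is
// declared elsewhere in the test project; five float columns along these
// lines would match the five getters used above.
public class SparseInputSketch
{
    [Column("0")] public float C1;
    [Column("1")] public float C2;
    [Column("2")] public float C3;
    [Column("3")] public float C4;
    [Column("4")] public float C5;
}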