// Entry point: scores a single image against an ONNX emotion-recognition model
// using the statically-typed ML.NET pipeline API, then prints the result.
static void Main(string[] args)
{
    var onnxModel = Path.Combine("Assets", "Model", "model.onnx");
    var imagePathsData = Path.Combine("Assets", "Data", "images.tsv");
    var imageFolder = Path.Combine("Assets", "Images");
    using (var environment = new ConsoleEnvironment())
    {
        // Model input size — assumes the ONNX graph expects 64x64 grayscale; TODO confirm against the model.
        var imageHeight = 64;
        var imageWidth = 64;
        // Reader schema: column 0 = image path, column 1 = name; tab-separated, no header.
        var loader = TextLoader.CreateReader(environment, context => (
            ImagePath: context.LoadText(0),
            Name: context.LoadText(1)),
            separator: '\t', hasHeader: false);
        var data = loader.Read(new MultiFileSource(imagePathsData));
        // Pipeline: load image from folder -> grayscale -> resize -> raw pixels -> ONNX scoring.
        var estimator = loader.MakeNewEstimator()
            .Append(row => (
                Name: row.Name,
                input: row.ImagePath.LoadAsImage(imageFolder).AsGrayscale().Resize(imageWidth, imageHeight).ExtractPixels()))
            .Append(row => (row.Name, EmotionScores: row.input.ApplyOnnxModel(onnxModel)));
        var model = estimator.Fit(data);
        // Switch to the dynamic API to get a single-example prediction function.
        var predictionFunction = model.AsDynamic.MakePredictionFunction<EmotionData, EmotionPrediction>(environment);
        var prediction = predictionFunction.Predict(new EmotionData() { ImagePath = "1.jpg" });
        int emotion = GetEmotion(prediction.PredictedLabels);
        Console.WriteLine(GetEmotionString(emotion));
        Console.ReadLine();
    }
}
// Trains LightGBM multiclass on the iris dataset through the static pipeline API,
// verifies the onFit callback delivered a predictor, and sanity-checks metrics.
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
public void MultiClassLightGBM()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.iris.trainFilename);
    var dataSource = new MultiFileSource(dataPath);

    var ctx = new MulticlassClassificationContext(env);
    // Column 0 = label text, columns 1-4 = numeric features.
    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

    OvaPredictor pred = null;

    // With a custom loss function we no longer get calibrated predictions.
    var est = reader.MakeNewEstimator()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, preds: ctx.Trainers.LightGbm(
            r.label,
            r.features, onFit: p => pred = p)));

    var pipe = reader.Append(est);

    // The predictor is populated only when Fit runs.
    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    var data = model.Read(dataSource);

    // Just output some data on the schema for fun.
    var schema = data.AsDynamic.Schema;
    for (int c = 0; c < schema.ColumnCount; ++c)
    {
        Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
    }

    var metrics = ctx.Evaluate(data, r => r.label, r => r.preds, 2);
    Assert.True(metrics.LogLoss > 0);
    Assert.True(metrics.TopKAccuracy > 0);
}
/// <summary>
/// Trains a multiclass SDCA model that classifies GitHub issues into areas from
/// their title and description, then saves the trained model to disk.
/// </summary>
public void ClassifyGithubIssues()
{
    var env = new LocalEnvironment(new SysRandom(0), verbose: true);

    string dataPath = "corefx-issues-train.tsv";

    // Create reader with specific schema.
    // string: ID, string: Area, string: Title, string: Description
    // BUGFIX: the data path and header flag were previously embedded in the
    // schema tuple; they belong to CreateReader's own parameters.
    var reader = TextLoader.CreateReader(env, ctx => (
        area: ctx.LoadText(1),
        title: ctx.LoadText(2),
        description: ctx.LoadText(3)),
        hasHeader: true);

    // BUGFIX: MakeEstimator -> MakeNewEstimator, and the lambdas below referenced
    // an undefined 'r' while the parameter was named 'row'.
    var estimator = reader.MakeNewEstimator()
        .Append(row => (
            // Convert string label to key.
            label: row.area.Dictionarize(),
            // Featurizes 'description'
            description: row.description.FeaturizeText(),
            // Featurizes 'title'
            title: row.title.FeaturizeText()))
        .Append(row => (
            // Concatenate the two features into a vector.
            features: row.description.ConcatWith(row.title),
            // Preserve the label
            label: row.label))
        .Append(row => row.label.PredictSdcaMultiClass(row.features));

    // Read the data
    var data = reader.Read(dataPath);

    // Fit the data
    var model = estimator.Fit(data);

    string modelPath = "github-Model.zip";
    // We don't currently have the WriteAsync.
    // BUGFIX: the original used 'await' in a non-async void method, which does
    // not compile; block synchronously here instead (sample code, no sync context).
    model.WriteAsync(modelPath).GetAwaiter().GetResult();
}
// Trains a LightGBM binary classifier on the breast-cancer dataset and
// sanity-checks the learned feature weights and the evaluation metrics.
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
public void LightGbmBinaryClassification()
{
    var environment = new ConsoleEnvironment(seed: 0);
    var trainFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var source = new MultiFileSource(trainFile);

    var binaryCtx = new BinaryClassificationContext(environment);
    var textReader = TextLoader.CreateReader(environment,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    IPredictorWithFeatureWeights<float> fittedPredictor = null;

    var trainerEst = textReader.MakeNewEstimator()
        .Append(r => (r.label, preds: binaryCtx.Trainers.LightGbm(
            r.label,
            r.features,
            numBoostRound: 10,
            numLeaves: 5,
            learningRate: 0.01,
            onFit: p => fittedPredictor = p)));

    var pipeline = textReader.Append(trainerEst);

    // The onFit callback fires only during Fit.
    Assert.Null(fittedPredictor);
    var trainedModel = pipeline.Fit(source);
    Assert.NotNull(fittedPredictor);

    // 9 input features, so we ought to have 9 weights.
    var featureWeights = new VBuffer<float>();
    fittedPredictor.GetFeatureWeights(ref featureWeights);
    Assert.Equal(9, featureWeights.Length);

    var scored = trainedModel.Read(source);
    var evalMetrics = binaryCtx.Evaluate(scored, r => r.label, r => r.preds);

    // Run a sanity check against a few of the metrics.
    Assert.InRange(evalMetrics.Accuracy, 0, 1);
    Assert.InRange(evalMetrics.Auc, 0, 1);
    Assert.InRange(evalMetrics.Auprc, 0, 1);
}
// Trains LightGBM ranking on the adult-ranking dataset with a key-typed group id
// and checks the NDCG/DCG metrics fall inside baseline ranges.
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
public void LightGBMRanking()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.adultRanking.trainFilename);
    var dataSource = new MultiFileSource(dataPath);

    var ctx = new RankingContext(env);

    // Column 0 = relevance label, 9-14 = features, 1 = group id (loaded as text, keyed below).
    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(0), features: c.LoadFloat(9, 14), groupId: c.LoadText(1)),
        separator: '\t', hasHeader: true);

    LightGbmRankingPredictor pred = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, r.features, groupId: r.groupId.ToKey()))
        .Append(r => (r.label, r.groupId, score: ctx.Trainers.LightGbm(r.label, r.features, r.groupId, onFit: (p) => { pred = p; })));

    var pipe = reader.Append(est);

    // The predictor is only delivered through onFit during Fit.
    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    var data = model.Read(dataSource);
    var metrics = ctx.Evaluate(data, r => r.label, r => r.groupId, r => r.score);
    Assert.NotNull(metrics);

    // Both DCG and NDCG are reported at three truncation levels.
    Assert.True(metrics.Ndcg.Length == metrics.Dcg.Length && metrics.Dcg.Length == 3);

    // Baseline ranges for this dataset/seed.
    Assert.InRange(metrics.Dcg[0], 1.4, 1.6);
    Assert.InRange(metrics.Dcg[1], 1.4, 1.8);
    Assert.InRange(metrics.Dcg[2], 1.4, 1.8);
    Assert.InRange(metrics.Ndcg[0], 36.5, 37);
    Assert.InRange(metrics.Ndcg[1], 36.5, 37);
    Assert.InRange(metrics.Ndcg[2], 36.5, 37);
}
// Scores the cifar TensorFlow frozen graph through the static pipeline API and
// verifies every row produces a 10-element output vector across 4 rows.
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // TensorFlow is 64-bit only
public void TestTensorFlowStatic()
{
    var modelLocation = "cifar_model/frozen_model.pb";

    var mlContext = new MLContext(seed: 1, conc: 1);
    var imageHeight = 32;
    var imageWidth = 32;
    var dataFile = GetDataPath("images/images.tsv");
    // Image paths in the tsv are relative to the tsv's own directory.
    var imageFolder = Path.GetDirectoryName(dataFile);

    var data = TextLoader.CreateReader(mlContext, ctx => (
        imagePath: ctx.LoadText(0),
        name: ctx.LoadText(1)))
        .Read(dataFile);

    // Note that CamelCase column names are there to match the TF graph node names.
    var pipe = data.MakeNewEstimator()
        .Append(row => (
            row.name,
            Input: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleaveArgb: true)))
        .Append(row => (row.name, Output: row.Input.ApplyTensorFlowGraph(modelLocation)));

    TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

    var result = pipe.Fit(data).Transform(data).AsDynamic;
    result.Schema.TryGetColumnIndex("Output", out int output);
    // Walk the output column row by row and verify its size on every row.
    using (var cursor = result.GetRowCursor(col => col == output))
    {
        var buffer = default(VBuffer<float>);
        var getter = cursor.GetGetter<VBuffer<float>>(output);
        var numRows = 0;
        while (cursor.MoveNext())
        {
            getter(ref buffer);
            // 10 scores per row — presumably the graph's class count; matches the assertion baseline.
            Assert.Equal(10, buffer.Length);
            numRows += 1;
        }
        // The test data is expected to contain exactly 4 rows.
        Assert.Equal(4, numRows);
    }
}
// Fits the vector whitening estimator and round-trips the result through the
// legacy model save/load path to verify backward compatibility.
public void TestWhiteningOldSavingAndLoading()
{
    var environment = new ConsoleEnvironment(seed: 0);
    string sourcePath = GetDataPath("generated_regression_dataset.csv");
    var view = TextLoader.CreateReader(environment,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(sourcePath).AsDynamic;

    var whitening = new VectorWhiteningEstimator(environment, "features", "whitened");
    var transformed = whitening.Fit(view).Transform(view);
    var roles = new RoleMappedData(transformed);

    using (var stream = new MemoryStream())
    {
        // Save through the old model format, then reload the transform chain.
        TrainUtils.SaveModel(Env, Env.Start("saving"), stream, null, roles);
        stream.Position = 0;
        var reloaded = ModelFileUtils.LoadTransforms(Env, view, stream);
    }
    Done();
}
// Smoke test for the online linear learners (online gradient descent and
// averaged perceptron) over normalized breast-cancer features.
public void OnlineLinearWorkout()
{
    var path = GetDataPath("breast-cancer.txt");
    var dataset = TextLoader.CreateReader(Env,
        ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
        .Read(new MultiFileSource(path));

    // Normalize features once, then feed the same data to both trainers.
    var normPipe = dataset.MakeNewEstimator()
        .Append(r => (r.Label, Features: r.Features.Normalize()));
    var normalized = normPipe.Fit(dataset).Transform(dataset).AsDynamic;

    IEstimator<ITransformer> trainer =
        new OnlineGradientDescentTrainer(Env, new OnlineGradientDescentTrainer.Arguments());
    TestEstimatorCore(trainer, normalized);

    trainer = new AveragedPerceptronTrainer(Env, new AveragedPerceptronTrainer.Arguments());
    TestEstimatorCore(trainer, normalized);

    Done();
}
// Verifies the image-loading pipeline reports a known-size 3x8x10 R4 vector
// (dims asserted below) under the default output column name "Data".
public void SimpleImageSmokeTest()
{
    var environment = new ConsoleEnvironment(0, verbose: true);
    var imageReader = TextLoader.CreateReader(environment,
        ctx => ctx.LoadText(0).LoadAsImage().AsGrayscale().Resize(10, 8).ExtractPixels());

    var outputSchema = imageReader.AsDynamic.GetOutputSchema();
    Assert.True(outputSchema.TryGetColumnIndex("Data", out int dataCol), "Could not find 'Data' column");

    var type = outputSchema.GetColumnType(dataCol);
    Assert.True(type.IsKnownSizeVector, $"Type was supposed to be known size vector but was instead '{type}'");

    var vectorType = type.AsVector;
    Assert.Equal(NumberType.R4, vectorType.ItemType);
    Assert.Equal(3, vectorType.DimCount);
    // Dimension order asserted here: 3, then height 8, then width 10.
    Assert.Equal(3, vectorType.GetDim(0));
    Assert.Equal(8, vectorType.GetDim(1));
    Assert.Equal(10, vectorType.GetDim(2));
}
// Exercises the three SDCA trainer flavors (binary, regression, multiclass)
// on cached breast-cancer data with a loose tolerance to keep the test fast.
public void SdcaWorkout()
{
    var path = GetDataPath("breast-cancer.txt");
    var cached = TextLoader.CreateReader(Env,
        ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
        .Read(path).Cache();

    var binary = new SdcaBinaryTrainer(Env, "Label", "Features",
        advancedSettings: s => s.ConvergenceTolerance = 1e-2f);
    TestEstimatorCore(binary, cached.AsDynamic);

    var regression = new SdcaRegressionTrainer(Env, "Label", "Features",
        advancedSettings: s => s.ConvergenceTolerance = 1e-2f);
    TestEstimatorCore(regression, cached.AsDynamic);

    var multiclass = new SdcaMultiClassTrainer(Env, "Label", "Features",
        advancedSettings: s => s.ConvergenceTolerance = 1e-2f);
    TestEstimatorCore(multiclass, cached.AsDynamic);

    Done();
}
// Trains single-threaded SGD binary classification on breast-cancer data and
// sanity-checks the learned feature weights and evaluation metrics.
public void HogwildSGDBinaryClassification()
{
    var mlEnv = new MLContext(seed: 0);
    var trainFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var source = new MultiFileSource(trainFile);

    var binaryCtx = new BinaryClassificationContext(mlEnv);
    var textReader = TextLoader.CreateReader(mlEnv,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    IPredictorWithFeatureWeights<float> fitted = null;

    var trainerEst = textReader.MakeNewEstimator()
        .Append(r => (r.label, preds: binaryCtx.Trainers.StochasticGradientDescentClassificationTrainer(
            r.label,
            r.features,
            l2Weight: 0,
            onFit: p => fitted = p,
            advancedSettings: s => s.NumThreads = 1)));

    var pipeline = textReader.Append(trainerEst);

    // The onFit callback fires only during Fit.
    Assert.Null(fitted);
    var trainedModel = pipeline.Fit(source);
    Assert.NotNull(fitted);

    // 9 input features, so we ought to have 9 weights.
    var featureWeights = new VBuffer<float>();
    fitted.GetFeatureWeights(ref featureWeights);
    Assert.Equal(9, featureWeights.Length);

    var scored = trainedModel.Read(source);
    var evalMetrics = binaryCtx.Evaluate(scored, r => r.label, r => r.preds);

    // Run a sanity check against a few of the metrics.
    Assert.InRange(evalMetrics.Accuracy, 0, 1);
    Assert.InRange(evalMetrics.Auc, 0, 1);
    Assert.InRange(evalMetrics.Auprc, 0, 1);
}
// N-gram featurization workout: tokenize, map to keys, then produce count-based
// and hash-based n-grams, saving a 4-row sample for baseline comparison.
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Same file with 'text' loaded as float: schema-invalid input for the text transforms.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var est = new WordTokenizer(Env, "text", "text")
        .Append(new TermEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first 4 rows and the relevant columns for the baseline file.
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
// Exercises AsVector/ConcatWith over scalar and vector columns and verifies
// the resulting column sizes and item types.
public void ConcatWith()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("iris.data");
    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadText(4), values: c.LoadFloat(0, 3), value: c.LoadFloat(2)),
        separator: ',');
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    var est = data.MakeNewEstimator()
        .Append(r => (
            r.label, r.values, r.value,
            // c0: scalar promoted to a 1-element vector.
            c0: r.label.AsVector(),
            // c1: scalar + scalar => 2-element vector.
            c1: r.label.ConcatWith(r.label),
            // c2: scalar + 4-vector => 5-element vector.
            c2: r.value.ConcatWith(r.values),
            // c3: 4-vector + scalar + 4-vector => 9-element vector.
            c3: r.values.ConcatWith(r.value, r.values)));

    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;
    int[] idx = new int[4];
    for (int i = 0; i < idx.Length; ++i)
    {
        Assert.True(schema.TryGetColumnIndex("c" + i, out idx[i]), $"Could not find col c{i}");
    }
    var types = new VectorType[idx.Length];
    // Expected sizes matching the c0..c3 constructions above.
    int[] expectedLen = new int[] { 1, 2, 5, 9 };
    for (int i = 0; i < idx.Length; ++i)
    {
        var type = schema.GetColumnType(idx[i]);
        Assert.True(type.VectorSize > 0, $"Col c{i} had unexpected type {type}");
        types[i] = type.AsVector;
        Assert.Equal(expectedLen[i], type.VectorSize);
    }
    // Text concatenations stay text; numeric ones stay float.
    Assert.Equal(TextType.Instance, types[0].ItemType);
    Assert.Equal(TextType.Instance, types[1].ItemType);
    Assert.Equal(NumberType.Float, types[2].ItemType);
    Assert.Equal(NumberType.Float, types[3].ItemType);
}
// Fits a rank-5 PCA over the regression dataset and compares a 4-row sample of
// the projected column against the checked-in baseline.
public void TestPcaEstimator()
{
    // Column 11 is the label; columns 0-10 are the features.
    var dataset = TextLoader.CreateReader(_env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(_dataSource);

    var pcaEst = new PcaEstimator(_env, "features", "pca", rank: 5, seed: 1);
    var outputPath = GetOutputPath("PCA", "pca.tsv");

    using (var ch = _env.Start("save"))
    {
        // Keep only the first 4 rows of the "pca" column for baseline comparison.
        IDataView toSave = TakeFilter.Create(_env, pcaEst.Fit(dataset.AsDynamic).Transform(dataset.AsDynamic), 4);
        toSave = new ChooseColumnsTransform(_env, toSave, "pca");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, _saver, toSave, fs, keepHidden: true);
    }

    CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
    Done();
}
// Workout for the PCA estimator with default and non-default settings,
// including a schema-invalid input (features loaded as text).
public void PcaWorkout()
{
    var validData = TextLoader.CreateReader(_env,
        c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)),
        separator: ';', hasHeader: true)
        .Read(_dataSource);

    // Same columns with 'features' loaded as text — invalid for PCA.
    var badData = TextLoader.CreateReader(_env,
        c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)),
        separator: ';', hasHeader: true)
        .Read(_dataSource);

    var defaultPca = new PcaEstimator(_env, "features", "pca", rank: 4, seed: 10);
    TestEstimatorCore(defaultPca, validData.AsDynamic, invalidInput: badData.AsDynamic);

    // Exercise the non-default knobs: weight column, oversampling, no centering.
    var customPca = new PcaEstimator(_env, "features", "pca", rank: 3,
        weightColumn: "weight", overSampling: 2, center: false);
    TestEstimatorCore(customPca, validData.AsDynamic, invalidInput: badData.AsDynamic);

    Done();
}
// Projects the regression features onto 5 principal components via the static
// API and verifies the output column is a known-size numeric vector.
public void PrincipalComponentAnalysis()
{
    var environment = new ConsoleEnvironment(seed: 0);
    var path = GetDataPath("generated_regression_dataset.csv");
    var source = new MultiFileSource(path);

    var textReader = TextLoader.CreateReader(environment,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var dataset = textReader.Read(source);

    var pcaEst = textReader.MakeNewEstimator()
        .Append(r => (r.label, pca: r.features.ToPrincipalComponents(rank: 5)));
    var transformed = pcaEst.Fit(dataset).Transform(dataset);

    var outSchema = transformed.AsDynamic.Schema;
    Assert.True(outSchema.TryGetColumnIndex("pca", out int pcaCol));
    var pcaType = outSchema.GetColumnType(pcaCol);
    Assert.True(pcaType.IsVector && pcaType.IsKnownSizeVector && pcaType.ItemType.IsNumber);
}
// Trains calibrated SDCA binary classification and verifies both the linear
// predictor and the calibrated predictor are delivered through onFit.
public void SdcaBinaryClassification()
{
    var env = new TlcEnvironment(seed: 0);
    var dataPath = GetDataPath("breast-cancer.txt");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    LinearBinaryPredictor pred = null;
    ParameterMixingCalibratedPredictor cali = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: r.label.PredictSdcaBinaryClassification(r.features,
            maxIterations: 2,
            onFit: (p, c) => { pred = p; cali = c; })));

    var pipe = reader.Append(est);

    // Both predictors are populated only during Fit.
    Assert.Null(pred);
    Assert.Null(cali);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);
    Assert.NotNull(cali);
    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, pred.Weights2.Count);

    var data = model.Read(dataSource);

    // Just output some data on the schema for fun.
    var rows = DataViewUtils.ComputeRowCount(data.AsDynamic);
    var schema = data.AsDynamic.Schema;
    for (int c = 0; c < schema.ColumnCount; ++c)
    {
        Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
    }
}
// Trains an averaged perceptron with hinge loss (no calibration) and
// sanity-checks the learned weights and evaluation metrics.
public void AveragePerceptronNoCalibration()
{
    var environment = new ConsoleEnvironment(seed: 0);
    var trainFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var source = new MultiFileSource(trainFile);

    var binaryCtx = new BinaryClassificationContext(environment);
    var textReader = TextLoader.CreateReader(environment,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    LinearBinaryPredictor fitted = null;
    var hinge = new HingeLoss(new HingeLoss.Arguments() { Margin = 1 });

    var trainerEst = textReader.MakeNewEstimator()
        .Append(r => (r.label, preds: binaryCtx.Trainers.AveragedPerceptron(
            r.label, r.features,
            lossFunction: hinge,
            numIterations: 2,
            onFit: p => fitted = p)));

    var pipeline = textReader.Append(trainerEst);

    // The onFit callback fires only during Fit.
    Assert.Null(fitted);
    var trainedModel = pipeline.Fit(source);
    Assert.NotNull(fitted);

    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, fitted.Weights2.Count);

    var scored = trainedModel.Read(source);
    var evalMetrics = binaryCtx.Evaluate(scored, r => r.label, r => r.preds);

    // Run a sanity check against a few of the metrics.
    Assert.InRange(evalMetrics.Accuracy, 0, 1);
    Assert.InRange(evalMetrics.Auc, 0, 1);
    Assert.InRange(evalMetrics.Auprc, 0, 1);
}
// PCA workout: trains a rank-5 PCA with a fixed seed and compares a 4-row
// sample of the projected output against a checked-in baseline.
public void PcaWorkout()
{
    var env = new ConsoleEnvironment(seed: 1, conc: 1);
    string dataSource = GetDataPath("generated_regression_dataset.csv");
    var data = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    // Same columns loaded as text: schema-invalid input for PCA.
    var invalidData = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => { s.Seed = 1; });

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("PCA", "pca.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        // Keep only the first 4 rows of the "pca" column for the baseline file.
        IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(env, savedData, "pca");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("PCA", "pca.tsv");
    Done();
}
// Trains a multiclass SDCA text classifier from TrainDataPath and saves the
// resulting model to ModelPath.
public static void Train()
{
    using (var env = new LocalEnvironment(1974))
    {
        /*env.AddListener((messageSource, message) =>
         * Console.WriteLine($"{messageSource.ShortName}: {message.Message} ({message.Kind})"));*/
        env.AddListener(ConsoleLogger);

        var classification = new MulticlassClassificationContext(env);

        // Comma-separated input: column 0 = label, column 1 = sentence.
        var reader = TextLoader.CreateReader(env, ctx => (
            Sentence: ctx.LoadText(1),
            Label: ctx.LoadText(0)
        ), separator: ',');

        var trainData = reader.Read(new MultiFileSource(TrainDataPath));

        // Key-ify the label, featurize the sentence text, train SDCA, then map
        // the predicted key back to its original text value.
        var pipeline = reader.MakeNewEstimator()
            .Append(r => (
                Label: r.Label.ToKey(),
                Features: r.Sentence.FeaturizeText()))
            .Append(r => (
                r.Label,
                Predictions: classification.Trainers.Sdca(r.Label, r.Features)
            ))
            .Append(r => r.Predictions.predictedLabel.ToValue());

        Console.WriteLine("=============== Training model ===============");

        var model = pipeline.Fit(trainData).AsDynamic;

        using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
            model.SaveTo(env, fs);

        Console.WriteLine("=============== End training ===============");
        Console.WriteLine("The model is saved to {0}", ModelPath);
    }
}
// Verifies ToKey on scalar text and float vectors: key counts, key item types,
// and the KeyValues metadata carried by each key column.
public void ToKey()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("iris.data");
    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadText(4), values: c.LoadFloat(0, 3)),
        separator: ',');
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    // First stage keys the label and the float vector; second stage re-keys the keyed vector.
    var est = data.MakeNewEstimator()
        .Append(r => (labelKey: r.label.ToKey(), valuesKey: r.values.ToKey(onFit: m => { })))
        .Append(r => (r.labelKey, r.valuesKey, valuesKeyKey: r.valuesKey.ToKey()));

    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;
    Assert.True(schema.TryGetColumnIndex("labelKey", out int labelCol));
    Assert.True(schema.TryGetColumnIndex("valuesKey", out int valuesCol));
    Assert.True(schema.TryGetColumnIndex("valuesKeyKey", out int valuesKeyCol));

    // The iris label has three distinct classes, hence a key count of 3.
    Assert.Equal(3, schema.GetColumnType(labelCol).KeyCount);
    Assert.True(schema.GetColumnType(valuesCol).ItemType.IsKey);
    Assert.True(schema.GetColumnType(valuesKeyCol).ItemType.IsKey);

    var labelKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, labelCol);
    var valuesKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesCol);
    var valuesKeyKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesKeyCol);
    Assert.NotNull(labelKeyType);
    Assert.NotNull(valuesKeyType);
    Assert.NotNull(valuesKeyKeyType);
    // KeyValues metadata is a vector of the original (pre-key) item type.
    Assert.True(labelKeyType.IsVector && labelKeyType.ItemType == TextType.Instance);
    Assert.True(valuesKeyType.IsVector && valuesKeyType.ItemType == NumberType.Float);
    Assert.True(valuesKeyKeyType.IsVector && valuesKeyKeyType.ItemType == NumberType.Float);
    // Because they're over exactly the same data, they ought to have the same cardinality and everything.
    Assert.True(valuesKeyKeyType.Equals(valuesKeyType));
}
// Builds a text pipeline that emits tokens, re-asserts a static schema over the
// dynamic output, and runs word embeddings over the transformed tokens.
public void TestWordEmbeddings()
{
    var dataPath = GetDataPath(ScenariosTests.SentimentDataPath);
    var testDataPath = GetDataPath(ScenariosTests.SentimentTestPath);

    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        SentimentText: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(dataPath));

    // Text transform configured to emit the tokenized text (OutputTokens = true)
    // and to skip char/word n-gram feature extraction entirely.
    var dynamicData = TextTransform.Create(Env, new TextTransform.Arguments()
    {
        Column = new TextTransform.Column
        {
            Name = "SentimentText_Features",
            Source = new[] { "SentimentText" }
        },
        KeepDiacritics = false,
        KeepPunctuations = false,
        TextCase = Runtime.TextAnalytics.TextNormalizerTransform.CaseNormalizationMode.Lower,
        OutputTokens = true,
        StopWordsRemover = new Runtime.TextAnalytics.PredefinedStopWordsRemoverFactory(),
        VectorNormalizer = TextTransform.TextNormKind.None,
        CharFeatureExtractor = null,
        WordFeatureExtractor = null,
    }, data.AsDynamic);

    // Re-enter the static world by asserting the schema of the dynamic output.
    var data2 = dynamicData.AssertStatic(Env, ctx => (
        SentimentText_Features_TransformedText: ctx.Text.VarVector,
        SentimentText: ctx.Text.Scalar,
        label: ctx.Bool.Scalar));

    var est = data2.MakeNewEstimator()
        .Append(row => row.SentimentText_Features_TransformedText.WordEmbeddings());

    TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);
    Done();
}
// Cross-validates a multiclass SDCA pipeline on iris and checks that all five
// folds report a positive log-loss.
public void CrossValidate()
{
    var mlEnv = new MLContext(seed: 0);
    var trainFile = GetDataPath(TestDatasets.iris.trainFilename);
    var source = new MultiFileSource(trainFile);

    var multiCtx = new MulticlassClassificationContext(mlEnv);
    var textReader = TextLoader.CreateReader(mlEnv,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

    // Key-ify the label, then train SDCA with a small iteration budget.
    var pipeline = textReader.MakeNewEstimator()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, preds: multiCtx.Trainers.Sdca(
            r.label,
            r.features,
            maxIterations: 2)));

    // The default cross-validation produces five folds.
    var foldMetrics = multiCtx.CrossValidate(textReader.Read(source), pipeline, r => r.label)
        .Select(x => x.metrics).ToArray();
    Assert.Equal(5, foldMetrics.Length);
    Assert.True(foldMetrics.All(x => x.LogLoss > 0));
}
// Repeatedly (5x) trains and evaluates a Dutch-language multiclass SDCA text
// classifier over a generated test file, printing micro/macro accuracy each run.
static void Main(string[] args)
{
    var testDataFile = TestData.PrepareTestDataAndReturnPath(23, 61, 72);

    for (int i = 0; i < 5; i++)
    {
        var env = new LocalEnvironment();
        var classification = new MulticlassClassificationContext(env);

        // Comma-separated input: column 0 = label, column 1 = text.
        var reader = TextLoader.CreateReader(env,
            ctx => (Label: ctx.LoadText(0), Text: ctx.LoadText(1)),
            separator: ',', hasHeader: false);
        var data = reader.Read(new MultiFileSource(testDataFile));

        var learningPipeline = reader.MakeNewEstimator()
            .Append(r => (Label: r.Label.ToKey(), Features: r.Text.FeaturizeText(advancedSettings: s =>
            {
                s.KeepDiacritics = false;
                //s.KeepNumbers = false; bv PC
                s.KeepPunctuations = false;
                s.TextCase = TextNormalizerTransform.CaseNormalizationMode.Lower;
                s.TextLanguage = TextTransform.Language.Dutch;
                s.VectorNormalizer = TextTransform.TextNormKind.LInf;
            })))
            .Append(r => (Label: r.Label, Predications: classification.Trainers.Sdca(r.Label, r.Features)));

        // Hold out 20% of the data for evaluation.
        var (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

        var model = learningPipeline.Fit(trainData);
        var metrics = classification.Evaluate(model.Transform(testData), r => r.Label, r => r.Predications);
        Console.WriteLine(metrics.AccuracyMicro);
        Console.WriteLine(metrics.AccuracyMacro);

        //var cvResults = classification.CrossValidate(data, learningPipeline, r => r.Label, numFolds: 5);
        //var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro);
        //Console.WriteLine(microAccuracies.Average());
        //var macroAccuracies = cvResults.Select(r => r.metrics.AccuracyMacro);
        //Console.WriteLine(macroAccuracies.Average());

        Console.WriteLine("-----------");
    }
}
// Workout for the full text featurizer (with OutputTokens enabled): runs the
// estimator checks and saves a 4-row sample for baseline comparison.
public void TextFeaturizerWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Same file with 'text' loaded as float: schema-invalid input for the featurizer.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath))
        .AsDynamic;

    //var feat = Estimator.MakeNew(data)
    //    .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));
    var feat = new TextTransform(Env, "text", "Data", advancedSettings: s => { s.OutputTokens = true; });

    TestEstimatorCore(feat, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Keep only the first 4 rows and the featurized columns for the baseline file.
        IDataView savedData = TakeFilter.Create(Env, feat.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "Data", "Data_TransformedText");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "featurized.tsv");
    Done();
}
// Applies Lp normalization, global-contrast normalization, ZCA whitening, and
// PCA whitening via the static API and checks each output column's type.
public void LpGcNormAndWhitening()
{
    var environment = new ConsoleEnvironment(seed: 0);
    var path = GetDataPath("generated_regression_dataset.csv");
    var source = new MultiFileSource(path);

    var textReader = TextLoader.CreateReader(environment,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var dataset = textReader.Read(source);

    var normEst = textReader.MakeNewEstimator()
        .Append(r => (r.label,
            lpnorm: r.features.LpNormalize(),
            gcnorm: r.features.GlobalContrastNormalize(),
            zcawhitened: r.features.ZcaWhitening(),
            pcswhitened: r.features.PcaWhitening()));
    var transformed = normEst.Fit(dataset).Transform(dataset);

    // Every output column must be a known-size numeric vector.
    var outSchema = transformed.AsDynamic.Schema;
    foreach (var name in new[] { "lpnorm", "gcnorm", "zcawhitened", "pcswhitened" })
    {
        Assert.True(outSchema.TryGetColumnIndex(name, out int col));
        var colType = outSchema.GetColumnType(col);
        Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
    }
}
// N-gram featurization workout: tokenize, map to keys, then produce count-based
// and hash-based n-grams, saving a 4-row sample for baseline comparison.
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    // Same file with 'text' loaded as float — schema-invalid for the text transforms.
    var badData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var ngramEst = new WordTokenizingEstimator(Env, "text", "text")
        .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
        .Append(new NgramCountingEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    TestEstimatorCore(ngramEst, validData.AsDynamic, invalidInput: badData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first 4 rows and the relevant columns for the baseline file.
        IDataView toSave = TakeFilter.Create(Env, ngramEst.Fit(validData.AsDynamic).Transform(validData.AsDynamic), 4);
        toSave = ColumnSelectingTransformer.CreateKeep(Env, toSave, new[] { "text", "terms", "ngrams", "ngramshash" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, toSave, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
// Workout for the Lp normalizer, global-contrast normalizer, and whitening
// transforms, with a schema-invalid input and a saved baseline sample.
public void LpGcNormAndWhiteningWorkout()
{
    var environment = new ConsoleEnvironment(seed: 0);
    string sourcePath = GetDataPath("generated_regression_dataset.csv");

    var validData = TextLoader.CreateReader(environment,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(new MultiFileSource(sourcePath));

    // Same columns with 'features' loaded as text — invalid for these transforms.
    var badData = TextLoader.CreateReader(environment,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(new MultiFileSource(sourcePath));

    var chain = new LpNormalizer(environment, "features", "lpnorm")
        .Append(new GlobalContrastNormalizer(environment, "features", "gcnorm"))
        .Append(new Whitening(environment, "features", "whitened"));

    TestEstimatorCore(chain, validData.AsDynamic, invalidInput: badData.AsDynamic);

    var outputPath = GetOutputPath("Text", "lpnorm_gcnorm_whitened.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        // Save only the first 4 rows of the transformed columns for comparison.
        IDataView toSave = TakeFilter.Create(Env, chain.Fit(validData.AsDynamic).Transform(validData.AsDynamic), 4);
        toSave = new ChooseColumnsTransform(Env, toSave, "lpnorm", "gcnorm", "whitened");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, toSave, fs, keepHidden: true);
    }

    CheckEquality("Text", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
    Done();
}
// Workout for text normalization, word tokenization, and stopword removal,
// with a schema-invalid input and a saved baseline sample.
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Same file with 'text' loaded as float — schema-invalid for the text transforms.
    var badData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Normalize the text, tokenize into words, then strip stopwords.
    var textChain = new TextNormalizer(Env, "text")
        .Append(new WordTokenizer(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));

    TestEstimatorCore(textChain, validData.AsDynamic, invalidInput: badData.AsDynamic);

    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first 4 rows of the relevant columns for comparison.
        IDataView toSave = TakeFilter.Create(Env, textChain.Fit(validData.AsDynamic).Transform(validData.AsDynamic), 4);
        toSave = new ChooseColumnsTransform(Env, toSave, "text", "words_without_stopwords");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, toSave, fs, keepHidden: true);
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}
// Runs binning and min-max normalization over the same feature column and
// verifies the IsNormalized schema flag on each output.
public void Normalizer()
{
    var environment = new ConsoleEnvironment(seed: 0);
    var path = GetDataPath("generated_regression_dataset.csv");
    var source = new MultiFileSource(path);

    var textReader = TextLoader.CreateReader(environment,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var dataset = textReader.Read(source);

    var normEst = textReader.MakeNewEstimator()
        .Append(r => (r.label, r.features,
            bin: r.features.NormalizeByBinning(),
            mm: r.features.Normalize()));
    var transformed = normEst.Fit(dataset).Transform(dataset);

    var outSchema = transformed.AsDynamic.Schema;
    Assert.True(outSchema.TryGetColumnIndex("features", out int featCol));
    Assert.True(outSchema.TryGetColumnIndex("bin", out int binCol));
    Assert.True(outSchema.TryGetColumnIndex("mm", out int mmCol));

    // The raw features stay unnormalized; both derived columns are flagged normalized.
    Assert.False(outSchema.IsNormalized(featCol));
    Assert.True(outSchema.IsNormalized(binCol));
    Assert.True(outSchema.IsNormalized(mmCol));
}