/// <summary>
/// Checks that the onFit callbacks of the CDF, mean-variance, and binning normalizers
/// fire during Fit and hand back one fitted parameter set per input column.
/// </summary>
public void NormalizerWithOnFit()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("generated_regression_dataset.csv");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env, c => c.LoadFloat(0, 2), separator: ';', hasHeader: true);
    var data = reader.Read(dataSource);

    // Captured by the onFit callbacks below; populated once Fit runs.
    ImmutableArray<float> cdfMeans;
    ImmutableArray<float> mvScales;
    ImmutableArray<ImmutableArray<float>> binBounds;

    var est = reader.MakeNewEstimator()
        .Append(r => (r,
            ncdf: r.NormalizeByCumulativeDistribution(onFit: (m, s) => cdfMeans = m),
            n: r.NormalizeByMeanVar(onFit: (s, o) => { mvScales = s; Assert.Empty(o); }),
            b: r.NormalizeByBinning(onFit: b => binBounds = b)));
    var tdata = est.Fit(data).Transform(data);

    // Three loaded columns, so each normalizer reports three parameter sets.
    Assert.Equal(3, cdfMeans.Length);
    Assert.Equal(3, mvScales.Length);
    Assert.Equal(3, binBounds.Length);

    // Dump a few rows to the console for manual inspection.
    using (var stream = new MemoryStream())
    {
        IDataView view = new ChooseColumnsTransform(env, tdata.AsDynamic, "r", "ncdf", "n", "b");
        view = TakeFilter.Create(env, view, 10);
        var saver = new TextSaver(env, new TextSaver.Arguments() { Dense = true, Separator = ",", OutputHeader = false });
        saver.SaveData(stream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
        Console.WriteLine(Encoding.UTF8.GetString(stream.ToArray()));
    }
}
/// <summary>
/// Exercises the word tokenizer, term mapping, n-gram, and hashed n-gram estimators
/// end to end, then saves a small sample and compares it to the checked-in baseline.
/// </summary>
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var pipeline = new WordTokenizer(Env, "text", "text")
        .Append(new TermEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView sample = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        sample = new ChooseColumnsTransform(Env, sample, "text", "terms", "ngrams", "ngramshash");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, sample, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
/// <summary>
/// Fits a rank-5 PCA over 11 loaded feature columns, saves the projected column
/// for a few rows, and compares against the baseline at reduced precision.
/// </summary>
public void TestPcaEstimator()
{
    var data = TextLoader.CreateReader(_env,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(_dataSource);

    var pca = new PcaEstimator(_env, "features", "pca", rank: 5, seed: 1);

    var outputPath = GetOutputPath("PCA", "pca.tsv");
    using (var ch = _env.Start("save"))
    {
        IDataView sample = TakeFilter.Create(_env, pca.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        sample = new ChooseColumnsTransform(_env, sample, "pca");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, _saver, sample, fs, keepHidden: true);
    }

    // PCA output is floating point; compare with limited precision.
    CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
    Done();
}
/// <summary>
/// Workout for the PCA estimator configured via advanced settings: fits a rank-5
/// projection, saves a sample of the output, and checks it against the baseline.
/// </summary>
public void PcaWorkout()
{
    var env = new ConsoleEnvironment(seed: 1, conc: 1);
    string dataSource = GetDataPath("generated_regression_dataset.csv");

    var data = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    var invalidData = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    var pca = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => { s.Seed = 1; });

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(pca, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("PCA", "pca.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView sample = TakeFilter.Create(env, pca.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        sample = new ChooseColumnsTransform(env, sample, "pca");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, sample, fs, keepHidden: true);
    }

    CheckEquality("PCA", "pca.tsv");
    Done();
}
/// <summary>
/// Chains Lp normalization, global contrast normalization, and whitening over the
/// same feature vector, runs the estimator-core checks, and baselines a sample.
/// </summary>
public void LpGcNormAndWhiteningWorkout()
{
    var env = new ConsoleEnvironment(seed: 0);
    string dataSource = GetDataPath("generated_regression_dataset.csv");

    var data = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    // Same schema but text features, to verify the estimators reject wrong types.
    var invalidData = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    var pipeline = new LpNormalizer(env, "features", "lpnorm")
        .Append(new GlobalContrastNormalizer(env, "features", "gcnorm"))
        .Append(new Whitening(env, "features", "whitened"));
    TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "lpnorm_gcnorm_whitened.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView sample = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        sample = new ChooseColumnsTransform(Env, sample, "lpnorm", "gcnorm", "whitened");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, sample, fs, keepHidden: true);
    }

    CheckEquality("Text", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
    Done();
}
/// <summary>
/// Workout for the text featurizer with token output enabled: validates the
/// estimator contract, then baselines the featurized and tokenized columns.
/// </summary>
public void TextFeaturizerWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath))
        .AsDynamic;

    var featurizer = new TextTransform(Env, "text", "Data", advancedSettings: s => { s.OutputTokens = true; });
    TestEstimatorCore(featurizer, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView sample = TakeFilter.Create(Env, featurizer.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        sample = new ChooseColumnsTransform(Env, sample, "Data", "Data_TransformedText");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, sample, fs, keepHidden: true);
    }

    CheckEquality("Text", "featurized.tsv");
    Done();
}
/// <summary>
/// Chains text normalization, word tokenization, and stopword removal, runs the
/// estimator-core checks, and baselines the resulting columns.
/// </summary>
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Float "text" column: used to confirm the pipeline rejects the wrong type.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var pipeline = new TextNormalizer(Env, "text")
        .Append(new WordTokenizer(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
    TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView sample = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        sample = new ChooseColumnsTransform(Env, sample, "text", "words_without_stopwords");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, sample, fs, keepHidden: true);
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}
/// <summary>
/// Verifies that mapping strings to keys (via the dynamic Term transform) and then
/// back to values (via the statically-typed ToValue) round-trips both a scalar and
/// a vector string column exactly.
/// </summary>
public void KeyToValuePigsty()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoader.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Read(dataPath);

    // Map to keys with the dynamic (non-pigsty) Term transform, then re-assert
    // the static typing over the keyed columns.
    var dynamicData = new ValueToKeyMappingEstimator(Env, new[]
        {
            new TermTransform.ColumnInfo("ScalarString", "A"),
            new TermTransform.ColumnInfo("VectorString", "B")
        })
        .Fit(data.AsDynamic).Transform(data.AsDynamic);
    var keyed = dynamicData.AssertStatic(Env, ctx => (
        A: ctx.KeyU4.TextValues.Scalar,
        B: ctx.KeyU4.TextValues.Vector));

    var est = keyed.MakeNewEstimator()
        .Append(row => (
            ScalarString: row.A.ToValue(),
            VectorString: row.B.ToValue()));

    TestEstimatorCore(est.AsDynamic, keyed.AsDynamic, invalidInput: data.AsDynamic);

    // Term followed by ToValue must reproduce the original columns exactly.
    var dataLeft = new ChooseColumnsTransform(Env, data.AsDynamic, "ScalarString", "VectorString");
    var dataRight = new ChooseColumnsTransform(Env, est.Fit(keyed).Transform(keyed).AsDynamic, "ScalarString", "VectorString");

    CheckSameSchemas(dataLeft.Schema, dataRight.Schema);
    CheckSameValues(dataLeft, dataRight);
    Done();
}
/// <summary>
/// Checks that <c>ConcatEstimator</c> produces vector columns of the expected item
/// type and size when concatenating scalars, fixed-size vectors, and a
/// variable-size vector, then baselines a saved sample.
/// </summary>
void TestConcat()
{
    string dataPath = GetDataPath("adult.test");
    var source = new MultiFileSource(dataPath);

    // Reuse 'source' for the loader constructor instead of building a second,
    // redundant MultiFileSource over the same path (as the original code did).
    var loader = new TextLoader(Env, new TextLoader.Arguments
    {
        Column = new[]
        {
            // Single scalar column.
            new TextLoader.Column("float1", DataKind.R4, 0),
            // Fixed-size vector built from four source columns.
            new TextLoader.Column("float4", DataKind.R4, new[]
            {
                new TextLoader.Range(0), new TextLoader.Range(2),
                new TextLoader.Range(4), new TextLoader.Range(10)
            }),
            // Variable-size vector: the final range has an open, variable end.
            new TextLoader.Column("vfloat", DataKind.R4, new[]
            {
                new TextLoader.Range(0), new TextLoader.Range(2),
                new TextLoader.Range(4),
                new TextLoader.Range(10, null) { AutoEnd = false, VariableEnd = true }
            })
        },
        Separator = ",",
        HasHeader = true
    }, source);
    var data = loader.Read(source);

    // Renamed from 'GetType' to avoid shadowing object.GetType().
    ColumnType GetColumnType(ISchema schema, string name)
    {
        Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
        return schema.GetColumnType(cIdx);
    }

    var pipe = new ConcatEstimator(Env, "f1", "float1")
        .Append(new ConcatEstimator(Env, "f2", "float1", "float1"))
        .Append(new ConcatEstimator(Env, "f3", "float4", "float1"))
        .Append(new ConcatEstimator(Env, "f4", "vfloat", "float1"));

    data = TakeFilter.Create(Env, data, 10);
    data = pipe.Fit(data).Transform(data);

    // Expected sizes: scalar -> 1; scalar+scalar -> 2; 4-vector+scalar -> 5;
    // variable-size input -> variable-size output (VectorSize == 0).
    ColumnType t;
    t = GetColumnType(data.Schema, "f1");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 1);
    t = GetColumnType(data.Schema, "f2");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 2);
    t = GetColumnType(data.Schema, "f3");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 5);
    t = GetColumnType(data.Schema, "f4");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 0);

    data = new ChooseColumnsTransform(Env, data, "f1", "f2", "f3", "f4");

    var subdir = Path.Combine("Transform", "Concat");
    var outputPath = GetOutputPath(subdir, "Concat1.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, Dense = true });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
    }

    CheckEquality(subdir, "Concat1.tsv");
    Done();
}