// Chains LpNorm, global contrast normalization, and vector whitening over the same input column,
// checks schema propagation (including failure on text input), and compares the first
// transformed rows against the checked-in baseline.
public void LpGcNormAndWhiteningWorkout()
{
    string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var data = TextLoader.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataSource);
    var invalidData = TextLoader.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataSource);

    var est = new LpNormalizingEstimator(ML, "features", "lpnorm")
        .Append(new GlobalContrastNormalizingEstimator(ML, "features", "gcnorm"))
        .Append(new VectorWhiteningEstimator(ML, "features", "whitened"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = SelectColumnsTransform.CreateKeep(ML, savedData, new[] { "lpnorm", "gcnorm", "whitened" });

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
    Done();
}
// Tokenizes text into words and into character keys (mapped back to readable values)
// and validates the output against a saved baseline.
public void TextTokenizationWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)),
        hasHeader: true)
        .Read(sentimentDataPath);
    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)),
        hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordTokenizingEstimator(Env, "text", "words")
        .Append(new CharacterTokenizingEstimator(Env, "text", "chars"))
        .Append(new KeyToValueEstimator(Env, "chars"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "tokenized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = SelectColumnsTransform.CreateKeep(Env, savedData, "text", "words", "chars");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "tokenized.tsv");
    Done();
}
// Runs the default text featurizer with token output enabled and validates the
// featurized columns against a saved baseline.
public void TextFeaturizerWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)),
        hasHeader: true)
        .Read(sentimentDataPath);
    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)),
        hasHeader: true)
        .Read(sentimentDataPath)
        .AsDynamic;

    var feat = data.MakeNewEstimator()
        .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));

    TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, feat.Fit(data).Transform(data).AsDynamic, 4);
        savedData = SelectColumnsTransform.CreateKeep(Env, savedData, "Data", "Data_TransformedText");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "featurized.tsv");
    Done();
}
// Applies full whitening and PCA whitening to the same column and validates the
// output against a saved baseline.
public void WhiteningWorkout()
{
    var env = new ConsoleEnvironment(seed: 0);
    string dataSource = GetDataPath("generated_regression_dataset.csv");
    var data = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataSource);
    var invalidData = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataSource);

    var est = new VectorWhiteningEstimator(env, "features", "whitened1")
        .Append(new VectorWhiteningEstimator(env, "features", "whitened2", kind: WhiteningKind.Pca, pcaNum: 5));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = SelectColumnsTransform.CreateKeep(Env, savedData, "whitened1", "whitened2");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
    Done();
}
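
// The save-and-compare tail is repeated nearly verbatim across the workout tests above.
// A shared helper along the following lines could replace it. This is only a sketch,
// not code from the original suite: the name SaveAndCheck, the outputHeader and
// digitsOfPrecision parameters, and the fixed four-row take are assumptions.
private void SaveAndCheck(IDataView transformed, string dir, string name, string[] keepColumns,
    bool outputHeader = false, int digitsOfPrecision = 4)
{
    var outputPath = GetOutputPath(dir, name);
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, OutputHeader = outputHeader });
        // Keep only the first four rows of the requested columns, matching the tests above.
        IDataView savedData = TakeFilter.Create(Env, transformed, 4);
        savedData = SelectColumnsTransform.CreateKeep(Env, savedData, keepColumns);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality(dir, name, digitsOfPrecision: digitsOfPrecision);
}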
// Verifies that concatenation produces the expected vector sizes, including a
// variable-size output when any of the inputs is variable-size.
void TestConcat()
{
    string dataPath = GetDataPath("adult.test");

    var source = new MultiFileSource(dataPath);
    var loader = new TextLoader(Env, new TextLoader.Arguments
    {
        Column = new[]
        {
            new TextLoader.Column("float1", DataKind.R4, 0),
            new TextLoader.Column("float4", DataKind.R4, new[] {
                new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
            new TextLoader.Column("float6", DataKind.R4, new[] {
                new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, 12) }),
            // An open-ended range, so this column is loaded as a variable-size vector.
            new TextLoader.Column("vfloat", DataKind.R4, new[] {
                new TextLoader.Range(14, null) { AutoEnd = false, VariableEnd = true } })
        },
        Separator = ",",
        HasHeader = true
    }, source);
    var data = loader.Read(source);

    ColumnType GetType(Schema schema, string name)
    {
        Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
        return schema.GetColumnType(cIdx);
    }

    var pipe = new ColumnConcatenatingEstimator(Env, "f1", "float1")
        .Append(new ColumnConcatenatingEstimator(Env, "f2", "float1", "float1"))
        .Append(new ColumnConcatenatingEstimator(Env, "f3", "float4", "float1"))
        .Append(new ColumnConcatenatingEstimator(Env, "f4", "float6", "vfloat", "float1"));

    data = TakeFilter.Create(Env, data, 10);
    data = pipe.Fit(data).Transform(data);

    // Concatenating fixed-size inputs yields the sum of their sizes; any variable-size
    // input ("vfloat") makes the output variable-size as well, reported as Size == 0.
    ColumnType t;
    t = GetType(data.Schema, "f1");
    Assert.True(t is VectorType vt1 && vt1.ItemType == NumberType.R4 && vt1.Size == 1);
    t = GetType(data.Schema, "f2");
    Assert.True(t is VectorType vt2 && vt2.ItemType == NumberType.R4 && vt2.Size == 2);
    t = GetType(data.Schema, "f3");
    Assert.True(t is VectorType vt3 && vt3.ItemType == NumberType.R4 && vt3.Size == 5);
    t = GetType(data.Schema, "f4");
    Assert.True(t is VectorType vt4 && vt4.ItemType == NumberType.R4 && vt4.Size == 0);

    data = SelectColumnsTransform.CreateKeep(Env, data, new[] { "f1", "f2", "f3", "f4" });

    var subdir = Path.Combine("Transform", "Concat");
    var outputPath = GetOutputPath(subdir, "Concat1.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, Dense = true });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
    }

    CheckEquality(subdir, "Concat1.tsv");
    Done();
}
private void RunCore(IChannel ch)
{
    Host.AssertValue(ch);

    IDataView data = CreateAndSaveLoader();

    // If the user specified a subset of columns, keep only those.
    if (!string.IsNullOrWhiteSpace(Args.Columns))
    {
        var keepColumns = Args.Columns
            .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries).ToArray();
        if (Utils.Size(keepColumns) > 0)
            data = SelectColumnsTransform.CreateKeep(Host, data, keepColumns);
    }

    IDataSaver saver;
    if (Args.Saver != null)
        saver = Args.Saver.CreateComponent(Host);
    else
        saver = new TextSaver(Host, new TextSaver.Arguments() { Dense = Args.Dense });

    // Collect the indices of the columns the saver can handle, skipping hidden
    // columns unless the user asked to keep them.
    var cols = new List<int>();
    for (int i = 0; i < data.Schema.ColumnCount; i++)
    {
        if (!Args.KeepHidden && data.Schema.IsHidden(i))
            continue;
        var type = data.Schema.GetColumnType(i);
        if (saver.IsColumnSavable(type))
            cols.Add(i);
        else
            ch.Info(MessageSensitivity.Schema,
                "The column '{0}' will not be written as it has an unsavable column type.",
                data.Schema.GetColumnName(i));
    }
    Host.NotSensitive().Check(cols.Count > 0, "No valid columns to save");

    // Send the first N lines to console.
    if (Args.Rows > 0)
    {
        var args = new SkipTakeFilter.TakeArguments() { Count = Args.Rows };
        data = SkipTakeFilter.Create(Host, args, data);
    }

    var textSaver = saver as TextSaver;
    // If it is a text saver, use its specialized writing utility.
    if (textSaver != null)
        textSaver.WriteData(data, true, cols.ToArray());
    else
    {
        using (MemoryStream mem = new MemoryStream())
        {
            // Wrap the memory stream so the saver can close the stream handed to it
            // without closing the one we still need to read back below.
            using (Stream wrapStream = new SubsetStream(mem))
                saver.SaveData(wrapStream, data, cols.ToArray());

            mem.Seek(0, SeekOrigin.Begin);
            using (StreamReader reader = new StreamReader(mem))
            {
                string result = reader.ReadToEnd();
                ch.Info(MessageSensitivity.UserData | MessageSensitivity.Schema, result);
            }
        }
    }
}