// Feature-selection workout: fits a word-bag estimator followed by a nested chain of a
// count-based selector and a mutual-information selector on the wikipedia-detox sample,
// writes 4 rows of the two selected columns to featureselection.tsv, and then calls
// CheckEquality for that file (presumably a baseline comparison - confirm).
public void FeatureSelectionWorkout()
{
    string inputPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label and column 1 as text; the file has a header row.
    var trainingData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(inputPath);

    // Same file, but column 1 is loaded as float. This view is not consumed below.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(inputPath);

    // The word bag is constructed first, then the selector sub-chain, to keep the
    // construction order of the single-expression form.
    var wordBag = new WordBagEstimator(Env, "text", "bag_of_words");
    var selectors = new CountFeatureSelector(Env, "bag_of_words", "bag_of_words_count", 10)
        .Append(new MutualInformationFeatureSelector(Env, "bag_of_words", "bag_of_words_mi", labelColumn: "label"));
    var pipeline = wordBag.Append(selectors);

    var resultPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        var dynamicData = trainingData.AsDynamic;
        var model = pipeline.Fit(dynamicData);
        IDataView rows = TakeFilter.Create(Env, model.Transform(dynamicData), 4);
        rows = SelectColumnsTransform.CreateKeep(Env, rows, "bag_of_words_count", "bag_of_words_mi");

        using (var stream = File.Create(resultPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, rows, stream, keepHidden: true);
        }
    }

    CheckEquality("FeatureSelection", "featureselection.tsv");
    Done();
}
// Feature-selection workout: fits a cached word-bag estimator followed by a nested chain of
// count-based and mutual-information feature selection on the wikipedia-detox sample,
// writes 4 rows of the two selected columns to featureselection.tsv, and then calls
// CheckEquality for that file (presumably a baseline comparison - confirm).
public void FeatureSelectionWorkout()
{
    string inputPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label and column 1 as text; the file has a header row.
    var trainingData = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Load(inputPath);

    // Same file, but column 1 is loaded as float. This view is not consumed below.
    var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Load(inputPath);

    // The word bag (with its cache checkpoint) is built before the selector sub-chain,
    // matching the evaluation order of the single-expression form.
    var cachedWordBag = new WordBagEstimator(ML, "bag_of_words", "text")
        .AppendCacheCheckpoint(ML);
    var selectors = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label"));
    var pipeline = cachedWordBag.Append(selectors);

    var resultPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });

        var dynamicData = trainingData.AsDynamic;
        var transformed = pipeline.Fit(dynamicData).Transform(dynamicData);
        var rows = ML.Data.TakeRows(transformed, 4);
        rows = ML.Transforms.SelectColumns("bag_of_words_count", "bag_of_words_mi").Fit(rows).Transform(rows);

        using (var stream = File.Create(resultPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, rows, stream, keepHidden: true);
        }
    }

    CheckEquality("FeatureSelection", "featureselection.tsv");
    Done();
}
// Word-bag workout: fits a WordBagEstimator chained with a WordHashBagEstimator on the
// wikipedia-detox sample, takes 4 rows, keeps the "text", "bag_of_words" and
// "bag_of_wordshash" columns, saves them as text with a header row, and then calls
// CheckEquality for bag_of_words.tsv (presumably a baseline comparison - confirm).
//
// Line structure is significant here: each "//" comment must end at its own line break,
// otherwise it swallows the statements that follow it.
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label and column 1 as text; the file has a header row.
    var data = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    // Same file with column 1 loaded as float; only referenced by the disabled
    // TestEstimatorCore call below.
    var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordBagEstimator(ML, "bag_of_words", "text").
        Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", invertHash: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
    savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "bag_of_words", "bag_of_wordshash" });

    using (var fs = File.Create(outputPath))
    {
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
// Word-bag workout: fits a WordBagEstimator chained with a WordHashBagEstimator on the
// wikipedia-detox sample, takes 4 rows, selects the "text", "bag_of_words" and
// "bag_of_wordshash" columns, saves them as text with a header row, and then calls
// CheckEquality for bag_of_words.tsv (presumably a baseline comparison - confirm).
//
// Line structure is significant here: each "//" comment must end at its own line break,
// otherwise it swallows the statements that follow it.
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a Boolean label and column 1 as a String; header row and
    // quoting are enabled.
    var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.String, 1) },
        hasHeader: true, allowQuoting: true);

    // Same file with column 1 loaded as Single; only referenced by the disabled
    // TestEstimatorCore call below.
    var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.Single, 1) },
        hasHeader: true, allowQuoting: true);

    var est = new WordBagEstimator(ML, "bag_of_words", "text").
        Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", maximumNumberOfInverts: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
    savedData = ML.Transforms.SelectColumns("text", "bag_of_words", "bag_of_wordshash").Fit(savedData).Transform(savedData);

    using (var fs = File.Create(outputPath))
    {
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
// LDA workout: fits a WordBagEstimator chained with a LatentDirichletAllocationEstimator
// (10 as the fourth argument, maximumNumberOfIterations: 10) on the wikipedia-detox sample,
// saves 4 rows of the "topics" column to ldatopics.tsv, and asserts that the first column
// of the saved view is a vector of size 10. The file comparison itself is disabled.
//
// Line structure is significant here: each "//" comment must end at its own line break,
// otherwise it swallows the statements that follow it.
public void LdaWorkout()
{
    // A dedicated environment with a fixed seed is used for the estimators and the saver;
    // the data is still loaded through ML.
    IHostEnvironment env = new MLContext(seed: 42);
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a Boolean label and column 1 as a String; the file has a header row.
    var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.String, 1) },
        hasHeader: true);

    // Same file with column 1 loaded as Single; only referenced by the disabled
    // TestEstimatorCore call below.
    var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.Single, 1) },
        hasHeader: true);

    var est = new WordBagEstimator(env, "bag_of_words", "text").
        Append(new LatentDirichletAllocationEstimator(env, "topics", "bag_of_words", 10, maximumNumberOfIterations: 10,
            resetRandomGenerator: true));

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // NOTE(review): the text above mentions ConsoleEnvironment, but this method uses an
    // MLContext - confirm the issue still applies.
    // TestEstimatorCore(est, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        var transformer = est.Fit(data);
        var transformedData = transformer.Transform(data);
        var savedData = ML.Data.TakeRows(transformedData, 4);
        savedData = ML.Transforms.SelectColumns("topics").Fit(savedData).Transform(savedData);

        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
        }

        // If the column is not a VectorDataViewType the left-hand value is null and the
        // assertion fails.
        Assert.Equal(10, (savedData.Schema[0].Type as VectorDataViewType)?.Size);
    }

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
// LDA workout: fits a WordBagEstimator chained with a LatentDirichletAllocationEstimator
// (10 as the fourth argument, numIterations: 10) on the wikipedia-detox sample, saves
// 4 rows of the "topics" column to ldatopics.tsv, and asserts that the first column of the
// saved view is a vector of size 10. The file comparison itself is disabled.
//
// Line structure is significant here: each "//" comment must end at its own line break,
// otherwise it swallows the statements that follow it.
public void LdaWorkout()
{
    // A dedicated environment (seed: 42, conc: 1) is used for everything in this test.
    IHostEnvironment env = new MLContext(seed: 42, conc: 1);
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label and column 1 as text; the file has a header row.
    var data = TextLoaderStatic.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    // Same file with column 1 loaded as float; only referenced by the disabled
    // TestEstimatorCore call below.
    var invalidData = TextLoaderStatic.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordBagEstimator(env, "text", "bag_of_words").
        Append(new LatentDirichletAllocationEstimator(env, "bag_of_words", "topics", 10, numIterations: 10,
            resetRandomGenerator: true));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // In this test it manifests because of the WordBagEstimator in the estimator chain
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        var transformer = est.Fit(data.AsDynamic);
        var transformedData = transformer.Transform(data.AsDynamic);
        IDataView savedData = TakeFilter.Create(env, transformedData, 4);
        savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });

        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
        }

        // If the column is not a VectorType the left-hand value is null and the assertion fails.
        Assert.Equal(10, (savedData.Schema[0].Type as VectorType)?.Size);
    }

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // NOTE(review): the text above mentions ConsoleEnvironment, but this method uses an
    // MLContext - confirm the issue still applies.
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
// LDA workout: fits a WordBagEstimator chained with an LdaEstimator (10 as the fourth
// argument, NumIterations = 10) on the wikipedia-detox sample, saves 4 rows of the
// "topics" column to ldatopics.tsv, and asserts that column 0 of the saved view has a
// vector size of 10. The file comparison itself is disabled.
//
// Line structure is significant here: each "//" comment must end at its own line break,
// otherwise it swallows the statements that follow it.
public void LdaWorkout()
{
    // NOTE(review): env is never disposed here; if ConsoleEnvironment is IDisposable,
    // consider a using block - confirm.
    var env = new ConsoleEnvironment(seed: 42, conc: 1);
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label and column 1 as text; the file has a header row.
    var data = TextLoader.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Same file with column 1 loaded as float; only referenced by the disabled
    // TestEstimatorCore call below.
    var invalidData = TextLoader.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var est = new WordBagEstimator(env, "text", "bag_of_words").
        Append(new LdaEstimator(env, "bag_of_words", "topics", 10, advancedSettings: s =>
        {
            s.NumIterations = 10;
            s.ResetRandomGenerator = true;
        }));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(env, savedData, "topics");

        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
        }

        Assert.Equal(10, savedData.Schema.GetColumnType(0).VectorSize);
    }

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
// Word-bag workout: fits a WordBagEstimator chained with a WordHashBagEstimator on the
// wikipedia-detox sample, takes 4 rows, keeps the "text", "bag_of_words" and
// "bag_of_wordshash" columns, saves them through a TextSaver, and then calls
// CheckEquality for bag_of_words.tsv (presumably a baseline comparison - confirm).
//
// Line structure is significant here: each "//" comment must end at its own line break,
// otherwise it swallows the statements that follow it.
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label and column 1 as text; the file has a header row.
    var data = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    // Same file with column 1 loaded as float; only referenced by the disabled
    // TestEstimatorCore call below.
    var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordBagEstimator(Env, "text", "bag_of_words").
        Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash", invertHash: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "bag_of_words", "bag_of_wordshash" });

        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
        }
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
// Feature-selection workout: fits a cached word-bag estimator followed by a nested chain of
// count-based and mutual-information feature selection on the wikipedia-detox sample,
// writes 4 rows of the two selected columns to featureselection.tsv, and then calls
// CheckEquality for that file (presumably a baseline comparison - confirm).
public void FeatureSelectionWorkout()
{
    string inputPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a Boolean label and column 1 as a String; header row, quoting
    // and sparse input are enabled.
    var trainingData = ML.Data.LoadFromTextFile(inputPath, new[] {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.String, 1) },
        hasHeader: true, allowQuoting: true, allowSparse: true);

    // Same file, but column 1 is loaded as Single. This view is not consumed below.
    var invalidData = ML.Data.LoadFromTextFile(inputPath, new[] {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.Single, 1) },
        hasHeader: true, allowQuoting: true, allowSparse: true);

    // The word bag (with its cache checkpoint) is built before the selector sub-chain,
    // matching the evaluation order of the single-expression form.
    var cachedWordBag = new WordBagEstimator(ML, "bag_of_words", "text")
        .AppendCacheCheckpoint(ML);
    var selectors = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label"));
    var pipeline = cachedWordBag.Append(selectors);

    var resultPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });

        var transformed = pipeline.Fit(trainingData).Transform(trainingData);
        var rows = ML.Data.TakeRows(transformed, 4);
        rows = ML.Transforms.SelectColumns("bag_of_words_count", "bag_of_words_mi").Fit(rows).Transform(rows);

        using (var stream = File.Create(resultPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, rows, stream, keepHidden: true);
        }
    }

    CheckEquality("FeatureSelection", "featureselection.tsv");
    Done();
}