public void LdaWorkout()
{
    // End-to-end workout of the LDA (Latent Dirichlet Allocation) estimator:
    // bag-of-words featurization followed by topic extraction; the "topics"
    // column is saved to TSV and its dimensionality is checked.
    IHostEnvironment env = new MLContext(seed: 42, conc: 1);
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);
    // Same file but "text" loaded as float — schema-invalid input for the pipeline.
    var invalidData = TextLoaderStatic.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);
    // resetRandomGenerator: true tries to make the LDA output deterministic.
    var est = new WordBagEstimator(env, "bag_of_words", "text").
        Append(new LatentDirichletAllocationEstimator(env, "topics", "bag_of_words", 10, numIterations: 10, resetRandomGenerator: true));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // In this test it manifests because of the WordBagEstimator in the estimator chain
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        var transformer = est.Fit(data.AsDynamic);
        var transformedData = transformer.Transform(data.AsDynamic);
        // Save only the first four rows of the "topics" column.
        var savedData = ML.Data.TakeRows(transformedData, 4);
        savedData = ML.Transforms.SelectColumns("topics").Fit(savedData).Transform(savedData);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);

        // The topics vector must have exactly the requested number of topics (10).
        Assert.Equal(10, (savedData.Schema[0].Type as VectorType)?.Size);
    }

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
/// <summary>
/// Verifies that <c>TextSaver.SavePaymentInfo</c> reports success for a fully
/// populated <c>PaymentInfo</c> instance.
/// </summary>
public void TestSavePaymentInfo()
{
    // Arrange: a payment record with every field populated (placeholder values).
    PaymentInfo paymentInfo = new PaymentInfo
    {
        BSB = "this is the bsb",
        AccountName = "account name is here",
        AccountNumber = "what's the number",
        Reference = "take reference",
        Amount = 1123.45
    };
    TextSaver saver = new TextSaver();

    // Act
    BpActionResult result = saver.SavePaymentInfo(paymentInfo);

    // Assert: use the boolean-specific assertion rather than
    // Assert.AreEqual(true, ...) — clearer intent and failure message.
    Assert.IsTrue(result.Success);
}
public void LdaWorkout()
{
    // Older-API variant of the LDA workout: bag-of-words followed by the LDA
    // estimator configured through advancedSettings; topics saved and checked.
    var env = new ConsoleEnvironment(seed: 42, conc: 1);
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);
    // Same file but "text" loaded as float — schema-invalid input for the pipeline.
    var invalidData = TextLoader.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);
    // ResetRandomGenerator = true tries to make the LDA output deterministic.
    var est = new WordBagEstimator(env, "text", "bag_of_words").
        Append(new LdaEstimator(env, "bag_of_words", "topics", 10, advancedSettings: s =>
        {
            s.NumIterations = 10;
            s.ResetRandomGenerator = true;
        }));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        // Save only the first four rows of the "topics" column.
        IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);

        // The topics vector must have exactly the requested number of topics (10).
        Assert.Equal(10, (savedData.Schema.GetColumnType(0) as VectorType)?.Size);
    }

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
public void NAReplaceStatic()
{
    // Workout for missing-value replacement via the static API: replaces NAs in
    // scalar/vector float and double columns using different replacement modes,
    // then saves four rows and compares against the checked-in baseline.
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoader.CreateReader(Env, ctx => (
        ScalarFloat: ctx.LoadFloat(1),
        ScalarDouble: ctx.LoadDouble(1),
        VectorFloat: ctx.LoadFloat(1, 4),
        VectorDoulbe: ctx.LoadDouble(1, 4)
    ));
    var data = reader.Read(new MultiFileSource(dataPath));
    // In-memory rows with a mismatched schema — used as the invalid-input case.
    var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } };
    var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);
    // One output column per replacement mode: Maximum, Mean (scalar and vector), Minimum.
    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarFloat.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Maximum),
            B: row.ScalarDouble.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Mean),
            C: row.VectorFloat.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Mean),
            D: row.VectorDoulbe.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Minimum)
        ));
    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("NAReplace", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the replaced columns.
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "A", "B", "C", "D");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("NAReplace", "featurized.tsv");
    Done();
}
void TestDifferentTypes()
{
    // Verifies ValueToKeyMapping (term mapping) over many input types —
    // scalar/vector float, double, int, and text — saving the mapped output
    // and comparing against the checked-in baseline.
    string dataPath = GetDataPath("adult.test");
    var loader = new TextLoader(Env, new TextLoader.Arguments
    {
        Column = new[]
        {
            new TextLoader.Column("float1", DataKind.R4, 0),
            new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
            new TextLoader.Column("double1", DataKind.R8, 0),
            new TextLoader.Column("double4", DataKind.R8, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
            new TextLoader.Column("int1", DataKind.I4, 0),
            new TextLoader.Column("text1", DataKind.TX, 1),
            new TextLoader.Column("text2", DataKind.TX, new[] { new TextLoader.Range(1), new TextLoader.Range(3) }),
        },
        Separator = ",",
        HasHeader = true
    }, new MultiFileSource(dataPath));
    // One term-mapping column per loaded input column.
    var pipe = new ValueToKeyMappingEstimator(Env, new[]
    {
        new ValueToKeyMappingTransformer.ColumnInfo("float1", "TermFloat1"),
        new ValueToKeyMappingTransformer.ColumnInfo("float4", "TermFloat4"),
        new ValueToKeyMappingTransformer.ColumnInfo("double1", "TermDouble1"),
        new ValueToKeyMappingTransformer.ColumnInfo("double4", "TermDouble4"),
        new ValueToKeyMappingTransformer.ColumnInfo("int1", "TermInt1"),
        new ValueToKeyMappingTransformer.ColumnInfo("text1", "TermText1"),
        new ValueToKeyMappingTransformer.ColumnInfo("text2", "TermText2")
    });
    var data = loader.Read(dataPath);
    // Keep only the first 10 rows so the baseline stays small.
    data = TakeFilter.Create(Env, data, 10);

    var outputPath = GetOutputPath("Term", "Term.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, pipe.Fit(data).Transform(data), fs, keepHidden: true);
    }
    CheckEquality("Term", "Term.tsv");
    Done();
}
public void KeyToValueWorkout()
{
    // Workout for KeyToValueMapping: maps strings to keys, then maps the keys
    // back to their original values, and verifies invalid inputs are rejected.
    string dataPath = GetDataPath("iris.txt");
    var reader = new TextLoader(Env, new TextLoader.Options
    {
        Columns = new[]
        {
            new TextLoader.Column("ScalarString", DataKind.String, 1),
            new TextLoader.Column("VectorString", DataKind.String, new[] { new TextLoader.Range(1, 4) }),
            // A raw key column (cardinality 6) with no text key-value mapping.
            new TextLoader.Column("BareKey", DataKind.UInt32, new[] { new TextLoader.Range(0) }, new KeyCount(6))
        }
    });
    var data = reader.Read(dataPath);
    // Map "A" and "B" to key columns so they can be mapped back below.
    data = new ValueToKeyMappingEstimator(Env, new[]
    {
        new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"),
        new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString")
    }).Fit(data).Transform(data);
    // Column copies that give the estimator's inputs incompatible types —
    // used as the invalid-input cases below.
    var badData1 = new ColumnCopyingTransformer(Env, ("A", "BareKey")).Transform(data);
    var badData2 = new ColumnCopyingTransformer(Env, ("B", "VectorString")).Transform(data);

    var est = new KeyToValueMappingEstimator(Env, ("A_back", "A"), ("B_back", "B"));
    TestEstimatorCore(est, data, invalidInput: badData1);
    TestEstimatorCore(est, data, invalidInput: badData2);

    var outputPath = GetOutputPath("KeyToValue", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = est.Fit(data).Transform(data);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("KeyToValue", "featurized.tsv");
    Done();
}
void TestDifferentTypes()
{
    // Newer-API variant of the term-mapping type workout, using MLContext (ML)
    // and the adult.tiny dataset with tab separators.
    string dataPath = GetDataPath("adult.tiny.with-schema.txt");
    var loader = new TextLoader(ML, new TextLoader.Options
    {
        Columns = new[]
        {
            new TextLoader.Column("float1", DataKind.Single, 9),
            new TextLoader.Column("float4", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }),
            new TextLoader.Column("double1", DataKind.Double, 9),
            new TextLoader.Column("double4", DataKind.Double, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }),
            new TextLoader.Column("int1", DataKind.Int32, 9),
            new TextLoader.Column("text1", DataKind.String, 1),
            new TextLoader.Column("text2", DataKind.String, new[] { new TextLoader.Range(1), new TextLoader.Range(2) }),
        },
        Separator = "\t",
        HasHeader = true
    }, new MultiFileSource(dataPath));
    // One term-mapping column per loaded input column.
    var pipe = new ValueToKeyMappingEstimator(ML, new[]
    {
        new ValueToKeyMappingEstimator.ColumnInfo("TermFloat1", "float1"),
        new ValueToKeyMappingEstimator.ColumnInfo("TermFloat4", "float4"),
        new ValueToKeyMappingEstimator.ColumnInfo("TermDouble1", "double1"),
        new ValueToKeyMappingEstimator.ColumnInfo("TermDouble4", "double4"),
        new ValueToKeyMappingEstimator.ColumnInfo("TermInt1", "int1"),
        new ValueToKeyMappingEstimator.ColumnInfo("TermText1", "text1"),
        new ValueToKeyMappingEstimator.ColumnInfo("TermText2", "text2")
    });
    var data = loader.Read(dataPath);
    // Keep only the first 10 rows so the baseline stays small.
    data = ML.Data.TakeRows(data, 10);

    var outputPath = GetOutputPath("Term", "Term.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, pipe.Fit(data).Transform(data), fs, keepHidden: true);
    }
    CheckEquality("Term", "Term.tsv");
    Done();
}
/// <summary>
/// Writes only the schema of <paramref name="idv"/> (no data rows) to a text file.
/// </summary>
/// <param name="idv">Data view whose schema is saved.</param>
/// <param name="path">Destination file path.</param>
/// <param name="host">Host used to construct the saver.</param>
private static void SaveIdvSchemaToFile(IDataView idv, string path, IHost host)
{
    // An empty data view carries the schema without any rows, so only the
    // schema block is emitted by the text saver.
    var emptyDataView = new EmptyDataView(host, idv.Schema);
    var saverArgs = new TextSaver.Arguments
    {
        OutputHeader = false,
        OutputSchema = true,
        Dense = true
    };
    IDataSaver saver = new TextSaver(host, saverArgs);
    // Fix: File.Create truncates an existing file. File.OpenWrite does not,
    // so a shorter rewrite would leave stale trailing bytes from the old file.
    using (var fs = File.Create(path))
    {
        // Save every visible, savable column in schema order.
        saver.SaveData(fs, emptyDataView, Utils.GetIdentityPermutation(emptyDataView.Schema.Count)
            .Where(x => !emptyDataView.Schema[x].IsHidden && saver.IsColumnSavable(emptyDataView.Schema[x].Type))
            .ToArray());
    }
}
public void NAIndicatorFileOutput()
{
    // Workout for the missing-value indicator transform (static loader variant):
    // featurize, save four rows, and compare against the checked-in baseline.
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateLoader(ML, ctx => (
        ScalarFloat: ctx.LoadFloat(1),
        ScalarDouble: ctx.LoadDouble(1),
        VectorFloat: ctx.LoadFloat(1, 4),
        VectorDoulbe: ctx.LoadDouble(1, 4)
    ));
    var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic;
    // In-memory rows with an incompatible schema — the invalid-input case.
    var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } };
    var invalidData = ML.Data.LoadFromEnumerable(wrongCollection);
    var est = ML.Transforms.IndicateMissingValues(new[]
    {
        new InputOutputColumnPair("A", "ScalarFloat"),
        new InputOutputColumnPair("B", "ScalarDouble"),
        new InputOutputColumnPair("C", "VectorFloat"),
        new InputOutputColumnPair("D", "VectorDoulbe")
    });
    TestEstimatorCore(est, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("NAIndicator", "featurized.tsv");
    using (var ch = ((IHostEnvironment)ML).Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        // Save only the first four transformed rows.
        var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("NAIndicator", "featurized.tsv");
    Done();
}
public void NAIndicatorFileOutput()
{
    // Workout for the missing-value indicator transform (LoadFromTextFile variant):
    // featurize, save four rows, and compare against the checked-in baseline.
    string dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var data = ML.Data.LoadFromTextFile(dataPath, new[]
    {
        new TextLoader.Column("ScalarFloat", DataKind.Single, 1),
        new TextLoader.Column("ScalarDouble", DataKind.Double, 1),
        new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4),
        new TextLoader.Column("VectorDoulbe", DataKind.Double, 1, 4)
    });
    // In-memory rows with an incompatible schema — the invalid-input case.
    var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } };
    var invalidData = ML.Data.LoadFromEnumerable(wrongCollection);
    var est = ML.Transforms.IndicateMissingValues(new[]
    {
        new InputOutputColumnPair("A", "ScalarFloat"),
        new InputOutputColumnPair("B", "ScalarDouble"),
        new InputOutputColumnPair("C", "VectorFloat"),
        new InputOutputColumnPair("D", "VectorDoulbe")
    });
    TestEstimatorCore(est, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("NAIndicator", "featurized.tsv");
    using (var ch = ((IHostEnvironment)ML).Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        // Save only the first four transformed rows.
        var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("NAIndicator", "featurized.tsv");
    Done();
}
public void CategoricalStatic()
{
    // Workout for one-hot encoding via the static API, covering the output
    // kinds: indicator (Ind), bag (Bag), and binary (Bin), on scalar and
    // vector text columns.
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Read(dataPath);
    // In-memory rows with an incompatible schema — the invalid-input case.
    var wrongCollection = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };
    var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);
    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind),
            B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
            C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
            D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin),
            E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin)
        ));
    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Categorical", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the encoded columns.
        var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
        var view = new ColumnSelectingTransformer(Env, new string[] { "A", "B", "C", "D", "E" }, null, false).Transform(savedData);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
    }
    CheckEquality("Categorical", "featurized.tsv");
    Done();
}
public void CategoricalHashStatic()
{
    // Workout for hash-based one-hot encoding via the static API, covering the
    // indicator (Ind), bag (Bag), and binary (Bin) output kinds on scalar and
    // vector text columns.
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoader.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Read(dataPath);
    // In-memory rows with an incompatible schema — the invalid-input case.
    var wrongCollection = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } };
    var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);
    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
            B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
            C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
            D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
            E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin)
        ));
    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the encoded columns.
        var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
        var view = SelectColumnsTransform.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D", "E" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
    }
    CheckEquality("CategoricalHash", "featurized.tsv");
    Done();
}
public void NormalizerWithOnFit()
{
    // Exercises normalizer onFit callbacks: each normalizer reports its fitted
    // statistics (CDF means, mean/var scales, bin upper bounds) through a
    // callback that runs when Fit is called.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("generated_regression_dataset.csv");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => c.LoadFloat(0, 2),
        separator: ';', hasHeader: true);
    var data = reader.Read(dataSource);

    // These will be populated once we call fit.
    ImmutableArray<float> mm;
    ImmutableArray<float> ss;
    ImmutableArray<ImmutableArray<float>> bb;

    var est = reader.MakeNewEstimator()
        .Append(r => (r,
            ncdf: r.NormalizeByCumulativeDistribution(onFit: (m, s) => mm = m),
            // The mean/var offsets are expected to be empty here.
            n: r.NormalizeByMeanVar(onFit: (s, o) => { ss = s; Assert.Empty(o); }),
            b: r.NormalizeByBinning(onFit: b => bb = b)));
    var tdata = est.Fit(data).Transform(data);

    // Three input columns were loaded, so each statistic has three entries.
    Assert.Equal(3, mm.Length);
    Assert.Equal(3, ss.Length);
    Assert.Equal(3, bb.Length);

    // Just for fun, let's also write out some of the lines of the data to the console.
    using (var stream = new MemoryStream())
    {
        IDataView v = new ChooseColumnsTransform(env, tdata.AsDynamic, "r", "ncdf", "n", "b");
        v = TakeFilter.Create(env, v, 10);
        var saver = new TextSaver(env, new TextSaver.Arguments()
        {
            Dense = true,
            Separator = ",",
            OutputHeader = false
        });
        saver.SaveData(stream, v, Utils.GetIdentityPermutation(v.Schema.ColumnCount));
        Console.WriteLine(Encoding.UTF8.GetString(stream.ToArray()));
    }
}
public void OldKeyTypeCodecTest()
{
    // Checks that we can load IDataViews defined with unknown cardinality KeyType.
    // schema-codec-test.idv was generated with the following command before simplifying the KeyType:
    // dotnet MML.dll savedata loader=text{col=A:U4[0-2]:0 col=B:U4[0-5]:0 col=C:U1[0-10]:0 col=D:U2[0-*]:0 col=E:U4[0-*]:0 col=F:U8[0-*]:0} dout=codectest.idv
    var data = ML.Data.ReadFromBinary(GetDataPath("schema-codec-test.idv"));
    var outputPath = GetOutputPath("BinaryLoaderSaver", "OldKeyTypeCodecTest.txt");
    // Round-trip the binary data back out as text and compare it to the baseline.
    // Fix: removed an unused TextSaver local — saving goes through
    // ML.Data.SaveAsText, so the saver was dead code.
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
            ML.Data.SaveAsText(data, fs);
    }
    CheckEquality("BinaryLoaderSaver", "OldKeyTypeCodecTest.txt");
    Done();
}
/// <summary>
/// Save the data view as text.
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="data">The data view to save.</param>
/// <param name="stream">The stream to write to.</param>
/// <param name="separatorChar">The column separator.</param>
/// <param name="headerRow">Whether to write the header row.</param>
/// <param name="schema">Whether to write the header comment with the schema.</param>
/// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
public static void SaveAsText(this DataOperationsCatalog catalog,
    IDataView data,
    Stream stream,
    char separatorChar = TextLoader.DefaultArguments.Separator,
    bool headerRow = TextLoader.DefaultArguments.HasHeader,
    bool schema = true,
    bool keepHidden = false)
{
    // Validate all reference arguments up front.
    Contracts.CheckValue(catalog, nameof(catalog));
    Contracts.CheckValue(data, nameof(data));
    Contracts.CheckValue(stream, nameof(stream));

    var host = catalog.GetEnvironment();
    // Translate the caller-friendly parameters into TextSaver arguments.
    var saverArgs = new TextSaver.Arguments
    {
        Separator = separatorChar.ToString(),
        OutputHeader = headerRow,
        OutputSchema = schema
    };
    var textSaver = new TextSaver(host, saverArgs);

    using (var ch = host.Start("Saving data"))
        DataSaverUtils.SaveDataView(ch, textSaver, data, stream, keepHidden);
}
public void LambdaTransformCreate()
{
    // Verifies a lambda-based filter can be created, re-applied to fresh data,
    // and that its output can be saved with TextSaver.
    var env = new MLContext(seed: 42);
    var data = ReadBreastCancerExamples();
    var idv = env.CreateDataView(data);

    // Keep only rows with Label == 0; the state object is unused (null).
    var filter = LambdaTransform.CreateFilter<BreastCancerExample, object>(env, idv,
        (input, state) => input.Label == 0, null);

    // The filter cannot report a row count up front.
    Assert.Null(filter.GetRowCount());

    // test re-apply
    var applied = env.CreateDataView(data);
    applied = ApplyTransformUtils.ApplyAllTransformsToData(env, filter, applied);

    var saver = new TextSaver(env, new TextSaver.Arguments());
    // Save just the Label column of the filtered output.
    Assert.True(applied.Schema.TryGetColumnIndex("Label", out int label));
    using (var fs = File.Create(GetOutputPath(OutputRelativePath, "lambda-output.tsv")))
        saver.SaveData(fs, applied, label);
}
/// <summary>
/// Removes all occurrences of a symbol from the text.
/// </summary>
/// <param name="deleteSymblol">The symbol (substring) to delete.</param>
/// <param name="text">The text container to modify in place.</param>
public static void Delete(string deleteSymblol, ref TextSaver text)
{
    int count = 0; // Index of the current word within text.Words.
    bool isExist = false; // Whether the symbol was found in any word.
    // NOTE(review): this writes into text.Words while enumerating it — assumes
    // Words is an indexable collection (e.g. an array) whose enumerator is not
    // invalidated by element assignment; confirm against the TextSaver type.
    foreach (string words in text.Words)
    {
        if (words.Contains(deleteSymblol))
        {
            isExist = true;
            text.Words[count] = words.Replace(deleteSymblol, "");
        }
        count++;
    }
    // Presumably re-synchronizes derived state with the modified words — TODO confirm.
    text.Synchro();
    if (!isExist)
    {
        Console.WriteLine($"This file does not have this \"{deleteSymblol}\"");
    }
}
public void TestWordEmbeddings()
{
    // Workout for pretrained word embeddings (SSWE model): normalize the text,
    // tokenize, strip default stop words, embed, and compare the saved output
    // against the checked-in baseline.
    var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
    var data = new TextLoader(Env, new TextLoader.Arguments()
    {
        Separator = "\t",
        HasHeader = true,
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.BL, 0),
            new TextLoader.Column("SentimentText", DataKind.Text, 1)
        }
        // Fix: dataPath is already resolved by GetDataPath above; the original
        // redundantly wrapped it in a second GetDataPath call.
    }).Read(dataPath);

    var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
        .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
        .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
    var words = est.Fit(data).Transform(data);

    var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe);
    // The raw (untokenized) data is schema-invalid input for the embedding step.
    TestEstimatorCore(pipe, words, invalidInput: data);

    var outputPath = GetOutputPath("Text", "wordEmbeddings.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the embeddings column.
        IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("Text", "wordEmbeddings.tsv");
    Done();
}
public void TokenizeWithSeparators()
{
    // Tokenizes text on a custom separator set and compares the saved tokens
    // against the checked-in baseline.
    string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateLoader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Load(dataPath).AsDynamic;

    // Split on space and common punctuation characters.
    var est = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });
    // Keep only the first four rows and the "words" output column.
    var outdata = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
    var savedData = ML.Transforms.SelectColumns("words").Fit(outdata).Transform(outdata);

    var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
/// <summary>
/// Saves a data view to <paramref name="path"/>, choosing the saver from the
/// file extension: text for .csv/.tsv/.txt, binary IDV (plus a sibling
/// ".schema" text file) for anything else.
/// </summary>
/// <param name="idv">Data view to save.</param>
/// <param name="path">Destination path; <c>STDNULL</c> suppresses output.</param>
/// <param name="host">Host used to construct the savers.</param>
private static void SaveIdvToFile(IDataView idv, string path, IHost host)
{
    // A null-device path means the caller asked for no output.
    if (path == STDNULL)
    {
        return;
    }
    var extension = Path.GetExtension(path);
    IDataSaver saver;
    if (extension != ".csv" && extension != ".tsv" && extension != ".txt")
    {
        // Non-text extension: binary format, with the schema written
        // separately as a ".schema" text file next to the output.
        saver = new BinarySaver(host, new BinarySaver.Arguments());
        var schemaFilePath = Path.GetDirectoryName(path) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(path) + ".schema";
        SaveIdvSchemaToFile(idv, schemaFilePath, host);
    }
    else
    {
        var saverArgs = new TextSaver.Arguments
        {
            OutputHeader = true,
            OutputSchema = true,
            Dense = true,
            // TextSaver accepts named separators; commas only for .csv.
            Separator = extension == ".csv" ? "comma" : "tab"
        };
        saver = new TextSaver(host, saverArgs);
    }
    // Fix: File.Create truncates an existing file. File.OpenWrite does not,
    // so a shorter rewrite would leave stale trailing bytes from the old file.
    using (var fs = File.Create(path))
    {
        // Save every visible, savable column in schema order.
        saver.SaveData(fs, idv, Utils.GetIdentityPermutation(idv.Schema.Count)
            .Where(x => !idv.Schema[x].IsHidden && saver.IsColumnSavable(idv.Schema[x].Type))
            .ToArray());
    }
}
public void PcaWorkout()
{
    // Workout for the PCA estimator: project an 11-dimensional feature vector
    // down to rank 5 and compare the saved projection against the baseline.
    var env = new ConsoleEnvironment(seed: 1, conc: 1);
    string dataSource = GetDataPath("generated_regression_dataset.csv");
    var data = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));
    // Same columns but features loaded as text — schema-invalid input.
    var invalidData = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    // Fixed seed to keep the projection reproducible.
    var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => { s.Seed = 1; });

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("PCA", "pca.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        // Save only the first four rows of the "pca" column.
        IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(env, savedData, "pca");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("PCA", "pca.tsv");
    Done();
}
public void NgramWorkout()
{
    // Workout for the older n-gram pipeline: tokenize, term-map, then extract
    // n-grams by dictionary and by hashing; output compared to the baseline.
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));
    // Same file but "text" loaded as float — schema-invalid input.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    var est = new WordTokenizer(Env, "text", "text")
        .Append(new TermEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the intermediate and final columns.
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("Text", "ngrams.tsv");
    Done();
}
public void TextFeaturizerWorkout()
{
    // Workout for the full text featurizer, also emitting the tokens column;
    // output rows are compared against the checked-in baseline.
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.String, 1)
    }, hasHeader: true, allowQuoting: true);
    // Same file but "text" as float — schema-invalid input for the featurizer.
    var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.Single, 1)
    }, hasHeader: true, allowQuoting: true);

    var feat = ML.Transforms.Text.FeaturizeText("Data", new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens" }, new[] { "text" });
    TestEstimatorCore(feat, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var ch = ((IHostEnvironment)ML).Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the feature and token columns.
        var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data), 4);
        savedData = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(savedData).Transform(savedData);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("Text", "featurized.tsv");
    Done();
}
/// <summary>
/// Save schema associations of role/column-name in <paramref name="rep"/>.
/// </summary>
internal static void SaveRoleMappings(IHostEnvironment env, IChannel ch, RoleMappedSchema schema, RepositoryWriter rep)
{
    // REVIEW: Should we also save this stuff, for instance, in some portion of the
    // score command or transform?
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(schema);

    ArrayDataViewBuilder builder = new ArrayDataViewBuilder(env);

    // Collect the (role, column) pairs into two parallel lists.
    List<string> rolesList = new List<string>();
    List<string> columnNamesList = new List<string>();
    // OrderBy is stable, so there is no danger in it "reordering" columns
    // when a role is filled by multiple columns.
    foreach (var role in schema.GetColumnRoleNames().OrderBy(r => r.Key.Value))
    {
        rolesList.Add(role.Key.Value);
        columnNamesList.Add(role.Value);
    }
    // Build a two-column data view: the role name and the column filling it.
    builder.AddColumn("Role", rolesList.ToArray());
    builder.AddColumn("Column", columnNamesList.ToArray());

    using (var entry = rep.CreateEntry(DirTrainingInfo, RoleMappingFile))
    {
        // REVIEW: It seems very important that we have the role mappings
        // be easily human interpretable and even manipulable, but relying on the
        // text saver/loader means that special characters like '\n' won't be reinterpretable.
        // On the other hand, no one is such a big lunatic that they will actually
        // ever go ahead and do something so stupid as that.
        var saver = new TextSaver(env, new TextSaver.Arguments() { Dense = true, Silent = true });
        var view = builder.GetDataView();
        saver.SaveData(entry.Stream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
    }
}
public void TextFeaturizerWorkout()
{
    // Static-API variant of the text featurizer workout; produces the "Data"
    // and "OutputTokens" columns and compares them against the baseline.
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Load(sentimentDataPath);
    // Same file but "text" loaded as float — schema-invalid input.
    var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Load(sentimentDataPath)
        .AsDynamic;

    var feat = data.MakeNewEstimator()
        .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens", }));

    TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var ch = ((IHostEnvironment)ML).Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        // Save only the first four rows of the feature and token columns.
        var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data).AsDynamic, 4);
        savedData = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(savedData).Transform(savedData);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    CheckEquality("Text", "featurized.tsv");
    Done();
}
// Writer state for a single save operation: captures the parent saver's
// configuration and allocates the per-column/per-slot bookkeeping buffers.
public State(TextSaver parent, TextWriter writer, ValueWriter[] pipes, bool hasHeader)
{
    Contracts.AssertValue(parent);
    Contracts.AssertValue(parent._host);
    _host = parent._host;
    _host.AssertValue(writer);
    _host.AssertValue(pipes);

    // Copy the parent's formatting configuration.
    _dense = parent._forceDense;
    _sepChar = parent._sepChar;
    _sepStr = parent._sepStr;

    _writer = writer;
    _pipes = pipes;
    // Only emit a header when the caller asks for one AND the saver is configured to.
    _hasHeader = hasHeader && parent._outputHeader;

    // Per-column arrays sized _pipes.Length + 1 — presumably to hold cumulative
    // boundaries; confirm against the methods that fill them.
    _mpcoldst = new int[_pipes.Length + 1];
    _mpcolslot = new int[_pipes.Length + 1];
    // Character and per-slot scratch buffers; initial sizes appear to be
    // starting capacities — TODO confirm whether they grow on demand.
    _rgch = new char[1024];
    _mpslotdst = new int[128];
    _mpslotichLim = new int[128];
}
public void FeatureSelectionWorkout()
{
    // Exercises bag-of-words followed by count-based and mutual-information-based
    // feature selection, then compares a saved slice against the checked-in baseline.
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var data = ML.Data.LoadFromTextFile(dataPath, new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.String, 1)
    }, hasHeader: true, allowQuoting: true, allowSparse: true);

    // NOTE: the original method also built an `invalidData` loader (text typed as
    // Single) that was never consumed — no TestEstimatorCore call exists here —
    // so that dead local has been removed.

    var est = new WordBagEstimator(ML, "bag_of_words", "text")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10)
            .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label")));

    var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        // Fit/transform, keep only the first 4 rows and the two selected-feature columns.
        var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
        savedData = ML.Transforms.SelectColumns("bag_of_words_count", "bag_of_words_mi").Fit(savedData).Transform(savedData);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("FeatureSelection", "featureselection.tsv");
    Done();
}
public void NgramWorkout()
{
    // Valid data is (bool label, string text); the invalid variant types text as
    // float so the estimator chain's schema validation can be exercised.
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var data = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataPath);

    var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataPath);

    // Tokenize, map tokens to keys, then extract n-grams both directly and hashed.
    var est = new WordTokenizingEstimator(Env, "text", "text")
        .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
        .Append(new NgramExtractingEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashingEstimator(Env, "terms", "ngramshash"));

    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView savedData = TakeFilter.Create(Env, transformed, 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
public void WordBagWorkout()
{
    // Valid data is (bool label, string text); the invalid variant types text as
    // float (kept for parity with sibling tests, pending the issue noted below).
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var data = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataPath);

    var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataPath);

    // Build both a plain and a hashed bag-of-words representation.
    var est = new WordBagEstimator(Env, "text", "bag_of_words")
        .Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash", invertHash: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView savedData = TakeFilter.Create(Env, transformed, 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "bag_of_words", "bag_of_wordshash" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
public void LpGcNormAndWhiteningWorkout()
{
    // Use one seeded environment throughout. The original mixed the local seeded
    // `env` (used to load data and build the pipeline) with the fixture `Env`
    // (used for the save section), making the seed configuration inconsistent
    // within a single test.
    var env = new ConsoleEnvironment(seed: 0);
    string dataSource = GetDataPath("generated_regression_dataset.csv");

    var data = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    // Invalid variant: features typed as text, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(env,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(new MultiFileSource(dataSource));

    // Lp-normalization, global contrast normalization, and whitening over the same input.
    var est = new LpNormalizer(env, "features", "lpnorm")
        .Append(new GlobalContrastNormalizer(env, "features", "gcnorm"))
        .Append(new Whitening(env, "features", "whitened"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "lpnorm_gcnorm_whitened.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(env, savedData, "lpnorm", "gcnorm", "whitened");
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    // Reduced precision because whitening involves an eigendecomposition whose
    // low-order digits can vary across platforms.
    CheckEquality("Text", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
    Done();
}