public static Output Split(IHostEnvironment env, Input input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(ModuleName);
    host.CheckValue(input, nameof(input));
    host.Check(0 < input.Fraction && input.Fraction < 1, "The fraction must be in the interval (0,1).");
    EntryPointUtils.CheckInputArgs(host, input);

    var data = input.Data;
    var stratCol = SplitUtils.CreateStratificationColumn(host, ref data, input.StratificationColumn);

    IDataView trainData = new RangeFilter(host,
        new RangeFilter.Options { Column = stratCol, Min = 0, Max = input.Fraction, Complement = false }, data);
    trainData = ColumnSelectingTransformer.CreateDrop(host, trainData, stratCol);

    IDataView testData = new RangeFilter(host,
        new RangeFilter.Options { Column = stratCol, Min = 0, Max = input.Fraction, Complement = true }, data);
    testData = ColumnSelectingTransformer.CreateDrop(host, testData, stratCol);

    return new Output { TrainData = trainData, TestData = testData };
}
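// Note on the pattern above: the same RangeFilter bounds are applied twice with
// Complement flipped, so every row lands in exactly one of the two outputs, and the
// synthetic stratification column is dropped from both views before they are returned.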
public void FeatureSelectionWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordBagEstimator(ML, "text", "bag_of_words")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words", "bag_of_words_count", 10)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words", "bag_of_words_mi", labelColumn: "label")));

    var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "bag_of_words_count", "bag_of_words_mi" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("FeatureSelection", "featureselection.tsv");
    Done();
}
/// <summary>
/// Split the dataset into the train set and test set according to the given fraction.
/// Respects the <paramref name="samplingKeyColumnName"/> if provided.
/// </summary>
/// <param name="data">The dataset to split.</param>
/// <param name="testFraction">The fraction of data to go into the test set.</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[TrainTestSplit](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/TrainTestSplit.cs)]
/// ]]>
/// </format>
/// </example>
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
{
    _env.CheckValue(data, nameof(data));
    _env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
    _env.CheckValueOrNull(samplingKeyColumnName);

    var splitColumn = CreateSplitColumn(_env, ref data, samplingKeyColumnName, seed, fallbackInEnvSeed: true);

    var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
    {
        Column = splitColumn,
        Min = 0,
        Max = testFraction,
        Complement = true
    }, data);
    var testFilter = new RangeFilter(_env, new RangeFilter.Options()
    {
        Column = splitColumn,
        Min = 0,
        Max = testFraction,
        Complement = false
    }, data);

    var trainDV = ColumnSelectingTransformer.CreateDrop(_env, trainFilter, splitColumn);
    var testDV = ColumnSelectingTransformer.CreateDrop(_env, testFilter, splitColumn);

    return new TrainTestData(trainDV, testDV);
}
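// A minimal caller sketch, assuming an MLContext named mlContext and an already-loaded
// IDataView named dataView; the sampling key column name "Group" is hypothetical, and the
// TrainTestData property names (TrainSet/TestSet) may differ across ML.NET versions.
var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2,
    samplingKeyColumnName: "Group", seed: 42);
IDataView train = split.TrainSet;
IDataView test = split.TestSet;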
/// <summary>
/// Splits the data based on the splitColumn, and drops that column as it is only
/// intended to be used for splitting the data, and shouldn't be part of the output schema.
/// </summary>
internal static IEnumerable<TrainTestData> CrossValidationSplit(IHostEnvironment env, IDataView data, string splitColumn, int numberOfFolds = 5)
{
    env.CheckValue(splitColumn, nameof(splitColumn));
    for (int fold = 0; fold < numberOfFolds; fold++)
    {
        var trainFilter = new RangeFilter(env, new RangeFilter.Options
        {
            Column = splitColumn,
            Min = (double)fold / numberOfFolds,
            Max = (double)(fold + 1) / numberOfFolds,
            Complement = true,
            IncludeMin = true,
            IncludeMax = true,
        }, data);
        var testFilter = new RangeFilter(env, new RangeFilter.Options
        {
            Column = splitColumn,
            Min = (double)fold / numberOfFolds,
            Max = (double)(fold + 1) / numberOfFolds,
            Complement = false,
            IncludeMin = true,
            IncludeMax = true
        }, data);

        var trainDV = ColumnSelectingTransformer.CreateDrop(env, trainFilter, splitColumn);
        var testDV = ColumnSelectingTransformer.CreateDrop(env, testFilter, splitColumn);

        yield return new TrainTestData(trainDV, testDV);
    }
}
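// Caller sketch (hypothetical: the method is internal, so this assumes code in the same
// assembly, with env, data, and a splitColumn produced by CreateSplitColumn already in scope).
foreach (var fold in CrossValidationSplit(env, data, splitColumn, numberOfFolds: 5))
{
    // Each fold's test set covers one fifth of the split column's [0, 1] range;
    // its train set is the complement, so the folds partition the data.
}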
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = ML.Transforms.Text.NormalizeText("text")
        .Append(ML.Transforms.Text.TokenizeWords("words", "text"))
        .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
        .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));

    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
    savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "NoDefaultStopwords", "NoStopWords" });
    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}
public void TokenizeWithSeparators()
{
    string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataPath).AsDynamic;

    var est = new WordTokenizingEstimator(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
    var outdata = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
    var savedData = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

    var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordBagEstimator(ML, "bag_of_words", "text")
        .Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", invertHash: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
    savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "bag_of_words", "bag_of_wordshash" });
    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
public void LpNormWorkout()
{
    string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var data = TextLoaderStatic.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(dataSource);

    var invalidData = TextLoaderStatic.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(dataSource);

    var est = ML.Transforms.Projection.LpNormalize("lpNorm1", "features")
        .Append(ML.Transforms.Projection.LpNormalize("lpNorm2", "features", normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, subMean: true));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "lpNorm1", "lpNorm2" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "lpNorm.tsv");
    Done();
}
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordTokenizingEstimator(ML, "text", "text")
        .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
        .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
        .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
    savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });
    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
public void WhiteningWorkout()
{
    string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var data = TextLoader.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(dataSource);

    var invalidData = TextLoader.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(dataSource);

    var est = new VectorWhiteningEstimator(ML, "features", "whitened1")
        .Append(new VectorWhiteningEstimator(ML, "features", "whitened2", kind: WhiteningKind.Pca, pcaNum: 5));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "whitened1", "whitened2" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
    Done();
}
public void GcnWorkout()
{
    string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var data = TextLoader.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(dataSource);

    var invalidData = TextLoader.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
            separator: ';', hasHeader: true)
        .Read(dataSource);

    var est = new GlobalContrastNormalizingEstimator(ML, "features", "gcnNorm1")
        .Append(new GlobalContrastNormalizingEstimator(ML, "features", "gcnNorm2", substractMean: false, useStdDev: true, scale: 3));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "gcnNorm1", "gcnNorm2" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "gcnNorm.tsv", digitsOfPrecision: 4);
    Done();
}
// Factory method for SignatureDataTransform.
internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register(LoaderSignature);
    h.CheckValue(args, nameof(args));
    h.CheckValue(input, nameof(input));
    h.CheckNonWhiteSpace(args.Source, nameof(args.Source));
    if (string.IsNullOrWhiteSpace(args.Name))
        args.Name = args.Source;

    var file = Utils.FindExistentFileOrNull("pretrained.model", "Sentiment", assemblyForBasePath: typeof(SentimentAnalyzingTransformer));
    if (file == null)
    {
        throw h.Except("resourcePath", "Missing resource for SentimentAnalyzingTransform.");
    }

    // The logic below ensures that any columns in our input IDataView that conflict
    // with column names known to be used in the pretrained model transform pipeline we're
    // loading are aliased to temporary column names before we apply the pipeline and then
    // renamed back to their original names after. We do this to ensure the pretrained model
    // doesn't shadow or replace columns we aren't expecting it to.

    // 1. Alias any column in the input IDataView that is known to appear to the pretrained
    // model into a temporary column so that we can restore them after the pretrained model
    // is added to the pipeline.
    KeyValuePair<string, string>[] aliased;
    input = AliasIfNeeded(env, input, _modelIntermediateColumnNames, out aliased);

    // 2. Copy source column to a column with the name expected by the pretrained model featurization
    // transform pipeline.
    var copyTransformer = new ColumnCopyingTransformer(env, (args.Source, ModelInputColumnName));
    input = copyTransformer.Transform(input);

    // 3. Apply the pretrained model and its featurization transform pipeline.
    input = LoadTransforms(env, input, file);

    // 4. Copy the output column from the pretrained model to a temporary column.
    var scoreTempName = input.Schema.GetTempColumnName("sa_out");
    copyTransformer = new ColumnCopyingTransformer(env, (ModelScoreColumnName, scoreTempName));
    input = copyTransformer.Transform(input);

    // 5. Drop all the columns created by the pretrained model, including the expected input column
    // and the output column, which we have copied to a temporary column in (4).
    input = ColumnSelectingTransformer.CreateDrop(env, input, _modelIntermediateColumnNames);

    // 6. Unalias all the original columns that were originally present in the IDataView, but may have
    // been shadowed by column names in the pretrained model. This method will also drop all the temporary
    // columns that were created for them in (1).
    input = UnaliasIfNeeded(env, input, aliased);

    // 7. Copy the temporary column with the score we created in (4) to a column with the user-specified destination name.
    copyTransformer = new ColumnCopyingTransformer(env, (scoreTempName, args.Name));
    input = copyTransformer.Transform(input);

    // 8. Drop the temporary column with the score created in (4).
    return ColumnSelectingTransformer.CreateDrop(env, input, scoreTempName);
}
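// The same protect-and-restore idea in miniature (a sketch only; the column name "Text"
// is illustrative, and the tuple argument order of ColumnCopyingTransformer follows the
// usage above but may differ between library versions):
var tempName = input.Schema.GetTempColumnName("Text");
input = new ColumnCopyingTransformer(env, ("Text", tempName)).Transform(input);
// ... apply a pipeline that writes its own "Text" column ...
input = new ColumnCopyingTransformer(env, (tempName, "Text")).Transform(input);
input = ColumnSelectingTransformer.CreateDrop(env, input, tempName);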
private static IDataView UnaliasIfNeeded(IHostEnvironment env, IDataView input, KeyValuePair<string, string>[] hiddenNames)
{
    if (Utils.Size(hiddenNames) == 0)
        return input;

    input = new ColumnCopyingTransformer(env, hiddenNames.Select(x => (Input: x.Key, Output: x.Value)).ToArray()).Transform(input);
    return ColumnSelectingTransformer.CreateDrop(env, input, hiddenNames.Select(pair => pair.Value).ToArray());
}
public static CommonOutputs.TransformOutput RenameBinaryPredictionScoreColumns(IHostEnvironment env,
    RenameBinaryPredictionScoreColumnsInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("ScoreModel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    if (input.PredictorModel.Predictor.PredictionKind == PredictionKind.BinaryClassification)
    {
        DataViewType labelType;
        var labelNames = input.PredictorModel.GetLabelInfo(host, out labelType);
        if (labelNames != null && labelNames.Length == 2)
        {
            var positiveClass = labelNames[1];

            // Rename all the score columns.
            int colMax;
            var maxScoreId = input.Data.Schema.GetMaxAnnotationKind(out colMax, AnnotationUtils.Kinds.ScoreColumnSetId);
            var copyCols = new List<(string name, string source)>();
            for (int i = 0; i < input.Data.Schema.Count; i++)
            {
                if (input.Data.Schema[i].IsHidden)
                {
                    continue;
                }
                if (!ShouldAddColumn(input.Data.Schema, i, null, maxScoreId))
                {
                    continue;
                }
                // Do not rename the PredictedLabel column.
                ReadOnlyMemory<char> tmp = default;
                if (input.Data.Schema.TryGetAnnotation(TextDataViewType.Instance, AnnotationUtils.Kinds.ScoreValueKind, i, ref tmp) &&
                    ReadOnlyMemoryUtils.EqualsStr(AnnotationUtils.Const.ScoreValueKind.PredictedLabel, tmp))
                {
                    continue;
                }
                var source = input.Data.Schema[i].Name;
                var name = source + "." + positiveClass;
                copyCols.Add((name, source));
            }

            var copyColumn = new ColumnCopyingTransformer(env, copyCols.ToArray()).Transform(input.Data);
            var dropColumn = ColumnSelectingTransformer.CreateDrop(env, copyColumn, copyCols.Select(c => c.source).ToArray());
            return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, dropColumn, input.Data), OutputData = dropColumn };
        }
    }

    var newView = NopTransform.CreateIfNeeded(env, input.Data);
    return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, newView, input.Data), OutputData = newView };
}
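// For example, if the positive class label is "Spam", a visible score column named "Score"
// is copied to "Score.Spam" and the original "Score" column is then dropped; the
// PredictedLabel column is deliberately left untouched by the loop above.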
private IDataView WrapPerInstance(RoleMappedData perInst)
{
    var idv = perInst.Data;

    // Make a list of column names that Maml outputs as part of the per-instance data view, and then wrap
    // the per-instance data computed by the evaluator in a SelectColumnsTransform.
    var cols = new List<(string Source, string Name)>();
    var colsToKeep = new List<string>();

    // If perInst is the result of cross-validation and contains a fold Id column, include it.
    int foldCol;
    if (perInst.Schema.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.FoldIndex, out foldCol))
    {
        colsToKeep.Add(MetricKinds.ColumnNames.FoldIndex);
    }

    // Maml always outputs a name column; if it doesn't exist, add a GenerateNumberTransform.
    if (perInst.Schema.Name == null)
    {
        var args = new GenerateNumberTransform.Arguments();
        args.Column = new[] { new GenerateNumberTransform.Column() { Name = "Instance" } };
        args.UseCounter = true;
        idv = new GenerateNumberTransform(Host, args, idv);
        colsToKeep.Add("Instance");
    }
    else
    {
        cols.Add((perInst.Schema.Name.Name, "Instance"));
        colsToKeep.Add("Instance");
    }

    // Maml outputs the weight column if it exists.
    if (perInst.Schema.Weight != null)
    {
        colsToKeep.Add(perInst.Schema.Weight.Name);
    }

    // Get the other columns from the evaluator.
    foreach (var col in GetPerInstanceColumnsToSave(perInst.Schema))
    {
        colsToKeep.Add(col);
    }

    idv = new ColumnsCopyingTransformer(Host, cols.ToArray()).Transform(idv);
    idv = ColumnSelectingTransformer.CreateKeep(Host, idv, colsToKeep.ToArray());
    return GetPerInstanceMetricsCore(idv, perInst.Schema);
}
public static CommonOutputs.TransformOutput SelectColumns(IHostEnvironment env, ColumnSelectingTransformer.Arguments input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("SelectColumns");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    var xf = new ColumnSelectingTransformer(env, input.KeepColumns, input.DropColumns, input.KeepHidden, input.IgnoreMissing).Transform(input.Data);
    return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf };
}
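// Minimal keep/drop sketch using the same transformer's static helpers, as seen throughout
// the snippets here (the column names are illustrative, and dataView is assumed to be an
// existing IDataView in scope):
var kept = ColumnSelectingTransformer.CreateKeep(env, dataView, new[] { "Label", "Features" });
var dropped = ColumnSelectingTransformer.CreateDrop(env, dataView, "Weight");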
public void TestCustomWordEmbeddings()
{
    var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
    var data = new TextLoader(Env,
        new TextLoader.Arguments()
        {
            Separator = "\t",
            HasHeader = true,
            Columns = new[]
            {
                new TextLoader.Column("Label", DataKind.BL, 0),
                new TextLoader.Column("SentimentText", DataKind.Text, 1)
            }
        }).Read(GetDataPath(dataPath));

    var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
        .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
        .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
    var words = est.Fit(data).Transform(data);

    var pathToCustomModel = DeleteOutputPath("custommodel.txt");
    using (StreamWriter file = new StreamWriter(pathToCustomModel))
    {
        file.WriteLine("This is a custom file for 4 words with a 5-dimensional vector. The first line in this file is ignored");
        file.WriteLine("stop" + " " + string.Join(" ", 1.5f, 2.5f, 3.5f, 4.5f, 5.5f));
        file.WriteLine("bursts" + " " + string.Join(" ", -0.9f, -3f, 7.3f, 1.0f, 12f));
        file.WriteLine("you" + " " + string.Join(" ", -1f, -2f, -4f, -6f, -1f));
        file.WriteLine("dude" + " " + string.Join(" ", 100f, 0f, 0f, 0f, 0f));
    }
    var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", pathToCustomModel, "CleanWords");

    TestEstimatorCore(pipe, words, invalidInput: data);

    var outputPath = GetOutputPath("Text", "customWordEmbeddings.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 10);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings", "CleanWords" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "customWordEmbeddings.tsv");
    Done();
}
public void CategoricalHashStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoader.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Read(dataPath);
    var wrongCollection = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } };
    var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

    var est = data.MakeNewEstimator().
        Append(row => (
            row.ScalarString,
            row.VectorString,
            // Create a VarVector column
            VarVectorString: row.ScalarString.TokenizeText())).
        Append(row => (
            A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
            B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
            C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
            D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
            E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
            F: row.VarVectorString.OneHotHashEncoding()
        ));

    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
        var view = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D", "E", "F" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
    }

    CheckEquality("CategoricalHash", "featurized.tsv");
    Done();
}
private void TestSvmLight(string path, string savingPath, int inputSize, int expectedInputSize, bool zeroBased,
    IDataView expectedData, long? numberOfRows = null)
{
    var data = ML.Data.LoadFromSvmLightFile(path, inputSize: inputSize, zeroBased: zeroBased, numberOfRows: numberOfRows);
    Assert.True(data.Schema["Features"].Type.GetValueCount() == expectedInputSize);
    CheckSameValues(data, expectedData, checkId: false);

    // Save, reload and compare dataviews again.
    using (var stream = File.Create(savingPath))
        ML.Data.SaveInSvmLightFormat(expectedData, stream, zeroBasedIndexing: zeroBased, exampleWeightColumnName: "Weight");
    data = ML.Data.LoadFromSvmLightFile(savingPath, inputSize: inputSize, zeroBased: zeroBased);
    CheckSameValues(ColumnSelectingTransformer.CreateDrop(Env, data, "Comment"),
        ColumnSelectingTransformer.CreateDrop(Env, expectedData, "Comment"), checkId: false);
}
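// For reference, each SVM-light row has the shape "<label> <index>:<value> ... # <comment>",
// e.g. "1 1:0.5 3:2.0 # row one". The loader surfaces the trailing comment as a "Comment"
// column, which is why both views drop it before the value comparison above.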
public void LdaWorkout()
{
    IHostEnvironment env = new MLContext(seed: 42, conc: 1);
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var invalidData = TextLoaderStatic.CreateReader(env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordBagEstimator(env, "text", "bag_of_words")
        .Append(new LatentDirichletAllocationEstimator(env, "bag_of_words", "topics", 10, numIterations: 10, resetRandomGenerator: true));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // In this test it manifests because of the WordBagEstimator in the estimator chain
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var ch = env.Start("save"))
    {
        var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        var transformer = est.Fit(data.AsDynamic);
        var transformedData = transformer.Transform(data.AsDynamic);

        IDataView savedData = TakeFilter.Create(env, transformedData, 4);
        savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
        Assert.Equal(10, (savedData.Schema[0].Type as VectorType)?.Size);
    }

    // Disabling this check due to the following issue with consistency of output.
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
public void NAReplaceStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoader.CreateReader(Env, ctx => (
        ScalarFloat: ctx.LoadFloat(1),
        ScalarDouble: ctx.LoadDouble(1),
        VectorFloat: ctx.LoadFloat(1, 4),
        VectorDouble: ctx.LoadDouble(1, 4)
    ));
    var data = reader.Read(dataPath);
    var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } };
    var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Maximum),
            B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Mean),
            C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Mean),
            D: row.VectorDouble.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Minimum)
        ));
    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("NAReplace", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
        var view = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
    }

    CheckEquality("NAReplace", "featurized.tsv");
    Done();
}
private static IDataView UnaliasIfNeeded(IHostEnvironment env, IDataView input, KeyValuePair<string, string>[] hiddenNames)
{
    if (Utils.Size(hiddenNames) == 0)
    {
        return input;
    }

    input = ColumnCopyingTransformer.Create(env, new ColumnCopyingTransformer.Arguments()
    {
        Column = hiddenNames.Select(pair => new ColumnCopyingTransformer.Column() { Name = pair.Key, Source = pair.Value }).ToArray()
    }, input);
    return ColumnSelectingTransformer.CreateDrop(env, input, hiddenNames.Select(pair => pair.Value).ToArray());
}
public void CategoricalHashStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(ML, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Read(dataPath);
    var wrongCollection = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } };
    var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

    var est = data.MakeNewEstimator().
        Append(row => (
            row.ScalarString,
            row.VectorString,
            // Create a VarVector column
            VarVectorString: row.ScalarString.TokenizeText())).
        Append(row => (
            A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
            B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
            C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
            D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
            E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
            F: row.VarVectorString.OneHotHashEncoding()
        ));

    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4);
    var view = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "A", "B", "C", "D", "E", "F" });
    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true);

    CheckEquality("CategoricalHash", "featurized.tsv");
    Done();
}
public void TestPcaEstimator()
{
    var data = TextLoaderStatic.CreateReader(ML,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(_dataSource);

    var est = ML.Transforms.Projection.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1);
    var outputPath = GetOutputPath("PCA", "pca.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
    savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "pca" });
    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

    CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
    Done();
}
public void CategoricalStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Read(dataPath);
    var wrongCollection = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };
    var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind),
            B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
            C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
            D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin),
            E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin)
        ));
    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Categorical", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
        var view = new ColumnSelectingTransformer(Env, new string[] { "A", "B", "C", "D", "E" }, null, false).Transform(savedData);
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
    }

    CheckEquality("Categorical", "featurized.tsv");
    Done();
}
public void NAReplaceStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(ML, ctx => (
        ScalarFloat: ctx.LoadFloat(1),
        ScalarDouble: ctx.LoadDouble(1),
        VectorFloat: ctx.LoadFloat(1, 4),
        VectorDouble: ctx.LoadDouble(1, 4)
    ));
    var data = reader.Read(dataPath);
    var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } };
    var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Maximum),
            B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Mean),
            C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Mean),
            D: row.VectorDouble.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Minimum)
        ));
    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("NAReplace", "featurized.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4);
    var view = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D" });
    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true);

    CheckEquality("NAReplace", "featurized.tsv");
    Done();
}
public void TestWordEmbeddings()
{
    var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
    var data = new TextLoader(Env,
        new TextLoader.Arguments()
        {
            Separator = "\t",
            HasHeader = true,
            Columns = new[]
            {
                new TextLoader.Column("Label", DataKind.BL, 0),
                new TextLoader.Column("SentimentText", DataKind.Text, 1)
            }
        }).Read(GetDataPath(dataPath));

    var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
        .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
        .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
    var words = est.Fit(data).Transform(data);
    var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords",
        modelKind: WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe);

    TestEstimatorCore(pipe, words, invalidInput: data);

    var outputPath = GetOutputPath("Text", "wordEmbeddings.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "wordEmbeddings.tsv");
    Done();
}
public static Output Split(IHostEnvironment env, Input input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(ModuleName);
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    var data = input.Data;
    var stratCol = SplitUtils.CreateStratificationColumn(host, ref data, input.StratificationColumn);

    int n = input.NumFolds;
    var output = new Output
    {
        TrainData = new IDataView[n],
        TestData = new IDataView[n]
    };

    // Construct per-fold datasets.
    double fraction = 1.0 / n;
    for (int i = 0; i < n; i++)
    {
        var trainData = new RangeFilter(host,
            new RangeFilter.Options { Column = stratCol, Min = i * fraction, Max = (i + 1) * fraction, Complement = true }, data);
        output.TrainData[i] = ColumnSelectingTransformer.CreateDrop(host, trainData, stratCol);

        var testData = new RangeFilter(host,
            new RangeFilter.Options { Column = stratCol, Min = i * fraction, Max = (i + 1) * fraction, Complement = false }, data);
        output.TestData[i] = ColumnSelectingTransformer.CreateDrop(host, testData, stratCol);
    }

    return output;
}
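// For example, with NumFolds = 5, fold i = 2 takes rows whose stratification value falls
// in [0.4, 0.6) as its test set and the remaining rows as its training set, so the five
// test sets together cover the whole [0, 1) range exactly once.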
public void TestPcaEstimator()
{
    var data = TextLoader.CreateReader(_env,
            c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
            separator: ';', hasHeader: true)
        .Read(_dataSource);

    var est = new PrincipalComponentAnalysisEstimator(_env, "features", "pca", rank: 5, seed: 1);
    var outputPath = GetOutputPath("PCA", "pca.tsv");
    using (var ch = _env.Start("save"))
    {
        IDataView savedData = TakeFilter.Create(_env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(_env, savedData, new[] { "pca" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
    Done();
}
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new WordTokenizingEstimator(Env, "text", "text")
        .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}