public static Output Split(IHostEnvironment env, Input input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(ModuleName);

            host.CheckValue(input, nameof(input));
            host.Check(0 < input.Fraction && input.Fraction < 1, "The fraction must be in the interval (0,1).");

            EntryPointUtils.CheckInputArgs(host, input);

            var data     = input.Data;
            var stratCol = SplitUtils.CreateStratificationColumn(host, ref data, input.StratificationColumn);

            IDataView trainData = new RangeFilter(host,
                                                  new RangeFilter.Options {
                Column = stratCol, Min = 0, Max = input.Fraction, Complement = false
            }, data);

            trainData = ColumnSelectingTransformer.CreateDrop(host, trainData, stratCol);

            IDataView testData = new RangeFilter(host,
                                                 new RangeFilter.Options {
                Column = stratCol, Min = 0, Max = input.Fraction, Complement = true
            }, data);

            testData = ColumnSelectingTransformer.CreateDrop(host, testData, stratCol);

            return(new Output()
            {
                TrainData = trainData, TestData = testData
            });
        }
Example #2
        public void FeatureSelectionWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordBagEstimator(ML, "text", "bag_of_words")
                      .AppendCacheCheckpoint(ML)
                      .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words", "bag_of_words_count", 10)
                              .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words", "bag_of_words_mi", labelColumn: "label")));

            var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "bag_of_words_count", "bag_of_words_mi" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("FeatureSelection", "featureselection.tsv");
            Done();
        }
Example #3
        /// <summary>
        /// Split the dataset into the train set and test set according to the given fraction.
        /// Respects the <paramref name="samplingKeyColumnName"/> if provided.
        /// </summary>
        /// <param name="data">The dataset to split.</param>
        /// <param name="testFraction">The fraction of data to go into the test set.</param>
        /// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
        /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
        /// If <see langword="null"/> no row grouping will be performed.</param>
        /// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[TrainTestSplit](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/TrainTestSplit.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
        {
            _env.CheckValue(data, nameof(data));
            _env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
            _env.CheckValueOrNull(samplingKeyColumnName);

            var splitColumn = CreateSplitColumn(_env, ref data, samplingKeyColumnName, seed, fallbackInEnvSeed: true);

            var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
            {
                Column     = splitColumn,
                Min        = 0,
                Max        = testFraction,
                Complement = true
            }, data);
            var testFilter = new RangeFilter(_env, new RangeFilter.Options()
            {
                Column     = splitColumn,
                Min        = 0,
                Max        = testFraction,
                Complement = false
            }, data);

            var trainDV = ColumnSelectingTransformer.CreateDrop(_env, trainFilter, splitColumn);
            var testDV  = ColumnSelectingTransformer.CreateDrop(_env, testFilter, splitColumn);

            return(new TrainTestData(trainDV, testDV));
        }
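A minimal usage sketch of the same split through the public MLContext surface follows; the MLContext, the Row type, and the "GroupId" column are illustrative assumptions for this sketch, not taken from the method above.

        using Microsoft.ML;

        // Minimal sketch, assuming the public MLContext.Data.TrainTestSplit API;
        // the row type, values, and column names below are illustrative only.
        var mlContext = new MLContext(seed: 1);
        var dataView = mlContext.Data.LoadFromEnumerable(new[]
        {
            new Row { Label = true,  Text = "good", GroupId = "A" },
            new Row { Label = false, Text = "bad",  GroupId = "A" },
            new Row { Label = true,  Text = "fine", GroupId = "B" },
        });

        // Rows that share a GroupId are guaranteed to end up in the same subset (train or test).
        var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.25, samplingKeyColumnName: "GroupId");
        IDataView trainSet = split.TrainSet;
        IDataView testSet  = split.TestSet;

        public class Row
        {
            public bool Label;
            public string Text;
            public string GroupId;
        }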
Example #4
        /// <summary>
        /// Splits the data based on the splitColumn, and drops that column as it is only
        /// intended to be used for splitting the data, and shouldn't be part of the output schema.
        /// </summary>
        internal static IEnumerable<TrainTestData> CrossValidationSplit(IHostEnvironment env, IDataView data, string splitColumn, int numberOfFolds = 5)
        {
            env.CheckValue(splitColumn, nameof(splitColumn));

            for (int fold = 0; fold < numberOfFolds; fold++)
            {
                var trainFilter = new RangeFilter(env, new RangeFilter.Options
                {
                    Column     = splitColumn,
                    Min        = (double)fold / numberOfFolds,
                    Max        = (double)(fold + 1) / numberOfFolds,
                    Complement = true,
                    IncludeMin = true,
                    IncludeMax = true,
                }, data);

                var testFilter = new RangeFilter(env, new RangeFilter.Options
                {
                    Column     = splitColumn,
                    Min        = (double)fold / numberOfFolds,
                    Max        = (double)(fold + 1) / numberOfFolds,
                    Complement = false,
                    IncludeMin = true,
                    IncludeMax = true
                }, data);

                var trainDV = ColumnSelectingTransformer.CreateDrop(env, trainFilter, splitColumn);
                var testDV  = ColumnSelectingTransformer.CreateDrop(env, testFilter, splitColumn);

                yield return(new TrainTestData(trainDV, testDV));
            }
        }
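The same fold logic is reachable through the public MLContext.Data.CrossValidationSplit method; a minimal sketch follows, reusing the illustrative mlContext and dataView from the TrainTestSplit sketch earlier (again an assumption for illustration, not part of the snippet above).

        // Minimal sketch, assuming the public MLContext.Data.CrossValidationSplit API.
        var folds = mlContext.Data.CrossValidationSplit(dataView, numberOfFolds: 5, samplingKeyColumnName: "GroupId");
        foreach (var fold in folds)
        {
            // For fold i, the test rows fall in roughly [i/5, (i+1)/5] of the internal
            // split column and the train rows are the complement, mirroring the
            // RangeFilter options used above.
            IDataView train = fold.TrainSet;
            IDataView test  = fold.TestSet;
        }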
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);
            var est = ML.Transforms.Text.NormalizeText("text")
                      .Append(ML.Transforms.Text.TokenizeWords("words", "text"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
                      .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "NoDefaultStopwords", "NoStopWords" });
            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
        public void TokenizeWithSeparators()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data     = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadText(1)), hasHeader: true)
                              .Read(dataPath).AsDynamic;

            var est       = new WordTokenizingEstimator(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
            var outdata   = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
            var savedData = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

            var saver = new TextSaver(Env, new TextSaver.Arguments {
                Silent = true
            });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
        public void WordBagWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordBagEstimator(ML, "bag_of_words", "text").
                      Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", invertHash: -1));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "bag_of_words", "bag_of_wordshash" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "bag_of_words.tsv");
            Done();
        }
        public void LpNormWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var    data       = TextLoaderStatic.CreateReader(ML,
                                                              c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                              separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoaderStatic.CreateReader(ML,
                                                            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                            separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = ML.Transforms.Projection.LpNormalize("lpNorm1", "features")
                      .Append(ML.Transforms.Projection.LpNormalize("lpNorm2", "features", normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, subMean: true));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "lpNorm1", "lpNorm2" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "lpNorm.tsv");
            Done();
        }
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(ML, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
                      .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
                      .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        public void WhiteningWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var    data       = TextLoader.CreateReader(ML,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoader.CreateReader(ML,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = new VectorWhiteningEstimator(ML, "features", "whitened1")
                      .Append(new VectorWhiteningEstimator(ML, "features", "whitened2", kind: WhiteningKind.Pca, pcaNum: 5));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "whitened1", "whitened2" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void GcnWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var    data       = TextLoader.CreateReader(ML,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoader.CreateReader(ML,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = new GlobalContrastNormalizingEstimator(ML, "features", "gcnNorm1")
                      .Append(new GlobalContrastNormalizingEstimator(ML, "features", "gcnNorm2", substractMean: false, useStdDev: true, scale: 3));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "gcnNorm1", "gcnNorm2" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "gcnNorm.tsv", digitsOfPrecision: 4);
            Done();
        }
        // Factory method for SignatureDataTransform.
        internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);
            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckNonWhiteSpace(args.Source, nameof(args.Source));

            if (string.IsNullOrWhiteSpace(args.Name))
                args.Name = args.Source;

            var file = Utils.FindExistentFileOrNull("pretrained.model", "Sentiment", assemblyForBasePath: typeof(SentimentAnalyzingTransformer));
            if (file == null)
            {
                throw h.Except("resourcePath", "Missing resource for SentimentAnalyzingTransform.");
            }

            // The logic below ensures that any columns in our input IDataView that conflict
            // with column names known to be used in the pretrained model transform pipeline we're
            // loading are aliased to temporary column names before we apply the pipeline and then
            // renamed back to their original names after. We do this to ensure the pretrained model
            // doesn't shadow or replace columns we aren't expecting it to.

            // 1. Alias any column in the input IDataView that is known to be used by the pretrained
            // model into a temporary column so that we can restore it after the pretrained model
            // is added to the pipeline.
            KeyValuePair<string, string>[] aliased;
            input = AliasIfNeeded(env, input, _modelIntermediateColumnNames, out aliased);

            // 2. Copy source column to a column with the name expected by the pretrained model featurization
            // transform pipeline.
            var copyTransformer = new ColumnCopyingTransformer(env, (args.Source, ModelInputColumnName));

            input = copyTransformer.Transform(input);

            // 3. Apply the pretrained model and its featurization transform pipeline.
            input = LoadTransforms(env, input, file);

            // 4. Copy the output column from the pretrained model to a temporary column.
            var scoreTempName = input.Schema.GetTempColumnName("sa_out");
            copyTransformer = new ColumnCopyingTransformer(env, (ModelScoreColumnName, scoreTempName));
            input = copyTransformer.Transform(input);

            // 5. Drop all the columns created by the pretrained model, including the expected input column
            // and the output column, which we have copied to a temporary column in (4).
            input = ColumnSelectingTransformer.CreateDrop(env, input, _modelIntermediateColumnNames);

            // 6. Unalias all the original columns that were originally present in the IDataView, but may have
            // been shadowed by column names in the pretrained model. This method will also drop all the temporary
            // columns that were created for them in (1).
            input = UnaliasIfNeeded(env, input, aliased);

            // 7. Copy the temporary column with the score we created in (4) to a column with the user-specified destination name.
            copyTransformer = new ColumnCopyingTransformer(env, (scoreTempName, args.Name));
            input = copyTransformer.Transform(input);

            // 8. Drop the temporary column with the score created in (4).
            return ColumnSelectingTransformer.CreateDrop(env, input, scoreTempName);
        }
        private static IDataView UnaliasIfNeeded(IHostEnvironment env, IDataView input, KeyValuePair<string, string>[] hiddenNames)
        {
            if (Utils.Size(hiddenNames) == 0)
                return input;

            input = new ColumnCopyingTransformer(env, hiddenNames.Select(x => (Input: x.Key, Output: x.Value)).ToArray()).Transform(input);
            return ColumnSelectingTransformer.CreateDrop(env, input, hiddenNames.Select(pair => pair.Value).ToArray());
        }
Example #14
        public static CommonOutputs.TransformOutput RenameBinaryPredictionScoreColumns(IHostEnvironment env,
                                                                                       RenameBinaryPredictionScoreColumnsInput input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("ScoreModel");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            if (input.PredictorModel.Predictor.PredictionKind == PredictionKind.BinaryClassification)
            {
                DataViewType labelType;
                var          labelNames = input.PredictorModel.GetLabelInfo(host, out labelType);
                if (labelNames != null && labelNames.Length == 2)
                {
                    var positiveClass = labelNames[1];

                    // Rename all the score columns.
                    int colMax;
                    var maxScoreId = input.Data.Schema.GetMaxAnnotationKind(out colMax, AnnotationUtils.Kinds.ScoreColumnSetId);
                    var copyCols   = new List<(string name, string source)>();
                    for (int i = 0; i < input.Data.Schema.Count; i++)
                    {
                        if (input.Data.Schema[i].IsHidden)
                        {
                            continue;
                        }
                        if (!ShouldAddColumn(input.Data.Schema, i, null, maxScoreId))
                        {
                            continue;
                        }
                        // Do not rename the PredictedLabel column.
                        ReadOnlyMemory<char> tmp = default;
                        if (input.Data.Schema.TryGetAnnotation(TextDataViewType.Instance, AnnotationUtils.Kinds.ScoreValueKind, i,
                                                               ref tmp) &&
                            ReadOnlyMemoryUtils.EqualsStr(AnnotationUtils.Const.ScoreValueKind.PredictedLabel, tmp))
                        {
                            continue;
                        }
                        var source = input.Data.Schema[i].Name;
                        var name   = source + "." + positiveClass;
                        copyCols.Add((name, source));
                    }

                    var copyColumn = new ColumnCopyingTransformer(env, copyCols.ToArray()).Transform(input.Data);
                    var dropColumn = ColumnSelectingTransformer.CreateDrop(env, copyColumn, copyCols.Select(c => c.source).ToArray());
                    return(new CommonOutputs.TransformOutput {
                        Model = new TransformModelImpl(env, dropColumn, input.Data), OutputData = dropColumn
                    });
                }
            }

            var newView = NopTransform.CreateIfNeeded(env, input.Data);

            return(new CommonOutputs.TransformOutput {
                Model = new TransformModelImpl(env, newView, input.Data), OutputData = newView
            });
        }
Example #15
        private IDataView WrapPerInstance(RoleMappedData perInst)
        {
            var idv = perInst.Data;

            // Make a list of column names that Maml outputs as part of the per-instance data view, and then wrap
            // the per-instance data computed by the evaluator in a SelectColumnsTransform.
            var cols       = new List<(string Source, string Name)>();
            var colsToKeep = new List<string>();

            // If perInst is the result of cross-validation and contains a fold Id column, include it.
            int foldCol;

            if (perInst.Schema.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.FoldIndex, out foldCol))
            {
                colsToKeep.Add(MetricKinds.ColumnNames.FoldIndex);
            }

            // Maml always outputs a name column; if it doesn't exist, add a GenerateNumberTransform.
            if (perInst.Schema.Name == null)
            {
                var args = new GenerateNumberTransform.Arguments();
                args.Column = new[] { new GenerateNumberTransform.Column()
                                      {
                                          Name = "Instance"
                                      } };
                args.UseCounter = true;
                idv             = new GenerateNumberTransform(Host, args, idv);
                colsToKeep.Add("Instance");
            }
            else
            {
                cols.Add((perInst.Schema.Name.Name, "Instance"));
                colsToKeep.Add("Instance");
            }

            // Maml outputs the weight column if it exists.
            if (perInst.Schema.Weight != null)
            {
                colsToKeep.Add(perInst.Schema.Weight.Name);
            }

            // Get the other columns from the evaluator.
            foreach (var col in GetPerInstanceColumnsToSave(perInst.Schema))
            {
                colsToKeep.Add(col);
            }

            idv = new ColumnsCopyingTransformer(Host, cols.ToArray()).Transform(idv);
            idv = ColumnSelectingTransformer.CreateKeep(Host, idv, colsToKeep.ToArray());
            return(GetPerInstanceMetricsCore(idv, perInst.Schema));
        }
Example #16
        public static CommonOutputs.TransformOutput SelectColumns(IHostEnvironment env, ColumnSelectingTransformer.Arguments input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("SelectColumns");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            var xf = new ColumnSelectingTransformer(env, input.KeepColumns, input.DropColumns, input.KeepHidden, input.IgnoreMissing).Transform(input.Data);

            return(new CommonOutputs.TransformOutput {
                Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf
            });
        }
Example #17
        public void TestCustomWordEmbeddings()
        {
            var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
            var data     = new TextLoader(Env,
                                          new TextLoader.Arguments()
            {
                Separator = "\t",
                HasHeader = true,
                Columns   = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            }).Read(GetDataPath(dataPath));

            var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
                      .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
            var words             = est.Fit(data).Transform(data);
            var pathToCustomModel = DeleteOutputPath("custommodel.txt");

            using (StreamWriter file = new StreamWriter(pathToCustomModel))
            {
                file.WriteLine("This is custom file for 4 words with 5 dimentional vector. First line in this file is ignored");
                file.WriteLine("stop" + " " + string.Join(" ", 1.5f, 2.5f, 3.5f, 4.5f, 5.5f));
                file.WriteLine("bursts" + " " + string.Join(" ", -0.9f, -3f, 7.3f, 1.0f, 12f));
                file.WriteLine("you" + " " + string.Join(" ", -1f, -2f, -4f, -6f, -1f));
                file.WriteLine("dude" + " " + string.Join(" ", 100f, 0f, 0f, 0f, 0f));
            }
            var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", pathToCustomModel, "CleanWords");

            TestEstimatorCore(pipe, words, invalidInput: data);

            var outputPath = GetOutputPath("Text", "customWordEmbeddings.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 10);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings", "CleanWords" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "customWordEmbeddings.tsv");
            Done();
        }
Example #18
        public void CategoricalHashStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoader.CreateReader(Env, ctx => (
                                                          ScalarString: ctx.LoadText(1),
                                                          VectorString: ctx.LoadText(1, 4)));
            var data            = reader.Read(dataPath);
            var wrongCollection = new[] { new TestClass()
                                          {
                                              A = "1", B = "2", C = "3",
                                          }, new TestClass()
                                          {
                                              A = "4", B = "5", C = "6"
                                          } };

            var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);
            var est         = data.MakeNewEstimator().
                              Append(row => (
                                         row.ScalarString,
                                         row.VectorString,
                                         // Create a VarVector column
                                         VarVectorString: row.ScalarString.TokenizeText())).
                              Append(row => (
                                         A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
                                         B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
                                         C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
                                         D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
                                         E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
                                         F: row.VarVectorString.OneHotHashEncoding()
                                         ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
                var view      = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D", "E", "F" });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
            }

            CheckEquality("CategoricalHash", "featurized.tsv");
            Done();
        }
Example #19
        private void TestSvmLight(string path, string savingPath, int inputSize, int expectedInputSize, bool zeroBased, IDataView expectedData, long?numberOfRows = null)
        {
            var data = ML.Data.LoadFromSvmLightFile(path, inputSize: inputSize, zeroBased: zeroBased, numberOfRows: numberOfRows);

            Assert.True(data.Schema["Features"].Type.GetValueCount() == expectedInputSize);

            CheckSameValues(data, expectedData, checkId: false);

            // Save, reload and compare dataviews again.
            using (var stream = File.Create(savingPath))
                ML.Data.SaveInSvmLightFormat(expectedData, stream, zeroBasedIndexing: zeroBased, exampleWeightColumnName: "Weight");
            data = ML.Data.LoadFromSvmLightFile(savingPath, inputSize: inputSize, zeroBased: zeroBased);
            CheckSameValues(ColumnSelectingTransformer.CreateDrop(Env, data, "Comment"),
                            ColumnSelectingTransformer.CreateDrop(Env, expectedData, "Comment"), checkId: false);
        }
        public void LdaWorkout()
        {
            IHostEnvironment env = new MLContext(seed: 42, conc: 1);
            string           sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoaderStatic.CreateReader(env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         text: ctx.LoadText(1)), hasHeader: true)
                       .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordBagEstimator(env, "text", "bag_of_words").
                      Append(new LatentDirichletAllocationEstimator(env, "bag_of_words", "topics", 10, numIterations: 10,
                                                                    resetRandomGenerator: true));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // In this test it manifests because of the WordBagEstimator in the estimator chain
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ldatopics.tsv");

            using (var ch = env.Start("save"))
            {
                var saver = new TextSaver(env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false, Dense = true
                });
                var       transformer     = est.Fit(data.AsDynamic);
                var       transformedData = transformer.Transform(data.AsDynamic);
                IDataView savedData       = TakeFilter.Create(env, transformedData, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);

                Assert.Equal(10, (savedData.Schema[0].Type as VectorType)?.Size);
            }

            // Disabling this check due to the following issue with consistency of output.
            // `seed` specified in ConsoleEnvironment has no effect.
            // https://github.com/dotnet/machinelearning/issues/1004
            // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
            // CheckEquality("Text", "ldatopics.tsv");
            Done();
        }
        public void NAReplaceStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoader.CreateReader(Env, ctx => (
                                                          ScalarFloat: ctx.LoadFloat(1),
                                                          ScalarDouble: ctx.LoadDouble(1),
                                                          VectorFloat: ctx.LoadFloat(1, 4),
                                                          VectorDoulbe: ctx.LoadDouble(1, 4)
                                                          ));

            var data            = reader.Read(dataPath);
            var wrongCollection = new[] { new TestClass()
                                          {
                                              A = 1, B = 3, C = new float[2] {
                                                  1, 2
                                              }, D = new double[2] {
                                                  3, 4
                                              }
                                          } };
            var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

            var est = data.MakeNewEstimator().
                      Append(row => (
                                 A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Maximum),
                                 B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Mean),
                                 C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Mean),
                                 D: row.VectorDoulbe.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Minimum)
                                 ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);
            var outputPath = GetOutputPath("NAReplace", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
                var view      = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D" });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
            }

            CheckEquality("NAReplace", "featurized.tsv");
            Done();
        }
Example #22
        private static IDataView UnaliasIfNeeded(IHostEnvironment env, IDataView input, KeyValuePair<string, string>[] hiddenNames)
        {
            if (Utils.Size(hiddenNames) == 0)
            {
                return(input);
            }

            input = ColumnCopyingTransformer.Create(env, new ColumnCopyingTransformer.Arguments()
            {
                Column = hiddenNames.Select(pair => new ColumnCopyingTransformer.Column()
                {
                    Name = pair.Key, Source = pair.Value
                }).ToArray()
            }, input);

            return(ColumnSelectingTransformer.CreateDrop(env, input, hiddenNames.Select(pair => pair.Value).ToArray()));
        }
        public void CategoricalHashStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                ScalarString: ctx.LoadText(1),
                                                                VectorString: ctx.LoadText(1, 4)));
            var data            = reader.Read(dataPath);
            var wrongCollection = new[] { new TestClass()
                                          {
                                              A = "1", B = "2", C = "3",
                                          }, new TestClass()
                                          {
                                              A = "4", B = "5", C = "6"
                                          } };

            var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);
            var est         = data.MakeNewEstimator().
                              Append(row => (
                                         row.ScalarString,
                                         row.VectorString,
                                         // Create a VarVector column
                                         VarVectorString: row.ScalarString.TokenizeText())).
                              Append(row => (
                                         A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
                                         B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
                                         C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
                                         D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
                                         E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
                                         F: row.VarVectorString.OneHotHashEncoding()
                                         ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4);
            var view       = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "A", "B", "C", "D", "E", "F" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true);

            CheckEquality("CategoricalHash", "featurized.tsv");
            Done();
        }
Example #24
        public void TestPcaEstimator()
        {
            var data = TextLoaderStatic.CreateReader(ML,
                                                     c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                     separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var est        = ML.Transforms.Projection.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "pca" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
Example #25
        public void CategoricalStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                ScalarString: ctx.LoadText(1),
                                                                VectorString: ctx.LoadText(1, 4)));
            var data            = reader.Read(dataPath);
            var wrongCollection = new[] { new TestClass()
                                          {
                                              A = 1, B = 2, C = 3,
                                          }, new TestClass()
                                          {
                                              A = 4, B = 5, C = 6
                                          } };

            var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);
            var est         = data.MakeNewEstimator().
                              Append(row => (
                                         A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind),
                                         B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
                                         C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
                                         D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin),
                                         E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin)
                                         ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Categorical", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
                var view      = new ColumnSelectingTransformer(Env, new string[] { "A", "B", "C", "D", "E" }, null, false).Transform(savedData);
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
            }

            CheckEquality("Categorical", "featurized.tsv");
            Done();
        }
        public void NAReplaceStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                ScalarFloat: ctx.LoadFloat(1),
                                                                ScalarDouble: ctx.LoadDouble(1),
                                                                VectorFloat: ctx.LoadFloat(1, 4),
                                                                VectorDoulbe: ctx.LoadDouble(1, 4)
                                                                ));

            var data            = reader.Read(dataPath);
            var wrongCollection = new[] { new TestClass()
                                          {
                                              A = 1, B = 3, C = new float[2] {
                                                  1, 2
                                              }, D = new double[2] {
                                                  3, 4
                                              }
                                          } };
            var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

            var est = data.MakeNewEstimator().
                      Append(row => (
                                 A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Maximum),
                                 B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Mean),
                                 C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Mean),
                                 D: row.VectorDoulbe.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Minimum)
                                 ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);
            var outputPath = GetOutputPath("NAReplace", "featurized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4);
            var view       = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true);

            CheckEquality("NAReplace", "featurized.tsv");
            Done();
        }
Example #27
        public void TestWordEmbeddings()
        {
            var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
            var data     = new TextLoader(Env,
                                          new TextLoader.Arguments()
            {
                Separator = "\t",
                HasHeader = true,
                Columns   = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            }).Read(GetDataPath(dataPath));

            var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
                      .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
            var words = est.Fit(data).Transform(data);

            var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe);

            TestEstimatorCore(pipe, words, invalidInput: data);

            var outputPath = GetOutputPath("Text", "wordEmbeddings.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "wordEmbeddings.tsv");
            Done();
        }
Example #28
        public static Output Split(IHostEnvironment env, Input input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(ModuleName);

            host.CheckValue(input, nameof(input));

            EntryPointUtils.CheckInputArgs(host, input);

            var data = input.Data;

            var stratCol = SplitUtils.CreateStratificationColumn(host, ref data, input.StratificationColumn);

            int n      = input.NumFolds;
            var output = new Output
            {
                TrainData = new IDataView[n],
                TestData  = new IDataView[n]
            };

            // Construct per-fold datasets.
            double fraction = 1.0 / n;

            for (int i = 0; i < n; i++)
            {
                var trainData = new RangeFilter(host,
                                                new RangeFilter.Options {
                    Column = stratCol, Min = i * fraction, Max = (i + 1) * fraction, Complement = true
                }, data);
                output.TrainData[i] = ColumnSelectingTransformer.CreateDrop(host, trainData, stratCol);

                var testData = new RangeFilter(host,
                                               new RangeFilter.Options {
                    Column = stratCol, Min = i * fraction, Max = (i + 1) * fraction, Complement = false
                }, data);
                output.TestData[i] = ColumnSelectingTransformer.CreateDrop(host, testData, stratCol);
            }

            return(output);
        }
Example #29
        public void TestPcaEstimator()
        {
            var data = TextLoader.CreateReader(_env,
                                               c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                               separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var est        = new PrincipalComponentAnalysisEstimator(_env, "features", "pca", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = _env.Start("save"))
            {
                IDataView savedData = TakeFilter.Create(_env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(_env, savedData, new[] { "pca" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
                      .Append(new NgramEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }