// Builds the MLContext, an empty schema-only data view, the text-normalization
// pipeline, and a prediction engine for transforming single TextData rows.
// NOTE(review): parameter name "keepPuncuations" has a typo ("Punctuations");
// left unchanged because named-argument callers elsewhere may depend on it.
private void InitializeTextNormalizer(TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower, bool keepDiacritics = false, bool keepPuncuations = false, bool keepNumbers = false)
{
    _mlContext = new MLContext();

    // An empty enumerable is sufficient: Fit() only needs the input schema,
    // not actual rows, to configure the normalizer.
    _emptySamplesList = new List<TextData>();
    _emptyDataView = _mlContext.Data.LoadFromEnumerable(_emptySamplesList);

    // Text normalizer: reads the "Text" column, writes "NormalizedText".
    _normTextPipeline = _mlContext.Transforms.Text.NormalizeText(
        "NormalizedText",
        "Text",
        caseMode,
        keepDiacritics: keepDiacritics,
        keepPunctuations: keepPuncuations,
        keepNumbers: keepNumbers);

    _normTextTransformer = _normTextPipeline.Fit(_emptyDataView);
    _predictionEngine = _mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(_normTextTransformer);
}
// Workout: normalize text, tokenize into words, remove stop words, then
// snapshot the first rows to a TSV and compare against the checked-in baseline.
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 read as text.
    var data = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(sentimentDataPath);

    // Invalid input: the same column read as float, used to exercise
    // the estimator's schema-validation path.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(sentimentDataPath);

    var est = new TextNormalizingEstimator(Env, "text")
        .Append(new WordTokenizingEstimator(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        // Keep only the first 4 transformed rows so the baseline file stays small.
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "words_without_stopwords" });

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    // Compare the saved output with the expected baseline, then mark the test done.
    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}