/// <summary>
/// Builds a <see cref="SentenceTokenizer"/> backed by a <see cref="WordsTokenizerFactory"/>
/// whose word-level pipeline is assembled from the supplied flags.
/// </summary>
/// <param name="wordPattern">Pattern handed unchanged to the <see cref="WordsTokenizerFactory"/>.</param>
/// <param name="simple">When <c>false</c>, an <see cref="InvertorPipeline"/> is placed first in the word pipeline.</param>
/// <param name="removeStopWords">
/// When <c>true</c>, a <see cref="StopWordItemPipeline"/> is added, followed by filters that drop
/// separation symbols and conjunctions.
/// </param>
/// <returns>A sentence tokenizer wrapping the configured words tokenizer factory.</returns>
public ISentenceTokenizer Create(string wordPattern, bool simple, bool removeStopWords)
{
    // Stages run in insertion order, so the order of the Add calls below matters.
    var wordStages = new List<IPipeline<WordEx>>();

    if (!simple)
    {
        wordStages.Add(new InvertorPipeline());
    }

    if (removeStopWords)
    {
        wordStages.Add(new StopWordItemPipeline());
        wordStages.Add(new WordItemFilterOutPipeline(word => word.Tag.WordType == WordType.SeparationSymbol));
        wordStages.Add(new WordItemFilterOutPipeline(word => word.IsConjunction()));
    }

    // Sentence-final punctuation is always filtered out, regardless of the flags.
    wordStages.Add(new WordItemFilterOutPipeline(word => word.Tag == SentenceFinalPunctuation.Instance));

    var wordItemFactory = new SimpleWordItemFactory(tagger, raw);
    var textPipeline = new CombinedPipeline<string>(
        new LowerCasePipeline(),
        new WordCleanupPipeline(),
        new PunctuationPipeline());
    var wordPipeline = new CombinedPipeline<WordEx>(wordStages.ToArray());

    var tokenizerFactory = new WordsTokenizerFactory(
        wordPattern,
        wordItemFactory,
        textPipeline,
        wordPipeline);

    return new SentenceTokenizer(tokenizerFactory);
}
/// <summary>
/// Verifies that asking the factory to tokenize a <c>null</c> sentence produces a
/// <see cref="NullWordsTokenizer"/>.
/// </summary>
public void CreateNull()
{
    // Arrange: grouped pattern with empty string and word pipelines.
    var wordItemFactory = new SimpleWordItemFactory(Global.PosTagger, Global.Raw);
    var textPipeline = new CombinedPipeline<string>();
    var wordPipeline = new CombinedPipeline<WordEx>();
    var factory = new WordsTokenizerFactory(
        WordsTokenizerFactory.Grouped,
        wordItemFactory,
        textPipeline,
        wordPipeline);

    // Act
    IWordsTokenizer result = factory.Create(null);

    // Assert
    Assert.IsInstanceOf<NullWordsTokenizer>(result);
}
/// <summary>
/// Verifies that tokenizing "Test words" with the grouped pattern and empty pipelines
/// yields exactly the two words, in order and with their casing untouched.
/// </summary>
public void Create()
{
    // Arrange: grouped pattern with empty string and word pipelines.
    var wordItemFactory = new SimpleWordItemFactory(Global.PosTagger, Global.Raw);
    var textPipeline = new CombinedPipeline<string>();
    var wordPipeline = new CombinedPipeline<WordEx>();
    var factory = new WordsTokenizerFactory(
        WordsTokenizerFactory.Grouped,
        wordItemFactory,
        textPipeline,
        wordPipeline);

    // Act
    IWordsTokenizer result = factory.Create("Test words");
    string[] tokens = result.GetWords().ToArray();

    // Assert
    Assert.AreEqual(2, tokens.Length);
    Assert.AreEqual("Test", tokens[0]);
    Assert.AreEqual("words", tokens[1]);
}