/// <summary>
        /// Builds a <see cref="SentenceTokenizer"/> backed by a <see cref="WordsTokenizerFactory"/>
        /// that is configured from the supplied options.
        /// </summary>
        /// <param name="wordPattern">Pattern handed unchanged to the <see cref="WordsTokenizerFactory"/>.</param>
        /// <param name="simple">When <c>false</c>, an <see cref="InvertorPipeline"/> is placed first in the word-item pipeline.</param>
        /// <param name="removeStopWords">
        /// When <c>true</c>, a <see cref="StopWordItemPipeline"/> is added together with filters
        /// matching separation symbols and conjunctions.
        /// </param>
        /// <returns>A new sentence tokenizer wrapping the configured words tokenizer factory.</returns>
        public ISentenceTokenizer Create(string wordPattern, bool simple, bool removeStopWords)
        {
            // Word-item stages are collected in the order they are registered below.
            var wordStages = new List<IPipeline<WordEx>>();

            if (!simple)
            {
                wordStages.Add(new InvertorPipeline());
            }

            if (removeStopWords)
            {
                wordStages.AddRange(
                    new IPipeline<WordEx>[]
                    {
                        new StopWordItemPipeline(),
                        new WordItemFilterOutPipeline(item => item.Tag.WordType == WordType.SeparationSymbol),
                        new WordItemFilterOutPipeline(item => item.IsConjunction())
                    });
            }

            // Registered regardless of the options: a filter matching items tagged
            // with the sentence-final punctuation singleton.
            wordStages.Add(new WordItemFilterOutPipeline(item => item.Tag == SentenceFinalPunctuation.Instance));

            var itemFactory = new SimpleWordItemFactory(tagger, raw);
            var textPipeline = new CombinedPipeline<string>(
                new LowerCasePipeline(),
                new WordCleanupPipeline(),
                new PunctuationPipeline());
            var wordPipeline = new CombinedPipeline<WordEx>(wordStages.ToArray());

            var tokenizerFactory = new WordsTokenizerFactory(wordPattern, itemFactory, textPipeline, wordPipeline);
            return new SentenceTokenizer(tokenizerFactory);
        }
        /// <summary>
        /// Checks that asking the factory for a tokenizer with a <c>null</c> argument
        /// produces a <see cref="NullWordsTokenizer"/>.
        /// </summary>
        public void CreateNull()
        {
            // Factory with the grouped pattern and empty string / word-item pipelines.
            var pattern = WordsTokenizerFactory.Grouped;
            var itemFactory = new SimpleWordItemFactory(Global.PosTagger, Global.Raw);
            var textPipeline = new CombinedPipeline<string>();
            var wordPipeline = new CombinedPipeline<WordEx>();
            var factory = new WordsTokenizerFactory(pattern, itemFactory, textPipeline, wordPipeline);

            IWordsTokenizer result = factory.Create(null);

            Assert.IsInstanceOf<NullWordsTokenizer>(result);
        }
        /// <summary>
        /// Checks that a tokenizer created for "Test words", using the grouped pattern and
        /// empty pipelines, yields exactly the two words "Test" and "words" in that order.
        /// </summary>
        public void Create()
        {
            // Factory with the grouped pattern and empty string / word-item pipelines.
            var pattern = WordsTokenizerFactory.Grouped;
            var itemFactory = new SimpleWordItemFactory(Global.PosTagger, Global.Raw);
            var textPipeline = new CombinedPipeline<string>();
            var wordPipeline = new CombinedPipeline<WordEx>();
            var factory = new WordsTokenizerFactory(pattern, itemFactory, textPipeline, wordPipeline);

            IWordsTokenizer wordsTokenizer = factory.Create("Test words");
            string[] actual = wordsTokenizer.GetWords().ToArray();

            Assert.AreEqual(2, actual.Length);
            Assert.AreEqual("Test", actual[0]);
            Assert.AreEqual("words", actual[1]);
        }