/// <summary>
/// Splits <paramref name="text"/> into sentences and creates a words tokenizer for each
/// resulting sentence.
/// </summary>
/// <remarks>
/// A sentence is held back and prepended to the next emitted sentence when it contains at
/// most two letters/digits, or when the following raw sentence has a '.' within its first
/// four characters. NOTE(review): this looks like a heuristic for re-joining abbreviations
/// and initials that the splitter cut apart - confirm against the splitter's behavior.
/// </remarks>
/// <param name="text">The text to parse. Must not be null or empty.</param>
/// <returns>
/// A lazily evaluated sequence of tokenizers. Sentences for which the factory returns
/// <c>NullWordsTokenizer.Instance</c> are omitted.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="text"/> is null or empty. Thrown when the method is called, not when the
/// result is first enumerated.
/// </exception>
public IEnumerable<IWordsTokenizer> Parse(string text)
{
    // Validate here, outside the iterator: an iterator body does not start running until
    // the sequence is enumerated, which would delay the exception past the faulty call.
    if (string.IsNullOrEmpty(text))
    {
        throw new ArgumentException("Value cannot be null or empty.", nameof(text));
    }

    return ParseIterator(text);
}

// Lazy part of Parse: merges held-back fragments and yields one tokenizer per sentence.
private IEnumerable<IWordsTokenizer> ParseIterator(string text)
{
    string[] sentences = splitter.Split(text).ToArray();
    string saved = string.Empty;

    for (int i = 0; i < sentences.Length; i++)
    {
        string currentSentence = sentences[i].Trim();

        // Remove any spaces sitting directly before the final character ("Hello ." -> "Hello.").
        while (currentSentence.Length > 1 && currentSentence[currentSentence.Length - 2] == ' ')
        {
            currentSentence = currentSentence.Remove(currentSentence.Length - 2, 1);
        }

        if (string.IsNullOrWhiteSpace(currentSentence))
        {
            continue;
        }

        // Only a sentence that has a successor can be held back; the last one is always emitted.
        if (i < sentences.Length - 1)
        {
            string nextSentence = sentences[i + 1];
            bool holdBack = currentSentence.Count(char.IsLetterOrDigit) <= 2;
            if (!holdBack)
            {
                // Look for a '.' within the first four characters of the (untrimmed) next sentence.
                for (int j = 0; j < nextSentence.Length && j <= 3; j++)
                {
                    if (nextSentence[j] == '.')
                    {
                        holdBack = true;
                        break;
                    }
                }
            }

            if (holdBack)
            {
                // Consecutive held-back fragments are concatenated without a separator.
                saved += currentSentence;
                continue;
            }
        }

        if (!string.IsNullOrWhiteSpace(saved))
        {
            currentSentence = saved + " " + currentSentence;
        }

        IWordsTokenizer wordsTokenizer = TokenizerFactory.Create(currentSentence);
        saved = string.Empty;
        if (wordsTokenizer != NullWordsTokenizer.Instance)
        {
            yield return wordsTokenizer;
        }
    }

    // Fragments can still be pending here when only blank sentences followed them;
    // emit them instead of silently dropping that text.
    if (!string.IsNullOrWhiteSpace(saved))
    {
        IWordsTokenizer trailingTokenizer = TokenizerFactory.Create(saved);
        if (trailingTokenizer != NullWordsTokenizer.Instance)
        {
            yield return trailingTokenizer;
        }
    }
}
public void CreateNull()
{
    // Arrange: a factory wired with the grouped pattern and empty pipelines.
    var factory = new WordsTokenizerFactory(
        WordsTokenizerFactory.Grouped,
        new SimpleWordItemFactory(Global.PosTagger, Global.Raw),
        new CombinedPipeline<string>(),
        new CombinedPipeline<WordEx>());

    // Act + Assert: a null sentence must produce the null-object tokenizer.
    Assert.IsInstanceOf<NullWordsTokenizer>(factory.Create(null));
}
public void Create()
{
    // Arrange: a factory wired with the grouped pattern and empty pipelines.
    var factory = new WordsTokenizerFactory(
        WordsTokenizerFactory.Grouped,
        new SimpleWordItemFactory(Global.PosTagger, Global.Raw),
        new CombinedPipeline<string>(),
        new CombinedPipeline<WordEx>());

    // Act: tokenize a two-word sentence and materialize its words.
    IWordsTokenizer result = factory.Create("Test words");
    string[] actual = result.GetWords().ToArray();

    // Assert: exactly the two words come back, in their original order.
    Assert.AreEqual(2, actual.Length);
    Assert.AreEqual("Test", actual[0]);
    Assert.AreEqual("words", actual[1]);
}