/// <summary>
/// Feeds a LetterTokenizer through a ShingleFilter and then an
/// EdgeNGramTokenFilter (gram sizes 7 to 10) and checks the tokens that come
/// out of the combined chain.
/// </summary>
public virtual void TestGraphs()
{
    // Each stage wraps the previous one; the last stage is what gets asserted on.
    TokenStream letters = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    TokenStream shingles = new ShingleFilter(letters);
    TokenStream edgeGrams = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, shingles, 7, 10);

    // NOTE(review): array roles below follow the argument order of the
    // terms/startOffsets/endOffsets/posIncrements/posLengths/finalOffset
    // overload of AssertTokenStreamContents - confirm against the test base class.
    string[] expectedTerms = { "efgh ij", "ij klmn", "ij klmno", "klmno p" };
    int[] expectedStartOffsets = { 6, 11, 11, 14 };
    int[] expectedEndOffsets = { 13, 19, 19, 21 };
    int[] expectedPositionIncrements = { 3, 1, 0, 1 };
    int[] expectedPositionLengths = { 2, 2, 2, 2 };

    AssertTokenStreamContents(
        edgeGrams,
        expectedTerms,
        expectedStartOffsets,
        expectedEndOffsets,
        expectedPositionIncrements,
        expectedPositionLengths,
        23);
}
/// <summary>
/// Lazily yields the terms that a <c>LetterTokenizer</c> produces for
/// <paramref name="value"/>.
/// </summary>
/// <param name="value">Text to tokenize; it is wrapped in a <see cref="StringReader"/>.</param>
/// <returns>
/// One string per token, in emission order. This is an iterator method, so
/// nothing runs (including construction of the reader) until enumeration starts.
/// </returns>
private static IEnumerable<string> Tokenize(string value)
{
    using (StringReader stringReader = new StringReader(value))
    using (LetterTokenizer letterTokenizer = new LetterTokenizer(LuceneVersion.LUCENE_48, stringReader))
    {
        // Look the attribute up once; the same instance is updated in place by
        // every IncrementToken() call, so there is no need to fetch it per token.
        ICharTermAttribute termAttribute = letterTokenizer.GetAttribute<ICharTermAttribute>();

        letterTokenizer.Reset();
        while (letterTokenizer.IncrementToken())
        {
            yield return termAttribute.ToString();
        }

        // Complete the consumer workflow (Reset -> IncrementToken* -> End -> Dispose).
        // Only reached when the caller enumerates to the end; an early break
        // still disposes both resources through the using blocks.
        letterTokenizer.End();
    }
}
/// <summary>
/// Runs <paramref name="inputString"/> through a <c>LetterTokenizer</c> followed by a
/// <c>LengthFilter</c> constructed with bounds 2 and <c>int.MaxValue</c>, and collects
/// the term text of every token that comes out of that chain.
/// </summary>
/// <param name="inputString">Text to tokenize; it is wrapped in a <see cref="StringReader"/>.</param>
/// <returns>An <see cref="ArrayList"/> holding the surviving terms, in emission order.</returns>
public static ArrayList removePunctuation(String inputString)
{
    ArrayList tokenizedString = new ArrayList();

    StringReader reader = new StringReader(inputString);
    Tokenizer letterTokenizer = new LetterTokenizer(reader);

    // Dispose only the outermost stream, exactly as the original did, but do it
    // through using so the chain is also released when Reset/IncrementToken throws.
    using (TokenStream tokenStream = new LengthFilter(letterTokenizer, 2, int.MaxValue))
    {
        var termAttribute = tokenStream.GetAttribute<ITermAttribute>();

        tokenStream.Reset();
        while (tokenStream.IncrementToken())
        {
            tokenizedString.Add(termAttribute.Term);
        }

        tokenStream.End();
    }

    return tokenizedString;
}
/// <summary>
/// Builds the analysis components for a field: a single <c>LetterTokenizer</c>
/// over <paramref name="reader"/> with no further filtering.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not used here.</param>
/// <param name="reader">Character source handed to the tokenizer.</param>
/// <returns>Components whose two constructor arguments are the same tokenizer instance.</returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new LetterTokenizer(TEST_VERSION_CURRENT, reader);

    // No filters are stacked on top, so the tokenizer is passed for both arguments.
    TokenStreamComponents components = new TokenStreamComponents(source, source);
    return components;
}
/// <summary>
/// Creates the token stream components for a field using only a
/// <c>LetterTokenizer</c> that reads from <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Field name supplied by the caller; ignored by this implementation.</param>
/// <param name="reader">Input that the tokenizer consumes.</param>
/// <returns>A components object constructed with the one tokenizer passed twice.</returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    // A single instance is needed because it is handed over as both arguments.
    Tokenizer letterTokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);

    TokenStreamComponents result = new TokenStreamComponents(letterTokenizer, letterTokenizer);
    return result;
}