/// <summary>
/// Tokenizes the text contained in the specified <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field (not used by this analyzer).</param>
/// <param name="reader">The reader supplying the text to tokenize.</param>
/// <returns>
/// A token stream split on the configured separator characters, optionally
/// lower-cased and Porter-stemmed depending on the analyzer's settings.
/// </returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Base tokenizer splits the input on the configured separator characters.
    TokenStream stream = new CustomTokenizer(reader, _separatorChars);

    // An explicitly typed variable makes the original's redundant
    // `as TokenStream` up-casts (used only to satisfy the ternary) unnecessary.
    if (_ignoreCase)
    {
        stream = new LowerCaseFilter(stream);
    }

    if (_enableStemming)
    {
        stream = new PorterStemFilter(stream);
    }

    return stream;
}
/// <summary>
/// Builds the analysis chain for Chinese text: sentence segmentation,
/// word segmentation, Porter stemming, then optional stop-word filtering.
/// </summary>
/// <param name="fieldName">Field name (not used by this analyzer).</param>
/// <param name="reader">Reader supplying the raw text.</param>
/// <returns>The fully filtered token stream.</returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Split the raw text into sentences, then into words via the configured segmenter.
    TokenStream stream = new WordTokenizer(new SentenceTokenizer(reader), wordSegment);

    // No LowerCaseFilter needed here: SegTokenFilter already lower-cases all
    // English characters (translated from the original Chinese comment).
    // Stemming is intentionally strict — "not a bug, a feature" per the original author.
    stream = new PorterStemFilter(stream);

    if (stopWords == null)
    {
        return stream;
    }

    return new StopFilter(true, stream, StopFilter.MakeStopSet(stopWords), false);
}
/// <summary>
/// Builds the token stream: standard tokenization, lower-casing,
/// Porter stemming, then stop-word removal.
/// </summary>
/// <param name="fieldName">Field name (not used by this analyzer).</param>
/// <param name="reader">Reader supplying the text.</param>
/// <returns>The filtered token stream.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream stream = new StandardTokenizer(reader);

    // FIX: lower-case BEFORE stemming. Lucene's PorterStemFilter documents
    // that its input must already be lower-cased; the original applied the
    // stemmer first, so capitalized tokens were stemmed incorrectly.
    stream = new LowerCaseFilter(stream);
    stream = new PorterStemFilter(stream);

    return new StopFilter(stream, stopSet);
}
/// <summary>
/// Tokenizes <paramref name="text"/> with the standard Lucene pipeline
/// (standard tokenizer, lower-casing, Porter stemming) and yields each
/// resulting term.
/// </summary>
/// <param name="text">The raw text to tokenize.</param>
/// <returns>A lazily evaluated sequence of lower-cased, stemmed terms.</returns>
public static IEnumerable<string> InnerGetLuceneTokens(string text)
{
    // FIX: the original called tr.Close() only after the loop completed, so the
    // reader leaked whenever the caller stopped enumerating early or the pipeline
    // threw. `using` in an iterator guarantees disposal via the iterator's
    // Dispose, covering partial enumeration and exceptions alike.
    using (TextReader tr = new StringReader(text))
    {
        TokenStream tok = new Lucene.Net.Analysis.Standard.StandardTokenizer(tr);
        tok = new LowerCaseFilter(tok);
        tok = new PorterStemFilter(tok);

        Token t = tok.Next();
        while (t != null)
        {
            yield return t.TermText();
            t = tok.Next();
        }
    }
}
/// <summary>
/// Builds the token stream: lower-case tokenization, standard filtering,
/// then Porter stemming.
/// </summary>
/// <param name="fieldName">Field name (not used by this analyzer).</param>
/// <param name="reader">Reader supplying the text.</param>
/// <returns>The stemmed token stream.</returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Tokenize and lower-case in one step, then apply the standard filter.
    TokenStream stream = new LowerCaseTokenizer(reader);
    stream = new StandardFilter(stream);

    return new PorterStemFilter(stream);
}