コード例 #1
0
        /// <summary>
        /// Builds the token stream used to analyze the text supplied by <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field; not consulted by this method.</param>
        /// <param name="reader">The reader supplying the text to tokenize.</param>
        /// <returns>
        /// A <c>CustomTokenizer</c> over <paramref name="reader"/>, wrapped in a
        /// <c>LowerCaseFilter</c> when <c>_ignoreCase</c> is set and then in a
        /// <c>PorterStemFilter</c> when <c>_enableStemming</c> is set.
        /// </returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Start from the raw tokenizer and layer the optional filters on top of it.
            TokenStream pipeline = new CustomTokenizer(reader, _separatorChars);

            if (_ignoreCase)
            {
                pipeline = new LowerCaseFilter(pipeline);
            }

            if (_enableStemming)
            {
                pipeline = new PorterStemFilter(pipeline);
            }

            return pipeline;
        }
コード例 #2
0
ファイル: Smart.cs プロジェクト: renyh1013/dp2
 /// <summary>
 /// Builds the analysis chain for a field: sentence tokenization, word segmentation,
 /// Porter stemming and, when <c>stopWords</c> is set, stop-word removal.
 /// </summary>
 /// <param name="fieldName">Name of the field; not consulted by this method.</param>
 /// <param name="reader">Reader supplying the text to analyze.</param>
 /// <returns>The outermost token stream of the chain.</returns>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     // Split the input into sentences, then segment each sentence into words.
     TokenStream sentences = new SentenceTokenizer(reader);
     TokenStream words = new WordTokenizer(sentences, wordSegment);

     // No LowerCaseFilter is applied here: per the original author's note,
     // SegTokenFilter already converts all English characters to lower case.
     // The author also remarks that the stemming is very strict -- that is
     // intentional (a feature, not a bug).
     TokenStream stemmed = new PorterStemFilter(words);

     if (stopWords == null)
     {
         return stemmed;
     }

     return new StopFilter(true, stemmed, StopFilter.MakeStopSet(stopWords), false);
 }
コード例 #3
0
 /// <summary>
 /// Builds the analysis chain for a field: standard tokenization, lower-casing,
 /// Porter stemming, and finally stop-word removal against <c>stopSet</c>.
 /// </summary>
 /// <param name="fieldName">Name of the field; not consulted by this method.</param>
 /// <param name="reader">Reader supplying the text to analyze.</param>
 /// <returns>The outermost token stream of the chain (a <c>StopFilter</c>).</returns>
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream stream = new StandardTokenizer(reader);

     // Lower-case BEFORE stemming: the Porter stemmer only recognizes lower-case
     // suffixes, so feeding it mixed-case tokens leaves words such as "Running"
     // unstemmed and inconsistent with "running".
     stream = new LowerCaseFilter(stream);
     stream = new PorterStemFilter(stream);

     // Stop-word removal stays last, so it still sees lower-cased, stemmed terms.
     return new StopFilter(stream, stopSet);
 }
コード例 #4
0
ファイル: StringUtils.cs プロジェクト: KommuSoft/MLTag
 /// <summary>
 /// Lazily tokenizes <paramref name="text"/> with a standard tokenizer, lower-cases
 /// each token, applies Porter stemming, and yields the resulting term texts.
 /// </summary>
 /// <param name="text">The text to tokenize.</param>
 /// <returns>
 /// A deferred sequence of token texts; no work is done until it is enumerated.
 /// </returns>
 public static IEnumerable<string> InnerGetLuceneTokens(string text)
 {
     // The using statement compiles into the iterator's finally block, so the
     // reader is closed when enumeration finishes, when the consumer stops early
     // and disposes the enumerator, or when a filter throws. A trailing Close()
     // call would only run if the sequence were enumerated to the very end.
     using (TextReader tr = new StringReader(text))
     {
         TokenStream tok = new Lucene.Net.Analysis.Standard.StandardTokenizer(tr);
         tok = new LowerCaseFilter(tok);
         tok = new PorterStemFilter(tok);

         // Next() returns null once the stream is exhausted.
         Token t = tok.Next();
         while (t != null) {
             yield return t.TermText();
             t = tok.Next();
         }
     }
 }
コード例 #5
0
ファイル: NGramAnalyzer.cs プロジェクト: jncc/topcat
        /// <summary>
        /// Builds the analysis chain for a field: lower-case tokenization, the
        /// standard filter, then Porter stemming.
        /// </summary>
        /// <param name="fieldName">Name of the field; not consulted by this method.</param>
        /// <param name="reader">Reader supplying the text to analyze.</param>
        /// <returns>The outermost token stream of the chain (a <c>PorterStemFilter</c>).</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            // Each stage wraps the previous one; the tokenizer already emits lower-case
            // tokens, so the stemmer receives lower-cased input.
            TokenStream lowerCased = new LowerCaseTokenizer(reader);
            TokenStream normalized = new StandardFilter(lowerCased);

            return new PorterStemFilter(normalized);
        }