Esempio n. 1
0
        /// <summary>
        /// Builds the analysis chain: a <see cref="StandardTokenizer"/> wrapped by a
        /// <see cref="StandardFilter"/>, a lower-casing filter, a <see cref="StopFilter"/>
        /// (only when a stop set is configured) and finally a <see cref="SnowballFilter"/>
        /// for the configured stemmer name.
        /// </summary>
        /// <remarks>
        /// From <see cref="LuceneVersion.LUCENE_31"/> onwards two stemmer-specific adjustments apply:
        /// an <see cref="EnglishPossessiveFilter"/> is inserted for the "English", "Porter" and
        /// "Lovins" stemmers, and a <see cref="TurkishLowerCaseFilter"/> takes the place of the
        /// <see cref="LowerCaseFilter"/> for the "Turkish" stemmer.
        /// </remarks>
        /// <param name="fieldName">Name of the field being analyzed; not consulted by this implementation.</param>
        /// <param name="reader">Reader supplying the text to tokenize.</param>
        /// <returns>The tokenizer together with the outermost filter of the chain.</returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, source);

            // Both stemmer-specific adjustments below are gated on the same version check.
            bool atLeast31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

            // English-family stemmers: strip the possessive 's before anything else.
            bool englishFamily = atLeast31
                && (name.Equals("English", StringComparison.Ordinal)
                    || name.Equals("Porter", StringComparison.Ordinal)
                    || name.Equals("Lovins", StringComparison.Ordinal));
            if (englishFamily)
            {
                chain = new EnglishPossessiveFilter(chain);
            }

            bool turkish = atLeast31 && name.Equals("Turkish", StringComparison.Ordinal);
            if (!turkish)
            {
                chain = new LowerCaseFilter(matchVersion, chain);
            }
            else
            {
                // The Turkish stemmer expects Turkish-specific lowercasing.
                chain = new TurkishLowerCaseFilter(chain);
            }

            // Stop-word removal is optional: skipped entirely when no stop set was supplied.
            if (stopSet != null)
            {
                chain = new StopFilter(matchVersion, chain, stopSet);
            }

            return new TokenStreamComponents(source, new SnowballFilter(chain, name));
        }
        /// <summary>
        /// Builds the analysis chain: a <see cref="StandardTokenizer"/> followed, in order, by
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>,
        /// <see cref="ASCIIFoldingFilter"/>, <see cref="LowerCaseFilter"/>,
        /// <see cref="StopFilter"/> (using this analyzer's stop words) and
        /// <see cref="PorterStemFilter"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not consulted by this implementation.</param>
        /// <param name="reader">Reader supplying the text to tokenize.</param>
        /// <returns>The tokenizer together with the outermost filter of the chain.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

            // Normalization stages, innermost first: standard filter, possessive
            // stripping, ASCII folding, lowercasing.
            TokenStream normalized = new LowerCaseFilter(
                m_matchVersion,
                new ASCIIFoldingFilter(
                    new EnglishPossessiveFilter(
                        m_matchVersion,
                        new StandardFilter(m_matchVersion, tokenizer))));

            // Stop words are removed before stemming, so the stop list is matched
            // against unstemmed (but normalized) terms.
            TokenStream stemmed = new PorterStemFilter(
                new StopFilter(m_matchVersion, normalized, m_stopwords));

            return new TokenStreamComponents(tokenizer, stemmed);
        }
Esempio n. 3
0
        /// <summary>
        /// Builds an English analysis chain: a <see cref="StandardTokenizer"/> followed, in order, by
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>,
        /// <see cref="ASCIIFoldingFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>
        /// (with <see cref="EnglishAnalyzer.DefaultStopSet"/>) and <see cref="PorterStemFilter"/>.
        /// When n-gram output is enabled, an <see cref="EdgeNGramTokenFilter"/> is appended last.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not consulted by this implementation.</param>
        /// <param name="reader">Reader supplying the text to tokenize.</param>
        /// <returns>The tokenizer together with the outermost filter of the chain.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

            // Strip the possessive 's from words.
            chain = new EnglishPossessiveFilter(m_matchVersion, chain);

            // Fold accented and other non-ASCII characters to ASCII (e.g. é to e, © to (c)).
            chain = new ASCIIFoldingFilter(chain);

            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, EnglishAnalyzer.DefaultStopSet);

            // Remove common word suffixes (e.g. "stemming" loses its "ming").
            TokenStream stemmed = new PorterStemFilter(chain);

            if (!_userNGram)
            {
                return new TokenStreamComponents(tokenizer, stemmed);
            }

            // The edge n-gram filter runs on the already-stemmed tokens; _ngramMin and
            // _ngramMax are passed as its gram-size bounds.
            TokenStream grams = new EdgeNGramTokenFilter(m_matchVersion, stemmed, _ngramMin, _ngramMax);
            return new TokenStreamComponents(tokenizer, grams);
        }