/// <summary>
/// Builds the analysis chain for this analyzer: a <see cref="StandardTokenizer"/> whose output
/// is passed through a <see cref="StandardFilter"/>, a lowercasing filter
/// (<see cref="TurkishLowerCaseFilter"/> or <see cref="LowerCaseFilter"/>), an optional
/// <see cref="StopFilter"/>, and finally a <see cref="SnowballFilter"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this chain.</param>
/// <param name="reader">Source of the text handed to the tokenizer.</param>
/// <returns>The tokenizer paired with the last filter of the chain.</returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);

    // Both language-specific tweaks below only apply from LUCENE_31 onwards.
    bool atLeast31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

    // English-family stemmers get the possessive 's stripped before stemming.
    bool englishStemmer = atLeast31
        && (name.Equals("English", StringComparison.Ordinal)
            || name.Equals("Porter", StringComparison.Ordinal)
            || name.Equals("Lovins", StringComparison.Ordinal));
    if (englishStemmer)
    {
        chain = new EnglishPossessiveFilter(chain);
    }

    // The Turkish stemmer expects its dedicated lowercase filter; everything else
    // uses the general-purpose one.
    bool turkishStemmer = atLeast31 && name.Equals("Turkish", StringComparison.Ordinal);
    if (turkishStemmer)
    {
        chain = new TurkishLowerCaseFilter(chain);
    }
    else
    {
        chain = new LowerCaseFilter(matchVersion, chain);
    }

    // Stop-word removal is skipped entirely when no stop set was configured.
    if (stopSet != null)
    {
        chain = new StopFilter(matchVersion, chain, stopSet);
    }

    chain = new SnowballFilter(chain, name);
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Builds the analysis chain: a <see cref="StandardTokenizer"/> followed, in order, by
/// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>,
/// <see cref="ASCIIFoldingFilter"/>, <see cref="LowerCaseFilter"/>, a <see cref="StopFilter"/>
/// using <c>m_stopwords</c>, and <see cref="PorterStemFilter"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this chain.</param>
/// <param name="reader">Source of the text handed to the tokenizer.</param>
/// <returns>The tokenizer paired with the last filter of the chain.</returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

    // Each stage wraps the previous one; the order here is the order tokens flow through.
    TokenStream standard = new StandardFilter(m_matchVersion, tokenizer);
    TokenStream possessiveStripped = new EnglishPossessiveFilter(m_matchVersion, standard);
    TokenStream asciiFolded = new ASCIIFoldingFilter(possessiveStripped);
    TokenStream lowerCased = new LowerCaseFilter(m_matchVersion, asciiFolded);
    TokenStream stopFiltered = new StopFilter(m_matchVersion, lowerCased, m_stopwords);
    TokenStream stemmed = new PorterStemFilter(stopFiltered);

    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the analysis chain: a <see cref="StandardTokenizer"/> followed by
/// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>,
/// <see cref="ASCIIFoldingFilter"/>, <see cref="LowerCaseFilter"/>, a <see cref="StopFilter"/>
/// using <see cref="EnglishAnalyzer.DefaultStopSet"/>, and <see cref="PorterStemFilter"/>.
/// When <c>_userNGram</c> is set, an <see cref="EdgeNGramTokenFilter"/> is appended as the last stage.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this chain.</param>
/// <param name="reader">Source of the text handed to the tokenizer.</param>
/// <returns>The tokenizer paired with the last filter of the chain.</returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

    // Strip the possessive 's from words.
    chain = new EnglishPossessiveFilter(m_matchVersion, chain);

    // Fold non-ASCII characters to ASCII equivalents (é to e, © to (c), etc.).
    chain = new ASCIIFoldingFilter(chain);

    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, EnglishAnalyzer.DefaultStopSet);

    // Chop off common word suffixes, e.g. reducing "stemming" to "stem".
    chain = new PorterStemFilter(chain);

    if (!_userNGram)
    {
        return new TokenStreamComponents(tokenizer, chain);
    }

    // Optional last stage: a token filter (not a tokenizer) applied to the already-stemmed
    // tokens, configured with the _ngramMin and _ngramMax gram sizes.
    chain = new EdgeNGramTokenFilter(m_matchVersion, chain, _ngramMin, _ngramMax);
    return new TokenStreamComponents(tokenizer, chain);
}