Пример #1
0
        /// <summary>
        /// Builds the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         (when a stem exclusion set is provided) and a Norwegian
        ///         <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

            TokenStream chain = new StandardFilter(matchVersion, tokenizer);
            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Mark explicitly excluded terms as keywords so the stemmer skips them.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new SnowballFilter(chain, new NorwegianStemmer());
            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #2
0
        /// <summary>
        /// Produces a <see cref="TokenStream"/> for the given reader: standard
        /// tokenization, standard/lowercase/English-stop filtering, English
        /// Snowball stemming, and finally front-edge n-grams sized between
        /// _mingram and _maxgram characters.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Tokenize the raw input.
            var tokenizer = new StandardTokenizer(_version, reader);

            // Standard -> lowercase -> English stop-word filtering.
            var normalized = new LowerCaseFilter(new StandardFilter(tokenizer));
            var stopped = new StopFilter(true, normalized, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            // English Snowball stemming.
            var stemmed = new SnowballFilter(stopped, new EnglishStemmer());

            // Emit edge n-grams from the front of each token.
            return new EdgeNGramTokenFilter(stemmed, Side.FRONT, _mingram, _maxgram);
        }
Пример #3
0
        /// <summary>
        /// Builds the analysis chain: <see cref="StandardTokenizer"/> followed by
        /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, English
        /// <see cref="StopFilter"/>, <see cref="PorterStemFilter"/> and a Snowball
        /// <see cref="EnglishStemmer"/>.
        /// </summary>
        /// <returns>The <see cref="TokenStreamComponents"/> for this analyzer.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream = new StandardFilter(matchVersion, tokenizer);

            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            // NOTE(review): tokens are stemmed twice here (Porter, then Snowball
            // English). The second pass is usually redundant — confirm the double
            // stemming is intentional before relying on this analyzer's output.
            stream = new PorterStemFilter(stream);
            stream = new SnowballFilter(stream, new EnglishStemmer());

            return new TokenStreamComponents(tokenizer, stream);
        }
Пример #4
0
	  /// <summary>
	  /// Creates a
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> A
	  ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  ///         built from an <seealso cref="StandardTokenizer"/> filtered with
	  ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
	  ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
	  ///         provided and a Finnish <seealso cref="SnowballFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		Tokenizer source = new StandardTokenizer(matchVersion, reader);
		TokenStream result = new StandardFilter(matchVersion, source);
		result = new LowerCaseFilter(matchVersion, result);
		result = new StopFilter(matchVersion, result, stopwords);
		// Terms in the exclusion set are marked as keywords so the stemmer skips them.
		if (!stemExclusionSet.Empty)
		{
		  result = new SetKeywordMarkerFilter(result, stemExclusionSet);
		}
		result = new SnowballFilter(result, new FinnishStemmer());
		return new TokenStreamComponents(source, result);
	  }
Пример #5
0
        /// <summary>
        /// Builds the analysis chain: <see cref="StandardTokenizer"/> followed by
        /// <see cref="LowerCaseFilter"/>, <see cref="CyrllicToLatinFilter"/>, a
        /// <see cref="StopFilter"/> over STOP_WORDS, a Serbian Snowball stemmer
        /// and <see cref="ASCIIFoldingFilter"/>.
        /// </summary>
        /// <returns>The <see cref="TokenStreamComponents"/> for this analyzer.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);

            stream = new CyrllicToLatinFilter(stream);
            stream = new StopFilter(matchVersion, stream, StopFilter.MakeStopSet(matchVersion, STOP_WORDS));
            stream = new SnowballFilter(stream, new SimpleSerbianStemmer());
            stream = new ASCIIFoldingFilter(stream);

            return new TokenStreamComponents(tokenizer, stream);
        }
Пример #6
0
        /// <summary>
        /// Produces a <see cref="TokenStream"/> for the given reader: standard
        /// tokenization (Lucene 2.9 semantics), lowercasing, optional stop-word
        /// removal, ASCII folding and English Snowball stemming.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            stream = new LowerCaseFilter(stream);

            // Stop-word filtering is skipped entirely when no list is configured.
            if (STOP_WORDS != null)
            {
                stream = new StopFilter(false, stream, STOP_WORDS);
            }

            stream = new ASCIIFoldingFilter(stream);
            return new SnowballFilter(stream, "English");
        }
Пример #7
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> that tokenizes all the
 /// text in the provided <see cref="TextReader"/>.
 /// </summary>
 /// <returns> A <see cref="TokenStreamComponents"/> built from a
 ///         <see cref="StandardTokenizer"/> filtered with <see cref="StandardFilter"/>,
 ///         <see cref="ApostropheFilter"/> (4.8+), <see cref="TurkishLowerCaseFilter"/>,
 ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (when a
 ///         stem exclusion set is provided) and a Turkish
 ///         <see cref="SnowballFilter"/>. </returns>
 protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
     TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

     // The apostrophe stage only participates from Lucene 4.8 onward.
     if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
     {
         chain = new ApostropheFilter(chain);
     }

     chain = new TurkishLowerCaseFilter(chain);
     chain = new StopFilter(m_matchVersion, chain, m_stopwords);

     // Mark explicitly excluded terms as keywords so the stemmer skips them.
     if (stemExclusionSet.Count > 0)
     {
         chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
     }

     chain = new SnowballFilter(chain, new TurkishStemmer());
     return new TokenStreamComponents(tokenizer, chain);
 }
Пример #8
0
        /// <summary>
        /// Builds the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// that tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/> (3.6+),
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> (when a stem exclusion set is
        ///         provided) and a Catalan <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            // Elision stripping only participates from Lucene 3.6 onward.
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
            }

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Mark explicitly excluded terms as keywords so the stemmer skips them.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new SnowballFilter(chain, new CatalanStemmer());
            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #9
0
        /// <summary>
        /// Creates
        /// <see cref="TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/>
        ///         built from a <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
        ///         <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        ///         <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <see cref="FrenchLightStemFilter"/> </returns>
        ///
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // The pragmas silence obsolete-API warnings for the legacy version constants.
#pragma warning disable 612, 618
            if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                // Modern (3.1+) pipeline: elision and lowercasing run before
                // stop-word removal and stemming.
                Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
                TokenStream result = new StandardFilter(m_matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(m_matchVersion, result);
                result = new StopFilter(m_matchVersion, result, m_stopwords);
                // Terms in the exclusion table are marked as keywords so the
                // stemmer leaves them untouched.
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    // 3.6+ prefers the light stemmer over Snowball.
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return(new TokenStreamComponents(source, result));
            }
            else
            {
                // Legacy (pre-3.1) pipeline kept for index compatibility: stop
                // filtering runs on un-lowercased tokens here.
                Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
                TokenStream result = new StandardFilter(m_matchVersion, source);
                result = new StopFilter(m_matchVersion, result, m_stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return(new TokenStreamComponents(source, new LowerCaseFilter(m_matchVersion, result)));
            }
        }
Пример #10
0
        /// <summary>
        /// Produces a <see cref="TokenStream"/> for the given reader: standard
        /// tokenization (Lucene 2.9 semantics), lowercasing, optional stop-word
        /// removal, ASCII folding and Spanish Snowball stemming.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            stream = new LowerCaseFilter(stream);

            // Stop-word filtering is skipped entirely when no list is configured.
            if (STOP_WORDS != null)
            {
                stream = new StopFilter(false, stream, STOP_WORDS);
            }

            stream = new ASCIIFoldingFilter(stream);

            // A project-local variant of the Spanish stemmer ("Spanish2") exists;
            // the original author noted that a copy of that class is kept in this
            // project in case it is missing from the Snowball library
            // (expected path: Snowball\SF\Snowball\Ext\).
            stream = new SnowballFilter(stream, "Spanish");

            return stream;
        }
Пример #11
0
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            // Modern (3.1+) pipeline: elision and lowercasing run before
            // stop-word removal and stemming.
            if (matchVersion.onOrAfter(Version.LUCENE_31))
            {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
                Tokenizer   source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                // Terms in the exclusion table are marked as keywords so the
                // stemmer leaves them untouched.
                if (!excltable.Empty)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
                // 3.6+ prefers the light stemmer; older versions keep Snowball.
                if (matchVersion.onOrAfter(Version.LUCENE_36))
                {
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
                }
                return(new TokenStreamComponents(source, result));
            }
            else
            {
                // Legacy (pre-3.1) pipeline kept for index compatibility.
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
                Tokenizer   source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (!excltable.Empty)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
                result = new FrenchStemFilter(result);
                // Convert to lowercase after stemming!
                return(new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)));
            }
        }
Пример #12
0
        /// <summary>
        /// Builds the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// that tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, a hyphenation <seealso cref="StopFilter"/>,
        ///         <seealso cref="ElisionFilter"/>, <seealso cref="IrishLowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         (when a stem exclusion set is provided) and an Irish
        ///         <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            // Strip hyphenation tokens; pre-4.4 versions did not bump position
            // increments while doing so, and that behavior is preserved here.
            StopFilter hyphenations = new StopFilter(matchVersion, chain, HYPHENATIONS);
            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
            {
                hyphenations.EnablePositionIncrements = false;
            }
            chain = hyphenations;

            chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
            chain = new IrishLowerCaseFilter(chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Mark explicitly excluded terms as keywords so the stemmer skips them.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new SnowballFilter(chain, new IrishStemmer());
            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #13
0
        /// <summary>
        /// Builds the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// that tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         (when a stem exclusion set is provided) and either
        ///         <seealso cref="PortugueseLightStemFilter"/> (3.6+) or a Portuguese
        ///         <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Mark explicitly excluded terms as keywords so the stemmer skips them.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            // Light stemmer on 3.6+, Snowball on older compatibility versions.
            chain = matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
                ? (TokenStream)new PortugueseLightStemFilter(chain)
                : new SnowballFilter(chain, new Tartarus.Snowball.Ext.PortugueseStemmer());

            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #14
0
        /// <summary>
        /// Builds the token stream for one field: delegates tokenization to the
        /// wrapped analyzer, then applies stop-word filtering, an optional
        /// language-specific Snowball stemmer, and (optionally, at index time)
        /// a synonym expansion filter.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            if (!this._Language.Equals("JA", StringComparison.CurrentCultureIgnoreCase))
            {// Non-Japanese input needs the reader rebuilt first. (wangyunpeng)
                reader = this.InitReader(reader);
            }
            TokenStream result = this._Analyzer.TokenStream(fieldName, reader);

            result = new StopFilter(this._EnableStopPositionIncrements, result, this._StopCharArraySet, true);
            SnowballProgram snowballProgram = SnowballDict.GetSnowball(this._Language);// Stemmer lookup; changed to a thread-safe call pattern on 2015-8-17. (wangyunpeng)

            // Not every language has a Snowball stemmer; skip stemming when none exists.
            if (snowballProgram != null)
            {
                result = new SnowballFilter(result, snowballProgram);
            }
            if (_UseIndexSynonyms)
            {// When building the index, store synonyms, near-synonyms and related terms in it.
                result = new SynonymsFilter(this._Language, result);
            }
            return(result);
        }
Пример #15
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> that tokenizes all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStreamComponents"/> built from a
        ///         <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
        ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
        ///         (when a stem exclusion set is provided) and either
        ///         <see cref="SpanishLightStemFilter"/> (3.6+) or a Spanish
        ///         <see cref="SnowballFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);

            // Mark explicitly excluded terms as keywords so the stemmer skips them.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            // Light stemmer on 3.6+, Snowball on older compatibility versions.
            // The pragmas silence obsolete-API warnings for the legacy constant.
#pragma warning disable 612, 618
            bool useLightStemmer = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618
            if (useLightStemmer)
            {
                chain = new SpanishLightStemFilter(chain);
            }
            else
            {
                chain = new SnowballFilter(chain, new SpanishStemmer());
            }

            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #16
0
        /// <summary>
        /// Builds the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         and a version-dependent German stemming stage. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);
            chain = new SetKeywordMarkerFilter(chain, exclusionSet);

            // Stemming depends on the compatibility version:
            // 3.6+ -> normalization + light stemmer; 3.1+ -> German2 Snowball;
            // older -> legacy GermanStemFilter.
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                chain = new GermanNormalizationFilter(chain);
                chain = new GermanLightStemFilter(chain);
            }
            else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                chain = new SnowballFilter(chain, new German2Stemmer());
            }
            else
            {
                chain = new GermanStemFilter(chain);
            }

            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #17
0
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="SnowballFilter"/>. </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            StopFilter  s      = new StopFilter(matchVersion, result, HYPHENATIONS);

            // Pre-4.4 versions did not bump position increments when removing
            // hyphenation tokens; preserve that behavior for compatibility.
            if (!matchVersion.onOrAfter(Version.LUCENE_44))
            {
                s.EnablePositionIncrements = false;
            }
            result = s;
            result = new ElisionFilter(result, DEFAULT_ARTICLES);
            result = new IrishLowerCaseFilter(result);
            result = new StopFilter(matchVersion, result, stopwords);
            // Terms in the exclusion set are marked as keywords so the stemmer skips them.
            if (!stemExclusionSet.Empty)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new SnowballFilter(result, new IrishStemmer());
            return(new TokenStreamComponents(source, result));
        }
Пример #18
0
        /// <summary>
        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all
        /// the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/>
        ///   filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/> (3.1+),
        ///   <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem exclusion
        ///   set is provided, <see cref="StemmerOverrideFilter"/> (3.1+), and a Dutch stemmer. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
        {
            // The pragmas silence obsolete-API warnings for the legacy constant.
#pragma warning disable 612, 618
            bool modernPipeline = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            if (modernPipeline)
            {
                chain = new LowerCaseFilter(matchVersion, chain);
                chain = new StopFilter(matchVersion, chain, stoptable);
                if (excltable.Count > 0)
                {
                    chain = new SetKeywordMarkerFilter(chain, excltable);
                }
                // Per-term stemming overrides are applied before the Snowball stemmer.
                if (stemdict != null)
                {
                    chain = new StemmerOverrideFilter(chain, stemdict);
                }
                chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.DutchStemmer());
            }
            else
            {
                // Legacy (pre-3.1) pipeline kept for index compatibility.
                chain = new StopFilter(matchVersion, chain, stoptable);
                if (excltable.Count > 0)
                {
                    chain = new SetKeywordMarkerFilter(chain, excltable);
                }
#pragma warning disable 612, 618
                chain = new DutchStemFilter(chain, origStemdict);
#pragma warning restore 612, 618
            }
            return new TokenStreamComponents(tokenizer, chain);
        }
Пример #19
0
        /// <summary>
        /// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the
        /// text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/>
        ///   filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///   <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
        ///   <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader aReader)
        {
            // Modern (3.1+) pipeline lowercases before stop/stem filtering and
            // supports per-term stemming overrides.
            if (matchVersion.onOrAfter(Version.LUCENE_31))
            {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader);
                Tokenizer   source = new StandardTokenizer(matchVersion, aReader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stoptable);
                // Terms in the exclusion table are marked as keywords so the stemmer skips them.
                if (!excltable.Empty)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
                // Optional dictionary of per-term stemming overrides.
                if (stemdict != null)
                {
                    result = new StemmerOverrideFilter(result, stemdict);
                }
                result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
                return(new TokenStreamComponents(source, result));
            }
            else
            {
                // Legacy (pre-3.1) pipeline kept for index compatibility.
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader);
                Tokenizer   source = new StandardTokenizer(matchVersion, aReader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stoptable);
                if (!excltable.Empty)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
                result = new DutchStemFilter(result, origStemdict);
                return(new TokenStreamComponents(source, result));
            }
        }
Пример #20
0
	  /// <summary>
	  /// Creates a
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> A
	  ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  ///         built from an <seealso cref="StandardTokenizer"/> filtered with
	  ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, 
	  ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
	  ///         provided and a Catalan <seealso cref="SnowballFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		Tokenizer source = new StandardTokenizer(matchVersion, reader);
		TokenStream result = new StandardFilter(matchVersion, source);
		// Elision stripping only participates from Lucene 3.6 onward.
		if (matchVersion.onOrAfter(Version.LUCENE_36))
		{
		  result = new ElisionFilter(result, DEFAULT_ARTICLES);
		}
		result = new LowerCaseFilter(matchVersion, result);
		result = new StopFilter(matchVersion, result, stopwords);
		// Terms in the exclusion set are marked as keywords so the stemmer skips them.
		if (!stemExclusionSet.Empty)
		{
		  result = new SetKeywordMarkerFilter(result, stemExclusionSet);
		}
		result = new SnowballFilter(result, new CatalanStemmer());
		return new TokenStreamComponents(source, result);
	  }
Пример #21
0
	  /// <summary>
	  /// Creates
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  ///         built from a <seealso cref="StandardTokenizer"/> filtered with
	  ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
	  ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
	  ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
	  ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		// Modern (3.1+) pipeline: elision and lowercasing run before
		// stop-word removal and stemming.
		if (matchVersion.onOrAfter(Version.LUCENE_31))
		{
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		  Tokenizer source = new StandardTokenizer(matchVersion, reader);
		  TokenStream result = new StandardFilter(matchVersion, source);
		  result = new ElisionFilter(result, DEFAULT_ARTICLES);
		  result = new LowerCaseFilter(matchVersion, result);
		  result = new StopFilter(matchVersion, result, stopwords);
		  // Terms in the exclusion table are marked as keywords so the stemmer skips them.
		  if (!excltable.Empty)
		  {
			result = new SetKeywordMarkerFilter(result, excltable);
		  }
		  // 3.6+ prefers the light stemmer; older versions keep Snowball.
		  if (matchVersion.onOrAfter(Version.LUCENE_36))
		  {
			result = new FrenchLightStemFilter(result);
		  }
		  else
		  {
			result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
		  }
		  return new TokenStreamComponents(source, result);
		}
		else
		{
		  // Legacy (pre-3.1) pipeline kept for index compatibility.
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		  Tokenizer source = new StandardTokenizer(matchVersion, reader);
		  TokenStream result = new StandardFilter(matchVersion, source);
		  result = new StopFilter(matchVersion, result, stopwords);
		  if (!excltable.Empty)
		  {
			result = new SetKeywordMarkerFilter(result, excltable);
		  }
		  result = new FrenchStemFilter(result);
		  // Convert to lowercase after stemming!
		  return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
		}
	  }