protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
            {
                CharArraySet stopSet   = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
                Tokenizer    tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter  filter    = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);

                return(new TokenStreamComponents(tokenizer, filter));
            }
Example #2
0
        protected internal override TokenStreamComponents CreateComponents(string fieldName)
        {
            StandardTokenizer src = new StandardTokenizer();

            src.MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
            TokenStream tok = new StandardFilter(src);

            tok = new LowerCaseFilter(tok);
            tok = new StopFilter(tok, stopwords);
            tok = new ASCIIFoldingFilter(tok);
            return(new TokenStreamComponents(src, tok));
        }
Example #3
0
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="PorterStemFilter"/>. </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);

            // prior to this we get the classic behavior, standardfilter does it for us.
            if (matchVersion.onOrAfter(Version.LUCENE_31))
            {
                result = new EnglishPossessiveFilter(matchVersion, result);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (!stemExclusionSet.Empty)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new PorterStemFilter(result);
            return(new TokenStreamComponents(source, result));
        }
Example #4
0
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.Empty)
     {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
 }
 public override TokenStreamComponents createComponents(string field, Reader reader)
 {
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
     StopFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, StandardAnalyzer.STOP_WORDS_SET);
     return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, filter, flags, protWords));
 }
Example #6
0
 /// <summary>
 /// Creates
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
 ///         <seealso cref="ArabicNormalizationFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
 ///         if a stem exclusion set is provided and <seealso cref="ArabicStemFilter"/>. </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = matchVersion.onOrAfter(org.apache.lucene.util.Version.LUCENE_31) ? new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
     Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ? new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     // the order here is important: the stopword list is not normalized!
     result = new StopFilter(matchVersion, result, stopwords);
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
     result = new ArabicNormalizationFilter(result);
     if (!stemExclusionSet.Empty)
     {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     return new TokenStreamComponents(source, new ArabicStemFilter(result));
 }
Example #7
0
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>, 
 ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="PorterStemFilter"/>. </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     // prior to this we get the classic behavior, standardfilter does it for us.
     if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
       result = new EnglishPossessiveFilter(matchVersion, result);
     }
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.Empty)
     {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(source, result);
 }
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
     TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
     return new TokenStreamComponents(tokenizer, filter);
 }
Example #9
0
 /// <summary>
 /// Constructs a <seealso cref="StandardTokenizer"/> filtered by a {@link
 ///    StandardFilter}, a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>,
 ///    and a <seealso cref="SnowballFilter"/> 
 /// </summary>
 public override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, tokenizer);
     // remove the possessive 's for english stemmers
     if (matchVersion.onOrAfter(Version.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
       result = new EnglishPossessiveFilter(result);
     }
     // Use a special lowercase filter for turkish, the stemmer expects it.
     if (matchVersion.onOrAfter(Version.LUCENE_31) && name.Equals("Turkish"))
     {
       result = new TurkishLowerCaseFilter(result);
     }
     else
     {
       result = new LowerCaseFilter(matchVersion, result);
     }
     if (stopSet != null)
     {
       result = new StopFilter(matchVersion, result, stopSet);
     }
     result = new SnowballFilter(result, name);
     return new TokenStreamComponents(tokenizer, result);
 }
Example #10
0
 /// <summary>
 /// Creates
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided, <seealso cref="GermanNormalizationFilter"/> and <seealso cref="GermanLightStemFilter"/> </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     result = new SetKeywordMarkerFilter(result, exclusionSet);
     if (matchVersion.onOrAfter(Version.LUCENE_36))
     {
       result = new GermanNormalizationFilter(result);
       result = new GermanLightStemFilter(result);
     }
     else if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
       result = new SnowballFilter(result, new German2Stemmer());
     }
     else
     {
       result = new GermanStemFilter(result);
     }
     return new TokenStreamComponents(source, result);
 }
Example #11
0
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
     if (!matchVersion.onOrAfter(Version.LUCENE_44))
     {
       s.EnablePositionIncrements = false;
     }
     result = s;
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
     result = new IrishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.Empty)
     {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new IrishStemmer());
     return new TokenStreamComponents(source, result);
 }
Example #12
0
 /// <summary>
 /// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the 
 /// text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/>
 ///   filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, 
 ///   <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
 ///   <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader aReader)
 {
     if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader);
       Tokenizer source = new StandardTokenizer(matchVersion, aReader);
       TokenStream result = new StandardFilter(matchVersion, source);
       result = new LowerCaseFilter(matchVersion, result);
       result = new StopFilter(matchVersion, result, stoptable);
       if (!excltable.Empty)
       {
     result = new SetKeywordMarkerFilter(result, excltable);
       }
       if (stemdict != null)
       {
     result = new StemmerOverrideFilter(result, stemdict);
       }
       result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
       return new TokenStreamComponents(source, result);
     }
     else
     {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader);
       Tokenizer source = new StandardTokenizer(matchVersion, aReader);
       TokenStream result = new StandardFilter(matchVersion, source);
       result = new StopFilter(matchVersion, result, stoptable);
       if (!excltable.Empty)
       {
     result = new SetKeywordMarkerFilter(result, excltable);
       }
       result = new DutchStemFilter(result, origStemdict);
       return new TokenStreamComponents(source, result);
     }
 }
Example #13
0
 /// <summary>
 /// Creates
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="IndicNormalizationFilter"/>,
 ///         <seealso cref="HindiNormalizationFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
 ///         if a stem exclusion set is provided, <seealso cref="HindiStemFilter"/>, and
 ///         Hindi Stop words </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
     //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source;
     Tokenizer source;
     if (matchVersion.onOrAfter(Version.LUCENE_36))
     {
       source = new StandardTokenizer(matchVersion, reader);
     }
     else
     {
       source = new IndicTokenizer(matchVersion, reader);
     }
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     if (!stemExclusionSet.Empty)
     {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new IndicNormalizationFilter(result);
     result = new HindiNormalizationFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     result = new HindiStemFilter(result);
     return new TokenStreamComponents(source, result);
 }