Example No. 1
        public virtual void TestLowerCaseFilterLowSurrogateLeftover()
        {
            // test if the limit of the termbuffer is correctly used with supplementary
            // chars
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
            LowerCaseFilter     filter    = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);

            AssertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
            filter.Reset();
            string highSurEndingUpper = "BogustermBoguster\ud801";
            string highSurEndingLower = "bogustermboguster\ud801";

            tokenizer.SetReader(new StringReader(highSurEndingUpper));
            AssertTokenStreamContents(filter, new string[] { highSurEndingLower });
            assertTrue(filter.HasAttribute <ICharTermAttribute>());
            char[] termBuffer = filter.GetAttribute <ICharTermAttribute>().Buffer;
            int    length     = highSurEndingLower.Length;

            assertEquals('\ud801', termBuffer[length - 1]);
        }
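
The test above drives the TokenStream contract by hand: Reset(), a pass over the tokens, then inspection of the term attribute. For reference, here is a minimal, self-contained sketch of that consumption pattern; it is an illustration rather than code from this page, and the LuceneVersion.LUCENE_48 constant and the namespace imports are assumptions standing in for the test's TEST_VERSION_CURRENT and AssertTokenStreamContents helpers.

using System;
using System.IO;
using Lucene.Net.Analysis.Core;            // WhitespaceTokenizer, LowerCaseFilter
using Lucene.Net.Analysis.TokenAttributes; // ICharTermAttribute
using Lucene.Net.Util;                     // LuceneVersion

public static class LowerCaseFilterDemo
{
    public static void Main()
    {
        // Same kind of chain as the test: whitespace tokenization followed by lowercasing.
        // LuceneVersion.LUCENE_48 is an assumed stand-in for TEST_VERSION_CURRENT.
        var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("Foo BAR"));
        using (var filter = new LowerCaseFilter(LuceneVersion.LUCENE_48, tokenizer))
        {
            // The attribute instance is obtained once and refilled on every IncrementToken call.
            ICharTermAttribute termAtt = filter.AddAttribute<ICharTermAttribute>();
            filter.Reset();
            while (filter.IncrementToken())
            {
                Console.WriteLine(termAtt.ToString()); // prints "foo", then "bar"
            }
            filter.End();
        }
    }
}
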
Example No. 2
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
     TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
     return new TokenStreamComponents(source, new PortugueseStemFilter(result));
 }
Example No. 3
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
 }
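
Every CreateComponents override on this page follows the same shape: a Tokenizer as the source, a chain of TokenFilters wrapped around it, and the pair returned as TokenStreamComponents. The following rough sketch is an illustration rather than code from the examples; it assumes the Lucene.NET 4.8 Analyzer.NewAnonymous convenience factory is available for wiring such a chain without declaring an Analyzer subclass.

using System;
using System.IO;
using Lucene.Net.Analysis;                 // Analyzer, Tokenizer, TokenStream, TokenStreamComponents
using Lucene.Net.Analysis.Core;            // LowerCaseFilter
using Lucene.Net.Analysis.Standard;        // StandardTokenizer, StandardFilter
using Lucene.Net.Analysis.TokenAttributes; // ICharTermAttribute
using Lucene.Net.Util;                     // LuceneVersion

public static class AnalyzerChainDemo
{
    public static void Main()
    {
        // Assumed Lucene.NET 4.8 API: Analyzer.NewAnonymous accepts a CreateComponents
        // delegate, so the tokenizer/filter chain can be set up inline.
        Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer source = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);
            TokenStream chain = new StandardFilter(LuceneVersion.LUCENE_48, source);
            chain = new LowerCaseFilter(LuceneVersion.LUCENE_48, chain);
            return new TokenStreamComponents(source, chain);
        });

        // Push some text through the chain and print each term it produces.
        using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("Hello World")))
        {
            ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(term.ToString()); // "hello", "world"
            }
            ts.End();
        }
    }
}
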
Example No. 4
 /// <summary>
 /// Creates
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StandardFilter"/>, <seealso cref="StopFilter"/>
 ///         , and <seealso cref="BrazilianStemFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new StandardFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (excltable != null && excltable.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, excltable);
     }
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
 }
Example No. 5
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(matchVersion, reader);
            src.MaxTokenLength = maxTokenLength;
            TokenStream tok = new StandardFilter(matchVersion, src);
            tok = new LowerCaseFilter(matchVersion, tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
        }
Example No. 6
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return new TokenStreamComponents(source, result);
            }
            else
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
            }
        }
Example No. 7
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="SnowballFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.RussianStemmer());
                return new TokenStreamComponents(source, result);
            }
            else
            {
#pragma warning disable 612, 618
                Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
                TokenStream result = new LowerCaseFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.RussianStemmer());
                return new TokenStreamComponents(source, result);
            }
        }
Example No. 8
 /// <summary>
 /// Constructs a <seealso cref="StandardTokenizer"/> filtered by a
 /// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>,
 /// and a <seealso cref="SnowballFilter"/>.
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, tokenizer);
     // remove the possessive 's for english stemmers
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
         result = new EnglishPossessiveFilter(result);
     }
     // Use a special lowercase filter for turkish, the stemmer expects it.
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish"))
     {
         result = new TurkishLowerCaseFilter(result);
     }
     else
     {
         result = new LowerCaseFilter(matchVersion, result);
     }
     if (stopSet != null)
     {
         result = new StopFilter(matchVersion, result, stopSet);
     }
     result = new SnowballFilter(result, name);
     return new TokenStreamComponents(tokenizer, result);
 }
Example No. 9
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                // run the widthfilter first before bigramming, it sometimes combines characters.
                TokenStream result = new CJKWidthFilter(source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new CJKBigramFilter(result);
                return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
            }
            else
            {
#pragma warning disable 612, 618
                Tokenizer source = new CJKTokenizer(reader);
#pragma warning restore 612, 618
                return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
            }
        }
Example No. 10
 public virtual void TestLowerCaseFilterLowSurrogateLeftover()
 {
     // test if the limit of the termbuffer is correctly used with supplementary
     // chars
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
     LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
     AssertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
     filter.Reset();
     string highSurEndingUpper = "BogustermBoguster\ud801";
     string highSurEndingLower = "bogustermboguster\ud801";
     tokenizer.Reader = new StringReader(highSurEndingUpper);
     AssertTokenStreamContents(filter, new string[] { highSurEndingLower });
     assertTrue(filter.HasAttribute<ICharTermAttribute>());
     char[] termBuffer = filter.GetAttribute<ICharTermAttribute>().Buffer();
     int length = highSurEndingLower.Length;
     assertEquals('\ud801', termBuffer[length - 1]);
 }
Example No. 11
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>, 
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="PorterStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            // prior to this we get the classic behavior, standardfilter does it for us.
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                result = new EnglishPossessiveFilter(matchVersion, result);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Any())
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new PorterStemFilter(result);
            return new TokenStreamComponents(source, result);
        }
Example No. 12
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="ItalianLightStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_32))
#pragma warning restore 612, 618
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                result = new ItalianLightStemFilter(result);
            }
            else
            {
                result = new SnowballFilter(result, new ItalianStemmer());
            }
            return new TokenStreamComponents(source, result);
        }
Example No. 13
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(matchVersion, reader);
            src.MaxTokenLength = maxTokenLength;
            TokenStream tok = new StandardFilter(matchVersion, src);
            tok = new LowerCaseFilter(matchVersion, tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
        }
Example No. 14
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="ArabicNormalizationFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         if a stem exclusion set is provided and <seealso cref="ArabicStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            Tokenizer source = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) 
                ? new StandardTokenizer(matchVersion, reader) 
                : (Tokenizer)new ArabicLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
            TokenStream result = new LowerCaseFilter(matchVersion, source);
            // the order here is important: the stopword list is not normalized!
            result = new StopFilter(matchVersion, result, stopwords);
            // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
            result = new ArabicNormalizationFilter(result);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            return new TokenStreamComponents(source, new ArabicStemFilter(result));
        }
Example No. 15
 /// <summary>
 /// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the 
 /// text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/>
 ///   filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, 
 ///   <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
 ///   <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
 {
     #pragma warning disable 612, 618
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
     #pragma warning restore 612, 618
     {
         Tokenizer source = new StandardTokenizer(matchVersion, aReader);
         TokenStream result = new StandardFilter(matchVersion, source);
         result = new LowerCaseFilter(matchVersion, result);
         result = new StopFilter(matchVersion, result, stoptable);
         if (excltable.Count > 0)
         {
             result = new SetKeywordMarkerFilter(result, excltable);
         }
         if (stemdict != null)
         {
             result = new StemmerOverrideFilter(result, stemdict);
         }
         result = new SnowballFilter(result, new Tartarus.Snowball.Ext.DutchStemmer());
         return new TokenStreamComponents(source, result);
     }
     else
     {
         Tokenizer source = new StandardTokenizer(matchVersion, aReader);
         TokenStream result = new StandardFilter(matchVersion, source);
         result = new StopFilter(matchVersion, result, stoptable);
         if (excltable.Count > 0)
         {
             result = new SetKeywordMarkerFilter(result, excltable);
         }
     #pragma warning disable 612, 618
         result = new DutchStemFilter(result, origStemdict);
     #pragma warning restore 612, 618
         return new TokenStreamComponents(source, result);
     }
 }
Example No. 16
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , and <seealso cref="CzechStemFilter"/> (only if version is >= LUCENE_31). If
        ///         a version is >= LUCENE_31 and a stem exclusion set is provided via
        ///         <seealso cref="#CzechAnalyzer(Version, CharArraySet, CharArraySet)"/> a
        ///         <seealso cref="SetKeywordMarkerFilter"/> is added before
        ///         <seealso cref="CzechStemFilter"/>. </returns>

        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                if (this.stemExclusionTable.Any())
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionTable);
                }
                result = new CzechStemFilter(result);
            }
            return new TokenStreamComponents(source, result);
        }