コード例 #1
0
            /// <summary>
            /// Builds the analysis chain for this analyzer: a <see cref="StandardTokenizer"/>,
            /// lower-cased and then stemmed with <see cref="PortugueseLightStemFilter"/>.
            /// </summary>
            protected internal override Analyzer.TokenStreamComponents createComponents(string fieldName, Reader reader)
            {
                Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
                TokenStream stream = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
                stream = new PortugueseLightStemFilter(stream);
                return new Analyzer.TokenStreamComponents(tokenizer, stream);
            }
コード例 #2
0
        /// <summary>
        /// Assembles the token stream: a <see cref="StandardTokenizer"/> capped at
        /// DEFAULT_MAX_TOKEN_LENGTH, then standard-filtered, lower-cased,
        /// stop-word filtered, and ASCII-folded.
        /// </summary>
        protected internal override TokenStreamComponents CreateComponents(string fieldName)
        {
            StandardTokenizer tokenizer = new StandardTokenizer
            {
                MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH
            };
            TokenStream stream = new StandardFilter(tokenizer);
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(stream, stopwords);
            stream = new ASCIIFoldingFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
コード例 #3
0
        /// <summary>
        /// Creates <see cref="TokenStreamComponents"/> which tokenize all the text
        /// in the provided <see cref="Reader"/>.
        /// </summary>
        /// <returns>
        /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
        /// filtered with <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>
        /// (3.1+ match versions only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
        /// and <see cref="PorterStemFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream = new StandardFilter(matchVersion, tokenizer);
            // Prior to 3.1 we keep the classic behavior: StandardFilter strips
            // the possessive for us.
            if (matchVersion.onOrAfter(Version.LUCENE_31))
            {
                stream = new EnglishPossessiveFilter(matchVersion, stream);
            }
            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, stopwords);
            if (!stemExclusionSet.Empty)
            {
                stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
            }
            stream = new PorterStemFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
コード例 #4
0
 /// <summary>
 /// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text
 /// in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// (an <see cref="ArabicLetterTokenizer"/> for pre-3.1 match versions) filtered
 /// with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
 /// <see cref="PersianNormalizationFilter"/> and Persian stop words.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     // Cast gives the conditional a common type (the shared base, Tokenizer).
     Tokenizer tokenizer = matchVersion.onOrAfter(Version.LUCENE_31)
         ? (Tokenizer)new StandardTokenizer(matchVersion, reader)
         : new ArabicLetterTokenizer(matchVersion, reader);
     TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
     stream = new ArabicNormalizationFilter(stream);
     // Additional Persian-specific normalization.
     stream = new PersianNormalizationFilter(stream);
     // The order here is important: the stop word list is normalized with the
     // filters above!
     return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, stream, stopwords));
 }
コード例 #5
0
 /// <summary>
 /// Builds the chain <see cref="StandardTokenizer"/> -> <see cref="LowerCaseFilter"/>
 /// -> <see cref="PortugueseStemFilter"/>.
 /// </summary>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
     TokenStream stream = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
     stream = new PortugueseStemFilter(stream);
     return new TokenStreamComponents(tokenizer, stream);
 }
コード例 #6
0
ファイル: DanishAnalyzer.cs プロジェクト: Cefa68000/lucenenet
 /// <summary>
 /// Creates <see cref="TokenStreamComponents"/> which tokenize all the text
 /// in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
 /// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem
 /// exclusion set is provided, and a Danish <see cref="SnowballFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream stream = new StandardFilter(matchVersion, tokenizer);
     stream = new LowerCaseFilter(matchVersion, stream);
     stream = new StopFilter(matchVersion, stream, stopwords);
     if (!stemExclusionSet.Empty)
     {
         stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
     }
     stream = new SnowballFilter(stream, new DanishStemmer());
     return new TokenStreamComponents(tokenizer, stream);
 }
コード例 #7
0
ファイル: ArabicAnalyzer.cs プロジェクト: Cefa68000/lucenenet
 /// <summary>
 /// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text
 /// in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// (an <see cref="ArabicLetterTokenizer"/> for pre-3.1 match versions) filtered
 /// with <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
 /// <see cref="ArabicNormalizationFilter"/>, <see cref="SetKeywordMarkerFilter"/>
 /// if a stem exclusion set is provided, and <see cref="ArabicStemFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     // FIX: the C# conditional operator needs a common operand type; neither
     // StandardTokenizer nor ArabicLetterTokenizer converts to the other, so
     // one branch must be cast up to their shared base type, Tokenizer.
     Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31)
         ? (Tokenizer)new StandardTokenizer(matchVersion, reader)
         : new ArabicLetterTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     // The order here is important: the stopword list is not normalized!
     result = new StopFilter(matchVersion, result, stopwords);
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
     result = new ArabicNormalizationFilter(result);
     if (!stemExclusionSet.Empty)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     return new TokenStreamComponents(source, new ArabicStemFilter(result));
 }
コード例 #8
0
 /// <summary>
 /// Creates <see cref="TokenStreamComponents"/> which tokenize all the text
 /// in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// filtered with <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>
 /// (3.1+ match versions only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
 /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
 /// and <see cref="PorterStemFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer src = new StandardTokenizer(matchVersion, reader);
     TokenStream tok = new StandardFilter(matchVersion, src);
     // Prior to 3.1 we get the classic behavior: StandardFilter handles
     // possessives for us.
     if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
         tok = new EnglishPossessiveFilter(matchVersion, tok);
     }
     tok = new LowerCaseFilter(matchVersion, tok);
     tok = new StopFilter(matchVersion, tok, stopwords);
     if (!stemExclusionSet.Empty)
     {
         tok = new SetKeywordMarkerFilter(tok, stemExclusionSet);
     }
     tok = new PorterStemFilter(tok);
     return new TokenStreamComponents(src, tok);
 }
コード例 #9
0
 /// <summary>
 /// Constructs a <see cref="StandardTokenizer"/> filtered by a
 /// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>
 /// (a <see cref="TurkishLowerCaseFilter"/> for Turkish), a
 /// <see cref="StopFilter"/>, and a <see cref="SnowballFilter"/>.
 /// </summary>
 public override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream stream = new StandardFilter(matchVersion, source);
     // Remove the possessive 's before the English-family stemmers run.
     if (matchVersion.onOrAfter(Version.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
         stream = new EnglishPossessiveFilter(stream);
     }
     // Turkish gets its own lower-case filter; the stemmer expects it.
     if (matchVersion.onOrAfter(Version.LUCENE_31) && name.Equals("Turkish"))
     {
         stream = new TurkishLowerCaseFilter(stream);
     }
     else
     {
         stream = new LowerCaseFilter(matchVersion, stream);
     }
     if (stopSet != null)
     {
         stream = new StopFilter(matchVersion, stream, stopSet);
     }
     stream = new SnowballFilter(stream, name);
     return new TokenStreamComponents(source, stream);
 }
コード例 #10
0
ファイル: GermanAnalyzer.cs プロジェクト: Cefa68000/lucenenet
 /// <summary>
 /// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text
 /// in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
 /// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>, and then a
 /// version-dependent stemming stage: <see cref="GermanNormalizationFilter"/> plus
 /// <see cref="GermanLightStemFilter"/> (3.6+), a Snowball <see cref="German2Stemmer"/>
 /// (3.1+), or the legacy <see cref="GermanStemFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream stream = new StandardFilter(matchVersion, tokenizer);
     stream = new LowerCaseFilter(matchVersion, stream);
     stream = new StopFilter(matchVersion, stream, stopwords);
     stream = new SetKeywordMarkerFilter(stream, exclusionSet);
     if (matchVersion.onOrAfter(Version.LUCENE_36))
     {
         stream = new GermanNormalizationFilter(stream);
         stream = new GermanLightStemFilter(stream);
     }
     else if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
         stream = new SnowballFilter(stream, new German2Stemmer());
     }
     else
     {
         stream = new GermanStemFilter(stream);
     }
     return new TokenStreamComponents(tokenizer, stream);
 }
コード例 #11
0
ファイル: DutchAnalyzer.cs プロジェクト: Cefa68000/lucenenet
 /// <summary>
 /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all
 /// the text in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/>
 /// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>
 /// (3.1+ match versions only), <see cref="StopFilter"/>,
 /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
 /// <see cref="StemmerOverrideFilter"/>, and a version-dependent Dutch stemmer.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader aReader)
 {
     if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
         Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
         TokenStream stream = new StandardFilter(matchVersion, tokenizer);
         stream = new LowerCaseFilter(matchVersion, stream);
         stream = new StopFilter(matchVersion, stream, stoptable);
         if (!excltable.Empty)
         {
             stream = new SetKeywordMarkerFilter(stream, excltable);
         }
         if (stemdict != null)
         {
             stream = new StemmerOverrideFilter(stream, stemdict);
         }
         stream = new SnowballFilter(stream, new org.tartarus.snowball.ext.DutchStemmer());
         return new TokenStreamComponents(tokenizer, stream);
     }
     else
     {
         // Pre-3.1 chain: no lower-casing, and the legacy DutchStemFilter
         // (with the original stem dictionary) instead of Snowball.
         Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
         TokenStream stream = new StandardFilter(matchVersion, tokenizer);
         stream = new StopFilter(matchVersion, stream, stoptable);
         if (!excltable.Empty)
         {
             stream = new SetKeywordMarkerFilter(stream, excltable);
         }
         stream = new DutchStemFilter(stream, origStemdict);
         return new TokenStreamComponents(tokenizer, stream);
     }
 }
コード例 #12
0
        /// <summary>
        /// Checks that sink streams created from one tee can be re-attached to a
        /// second tee, and that a tee's output can be wrapped by further filters.
        /// (Original Java declared 'throws Exception'; not expressible in .NET.)
        /// </summary>
        public virtual void testMultipleSources()
        {
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
            tee1.reset();
            TokenStream source1 = new CachingTokenFilter(tee1);

            tee1.addAttribute(typeof(CheckClearAttributesAttribute));
            dogDetector.addAttribute(typeof(CheckClearAttributesAttribute));
            theDetector.addAttribute(typeof(CheckClearAttributesAttribute));

            // Re-attach both sinks to a second source.
            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
            tee2.addSinkTokenStream(dogDetector);
            tee2.addSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            assertTokenStreamContents(source1, tokens1);
            assertTokenStreamContents(source2, tokens2);

            // The sinks should have seen the tokens from BOTH sources.
            assertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
            assertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

            // The tee output itself can be wrapped by further filters.
            source1.reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
            string[] expectedLower = new string[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                expectedLower[i] = tokens1[i].ToLower(Locale.ROOT);
            }
            assertTokenStreamContents(lowerCasing, expectedLower);
        }
コード例 #13
0
ファイル: HindiAnalyzer.cs プロジェクト: Cefa68000/lucenenet
 /// <summary>
 /// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text
 /// in the provided <see cref="Reader"/>.
 /// </summary>
 /// <returns>
 /// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// (an <see cref="IndicTokenizer"/> for pre-3.6 match versions) filtered with
 /// <see cref="LowerCaseFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem
 /// exclusion set is provided, <see cref="IndicNormalizationFilter"/>,
 /// <see cref="HindiNormalizationFilter"/>, Hindi stop words, and
 /// <see cref="HindiStemFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     // 3.6+ match versions use StandardTokenizer; older versions keep the
     // dedicated IndicTokenizer for backward compatibility.
     Tokenizer tokenizer;
     if (matchVersion.onOrAfter(Version.LUCENE_36))
     {
         tokenizer = new StandardTokenizer(matchVersion, reader);
     }
     else
     {
         tokenizer = new IndicTokenizer(matchVersion, reader);
     }
     TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
     if (!stemExclusionSet.Empty)
     {
         stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
     }
     stream = new IndicNormalizationFilter(stream);
     stream = new HindiNormalizationFilter(stream);
     stream = new StopFilter(matchVersion, stream, stopwords);
     stream = new HindiStemFilter(stream);
     return new TokenStreamComponents(tokenizer, stream);
 }