Beispiel #1
0
        /// <summary>
        /// Assembles the analysis chain for a field: a <see cref="StandardTokenizer"/> whose
        /// maximum token length is <c>DEFAULT_MAX_TOKEN_LENGTH</c>, wrapped in turn by
        /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>
        /// (using <c>stopwords</c>) and <see cref="ASCIIFoldingFilter"/>.
        /// </summary>
        /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
        /// <returns> Components pairing the tokenizer with the outermost filter of the chain. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName)
        {
            StandardTokenizer tokenizer = new StandardTokenizer
            {
                MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH
            };

            // Each stage wraps the previous one; the last stage is what consumers read from.
            TokenStream standardized = new StandardFilter(tokenizer);
            TokenStream lowered = new LowerCaseFilter(standardized);
            TokenStream withoutStopwords = new StopFilter(lowered, stopwords);
            TokenStream folded = new ASCIIFoldingFilter(withoutStopwords);

            return new TokenStreamComponents(tokenizer, folded);
        }
Beispiel #2
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> that tokenize the text supplied by
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
        /// <param name="reader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
        /// <returns>
        /// Components built from a <see cref="StandardTokenizer"/> filtered with
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/> (only when
        /// <c>matchVersion</c> is on or after <c>LUCENE_31</c>), <see cref="LowerCaseFilter"/>,
        /// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the stem
        /// exclusion set is non-empty) and <see cref="PorterStemFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            // From LUCENE_31 on the possessive filter is added explicitly; for earlier
            // versions the classic behavior of StandardFilter already takes care of it.
            if (matchVersion.onOrAfter(Version.LUCENE_31))
            {
                chain = new EnglishPossessiveFilter(matchVersion, chain);
            }

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // The keyword marker is only inserted when there is something to exclude from stemming.
            if (!stemExclusionSet.Empty)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new PorterStemFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
Beispiel #3
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> that tokenize the text supplied by
 /// <paramref name="reader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
 /// <param name="reader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// Components built from a <see cref="StandardTokenizer"/> filtered with
 /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
 /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty)
 /// and a <see cref="SnowballFilter"/> driven by a <c>DanishStemmer</c>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

     TokenStream chain = new StandardFilter(matchVersion, tokenizer);
     chain = new LowerCaseFilter(matchVersion, chain);
     chain = new StopFilter(matchVersion, chain, stopwords);

     // The keyword marker is only inserted when there is something to exclude from stemming.
     if (!stemExclusionSet.Empty)
     {
         chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
     }

     chain = new SnowballFilter(chain, new DanishStemmer());
     return new TokenStreamComponents(tokenizer, chain);
 }
Beispiel #4
0
 /// <summary>
 /// Produces the <see cref="TokenStreamComponents"/> used to tokenize everything read from
 /// <paramref name="reader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
 /// <param name="reader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// Components whose stream is a <see cref="StandardTokenizer"/> wrapped by
 /// <see cref="StandardFilter"/>, then <see cref="EnglishPossessiveFilter"/> when
 /// <c>matchVersion</c> is on or after <c>LUCENE_31</c>, then <see cref="LowerCaseFilter"/>
 /// and <see cref="StopFilter"/>, then <see cref="SetKeywordMarkerFilter"/> when the stem
 /// exclusion set is non-empty, and finally <see cref="PorterStemFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream pipeline = new StandardFilter(matchVersion, tokenizer);

     // Before LUCENE_31 the classic StandardFilter behavior handles possessives itself,
     // so the dedicated filter is only added from that version onwards.
     if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
         pipeline = new EnglishPossessiveFilter(matchVersion, pipeline);
     }

     pipeline = new LowerCaseFilter(matchVersion, pipeline);
     pipeline = new StopFilter(matchVersion, pipeline, stopwords);

     // Skip the keyword marker entirely when no terms are excluded from stemming.
     if (!stemExclusionSet.Empty)
     {
         pipeline = new SetKeywordMarkerFilter(pipeline, stemExclusionSet);
     }

     pipeline = new PorterStemFilter(pipeline);
     return new TokenStreamComponents(tokenizer, pipeline);
 }
Beispiel #5
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> used to tokenize the text supplied by
 /// <paramref name="reader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
 /// <param name="reader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// Components built from a <see cref="StandardTokenizer"/> filtered with
 /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>
 /// and <see cref="SetKeywordMarkerFilter"/> (always applied, using <c>exclusionSet</c>),
 /// followed by a stemming stage chosen from <c>matchVersion</c>:
 /// <see cref="GermanNormalizationFilter"/> plus <see cref="GermanLightStemFilter"/> on or
 /// after <c>LUCENE_36</c>; a <see cref="SnowballFilter"/> with a <c>German2Stemmer</c> on or
 /// after <c>LUCENE_31</c>; otherwise <see cref="GermanStemFilter"/>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

     TokenStream chain = new StandardFilter(matchVersion, tokenizer);
     chain = new LowerCaseFilter(matchVersion, chain);
     chain = new StopFilter(matchVersion, chain, stopwords);
     // Unlike the stop filter's neighbours in other analyzers, the keyword marker
     // is installed here without first checking whether the exclusion set is empty.
     chain = new SetKeywordMarkerFilter(chain, exclusionSet);

     // Pick the stemming stage by version, newest behavior first.
     if (matchVersion.onOrAfter(Version.LUCENE_36))
     {
         chain = new GermanNormalizationFilter(chain);
         chain = new GermanLightStemFilter(chain);
     }
     else if (matchVersion.onOrAfter(Version.LUCENE_31))
     {
         chain = new SnowballFilter(chain, new German2Stemmer());
     }
     else
     {
         chain = new GermanStemFilter(chain);
     }

     return new TokenStreamComponents(tokenizer, chain);
 }
Beispiel #6
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> that tokenize the text supplied by
 /// <paramref name="reader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
 /// <param name="reader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// Components built from a <see cref="StandardTokenizer"/> filtered with
 /// <see cref="StandardFilter"/>, a <see cref="StopFilter"/> over <c>HYPHENATIONS</c>,
 /// an <see cref="ElisionFilter"/> over <c>DEFAULT_ARTICLES</c>,
 /// <see cref="IrishLowerCaseFilter"/>, a <see cref="StopFilter"/> over <c>stopwords</c>,
 /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty)
 /// and a <see cref="SnowballFilter"/> driven by an <c>IrishStemmer</c>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream chain = new StandardFilter(matchVersion, tokenizer);

     // First stop filter removes the HYPHENATIONS entries; for versions before
     // LUCENE_44 it is configured not to enable position increments.
     StopFilter hyphenationStop = new StopFilter(matchVersion, chain, HYPHENATIONS);
     if (!matchVersion.onOrAfter(Version.LUCENE_44))
     {
         hyphenationStop.EnablePositionIncrements = false;
     }
     chain = hyphenationStop;

     chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
     chain = new IrishLowerCaseFilter(chain);
     chain = new StopFilter(matchVersion, chain, stopwords);

     // The keyword marker is only inserted when there is something to exclude from stemming.
     if (!stemExclusionSet.Empty)
     {
         chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
     }

     chain = new SnowballFilter(chain, new IrishStemmer());
     return new TokenStreamComponents(tokenizer, chain);
 }
Beispiel #7
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> that tokenize the text supplied by
 /// <paramref name="aReader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not consulted by this method. </param>
 /// <param name="aReader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// When <c>matchVersion</c> is on or after <c>LUCENE_31</c>: a
 /// <see cref="StandardTokenizer"/> filtered with <see cref="StandardFilter"/>,
 /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
 /// <see cref="SetKeywordMarkerFilter"/> (only when <c>excltable</c> is non-empty),
 /// <see cref="StemmerOverrideFilter"/> (only when <c>stemdict</c> is not null) and a
 /// <see cref="SnowballFilter"/> using the Dutch Snowball stemmer.
 /// Otherwise: <see cref="StandardFilter"/>, <see cref="StopFilter"/>, the same optional
 /// <see cref="SetKeywordMarkerFilter"/>, and a <see cref="DutchStemFilter"/> given
 /// <c>origStemdict</c>.
 /// </returns>
 protected internal override TokenStreamComponents createComponents(string fieldName, Reader aReader)
 {
     // Decide once which of the two pipelines applies; the shared stages below are
     // identical in both and only the version-specific stages are guarded.
     bool useSnowballPipeline = matchVersion.onOrAfter(Version.LUCENE_31);

     Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
     TokenStream chain = new StandardFilter(matchVersion, tokenizer);

     // Lower-casing is only part of the newer pipeline.
     if (useSnowballPipeline)
     {
         chain = new LowerCaseFilter(matchVersion, chain);
     }

     chain = new StopFilter(matchVersion, chain, stoptable);

     if (!excltable.Empty)
     {
         chain = new SetKeywordMarkerFilter(chain, excltable);
     }

     if (useSnowballPipeline)
     {
         if (stemdict != null)
         {
             chain = new StemmerOverrideFilter(chain, stemdict);
         }
         chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.DutchStemmer());
     }
     else
     {
         chain = new DutchStemFilter(chain, origStemdict);
     }

     return new TokenStreamComponents(tokenizer, chain);
 }
        /// <summary>
        /// Not an explicit test: a manual benchmark that prints how long it takes to analyze the
        /// same text twice as two separate streams versus once through a
        /// <see cref="TeeSinkTokenFilter"/> with a single sink.
        /// </summary>
        /// <remarks>
        /// For every token count it first asserts that a <c>ModuloTokenFilter</c> stream and a
        /// <c>ModuloSinkFilter</c> sink (both constructed with 100) produce equal term attributes.
        /// Then, for every entry in the modulo list, it times 20 rounds of each approach, writes
        /// the elapsed milliseconds to the console, and asserts that the summed position
        /// increments of both approaches are equal.
        /// </remarks>
        public virtual void performance()
        {
            int[] tokenTotals = new int[] {100, 500, 1000, 2000, 5000, 10000};
            int[] moduli = new int[] {1, 2, 5, 10, 20, 50, 100, 200, 500};

            foreach (int tokenTotal in tokenTotals)
            {
                StringBuilder text = new StringBuilder();
                Console.WriteLine("-----Tokens: " + tokenTotal + "-----");
                for (int n = 0; n < tokenTotal; n++)
                {
                    text.Append(English.intToEnglish(n).toUpperCase(Locale.ROOT)).Append(' ');
                }

                // Sanity check: the tee/sink route and the plain filter route must yield the same tokens.
                TeeSinkTokenFilter checkTee = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text.ToString()))));
                TokenStream checkSink = checkTee.newSinkTokenStream(new ModuloSinkFilter(this, 100));
                checkTee.consumeAllTokens();
                TokenStream checkStream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text.ToString()))), 100);
                CharTermAttribute tfTok = checkStream.addAttribute(typeof(CharTermAttribute));
                CharTermAttribute sinkTok = checkSink.addAttribute(typeof(CharTermAttribute));
                int tokenIndex = 0;
                while (checkStream.incrementToken())
                {
                    assertTrue(checkSink.incrementToken());
                    assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + tokenIndex, tfTok.Equals(sinkTok));
                    tokenIndex++;
                }

                foreach (int modCount in moduli)
                {
                    // Simulate two fields, each being analyzed once, for 20 documents.
                    int tfPos = 0;
                    long twoFieldsStart = DateTimeHelperClass.CurrentUnixTimeMillis();
                    for (int doc = 0; doc < 20; doc++)
                    {
                        TokenStream plain = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text.ToString())));
                        PositionIncrementAttribute plainPosIncr = plain.getAttribute(typeof(PositionIncrementAttribute));
                        while (plain.incrementToken())
                        {
                            tfPos += plainPosIncr.PositionIncrement;
                        }

                        TokenStream filtered = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text.ToString()))), modCount);
                        PositionIncrementAttribute filteredPosIncr = filtered.getAttribute(typeof(PositionIncrementAttribute));
                        while (filtered.incrementToken())
                        {
                            tfPos += filteredPosIncr.PositionIncrement;
                        }
                    }
                    long twoFieldsFinish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    Console.WriteLine("ModCount: " + modCount + " Two fields took " + (twoFieldsFinish - twoFieldsStart) + " ms");

                    // Simulate one field with one sink.
                    int sinkPos = 0;
                    long teeStart = DateTimeHelperClass.CurrentUnixTimeMillis();
                    for (int doc = 0; doc < 20; doc++)
                    {
                        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text.ToString()))));
                        TokenStream teeSink = tee.newSinkTokenStream(new ModuloSinkFilter(this, modCount));
                        PositionIncrementAttribute teePosIncr = tee.getAttribute(typeof(PositionIncrementAttribute));
                        while (tee.incrementToken())
                        {
                            sinkPos += teePosIncr.PositionIncrement;
                        }

                        PositionIncrementAttribute sinkPosIncr = teeSink.getAttribute(typeof(PositionIncrementAttribute));
                        while (teeSink.incrementToken())
                        {
                            sinkPos += sinkPosIncr.PositionIncrement;
                        }
                    }
                    long teeFinish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    Console.WriteLine("ModCount: " + modCount + " Tee fields took " + (teeFinish - teeStart) + " ms");

                    // Both approaches must have accumulated the same total of position increments.
                    assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
                }

                Console.WriteLine("- End Tokens: " + tokenTotal + "-----");
            }
        }