protected internal override TokenStreamComponents CreateComponents(string fieldName) { StandardTokenizer src = new StandardTokenizer(); src.MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; TokenStream tok = new StandardFilter(src); tok = new LowerCaseFilter(tok); tok = new StopFilter(tok, stopwords); tok = new ASCIIFoldingFilter(tok); return(new TokenStreamComponents(src, tok)); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>, /// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="PorterStemFilter"/>. </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. if (matchVersion.onOrAfter(Version.LUCENE_31)) { result = new EnglishPossessiveFilter(matchVersion, result); } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.Empty) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new PorterStemFilter(result); return(new TokenStreamComponents(source, result)); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="SnowballFilter"/>. </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.Empty) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new SnowballFilter(result, new DanishStemmer()); return new TokenStreamComponents(source, result); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>, /// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="PorterStemFilter"/>. </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. if (matchVersion.onOrAfter(Version.LUCENE_31)) { result = new EnglishPossessiveFilter(matchVersion, result); } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.Empty) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); }
/// <summary> /// Creates /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// used to tokenize all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from a <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided, <seealso cref="GermanNormalizationFilter"/> and <seealso cref="GermanLightStemFilter"/> </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); result = new SetKeywordMarkerFilter(result, exclusionSet); if (matchVersion.onOrAfter(Version.LUCENE_36)) { result = new GermanNormalizationFilter(result); result = new GermanLightStemFilter(result); } else if (matchVersion.onOrAfter(Version.LUCENE_31)) { result = new SnowballFilter(result, new German2Stemmer()); } else { result = new GermanStemFilter(result); } return new TokenStreamComponents(source, result); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="SnowballFilter"/>. </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS); if (!matchVersion.onOrAfter(Version.LUCENE_44)) { s.EnablePositionIncrements = false; } result = s; result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new IrishLowerCaseFilter(result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.Empty) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new SnowballFilter(result, new IrishStemmer()); return new TokenStreamComponents(source, result); }
/// <summary> /// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the /// text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/> /// filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, /// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided, /// <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader aReader) { if (matchVersion.onOrAfter(Version.LUCENE_31)) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader); Tokenizer source = new StandardTokenizer(matchVersion, aReader); TokenStream result = new StandardFilter(matchVersion, source); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stoptable); if (!excltable.Empty) { result = new SetKeywordMarkerFilter(result, excltable); } if (stemdict != null) { result = new StemmerOverrideFilter(result, stemdict); } result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer()); return new TokenStreamComponents(source, result); } else { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader); Tokenizer source = new StandardTokenizer(matchVersion, aReader); TokenStream result = new StandardFilter(matchVersion, source); result = new StopFilter(matchVersion, result, stoptable); if (!excltable.Empty) { result = new SetKeywordMarkerFilter(result, excltable); } result = new DutchStemFilter(result, origStemdict); return new TokenStreamComponents(source, result); } }
/// <summary> /// Not an explicit test, just useful to print out some info on performance /// </summary> //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: //ORIGINAL LINE: public void performance() throws Exception public virtual void performance() { int[] tokCount = new int[] {100, 500, 1000, 2000, 5000, 10000}; int[] modCounts = new int[] {1, 2, 5, 10, 20, 50, 100, 200, 500}; for (int k = 0; k < tokCount.Length; k++) { StringBuilder buffer = new StringBuilder(); Console.WriteLine("-----Tokens: " + tokCount[k] + "-----"); for (int i = 0; i < tokCount[k]; i++) { buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' '); } //make sure we produce the same tokens TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())))); TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(this, 100)); teeStream.consumeAllTokens(); TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100); CharTermAttribute tfTok = stream.addAttribute(typeof(CharTermAttribute)); CharTermAttribute sinkTok = sink.addAttribute(typeof(CharTermAttribute)); for (int i = 0; stream.incrementToken(); i++) { assertTrue(sink.incrementToken()); assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok) == true); } //simulate two fields, each being analyzed once, for 20 documents for (int j = 0; j < modCounts.Length; j++) { int tfPos = 0; long start = DateTimeHelperClass.CurrentUnixTimeMillis(); for (int i = 0; i < 20; i++) { stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))); PositionIncrementAttribute posIncrAtt = stream.getAttribute(typeof(PositionIncrementAttribute)); while (stream.incrementToken()) { tfPos += posIncrAtt.PositionIncrement; } stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]); posIncrAtt = stream.getAttribute(typeof(PositionIncrementAttribute)); while (stream.incrementToken()) { tfPos += posIncrAtt.PositionIncrement; } } long finish = DateTimeHelperClass.CurrentUnixTimeMillis(); Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); int sinkPos = 0; //simulate one field with one sink start = DateTimeHelperClass.CurrentUnixTimeMillis(); for (int i = 0; i < 20; i++) { teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())))); sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(this, modCounts[j])); PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(typeof(PositionIncrementAttribute)); while (teeStream.incrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } //System.out.println("Modulo--------"); posIncrAtt = sink.getAttribute(typeof(PositionIncrementAttribute)); while (sink.incrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } } finish = DateTimeHelperClass.CurrentUnixTimeMillis(); Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos); } Console.WriteLine("- End Tokens: " + tokCount[k] + "-----"); } }