/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a <see cref="StandardFilter"/>,
/// a <see cref="LowerCaseFilter"/>, an optional <see cref="StopFilter"/> (only when a stop
/// set was configured) and a <see cref="SnowballFilter"/> using the configured stemmer name.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return result;
}
/// <summary>
/// Builds the analysis chain for a field: StandardTokenizer, StandardFilter,
/// LowerCaseFilter, StopFilter (English stop words, position increments enabled)
/// and finally a SnowballFilter with the English stemmer.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream stream = new StandardTokenizer(_version, reader);
    stream = new StandardFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(true, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new SnowballFilter(stream, new EnglishStemmer());
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a <see cref="StandardFilter"/>,
/// a <see cref="LowerCaseFilter"/>, an optional <see cref="StopFilter"/> and a
/// <see cref="SnowballFilter"/> using the configured stemmer name.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(matchVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (stopSet != null)
        // Position-increment behavior of the stop filter follows the configured
        // compatibility version.
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
    result = new SnowballFilter(result, name);
    return result;
}
/// <summary>
/// English analysis chain: StandardTokenizer (Lucene 2.9 semantics), lower-casing,
/// optional stop-word removal (position increments disabled), ASCII folding, then
/// Snowball "English" stemming. StandardFilter is deliberately not applied.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    stream = new LowerCaseFilter(stream);
    if (STOP_WORDS != null)
    {
        stream = new StopFilter(false, stream, STOP_WORDS);
    }
    // Fold accented characters to ASCII before stemming.
    stream = new ASCIIFoldingFilter(stream);
    return new SnowballFilter(stream, "English");
}
/// <summary>
/// Tokenizes the input, normalizes it (standard filter, lower-case, English stop
/// words, English Snowball stemming) and finally emits front edge n-grams between
/// _mingram and _maxgram characters in length.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(_version, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    chain = new SnowballFilter(chain, new EnglishStemmer());
    return new EdgeNGramTokenFilter(chain, Side.FRONT, _mingram, _maxgram);
}
/// <summary>
/// Verifies that SnowballFilter stems "accents" to "accent" while preserving the
/// token's offsets, type and position increment.
/// </summary>
public virtual void TestFilterTokens()
{
    Token tok = new Token("accents", 2, 7, "wrd");
    tok.SetPositionIncrement(3);
    SnowballFilter filter = new SnowballFilter(new AnonymousClassTokenStream(tok, this), "English");
    Token newtok = filter.Next();

    // Use the test framework's assertions: Trace.Assert was a no-op unless the
    // TRACE symbol was defined and never reported failures through the test runner.
    Assert.AreEqual("accent", newtok.TermText());
    Assert.AreEqual(2, newtok.StartOffset());
    Assert.AreEqual(7, newtok.EndOffset());
    Assert.AreEqual("wrd", newtok.Type());
    Assert.AreEqual(3, newtok.GetPositionIncrement());
}
/// <summary>
/// Checks that English Snowball stemming maps "accents" to "accent" and leaves
/// offsets, type and position increment of the original token untouched.
/// </summary>
public virtual void TestFilterTokens()
{
    var original = new Token("accents", 2, 7, "wrd");
    original.SetPositionIncrement(3);

    var filter = new SnowballFilter(new AnonymousClassTokenStream(original, this), "English");
    var stemmed = filter.Next();

    Assert.AreEqual("accent", stemmed.TermText());
    Assert.AreEqual(2, stemmed.StartOffset());
    Assert.AreEqual(7, stemmed.EndOffset());
    Assert.AreEqual("wrd", stemmed.Type());
    Assert.AreEqual(3, stemmed.GetPositionIncrement());
}
/// <summary>
/// Spanish analysis chain: StandardTokenizer (Lucene 2.9 semantics), lower-casing,
/// optional stop-word removal (position increments disabled), ASCII folding and
/// Snowball "Spanish" stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    // StandardFilter is intentionally not applied here.
    result = new LowerCaseFilter(result);
    if (STOP_WORDS != null)
        result = new StopFilter(false, result, STOP_WORDS);
    // Fold accented characters to their ASCII equivalents before stemming.
    result = new ASCIIFoldingFilter(result);
    // NOTE(review): an earlier comment referred to a customized "Spanish2" stemmer
    // (expected under Snowball\SF\Snowball\Ext\), but the stock "Spanish" stemmer
    // is what is actually requested below — confirm which one is intended.
    result = new SnowballFilter(result, "Spanish");
    return result;
}
/// <summary>
/// Asserts that SnowballFilter stems the injected token to "accent" while passing
/// every other attribute (offsets, type, position increment, flags, payload)
/// through unchanged.
/// </summary>
public virtual void TestFilterTokens()
{
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(this), "English");

    var termAtt = filter.GetAttribute<ICharTermAttribute>();
    var offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    var typeAtt = filter.GetAttribute<ITypeAttribute>();
    var payloadAtt = filter.GetAttribute<IPayloadAttribute>();
    var posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    var flagsAtt = filter.GetAttribute<IFlagsAttribute>();

    filter.IncrementToken();

    assertEquals("accent", termAtt.ToString());
    assertEquals(2, offsetAtt.StartOffset());
    assertEquals(7, offsetAtt.EndOffset());
    assertEquals("wrd", typeAtt.Type);
    assertEquals(3, posIncAtt.PositionIncrement);
    assertEquals(77, flagsAtt.Flags);
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
/// <summary>
/// Confirms the SnowballFilter stems "accents" to "accent" and does not disturb
/// any of the other token attributes exposed by the test stream.
/// </summary>
public void TestFilterTokens()
{
    var filter = new SnowballFilter(new TestTokenStream(), "English");

    var termAtt = filter.GetAttribute<ITermAttribute>();
    var offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    var typeAtt = filter.GetAttribute<ITypeAttribute>();
    var payloadAtt = filter.GetAttribute<IPayloadAttribute>();
    var posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    var flagsAtt = filter.GetAttribute<IFlagsAttribute>();

    filter.IncrementToken();

    Assert.AreEqual("accent", termAtt.Term);
    Assert.AreEqual(2, offsetAtt.StartOffset);
    Assert.AreEqual(7, offsetAtt.EndOffset);
    Assert.AreEqual("wrd", typeAtt.Type);
    Assert.AreEqual(3, posIncAtt.PositionIncrement);
    Assert.AreEqual(77, flagsAtt.Flags);
    Assert.AreEqual(new Payload(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
/// <summary>
/// Builds the indexing chain: StandardTokenizer (Lucene 3.0 semantics), English
/// Snowball stemming, lower-casing, ASCII folding, then stop-word removal using
/// the default English stop set extended with a custom word list.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Custom stop words merged with the standard English stop set below.
    // The original list repeated "it"; HashSet semantics make the duplicate a no-op.
    HashSet<string> stopWords = new HashSet<string>
    {
        "said", "have", "the", "more", "from", "who", "he", "than", "it",
        "were", "use", "has", "also", "been", "we", "which", "had", "you",
        "us", "them", "so", "in", "i", "our", "his", "to", "of", "a",
        "st", "ad", "co", "re", "ve",
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g"
    };
    foreach (var word in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
    {
        stopWords.Add(word);
    }

    // The previous implementation also built an unused StandardTokenizer/
    // StandardFilter pair ("src"/"filter") that was never part of the returned
    // chain; that dead code has been removed.
    TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
    result = new SnowballFilter(result, new EnglishStemmer());
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(true, result, stopWords);
    return result;
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided, and one of:
/// <see cref="FrenchLightStemFilter"/> (3.6+), a French <see cref="SnowballFilter"/>
/// (3.1-3.5), or the legacy <see cref="FrenchStemFilter"/> (pre-3.1). </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        // Strip French elisions (l', d', ...) before lower-casing.
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (excltable.Count > 0)
        {
            // Mark excluded terms as keywords so the stemmer leaves them intact.
            result = new SetKeywordMarkerFilter(result, excltable);
        }
#pragma warning disable 612, 618
        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
        {
            result = new FrenchLightStemFilter(result);
        }
        else
        {
            result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
        }
        return new TokenStreamComponents(source, result);
    }
    else
    {
        // Pre-3.1 compatibility path: legacy FrenchStemFilter, no elision handling.
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        if (excltable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
#pragma warning disable 612, 618
        result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
        // Convert to lowercase after stemming!
        return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
    }
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> which tokenizes all the text in the
/// provided reader: StandardTokenizer, StandardFilter, LowerCaseFilter, StopFilter,
/// an optional SetKeywordMarkerFilter for the stem exclusion set, and a Danish
/// SnowballFilter.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        // Protect excluded terms from stemming.
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new DanishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/>. For fields listed in TextFields
/// (or every field when TextFields is null or empty) the stream is additionally
/// filtered by a <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>, an
/// optional <see cref="StopFilter"/> and a <see cref="SnowballFilter"/> using
/// StemmerName.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(reader);
    if( TextFields == null || TextFields.Count == 0 || TextFields.Contains(fieldName) )
    {
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        if( StopWords != null )
            result = new StopFilter(result, StopWords);
        result = new SnowballFilter(result, StemmerName);
    }
    return result;
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> which tokenizes all the text in the
/// provided reader: StandardTokenizer, StandardFilter, ApostropheFilter (4.8+),
/// TurkishLowerCaseFilter, StopFilter, an optional SetKeywordMarkerFilter for the
/// stem exclusion set, and a Turkish SnowballFilter.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        // Strip apostrophe suffixes before Turkish-aware lower-casing.
        chain = new ApostropheFilter(chain);
    }
    // Turkish lower-casing differs from the generic LowerCaseFilter.
    chain = new TurkishLowerCaseFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new TurkishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Verifies English Snowball stemming produces "accent" and that offsets, type,
/// position increment, flags and payload all survive the filter unmodified.
/// </summary>
public void TestFilterTokens()
{
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");

    ITermAttribute termAtt = filter.GetAttribute<ITermAttribute>();
    IOffsetAttribute offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    ITypeAttribute typeAtt = filter.GetAttribute<ITypeAttribute>();
    IPositionIncrementAttribute posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    IFlagsAttribute flagsAtt = filter.GetAttribute<IFlagsAttribute>();
    IPayloadAttribute payloadAtt = filter.GetAttribute<IPayloadAttribute>();

    filter.IncrementToken();

    Assert.AreEqual("accent", termAtt.Term);
    Assert.AreEqual(2, offsetAtt.StartOffset);
    Assert.AreEqual(7, offsetAtt.EndOffset);
    Assert.AreEqual("wrd", typeAtt.Type);
    Assert.AreEqual(3, posIncAtt.PositionIncrement);
    Assert.AreEqual(77, flagsAtt.Flags);
    Assert.AreEqual(new Payload(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided reader: StandardTokenizer, StandardFilter, LowerCaseFilter, StopFilter,
/// SetKeywordMarkerFilter, then a version-dependent German stemming stage:
/// GermanNormalizationFilter + GermanLightStemFilter (3.6+), a German2 Snowball
/// stemmer (3.1-3.5), or the legacy GermanStemFilter (pre-3.1).
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    chain = new SetKeywordMarkerFilter(chain, exclusionSet);
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        chain = new GermanNormalizationFilter(chain);
        chain = new GermanLightStemFilter(chain);
    }
    else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        chain = new SnowballFilter(chain, new German2Stemmer());
    }
    else
    {
        chain = new GermanStemFilter(chain);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Dispatches on the field name to build the right token stream. Property fields
/// ("prop:*") may have leading type metadata skipped and non-text properties are
/// returned as a single (possibly lower-cased) token; a few special fields use
/// whitespace tokenization; everything else goes through the base analyzer, with
/// text-like fields additionally getting noise/email-host filtering and stemming.
/// Note: this method consumes from <paramref name="reader"/> directly in several
/// branches, so the stream position matters.
/// </summary>
public override TokenStream TokenStream (string fieldName, TextReader reader)
{
    bool is_text_prop = false;

    // Strip off the first two characters in a property.
    // We store type information in those two characters, so we don't
    // want to index them.
    if (fieldName.StartsWith ("prop:")) {

        if (strip_extra_property_info) {
            // Skip everything up to and including the first :
            int c;
            do {
                c = reader.Read ();
            } while (c != -1 && c != ':');
        }

        is_text_prop = fieldName.StartsWith ("prop:t");

        // If this is non-text property, just return one token
        // containing the entire string.  We do this to avoid
        // tokenizing keywords.
        if (! is_text_prop) {
            // We don't want to lower case the token if it's
            // not in the private namespace.
            TokenStream singleton_stream = new SingletonTokenStream (reader.ReadToEnd ());
            if (fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace))
                return singleton_stream;
            else
                return new LowerCaseFilter (singleton_stream);
        }
    } else if (fieldName == "PropertyKeyword")
        return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
    else if (fieldName == "Properties")
        return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
    else if (fieldName == "TextLinks")
        return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));

    TokenStream outstream;
    outstream = base.TokenStream (fieldName, reader);

    // Only the "Text" field reports discovered links back via the callback.
    // NOTE(review): lock (this) is generally discouraged (external code can lock
    // the same object); a private lock object would be safer — confirm callers.
    NoiseEmailHostFilter.LinkCallback add_link_callback = null;
    lock (this) {
        if (fieldName == "Text")
            add_link_callback = add_link;
    }

    if (fieldName == "Text" || fieldName == "HotText" || fieldName == "PropertyText" || is_text_prop) {
        outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname, add_link_callback);
        // Sharing Stemmer is not thread safe.
        // Currently our underlying lucene indexing is not done in multiple threads.
        StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
        outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
    }

    return outstream;
}
/// <summary>
/// Verifies that SnowballFilter stems "accents" to "accent" while preserving the
/// token's offsets, type and position increment.
/// </summary>
public virtual void TestFilterTokens()
{
    Token tok = new Token("accents", 2, 7, "wrd");
    tok.SetPositionIncrement(3);
    SnowballFilter filter = new SnowballFilter(new AnonymousClassTokenStream(tok, this), "English");
    Token newtok = filter.Next();

    // Use the test framework's assertions: Trace.Assert was a no-op unless the
    // TRACE symbol was defined and never reported failures through the test runner.
    Assert.AreEqual("accent", newtok.TermText());
    Assert.AreEqual(2, newtok.StartOffset());
    Assert.AreEqual(7, newtok.EndOffset());
    Assert.AreEqual("wrd", newtok.Type());
    Assert.AreEqual(3, newtok.GetPositionIncrement());
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> which tokenizes all the text in the
/// provided reader: StandardTokenizer, StandardFilter, a hyphenation StopFilter
/// (position increments disabled before 4.4), ElisionFilter, IrishLowerCaseFilter,
/// StopFilter, an optional SetKeywordMarkerFilter for the stem exclusion set, and
/// an Irish SnowballFilter.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    // Remove hyphenation tokens; pre-4.4 versions did not bump position increments here.
    StopFilter hyphenFilter = new StopFilter(matchVersion, chain, HYPHENATIONS);
#pragma warning disable 612, 618
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        hyphenFilter.EnablePositionIncrements = false;
    }
    chain = hyphenFilter;

    chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    chain = new IrishLowerCaseFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new IrishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided reader. From 3.1 onward: StandardTokenizer, StandardFilter,
/// LowerCaseFilter, StopFilter, optional SetKeywordMarkerFilter and a Russian
/// SnowballFilter. Before 3.1: the deprecated RussianLetterTokenizer is used and
/// StandardFilter is skipped.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    bool modern = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (modern)
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        TokenStream chain = new StandardFilter(matchVersion, tokenizer);
        chain = new LowerCaseFilter(matchVersion, chain);
        chain = new StopFilter(matchVersion, chain, stopwords);
        if (stemExclusionSet.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
        return new TokenStreamComponents(tokenizer, chain);
    }
    else
    {
#pragma warning disable 612, 618
        Tokenizer tokenizer = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
        TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
        chain = new StopFilter(matchVersion, chain, stopwords);
        if (stemExclusionSet.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
        return new TokenStreamComponents(tokenizer, chain);
    }
}
/// <summary>
/// Returns a (possibly reused) token stream tokenizing all the text in the provided
/// reader. From 3.1 onward: StandardTokenizer, StandardFilter, LowerCaseFilter,
/// StopFilter, optional SetKeywordMarkerFilter, optional StemmerOverrideFilter and
/// a Dutch SnowballFilter. Before 3.1: the legacy DutchStemFilter path without
/// lower-casing.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
{
#pragma warning disable 612, 618
    bool modern = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (modern)
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
        TokenStream chain = new StandardFilter(matchVersion, tokenizer);
        chain = new LowerCaseFilter(matchVersion, chain);
        chain = new StopFilter(matchVersion, chain, stoptable);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }
        if (stemdict != null)
        {
            // Apply user-supplied stem overrides before the algorithmic stemmer.
            chain = new StemmerOverrideFilter(chain, stemdict);
        }
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.DutchStemmer());
        return new TokenStreamComponents(tokenizer, chain);
    }
    else
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
        TokenStream chain = new StandardFilter(matchVersion, tokenizer);
        chain = new StopFilter(matchVersion, chain, stoptable);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }
#pragma warning disable 612, 618
        chain = new DutchStemFilter(chain, origStemdict);
#pragma warning restore 612, 618
        return new TokenStreamComponents(tokenizer, chain);
    }
}
/// <summary>
/// Extends the base analyzer's stream with noise/email-host filtering followed by
/// English Snowball stemming.
/// </summary>
public override TokenStream TokenStream (string fieldName, TextReader reader)
{
    TokenStream stream = base.TokenStream (fieldName, reader);
    stream = new NoiseEmailHostFilter (stream, true);
    return new SnowballFilter (stream, "English");
}
/// <summary>
/// Constructs a <seealso cref="StandardTokenizer"/> filtered by a
/// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/> (or
/// <seealso cref="TurkishLowerCaseFilter"/> for Turkish), a <seealso cref="StopFilter"/>,
/// and a <seealso cref="SnowballFilter"/>.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, tokenizer);
    // remove the possessive 's for english stemmers
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
    {
        result = new EnglishPossessiveFilter(result);
    }
    // Use a special lowercase filter for turkish, the stemmer expects it.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish"))
    {
        result = new TurkishLowerCaseFilter(result);
    }
    else
    {
        result = new LowerCaseFilter(matchVersion, result);
    }
    if (stopSet != null)
    {
        result = new StopFilter(matchVersion, result, stopSet);
    }
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> which tokenizes all the text in the
/// provided reader: StandardTokenizer, StandardFilter, ElisionFilter (3.2+),
/// LowerCaseFilter, StopFilter, an optional SetKeywordMarkerFilter for the stem
/// exclusion set, then ItalianLightStemFilter (3.6+) or an Italian SnowballFilter.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_32))
#pragma warning restore 612, 618
    {
        // Strip Italian elisions (l', un', ...) before lower-casing.
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
    {
        chain = new ItalianLightStemFilter(chain);
    }
    else
    {
        chain = new SnowballFilter(chain, new ItalianStemmer());
    }
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Ensures the English Snowball stemmer reduces "accents" to "accent" without
/// altering the token's offsets, type or position increment.
/// </summary>
public virtual void TestFilterTokens()
{
    var input = new Token("accents", 2, 7, "wrd");
    input.SetPositionIncrement(3);

    var filter = new SnowballFilter(new AnonymousClassTokenStream(input, this), "English");
    Token output = filter.Next();

    Assert.AreEqual("accent", output.TermText());
    Assert.AreEqual(2, output.StartOffset());
    Assert.AreEqual(7, output.EndOffset());
    Assert.AreEqual("wrd", output.Type());
    Assert.AreEqual(3, output.GetPositionIncrement());
}
/// <summary>
/// Asserts that SnowballFilter stems the injected token to "accent" and passes
/// offsets, type, position increment, flags and payload through unchanged.
/// </summary>
public virtual void TestFilterTokens()
{
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(this), "English");

    ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    ITypeAttribute typeAtt = filter.GetAttribute<ITypeAttribute>();
    IPositionIncrementAttribute posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    IFlagsAttribute flagsAtt = filter.GetAttribute<IFlagsAttribute>();
    IPayloadAttribute payloadAtt = filter.GetAttribute<IPayloadAttribute>();

    filter.IncrementToken();

    assertEquals("accent", termAtt.ToString());
    assertEquals(2, offsetAtt.StartOffset());
    assertEquals(7, offsetAtt.EndOffset());
    assertEquals("wrd", typeAtt.Type);
    assertEquals(3, posIncAtt.PositionIncrement);
    assertEquals(77, flagsAtt.Flags);
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}