A filter that stems words using a Snowball-generated stemmer. The available stemmers are listed in the SF.Snowball.Ext namespace. The name of a stemmer is the part of its class name before "Stemmer"; e.g., the stemmer in EnglishStemmer is named "English".
Inheritance: Lucene.Net.Analysis.TokenFilter
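In its simplest form, the filter wraps any TokenStream together with a stemmer name. A minimal sketch, assuming the Lucene.Net 3.x contrib API; the sample text and version constant are illustrative:

 // Stem English tokens produced by a StandardTokenizer.
 using System.IO;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Snowball;
 using Lucene.Net.Analysis.Standard;

 TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30,
                                            new StringReader("riding the horses"));
 stream = new SnowballFilter(stream, "English"); // "riding" -> "ride", "horses" -> "hors"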
 /// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
 /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
 /// </summary>
 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     if (stopSet != null)
         result = new StopFilter(result, stopSet);
     result = new SnowballFilter(result, name);
     return result;
 }
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream result = new SnowballFilter(
         new StopFilter(true,
             new LowerCaseFilter(
                 new StandardFilter(
                     new StandardTokenizer(_version, reader))),
             StopAnalyzer.ENGLISH_STOP_WORDS_SET),
         new EnglishStemmer());
     return result;
 }
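The nested construction above builds the same chain as the sequential reassignment in the previous example: the innermost component is the tokenizer, each wrapping filter is a later stage, and the SnowballFilter runs last.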
 /// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
 /// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>. 
 /// </summary>
 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     if (stopSet != null)
         result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                 result, stopSet);
     result = new SnowballFilter(result, name);
     return result;
 }
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);

            if (STOP_WORDS != null)
                result = new StopFilter(false, result, STOP_WORDS);
            result = new ASCIIFoldingFilter(result);
            result = new SnowballFilter(result, "English");

            return result;
        }
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Apply the standard tokenizer to the input
            var tokenizedInput = new StandardTokenizer(_version, reader);

            // Apply the standard, lowercase and English stop-word filters, then stem
            var filteredInput = new SnowballFilter(
                new StopFilter(true,
                    new LowerCaseFilter(new StandardFilter(tokenizedInput)),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET),
                new EnglishStemmer());

            // Apply an EdgeNGram filter to the front of each word;
            // _mingram and _maxgram bound the minimum and maximum gram lengths
            var grammedInput = new EdgeNGramTokenFilter(filteredInput, Side.FRONT, _mingram, _maxgram);

            return grammedInput;
        }
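Note the ordering here: because the EdgeNGramTokenFilter runs after the SnowballFilter, the front grams are built from already-stemmed, lowercased terms, which keeps prefix ("search-as-you-type") matching consistent with how the full terms are indexed.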
Example #6
        public virtual void  TestFilterTokens()
        {
            Token tok = new Token("accents", 2, 7, "wrd");

            tok.SetPositionIncrement(3);

            SnowballFilter filter = new SnowballFilter(new AnonymousClassTokenStream(tok, this), "English");

            Token newtok = filter.Next();

            System.Diagnostics.Trace.Assert("accent" == newtok.TermText());      //// assertEquals("accent", newtok.TermText());
            System.Diagnostics.Trace.Assert(2 == newtok.StartOffset());          //// assertEquals(2, newtok.StartOffset());
            System.Diagnostics.Trace.Assert(7 == newtok.EndOffset());            //// assertEquals(7, newtok.EndOffset());
            System.Diagnostics.Trace.Assert("wrd" == newtok.Type());             //// assertEquals("wrd", newtok.Type());
            System.Diagnostics.Trace.Assert(3 == newtok.GetPositionIncrement()); //// assertEquals(3, newtok.GetPositionIncrement());
        }
Example #7
        public virtual void  TestFilterTokens()
        {
            Token tok = new Token("accents", 2, 7, "wrd");

            tok.SetPositionIncrement(3);

            SnowballFilter filter = new SnowballFilter(new AnonymousClassTokenStream(tok, this), "English");

            Token newtok = filter.Next();

            Assert.AreEqual("accent", newtok.TermText());
            Assert.AreEqual(2, newtok.StartOffset());
            Assert.AreEqual(7, newtok.EndOffset());
            Assert.AreEqual("wrd", newtok.Type());
            Assert.AreEqual(3, newtok.GetPositionIncrement());
        }
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            if (STOP_WORDS != null)
                result = new StopFilter(false, result, STOP_WORDS);
            result = new ASCIIFoldingFilter(result);

            // We use a distinct version of the Spanish stemmer, called "Spanish2".
            // Check that this class exists in the Snowball library; the relative path
            // should be Snowball\SF\Snowball\Ext\. A copy of the class is kept in
            // this project just in case.
            result = new SnowballFilter(result, "Spanish2");

            return result;
        }
Example #9
        public virtual void TestFilterTokens()
        {
            SnowballFilter              filter     = new SnowballFilter(new TestTokenStream(this), "English");
            ICharTermAttribute          termAtt    = filter.GetAttribute <ICharTermAttribute>();
            IOffsetAttribute            offsetAtt  = filter.GetAttribute <IOffsetAttribute>();
            ITypeAttribute              typeAtt    = filter.GetAttribute <ITypeAttribute>();
            IPayloadAttribute           payloadAtt = filter.GetAttribute <IPayloadAttribute>();
            IPositionIncrementAttribute posIncAtt  = filter.GetAttribute <IPositionIncrementAttribute>();
            IFlagsAttribute             flagsAtt   = filter.GetAttribute <IFlagsAttribute>();

            filter.IncrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.StartOffset());
            assertEquals(7, offsetAtt.EndOffset());
            assertEquals("wrd", typeAtt.Type);
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example #10
        public void TestFilterTokens()
        {
            SnowballFilter              filter     = new SnowballFilter(new TestTokenStream(), "English");
            ITermAttribute              termAtt    = filter.GetAttribute <ITermAttribute>();
            IOffsetAttribute            offsetAtt  = filter.GetAttribute <IOffsetAttribute>();
            ITypeAttribute              typeAtt    = filter.GetAttribute <ITypeAttribute>();
            IPayloadAttribute           payloadAtt = filter.GetAttribute <IPayloadAttribute>();
            IPositionIncrementAttribute posIncAtt  = filter.GetAttribute <IPositionIncrementAttribute>();
            IFlagsAttribute             flagsAtt   = filter.GetAttribute <IFlagsAttribute>();

            filter.IncrementToken();

            Assert.AreEqual("accent", termAtt.Term);
            Assert.AreEqual(2, offsetAtt.StartOffset);
            Assert.AreEqual(7, offsetAtt.EndOffset);
            Assert.AreEqual("wrd", typeAtt.Type);
            Assert.AreEqual(3, posIncAtt.PositionIncrement);
            Assert.AreEqual(77, flagsAtt.Flags);
            Assert.AreEqual(new Payload(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example #11
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Extend the default English stop-word set with additional noise terms
            HashSet<string> newWords = new HashSet<string>()
            {
                "said", "have", "the", "more", "from", "who", "he", "than", "it", "were", "use", "has", "also",
                "been", "we", "which", "had", "you", "us", "them", "so", "in", "i", "our", "his", "to", "of", "a",
                "st", "ad", "co", "re", "ve", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g"
            };
            foreach (var word in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
            {
                newWords.Add(word);
            }

            TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
            result = new SnowballFilter(result, new EnglishStemmer());
            result = new LowerCaseFilter(result);
            result = new ASCIIFoldingFilter(result);
            result = new StopFilter(true, result, newWords);
            return result;
        }
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return new TokenStreamComponents(source, result);
            }
            else
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
            }
        }
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
 }
Example #14
        /// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
        /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(reader);

            if( TextFields == null || TextFields.Count == 0 || TextFields.Contains(fieldName) )
            {
                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                if( StopWords != null ) result = new StopFilter(result, StopWords);
                result = new SnowballFilter(result, StemmerName);
            }

            return result;
        }
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="TurkishLowerCaseFilter"/>,
 ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem
 ///         exclusion set is provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
     {
         result = new ApostropheFilter(result);
     }
     result = new TurkishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Any())
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new TurkishStemmer());
     return new TokenStreamComponents(source, result);
 }
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, <seealso cref="GermanNormalizationFilter"/> and <seealso cref="GermanLightStemFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            result = new SetKeywordMarkerFilter(result, exclusionSet);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                result = new GermanNormalizationFilter(result);
                result = new GermanLightStemFilter(result);
            }
            else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                result = new SnowballFilter(result, new German2Stemmer());
            }
            else
            {
                result = new GermanStemFilter(result);
            }
            return new TokenStreamComponents(source, result);
        }
Example #18
			public override TokenStream TokenStream (string fieldName, TextReader reader)
			{
				bool is_text_prop = false;

				// Strip off the first two characters in a property.
				// We store type information in those two characters, so we don't
				// want to index them.
				if (fieldName.StartsWith ("prop:")) {
					
					if (strip_extra_property_info) {
						// Skip everything up to and including the first :
						int c;
						do {
							c = reader.Read ();
						} while (c != -1 && c != ':');
					}

					is_text_prop = fieldName.StartsWith ("prop:t");

					// If this is non-text property, just return one token
					// containing the entire string.  We do this to avoid
					// tokenizing keywords.
					if (! is_text_prop) {
						// We don't want to lower case the token if it's
						// not in the private namespace.
							
						TokenStream singleton_stream = new SingletonTokenStream (reader.ReadToEnd ());
						
						if (fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace))
							return singleton_stream;
						else
							return new LowerCaseFilter (singleton_stream);
					}
				} else if (fieldName == "PropertyKeyword")
					return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
				else if (fieldName == "Properties")
					return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
				else if (fieldName == "TextLinks")
					return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));

				TokenStream outstream;
				outstream = base.TokenStream (fieldName, reader);

				NoiseEmailHostFilter.LinkCallback add_link_callback = null;
				lock (this) {
					if (fieldName == "Text")
						add_link_callback = add_link;
				}

				if (fieldName == "Text"
				    || fieldName == "HotText"
				    || fieldName == "PropertyText"
				    || is_text_prop) {
					outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname, add_link_callback);
					// Sharing Stemmer is not thread safe.
					// Currently our underlying lucene indexing is not done in multiple threads.
					StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
					outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
				}

				return outstream;
			}
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
     #pragma warning disable 612, 618
     if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
     #pragma warning restore 612, 618
     {
         s.EnablePositionIncrements = false;
     }
     result = s;
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
     result = new IrishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new IrishStemmer());
     return new TokenStreamComponents(source, result);
 }
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="SnowballFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.RussianStemmer());
                return new TokenStreamComponents(source, result);
            }
            else
            {
#pragma warning disable 612, 618
                Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
                TokenStream result = new LowerCaseFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.RussianStemmer());
                return new TokenStreamComponents(source, result);
            }
        }
 /// <summary>
 /// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the 
 /// text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/>
 ///   filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, 
 ///   <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
 ///   <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
 {
     #pragma warning disable 612, 618
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
     #pragma warning restore 612, 618
     {
         Tokenizer source = new StandardTokenizer(matchVersion, aReader);
         TokenStream result = new StandardFilter(matchVersion, source);
         result = new LowerCaseFilter(matchVersion, result);
         result = new StopFilter(matchVersion, result, stoptable);
         if (excltable.Count > 0)
         {
             result = new SetKeywordMarkerFilter(result, excltable);
         }
         if (stemdict != null)
         {
             result = new StemmerOverrideFilter(result, stemdict);
         }
         result = new SnowballFilter(result, new Tartarus.Snowball.Ext.DutchStemmer());
         return new TokenStreamComponents(source, result);
     }
     else
     {
         Tokenizer source = new StandardTokenizer(matchVersion, aReader);
         TokenStream result = new StandardFilter(matchVersion, source);
         result = new StopFilter(matchVersion, result, stoptable);
         if (excltable.Count > 0)
         {
             result = new SetKeywordMarkerFilter(result, excltable);
         }
     #pragma warning disable 612, 618
         result = new DutchStemFilter(result, origStemdict);
     #pragma warning restore 612, 618
         return new TokenStreamComponents(source, result);
     }
 }
Example #23
		public override TokenStream TokenStream (string fieldName, TextReader reader)
		{
			TokenStream outstream;
			outstream = base.TokenStream (fieldName, reader);
			outstream = new NoiseEmailHostFilter (outstream, true);
			outstream = new SnowballFilter (outstream, "English");

			return outstream;
		}
 /// <summary>
 /// Constructs a <seealso cref="StandardTokenizer"/> filtered by a
 ///    <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>,
 ///    and a <seealso cref="SnowballFilter"/> 
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, tokenizer);
     // remove the possessive 's for english stemmers
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
         result = new EnglishPossessiveFilter(result);
     }
     // Use a special lowercase filter for turkish, the stemmer expects it.
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish"))
     {
         result = new TurkishLowerCaseFilter(result);
     }
     else
     {
         result = new LowerCaseFilter(matchVersion, result);
     }
     if (stopSet != null)
     {
         result = new StopFilter(matchVersion, result, stopSet);
     }
     result = new SnowballFilter(result, name);
     return new TokenStreamComponents(tokenizer, result);
 }
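Any of the CreateComponents examples above can be exercised the same way. A minimal sketch, assuming the Lucene.Net 4.8 attribute API; the analyzer instance, field name, and sample text are placeholders:

 // Print the terms an analyzer produces for a sample string.
 // Assumes: using System; using System.IO;
 //          using Lucene.Net.Analysis; using Lucene.Net.Analysis.TokenAttributes;
 using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("riding the horses")))
 {
     ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
     ts.Reset();
     while (ts.IncrementToken())
     {
         Console.WriteLine(termAtt.ToString());
     }
     ts.End();
 }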
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="ItalianLightStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_32))
#pragma warning restore 612, 618
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                result = new ItalianLightStemFilter(result);
            }
            else
            {
                result = new SnowballFilter(result, new ItalianStemmer());
            }
            return new TokenStreamComponents(source, result);
        }