Removes stop words from a token stream.
Inheritance: Lucene.Net.Analysis.TokenFilter

Example No. 1
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);
     TokenStream filterStream = new StandardFilter(tokenizer);
     TokenStream stream = new StopFilter(true, filterStream, _stopWords, true);
     return stream;
 }
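The methods in these examples only build the analysis chain; a consumer still has to pull tokens from the returned stream. Below is a minimal, self-contained sketch of driving such a chain end to end, assuming the Lucene.Net 3.0.3 generics API that several later examples use (AddAttribute<ITermAttribute>(), ITermAttribute.Term); the StopFilterSketch class name is purely illustrative.

 using System;
 using System.IO;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Tokenattributes;

 class StopFilterSketch
 {
     static void Main()
     {
         // Build a simple chain: whitespace tokenizer -> lowercase -> stop filter.
         var stopSet = StopFilter.MakeStopSet(new[] { "is", "the" });
         TokenStream stream = new StopFilter(true,
             new LowerCaseFilter(new WhitespaceTokenizer(new StringReader("Now is The Time"))),
             stopSet);

         // Pull the surviving tokens through the attribute API.
         ITermAttribute termAtt = stream.AddAttribute<ITermAttribute>();
         while (stream.IncrementToken())
             Console.WriteLine(termAtt.Term); // prints "now", then "time"
     }
 }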
Example No. 2
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream result = whitespaceAnalyzer.TokenStream(fieldName, reader);
     result = new LowerCaseFilter(result);
     result = new StopFilter(true, result, stopWords);
     return result;
 }
Example No. 3
		public virtual void  TestIgnoreCase()
		{
			System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
			System.String[] stopWords = new System.String[]{"is", "the", "Time"};
			TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true);
			Assert.AreEqual("Now", stream.Next().TermText());
			Assert.AreEqual(null, stream.Next());
		}
Example No. 4
 /// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
 /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
 /// </summary>
 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
 }
Example No. 5
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var seg = new JiebaSegmenter();
     TokenStream result = new JiebaTokenizer(seg, reader);
     // This filter is necessary, because the parser converts the queries to lower case.
     result = new LowerCaseFilter(result);
     result = new StopFilter(true, result, StopWords);
     return result;
 }
Example No. 6
 public override TokenStream TokenStream(string fieldname, TextReader reader)
 {
     TokenStream result = new PersianTokenizer(reader);
     result = new LowerCaseFilter(result);
     result = new PersianNormalizationFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), result, _stoptable);
     result = new PersianLemmatizationFilter(result);
     return result;
 }
Example No. 7
 /** Constructs a {@link StandardTokenizer} filtered by a {@link
  * StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}
  * and a {@link SpanishStemFilter}. */
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(Version.LUCENE_24, reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     result = new StopFilter(true, result, stopTable);
     result = new SpanishStemFilter(result);
     return result;
 }
Example No. 8
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            var seg = new JiebaSegmenter();
            TokenStream result = new JiebaTokenizer(seg, reader);

            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, StopWords);
            return result;
        }
Example No. 9
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var tokenizer = new StandardTokenizer(Version.LUCENE_29, reader);
     tokenizer.MaxTokenLength = 255;
     TokenStream filter = new StandardFilter(tokenizer);
     filter = new LowerCaseFilter(filter);
     filter = new StopFilter(false, filter, StandardAnalyzer.STOP_WORDS_SET);
     return new NGramTokenFilter(filter, 2, 6);
 }
Example No. 10
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(this.enableStopPositionIncrements, result, stoptable);
            result = new BulgarianStemFilter(result);

            return result;
        }
Example No. 11
		public virtual void  TestStopFilt()
		{
			System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
			System.String[] stopWords = new System.String[]{"is", "the", "Time"};
			System.Collections.Hashtable stopSet = StopFilter.MakeStopSet(stopWords);
			TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
			Assert.AreEqual("Now", stream.Next().TermText());
			Assert.AreEqual("The", stream.Next().TermText());
			Assert.AreEqual(null, stream.Next());
		}
Example No. 12
		public virtual void  TestIgnoreCase()
		{
			System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
			System.String[] stopWords = new System.String[]{"is", "the", "Time"};
			TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true);
			TermAttribute termAtt = (TermAttribute) stream.GetAttribute(typeof(TermAttribute));
			Assert.IsTrue(stream.IncrementToken());
			Assert.AreEqual("Now", termAtt.Term());
			Assert.IsFalse(stream.IncrementToken());
		}
Example No. 13
 public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
 {
     var result = new StopFilter(true,
                     new ASCIIFoldingFilter(
                         new LowerCaseFilter(
                             new StandardFilter(
                                 new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader)))),
                     StandardAnalyzer.STOP_WORDS_SET);
     return result;
 }
Example No. 14
 /// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a {@link
 /// StandardFilter}, a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>. 
 /// </summary>
 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     if (stopSet != null)
         result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                 result, stopSet);
     result = new SnowballFilter(result, name);
     return result;
 }
Example No. 15
		public virtual void  TestIgnoreCase()
		{
			System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
            var stopWords = Support.Compatibility.SetFactory.GetSet<string>();
            stopWords.UnionWith(new[] {"is", "the", "Time"});

			TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true);
            ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
			Assert.IsTrue(stream.IncrementToken());
			Assert.AreEqual("Now", termAtt.Term);
			Assert.IsFalse(stream.IncrementToken());
		}
Example No. 16
		public virtual void  TestStopFilt()
		{
			System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
			System.String[] stopWords = new System.String[]{"is", "the", "Time"};
			var stopSet = StopFilter.MakeStopSet(stopWords);
			TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
            ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
			Assert.IsTrue(stream.IncrementToken());
			Assert.AreEqual("Now", termAtt.Term);
			Assert.IsTrue(stream.IncrementToken());
			Assert.AreEqual("The", termAtt.Term);
			Assert.IsFalse(stream.IncrementToken());
		}
Example No. 17
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);

            if (STOP_WORDS != null)
                result = new StopFilter(false, result, STOP_WORDS);
            result = new ASCIIFoldingFilter(result);
            result = new SnowballFilter(result, "English");

            return result;
        }
Example No. 18

        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(kLuceneVersion, reader);

            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new ASCIIFoldingFilter(result);
            result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
            result = new EdgeNGramTokenFilter(
                result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.Side.FRONT, 1, 20);

            return result;
        }
Example No. 19

 public override TokenStream TokenStream(string fieldName, TextReader reader) {
     TokenStream result = new StandardTokenizer(this._luceneVersion, reader);
     result = new StandardFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(this._luceneVersion),
         result,
         CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false))
     );
     result = new FrenchStemFilter(result, CharArraySet.EMPTY_SET);
     // Convert to lowercase after stemming!
     result = new LowerCaseFilter(result);
     result = new ASCIIFoldingFilter(result);
     return result;
 }
Example No. 20
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new SentenceTokenizer(reader);
     result = new WordTokenizer(result, wordSegment);
     // result = new LowerCaseFilter(result);
     // LowerCaseFilter is no longer needed, because SegTokenFilter already converts all English characters to lower case.
     // Stemming here is quite strict, but this is not a bug, it's a feature :)
     result = new PorterStemFilter(result);
     if (stopWords != null)
     {
         result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
     }
     return result;
 }
Example No. 21
        /// <summary>
        /// Builds the token stream: standard tokenization, lower-casing,
        /// English stop-word removal, and synonym injection.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed.</param>
        /// <param name="reader">Reader supplying the text to analyze.</param>
        /// <returns>The assembled token stream.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            //create the tokenizer
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

            //add in filters
            result = new StandardFilter(result); // first normalize the StandardTokenizer
            result = new LowerCaseFilter(result);// makes sure everything is lower case
            result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); // use the default list of Stop Words, provided by the StopAnalyzer class.
            result = new SynonymFilter(result, SynonymEngine); // injects the synonyms.

            //return the built token stream.
            return result;
        }
Example No. 22

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            //Apply standard tokenizer to input
            var tokenizedInput = new StandardTokenizer(_version, reader);

            //TODO: do we want to remove stop words from auto complete?
            //Apply standard, lowercase and English stop words filters to input
            var filteredInput = new StopFilter(true, new LowerCaseFilter(new StandardFilter(tokenizedInput)),
                StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            //Apply EdgeNGram filter to front of words
            //Min size of grams max size of grams
            var grammedInput = new EdgeNGramTokenFilter(filteredInput, Side.FRONT, _mingram, _maxgram);

            return grammedInput;
        }
Example No. 23
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            if (STOP_WORDS != null)
                result = new StopFilter(false, result, STOP_WORDS);
            result = new ASCIIFoldingFilter(result);

            // We use a distinct version of the Spanish stemmer, called Spanish2.
            // Check that this class can be found in the Snowball library; the relative path
            // should be: Snowball\SF\Snowball\Ext\
            // Just in case, a copy of this class is kept in this project.
            result = new SnowballFilter(result, "Spanish");

            return result;
        }
Example No. 24
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Extend the default English stop list with additional high-frequency words.
            HashSet<string> newWords = new HashSet<string>()
            {
                "said", "have", "the", "more", "from", "who", "he", "than", "it", "were", "use", "has", "also",
                "been", "we", "which", "had", "you", "us", "them", "so", "in", "i", "our", "his", "to", "of", "a",
                "st", "ad", "co", "re", "ve", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g", "it"
            };
            foreach (var word in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
            {
                newWords.Add(word);
            }

            // Note: stemming runs before lower-casing and stop-word removal in this chain.
            TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
            result = new SnowballFilter(result, new EnglishStemmer());
            result = new LowerCaseFilter(result);
            result = new ASCIIFoldingFilter(result);
            result = new StopFilter(true, result, newWords);
            return result;
        }
Example No. 25
        private void  TestCachingCustomToken(int api)
        {
            TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));

            stream = new PartOfSpeechTaggingFilter(stream);
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(stream, stopwords);
            stream = new CachingTokenFilter(stream);             // <- the caching is done before the annotating!
            stream = new PartOfSpeechAnnotatingFilter(stream);

            switch (api)
            {
            case 0:
                ConsumeStreamNewAPI(stream);
                ConsumeStreamNewAPI(stream);
                break;

            case 1:
                ConsumeStreamOldAPI(stream);
                ConsumeStreamOldAPI(stream);
                break;

            case 2:
                ConsumeStreamVeryOldAPI(stream);
                ConsumeStreamVeryOldAPI(stream);
                break;

            case 3:
                ConsumeStreamNewAPI(stream);
                ConsumeStreamOldAPI(stream);
                ConsumeStreamVeryOldAPI(stream);
                ConsumeStreamNewAPI(stream);
                ConsumeStreamVeryOldAPI(stream);
                break;
            }
        }
Example No. 26
 /*
  * Creates a {@link TokenStream} which tokenizes all the text in the 
  * provided {@link Reader}.
  *
  * @return  A {@link TokenStream} built from a 
  *   {@link RussianLetterTokenizer} filtered with 
  *   {@link RussianLowerCaseFilter}, {@link StopFilter}, 
  *   and {@link RussianStemFilter}
  */
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new RussianLetterTokenizer(reader);
     result = new LowerCaseFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                             result, stopSet);
     result = new RussianStemFilter(result);
     return result;
 }
Example No. 27
	/*
	 * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
	 *
	 * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
	 * 			{@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
	 */
	public override sealed TokenStream TokenStream( String fieldName, TextReader reader ) {
		TokenStream result = new StandardTokenizer( matchVersion, reader );
		result = new StandardFilter( result );
		result = new LowerCaseFilter( result );
		result = new StopFilter( StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
		                         result, stoptable );
		return result;
	}
Example No. 28
		public virtual void  TestOverridesAny()
		{
			try
			{
				TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
				stream = new AnonymousClassTokenFilter(this, stream);
				stream = new LowerCaseFilter(stream);
				stream = new StopFilter(stream, stopwords);
				while (stream.IncrementToken())
					;
				Assert.Fail("One TokenFilter does not override any of the required methods, so it should fail.");
			}
			catch (System.NotSupportedException uoe)
			{
				Assert.IsTrue(uoe.Message.EndsWith("does not implement any of incrementToken(), next(Token), next()."));
			}
		}
Example No. 29
		public virtual void  TestOnlyNewAPI()
		{
			TokenStream.SetOnlyUseNewAPI(true);
			try
			{
				
				// this should fail with UOE
				try
				{
					TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
					stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
					stream = new LowerCaseFilter(stream);
					stream = new StopFilter(stream, stopwords);
					while (stream.IncrementToken())
						;
					Assert.Fail("If only the new API is allowed, this should fail with an UOE");
				}
				catch (System.NotSupportedException uoe)
				{
					Assert.IsTrue((typeof(PartOfSpeechTaggingFilter).FullName + " does not implement incrementToken() which is needed for onlyUseNewAPI.").Equals(uoe.Message));
				}
				
				// this should pass, as all core token streams support the new API
				TokenStream stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
				stream2 = new LowerCaseFilter(stream2);
				stream2 = new StopFilter(stream2, stopwords);
				while (stream2.IncrementToken())
					;
				
				// Test, if all attributes are implemented by their implementation, not Token/TokenWrapper
				Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TermAttributeImpl, "TermAttribute is implemented by TermAttributeImpl");
				Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is OffsetAttributeImpl, "OffsetAttribute is implemented by OffsetAttributeImpl");
				Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is FlagsAttributeImpl, "FlagsAttribute is implemented by FlagsAttributeImpl");
				Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is PayloadAttributeImpl, "PayloadAttribute is implemented by PayloadAttributeImpl");
				Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is PositionIncrementAttributeImpl, "PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl");
				Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TypeAttributeImpl, "TypeAttribute is implemented by TypeAttributeImpl");
				Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
				
				// try to call old API, this should fail
				try
				{
					stream2.Reset();
					Token reusableToken = new Token();
					while ((reusableToken = stream2.Next(reusableToken)) != null)
						;
					Assert.Fail("If only the new API is allowed, this should fail with an UOE");
				}
				catch (System.NotSupportedException uoe)
				{
					Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
				}
				try
				{
					stream2.Reset();
					while (stream2.Next() != null)
						;
					Assert.Fail("If only the new API is allowed, this should fail with an UOE");
				}
				catch (System.NotSupportedException uoe)
				{
					Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
				}
				
				// Test if the wrapper API (onlyUseNewAPI==false) uses TokenWrapper
				// as attribute instance.
				// TokenWrapper encapsulates a Token instance that can be exchanged
				// for another Token instance without changing the AttributeImpl instance
				// itself.
				TokenStream.SetOnlyUseNewAPI(false);
				stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
				Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TokenWrapper, "TermAttribute is implemented by TokenWrapper");
				Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is TokenWrapper, "OffsetAttribute is implemented by TokenWrapper");
				Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is TokenWrapper, "FlagsAttribute is implemented by TokenWrapper");
				Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is TokenWrapper, "PayloadAttribute is implemented by TokenWrapper");
				Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is TokenWrapper, "PositionIncrementAttribute is implemented by TokenWrapper");
				Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TokenWrapper, "TypeAttribute is implemented by TokenWrapper");
				// This one is not implemented by TokenWrapper:
				Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
			}
			finally
			{
				TokenStream.SetOnlyUseNewAPI(false);
			}
		}
Example No. 30
		private void  TestCachingCustomToken(int api)
		{
			TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
			stream = new PartOfSpeechTaggingFilter(stream);
			stream = new LowerCaseFilter(stream);
			stream = new StopFilter(stream, stopwords);
			stream = new CachingTokenFilter(stream); // <- the caching is done before the annotating!
			stream = new PartOfSpeechAnnotatingFilter(stream);
			
			switch (api)
			{
				
				case 0: 
					ConsumeStreamNewAPI(stream);
					ConsumeStreamNewAPI(stream);
					break;
				
				case 1: 
					ConsumeStreamOldAPI(stream);
					ConsumeStreamOldAPI(stream);
					break;
				
				case 2: 
					ConsumeStreamVeryOldAPI(stream);
					ConsumeStreamVeryOldAPI(stream);
					break;
				
				case 3: 
					ConsumeStreamNewAPI(stream);
					ConsumeStreamOldAPI(stream);
					ConsumeStreamVeryOldAPI(stream);
					ConsumeStreamNewAPI(stream);
					ConsumeStreamVeryOldAPI(stream);
					break;
				}
		}
Example No. 31
 /// <summary>Builds an analyzer with the stop words from the given reader. </summary>
 /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)">
 /// </seealso>
 /// <param name="matchVersion">See <a href="#Version">above</a>
 /// </param>
 /// <param name="stopwords">Reader to load stop words from
 /// </param>
 public StopAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
 {
     stopWords = WordlistLoader.GetWordSet(stopwords);
     enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }
Example No. 32
 /// <summary>Builds an analyzer with the stop words from the given set.</summary>
 public StopAnalyzer(Version matchVersion, ICollection<string> stopWords)
 {
     this.stopWords = stopWords;
     useDefaultStopPositionIncrement = false;
     enablePositionIncrements        = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }
Example No. 33
 /// <summary>Builds an analyzer with the stop words from the given set.</summary>
 public StopAnalyzer(Version matchVersion, System.Collections.Hashtable stopWords)
 {
     this.stopWords = stopWords;
     useDefaultStopPositionIncrement = false;
     enablePositionIncrements        = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }
Example No. 34
 public StopAnalyzer(System.String[] stopWords, bool enablePositionIncrements)
 {
     this.stopWords = StopFilter.MakeStopSet(stopWords);
     this.enablePositionIncrements   = enablePositionIncrements;
     useDefaultStopPositionIncrement = false;
 }
Example No. 35
 /// <summary> Builds an analyzer with the stop words from the given file.
 ///
 /// </summary>
 /// <seealso cref="WordlistLoader.getWordSet(File)">
 /// </seealso>
 /// <param name="matchVersion">See <a href="#version">above</a>
 /// </param>
 /// <param name="stopwordsFile">File to load stop words from
 /// </param>
 public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
 {
     stopWords = WordlistLoader.GetWordSet(stopwordsFile);
     this.enablePositionIncrements   = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
     useDefaultStopPositionIncrement = false;
 }
Example No. 36
 /// <summary> Builds an analyzer which removes words in ENGLISH_STOP_WORDS.</summary>
 public StopAnalyzer(Version matchVersion)
 {
     stopWords = ENGLISH_STOP_WORDS_SET;
     enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }
Example No. 37
        public virtual void  TestStopPositons()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            System.Collections.Generic.List<string> a = new System.Collections.Generic.List<string>();
            for (int i = 0; i < 20; i++)
            {
                System.String w = English.IntToEnglish(i).Trim();
                sb.Append(w).Append(" ");
                if (i % 3 != 0)
                {
                    a.Add(w);
                }
            }
            Log(sb.ToString());
            System.String[] stopWords = (System.String[])a.ToArray();
            for (int i = 0; i < a.Count; i++)
            {
                Log("Stop: " + stopWords[i]);
            }
            var stopSet = StopFilter.MakeStopSet(stopWords);

            // with increments
            System.IO.StringReader reader = new System.IO.StringReader(sb.ToString());
            StopFilter             stpf   = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);

            DoTestStopPositons(stpf, true);
            // without increments
            reader = new System.IO.StringReader(sb.ToString());
            stpf   = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
            DoTestStopPositons(stpf, false);
            // with increments, concatenating two stop filters
            System.Collections.Generic.List<System.String> a0 = new System.Collections.Generic.List<System.String>();
            System.Collections.Generic.List<System.String> a1 = new System.Collections.Generic.List<System.String>();
            for (int i = 0; i < a.Count; i++)
            {
                if (i % 2 == 0)
                {
                    a0.Add(a[i]);
                }
                else
                {
                    a1.Add(a[i]);
                }
            }
            System.String[] stopWords0 = (System.String[])a0.ToArray();
            for (int i = 0; i < a0.Count; i++)
            {
                Log("Stop0: " + stopWords0[i]);
            }
            System.String[] stopWords1 = (System.String[])a1.ToArray();
            for (int i = 0; i < a1.Count; i++)
            {
                Log("Stop1: " + stopWords1[i]);
            }
            var stopSet0 = StopFilter.MakeStopSet(stopWords0);
            var stopSet1 = StopFilter.MakeStopSet(stopWords1);

            reader = new System.IO.StringReader(sb.ToString());
            StopFilter stpf0 = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet0);             // first part of the set

            stpf0.EnablePositionIncrements = true;
            StopFilter stpf01 = new StopFilter(false, stpf0, stopSet1);             // two stop filters concatenated!

            DoTestStopPositons(stpf01, true);
        }
Example No. 38
 /// <summary>Builds an analyzer which removes words in ENGLISH_STOP_WORDS. </summary>
 public StopAnalyzer()
 {
     stopWords = StopFilter.MakeStopSet(ENGLISH_STOP_WORDS);
 }
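The constructors above and below only configure the stop set; here is a short usage sketch, assuming the Lucene.Net 3.0.3 overload that takes a Version and an ISet<string> (the StopAnalyzerSketch class name and the "body" field name are illustrative).

 using System;
 using System.IO;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Tokenattributes;
 using Version = Lucene.Net.Util.Version;

 class StopAnalyzerSketch
 {
     static void Main()
     {
         // Build the analyzer with a custom stop set, then analyze one field value.
         var stopWords = StopFilter.MakeStopSet(new[] { "is", "the" });
         var analyzer = new StopAnalyzer(Version.LUCENE_30, stopWords);
         TokenStream stream = analyzer.TokenStream("body", new StringReader("Now is The Time"));

         // StopAnalyzer lower-cases while tokenizing, so "The" is removed as well.
         ITermAttribute termAtt = stream.AddAttribute<ITermAttribute>();
         while (stream.IncrementToken())
             Console.WriteLine(termAtt.Term); // prints "now", then "time"
     }
 }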
Example No. 39
 /// <summary>Builds an analyzer which removes words in the provided array. </summary>
 public StopAnalyzer(System.String[] stopWords)
 {
     this.stopWords = StopFilter.MakeStopSet(stopWords);
 }
Example No. 40
		private void  TestTeeSinkCustomToken(int api)
		{
			TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
			stream = new PartOfSpeechTaggingFilter(stream);
			stream = new LowerCaseFilter(stream);
			stream = new StopFilter(stream, stopwords);
			
			SinkTokenizer sink = new SinkTokenizer();
			TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink);
			
			stream = new TeeTokenFilter(stream, sink);
			stream = new PartOfSpeechAnnotatingFilter(stream);
			
			switch (api)
			{
				
				case 0: 
					ConsumeStreamNewAPI(stream);
					ConsumeStreamNewAPI(stream1);
					break;
				
				case 1: 
					ConsumeStreamOldAPI(stream);
					ConsumeStreamOldAPI(stream1);
					break;
				
				case 2: 
					ConsumeStreamVeryOldAPI(stream);
					ConsumeStreamVeryOldAPI(stream1);
					break;
				}
		}
Example No. 41

        public virtual void  TestOnlyNewAPI()
        {
            TokenStream.SetOnlyUseNewAPI(true);
            try
            {
                // this should fail with UOE
                try
                {
                    TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
                    stream = new PartOfSpeechTaggingFilter(stream);                     // <-- this one is evil!
                    stream = new LowerCaseFilter(stream);
                    stream = new StopFilter(stream, stopwords);
                    while (stream.IncrementToken())
                    {
                        ;
                    }
                    Assert.Fail("If only the new API is allowed, this should fail with an UOE");
                }
                catch (System.NotSupportedException uoe)
                {
                    Assert.IsTrue((typeof(PartOfSpeechTaggingFilter).FullName + " does not implement incrementToken() which is needed for onlyUseNewAPI.").Equals(uoe.Message));
                }

                // this should pass, as all core token streams support the new API
                TokenStream stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
                stream2 = new LowerCaseFilter(stream2);
                stream2 = new StopFilter(stream2, stopwords);
                while (stream2.IncrementToken())
                {
                    ;
                }

                // Test, if all attributes are implemented by their implementation, not Token/TokenWrapper
                Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TermAttributeImpl, "TermAttribute is implemented by TermAttributeImpl");
                Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is OffsetAttributeImpl, "OffsetAttribute is implemented by OffsetAttributeImpl");
                Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is FlagsAttributeImpl, "FlagsAttribute is implemented by FlagsAttributeImpl");
                Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is PayloadAttributeImpl, "PayloadAttribute is implemented by PayloadAttributeImpl");
                Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is PositionIncrementAttributeImpl, "PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl");
                Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TypeAttributeImpl, "TypeAttribute is implemented by TypeAttributeImpl");
                Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");

                // try to call old API, this should fail
                try
                {
                    stream2.Reset();
                    Token reusableToken = new Token();
                    while ((reusableToken = stream2.Next(reusableToken)) != null)
                    {
                        ;
                    }
                    Assert.Fail("If only the new API is allowed, this should fail with an UOE");
                }
                catch (System.NotSupportedException uoe)
                {
                    Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
                }
                try
                {
                    stream2.Reset();
                    while (stream2.Next() != null)
                    {
                        ;
                    }
                    Assert.Fail("If only the new API is allowed, this should fail with an UOE");
                }
                catch (System.NotSupportedException uoe)
                {
                    Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
                }

                // Test if the wrapper API (onlyUseNewAPI==false) uses TokenWrapper
                // as attribute instance.
                // TokenWrapper encapsulates a Token instance that can be exchanged
                // for another Token instance without changing the AttributeImpl instance
                // itself.
                TokenStream.SetOnlyUseNewAPI(false);
                stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
                Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TokenWrapper, "TermAttribute is implemented by TokenWrapper");
                Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is TokenWrapper, "OffsetAttribute is implemented by TokenWrapper");
                Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is TokenWrapper, "FlagsAttribute is implemented by TokenWrapper");
                Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is TokenWrapper, "PayloadAttribute is implemented by TokenWrapper");
                Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is TokenWrapper, "PositionIncrementAttribute is implemented by TokenWrapper");
                Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TokenWrapper, "TypeAttribute is implemented by TokenWrapper");
                // This one is not implemented by TokenWrapper:
                Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
            }
            finally
            {
                TokenStream.SetOnlyUseNewAPI(false);
            }
        }
Example No. 42
 /// <summary>Builds an analyzer with the stop words from the given set.</summary>
 public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
 {
     this.stopWords           = stopWords;
     enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }