/// <summary>
/// Builds the analysis chain: StandardTokenizer -> StandardFilter -> StopFilter.
/// The StopFilter enables position increments and matches _stopWords ignoring case.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Version.LUCENE_30, reader);
    chain = new StandardFilter(chain);
    // true/true: keep position increments for removed tokens, ignore case when matching.
    return new StopFilter(true, chain, _stopWords, true);
}
/// <summary>
/// Delegates tokenization to the wrapped whitespace analyzer, then lower-cases
/// tokens and removes entries found in stopWords (position increments enabled).
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var tokens = whitespaceAnalyzer.TokenStream(fieldName, reader);
    var lowered = new LowerCaseFilter(tokens);
    return new StopFilter(true, lowered, stopWords);
}
// With ignoreCase=true the StopFilter must drop "is", "The" and "Time"
// regardless of letter case, leaving only "Now" (old Next()/TermText() API).
public virtual void TestIgnoreCase()
{
    System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
    System.String[] stopWords = new System.String[]{"is", "the", "Time"};
    // Last argument = ignoreCase.
    TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true);
    Assert.AreEqual("Now", stream.Next().TermText());
    // Every remaining token is a stop word, so the stream must be exhausted.
    Assert.AreEqual(null, stream.Next());
}
/// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
/// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    // Same pipeline as before, expressed as one nested construction:
    // tokenize -> standard-normalize -> lower-case -> remove stop words.
    return new StopFilter(
        new LowerCaseFilter(
            new StandardFilter(
                new StandardTokenizer(reader))),
        stopSet);
}
/// <summary>
/// Tokenizes Chinese text with Jieba, lower-cases the tokens, and removes
/// entries found in StopWords.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new JiebaTokenizer(new JiebaSegmenter(), reader);
    // Lower-casing is required because the query parser converts queries to lower case.
    chain = new LowerCaseFilter(chain);
    return new StopFilter(true, chain, StopWords);
}
/// <summary>
/// Persian analysis chain: PersianTokenizer, lower-casing, Persian
/// normalization, stop-word removal, then Persian lemmatization.
/// </summary>
public override TokenStream TokenStream(string fieldname, TextReader reader)
{
    // Stop-word position-increment behavior follows the configured Lucene version.
    bool keepIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(_version);
    TokenStream chain = new PersianTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    chain = new PersianNormalizationFilter(chain);
    chain = new StopFilter(keepIncrements, chain, _stoptable);
    return new PersianLemmatizationFilter(chain);
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
 * StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}
 * and a {@link SpanishStemFilter}. */
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Version.LUCENE_24, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Remove Spanish stop words (position increments enabled) before stemming.
    chain = new StopFilter(true, chain, stopTable);
    return new SpanishStemFilter(chain);
}
/// <summary>
/// Jieba-based chain: segment the input, lower-case tokens, then filter out
/// entries found in StopWords (position increments enabled).
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new JiebaTokenizer(new JiebaSegmenter(), reader);
    chain = new LowerCaseFilter(chain);
    return new StopFilter(true, chain, StopWords);
}
/// <summary>
/// Builds an n-gram chain: standard tokenization (tokens capped at 255 chars),
/// normalization, lower-casing, default English stop-word removal, then
/// n-grams of length 2..6.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var source = new StandardTokenizer(Version.LUCENE_29, reader) { MaxTokenLength = 255 };
    TokenStream chain = new StandardFilter(source);
    chain = new LowerCaseFilter(chain);
    // false: do not preserve position increments for removed stop words.
    chain = new StopFilter(false, chain, StandardAnalyzer.STOP_WORDS_SET);
    return new NGramTokenFilter(chain, 2, 6);
}
/// <summary>
/// Bulgarian analysis chain: StandardTokenizer, StandardFilter, lower-casing,
/// stop-word removal, then Bulgarian stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardFilter(new StandardTokenizer(matchVersion, reader));
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(this.enableStopPositionIncrements, chain, stoptable);
    return new BulgarianStemFilter(chain);
}
// Case-sensitive stop filtering (old Next()/TermText() API): "is" and "Time"
// match the stop set exactly and are removed; "The" survives because the set
// only contains lower-case "the".
public virtual void TestStopFilt()
{
    System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
    System.String[] stopWords = new System.String[]{"is", "the", "Time"};
    System.Collections.Hashtable stopSet = StopFilter.MakeStopSet(stopWords);
    TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
    Assert.AreEqual("Now", stream.Next().TermText());
    Assert.AreEqual("The", stream.Next().TermText());
    // Stream is exhausted after the two surviving tokens.
    Assert.AreEqual(null, stream.Next());
}
// Same ignore-case scenario exercised through the new attribute-based API:
// with ignoreCase=true only "Now" survives; IncrementToken() then reports
// stream end.
public virtual void TestIgnoreCase()
{
    System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
    System.String[] stopWords = new System.String[]{"is", "the", "Time"};
    // Args: enablePositionIncrements=false, input, stop words, ignoreCase=true.
    TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true);
    TermAttribute termAtt = (TermAttribute) stream.GetAttribute(typeof(TermAttribute));
    Assert.IsTrue(stream.IncrementToken());
    Assert.AreEqual("Now", termAtt.Term());
    Assert.IsFalse(stream.IncrementToken());
}
/// <summary>
/// Builds the chain step by step: standard tokenization and normalization,
/// lower-casing, ASCII folding of diacritics, then default English stop-word
/// removal (position increments enabled).
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new ASCIIFoldingFilter(chain);
    return new StopFilter(true, chain, StandardAnalyzer.STOP_WORDS_SET);
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>, an optional
/// <see cref="StopFilter"/> and a <see cref="SnowballFilter"/> for the
/// configured stemmer name.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardFilter(new StandardTokenizer(matchVersion, reader));
    chain = new LowerCaseFilter(chain);
    // Stop-word removal is skipped entirely when no stop set was configured.
    if (stopSet != null)
    {
        chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stopSet);
    }
    return new SnowballFilter(chain, name);
}
// Ignore-case stop filtering using a generic set and the ITermAttribute API:
// "is", "The" and "Time" are all removed case-insensitively, leaving "Now".
public virtual void TestIgnoreCase()
{
    System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
    var stopWords = Support.Compatibility.SetFactory.GetSet<string>();
    stopWords.UnionWith(new[] {"is", "the", "Time"});
    // Args: enablePositionIncrements=false, input, stop words, ignoreCase=true.
    TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true);
    ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
    Assert.IsTrue(stream.IncrementToken());
    Assert.AreEqual("Now", termAtt.Term);
    Assert.IsFalse(stream.IncrementToken());
}
// Case-sensitive stop filtering via the ITermAttribute API: "is" and "Time"
// match exactly and are removed; "The" survives because only lower-case "the"
// is in the set.
public virtual void TestStopFilt()
{
    System.IO.StringReader reader = new System.IO.StringReader("Now is The Time");
    System.String[] stopWords = new System.String[]{"is", "the", "Time"};
    var stopSet = StopFilter.MakeStopSet(stopWords);
    TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
    ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
    Assert.IsTrue(stream.IncrementToken());
    Assert.AreEqual("Now", termAtt.Term);
    Assert.IsTrue(stream.IncrementToken());
    Assert.AreEqual("The", termAtt.Term);
    Assert.IsFalse(stream.IncrementToken());
}
/// <summary>
/// English analysis chain: standard tokenization, lower-casing, optional
/// stop-word removal, ASCII folding, then English Snowball stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    // NOTE: StandardFilter is deliberately left out of this chain (it was
    // disabled in the original implementation as well).
    chain = new LowerCaseFilter(chain);
    if (STOP_WORDS != null)
    {
        chain = new StopFilter(false, chain, STOP_WORDS);
    }
    chain = new ASCIIFoldingFilter(chain);
    return new SnowballFilter(chain, "English");
}
/// <summary>
/// Prefix-search chain: standard tokenization/normalization, lower-casing,
/// ASCII folding, English stop-word removal, then front edge n-grams of
/// lengths 1..20.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardFilter(new StandardTokenizer(kLuceneVersion, reader));
    chain = new LowerCaseFilter(chain);
    chain = new ASCIIFoldingFilter(chain);
    chain = new StopFilter(false, chain, StopFilter.MakeStopSet(kEnglishStopWords));
    return new EdgeNGramTokenFilter(
        chain, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.Side.FRONT, 1, 20);
}
// French analysis chain. NOTE the deliberate ordering: FrenchStemFilter runs
// on the original-case tokens and lower-casing happens AFTER stemming (see the
// inline comment below) — do not "fix" the order without verifying against the
// stemmer's expectations.
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(this._luceneVersion, reader);
    result = new StandardFilter(result);
    // Remove French stop words; the set is wrapped unmodifiable. The second
    // CharArraySet argument is presumably ignoreCase=false — TODO confirm.
    result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(this._luceneVersion),
                            result,
                            CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false)));
    // Empty exclusion set: no words are protected from stemming.
    result = new FrenchStemFilter(result, CharArraySet.EMPTY_SET);
    // Convert to lowercase after stemming!
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    return result;
}
// Chinese analysis chain: sentence segmentation, word segmentation, Porter
// stemming, then optional stop-word removal.
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenizer(result, wordSegment);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is no longer needed because SegTokenFilter already
    // converts all English characters to lower case.
    // Stemming is quite aggressive here — this is not a bug, it's a feature :)
    result = new PorterStemFilter(result);
    if (stopWords != null)
    {
        // Args: enablePositionIncrements=true, input, stop set, ignoreCase=false.
        result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
    }
    return result;
}
/// <summary>
/// Builds the analysis chain used by this analyzer: standard tokenization,
/// normalization, lower-casing, default English stop-word removal, then
/// synonym injection via the configured SynonymEngine.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
/// <param name="reader">Source text to tokenize.</param>
/// <returns>The fully assembled token stream.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    //create the tokenizer
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
    //add in filters
    result = new StandardFilter(result); // first normalize the StandardTokenizer
    result = new LowerCaseFilter(result);// makes sure everything is lower case
    result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); // use the default list of Stop Words, provided by the StopAnalyzer class.
    result = new SynonymFilter(result, SynonymEngine); // injects the synonyms.
    //return the built token stream.
    return result;
}
/// <summary>
/// Auto-complete chain: standard tokenization and normalization, lower-casing,
/// English stop-word removal, then front edge n-grams sized by _mingram/_maxgram.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(_version, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    //TODO: do we want to remove stop words from auto complete?
    chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    // Emit grams from the front of each token, min size _mingram, max size _maxgram.
    return new EdgeNGramTokenFilter(chain, Side.FRONT, _mingram, _maxgram);
}
// Spanish analysis chain: standard tokenization, lower-casing, optional
// stop-word removal, ASCII folding, then Snowball stemming.
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (STOP_WORDS != null)
        result = new StopFilter(false, result, STOP_WORDS);
    result = new ASCIIFoldingFilter(result);
    // we are using a distinct version of the Spanish stemmer, called Spanish2
    // Please check if this class can be found in the Snowball library, the relative path
    // should be: Snowball\SF\Snowball\Ext\
    // just in case, I would leave a copy of this class in this project
    // NOTE(review): the comment above mentions "Spanish2" but the code passes
    // "Spanish" — confirm which stemmer is actually intended.
    result = new SnowballFilter(result, "Spanish");
    return result;
}
/// <summary>
/// Builds the analysis chain: standard tokenization, English Snowball stemming,
/// lower-casing, ASCII folding, then removal of a custom stop set merged with
/// Lucene's default English stop words.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
/// <param name="reader">Source text to tokenize.</param>
/// <returns>The fully assembled token stream.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // BUGFIX: the original built an unused StandardTokenizer ("src") and an
    // unused StandardFilter on the same reader — dead code removed. The
    // duplicate "it" entry was also dropped (a HashSet ignores duplicates, so
    // behavior is unchanged).
    HashSet<string> stopWords = new HashSet<string>()
    {
        "said", "have", "the", "more", "from", "who", "he", "than", "it", "were",
        "use", "has", "also", "been", "we", "which", "had", "you", "us", "them",
        "so", "in", "i", "our", "his", "to", "of", "a", "st", "ad", "co", "re",
        "ve", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g"
    };
    // Merge in Lucene's default English stop words.
    foreach (var word in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
    {
        stopWords.Add(word);
    }
    TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
    // NOTE: stemming runs before lower-casing and stop-word removal, matching
    // the original chain order.
    result = new SnowballFilter(result, new EnglishStemmer());
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(true, result, stopWords);
    return result;
}
// Builds a POS-tagging chain, caches it, annotates AFTER the cache, and then
// consumes the stream twice through the API selected by 'api'
// (0 = new, 1 = old, 2 = very old, 3 = mixed old/new consumption).
private void TestCachingCustomToken(int api)
{
    TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
    stream = new PartOfSpeechTaggingFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);
    stream = new CachingTokenFilter(stream); // <- the caching is done before the annotating!
    stream = new PartOfSpeechAnnotatingFilter(stream);
    switch (api)
    {
        case 0:
            ConsumeStreamNewAPI(stream);
            ConsumeStreamNewAPI(stream);
            break;
        case 1:
            ConsumeStreamOldAPI(stream);
            ConsumeStreamOldAPI(stream);
            break;
        case 2:
            ConsumeStreamVeryOldAPI(stream);
            ConsumeStreamVeryOldAPI(stream);
            break;
        case 3:
            ConsumeStreamNewAPI(stream);
            ConsumeStreamOldAPI(stream);
            ConsumeStreamVeryOldAPI(stream);
            ConsumeStreamNewAPI(stream);
            ConsumeStreamVeryOldAPI(stream);
            break;
    }
}
/*
 * Creates a TokenStream which tokenizes all the text in the provided Reader:
 * a RussianLetterTokenizer filtered with LowerCaseFilter, StopFilter and
 * RussianStemFilter.
 */
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    bool keepIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    TokenStream chain = new RussianLetterTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(keepIncrements, chain, stopSet);
    return new RussianStemFilter(chain);
}
/*
 * Creates a TokenStream which tokenizes all the text in the provided Reader:
 * a StandardTokenizer filtered with StandardFilter, LowerCaseFilter and
 * StopFilter.
 */
public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new StandardFilter(new StandardTokenizer(matchVersion, reader));
    chain = new LowerCaseFilter(chain);
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        chain,
        stoptable);
}
// A TokenFilter that overrides none of incrementToken()/next(Token)/next()
// must cause a NotSupportedException when the stream is consumed.
public virtual void TestOverridesAny()
{
    try
    {
        TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        stream = new AnonymousClassTokenFilter(this, stream); // overrides nothing
        stream = new LowerCaseFilter(stream);
        stream = new StopFilter(stream, stopwords);
        // Consuming the stream is what triggers the failure.
        while (stream.IncrementToken())
            ;
        Assert.Fail("One TokenFilter does not override any of the required methods, so it should fail.");
    }
    catch (System.NotSupportedException uoe)
    {
        Assert.IsTrue(uoe.Message.EndsWith("does not implement any of incrementToken(), next(Token), next()."));
    }
}
// Verifies SetOnlyUseNewAPI(true) semantics:
// (1) a filter implementing only the old API throws NotSupportedException,
// (2) core streams work and expose concrete AttributeImpl instances,
// (3) calling the old Next()/Next(Token) API on a new-API-only stream throws,
// (4) with the flag off, core attributes are backed by TokenWrapper.
public virtual void TestOnlyNewAPI()
{
    TokenStream.SetOnlyUseNewAPI(true);
    try
    {
        // this should fail with UOE
        try
        {
            TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
            stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(stream, stopwords);
            while (stream.IncrementToken())
                ;
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue((typeof(PartOfSpeechTaggingFilter).FullName + " does not implement incrementToken() which is needed for onlyUseNewAPI.").Equals(uoe.Message));
        }
        // this should pass, as all core token streams support the new API
        TokenStream stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        stream2 = new LowerCaseFilter(stream2);
        stream2 = new StopFilter(stream2, stopwords);
        while (stream2.IncrementToken())
            ;
        // Test, if all attributes are implemented by their implementation, not Token/TokenWrapper
        Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TermAttributeImpl, "TermAttribute is implemented by TermAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is OffsetAttributeImpl, "OffsetAttribute is implemented by OffsetAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is FlagsAttributeImpl, "FlagsAttribute is implemented by FlagsAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is PayloadAttributeImpl, "PayloadAttribute is implemented by PayloadAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is PositionIncrementAttributeImpl, "PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TypeAttributeImpl, "TypeAttribute is implemented by TypeAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
        // try to call old API, this should fail
        try
        {
            stream2.Reset();
            Token reusableToken = new Token();
            while ((reusableToken = stream2.Next(reusableToken)) != null)
                ;
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
        }
        try
        {
            stream2.Reset();
            while (stream2.Next() != null)
                ;
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
        }
        // Test if the wrapper API (onlyUseNewAPI==false) uses TokenWrapper
        // as attribute instance.
        // TokenWrapper encapsulates a Token instance that can be exchanged
        // by another Token instance without changing the AttributeImpl instance
        // itsself.
        TokenStream.SetOnlyUseNewAPI(false);
        stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TokenWrapper, "TermAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is TokenWrapper, "OffsetAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is TokenWrapper, "FlagsAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is TokenWrapper, "PayloadAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is TokenWrapper, "PositionIncrementAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TokenWrapper, "TypeAttribute is implemented by TokenWrapper");
        // This one is not implemented by TokenWrapper:
        Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
    }
    finally
    {
        // Always restore the global flag so other tests are unaffected.
        TokenStream.SetOnlyUseNewAPI(false);
    }
}
/// <summary>Builds an analyzer with the stop words read from the given reader.</summary>
/// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)">
/// </seealso>
/// <param name="matchVersion">See <a href="#Version">above</a>; controls the
/// position-increment default.</param>
/// <param name="stopwords">Reader to load stop words from</param>
public StopAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
{
    enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    stopWords = WordlistLoader.GetWordSet(stopwords);
}
/// <summary>Builds an analyzer with the stop words from the given collection.</summary>
public StopAnalyzer(Version matchVersion, ICollection<string> stopWords)
{
    useDefaultStopPositionIncrement = false;
    enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    this.stopWords = stopWords;
}
/// <summary>Builds an analyzer with the stop words from the given hashtable.</summary>
public StopAnalyzer(Version matchVersion, System.Collections.Hashtable stopWords)
{
    useDefaultStopPositionIncrement = false;
    enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    this.stopWords = stopWords;
}
/// <summary>Builds an analyzer which removes the given words, with explicit
/// control over position-increment behavior.</summary>
public StopAnalyzer(System.String[] stopWords, bool enablePositionIncrements)
{
    useDefaultStopPositionIncrement = false;
    this.enablePositionIncrements = enablePositionIncrements;
    this.stopWords = StopFilter.MakeStopSet(stopWords);
}
/// <summary> Builds an analyzer with the stop words loaded from the given file.
/// </summary>
/// <seealso cref="WordlistLoader.getWordSet(File)">
/// </seealso>
/// <param name="matchVersion">See <a href="#version">above</a>; controls the
/// position-increment default.</param>
/// <param name="stopwordsFile">File to load stop words from</param>
public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
{
    useDefaultStopPositionIncrement = false;
    this.enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    stopWords = WordlistLoader.GetWordSet(stopwordsFile);
}
/// <summary> Builds an analyzer which removes words in ENGLISH_STOP_WORDS.
/// The position-increment default is derived from the given Lucene version.</summary>
public StopAnalyzer(Version matchVersion)
{
    stopWords = ENGLISH_STOP_WORDS_SET;
    enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
}
// Builds a 20-word document ("zero one two ...") whose every i%3!=0 word is a
// stop word, then checks position-increment behavior with increments on, off,
// and with two concatenated StopFilters splitting the stop set in half.
public virtual void TestStopPositons()
{
    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    System.Collections.Generic.List<string> a = new System.Collections.Generic.List<string>();
    for (int i = 0; i < 20; i++)
    {
        System.String w = English.IntToEnglish(i).Trim();
        sb.Append(w).Append(" ");
        // Every word whose index is not a multiple of 3 becomes a stop word.
        if (i % 3 != 0)
        {
            a.Add(w);
        }
    }
    Log(sb.ToString());
    System.String[] stopWords = (System.String[]) a.ToArray();
    for (int i = 0; i < a.Count; i++)
    {
        Log("Stop: " + stopWords[i]);
    }
    var stopSet = StopFilter.MakeStopSet(stopWords);
    // with increments
    System.IO.StringReader reader = new System.IO.StringReader(sb.ToString());
    StopFilter stpf = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
    DoTestStopPositons(stpf, true);
    // without increments
    reader = new System.IO.StringReader(sb.ToString());
    stpf = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
    DoTestStopPositons(stpf, false);
    // with increments, concatenating two stop filters
    System.Collections.Generic.List<System.String> a0 = new System.Collections.Generic.List<System.String>();
    System.Collections.Generic.List<System.String> a1 = new System.Collections.Generic.List<System.String>();
    // Alternate the stop words between the two half-sets.
    for (int i = 0; i < a.Count; i++)
    {
        if (i % 2 == 0)
        {
            a0.Add(a[i]);
        }
        else
        {
            a1.Add(a[i]);
        }
    }
    System.String[] stopWords0 = (System.String[]) a0.ToArray();
    for (int i = 0; i < a0.Count; i++)
    {
        Log("Stop0: " + stopWords0[i]);
    }
    System.String[] stopWords1 = (System.String[]) a1.ToArray();
    for (int i = 0; i < a1.Count; i++)
    {
        Log("Stop1: " + stopWords1[i]);
    }
    var stopSet0 = StopFilter.MakeStopSet(stopWords0);
    var stopSet1 = StopFilter.MakeStopSet(stopWords1);
    reader = new System.IO.StringReader(sb.ToString());
    StopFilter stpf0 = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet0); // first part of the set
    stpf0.EnablePositionIncrements = true;
    StopFilter stpf01 = new StopFilter(false, stpf0, stopSet1); // two stop filters concatenated!
    DoTestStopPositons(stpf01, true);
}
/// <summary>Builds an analyzer which removes words in ENGLISH_STOP_WORDS,
/// converting the default word array into a stop set. </summary>
public StopAnalyzer()
{
    stopWords = StopFilter.MakeStopSet(ENGLISH_STOP_WORDS);
}
/// <summary>Builds an analyzer which removes words in the provided array,
/// converting it into a stop set first. </summary>
public StopAnalyzer(System.String[] stopWords)
{
    this.stopWords = StopFilter.MakeStopSet(stopWords);
}
// Builds a POS-tagging chain, tees it into a SinkTokenizer, annotates both the
// main stream and the sink branch, then consumes both through the API
// selected by 'api' (0 = new, 1 = old, 2 = very old).
private void TestTeeSinkCustomToken(int api)
{
    TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
    stream = new PartOfSpeechTaggingFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);
    SinkTokenizer sink = new SinkTokenizer();
    // The sink branch gets its own annotating filter.
    TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink);
    stream = new TeeTokenFilter(stream, sink);
    stream = new PartOfSpeechAnnotatingFilter(stream);
    switch (api)
    {
        case 0:
            ConsumeStreamNewAPI(stream);
            ConsumeStreamNewAPI(stream1);
            break;
        case 1:
            ConsumeStreamOldAPI(stream);
            ConsumeStreamOldAPI(stream1);
            break;
        case 2:
            ConsumeStreamVeryOldAPI(stream);
            ConsumeStreamVeryOldAPI(stream1);
            break;
    }
}
// Verifies SetOnlyUseNewAPI(true) semantics:
// (1) a filter implementing only the old API throws NotSupportedException,
// (2) core streams work and expose concrete AttributeImpl instances,
// (3) calling the old Next()/Next(Token) API on a new-API-only stream throws,
// (4) with the flag off, core attributes are backed by TokenWrapper.
public virtual void TestOnlyNewAPI()
{
    TokenStream.SetOnlyUseNewAPI(true);
    try
    {
        // this should fail with UOE
        try
        {
            TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
            stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(stream, stopwords);
            while (stream.IncrementToken())
            {
                ;
            }
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue((typeof(PartOfSpeechTaggingFilter).FullName + " does not implement incrementToken() which is needed for onlyUseNewAPI.").Equals(uoe.Message));
        }
        // this should pass, as all core token streams support the new API
        TokenStream stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        stream2 = new LowerCaseFilter(stream2);
        stream2 = new StopFilter(stream2, stopwords);
        while (stream2.IncrementToken())
        {
            ;
        }
        // Test, if all attributes are implemented by their implementation, not Token/TokenWrapper
        Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TermAttributeImpl, "TermAttribute is implemented by TermAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is OffsetAttributeImpl, "OffsetAttribute is implemented by OffsetAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is FlagsAttributeImpl, "FlagsAttribute is implemented by FlagsAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is PayloadAttributeImpl, "PayloadAttribute is implemented by PayloadAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is PositionIncrementAttributeImpl, "PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TypeAttributeImpl, "TypeAttribute is implemented by TypeAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
        // try to call old API, this should fail
        try
        {
            stream2.Reset();
            Token reusableToken = new Token();
            while ((reusableToken = stream2.Next(reusableToken)) != null)
            {
                ;
            }
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
        }
        try
        {
            stream2.Reset();
            while (stream2.Next() != null)
            {
                ;
            }
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
        }
        // Test if the wrapper API (onlyUseNewAPI==false) uses TokenWrapper
        // as attribute instance.
        // TokenWrapper encapsulates a Token instance that can be exchanged
        // by another Token instance without changing the AttributeImpl instance
        // itsself.
        TokenStream.SetOnlyUseNewAPI(false);
        stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TokenWrapper, "TermAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is TokenWrapper, "OffsetAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is TokenWrapper, "FlagsAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is TokenWrapper, "PayloadAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is TokenWrapper, "PositionIncrementAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TokenWrapper, "TypeAttribute is implemented by TokenWrapper");
        // This one is not implemented by TokenWrapper:
        Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
    }
    finally
    {
        // Always restore the global flag so other tests are unaffected.
        TokenStream.SetOnlyUseNewAPI(false);
    }
}
/// <summary>Builds an analyzer with the stop words from the given set; the
/// position-increment default is derived from the given Lucene version.</summary>
public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
{
    enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    this.stopWords = stopWords;
}