public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);
    var shingleMatrix = new ShingleMatrixFilter(tokenizer, 2, 8, ' ');
    var lowerCaseFilter = new LowerCaseFilter(shingleMatrix);
    return new StopFilter(true, lowerCaseFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // This should be a good tokenizer for most European-language documents:
    // splits words at punctuation characters, removing punctuation;
    // splits words at hyphens, unless there's a number in the token;
    // recognizes email addresses and internet hostnames as one token.
    var input = new StandardTokenizer(Version.LUCENE_30, reader);

    // A ShingleMatrixFilter constructs shingles from a token stream.
    // "2010 Audi RS5 Quattro Coupe" => "2010 Audi", "Audi RS5", "RS5 Quattro", "Quattro Coupe"
    var shingleMatrixOutput = new ShingleMatrixFilter(
        input, // stream from which to construct the matrix
        2,     // minimum number of tokens in any shingle
        8,     // maximum number of tokens in any shingle
        ' ');  // character to use between texts of the token parts in a shingle

    // Normalizes token text to lower case.
    var lowerCaseFilter = new LowerCaseFilter(shingleMatrixOutput);

    // Removes stop words from the token stream.
    return new StopFilter(true, lowerCaseFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
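// A minimal consumption sketch for the analyzer above (assumes the Lucene.Net 3.0.3
// attribute API; "ShingleAnalyzer" is a hypothetical name for the hosting class):
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

class ShingleAnalyzerDemo
{
    static void Main()
    {
        Analyzer analyzer = new ShingleAnalyzer(); // hypothetical hosting class
        TokenStream stream = analyzer.TokenStream("body",
            new StringReader("2010 Audi RS5 Quattro Coupe"));
        ITermAttribute term = stream.AddAttribute<ITermAttribute>();
        while (stream.IncrementToken())
        {
            Console.WriteLine(term.Term); // "2010 audi", "audi rs5", "rs5 quattro", ...
        }
    }
}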
/*
public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
{
    MMSegTokenizer mmsegTokenizer = (MMSegTokenizer)base.PreviousTokenStream;
    if (mmsegTokenizer == null)
    {
        mmsegTokenizer = new MMSegTokenizer(NewSeg, reader);
        base.PreviousTokenStream = mmsegTokenizer;
    }
    else
    {
        mmsegTokenizer.Reset(reader);
    }
    return mmsegTokenizer;
}
*/

public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    Lucene.Net.Analysis.TokenStream result = new MMSegTokenizer(NewSeg, reader);
    result.Reset();
    result = new LowerCaseFilter(result);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = whitespaceAnalyzer.TokenStream(fieldName, reader);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stopWords);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new RuSnowballFilter(result);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new LowerCaseFilter(new StandardTokenizer(_version, reader));
    return result;
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new PersianTokenizer(reader);
    result = new LowerCaseFilter(result);
    result = new PersianNormalizationFilter(result);
    result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), result, _stoptable);
    result = new PersianLemmatizationFilter(result);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var seg = new JiebaSegmenter();
    TokenStream result = new JiebaTokenizer(seg, reader);
    // This filter is necessary, because the parser converts the queries to lower case.
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, StopWords);
    return result;
}
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream sink = new StandardFilter(source);
    sink = new LowerCaseFilter(sink);
    //sink = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), sink, stopSet);
    sink = new CroatianStemFilter(sink, stemmer);
    return sink;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Split the title on IdSeparators, then run it through the inner analyzer.
    string title = reader.ReadToEnd();
    string partiallyTokenized = String.Join(" ",
        title.Split(PackageIndexEntity.IdSeparators, StringSplitOptions.RemoveEmptyEntries));
    TokenStream result = whitespaceAnalyzer.TokenStream(fieldName, new StringReader(partiallyTokenized));
    result = new LowerCaseFilter(result);
    return result;
}
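// For intuition on the pre-split step above: assuming PackageIndexEntity.IdSeparators
// contains '.' and '-' (an assumption; the actual set lives on that type), a package
// id becomes whitespace-delimited words before the inner whitespace analyzer runs:
using System;

class IdSplitDemo
{
    static void Main()
    {
        char[] idSeparators = { '.', '-' }; // assumed contents of PackageIndexEntity.IdSeparators
        string title = "Microsoft.AspNet.Mvc";
        string partiallyTokenized = string.Join(" ",
            title.Split(idSeparators, StringSplitOptions.RemoveEmptyEntries));
        Console.WriteLine(partiallyTokenized); // "Microsoft AspNet Mvc"
    }
}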
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var tokenizer = new StandardTokenizer(Version.LUCENE_29, reader);
    tokenizer.MaxTokenLength = 255;
    TokenStream filter = new StandardFilter(tokenizer);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(false, filter, StandardAnalyzer.STOP_WORDS_SET);
    return new NGramTokenFilter(filter, 2, 6);
}
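// Why this chain enables substring matching: a plain-C# sketch (not the filter
// itself) of the grams NGramTokenFilter(filter, 2, 6) derives from one token,
// grouped by gram length in the order the filter emits them:
using System;
using System.Collections.Generic;

static class NGramSketch
{
    public static IEnumerable<string> NGrams(string token, int min, int max)
    {
        for (int n = min; n <= Math.Min(max, token.Length); n++)
            for (int i = 0; i + n <= token.Length; i++)
                yield return token.Substring(i, n);
    }
    // NGrams("search", 2, 6) => "se", "ea", "ar", "rc", "ch", "sea", ..., "search"
}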
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    var seg = new JiebaSegmenter();
    TokenStream result = new JiebaTokenizer(seg, reader);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, StopWords);
    return result;
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>, a <see cref="StopFilter"/>
/// and a <see cref="SpanishStemFilter"/>.</summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(Version.LUCENE_24, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stopTable);
    result = new SpanishStemFilter(result);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(matchVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(this.enableStopPositionIncrements, result, stoptable);
    result = new BulgarianStemFilter(result);
    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Need to account for the | breaks in relatedcontent.
    var tokenizedInput = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(_version, reader)));
    //return new ShingleFilter(tokenizedInput, 4);
    var output = new ShingleFilter(tokenizedInput, 4);
    //output.SetOutputUnigrams(false);
    return output;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream t = new LetterOrDigitTokenizer(reader);
    t = new LowerCaseFilter(t);
    t = new ASCIIFoldingFilter(t);
    t = new SingleCharTokenizer(t);
    return t;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);
    tokenizer.MaxTokenLength = 255;
    TokenStream filter = new StandardFilter(tokenizer);
    filter = new LowerCaseFilter(filter);
    filter = new NGramTokenFilter(filter, 2, 255);
    return filter;
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(matchVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
    result = new SnowballFilter(result, name);
    return result;
}
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(kLuceneVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
    result = new EdgeNGramTokenFilter(result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.Side.FRONT, 1, 20);
    return result;
}
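// The trailing EdgeNGramTokenFilter is what makes this chain suit autocomplete:
// each term is indexed under every leading prefix of length 1 to 20, so partial
// user input still matches. A plain-C# sketch of that expansion (not the filter itself):
using System;
using System.Collections.Generic;

static class EdgeNGramSketch
{
    public static IEnumerable<string> FrontEdgeNGrams(string term, int min = 1, int max = 20)
    {
        for (int n = min; n <= Math.Min(max, term.Length); n++)
            yield return term.Substring(0, n); // "paris" => "p", "pa", "par", "pari", "paris"
    }
}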
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(this._luceneVersion, reader);
    result = new StandardFilter(result);
    result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(this._luceneVersion),
        result,
        CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false)));
    result = new FrenchStemFilter(result, CharArraySet.EMPTY_SET);
    // Convert to lowercase after stemming!
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    return result;
}
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (STOP_WORDS != null)
        result = new StopFilter(false, result, STOP_WORDS);
    result = new ASCIIFoldingFilter(result);
    result = new SnowballFilter(result, "English");
    return result;
}
/// <summary>
/// Builds the analysis chain: a StandardTokenizer normalized by a StandardFilter,
/// lower-cased, filtered against the default English stop-word set, and finally
/// run through a SynonymFilter that injects synonyms from the SynonymEngine.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed.</param>
/// <param name="reader">Reader over the field's text.</param>
/// <returns>The assembled token stream.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    // Create the tokenizer.
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
    // Add in the filters.
    result = new StandardFilter(result); // first normalize the StandardTokenizer
    result = new LowerCaseFilter(result); // makes sure everything is lower case
    result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); // use the default list of stop words, provided by the StopAnalyzer class
    result = new SynonymFilter(result, SynonymEngine); // injects the synonyms
    // Return the built token stream.
    return result;
}
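// SynonymFilter and SynonymEngine are not Lucene.Net core types; this chain follows
// the familiar custom-synonym-analyzer pattern. A minimal, hypothetical sketch of the
// engine contract such a filter typically consumes (names are assumptions, not the
// project's actual code):
using System.Collections.Generic;
using System.Linq;

public interface ISynonymEngine
{
    IEnumerable<string> GetSynonyms(string word);
}

// Hard-coded for illustration; real engines often load synonyms from WordNet,
// a database, or configuration.
public class SimpleSynonymEngine : ISynonymEngine
{
    private static readonly Dictionary<string, string[]> Map =
        new Dictionary<string, string[]>
        {
            { "quick", new[] { "fast", "speedy" } },
            { "jumps", new[] { "leaps", "hops" } }
        };

    public IEnumerable<string> GetSynonyms(string word)
    {
        string[] synonyms;
        return Map.TryGetValue(word, out synonyms)
            ? synonyms
            : Enumerable.Empty<string>();
    }
}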
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (STOP_WORDS != null)
        result = new StopFilter(false, result, STOP_WORDS);
    result = new ASCIIFoldingFilter(result);
    // A distinct version of the Spanish stemmer, called "Spanish2", is also available.
    // It should be found in the Snowball library under Snowball\SF\Snowball\Ext\;
    // a copy of that class is kept in this project just in case.
    result = new SnowballFilter(result, "Spanish");
    return result;
}
public virtual void TestOverridesAny()
{
    try
    {
        TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        stream = new AnonymousClassTokenFilter(this, stream);
        stream = new LowerCaseFilter(stream);
        stream = new StopFilter(stream, stopwords);
        while (stream.IncrementToken()) { }
        Assert.Fail("One TokenFilter does not override any of the required methods, so it should fail.");
    }
    catch (System.NotSupportedException uoe)
    {
        Assert.IsTrue(uoe.Message.EndsWith("does not implement any of incrementToken(), next(Token), next()."));
    }
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Extend the default English stop list with extra high-frequency words,
    // fragments, and single digits.
    //var stoplist = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    HashSet<string> newWords = new HashSet<string>()
    {
        "said", "have", "the", "more", "from", "who", "he", "than", "it",
        "were", "use", "has", "also", "been", "we", "which", "had", "you",
        "us", "them", "so", "in", "i", "our", "his", "to", "of", "a",
        "st", "ad", "co", "re", "ve",
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g"
    };
    foreach (var word in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
    {
        newWords.Add(word);
    }

    TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
    result = new SnowballFilter(result, new EnglishStemmer());
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(true, result, newWords);
    return result;
}
private void TestCachingCustomToken(int api)
{
    TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
    stream = new PartOfSpeechTaggingFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);
    stream = new CachingTokenFilter(stream); // <- the caching is done before the annotating!
    stream = new PartOfSpeechAnnotatingFilter(stream);

    switch (api)
    {
        case 0:
            ConsumeStreamNewAPI(stream);
            ConsumeStreamNewAPI(stream);
            break;
        case 1:
            ConsumeStreamOldAPI(stream);
            ConsumeStreamOldAPI(stream);
            break;
        case 2:
            ConsumeStreamVeryOldAPI(stream);
            ConsumeStreamVeryOldAPI(stream);
            break;
        case 3:
            ConsumeStreamNewAPI(stream);
            ConsumeStreamOldAPI(stream);
            ConsumeStreamVeryOldAPI(stream);
            ConsumeStreamNewAPI(stream);
            ConsumeStreamVeryOldAPI(stream);
            break;
    }
}
public virtual void TestOnlyNewAPI()
{
    TokenStream.SetOnlyUseNewAPI(true);
    try
    {
        // This chain should fail with a NotSupportedException:
        try
        {
            TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
            stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(stream, stopwords);
            while (stream.IncrementToken()) { }
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue((typeof(PartOfSpeechTaggingFilter).FullName + " does not implement incrementToken() which is needed for onlyUseNewAPI.").Equals(uoe.Message));
        }

        // This should pass, as all core token streams support the new API.
        TokenStream stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        stream2 = new LowerCaseFilter(stream2);
        stream2 = new StopFilter(stream2, stopwords);
        while (stream2.IncrementToken()) { }

        // Test that all attributes are implemented by their implementation class, not Token/TokenWrapper.
        Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TermAttributeImpl, "TermAttribute is implemented by TermAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is OffsetAttributeImpl, "OffsetAttribute is implemented by OffsetAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is FlagsAttributeImpl, "FlagsAttribute is implemented by FlagsAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is PayloadAttributeImpl, "PayloadAttribute is implemented by PayloadAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is PositionIncrementAttributeImpl, "PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TypeAttributeImpl, "TypeAttribute is implemented by TypeAttributeImpl");
        Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");

        // Trying to call the old API should fail:
        try
        {
            stream2.Reset();
            Token reusableToken = new Token();
            while ((reusableToken = stream2.Next(reusableToken)) != null) { }
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
        }

        try
        {
            stream2.Reset();
            while (stream2.Next() != null) { }
            Assert.Fail("If only the new API is allowed, this should fail with an UOE");
        }
        catch (System.NotSupportedException uoe)
        {
            Assert.IsTrue("This TokenStream only supports the new Attributes API.".Equals(uoe.Message));
        }

        // Test that the wrapper API (onlyUseNewAPI == false) uses TokenWrapper
        // as the attribute instance. TokenWrapper encapsulates a Token instance
        // that can be exchanged for another Token instance without changing the
        // AttributeImpl instance itself.
        TokenStream.SetOnlyUseNewAPI(false);
        stream2 = new WhitespaceTokenizer(new System.IO.StringReader(doc));
        Assert.IsTrue(stream2.AddAttribute(typeof(TermAttribute)) is TokenWrapper, "TermAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(OffsetAttribute)) is TokenWrapper, "OffsetAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)) is TokenWrapper, "FlagsAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(PayloadAttribute)) is TokenWrapper, "PayloadAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(PositionIncrementAttribute)) is TokenWrapper, "PositionIncrementAttribute is implemented by TokenWrapper");
        Assert.IsTrue(stream2.AddAttribute(typeof(TypeAttribute)) is TokenWrapper, "TypeAttribute is implemented by TokenWrapper");
        // This one is not implemented by TokenWrapper:
        Assert.IsTrue(stream2.AddAttribute(typeof(SenselessAttribute)) is SenselessAttributeImpl, "SenselessAttribute is not implemented by SenselessAttributeImpl");
    }
    finally
    {
        TokenStream.SetOnlyUseNewAPI(false);
    }
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new PanGuTokenizer(reader);
    result = new LowerCaseFilter(result);
    return result;
}
public virtual void TestMultipleSources()
{
    SinkTokenizer theDetector = new AnonymousClassSinkTokenizer1(this, null);
    SinkTokenizer dogDetector = new AnonymousClassSinkTokenizer2(this, null);
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())), theDetector), dogDetector);

    int i = 0;
    Token reusableToken = new Token();
    for (Token nextToken = source1.Next(reusableToken); nextToken != null; nextToken = source1.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 2, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 2);
    Assert.IsTrue(dogDetector.GetTokens().Count == 1, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 1);

    i = 0;
    for (Token nextToken = source2.Next(reusableToken); nextToken != null; nextToken = source2.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens2[i]), nextToken.Term() + " is not equal to " + tokens2[i]);
        i++;
    }
    Assert.IsTrue(i == tokens2.Length, i + " does not equal: " + tokens2.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 4, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 4);
    Assert.IsTrue(dogDetector.GetTokens().Count == 2, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 2);

    i = 0;
    for (Token nextToken = theDetector.Next(reusableToken); nextToken != null; nextToken = theDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("The".ToUpper()), nextToken.Term() + " is not equal to " + "The");
        i++;
    }
    Assert.IsTrue(i == theDetector.GetTokens().Count, i + " does not equal: " + theDetector.GetTokens().Count);

    i = 0;
    for (Token nextToken = dogDetector.Next(reusableToken); nextToken != null; nextToken = dogDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("Dogs".ToUpper()), nextToken.Term() + " is not equal to " + "Dogs");
        i++;
    }
    Assert.IsTrue(i == dogDetector.GetTokens().Count, i + " does not equal: " + dogDetector.GetTokens().Count);

    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    for (Token nextToken = lowerCasing.Next(reusableToken); nextToken != null; nextToken = lowerCasing.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i].ToLower()), nextToken.Term() + " is not equal to " + tokens1[i].ToLower());
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
}
/*
 * Creates a {@link TokenStream} which tokenizes all the text in the
 * provided {@link Reader}.
 *
 * @return A {@link TokenStream} built from a
 *         {@link RussianLetterTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         and {@link RussianStemFilter}
 */
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result = new RussianLetterTokenizer(reader);
    result = new LowerCaseFilter(result);
    result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
    result = new RussianStemFilter(result);
    return result;
}
/*
 * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
 */
public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(matchVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
    return result;
}