/// <summary>
/// Builds the analysis chain for path-like fields: a <c>PathTokenizer</c>
/// followed by standard normalization, lower-casing and English stop-word removal.
/// </summary>
protected override TokenStreamComponents createComponents(string field, java.io.Reader reader)
{
    var source = new PathTokenizer(reader);
    TokenStream chain = new StandardFilter(source);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, StandardAnalyzer.STOP_WORDS_SET);
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Builds the Persian analysis chain: standard tokenization, lower-casing,
/// Persian normalization, stop-word removal and Persian stemming.
/// </summary>
public override TokenStream TokenStream(string fieldname, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(_version, reader);
    chain = new LowerCaseFilter(chain);
    chain = new PersianNormalizationFilter(chain);
    chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), chain, _stoptable);
    return new PersianStemFilter(chain);
}
/// <summary>
/// Builds the English analysis chain: standard tokenization and normalization,
/// lower-casing, English stop-word removal and stemming.
/// </summary>
/// <remarks>
/// NOTE(review): this chain stems twice — <see cref="PorterStemFilter"/>
/// followed by a <see cref="SnowballFilter"/> with an <c>EnglishStemmer</c>.
/// Both are kept to preserve existing index compatibility, but one of the two
/// is almost certainly redundant; confirm which stemmer is intended.
/// </remarks>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // The redundant "= null" placeholder local was removed; the components are
    // constructed and returned directly.
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    stream = new PorterStemFilter(stream);
    stream = new SnowballFilter(stream, new EnglishStemmer());
    return new TokenStreamComponents(tokenizer, stream);
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when a
/// stem exclusion set is provided) and <see cref="StempelFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    // Protect excluded terms from the stemmer, if any were configured.
    if (stemExclusionSet.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new StempelFilter(chain, new StempelStemmer(stemTable));
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the analysis chain for title fields: whitespace tokenization,
/// standard normalization, possessive stripping, lower-casing, Porter
/// stemming and symbol-based splitting. Stop words are deliberately NOT
/// removed so that every title term is indexed.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    //TokenStream result = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new WhitespaceTokenizer(@reader);
    chain = new StandardFilter(chain);
    chain = new EnglishPossessiveFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Title search must index everything, so no stop-word filtering here.
    //result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopTable);
    chain = new PorterStemFilter(chain);
    // Split tokens on symbols so words created by cutting symbols are handled.
    chain = new SymbolsFilter(chain);
    return chain;
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text in
/// the provided <c>Reader</c>.
/// </summary>
/// <returns>
/// Components built from a <c>StandardTokenizer</c> filtered with
/// <c>StandardFilter</c>, <c>LowerCaseFilter</c>, <c>StopFilter</c>,
/// <c>SetKeywordMarkerFilter</c> (only when a stem exclusion set is provided)
/// and <c>IndonesianStemFilter</c>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (!stemExclusionSet.Empty)
    {
        // Keep the excluded terms out of the stemmer.
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(chain));
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when a stem exclusion set is
/// provided) and <see cref="SoraniStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new SoraniNormalizationFilter(chain);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SoraniStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the
/// text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// Components built from a <see cref="StandardTokenizer"/> (or the legacy
/// <c>ArabicLetterTokenizer</c> before Lucene 3.1) filtered with
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="ArabicNormalizationFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when a stem exclusion set is provided) and <c>ArabicStemFilter</c>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // 3.1+ uses the Unicode-aware StandardTokenizer; older versions keep the
    // legacy letter tokenizer for index compatibility.
    Tokenizer tokenizer;
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
    }
    else
    {
        tokenizer = new ArabicLetterTokenizer(matchVersion, reader);
    }
    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
    // The order here is important: the stopword list is not normalized, so
    // stop words must be removed before normalization runs.
    chain = new StopFilter(matchVersion, chain, stopwords);
    // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
    chain = new ArabicNormalizationFilter(chain);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    return new TokenStreamComponents(tokenizer, new ArabicStemFilter(chain));
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed.</param>
/// <param name="reader">Source of the text to analyze.</param>
/// <returns>A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when a stem exclusion set is
/// provided) and <see cref="MorfologikFilter"/> on the Ukrainian dictionary.</returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new LowerCaseFilter(m_matchVersion, tokenizer);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new MorfologikFilter(chain, GetDictionary());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds an English analysis chain: standard tokenization, lower-casing,
/// optional stop-word removal, ASCII folding and English Snowball stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Stop-word removal is optional; skip the filter when none are configured.
    if (STOP_WORDS != null)
    {
        chain = new StopFilter(false, chain, STOP_WORDS);
    }
    chain = new ASCIIFoldingFilter(chain);
    return new SnowballFilter(chain, "English");
}
/// <summary>
/// Builds a Russian spell-checking chain: letter tokenization with custom
/// spell/stem attributes, lower-casing, bad-word and stop-word removal,
/// stem-based suggestion generation and a final similarity filter.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // The tokenizer needs the custom spell/stem attributes registered up front.
    var attributes = new AttributeSource();
    attributes.AddAttributeImpl(new SpellAttribute());
    attributes.AddAttributeImpl(new StemAttribute());
    var letterTokenizer = new RussianLetterTokenizer(attributes, reader);
    var lowercased = new LowerCaseFilter(letterTokenizer);
    var withoutBadWords = new BadWordsFilter(lowercased);
    var withoutStopWords = new StopFilter(false, withoutBadWords, StopWords);
    var stemmed = new StemFilter(withoutStopWords, SpellChecker, NumberOfSuggestions);
    return new SimilarFilter(stemmed);
}
/// <summary>
/// Builds an n-gram analysis chain: standard tokenization and normalization,
/// lower-casing, English stop-word removal, then 1..30-character n-grams.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed.</param>
/// <param name="reader">Source of the text to analyze.</param>
/// <returns>The assembled token stream.</returns>
public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    chain = new NGramTokenFilter(chain, 1, 30);
    //chain = new StopFilter(true, chain, stoptable);
    //chain = new PorterStemFilter(chain);
    //chain = new GermanStemFilter(chain, excltable);
    return chain;
}
/// <summary>
/// Verifies that sinks created by one TeeSinkTokenFilter can later be attached
/// to a second tee, so the sinks accumulate matching tokens from BOTH sources,
/// and that the cached first source can still be reset and re-consumed through
/// a LowerCaseFilter.
/// </summary>
public virtual void testMultipleSources()
{
    // First tee over buffer1; the two sinks select "dog"/"the" tokens.
    TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
    tee1.reset();
    // Caching allows source1 to be replayed after its first full consumption.
    TokenStream source1 = new CachingTokenFilter(tee1);
    tee1.addAttribute(typeof(CheckClearAttributesAttribute));
    dogDetector.addAttribute(typeof(CheckClearAttributesAttribute));
    theDetector.addAttribute(typeof(CheckClearAttributesAttribute));
    // Second tee over buffer2 re-uses the SAME sinks, so they see both buffers.
    TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
    tee2.addSinkTokenStream(dogDetector);
    tee2.addSinkTokenStream(theDetector);
    TokenStream source2 = tee2;
    assertTokenStreamContents(source1, tokens1);
    assertTokenStreamContents(source2, tokens2);
    // The sinks hold matches from both sources, in encounter order.
    assertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
    assertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });
    // source1 is cached, so resetting lets it feed a downstream filter again.
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
    string[] lowerCaseTokens = new string[tokens1.Length];
    for (int i = 0; i < tokens1.Length; i++)
    {
        lowerCaseTokens[i] = tokens1[i].ToLower(Locale.ROOT);
    }
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text in
/// the provided <c>Reader</c>.
/// </summary>
/// <returns>
/// Components built from a <c>StandardTokenizer</c> (or the legacy
/// <c>ArabicLetterTokenizer</c> before Lucene 3.1) filtered with
/// <c>LowerCaseFilter</c>, <c>StopFilter</c>, <c>ArabicNormalizationFilter</c>,
/// <c>SetKeywordMarkerFilter</c> (only when a stem exclusion set is provided)
/// and <c>ArabicStemFilter</c>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer;
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
    }
    else
    {
        tokenizer = new ArabicLetterTokenizer(matchVersion, reader);
    }
    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
    // The order here is important: the stopword list is not normalized, so
    // stop words must be stripped before normalization runs.
    chain = new StopFilter(matchVersion, chain, stopwords);
    // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
    chain = new ArabicNormalizationFilter(chain);
    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    return new TokenStreamComponents(tokenizer, new ArabicStemFilter(chain));
}
/// <summary>
/// Creates a TokenStream which tokenizes all the text in the provided Reader.
/// </summary>
/// <returns>
/// A TokenStream built from an ArabicLetterTokenizer filtered with
/// LowerCaseFilter, ArabicNormalizationFilter, PersianNormalizationFilter
/// and Persian stop words.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new ArabicLetterTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    chain = new ArabicNormalizationFilter(chain);
    // Additional Persian-specific normalization on top of the Arabic one.
    chain = new PersianNormalizationFilter(chain);
    // The order here is important: the stopword list is normalized with the
    // filters above, so stop-word removal must come last.
    chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stoptable);
    return chain;
}
/// <summary>
/// Builds an n-gram chain: standard tokenization (splits at punctuation and
/// most hyphens, keeps email addresses and hostnames whole), standard
/// normalization, lower-casing, English stop-word removal, then n-grams of
/// the configured size range.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Splits words at punctuation characters, removing punctuation.
    // Splits words at hyphens, unless there's a number in the token...
    // Recognizes email addresses and internet hostnames as one token.
    var source = new StandardTokenizer(_version, reader);
    TokenStream chain = new StandardFilter(source);
    // Normalizes token text to lower case.
    chain = new LowerCaseFilter(chain);
    // Removes stop words from the token stream.
    chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new NGramTokenFilter(chain, _minGram, _maxGram);
}
/// <summary>
/// Test that LowercaseFilter handles the lowercasing correctly if the term
/// buffer has a trailing surrogate character leftover and the current term in
/// the buffer ends with a corresponding leading surrogate.
/// </summary>
public virtual void testLowerCaseFilterLowSurrogateLeftover()
{
    // test if the limit of the termbuffer is correctly used with supplementary
    // chars
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
    LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
    // An unpaired LOW surrogate at the end must pass through unchanged.
    assertTokenStreamContents(filter, new string[] {"bogustermbogusterm\udc16"});
    filter.reset();
    // Re-use the same filter on a term ending with an unpaired HIGH surrogate;
    // the leftover from the previous term must not corrupt this result.
    string highSurEndingUpper = "BogustermBoguster\ud801";
    string highSurEndingLower = "bogustermboguster\ud801";
    tokenizer.Reader = new StringReader(highSurEndingUpper);
    assertTokenStreamContents(filter, new string[] {highSurEndingLower});
    assertTrue(filter.hasAttribute(typeof(CharTermAttribute)));
    char[] termBuffer = filter.getAttribute(typeof(CharTermAttribute)).buffer();
    int length = highSurEndingLower.Length;
    // The trailing high surrogate must still sit at the end of the term buffer.
    assertEquals('\ud801', termBuffer[length - 1]);
}
/// <summary>
/// Builds the CJK analysis chain. From Lucene 3.6 on this is a
/// StandardTokenizer followed by width normalization, lower-casing,
/// bigramming and stop-word removal; older versions fall back to the
/// legacy CJKTokenizer.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        // Legacy path: the old CJK tokenizer handles bigramming itself.
        Tokenizer legacy = new CJKTokenizer(reader);
        return new TokenStreamComponents(legacy, new StopFilter(matchVersion, legacy, stopwords));
    }
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // Run the width filter first, before bigramming: it sometimes combines characters.
    TokenStream chain = new CJKWidthFilter(tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new CJKBigramFilter(chain);
    return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, chain, stopwords));
}
/// <summary>
/// Creates the TokenStreamComponents used to tokenize all the text in the
/// provided Reader.
/// </summary>
/// <returns>
/// Components built from a StandardTokenizer filtered with StandardFilter,
/// LowerCaseFilter, StopFilter and CzechStemFilter (the stemmer only when the
/// version is >= LUCENE_31; a SetKeywordMarkerFilter is inserted before it
/// when a stem exclusion set was supplied).
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        if (this.stemExclusionTable.Any())
        {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
        }
        chain = new CzechStemFilter(chain);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Initializes the GitHub index: connects to the GitHub API with the given
/// key, wires per-field analyzers (HTML-stripping default, keyword-based
/// "owner", word-delimited "name"), a multi-field query parser, the index
/// writer over <paramref name="indexDirectory"/> and a searcher manager.
/// </summary>
public GitHubIndex(Directory indexDirectory, string githubApiKey)
{
    github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
    {
        Credentials = new Credentials(githubApiKey)
    };
    analyzer = new PerFieldAnalyzerWrapper(
        // Example of a pre-built custom analyzer
        defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),
        // Example of inline anonymous analyzers
        fieldAnalyzers: new Dictionary <string, Analyzer>
        {
            // Field analyzer for owner: whole value as one ASCII-folded,
            // lower-cased token.
            {
                "owner",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return(new TokenStreamComponents(source, result));
                })
            },
            // Field analyzer for name: split on word delimiters (keeping
            // possessives), then ASCII-fold and lower-case.
            {
                "name",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                    TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source, ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                    result = new ASCIIFoldingFilter(result);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return(new TokenStreamComponents(source, result));
                })
            }
        });
    queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
        new[] { "name", "description", "readme" },
        analyzer);
    indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
    // true: pre-warm new searchers; no custom SearcherFactory.
    searcherManager = new SearcherManager(indexWriter, true, null);
}
/// <summary>
/// Builds per-field components: numeric / not-analyzed fields get a keyword
/// tokenizer with lower-casing only; all other fields go through the MTG
/// tokenizer, lower-casing and alphabet replacement.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    bool keywordField = _adapter.IsNumericField(fieldName) || _adapter.IsNotAnalyzed(fieldName);
    if (keywordField)
    {
        // Keep the whole value as a single lower-cased token.
        var keywordTokenizer = new KeywordTokenizer(reader);
        var lowered = new LowerCaseFilter(LuceneVersion.LUCENE_48, keywordTokenizer);
        return new TokenStreamComponents(keywordTokenizer, lowered);
    }
    var mtgTokenizer = new MtgTokenizer(reader);
    var lowerCased = new LowerCaseFilter(LuceneVersion.LUCENE_48, mtgTokenizer);
    var replaced = new ReplaceFilter(lowerCased, MtgAplhabet.Replacements);
    return new TokenStreamComponents(mtgTokenizer, replaced);
}
/// <summary>
/// Builds the synonym-injecting analysis chain: standard tokenization and
/// normalization, lower-casing, English stop-word removal, then synonym
/// injection via the configured engine.
/// </summary>
public override TokenStream TokenStream (string fieldName, System.IO.TextReader reader)
{
    // Create the tokenizer and normalize its output first.
    TokenStream chain = new StandardTokenizer(reader);
    chain = new StandardFilter(chain);
    // Make sure everything is lower case.
    chain = new LowerCaseFilter(chain);
    // Use the default stop-word list provided by the StopAnalyzer class.
    chain = new StopFilter(chain, StopAnalyzer.ENGLISH_STOP_WORDS);
    // Inject the synonyms last so they apply to normalized tokens.
    return new SynonymFilter(chain, SynonymEngine);
}
/// <summary>
/// Creates the components which tokenize all the text in the provided Reader.
/// </summary>
/// <returns>
/// Components built from a StandardTokenizer filtered with StandardFilter,
/// ElisionFilter (3.6+), LowerCaseFilter, StopFilter, SetKeywordMarkerFilter
/// (only when a stem exclusion set is provided) and a Catalan SnowballFilter.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new CatalanStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>Constructs a <c>StandardTokenizer</c> filtered by a
/// <c>StandardFilter</c>, a <c>LowerCaseFilter</c> and a <c>StopFilter</c>.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    StandardTokenizer tokenizer = new StandardTokenizer(reader, replaceInvalidAcronym);
    tokenizer.SetMaxTokenLength(maxTokenLength);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    // Pick the stop-filter overload matching the configured position-increment mode.
    if (useDefaultStopPositionIncrements)
    {
        return new StopFilter(chain, stopSet);
    }
    return new StopFilter(enableStopPositionIncrements, chain, stopSet);
}
/// <summary>Constructs a <c>StandardTokenizer</c> (or <c>TagsTokenizer</c> in
/// tags mode) filtered by a <c>StandardFilter</c>, a <c>LowerCaseFilter</c>
/// and a <c>StopFilter</c>.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    // Tags mode uses a dedicated tokenizer; everything else is standard.
    TokenStream chain = tagsMode
        ? (TokenStream)new TagsTokenizer(reader)
        : new StandardTokenizer(reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new StopFilter(chain, stopSet);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/>
/// built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// and <see cref="CzechStemFilter"/> (only if version is >= LUCENE_31). If
/// a version is >= LUCENE_31 and a stem exclusion set is provided via
/// <see cref="CzechAnalyzer(LuceneVersion, CharArraySet, CharArraySet)"/> a
/// <see cref="SetKeywordMarkerFilter"/> is added before
/// <see cref="CzechStemFilter"/>. </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    // LUCENE_31 is obsolete; the pragma silences the deprecation warning only.
#pragma warning disable 612, 618
    bool stemmingEnabled = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (stemmingEnabled)
    {
        if (this.stemExclusionTable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
        }
        chain = new CzechStemFilter(chain);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
public MySearch(string indexPath) { //_analyzer = new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION); _analyzer = new MultiFieldAnalyzerWrapper( defaultAnalyzer: new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION, true), new[] { ( new[] { "genre", "year" }, Analyzer.NewAnonymous(createComponents: (fieldName, reader) => { var source = new KeywordTokenizer(reader); TokenStream result = new ASCIIFoldingFilter(source); result = new LowerCaseFilter(MATCH_LUCENE_VERSION, result); return(new TokenStreamComponents(source, result)); }) ) });
/// <summary>
/// Creates the components which tokenize all the text in the provided Reader.
/// </summary>
/// <returns>
/// Components built from a StandardTokenizer filtered with StandardFilter,
/// EnglishPossessiveFilter (3.1+), LowerCaseFilter, StopFilter,
/// SetKeywordMarkerFilter (only when a stem exclusion set is provided) and
/// PorterStemFilter.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    // Prior to 3.1 we get the classic behavior: StandardFilter handles possessives.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        chain = new EnglishPossessiveFilter(matchVersion, chain);
    }
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a
/// <see cref="StandardFilter" />, a <see cref="LowerCaseFilter" />, an optional
/// <see cref="StopFilter" />, the best-bets word-forms stemmer and a
/// duplicate-token remover.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    if (stopSet != null)
    {
        chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stopSet);
    }
    // Now, our stemming filter goes here.
    chain = new BestBetsWordFormsFilter(chain);
    // Remove duplicate keywords - duplicates are bad for best bets/term count matching.
    return new RemoveDuplicatesTokenFilter(chain);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/>
/// built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided, and <see cref="FrenchLightStemFilter"/> </returns>
///
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        // Modern (3.1+) chain: elision and lower-casing run BEFORE stemming.
        Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
        TokenStream result = new StandardFilter(m_matchVersion, source);
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(m_matchVersion, result);
        result = new StopFilter(m_matchVersion, result, m_stopwords);
        if (excltable.Count > 0)
        {
            // Protect excluded terms from being stemmed.
            result = new SetKeywordMarkerFilter(result, excltable);
        }
#pragma warning disable 612, 618
        if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
        {
            result = new FrenchLightStemFilter(result);
        }
        else
        {
            // 3.1..3.5 keeps the Snowball French stemmer.
            result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
        }
        return (new TokenStreamComponents(source, result));
    }
    else
    {
        // Legacy (pre-3.1) chain: the deprecated FrenchStemFilter runs on
        // original-case tokens.
        Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
        TokenStream result = new StandardFilter(m_matchVersion, source);
        result = new StopFilter(m_matchVersion, result, m_stopwords);
        if (excltable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
#pragma warning disable 612, 618
        result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
        // Convert to lowercase after stemming!
        return (new TokenStreamComponents(source, new LowerCaseFilter(m_matchVersion, result)));
    }
}
// The ordering of these filters is important!
/// <summary>
/// Builds the analysis chain. The MilitaryIDNumber field gets whitespace
/// tokenization, lower-casing, ASCII folding, alphanumeric filtering and a
/// custom stop-word list; every other field gets alphanumeric tokenization,
/// lower-casing and ASCII folding.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    if (!string.Equals("MilitaryIDNumber", fieldName))
    {
        TokenStream chain = new AlphaNumericTokenizer(reader);
        chain = new LowerCaseFilter(chain);
        return new ASCIIFoldingFilter(chain);
    }
    TokenStream idChain = new WhitespaceTokenizer(reader);
    idChain = new LowerCaseFilter(idChain);
    idChain = new ASCIIFoldingFilter(idChain);
    idChain = new AlphaNumericFilter(idChain); // behaves weirdly when used on Name field
    // During indexing we will encounter some of the following extraneous text we don't care about.
    string[] stopWords = new string[] { "", "formerly", "or", "former", "pir", "tbc", "id", "pnc" };
    return new StopFilter(false, idChain, new CharArraySet(stopWords, true), true);
}
/// <summary>
/// Builds a minimal chain for multi-value list fields: a custom multi-value
/// tokenizer followed by lower-casing only. Normalization, stop-word and
/// synonym filters are intentionally disabled (kept below for reference).
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new ListMultiValueCharTokenizer(reader);
    // first normalize the tokens (disabled)
    //chain = new StandardFilter(chain);
    // makes sure everything is lower case
    chain = new LowerCaseFilter(chain);
    // stop words (disabled)
    //chain = new StopFilter(chain, new string[]{"קבוצת"});
    // inject the synonyms (disabled)
    //chain = new SynonymFilter(chain, SynonymEngine);
    return chain;
}
/// <summary>
/// Builds a Spanish analysis chain: standard tokenization, lower-casing,
/// optional stop-word removal, ASCII folding and Spanish Snowball stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    if (STOP_WORDS != null)
    {
        chain = new StopFilter(false, chain, STOP_WORDS);
    }
    chain = new ASCIIFoldingFilter(chain);
    // We are using a distinct version of the Spanish stemmer, called Spanish2.
    // Please check if this class can be found in the Snowball library; the
    // relative path should be: Snowball\SF\Snowball\Ext\
    // Just in case, a copy of this class is kept in this project.
    return new SnowballFilter(chain, "Spanish");
}
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    // Hebrew-aware tokenization driven by the configured prefix tree.
    TokenStream stream = new HebrewTokenizer(reader, PrefixTree);

    // Normalize niqqud before further filtering.
    stream = new NiqqudFilter(stream);

    // TODO: should we ignoreCase in StopFilter?
    stream = new StopFilter(enableStopPositionIncrements, stream, STOP_WORDS_SET);

    // TODO: Apply LowerCaseFilter to NonHebrew tokens only
    stream = new LowerCaseFilter(stream);

    // Optional per-token-type suffixing, only when a mapping is configured.
    if (suffixByTokenType != null && suffixByTokenType.Count > 0)
    {
        stream = new AddSuffixFilter(stream, suffixByTokenType);
    }

    return stream;
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> which tokenize all the text in
/// the provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when a stem exclusion set is
/// provided) and <see cref="SoraniStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new SoraniNormalizationFilter(chain);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    // Protect exclusion-set terms from being stemmed.
    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SoraniStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (!matchVersion.onOrAfter(Version.LUCENE_36))
    {
        // Legacy path: the old CJKTokenizer performs its own bigramming.
        Tokenizer legacy = new CJKTokenizer(reader);
        return new TokenStreamComponents(legacy, new StopFilter(matchVersion, legacy, stopwords));
    }

    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // run the widthfilter first before bigramming, it sometimes combines characters.
    TokenStream chain = new CJKWidthFilter(tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new CJKBigramFilter(chain);
    return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, chain, stopwords));
}
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Standard tokenization, lower-casing, then Hunspell dictionary stemming.
    TokenStream chain = new StandardTokenizer(LuceneVersion.LUCENE_29, reader);
    chain = new LowerCaseFilter(chain);
    return new HunspellStemFilter(chain, _dictionary);
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // PanGu segmentation followed by lower-casing.
    TokenStream stream = new PanGuTokenizer(reader, _OriginalResult, _options, _parameters);
    return new LowerCaseFilter(stream);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in
/// the provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>,
/// <see cref="StopFilter"/> and <see cref="BrazilianStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    // NOTE: in this analyzer lower-casing runs before StandardFilter.
    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
    chain = new StandardFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    // Keep exclusion-table terms out of the stemmer.
    if (excltable != null && !excltable.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(chain));
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in
/// the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>, and a
/// version-dependent stemming stage: <see cref="GermanNormalizationFilter"/> plus
/// <see cref="GermanLightStemFilter"/> (3.6+), <see cref="SnowballFilter"/> with
/// <see cref="German2Stemmer"/> (3.1+), or <see cref="GermanStemFilter"/> otherwise.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    chain = new SetKeywordMarkerFilter(chain, exclusionSet);

    // Choose the stemming stage by compatibility version; the pragmas silence
    // obsolete-member warnings for the old version constants.
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        chain = new GermanNormalizationFilter(chain);
        chain = new GermanLightStemFilter(chain);
    }
    else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        chain = new SnowballFilter(chain, new German2Stemmer());
    }
    else
    {
        chain = new GermanStemFilter(chain);
    }

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
/// provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// A <see cref="TokenStream"/> built from an <see cref="ArabicLetterTokenizer"/>
/// filtered with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
/// <see cref="PersianNormalizationFilter"/> and Persian stop words.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream stream = new ArabicLetterTokenizer(reader);
    stream = new LowerCaseFilter(stream);
    stream = new ArabicNormalizationFilter(stream);
    // Additional Persian-specific normalization.
    stream = new PersianNormalizationFilter(stream);
    // The order here is important: the stop-word list was normalized with the
    // filters above, so stop-word removal must run last.
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        stream,
        stoptable);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> which tokenize all the text in
/// the provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="ElisionFilter"/> (3.6+),
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (when a stem exclusion set is provided)
/// and <see cref="SnowballFilter"/> with a <see cref="CatalanStemmer"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    // Elision stripping (l', d', ...) is only applied from 3.6 on.
    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }

    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    // Protect exclusion-set terms from being stemmed.
    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new CatalanStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in
/// the provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (when a stem exclusion set is provided)
/// and a version-dependent French stemmer.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (!matchVersion.onOrAfter(Version.LUCENE_31))
    {
        // Pre-3.1 chain: no elision filter, stop-word removal and stemming run
        // on the original case, and lower-casing happens only AFTER stemming.
        Tokenizer legacySource = new StandardTokenizer(matchVersion, reader);
        TokenStream legacy = new StandardFilter(matchVersion, legacySource);
        legacy = new StopFilter(matchVersion, legacy, stopwords);
        if (!excltable.Empty)
        {
            legacy = new SetKeywordMarkerFilter(legacy, excltable);
        }
        legacy = new FrenchStemFilter(legacy);
        // Convert to lowercase after stemming!
        return new TokenStreamComponents(legacySource, new LowerCaseFilter(matchVersion, legacy));
    }

    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);
    chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    if (!excltable.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    // 3.6+ uses the light stemmer; 3.1–3.5 uses the Snowball French stemmer.
    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        chain = new FrenchLightStemFilter(chain);
    }
    else
    {
        chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.FrenchStemmer());
    }

    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a <see cref="StandardFilter"/>,
/// a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // Apply the analyzer-configured maximum token length.
    tokenizer.MaxTokenLength = maxTokenLength;

    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    return new StopFilter(enableStopPositionIncrements, chain, stopSet);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/> used to tokenize all the text in
/// the provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// For 4.8 and later: a <see cref="ThaiTokenizer"/> filtered with
/// <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>. For earlier
/// versions: a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/> (3.1+),
/// <see cref="ThaiWordFilter"/> and <see cref="StopFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (matchVersion.onOrAfter(Version.LUCENE_48))
    {
        // Modern path: dedicated Thai tokenizer.
        Tokenizer thai = new ThaiTokenizer(reader);
        TokenStream chain = new LowerCaseFilter(matchVersion, thai);
        chain = new StopFilter(matchVersion, chain, stopwords);
        return new TokenStreamComponents(thai, chain);
    }

    // Legacy path: standard tokenization, then ThaiWordFilter segmentation;
    // lower-casing was only introduced into this chain in 3.1.
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream legacy = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        legacy = new LowerCaseFilter(matchVersion, legacy);
    }
    legacy = new ThaiWordFilter(matchVersion, legacy);
    return new TokenStreamComponents(source, new StopFilter(matchVersion, legacy, stopwords));
}