/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// For match versions on or after <c>LUCENE_31</c>: a <see cref="StandardTokenizer"/> wrapped in
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty) and a
/// <see cref="SnowballFilter"/> using the Russian stemmer. For earlier versions the tokenizer is a
/// <see cref="RussianLetterTokenizer"/> and <see cref="StandardFilter"/> is left out; the rest of
/// the chain is the same.
/// </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer;
    TokenStream chain;

    // Only the head of the chain depends on the match version.
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        tokenizer = new StandardTokenizer(m_matchVersion, reader);
        chain = new StandardFilter(m_matchVersion, tokenizer);
    }
    else
    {
        tokenizer = new RussianLetterTokenizer(m_matchVersion, reader);
        chain = tokenizer;
    }
#pragma warning restore 612, 618

    // Shared tail: lower-case, drop stop words, optionally mark keywords, stem.
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds a chain made of a <see cref="MockTokenizer"/> in <c>WHITESPACE</c> mode, a
/// <see cref="SetKeywordMarkerFilter"/> configured with <c>exclusionSet</c>, and a
/// <see cref="GermanStemFilter"/> on top.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The tokenizer together with the stem filter that terminates the chain.</returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
    TokenStream stemmed = new GermanStemFilter(marked);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds a chain made of a <see cref="MockTokenizer"/> in <c>WHITESPACE</c> mode, a
/// <see cref="SetKeywordMarkerFilter"/> configured with <c>exclusionSet</c>, and a
/// <see cref="NorwegianMinimalStemFilter"/> on top.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The tokenizer together with the stem filter that terminates the chain.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
    TokenStream stemmed = new NorwegianMinimalStemFilter(marked);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied reader.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// For match versions on or after <c>LUCENE_31</c>: a <see cref="StandardTokenizer"/> wrapped in
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty) and a
/// <see cref="SnowballFilter"/> using the Russian stemmer. For earlier versions the tokenizer is a
/// <see cref="RussianLetterTokenizer"/> and <see cref="StandardFilter"/> is left out.
/// </returns>
// NOTE(review): this member still carries Java-style identifiers from automated conversion
// (createComponents, Reader, onOrAfter, Empty, org.tartarus...) - confirm they resolve in this project.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer;
    TokenStream chain;

    // Only the head of the chain depends on the match version.
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
        chain = new StandardFilter(matchVersion, tokenizer);
    }
    else
    {
        tokenizer = new RussianLetterTokenizer(matchVersion, reader);
        chain = tokenizer;
    }

    // Shared tail: lower-case, drop stop words, optionally mark keywords, stem.
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.RussianStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="aReader">Source of the text to tokenize.</param>
/// <returns>
/// For match versions on or after <c>LUCENE_31</c>: a <see cref="StandardTokenizer"/> wrapped in
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when <c>excltable</c> is non-empty),
/// <see cref="StemmerOverrideFilter"/> (only when <c>stemdict</c> is not null) and a
/// <see cref="SnowballFilter"/> using the Dutch stemmer. For earlier versions:
/// <see cref="StandardFilter"/>, <see cref="StopFilter"/>, the optional
/// <see cref="SetKeywordMarkerFilter"/> and <see cref="DutchStemFilter"/> with <c>origStemdict</c>;
/// no lower-casing is applied on that path.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
{
    bool isModern = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

    // Both paths start with the same tokenizer and StandardFilter.
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    if (!isModern)
    {
        // Legacy path: no LowerCaseFilter, and the dedicated Dutch stem filter.
        chain = new StopFilter(matchVersion, chain, stoptable);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }

        chain = new DutchStemFilter(chain, origStemdict);
        return new TokenStreamComponents(tokenizer, chain);
    }

    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stoptable);

    if (excltable.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    if (stemdict != null)
    {
        chain = new StemmerOverrideFilter(chain, stemdict);
    }

    chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.DutchStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="ElisionFilter"/> (match version <c>LUCENE_32</c> or later),
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set is non-empty) and finally <see cref="ItalianLightStemFilter"/>
/// for match version <c>LUCENE_36</c> or later, otherwise a <see cref="SnowballFilter"/> using
/// the Italian stemmer.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

#pragma warning disable 612, 618
    bool useElision = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_32);
#pragma warning restore 612, 618
    if (useElision)
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }

    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

#pragma warning disable 612, 618
    bool useLightStemmer = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618
    if (!useLightStemmer)
    {
        chain = new SnowballFilter(chain, new ItalianStemmer());
    }
    else
    {
        chain = new ItalianLightStemFilter(chain);
    }

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> (match version <c>LUCENE_31</c> or later) or an
/// <see cref="ArabicLetterTokenizer"/> (earlier versions), wrapped in <see cref="LowerCaseFilter"/>,
/// <see cref="ArabicNormalizationFilter"/>, <see cref="PersianNormalizationFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion
/// set is non-empty) and <see cref="PersianStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer;
#pragma warning disable 612, 618
    if (!m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        tokenizer = new ArabicLetterTokenizer(m_matchVersion, reader);
    }
    else
    {
        tokenizer = new StandardTokenizer(m_matchVersion, reader);
    }
#pragma warning restore 612, 618

    TokenStream chain = new LowerCaseFilter(m_matchVersion, tokenizer);
    chain = new ArabicNormalizationFilter(chain);

    // Additional Persian-specific normalization.
    chain = new PersianNormalizationFilter(chain);

    // Order matters: the stop word list is normalized with the filters above,
    // so stop word removal has to come after them.
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new PersianStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="Analyzer.TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> (match version <c>LUCENE_36</c> or later) or an
/// <see cref="IndicTokenizer"/> (earlier versions), wrapped in <see cref="LowerCaseFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty),
/// <see cref="IndicNormalizationFilter"/>, <see cref="HindiNormalizationFilter"/>,
/// <see cref="StopFilter"/> and <see cref="HindiStemFilter"/>.
/// </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer;
#pragma warning disable 612, 618
    if (!m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        tokenizer = new IndicTokenizer(m_matchVersion, reader);
    }
    else
    {
        tokenizer = new StandardTokenizer(m_matchVersion, reader);
    }
#pragma warning restore 612, 618

    TokenStream chain = new LowerCaseFilter(m_matchVersion, tokenizer);

    // Keywords are marked directly after lower-casing, ahead of the normalization filters.
    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new IndicNormalizationFilter(chain);
    chain = new HindiNormalizationFilter(chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    chain = new HindiStemFilter(chain);

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/> and a
/// <see cref="SetKeywordMarkerFilter"/> built from <c>exclusionSet</c> (always added),
/// followed by a version-dependent stemming stage: <see cref="GermanNormalizationFilter"/> plus
/// <see cref="GermanLightStemFilter"/> for <c>LUCENE_36</c> or later, a <see cref="SnowballFilter"/>
/// with <see cref="German2Stemmer"/> for <c>LUCENE_31</c> or later, and
/// <see cref="GermanStemFilter"/> otherwise.
/// </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    chain = new SetKeywordMarkerFilter(chain, exclusionSet);

    // Pick the stemming stage that matches the requested compatibility version.
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        chain = new GermanLightStemFilter(new GermanNormalizationFilter(chain));
    }
    else if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        chain = new SnowballFilter(chain, new German2Stemmer());
    }
    else
    {
        chain = new GermanStemFilter(chain);
    }
#pragma warning restore 612, 618

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied reader.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> (match version <c>LUCENE_36</c> or later) or an
/// <see cref="IndicTokenizer"/> (earlier versions), wrapped in <see cref="LowerCaseFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty),
/// <see cref="IndicNormalizationFilter"/>, <see cref="HindiNormalizationFilter"/>,
/// <see cref="StopFilter"/> and <see cref="HindiStemFilter"/>.
/// </returns>
// NOTE(review): this member still carries Java-style identifiers from automated conversion
// (createComponents, Reader, onOrAfter, Empty) - confirm they resolve in this project.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer;
    if (!matchVersion.onOrAfter(Version.LUCENE_36))
    {
        tokenizer = new IndicTokenizer(matchVersion, reader);
    }
    else
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
    }

    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);

    // Keywords are marked directly after lower-casing, ahead of the normalization filters.
    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new IndicNormalizationFilter(chain);
    chain = new HindiNormalizationFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    chain = new HindiStemFilter(chain);

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Puts "コーヒー" into a keyword set, runs it through a whitespace <see cref="MockTokenizer"/>,
/// <see cref="SetKeywordMarkerFilter"/> and <see cref="JapaneseKatakanaStemFilter"/>, and checks
/// via <c>CheckOneTerm</c> that the expected output term is the same string.
/// </summary>
public void TestKeyword()
{
    CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("コーヒー"), false);

    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
        TokenStream stemmed = new JapaneseKatakanaStemFilter(marked);
        return new TokenStreamComponents(tokenizer, stemmed);
    });

    CheckOneTerm(a, "コーヒー", "コーヒー");
}
/// <summary>
/// Puts "quilométricas" into a keyword set, runs it through a whitespace <see cref="MockTokenizer"/>,
/// <see cref="SetKeywordMarkerFilter"/> and <see cref="PortugueseMinimalStemFilter"/>, and checks
/// via <c>CheckOneTerm</c> that the expected output term is the same string.
/// </summary>
public virtual void TestKeyword()
{
    CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("quilométricas"), false);

    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
        TokenStream stemmed = new PortugueseMinimalStemFilter(marked);
        return new TokenStreamComponents(tokenizer, stemmed);
    });

    CheckOneTerm(a, "quilométricas", "quilométricas");
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set is non-empty) and <see cref="IndonesianStemFilter"/>.
/// </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new IndonesianStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="LowerCaseFilter"/>,
/// <see cref="StandardFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when <c>excltable</c> is non-null and non-empty) and <see cref="BrazilianStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

    // Lower-casing runs before StandardFilter in this chain.
    TokenStream chain = new LowerCaseFilter(m_matchVersion, tokenizer);
    chain = new StandardFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    bool hasExclusions = excltable != null && excltable.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    TokenStream stemmed = new BrazilianStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds a chain made of a <see cref="StandardTokenizer"/>, a <see cref="SetKeywordMarkerFilter"/>
/// whose keyword set contains the single entry "liście", and a <see cref="MorfologikFilter"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The tokenizer together with the Morfologik filter that terminates the chain.</returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Initial capacity 1, ignoreCase = false.
    CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, false);

    // The .NET API is PascalCase: the Java-style lowercase "add" is not a member of CharArraySet.
    keywords.Add("liście");

    Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream result = new SetKeywordMarkerFilter(src, keywords);
    result = new MorfologikFilter(result);
    return new TokenStreamComponents(src, result);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set is non-empty) and a <see cref="SnowballFilter"/> using
/// <see cref="HungarianStemmer"/>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new HungarianStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Puts "あり" into a keyword set, builds an analyzer from a <see cref="JapaneseTokenizer"/>,
/// <see cref="SetKeywordMarkerFilter"/> and <see cref="JapaneseBaseFormFilter"/>, and checks via
/// <c>AssertAnalyzesTo</c> that the sample sentence yields the listed tokens, including "あり".
/// </summary>
public void TestKeyword()
{
    CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("あり"), false);

    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
        TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
        TokenStream baseForms = new JapaneseBaseFormFilter(marked);
        return new TokenStreamComponents(tokenizer, baseForms);
    });

    string[] expectedTokens = new string[]
    {
        "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます"
    };

    AssertAnalyzesTo(a, "それはまだ実験段階にあります", expectedTokens);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> (match version <c>LUCENE_31</c> or later) or an
/// <see cref="ArabicLetterTokenizer"/> (earlier versions), wrapped in <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="ArabicNormalizationFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty) and
/// <see cref="ArabicStemFilter"/>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer;
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
    }
    else
    {
        tokenizer = new ArabicLetterTokenizer(matchVersion, reader);
    }

    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);

    // Order matters: the stop word list is not normalized, so stop words are
    // removed before the normalization filter runs.
    chain = new StopFilter(matchVersion, chain, stopwords);

    // TODO: maybe ArabicNormalizationFilter should also be KeywordAttribute aware?
    chain = new ArabicNormalizationFilter(chain);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new ArabicStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied reader.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set is non-empty) and <see cref="IndonesianStemFilter"/>.
/// </returns>
// NOTE(review): this member still carries Java-style identifiers from automated conversion
// (createComponents, Reader, Empty) - confirm they resolve in this project.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new IndonesianStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set has any entries) and a <see cref="StempelFilter"/> driven by
/// a <see cref="StempelStemmer"/> created from <c>stemTable</c>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    bool hasExclusions = stemExclusionSet.Any();
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    StempelStemmer stemmer = new StempelStemmer(stemTable);
    chain = new StempelFilter(chain, stemmer);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied reader.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> (match version <c>LUCENE_31</c> or later) or an
/// <see cref="ArabicLetterTokenizer"/> (earlier versions), wrapped in <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="ArabicNormalizationFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty) and
/// <see cref="ArabicStemFilter"/>.
/// </returns>
// NOTE(review): this member still carries Java-style identifiers from automated conversion
// (createComponents, Reader, onOrAfter, Empty) - confirm they resolve in this project.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer;
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
    }
    else
    {
        tokenizer = new ArabicLetterTokenizer(matchVersion, reader);
    }

    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);

    // Order matters: the stop word list is not normalized, so stop words are
    // removed before the normalization filter runs.
    chain = new StopFilter(matchVersion, chain, stopwords);

    // TODO: maybe ArabicNormalizationFilter should also be KeywordAttribute aware?
    chain = new ArabicNormalizationFilter(chain);

    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new ArabicStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="ApostropheFilter"/> (match version <c>LUCENE_48</c> or later),
/// <see cref="TurkishLowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is non-empty) and a
/// <see cref="SnowballFilter"/> using <see cref="TurkishStemmer"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

    bool useApostropheFilter = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
    if (useApostropheFilter)
    {
        chain = new ApostropheFilter(chain);
    }

    chain = new TurkishLowerCaseFilter(chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new TurkishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>. For match version <c>LUCENE_31</c>
/// or later the chain additionally ends with <see cref="CzechStemFilter"/>, preceded by a
/// <see cref="SetKeywordMarkerFilter"/> when the stem exclusion table has any entries.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    // Versions before 3.1 get no keyword marking and no stemming.
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return new TokenStreamComponents(tokenizer, chain);
    }

    if (this.stemExclusionTable.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
    }

    chain = new CzechStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="ElisionFilter"/> (match version <c>LUCENE_36</c> or later),
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set is non-empty) and a <see cref="SnowballFilter"/> using
/// <see cref="CatalanStemmer"/>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    bool useElision = matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
    if (useElision)
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }

    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new CatalanStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>. For match version <c>LUCENE_31</c>
/// or later the chain additionally ends with <see cref="CzechStemFilter"/>, preceded by a
/// <see cref="SetKeywordMarkerFilter"/> when the stem exclusion table is non-empty.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

#pragma warning disable 612, 618
    bool stemmingSupported = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618

    // Versions before 3.1 get no keyword marking and no stemming.
    if (!stemmingSupported)
    {
        return new TokenStreamComponents(tokenizer, chain);
    }

    if (this.stemExclusionTable.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
    }

    chain = new CzechStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// For match version <c>LUCENE_31</c> or later: a <see cref="StandardTokenizer"/> wrapped in
/// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>, <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when <c>excltable</c> is
/// non-empty) and then <see cref="FrenchLightStemFilter"/> (<c>LUCENE_36</c> or later) or a
/// <see cref="SnowballFilter"/> using the French stemmer. For earlier versions:
/// <see cref="StandardFilter"/>, <see cref="StopFilter"/>, the optional
/// <see cref="SetKeywordMarkerFilter"/>, <see cref="FrenchStemFilter"/> and finally
/// <see cref="LowerCaseFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    bool isModern = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618

    // Both paths start with the same tokenizer and StandardFilter.
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

    if (!isModern)
    {
        chain = new StopFilter(m_matchVersion, chain, m_stopwords);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }

#pragma warning disable 612, 618
        chain = new FrenchStemFilter(chain);
#pragma warning restore 612, 618

        // Legacy path converts to lowercase after stemming.
        TokenStream lowered = new LowerCaseFilter(m_matchVersion, chain);
        return new TokenStreamComponents(tokenizer, lowered);
    }

    chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    if (excltable.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

#pragma warning disable 612, 618
    bool useLightStemmer = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618
    if (useLightStemmer)
    {
        chain = new FrenchLightStemFilter(chain);
    }
    else
    {
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.FrenchStemmer());
    }

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Wraps <paramref name="input"/> in a <see cref="SnowballFilter"/> driven by a freshly
/// instantiated <c>stemClass</c>, optionally preceded by a <see cref="SetKeywordMarkerFilter"/>.
/// </summary>
/// <param name="input">The token stream to be stemmed.</param>
/// <returns>
/// A <see cref="SnowballFilter"/> over <paramref name="input"/>; when <c>protectedWords</c> is
/// not null, the input is first wrapped in a <see cref="SetKeywordMarkerFilter"/> built from it.
/// </returns>
/// <exception cref="Exception">
/// Thrown when <c>stemClass</c> cannot be instantiated or cast to <see cref="SnowballProgram"/>;
/// the original failure is attached as the inner exception.
/// </exception>
public override TokenStream Create(TokenStream input)
{
    SnowballProgram program;
    try
    {
        program = (SnowballProgram)Activator.CreateInstance(stemClass);
    }
    catch (Exception e)
    {
        // The message previously ran the language name straight into "from class"
        // (e.g. "...language Englishfrom class ..."); a separating space is added here.
        throw new Exception("Error instantiating stemmer for language " + language + " from class " + stemClass, e);
    }

    if (protectedWords != null)
    {
        input = new SetKeywordMarkerFilter(input, protectedWords);
    }

    return new SnowballFilter(input, program);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text read from
/// the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <see cref="StandardTokenizer"/> wrapped in <see cref="StandardFilter"/>,
/// <see cref="EnglishPossessiveFilter"/> (match version <c>LUCENE_31</c> or later),
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// (only when the stem exclusion set has any entries) and <see cref="PorterStemFilter"/>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    // Prior to 3.1 we get the classic behavior: StandardFilter handles possessives for us.
    bool needsPossessiveFilter = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    if (needsPossessiveFilter)
    {
        chain = new EnglishPossessiveFilter(matchVersion, chain);
    }

    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    bool hasExclusions = stemExclusionSet.Any();
    if (hasExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="Reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text handed to the <see cref="StandardTokenizer"/>.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> whose source is a <see cref="StandardTokenizer"/>
/// wrapped in a <see cref="StandardFilter"/>. For match versions on or after 3.1 the chain
/// continues with <see cref="ElisionFilter"/>, <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the exclusion
/// table is non-empty) and then <see cref="FrenchLightStemFilter"/> (3.6 and later) or a
/// <see cref="SnowballFilter"/> using the French stemmer. For older match versions the chain
/// is <see cref="StopFilter"/>, the optional <see cref="SetKeywordMarkerFilter"/>,
/// <see cref="FrenchStemFilter"/> and finally <see cref="LowerCaseFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    // Both version paths start from the same tokenizer + StandardFilter pair.
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    if (!matchVersion.onOrAfter(Version.LUCENE_31))
    {
        chain = new StopFilter(matchVersion, chain, stopwords);
        if (!excltable.Empty)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }
        chain = new FrenchStemFilter(chain);
        // On this legacy path lowercasing is applied after stemming.
        return new TokenStreamComponents(tokenizer, new LowerCaseFilter(matchVersion, chain));
    }

    chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (!excltable.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        chain = new FrenchLightStemFilter(chain);
    }
    else
    {
        chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.FrenchStemmer());
    }

    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
/// read from the supplied <see cref="TextReader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text handed to the <see cref="StandardTokenizer"/>.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/> filtered
/// with <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/> (match version 3.1
/// and later only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> when the stem exclusion set is non-empty, and
/// <see cref="PorterStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream stream = new StandardFilter(m_matchVersion, tokenizer);

    // Before 3.1 the classic behavior applies and StandardFilter already handles it,
    // so the dedicated possessive filter is only added from 3.1 onwards.
#pragma warning disable 612, 618
    bool stripPossessives = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (stripPossessives)
    {
        stream = new EnglishPossessiveFilter(m_matchVersion, stream);
    }

    // Lowercase first, then drop stop words.
    stream = new StopFilter(m_matchVersion, new LowerCaseFilter(m_matchVersion, stream), m_stopwords);

    // Mark excluded terms as keywords only when there is something to exclude.
    bool hasExclusions = stemExclusionSet.Count > 0;
    if (hasExclusions)
    {
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }

    return new TokenStreamComponents(tokenizer, new PorterStemFilter(stream));
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="Reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text handed to the <see cref="StandardTokenizer"/>.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/> filtered
/// with <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> when an exclusion table is present and non-empty,
/// and <see cref="BrazilianStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    // Lowercasing runs directly on the tokenizer output, ahead of StandardFilter.
    TokenStream chain = new StandardFilter(matchVersion, new LowerCaseFilter(matchVersion, tokenizer));
    chain = new StopFilter(matchVersion, chain, stopwords);

    // Skip keyword marking when there is no exclusion table or it has no entries.
    bool nothingExcluded = excltable == null || excltable.Empty;
    if (!nothingExcluded)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }

    TokenStream stemmed = new BrazilianStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
/// read from the supplied <see cref="Reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text handed to the <see cref="StandardTokenizer"/>.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> whose source is a <see cref="StandardTokenizer"/>
/// wrapped in a <see cref="StandardFilter"/>. From match version 3.1 on, the chain is
/// <see cref="ElisionFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// an optional <see cref="SetKeywordMarkerFilter"/> (non-empty exclusion table only), then
/// <see cref="FrenchLightStemFilter"/> for 3.6 and later or a <see cref="SnowballFilter"/>
/// with the French stemmer otherwise. Before 3.1 the chain is <see cref="StopFilter"/>,
/// the optional <see cref="SetKeywordMarkerFilter"/>, <see cref="FrenchStemFilter"/> and a
/// trailing <see cref="LowerCaseFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);

    bool modernChain = matchVersion.onOrAfter(Version.LUCENE_31);

    // Only the 3.1+ chain strips elisions and lowercases ahead of stop-word removal.
    if (modernChain)
    {
        stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
        stream = new LowerCaseFilter(matchVersion, stream);
    }

    // Stop-word removal and optional keyword marking are common to both chains.
    stream = new StopFilter(matchVersion, stream, stopwords);
    if (!excltable.Empty)
    {
        stream = new SetKeywordMarkerFilter(stream, excltable);
    }

    if (!modernChain)
    {
        // Legacy chain: stem first, convert to lowercase afterwards.
        stream = new FrenchStemFilter(stream);
        stream = new LowerCaseFilter(matchVersion, stream);
    }
    else if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        stream = new FrenchLightStemFilter(stream);
    }
    else
    {
        stream = new SnowballFilter(stream, new org.tartarus.snowball.ext.FrenchStemmer());
    }

    return new TokenStreamComponents(tokenizer, stream);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
/// read from the supplied <see cref="Reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text handed to the <see cref="StandardTokenizer"/>.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/> filtered
/// with <see cref="StandardFilter"/>, <see cref="ElisionFilter"/> (match version 3.6 and
/// later only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> when the stem exclusion set is non-empty, and a
/// <see cref="SnowballFilter"/> using the Catalan stemmer.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    // Elision handling is only part of the chain from 3.6 onwards.
    bool stripElisions = matchVersion.onOrAfter(Version.LUCENE_36);
    if (stripElisions)
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }

    // Lowercase first, then drop stop words.
    chain = new StopFilter(matchVersion, new LowerCaseFilter(matchVersion, chain), stopwords);

    // Mark excluded terms as keywords only when there is something to exclude.
    bool nothingExcluded = stemExclusionSet.Empty;
    if (!nothingExcluded)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new SnowballFilter(chain, new CatalanStemmer());
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
/// read from the supplied <see cref="Reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text handed to the <see cref="StandardTokenizer"/>.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/> filtered
/// with <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> when the stem exclusion set is non-empty, and
/// <see cref="SoraniStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    // Normalization is applied to the StandardFilter output, before lowercasing.
    TokenStream chain = new SoraniNormalizationFilter(new StandardFilter(matchVersion, tokenizer));

    // Lowercase first, then drop stop words.
    chain = new StopFilter(matchVersion, new LowerCaseFilter(matchVersion, chain), stopwords);

    // Mark excluded terms as keywords only when there is something to exclude.
    bool nothingExcluded = stemExclusionSet.Empty;
    if (!nothingExcluded)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new SoraniStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}