/// <summary> /// Creates a /// <see cref="TokenStreamComponents"/> /// which tokenizes all the text in the provided <see cref="TextReader"/>. /// </summary> /// <returns> A /// <see cref="TokenStreamComponents"/> /// built from an <see cref="StandardTokenizer"/> filtered with /// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <see cref="ItalianLightStemFilter"/>. </returns> protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer source = new StandardTokenizer(m_matchVersion, reader); TokenStream result = new StandardFilter(m_matchVersion, source); #pragma warning disable 612, 618 if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_32)) #pragma warning restore 612, 618 { result = new ElisionFilter(result, DEFAULT_ARTICLES); } result = new LowerCaseFilter(m_matchVersion, result); result = new StopFilter(m_matchVersion, result, m_stopwords); if (stemExclusionSet.Count > 0) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } #pragma warning disable 612, 618 if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)) #pragma warning restore 612, 618 { result = new ItalianLightStemFilter(result); } else { result = new SnowballFilter(result, new ItalianStemmer()); } return(new TokenStreamComponents(source, result)); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="ItalianLightStemFilter"/>. </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); if (matchVersion.onOrAfter(Version.LUCENE_32)) { result = new ElisionFilter(result, DEFAULT_ARTICLES); } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.Empty) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } if (matchVersion.onOrAfter(Version.LUCENE_36)) { result = new ItalianLightStemFilter(result); } else { result = new SnowballFilter(result, new ItalianStemmer()); } return(new TokenStreamComponents(source, result)); }
public virtual void TestElision_() { string test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin."; Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test)); CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, AsSet("l", "M"), false); TokenFilter filter = new ElisionFilter(tokenizer, articles); IList<string> tas = Filter(filter); assertEquals("embrouille", tas[4]); assertEquals("O'brian", tas[6]); assertEquals("enfin", tas[7]); }
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer source = new StandardTokenizer(Lucene.Net.Util.LuceneVersion.LUCENE_48, reader); TokenStream result = new StandardFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, source); result = new ElisionFilter(result, _defaultArticles); result = new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, result); result = new StopFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, result, _defaultStopwords); result = new PermutermTokenFilter(result); return(new TokenStreamComponents(source, result)); }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: //ORIGINAL LINE: public void testElision() throws Exception public virtual void testElision() { string test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin."; Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test)); CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false); TokenFilter filter = new ElisionFilter(tokenizer, articles); IList <string> tas = filter(filter); assertEquals("embrouille", tas[4]); assertEquals("O'brian", tas[6]); assertEquals("enfin", tas[7]); }
public void TestElision2() { String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin."; Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(test)); HashSet <String> articles = new HashSet <String>(); articles.Add("l"); articles.Add("M"); TokenFilter filter = new ElisionFilter(tokenizer, articles); List <string> tas = Filtre(filter); Assert.AreEqual("embrouille", tas[4]); Assert.AreEqual("O'brian", tas[6]); Assert.AreEqual("enfin", tas[7]); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, /// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="SnowballFilter"/>. </returns> public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)) { result = new ElisionFilter(result, DEFAULT_ARTICLES); } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (stemExclusionSet.Count > 0) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new SnowballFilter(result, new CatalanStemmer()); return(new TokenStreamComponents(source, result)); }
/// <summary> /// Creates /// <see cref="TokenStreamComponents"/> /// used to tokenize all the text in the provided <see cref="TextReader"/>. /// </summary> /// <returns> <see cref="TokenStreamComponents"/> /// built from a <see cref="StandardTokenizer"/> filtered with /// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>, /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided, and <see cref="FrenchLightStemFilter"/> </returns> /// protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { #pragma warning disable 612, 618 if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)) #pragma warning restore 612, 618 { Tokenizer source = new StandardTokenizer(m_matchVersion, reader); TokenStream result = new StandardFilter(m_matchVersion, source); result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(m_matchVersion, result); result = new StopFilter(m_matchVersion, result, m_stopwords); if (excltable.Count > 0) { result = new SetKeywordMarkerFilter(result, excltable); } #pragma warning disable 612, 618 if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)) #pragma warning restore 612, 618 { result = new FrenchLightStemFilter(result); } else { result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer()); } return(new TokenStreamComponents(source, result)); } else { Tokenizer source = new StandardTokenizer(m_matchVersion, reader); TokenStream result = new StandardFilter(m_matchVersion, source); result = new StopFilter(m_matchVersion, result, m_stopwords); if (excltable.Count > 0) { result = new SetKeywordMarkerFilter(result, excltable); } #pragma warning disable 612, 618 result = new FrenchStemFilter(result); #pragma warning restore 612, 618 // Convert to lowercase after stemming! return(new TokenStreamComponents(source, new LowerCaseFilter(m_matchVersion, result))); } }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/> /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="SnowballFilter"/>. </returns> public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS); if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44)) { s.EnablePositionIncrements = false; } result = s; result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new IrishLowerCaseFilter(result); result = new StopFilter(matchVersion, result, stopwords); if (stemExclusionSet.Count > 0) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new SnowballFilter(result, new IrishStemmer()); return(new TokenStreamComponents(source, result)); }
/// <summary> /// Creates a /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// which tokenizes all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> A /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from an <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, /// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided and <seealso cref="SnowballFilter"/>. </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); if (matchVersion.onOrAfter(Version.LUCENE_36)) { result = new ElisionFilter(result, DEFAULT_ARTICLES); } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.Empty) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } result = new SnowballFilter(result, new CatalanStemmer()); return new TokenStreamComponents(source, result); }
/// <summary> /// Creates /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// used to tokenize all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from a <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, /// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>, /// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided, and <seealso cref="FrenchLightStemFilter"/> </returns> /// public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { #pragma warning disable 612, 618 if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)) #pragma warning restore 612, 618 { Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (excltable.Count > 0) { result = new SetKeywordMarkerFilter(result, excltable); } #pragma warning disable 612, 618 if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)) #pragma warning restore 612, 618 { result = new FrenchLightStemFilter(result); } else { result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer()); } return new TokenStreamComponents(source, result); } else { Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new StopFilter(matchVersion, result, stopwords); if (excltable.Count > 0) { result = new SetKeywordMarkerFilter(result, excltable); } #pragma warning disable 612, 618 result = new FrenchStemFilter(result); #pragma warning restore 612, 618 // Convert to lowercase after stemming! return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)); } }
/// <summary> /// Creates /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// used to tokenize all the text in the provided <seealso cref="Reader"/>. /// </summary> /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/> /// built from a <seealso cref="StandardTokenizer"/> filtered with /// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, /// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>, /// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided, and <seealso cref="FrenchLightStemFilter"/> </returns> protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader) { if (matchVersion.onOrAfter(Version.LUCENE_31)) { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!excltable.Empty) { result = new SetKeywordMarkerFilter(result, excltable); } if (matchVersion.onOrAfter(Version.LUCENE_36)) { result = new FrenchLightStemFilter(result); } else { result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer()); } return new TokenStreamComponents(source, result); } else { //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader); Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new StopFilter(matchVersion, result, stopwords); if (!excltable.Empty) { result = new SetKeywordMarkerFilter(result, excltable); } result = new FrenchStemFilter(result); // Convert to lowercase after stemming! return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)); } }