public void TestElision2() { String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin."; Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(test)); HashSet<String> articles = new HashSet<String>(); articles.Add("l"); articles.Add("M"); TokenFilter filter = new ElisionFilter(tokenizer, articles); List<string> tas = Filtre(filter); Assert.AreEqual("embrouille", tas[4]); Assert.AreEqual("O'brian", tas[6]); Assert.AreEqual("enfin", tas[7]); }
/// <summary> /// Creates /// <see cref="TokenStreamComponents"/> /// used to tokenize all the text in the provided <see cref="TextReader"/>. /// </summary> /// <returns> <see cref="TokenStreamComponents"/> /// built from a <see cref="StandardTokenizer"/> filtered with /// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>, /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is /// provided, and <see cref="FrenchLightStemFilter"/> </returns> /// protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { #pragma warning disable 612, 618 if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)) #pragma warning restore 612, 618 { Tokenizer source = new StandardTokenizer(m_matchVersion, reader); TokenStream result = new StandardFilter(m_matchVersion, source); result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(m_matchVersion, result); result = new StopFilter(m_matchVersion, result, m_stopwords); if (excltable.Count > 0) { result = new SetKeywordMarkerFilter(result, excltable); } #pragma warning disable 612, 618 if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)) #pragma warning restore 612, 618 { result = new FrenchLightStemFilter(result); } else { result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer()); } return(new TokenStreamComponents(source, result)); } else { Tokenizer source = new StandardTokenizer(m_matchVersion, reader); TokenStream result = new StandardFilter(m_matchVersion, source); result = new StopFilter(m_matchVersion, result, m_stopwords); if (excltable.Count > 0) { result = new SetKeywordMarkerFilter(result, excltable); } #pragma warning disable 612, 618 result = new FrenchStemFilter(result); #pragma warning restore 612, 618 // Convert to lowercase after stemming! return(new TokenStreamComponents(source, new LowerCaseFilter(m_matchVersion, result))); } }