Removes elisions from a TokenStream. For example, "l'avion" (the plane) will be tokenized as "avion" (plane).
Inheritance: TokenFilter
Esempio n. 1
0
        public virtual void TestElision_()
        {
            string         test      = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
            Tokenizer      tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
            CharArraySet   articles  = new CharArraySet(TEST_VERSION_CURRENT, AsSet("l", "M"), false);
            TokenFilter    filter    = new ElisionFilter(tokenizer, articles);
            IList <string> tas       = Filter(filter);

            assertEquals("embrouille", tas[4]);
            assertEquals("O'brian", tas[6]);
            assertEquals("enfin", tas[7]);
        }
Esempio n. 2
0
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
     #pragma warning disable 612, 618
     if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
     #pragma warning restore 612, 618
     {
         s.EnablePositionIncrements = false;
     }
     result = s;
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
     result = new IrishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new IrishStemmer());
     return new TokenStreamComponents(source, result);
 }
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="ItalianLightStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_32))
#pragma warning restore 612, 618
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                result = new ItalianLightStemFilter(result);
            }
            else
            {
                result = new SnowballFilter(result, new ItalianStemmer());
            }
            return new TokenStreamComponents(source, result);
        }