Strips all characters after an apostrophe (including the apostrophe itself).

In Turkish, the apostrophe is used to separate suffixes from proper names (continent, sea, river, lake, mountain, upland, and proper names related to religion and mythology). This filter is intended to be used before stem filters. For more information, see "Role of Apostrophes in Turkish Information Retrieval".

Inheritance: TokenFilter
 /// <summary>
 /// Runs a Turkish sentence through a whitespace <see cref="MockTokenizer"/>,
 /// <see cref="TurkishLowerCaseFilter"/> and <see cref="ApostropheFilter"/>, and
 /// checks that each emitted token ends before its apostrophe.
 /// </summary>
 public virtual void TestApostropheFilter()
 {
     const string text = "Türkiye'de 2003'te Van Gölü'nü gördüm";
     string[] expectedTerms = { "türkiye", "2003", "van", "gölü", "gördüm" };

     // Each stage wraps the previous one: tokenize -> lower-case -> strip apostrophe suffix.
     TokenStream tokens = new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false);
     TokenStream lowered = new TurkishLowerCaseFilter(tokens);
     TokenStream stripped = new ApostropheFilter(lowered);

     AssertTokenStreamContents(stripped, expectedTerms);
 }
Example #2
0
        /// <summary>
        /// Verifies the output of <see cref="ApostropheFilter"/> when it is placed after
        /// <see cref="TurkishLowerCaseFilter"/> on a whitespace-tokenized Turkish sentence:
        /// the apostrophe and the suffix following it are absent from every expected token.
        /// </summary>
        public virtual void TestApostropheFilter()
        {
            StringReader sentence = new StringReader("Türkiye'de 2003'te Van Gölü'nü gördüm");
            string[] expected = new string[] { "türkiye", "2003", "van", "gölü", "gördüm" };

            // Build the chain innermost-first; the variable is reused as each filter wraps the last.
            TokenStream pipeline = new MockTokenizer(sentence, MockTokenizer.WHITESPACE, false);
            pipeline = new TurkishLowerCaseFilter(pipeline);
            pipeline = new ApostropheFilter(pipeline);

            AssertTokenStreamContents(pipeline, expected);
        }
Example #3
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> that tokenize the text supplied
 /// by the given <see cref="TextReader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not read by this method body. </param>
 /// <param name="reader"> Text source handed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// A <see cref="TokenStreamComponents"/> whose source is a <see cref="StandardTokenizer"/>
 /// and whose filter chain is, in order: <see cref="StandardFilter"/>,
 /// <see cref="ApostropheFilter"/> (only when the match version is
 /// <see cref="LuceneVersion.LUCENE_48"/> or later), <see cref="TurkishLowerCaseFilter"/>,
 /// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the stem
 /// exclusion set is non-empty) and <see cref="SnowballFilter"/> with a
 /// <see cref="TurkishStemmer"/>.
 /// </returns>
 protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
     TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

     // Apostrophe stripping is gated on the match version and runs before lower-casing.
     chain = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)
         ? new ApostropheFilter(chain)
         : chain;

     chain = new TurkishLowerCaseFilter(chain);
     chain = new StopFilter(m_matchVersion, chain, m_stopwords);

     // Keyword marking is inserted ahead of the stemmer only when there are exclusions.
     chain = stemExclusionSet.Count > 0
         ? new SetKeywordMarkerFilter(chain, stemExclusionSet)
         : chain;

     TokenStream stemmed = new SnowballFilter(chain, new TurkishStemmer());
     return new TokenStreamComponents(tokenizer, stemmed);
 }
 /// <summary>
 /// Assembles the <see cref="TokenStreamComponents"/> that tokenize all the text
 /// read from the supplied <see cref="TextReader"/>.
 /// </summary>
 /// <param name="fieldName"> Name of the field being analyzed; not read by this method body. </param>
 /// <param name="reader"> Text source passed to the <see cref="StandardTokenizer"/>. </param>
 /// <returns>
 /// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
 /// filtered, in order, with <see cref="StandardFilter"/>, <see cref="ApostropheFilter"/>
 /// (only when the match version is <see cref="LuceneVersion.LUCENE_48"/> or later),
 /// <see cref="TurkishLowerCaseFilter"/>, <see cref="StopFilter"/>,
 /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set has any
 /// entries) and <see cref="SnowballFilter"/> with a <see cref="TurkishStemmer"/>.
 /// </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenSource = new StandardTokenizer(matchVersion, reader);
     TokenStream filtered = new StandardFilter(matchVersion, tokenSource);

     // The apostrophe filter is part of the chain only from LUCENE_48 onward.
     bool stripsApostrophes = matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
     if (stripsApostrophes)
     {
         filtered = new ApostropheFilter(filtered);
     }

     filtered = new TurkishLowerCaseFilter(filtered);
     filtered = new StopFilter(matchVersion, filtered, stopwords);

     // Terms in the exclusion set are marked as keywords before the stemmer is applied.
     bool hasStemExclusions = stemExclusionSet.Any();
     if (hasStemExclusions)
     {
         filtered = new SetKeywordMarkerFilter(filtered, stemExclusionSet);
     }

     filtered = new SnowballFilter(filtered, new TurkishStemmer());
     return new TokenStreamComponents(tokenSource, filtered);
 }