/// <summary>
/// Checks <see cref="TurkishLowerCaseFilter"/> on decomposed input where another
/// combining mark (U+0316) sits between the capital I and the combining dot
/// above (U+0307), and where U+0316 directly follows a plain capital I.
/// </summary>
public virtual void TestDecomposed2()
{
    const string input = "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA";
    string[] expected = { "i\u0316stanbul", "izmir", "\u0131\u0316sparta" };

    TokenStream tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    TurkishLowerCaseFilter lowerCased = new TurkishLowerCaseFilter(tokenizer);

    // The dot above is expected to disappear; the intervening U+0316 must survive.
    AssertTokenStreamContents(lowerCased, expected);
}
/// <summary>
/// Checks <see cref="TurkishLowerCaseFilter"/> on a token that consists solely of
/// a capital I followed by a combining dot above (U+0307): the expected output
/// is the single token "i".
/// </summary>
public virtual void TestDecomposed3()
{
    const string input = "\u0049\u0307";
    string[] expected = { "i" };

    TokenStream tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    TurkishLowerCaseFilter lowerCased = new TurkishLowerCaseFilter(tokenizer);

    AssertTokenStreamContents(lowerCased, expected);
}
/// <summary>
/// Checks the basic Turkish lowercasing rules of <see cref="TurkishLowerCaseFilter"/>:
/// U+0130 (capital I with dot above) is expected to become "i", and a plain
/// capital I is expected to become U+0131 (dotless i).
/// </summary>
public virtual void TestTurkishLowerCaseFilter()
{
    const string input = "\u0130STANBUL \u0130ZM\u0130R ISPARTA";
    string[] expected = { "istanbul", "izmir", "\u0131sparta" };

    TokenStream tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    TurkishLowerCaseFilter lowerCased = new TurkishLowerCaseFilter(tokenizer);

    AssertTokenStreamContents(lowerCased, expected);
}
/// <summary>
/// Runs whitespace-tokenized Turkish text through <see cref="TurkishLowerCaseFilter"/>
/// followed by <see cref="ApostropheFilter"/> and checks that each expected token
/// is lowercased and ends before its apostrophe.
/// </summary>
public virtual void TestApostropheFilter()
{
    const string input = "Türkiye'de 2003'te Van Gölü'nü gördüm";
    string[] expected = { "türkiye", "2003", "van", "gölü", "gördüm" };

    // One local per pipeline stage instead of reassigning a single variable.
    TokenStream tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    TokenStream lowerCased = new TurkishLowerCaseFilter(tokenizer);
    TokenStream withoutSuffixes = new ApostropheFilter(lowerCased);

    AssertTokenStreamContents(withoutSuffixes, expected);
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a lowercase filter
/// (<see cref="TurkishLowerCaseFilter"/> or <see cref="LowerCaseFilter"/>),
/// a <see cref="StopFilter"/> when a stop set is present,
/// and a <see cref="SnowballFilter"/>.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    // Remove the possessive 's for English stemmers.
    bool stripsPossessive = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
        && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins"));
    if (stripsPossessive)
    {
        chain = new EnglishPossessiveFilter(chain);
    }

    // Use a special lowercase filter for Turkish; the stemmer expects it.
    bool needsTurkishLowerCase = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
        && name.Equals("Turkish");
    if (needsTurkishLowerCase)
    {
        chain = new TurkishLowerCaseFilter(chain);
    }
    else
    {
        chain = new LowerCaseFilter(matchVersion, chain);
    }

    // Stop-word removal is skipped entirely when no stop set was configured.
    if (stopSet != null)
    {
        chain = new StopFilter(matchVersion, chain, stopSet);
    }

    return new TokenStreamComponents(tokenizer, new SnowballFilter(chain, name));
}
/// <summary>
/// Feeds a Turkish sentence containing apostrophe-separated suffixes through
/// <see cref="TurkishLowerCaseFilter"/> and then <see cref="ApostropheFilter"/>,
/// expecting lowercased tokens that stop before the apostrophe.
/// </summary>
public virtual void TestApostropheFilter()
{
    StringReader text = new StringReader("Türkiye'de 2003'te Van Gölü'nü gördüm");

    // Build the chain inside-out: tokenizer -> Turkish lowercase -> apostrophe filter.
    TokenStream pipeline = new ApostropheFilter(
        new TurkishLowerCaseFilter(
            new MockTokenizer(text, MockTokenizer.WHITESPACE, false)));

    AssertTokenStreamContents(
        pipeline,
        new string[] { "türkiye", "2003", "van", "gölü", "gördüm" });
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="ApostropheFilter"/>
/// (only when the match version is on or after <c>LUCENE_48</c>),
/// <see cref="TurkishLowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// and <see cref="SnowballFilter"/>.
/// </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

    bool supportsApostropheFilter = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
    if (supportsApostropheFilter)
    {
        chain = new ApostropheFilter(chain);
    }

    chain = new TurkishLowerCaseFilter(chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    // Terms in the exclusion set are marked as keywords ahead of the stemming stage.
    bool hasStemExclusions = stemExclusionSet.Count > 0;
    if (hasStemExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new SnowballFilter(chain, new TurkishStemmer());
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text
/// in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
/// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="ApostropheFilter"/>
/// (only when the match version is on or after <c>LUCENE_48</c>),
/// <see cref="TurkishLowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// and <see cref="SnowballFilter"/>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);

    bool supportsApostropheFilter = matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
    if (supportsApostropheFilter)
    {
        chain = new ApostropheFilter(chain);
    }

    chain = new TurkishLowerCaseFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    // Terms in the exclusion set are marked as keywords ahead of the stemming stage.
    bool hasStemExclusions = stemExclusionSet.Any();
    if (hasStemExclusions)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    TokenStream stemmed = new SnowballFilter(chain, new TurkishStemmer());
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Edge case for <see cref="TurkishLowerCaseFilter"/>: the whole input is one
/// capital I immediately followed by a combining dot above (U+0307), and the
/// only expected token is "i".
/// </summary>
public virtual void TestDecomposed3()
{
    StringReader text = new StringReader("\u0049\u0307");
    TokenStream source = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

    TurkishLowerCaseFilter turkishLower = new TurkishLowerCaseFilter(source);

    AssertTokenStreamContents(turkishLower, new string[] { "i" });
}
/// <summary>
/// Decomposed-input test for <see cref="TurkishLowerCaseFilter"/> in which a second
/// combining mark (U+0316) appears between capital I and the combining dot above
/// (U+0307), plus a token where U+0316 follows a plain capital I.
/// </summary>
public virtual void TestDecomposed2()
{
    StringReader text = new StringReader("\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA");
    TokenStream source = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

    TurkishLowerCaseFilter turkishLower = new TurkishLowerCaseFilter(source);

    // Expected: U+0307 is dropped, U+0316 is kept, and the bare I becomes dotless i (U+0131).
    AssertTokenStreamContents(
        turkishLower,
        new string[] { "i\u0316stanbul", "izmir", "\u0131\u0316sparta" });
}
/// <summary>
/// Precomposed-input test for <see cref="TurkishLowerCaseFilter"/>: tokens containing
/// U+0130 (capital I with dot above) and a plain capital I are expected to come
/// out with "i" and U+0131 (dotless i) respectively.
/// </summary>
public virtual void TestTurkishLowerCaseFilter()
{
    StringReader text = new StringReader("\u0130STANBUL \u0130ZM\u0130R ISPARTA");
    TokenStream source = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

    TurkishLowerCaseFilter turkishLower = new TurkishLowerCaseFilter(source);

    AssertTokenStreamContents(
        turkishLower,
        new string[] { "istanbul", "izmir", "\u0131sparta" });
}