This analyzer implements light stemming as specified in "Light Stemming for Arabic Information Retrieval" (http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf).
The analysis package contains three primary components:
/// <summary>
/// Analyzes two inputs with a single <see cref="ArabicAnalyzer"/> instance and
/// checks that each one produces the expected single token.
/// </summary>
/// <remarks>
/// Both calls share the same analyzer object; judging by the test name this is
/// meant to exercise token stream reuse (confirm against the analyzer implementation).
/// </remarks>
public virtual void TestReusableTokenStream()
{
    ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);

    AssertAnalyzesTo(a, "كبير", new string[] { "كبير" });
    AssertAnalyzesTo(a, "كبيرة", new string[] { "كبير" }); // feminine marker
}
/// <summary>
/// Builds an <see cref="ArabicAnalyzer"/> with a caller-supplied stopword set
/// ("the", "and", "a") and checks that "The quick brown fox." is analyzed to
/// the tokens "quick", "brown", "fox".
/// </summary>
public virtual void TestCustomStopwords()
{
    // NOTE(review): the trailing 'false' is presumably the ignore-case flag of CharArraySet; confirm.
    var stopwords = new CharArraySet(TEST_VERSION_CURRENT, AsSet("the", "and", "a"), false);
    var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, stopwords);

    string[] expectedTokens = new string[] { "quick", "brown", "fox" };
    AssertAnalyzesTo(analyzer, "The quick brown fox.", expectedTokens);
}
/// <summary>
/// Checks the effect of the third constructor argument of <see cref="ArabicAnalyzer"/>
/// (the stem exclusion set, going by the test name): with "ساهدهات" in that set the
/// word is expected back unchanged, while with an empty set it is expected as "ساهد".
/// </summary>
/// <remarks>
/// Each expectation is asserted twice against the same analyzer instance, exactly
/// as many times as before; the repetition is expressed as a two-pass loop.
/// </remarks>
public virtual void TestWithStemExclusionSet()
{
    const string input = "كبيرة the quick ساهدهات";
    const int passes = 2;

    // First analyzer: empty stopword set, one word listed in the exclusion set.
    var exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("ساهدهات"), false);
    var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);
    for (int pass = 0; pass < passes; pass++)
    {
        AssertAnalyzesTo(analyzer, input, new string[] { "كبير", "the", "quick", "ساهدهات" });
    }

    // Second analyzer: both sets empty, so nothing is excluded.
    analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
    for (int pass = 0; pass < passes; pass++)
    {
        AssertAnalyzesTo(analyzer, input, new string[] { "كبير", "the", "quick", "ساهد" });
    }
}
/// <summary>
/// Runs a default-configured <see cref="ArabicAnalyzer"/> over a series of Arabic
/// inputs and checks the tokens produced for each one.
/// </summary>
/// <remarks>
/// The trailing comment on each assertion names the feature that input targets.
/// Every assertion sits on its own line so that a trailing comment cannot
/// comment out the statements that follow it.
/// </remarks>
public virtual void TestBasicFeatures()
{
    ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);

    AssertAnalyzesTo(a, "كبير", new string[] { "كبير" });
    AssertAnalyzesTo(a, "كبيرة", new string[] { "كبير" }); // feminine marker

    AssertAnalyzesTo(a, "مشروب", new string[] { "مشروب" });
    AssertAnalyzesTo(a, "مشروبات", new string[] { "مشروب" }); // plural -at

    AssertAnalyzesTo(a, "أمريكيين", new string[] { "امريك" }); // plural -in
    AssertAnalyzesTo(a, "امريكي", new string[] { "امريك" }); // singular with bare alif

    AssertAnalyzesTo(a, "كتاب", new string[] { "كتاب" });
    AssertAnalyzesTo(a, "الكتاب", new string[] { "كتاب" }); // definite article

    AssertAnalyzesTo(a, "ما ملكت أيمانكم", new string[] { "ملكت", "ايمانكم" });
    AssertAnalyzesTo(a, "الذين ملكت أيمانكم", new string[] { "ملكت", "ايمانكم" }); // stopwords
}