Analyzer for Arabic.

This analyzer implements light stemming as specified in "Light Stemming for Arabic Information Retrieval": http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf

The analysis package contains three primary components:

  • ArabicNormalizationFilter: Arabic orthographic normalization.
  • ArabicStemFilter: Arabic light stemming.
  • Arabic stop words file: a set of default Arabic stop words.

Inheritance: Lucene.Net.Analysis.Util.StopwordAnalyzerBase
示例#1
0
        /// <summary>
        /// Analyzes two words in succession with a single <c>ArabicAnalyzer</c>
        /// instance and checks the tokens produced for each of them.
        /// </summary>
        public virtual void TestReusableTokenStream()
        {
            var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT);

            // A word without the suffix is expected to come back unchanged.
            string[] unchanged = new[] { "كبير" };
            AssertAnalyzesTo(analyzer, "كبير", unchanged);

            // feminine marker: the trailing letter is expected to be dropped
            string[] withoutMarker = new[] { "كبير" };
            AssertAnalyzesTo(analyzer, "كبيرة", withoutMarker);
        }
示例#2
0
        /// <summary>
        /// Builds an <c>ArabicAnalyzer</c> from a caller-supplied set containing
        /// "the", "and" and "a", then checks that analyzing an English sentence
        /// yields only "quick", "brown" and "fox" (the leading "The" is not
        /// expected in the output).
        /// </summary>
        public virtual void TestCustomStopwords()
        {
            // NOTE(review): the trailing 'false' is presumably the ignore-case flag of CharArraySet - confirm.
            var stopwords = new CharArraySet(TEST_VERSION_CURRENT, AsSet("the", "and", "a"), false);
            var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, stopwords);

            string[] expectedTokens = new[] { "quick", "brown", "fox" };
            AssertAnalyzesTo(analyzer, "The quick brown fox.", expectedTokens);
        }
示例#3
0
        /// <summary>
        /// Checks the effect of the third <c>ArabicAnalyzer</c> constructor argument
        /// (presumably the stem exclusion set, going by the test name): when the set
        /// contains the last word of the input, that word is expected unchanged;
        /// when the set is empty, a shortened form of the word is expected instead.
        /// Each assertion is issued twice against the same analyzer instance.
        /// </summary>
        public virtual void TestWithStemExclusionSet()
        {
            const string text = "كبيرة the quick ساهدهات";

            var exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("ساهدهات"), false);
            var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);

            // Word listed in the exclusion set: expected to pass through as-is.
            for (int pass = 0; pass < 2; pass++)
            {
                AssertAnalyzesTo(analyzer, text, new[] { "كبير", "the", "quick", "ساهدهات" });
            }

            // Empty exclusion set: the same word is expected in its shortened form.
            analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
            for (int pass = 0; pass < 2; pass++)
            {
                AssertAnalyzesTo(analyzer, text, new[] { "كبير", "the", "quick", "ساهد" });
            }
        }
        /// <summary>
        /// Runs a default-constructed <c>ArabicAnalyzer</c> over a series of inputs
        /// and checks the tokens expected for each one. The inputs are listed as
        /// rows of a table and asserted in order.
        /// </summary>
        public virtual void TestBasicFeatures()
        {
            var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT);

            // Each row pairs an input text with the tokens expected from the analyzer.
            var expectations = new[]
            {
                new { Text = "كبير", Tokens = new[] { "كبير" } },
                new { Text = "كبيرة", Tokens = new[] { "كبير" } }, // feminine marker

                new { Text = "مشروب", Tokens = new[] { "مشروب" } },
                new { Text = "مشروبات", Tokens = new[] { "مشروب" } }, // plural -at

                new { Text = "أمريكيين", Tokens = new[] { "امريك" } }, // plural -in
                new { Text = "امريكي", Tokens = new[] { "امريك" } }, // singular with bare alif

                new { Text = "كتاب", Tokens = new[] { "كتاب" } },
                new { Text = "الكتاب", Tokens = new[] { "كتاب" } }, // definite article

                new { Text = "ما ملكت أيمانكم", Tokens = new[] { "ملكت", "ايمانكم" } },
                new { Text = "الذين ملكت أيمانكم", Tokens = new[] { "ملكت", "ايمانكم" } }, // stopwords
            };

            foreach (var row in expectations)
            {
                AssertAnalyzesTo(analyzer, row.Text, row.Tokens);
            }
        }
示例#5
0
        /// <summary>
        /// Checks the tokens a default-constructed <c>ArabicAnalyzer</c> is expected
        /// to produce for several groups of inputs: a feminine form, a plural in -at,
        /// a plural in -in, a word with the definite article, and phrases whose
        /// first word is not expected in the output.
        /// </summary>
        public virtual void TestBasicFeatures()
        {
            var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT);

            // feminine marker
            string[] adjective = new[] { "كبير" };
            AssertAnalyzesTo(analyzer, "كبير", adjective);
            string[] adjectiveFromFeminine = new[] { "كبير" };
            AssertAnalyzesTo(analyzer, "كبيرة", adjectiveFromFeminine);

            // plural -at
            string[] noun = new[] { "مشروب" };
            AssertAnalyzesTo(analyzer, "مشروب", noun);
            string[] nounFromPlural = new[] { "مشروب" };
            AssertAnalyzesTo(analyzer, "مشروبات", nounFromPlural);

            // plural -in, then singular with bare alif
            string[] stemFromPlural = new[] { "امريك" };
            AssertAnalyzesTo(analyzer, "أمريكيين", stemFromPlural);
            string[] stemFromSingular = new[] { "امريك" };
            AssertAnalyzesTo(analyzer, "امريكي", stemFromSingular);

            // definite article
            string[] book = new[] { "كتاب" };
            AssertAnalyzesTo(analyzer, "كتاب", book);
            string[] bookFromDefinite = new[] { "كتاب" };
            AssertAnalyzesTo(analyzer, "الكتاب", bookFromDefinite);

            // stopwords
            string[] phraseTokens = new[] { "ملكت", "ايمانكم" };
            AssertAnalyzesTo(analyzer, "ما ملكت أيمانكم", phraseTokens);
            string[] secondPhraseTokens = new[] { "ملكت", "ايمانكم" };
            AssertAnalyzesTo(analyzer, "الذين ملكت أيمانكم", secondPhraseTokens);
        }
        /// <summary>
        /// Analyzes the same sentence with two <c>ArabicAnalyzer</c> instances: one
        /// whose third constructor argument is a set containing the sentence's last
        /// word, and one where that argument is an empty set. The word is expected
        /// unchanged in the first case and shortened in the second. Every assertion
        /// is made twice on the same analyzer.
        /// </summary>
        public virtual void TestWithStemExclusionSet()
        {
            string sentence = "كبيرة the quick ساهدهات";

            // NOTE(review): third argument is presumably the stem exclusion set - confirm against ArabicAnalyzer.
            CharArraySet protectedWords = new CharArraySet(TEST_VERSION_CURRENT, AsSet("ساهدهات"), false);
            ArabicAnalyzer analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, protectedWords);

            string[] keptFirst = new[] { "كبير", "the", "quick", "ساهدهات" };
            AssertAnalyzesTo(analyzer, sentence, keptFirst);
            string[] keptSecond = new[] { "كبير", "the", "quick", "ساهدهات" };
            AssertAnalyzesTo(analyzer, sentence, keptSecond);

            // Same input, but nothing is listed in the third-argument set.
            analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);

            string[] shortenedFirst = new[] { "كبير", "the", "quick", "ساهد" };
            AssertAnalyzesTo(analyzer, sentence, shortenedFirst);
            string[] shortenedSecond = new[] { "كبير", "the", "quick", "ساهد" };
            AssertAnalyzesTo(analyzer, sentence, shortenedSecond);
        }
 /// <summary>
 /// Constructs an <c>ArabicAnalyzer</c> with a custom set ("the", "and", "a")
 /// and checks that "The quick brown fox." is analyzed to "quick", "brown", "fox".
 /// </summary>
 public virtual void TestCustomStopwords()
 {
     const string sentence = "The quick brown fox.";

     var customWords = new CharArraySet(TEST_VERSION_CURRENT, AsSet("the", "and", "a"), false);
     var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT, customWords);

     // "The" is not among the expected tokens.
     AssertAnalyzesTo(analyzer, sentence, new[] { "quick", "brown", "fox" });
 }
 /// <summary>
 /// Uses one <c>ArabicAnalyzer</c> instance for two consecutive analyses and
 /// checks that both inputs yield the same single token.
 /// </summary>
 public virtual void TestReusableTokenStream()
 {
     var analyzer = new ArabicAnalyzer(TEST_VERSION_CURRENT);

     string plainInput = "كبير";
     string feminineInput = "كبيرة"; // feminine marker

     AssertAnalyzesTo(analyzer, plainInput, new[] { "كبير" });
     AssertAnalyzesTo(analyzer, feminineInput, new[] { "كبير" });
 }