Beispiel #1
0
        /// <summary>
        /// Mainly removes the definite article: a trailing "ият" (three characters) or one of
        /// "ът", "то", "те", "та", "ия", "ят" (two characters), subject to minimum word lengths.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new stemmed length </returns>
        private int RemoveArticle(char[] s, int len)
        {
            // The three-character form is only stripped from words longer than six characters.
            if (len > 6 && StemmerUtil.EndsWith(s, len, "ият"))
            {
                return len - 3;
            }

            // Most two-character forms require the word to be longer than five characters ...
            bool stripTwo = len > 5
                && (StemmerUtil.EndsWith(s, len, "ът")
                    || StemmerUtil.EndsWith(s, len, "то")
                    || StemmerUtil.EndsWith(s, len, "те")
                    || StemmerUtil.EndsWith(s, len, "та")
                    || StemmerUtil.EndsWith(s, len, "ия"));

            // ... while "ят" is already accepted once the word is longer than four characters.
            stripTwo = stripTwo || (len > 4 && StemmerUtil.EndsWith(s, len, "ят"));

            return stripTwo ? len - 2 : len;
        }
Beispiel #2
0
        /// <summary>
        /// For words longer than five characters, removes one trailing suffix:
        /// "lla", "tse" or "sti" (three characters), "ni" (two characters),
        /// or shortens a final "aa" to "a". Shorter words are returned unchanged.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Step2(char[] s, int len)
        {
            if (len <= 5)
            {
                return len;
            }

            bool threeCharSuffix = StemmerUtil.EndsWith(s, len, "lla")
                || StemmerUtil.EndsWith(s, len, "tse")
                || StemmerUtil.EndsWith(s, len, "sti");
            if (threeCharSuffix)
            {
                return len - 3;
            }

            if (StemmerUtil.EndsWith(s, len, "ni"))
            {
                return len - 2;
            }

            // aa -> a
            return StemmerUtil.EndsWith(s, len, "aa") ? len - 1 : len;
        }
Beispiel #3
0
        /// <summary>
        /// Strips a trailing "ησου", "ησε" or "ησα". If the remaining stem is listed in
        /// <c>exc16</c>, the leading two characters of the suffix (-ησ) are kept.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Rule16(char[] s, int len)
        {
            int stem;

            if (len > 4 && StemmerUtil.EndsWith(s, len, "ησου"))
            {
                stem = len - 4;
            }
            else if (len > 3 && (StemmerUtil.EndsWith(s, len, "ησε") || StemmerUtil.EndsWith(s, len, "ησα")))
            {
                stem = len - 3;
            }
            else
            {
                // No suffix matched: nothing to strip, nothing to restore.
                return len;
            }

            // add back -ησ for exception stems
            return exc16.Contains(s, 0, stem) ? stem + 2 : stem;
        }
Beispiel #4
0
        /// <summary>
        /// Strips a trailing "αγεσ", "αγα" or "αγε". The leading -αγ of the suffix is kept when
        /// the remaining stem is in <c>exc15a</c> or ends with one of a fixed list of endings,
        /// unless the stem is also in <c>exc15b</c> or ends with "κολλ".
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Rule15(char[] s, int len)
        {
            int suffixLength;

            if (len > 4 && StemmerUtil.EndsWith(s, len, "αγεσ"))
            {
                suffixLength = 4;
            }
            else if (len > 3 && (StemmerUtil.EndsWith(s, len, "αγα") || StemmerUtil.EndsWith(s, len, "αγε")))
            {
                suffixLength = 3;
            }
            else
            {
                return len;
            }

            int stem = len - suffixLength;

            // Stems that qualify for getting -αγ back.
            bool restorable = exc15a.Contains(s, 0, stem)
                || StemmerUtil.EndsWith(s, stem, "οφ")
                || StemmerUtil.EndsWith(s, stem, "πελ")
                || StemmerUtil.EndsWith(s, stem, "χορτ")
                || StemmerUtil.EndsWith(s, stem, "λλ")
                || StemmerUtil.EndsWith(s, stem, "σφ")
                || StemmerUtil.EndsWith(s, stem, "ρπ")
                || StemmerUtil.EndsWith(s, stem, "φρ")
                || StemmerUtil.EndsWith(s, stem, "πρ")
                || StemmerUtil.EndsWith(s, stem, "λοχ")
                || StemmerUtil.EndsWith(s, stem, "σμην");
            if (!restorable)
            {
                return stem;
            }

            // Stems that are explicitly excluded from the restoration.
            bool vetoed = exc15b.Contains(s, 0, stem) || StemmerUtil.EndsWith(s, stem, "κολλ");

            // add back -αγ
            return vetoed ? stem : stem + 2;
        }
Beispiel #5
0
#pragma warning restore 612, 618

        /// <summary>
        /// Applies two suffix removals in sequence: first "ιεστε" (keeping -ιεστ when the stem is
        /// in <c>exc12a</c>), then "εστε" (keeping -εστ when the stem is in <c>exc12b</c>).
        /// The second check runs on the length produced by the first.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Rule12(char[] s, int len)
        {
            if (len > 5 && StemmerUtil.EndsWith(s, len, "ιεστε"))
            {
                int afterLong = len - 5;

                // add back -ιεστ for exception stems
                len = exc12a.Contains(s, 0, afterLong) ? afterLong + 4 : afterLong;
            }

            if (len > 4 && StemmerUtil.EndsWith(s, len, "εστε"))
            {
                int afterShort = len - 4;

                // add back -εστ for exception stems
                len = exc12b.Contains(s, 0, afterShort) ? afterShort + 3 : afterShort;
            }

            return len;
        }
Beispiel #6
0
 /// <summary>
 /// Strips a trailing "αδεσ" or "αδων". The full four characters are removed only when the
 /// remaining stem ends with one of a fixed list of endings; otherwise -αδ is kept.
 /// </summary>
 /// <param name="s"> input buffer </param>
 /// <param name="len"> length of input buffer </param>
 /// <returns> new length </returns>
 private int Rule1(char[] s, int len)
 {
     bool hasSuffix = len > 4
         && (StemmerUtil.EndsWith(s, len, "αδεσ") || StemmerUtil.EndsWith(s, len, "αδων"));
     if (!hasSuffix)
     {
         return len;
     }

     int stem = len - 4;

     // Stems with these endings lose the whole suffix.
     bool stripCompletely = StemmerUtil.EndsWith(s, stem, "οκ")
         || StemmerUtil.EndsWith(s, stem, "μαμ")
         || StemmerUtil.EndsWith(s, stem, "μαν")
         || StemmerUtil.EndsWith(s, stem, "μπαμπ")
         || StemmerUtil.EndsWith(s, stem, "πατερ")
         || StemmerUtil.EndsWith(s, stem, "γιαγι")
         || StemmerUtil.EndsWith(s, stem, "νταντ")
         || StemmerUtil.EndsWith(s, stem, "κυρ")
         || StemmerUtil.EndsWith(s, stem, "θει")
         || StemmerUtil.EndsWith(s, stem, "πεθερ");

     // add back -αδ for every other stem
     return stripCompletely ? stem : stem + 2;
 }
Beispiel #7
0
        /// <summary>
        /// Two-stage suffix removal. First strips "ηθηκεσ", "ηθηκα" or "ηθηκε" if present. Then,
        /// on the resulting length, strips "ηκεσ", "ηκα" or "ηκε"; in that second stage -ηκ is kept
        /// when the stem is in <c>exc13</c> or ends with one of a fixed list of endings.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Rule13(char[] s, int len)
        {
            // Stage 1: the longer suffixes are removed unconditionally.
            int firstCut = 0;
            if (len > 6 && StemmerUtil.EndsWith(s, len, "ηθηκεσ"))
            {
                firstCut = 6;
            }
            else if (len > 5 && (StemmerUtil.EndsWith(s, len, "ηθηκα") || StemmerUtil.EndsWith(s, len, "ηθηκε")))
            {
                firstCut = 5;
            }
            len -= firstCut;

            // Stage 2: the shorter suffixes, evaluated against the stage-1 result.
            int secondCut;
            if (len > 4 && StemmerUtil.EndsWith(s, len, "ηκεσ"))
            {
                secondCut = 4;
            }
            else if (len > 3 && (StemmerUtil.EndsWith(s, len, "ηκα") || StemmerUtil.EndsWith(s, len, "ηκε")))
            {
                secondCut = 3;
            }
            else
            {
                return len;
            }
            len -= secondCut;

            bool keepPrefix = exc13.Contains(s, 0, len)
                || StemmerUtil.EndsWith(s, len, "σκωλ")
                || StemmerUtil.EndsWith(s, len, "σκουλ")
                || StemmerUtil.EndsWith(s, len, "ναρθ")
                || StemmerUtil.EndsWith(s, len, "σφ")
                || StemmerUtil.EndsWith(s, len, "οθ")
                || StemmerUtil.EndsWith(s, len, "πιθ");

            // add back the -ηκ
            return keepPrefix ? len + 2 : len;
        }
Beispiel #8
0
        /// <summary>
        /// Strips a trailing "ησουμε", "ηθουμε" or "ουμε". When the remaining stem is in
        /// <c>exc19</c>, the characters "ουμ" are written into the buffer directly after the stem
        /// and the length is extended by three.
        /// </summary>
        /// <param name="s"> input buffer; may be modified in place </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Rule19(char[] s, int len)
        {
            int suffixLength;

            if (len > 6 && (StemmerUtil.EndsWith(s, len, "ησουμε") || StemmerUtil.EndsWith(s, len, "ηθουμε")))
            {
                suffixLength = 6;
            }
            else if (len > 4 && StemmerUtil.EndsWith(s, len, "ουμε"))
            {
                suffixLength = 4;
            }
            else
            {
                return len;
            }

            int stem = len - suffixLength;
            if (!exc19.Contains(s, 0, stem))
            {
                return stem;
            }

            // The buffer is overwritten explicitly because the six-character suffixes do not
            // start with "ουμ", so simply extending the length would expose the wrong characters.
            s[stem] = 'ο';
            s[stem + 1] = 'υ';
            s[stem + 2] = 'μ';
            return stem + 3;
        }
Beispiel #9
0
        /// <summary>
        /// Strips a trailing "ουσεσ", "ουσα" or "ουσε". The leading -ουσ is kept when the remaining
        /// stem is in <c>exc14</c>, satisfies <c>EndsWithVowel</c>, or ends with one of a fixed
        /// list of endings.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Rule14(char[] s, int len)
        {
            int suffixLength;

            if (len > 5 && StemmerUtil.EndsWith(s, len, "ουσεσ"))
            {
                suffixLength = 5;
            }
            else if (len > 4 && (StemmerUtil.EndsWith(s, len, "ουσα") || StemmerUtil.EndsWith(s, len, "ουσε")))
            {
                suffixLength = 4;
            }
            else
            {
                return len;
            }

            int stem = len - suffixLength;

            bool keepPrefix = exc14.Contains(s, 0, stem)
                || EndsWithVowel(s, stem)
                || StemmerUtil.EndsWith(s, stem, "ποδαρ")
                || StemmerUtil.EndsWith(s, stem, "βλεπ")
                || StemmerUtil.EndsWith(s, stem, "πανταχ")
                || StemmerUtil.EndsWith(s, stem, "φρυδ")
                || StemmerUtil.EndsWith(s, stem, "μαντιλ")
                || StemmerUtil.EndsWith(s, stem, "μαλλ")
                || StemmerUtil.EndsWith(s, stem, "κυματ")
                || StemmerUtil.EndsWith(s, stem, "λαχ")
                || StemmerUtil.EndsWith(s, stem, "ληγ")
                || StemmerUtil.EndsWith(s, stem, "φαγ")
                || StemmerUtil.EndsWith(s, stem, "ομ")
                || StemmerUtil.EndsWith(s, stem, "πρωτ");

            // add back -ουσ
            return keepPrefix ? stem + 3 : stem;
        }
Beispiel #10
0
        /// <summary>
        /// Normalizes the end and the interior of a word:
        /// drops a final 'e', 'o' or 'u' from words longer than eight characters, then drops a
        /// final 'i' from words longer than four characters, and finally (if still longer than
        /// four) collapses runs of identical consecutive 'k', 'p' or 't' via
        /// <c>StemmerUtil.Delete</c>.
        /// </summary>
        /// <param name="s"> input buffer; passed to <c>StemmerUtil.Delete</c>, which presumably edits it in place </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Norm2(char[] s, int len)
        {
            if (len > 8)
            {
                switch (s[len - 1])
                {
                    case 'e':
                    case 'o':
                    case 'u':
                        len--;
                        break;
                }
            }

            if (len <= 4)
            {
                return len;
            }

            if (s[len - 1] == 'i')
            {
                len--;
            }

            if (len <= 4)
            {
                return len;
            }

            char previous = s[0];
            int pos = 1;
            while (pos < len)
            {
                bool doubled = s[pos] == previous
                    && (previous == 'k' || previous == 'p' || previous == 't');

                if (doubled)
                {
                    // Stay on the same index: after the deletion it has to be re-examined
                    // against the unchanged previous character.
                    len = StemmerUtil.Delete(s, pos, len);
                }
                else
                {
                    previous = s[pos];
                    pos++;
                }
            }

            return len;
        }
Beispiel #11
0
            /// <summary>
            /// Applies this step to a word. The word is returned unchanged when it is shorter than
            /// <c>m_min</c>, when <c>m_suffixes</c> is set and the word ends with none of them, or
            /// when no rule matches. Otherwise the first matching rule performs the replacement.
            /// </summary>
            /// <param name="s"> input buffer </param>
            /// <param name="len"> length of input buffer </param>
            /// <returns> new valid length of the string after applying the entire step. </returns>
            public virtual int Apply(char[] s, int len)
            {
                if (len < m_min)
                {
                    return len;
                }

                // Optional pre-filter: skip the rule scan unless some listed suffix is present.
                if (m_suffixes != null)
                {
                    bool hasKnownSuffix = false;

                    foreach (var suffix in m_suffixes)
                    {
                        if (StemmerUtil.EndsWith(s, len, suffix))
                        {
                            hasKnownSuffix = true;
                            break;
                        }
                    }

                    if (!hasKnownSuffix)
                    {
                        return len;
                    }
                }

                // Rules are tried in declaration order; only the first match is applied.
                foreach (var rule in m_rules)
                {
                    if (rule.Matches(s, len))
                    {
                        return rule.Replace(s, len);
                    }
                }

                return len;
            }
Beispiel #12
0
        /// <summary>
        /// Removes at most one case ending, trying longer endings first: four-character endings
        /// (word longer than six), three-character endings (longer than five), two-character
        /// endings (longer than four) and finally a single trailing character (longer than three).
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int RemoveCase(char[] s, int len)
        {
            bool fourCharEnding = len > 6
                && (StemmerUtil.EndsWith(s, len, "иями")
                    || StemmerUtil.EndsWith(s, len, "оями"));
            if (fourCharEnding)
            {
                return len - 4;
            }

            bool threeCharEnding = len > 5
                && (StemmerUtil.EndsWith(s, len, "иям")
                    || StemmerUtil.EndsWith(s, len, "иях")
                    || StemmerUtil.EndsWith(s, len, "оях")
                    || StemmerUtil.EndsWith(s, len, "ями")
                    || StemmerUtil.EndsWith(s, len, "оям")
                    || StemmerUtil.EndsWith(s, len, "оьв")
                    || StemmerUtil.EndsWith(s, len, "ами")
                    || StemmerUtil.EndsWith(s, len, "его")
                    || StemmerUtil.EndsWith(s, len, "ему")
                    || StemmerUtil.EndsWith(s, len, "ери")
                    || StemmerUtil.EndsWith(s, len, "ими")
                    || StemmerUtil.EndsWith(s, len, "ого")
                    || StemmerUtil.EndsWith(s, len, "ому")
                    || StemmerUtil.EndsWith(s, len, "ыми")
                    || StemmerUtil.EndsWith(s, len, "оев"));
            if (threeCharEnding)
            {
                return len - 3;
            }

            bool twoCharEnding = len > 4
                && (StemmerUtil.EndsWith(s, len, "ая")
                    || StemmerUtil.EndsWith(s, len, "яя")
                    || StemmerUtil.EndsWith(s, len, "ях")
                    || StemmerUtil.EndsWith(s, len, "юю")
                    || StemmerUtil.EndsWith(s, len, "ах")
                    || StemmerUtil.EndsWith(s, len, "ею")
                    || StemmerUtil.EndsWith(s, len, "их")
                    || StemmerUtil.EndsWith(s, len, "ия")
                    || StemmerUtil.EndsWith(s, len, "ию")
                    || StemmerUtil.EndsWith(s, len, "ьв")
                    || StemmerUtil.EndsWith(s, len, "ою")
                    || StemmerUtil.EndsWith(s, len, "ую")
                    || StemmerUtil.EndsWith(s, len, "ям")
                    || StemmerUtil.EndsWith(s, len, "ых")
                    || StemmerUtil.EndsWith(s, len, "ея")
                    || StemmerUtil.EndsWith(s, len, "ам")
                    || StemmerUtil.EndsWith(s, len, "ем")
                    || StemmerUtil.EndsWith(s, len, "ей")
                    || StemmerUtil.EndsWith(s, len, "ём")
                    || StemmerUtil.EndsWith(s, len, "ев")
                    || StemmerUtil.EndsWith(s, len, "ий")
                    || StemmerUtil.EndsWith(s, len, "им")
                    || StemmerUtil.EndsWith(s, len, "ое")
                    || StemmerUtil.EndsWith(s, len, "ой")
                    || StemmerUtil.EndsWith(s, len, "ом")
                    || StemmerUtil.EndsWith(s, len, "ов")
                    || StemmerUtil.EndsWith(s, len, "ые")
                    || StemmerUtil.EndsWith(s, len, "ый")
                    || StemmerUtil.EndsWith(s, len, "ым")
                    || StemmerUtil.EndsWith(s, len, "ми"));
            if (twoCharEnding)
            {
                return len - 2;
            }

            if (len <= 3)
            {
                return len;
            }

            // Single-character endings.
            char last = s[len - 1];
            bool oneCharEnding = last == 'а'
                || last == 'е'
                || last == 'и'
                || last == 'о'
                || last == 'у'
                || last == 'й'
                || last == 'ы'
                || last == 'я'
                || last == 'ь';

            return oneCharEnding ? len - 1 : len;
        }
Beispiel #13
0
        /// <summary>
        /// Stems a word by first dropping a possessive "s" (words longer than four characters)
        /// and then removing at most one ending. Endings paired with <c>useBokmaal</c> or
        /// <c>useNynorsk</c> are only considered when that flag is set; the remaining endings
        /// apply regardless of the flags.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> length of the stemmed word </returns>
        public virtual int stem(char[] s, int len)
        {
            // Remove possessive -s (bilens -> bilen) and continue checking
            if (len > 4 && s[len - 1] == 's')
            {
                len -= 1;
            }

            // Remove common endings, single-pass
            bool heterLike = len > 7
                && ((useBokmaal && StemmerUtil.EndsWith(s, len, "heter"))      // general ending (hemmelig-heter -> hemmelig)
                    || (useBokmaal && StemmerUtil.EndsWith(s, len, "heten"))   // general ending (hemmelig-heten -> hemmelig)
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "heita"))); // general ending (hemmeleg-heita -> hemmeleg)
            if (heterLike)
            {
                return len - 5;
            }

            // Remove Nynorsk common endings, single-pass
            bool nynorskSixLetter = len > 8 && useNynorsk
                && (StemmerUtil.EndsWith(s, len, "heiter")      // general ending (hemmeleg-heiter -> hemmeleg)
                    || StemmerUtil.EndsWith(s, len, "leiken")   // general ending (trygg-leiken -> trygg)
                    || StemmerUtil.EndsWith(s, len, "leikar")); // general ending (trygg-leikar -> trygg)
            if (nynorskSixLetter)
            {
                return len - 6;
            }

            bool domOrHet = len > 5
                && (StemmerUtil.EndsWith(s, len, "dom")                       // general ending (kristen-dom -> kristen)
                    || (useBokmaal && StemmerUtil.EndsWith(s, len, "het")));  // general ending (hemmelig-het -> hemmelig)
            if (domOrHet)
            {
                return len - 3;
            }

            bool nynorskFourLetter = len > 6 && useNynorsk
                && (StemmerUtil.EndsWith(s, len, "heit")      // general ending (hemmeleg-heit -> hemmeleg)
                    || StemmerUtil.EndsWith(s, len, "semd")   // general ending (verk-semd -> verk)
                    || StemmerUtil.EndsWith(s, len, "leik")); // general ending (trygg-leik -> trygg)
            if (nynorskFourLetter)
            {
                return len - 4;
            }

            bool elserLike = len > 7
                && (StemmerUtil.EndsWith(s, len, "elser")      // general ending (føl-elser -> føl)
                    || StemmerUtil.EndsWith(s, len, "elsen")); // general ending (føl-elsen -> føl)
            if (elserLike)
            {
                return len - 5;
            }

            bool fourLetter = len > 6
                && ((useBokmaal && StemmerUtil.EndsWith(s, len, "ende"))      // (sov-ende -> sov)
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "ande"))   // (sov-ande -> sov)
                    || StemmerUtil.EndsWith(s, len, "else")                   // general ending (føl-else -> føl)
                    || (useBokmaal && StemmerUtil.EndsWith(s, len, "este"))   // adj (fin-este -> fin)
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "aste"))   // adj (fin-aste -> fin)
                    || (useBokmaal && StemmerUtil.EndsWith(s, len, "eren"))   // masc
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "aren"))); // masc
            if (fourLetter)
            {
                return len - 4;
            }

            bool threeLetter = len > 5
                && ((useBokmaal && StemmerUtil.EndsWith(s, len, "ere"))       // adj (fin-ere -> fin)
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "are"))    // adj (fin-are -> fin)
                    || (useBokmaal && StemmerUtil.EndsWith(s, len, "est"))    // adj (fin-est -> fin)
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "ast"))    // adj (fin-ast -> fin)
                    || StemmerUtil.EndsWith(s, len, "ene")                    // masc/fem/neutr pl definite (hus-ene)
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "ane")));  // masc pl definite (gut-ane)
            if (threeLetter)
            {
                return len - 3;
            }

            bool twoLetter = len > 4
                && (StemmerUtil.EndsWith(s, len, "er")                        // masc/fem indefinite
                    || StemmerUtil.EndsWith(s, len, "en")                     // masc/fem definite
                    || StemmerUtil.EndsWith(s, len, "et")                     // neutr definite
                    || (useNynorsk && StemmerUtil.EndsWith(s, len, "ar"))     // masc pl indefinite
                    || (useBokmaal && StemmerUtil.EndsWith(s, len, "st"))     // adj (billig-st -> billig)
                    || StemmerUtil.EndsWith(s, len, "te"));
            if (twoLetter)
            {
                return len - 2;
            }

            if (len <= 3)
            {
                return len;
            }

            char last = s[len - 1];
            bool oneLetter = last == 'a'    // fem definite
                || last == 'e'              // to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
                || last == 'n';

            return oneLetter ? len - 1 : len;
        }
Beispiel #14
0
        /// <summary>
        /// Stem an input buffer of Sorani text. A postposition and a possessive pronoun are
        /// stripped first (each at most once); after that, the first matching article, plural,
        /// demonstrative or ezafe ending is removed.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> length of input buffer after stemming </returns>
        public virtual int Stem(char[] s, int len)
        {
            // postposition
            if (len > 5 && StemmerUtil.EndsWith(s, len, "دا"))
            {
                len -= 2;
            }
            else if (len > 4 && StemmerUtil.EndsWith(s, len, "نا"))
            {
                // only the final character is dropped for this ending
                len -= 1;
            }
            else if (len > 6 && StemmerUtil.EndsWith(s, len, "ەوە"))
            {
                len -= 3;
            }

            // possessive pronoun
            bool possessive = len > 6
                && (StemmerUtil.EndsWith(s, len, "مان")
                    || StemmerUtil.EndsWith(s, len, "یان")
                    || StemmerUtil.EndsWith(s, len, "تان"));
            if (possessive)
            {
                len -= 3;
            }

            // Every check below returns on its first match, so they are written as a flat
            // sequence; the order is significant.

            // indefinite singular ezafe
            if (len > 6 && StemmerUtil.EndsWith(s, len, "ێکی"))
            {
                return len - 3;
            }
            if (len > 7 && StemmerUtil.EndsWith(s, len, "یەکی"))
            {
                return len - 4;
            }

            // indefinite singular
            if (len > 5 && StemmerUtil.EndsWith(s, len, "ێک"))
            {
                return len - 2;
            }
            if (len > 6 && StemmerUtil.EndsWith(s, len, "یەک"))
            {
                return len - 3;
            }

            // definite singular
            if (len > 6 && StemmerUtil.EndsWith(s, len, "ەکە"))
            {
                return len - 3;
            }
            if (len > 5 && StemmerUtil.EndsWith(s, len, "کە"))
            {
                return len - 2;
            }

            // definite plural
            if (len > 7 && StemmerUtil.EndsWith(s, len, "ەکان"))
            {
                return len - 4;
            }
            if (len > 6 && StemmerUtil.EndsWith(s, len, "کان"))
            {
                return len - 3;
            }

            // indefinite plural ezafe
            if (len > 7 && StemmerUtil.EndsWith(s, len, "یانی"))
            {
                return len - 4;
            }
            if (len > 6 && StemmerUtil.EndsWith(s, len, "انی"))
            {
                return len - 3;
            }

            // indefinite plural
            if (len > 6 && StemmerUtil.EndsWith(s, len, "یان"))
            {
                return len - 3;
            }
            if (len > 5 && StemmerUtil.EndsWith(s, len, "ان"))
            {
                return len - 2;
            }

            // demonstrative plural
            if (len > 7 && StemmerUtil.EndsWith(s, len, "یانە"))
            {
                return len - 4;
            }
            if (len > 6 && StemmerUtil.EndsWith(s, len, "انە"))
            {
                return len - 3;
            }

            // demonstrative singular
            if (len > 5 && (StemmerUtil.EndsWith(s, len, "ایە") || StemmerUtil.EndsWith(s, len, "ەیە")))
            {
                return len - 2;
            }
            if (len > 4 && StemmerUtil.EndsWith(s, len, "ە"))
            {
                return len - 1;
            }

            // absolute singular ezafe
            if (len > 4 && StemmerUtil.EndsWith(s, len, "ی"))
            {
                return len - 1;
            }

            return len;
        }
Beispiel #15
0
 /// <summary>
 /// Checks whether the word ends with this rule's suffix and would still have at least
 /// <c>m_min</c> characters left once the suffix is removed.
 /// </summary>
 /// <param name="s"> input buffer </param>
 /// <param name="len"> length of input buffer </param>
 /// <returns> true if the word matches this rule. </returns>
 public virtual bool Matches(char[] s, int len)
 {
     int remaining = len - m_suffix.Length;
     if (remaining < m_min)
     {
         return false;
     }

     return StemmerUtil.EndsWith(s, len, m_suffix);
 }
        /// <summary>
        /// Removes at most one possessive ending, trying four-, three-, two- and one-character
        /// endings in that order. Several endings are only removed when the character just
        /// before them is (or is not) a vowel according to <c>IsVowel</c>.
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int RemovePossessive(char[] s, int len)
        {
            if (len > 6)
            {
                bool consonantBefore = !IsVowel(s[len - 5]);

                if (consonantBefore
                    && (StemmerUtil.EndsWith(s, len, "atok")
                        || StemmerUtil.EndsWith(s, len, "otok")
                        || StemmerUtil.EndsWith(s, len, "etek")))
                {
                    return len - 4;
                }

                if (StemmerUtil.EndsWith(s, len, "itek") || StemmerUtil.EndsWith(s, len, "itok"))
                {
                    return len - 4;
                }
            }

            if (len > 5)
            {
                bool vowelBefore = IsVowel(s[len - 4]);

                if (!vowelBefore
                    && (StemmerUtil.EndsWith(s, len, "unk")
                        || StemmerUtil.EndsWith(s, len, "tok")
                        || StemmerUtil.EndsWith(s, len, "tek")))
                {
                    return len - 3;
                }

                if (vowelBefore && StemmerUtil.EndsWith(s, len, "juk"))
                {
                    return len - 3;
                }

                if (StemmerUtil.EndsWith(s, len, "ink"))
                {
                    return len - 3;
                }
            }

            if (len > 4)
            {
                bool vowelBefore = IsVowel(s[len - 3]);

                if (!vowelBefore
                    && (StemmerUtil.EndsWith(s, len, "am")
                        || StemmerUtil.EndsWith(s, len, "em")
                        || StemmerUtil.EndsWith(s, len, "om")
                        || StemmerUtil.EndsWith(s, len, "ad")
                        || StemmerUtil.EndsWith(s, len, "ed")
                        || StemmerUtil.EndsWith(s, len, "od")
                        || StemmerUtil.EndsWith(s, len, "uk")))
                {
                    return len - 2;
                }

                if (vowelBefore
                    && (StemmerUtil.EndsWith(s, len, "nk")
                        || StemmerUtil.EndsWith(s, len, "ja")
                        || StemmerUtil.EndsWith(s, len, "je")))
                {
                    return len - 2;
                }

                if (StemmerUtil.EndsWith(s, len, "im")
                    || StemmerUtil.EndsWith(s, len, "id")
                    || StemmerUtil.EndsWith(s, len, "ik"))
                {
                    return len - 2;
                }
            }

            if (len > 3)
            {
                char last = s[len - 1];

                // 'a' / 'e' are dropped only after a non-vowel.
                if ((last == 'a' || last == 'e') && !IsVowel(s[len - 2]))
                {
                    return len - 1;
                }

                // 'm' / 'd' are dropped only after a vowel.
                if ((last == 'm' || last == 'd') && IsVowel(s[len - 2]))
                {
                    return len - 1;
                }

                // 'i' is always dropped.
                if (last == 'i')
                {
                    return len - 1;
                }
            }

            return len;
        }
        /// <summary>
        /// Removes at most one case ending, trying longer endings first. For "al"/"el" and for a
        /// final 'a'/'e', one extra character is removed when the ending follows a doubled
        /// non-vowel (per <c>IsVowel</c>).
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int RemoveCase(char[] s, int len)
        {
            if (len > 6 && StemmerUtil.EndsWith(s, len, "kent"))
            {
                return len - 4;
            }

            if (len > 5)
            {
                bool threeCharEnding = StemmerUtil.EndsWith(s, len, "nak")
                    || StemmerUtil.EndsWith(s, len, "nek")
                    || StemmerUtil.EndsWith(s, len, "val")
                    || StemmerUtil.EndsWith(s, len, "vel")
                    || StemmerUtil.EndsWith(s, len, "ert")
                    || StemmerUtil.EndsWith(s, len, "rol")
                    || StemmerUtil.EndsWith(s, len, "ban")
                    || StemmerUtil.EndsWith(s, len, "ben")
                    || StemmerUtil.EndsWith(s, len, "bol")
                    || StemmerUtil.EndsWith(s, len, "nal")
                    || StemmerUtil.EndsWith(s, len, "nel")
                    || StemmerUtil.EndsWith(s, len, "hoz")
                    || StemmerUtil.EndsWith(s, len, "hez")
                    || StemmerUtil.EndsWith(s, len, "tol");
                if (threeCharEnding)
                {
                    return len - 3;
                }

                // "al"/"el" after a doubled non-vowel: drop the ending plus one of the pair.
                bool alOrEl = StemmerUtil.EndsWith(s, len, "al") || StemmerUtil.EndsWith(s, len, "el");
                if (alOrEl && !IsVowel(s[len - 3]) && s[len - 3] == s[len - 4])
                {
                    return len - 3;
                }
            }

            if (len > 4)
            {
                bool twoCharEnding = StemmerUtil.EndsWith(s, len, "at")
                    || StemmerUtil.EndsWith(s, len, "et")
                    || StemmerUtil.EndsWith(s, len, "ot")
                    || StemmerUtil.EndsWith(s, len, "va")
                    || StemmerUtil.EndsWith(s, len, "ve")
                    || StemmerUtil.EndsWith(s, len, "ra")
                    || StemmerUtil.EndsWith(s, len, "re")
                    || StemmerUtil.EndsWith(s, len, "ba")
                    || StemmerUtil.EndsWith(s, len, "be")
                    || StemmerUtil.EndsWith(s, len, "ul")
                    || StemmerUtil.EndsWith(s, len, "ig");
                if (twoCharEnding)
                {
                    return len - 2;
                }

                bool onOrEn = StemmerUtil.EndsWith(s, len, "on") || StemmerUtil.EndsWith(s, len, "en");
                if (onOrEn && !IsVowel(s[len - 3]))
                {
                    return len - 2;
                }

                char last = s[len - 1];

                if (last == 't' || last == 'n')
                {
                    return len - 1;
                }

                // final 'a'/'e' after a doubled non-vowel: drop it plus one of the pair.
                if ((last == 'a' || last == 'e') && s[len - 2] == s[len - 3] && !IsVowel(s[len - 2]))
                {
                    return len - 2;
                }
            }

            return len;
        }
Beispiel #18
0
        /// <summary>
        /// Removes at most one ending, trying the rules for longer words first (more than 8, 6, 5
        /// and 4 characters). Some rules replace the first character of the ending with 's' in
        /// the buffer and keep it ("nnen", "ntena", "den", "ksen").
        /// </summary>
        /// <param name="s"> input buffer; may be modified in place </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int Step3(char[] s, int len)
        {
            if (len > 8)
            {
                if (StemmerUtil.EndsWith(s, len, "nnen"))
                {
                    int start = len - 4;
                    s[start] = 's';
                    return start + 1;
                }

                if (StemmerUtil.EndsWith(s, len, "ntena"))
                {
                    int start = len - 5;
                    s[start] = 's';
                    return start + 1;
                }

                if (StemmerUtil.EndsWith(s, len, "tten"))
                {
                    return len - 4;
                }

                if (StemmerUtil.EndsWith(s, len, "eiden"))
                {
                    return len - 5;
                }
            }

            if (len > 6)
            {
                bool fourCharEnding = StemmerUtil.EndsWith(s, len, "neen")
                    || StemmerUtil.EndsWith(s, len, "niin")
                    || StemmerUtil.EndsWith(s, len, "seen")
                    || StemmerUtil.EndsWith(s, len, "teen")
                    || StemmerUtil.EndsWith(s, len, "inen");
                if (fourCharEnding)
                {
                    return len - 4;
                }

                // 'h' + vowel + 'n'
                bool hVowelN = s[len - 3] == 'h' && IsVowel(s[len - 2]) && s[len - 1] == 'n';
                if (hVowelN)
                {
                    return len - 3;
                }

                if (StemmerUtil.EndsWith(s, len, "den"))
                {
                    int start = len - 3;
                    s[start] = 's';
                    return start + 1;
                }

                if (StemmerUtil.EndsWith(s, len, "ksen"))
                {
                    int start = len - 4;
                    s[start] = 's';
                    return start + 1;
                }

                bool threeCharEnding = StemmerUtil.EndsWith(s, len, "ssa")
                    || StemmerUtil.EndsWith(s, len, "sta")
                    || StemmerUtil.EndsWith(s, len, "lla")
                    || StemmerUtil.EndsWith(s, len, "lta")
                    || StemmerUtil.EndsWith(s, len, "tta")
                    || StemmerUtil.EndsWith(s, len, "ksi")
                    || StemmerUtil.EndsWith(s, len, "lle");
                if (threeCharEnding)
                {
                    return len - 3;
                }
            }

            if (len > 5)
            {
                if (StemmerUtil.EndsWith(s, len, "na") || StemmerUtil.EndsWith(s, len, "ne"))
                {
                    return len - 2;
                }

                if (StemmerUtil.EndsWith(s, len, "nei"))
                {
                    return len - 3;
                }
            }

            if (len > 4)
            {
                if (StemmerUtil.EndsWith(s, len, "ja") || StemmerUtil.EndsWith(s, len, "ta"))
                {
                    return len - 2;
                }

                char last = s[len - 1];

                if (last == 'a')
                {
                    return len - 1;
                }

                if (last == 'n')
                {
                    // vowel + 'n' loses both characters, a bare 'n' only itself
                    return IsVowel(s[len - 2]) ? len - 2 : len - 1;
                }
            }

            return len;
        }
Beispiel #19
0
        /// <summary>
        /// Removes at most one case ending, trying longer endings first: "atech" (word longer
        /// than seven), four-character endings (longer than six), three-character endings
        /// (longer than five), two-character endings (longer than four) and finally a single
        /// trailing vowel-like character (longer than three).
        /// </summary>
        /// <param name="s"> input buffer </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length </returns>
        private int RemoveCase(char[] s, int len)
        {
            if (len > 7 && StemmerUtil.EndsWith(s, len, "atech"))
            {
                return len - 5;
            }

            bool fourCharEnding = len > 6
                && (StemmerUtil.EndsWith(s, len, "ětem")
                    || StemmerUtil.EndsWith(s, len, "etem")
                    || StemmerUtil.EndsWith(s, len, "atům"));
            if (fourCharEnding)
            {
                return len - 4;
            }

            bool threeCharEnding = len > 5
                && (StemmerUtil.EndsWith(s, len, "ech")
                    || StemmerUtil.EndsWith(s, len, "ich")
                    || StemmerUtil.EndsWith(s, len, "ích")
                    || StemmerUtil.EndsWith(s, len, "ého")
                    || StemmerUtil.EndsWith(s, len, "ěmi")
                    || StemmerUtil.EndsWith(s, len, "emi")
                    || StemmerUtil.EndsWith(s, len, "ému")
                    || StemmerUtil.EndsWith(s, len, "ěte")
                    || StemmerUtil.EndsWith(s, len, "ete")
                    || StemmerUtil.EndsWith(s, len, "ěti")
                    || StemmerUtil.EndsWith(s, len, "eti")
                    || StemmerUtil.EndsWith(s, len, "ího")
                    || StemmerUtil.EndsWith(s, len, "iho")
                    || StemmerUtil.EndsWith(s, len, "ími")
                    || StemmerUtil.EndsWith(s, len, "ímu")
                    || StemmerUtil.EndsWith(s, len, "imu")
                    || StemmerUtil.EndsWith(s, len, "ách")
                    || StemmerUtil.EndsWith(s, len, "ata")
                    || StemmerUtil.EndsWith(s, len, "aty")
                    || StemmerUtil.EndsWith(s, len, "ých")
                    || StemmerUtil.EndsWith(s, len, "ama")
                    || StemmerUtil.EndsWith(s, len, "ami")
                    || StemmerUtil.EndsWith(s, len, "ové")
                    || StemmerUtil.EndsWith(s, len, "ovi")
                    || StemmerUtil.EndsWith(s, len, "ými"));
            if (threeCharEnding)
            {
                return len - 3;
            }

            bool twoCharEnding = len > 4
                && (StemmerUtil.EndsWith(s, len, "em")
                    || StemmerUtil.EndsWith(s, len, "es")
                    || StemmerUtil.EndsWith(s, len, "ém")
                    || StemmerUtil.EndsWith(s, len, "ím")
                    || StemmerUtil.EndsWith(s, len, "ům")
                    || StemmerUtil.EndsWith(s, len, "at")
                    || StemmerUtil.EndsWith(s, len, "ám")
                    || StemmerUtil.EndsWith(s, len, "os")
                    || StemmerUtil.EndsWith(s, len, "us")
                    || StemmerUtil.EndsWith(s, len, "ým")
                    || StemmerUtil.EndsWith(s, len, "mi")
                    || StemmerUtil.EndsWith(s, len, "ou"));
            if (twoCharEnding)
            {
                return len - 2;
            }

            if (len <= 3)
            {
                return len;
            }

            // Single-character endings.
            char last = s[len - 1];
            bool oneCharEnding = last == 'a'
                || last == 'e'
                || last == 'i'
                || last == 'o'
                || last == 'u'
                || last == 'ů'
                || last == 'y'
                || last == 'á'
                || last == 'é'
                || last == 'í'
                || last == 'ý'
                || last == 'ě';

            return oneCharEnding ? len - 1 : len;
        }
Beispiel #20
0
        /// <summary>
        /// Stems the word held in <paramref name="s"/> in place: a fixed, ordered list of
        /// suffix rules is tried, the first matching rule rewrites or strips the suffix, and
        /// the result is passed through <c>Norm</c>. Every exit path returns the value of <c>Norm</c>.
        /// NOTE(review): the suffixes ("issement", "atrice", "ième", ...) look French, but the
        /// enclosing class is not visible here -- confirm the target language.
        /// </summary>
        /// <param name="s"> buffer holding the word; its characters may be overwritten </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> length of the stemmed word </returns>
        public virtual int Stem(char[] s, int len)
        {
            // Trailing 'x' on longer words: "aux" (but not "eaux") is rewritten to "al" first,
            // then the 'x' is dropped.
            if (len > 5 && s[len - 1] == 'x')
            {
                if (s[len - 3] == 'a' && s[len - 2] == 'u' && s[len - 4] != 'e')
                {
                    s[len - 2] = 'l';
                }
                len--;
            }

            // Drop a remaining trailing 'x' ...
            if (len > 3 && s[len - 1] == 'x')
            {
                len--;
            }

            // ... and a trailing 's'.
            if (len > 3 && s[len - 1] == 's')
            {
                len--;
            }

            // "issement" -> "ir": keep the leading "is" and overwrite the 's' with 'r'.
            if (len > 9 && StemmerUtil.EndsWith(s, len, "issement"))
            {
                len       -= 6;
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "issant" -> "ir"
            if (len > 8 && StemmerUtil.EndsWith(s, len, "issant"))
            {
                len       -= 4;
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "ement" -> "e"; if what remains ends in "ive" it is further rewritten to "if".
            if (len > 6 && StemmerUtil.EndsWith(s, len, "ement"))
            {
                len -= 4;
                if (len > 3 && StemmerUtil.EndsWith(s, len, "ive"))
                {
                    len--;
                    s[len - 1] = 'f';
                }
                return(Norm(s, len));
            }

            // "ficatrice" -> "fier": keep "fica" and overwrite "ca" with "er".
            if (len > 11 && StemmerUtil.EndsWith(s, len, "ficatrice"))
            {
                len       -= 5;
                s[len - 2] = 'e';
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "ficateur" -> "fier"
            if (len > 10 && StemmerUtil.EndsWith(s, len, "ficateur"))
            {
                len       -= 4;
                s[len - 2] = 'e';
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "catrice" -> "quer": keep "catr" and overwrite "cat" with "que".
            if (len > 9 && StemmerUtil.EndsWith(s, len, "catrice"))
            {
                len       -= 3;
                s[len - 4] = 'q';
                s[len - 3] = 'u';
                s[len - 2] = 'e';
                //s[len-1] = 'r' <-- unnecessary, already 'r'.
                return(Norm(s, len));
            }

            // "cateur" -> "quer"
            if (len > 8 && StemmerUtil.EndsWith(s, len, "cateur"))
            {
                len       -= 2;
                s[len - 4] = 'q';
                s[len - 3] = 'u';
                s[len - 2] = 'e';
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "atrice" -> "er"
            if (len > 8 && StemmerUtil.EndsWith(s, len, "atrice"))
            {
                len       -= 4;
                s[len - 2] = 'e';
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "ateur" -> "er"
            if (len > 7 && StemmerUtil.EndsWith(s, len, "ateur"))
            {
                len       -= 3;
                s[len - 2] = 'e';
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "trice" -> "teur". There is no return here, so the rewritten word continues
            // through the rules below (the "teur" rule can then fire on it).
            if (len > 6 && StemmerUtil.EndsWith(s, len, "trice"))
            {
                len--;
                s[len - 3] = 'e';
                s[len - 2] = 'u';
                s[len - 1] = 'r';
            }

            // "ième" is removed entirely.
            if (len > 5 && StemmerUtil.EndsWith(s, len, "ième"))
            {
                return(Norm(s, len - 4));
            }

            // "teuse" -> "ter"
            if (len > 7 && StemmerUtil.EndsWith(s, len, "teuse"))
            {
                len       -= 2;
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "teur" -> "ter"
            if (len > 6 && StemmerUtil.EndsWith(s, len, "teur"))
            {
                len--;
                s[len - 1] = 'r';
                return(Norm(s, len));
            }

            // "euse" -> "eu"
            if (len > 5 && StemmerUtil.EndsWith(s, len, "euse"))
            {
                return(Norm(s, len - 2));
            }

            // "ère" -> "er"
            if (len > 8 && StemmerUtil.EndsWith(s, len, "ère"))
            {
                len--;
                s[len - 2] = 'e';
                return(Norm(s, len));
            }

            // "ive" -> "if"
            if (len > 7 && StemmerUtil.EndsWith(s, len, "ive"))
            {
                len--;
                s[len - 1] = 'f';
                return(Norm(s, len));
            }

            // "folle" -> "fou", "molle" -> "mou"
            if (len > 4 && (StemmerUtil.EndsWith(s, len, "folle") || StemmerUtil.EndsWith(s, len, "molle")))
            {
                len       -= 2;
                s[len - 1] = 'u';
                return(Norm(s, len));
            }

            // "nnelle" -> "n"
            if (len > 9 && StemmerUtil.EndsWith(s, len, "nnelle"))
            {
                return(Norm(s, len - 5));
            }

            // "nnel" -> "n"
            if (len > 9 && StemmerUtil.EndsWith(s, len, "nnel"))
            {
                return(Norm(s, len - 3));
            }

            // "ète" -> "et"; no return, so the rules below still apply to the result.
            if (len > 4 && StemmerUtil.EndsWith(s, len, "ète"))
            {
                len--;
                s[len - 2] = 'e';
            }

            // "ique" is removed; no return, so the rules below still apply to the result.
            if (len > 8 && StemmerUtil.EndsWith(s, len, "ique"))
            {
                len -= 4;
            }

            // "esse" -> "e"
            if (len > 8 && StemmerUtil.EndsWith(s, len, "esse"))
            {
                return(Norm(s, len - 3));
            }

            // "inage" -> "in"
            if (len > 7 && StemmerUtil.EndsWith(s, len, "inage"))
            {
                return(Norm(s, len - 3));
            }

            // "isation" is removed; a remaining "ual" ending is rewritten to "uel".
            if (len > 9 && StemmerUtil.EndsWith(s, len, "isation"))
            {
                len -= 7;
                if (len > 5 && StemmerUtil.EndsWith(s, len, "ual"))
                {
                    s[len - 2] = 'e';
                }
                return(Norm(s, len));
            }

            // "isateur" is removed entirely.
            if (len > 9 && StemmerUtil.EndsWith(s, len, "isateur"))
            {
                return(Norm(s, len - 7));
            }

            // "ation" is removed entirely.
            if (len > 8 && StemmerUtil.EndsWith(s, len, "ation"))
            {
                return(Norm(s, len - 5));
            }

            // "ition" is removed entirely.
            if (len > 8 && StemmerUtil.EndsWith(s, len, "ition"))
            {
                return(Norm(s, len - 5));
            }

            // No suffix rule returned early: normalize whatever is left.
            return(Norm(s, len));
        }
Beispiel #21
0
        /// <summary>
        /// Normalizes a word in place. For words longer than four characters, accented
        /// letters are folded to their plain counterparts and runs of the same letter are
        /// collapsed to one. Afterwards a trailing "ie", then a trailing 'r', up to two
        /// trailing 'e's and a doubled final letter are trimmed, each only while the
        /// length checks below allow it.
        /// </summary>
        /// <param name="s"> buffer holding the word; modified in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> length of the normalized word </returns>
        private int Norm(char[] s, int len)
        {
            if (len > 4)
            {
                // Fold accented letters onto their unaccented base letter.
                for (int pos = 0; pos < len; pos++)
                {
                    char c = s[pos];
                    if (c == 'à' || c == 'á' || c == 'â')
                    {
                        s[pos] = 'a';
                    }
                    else if (c == 'ô')
                    {
                        s[pos] = 'o';
                    }
                    else if (c == 'è' || c == 'é' || c == 'ê')
                    {
                        s[pos] = 'e';
                    }
                    else if (c == 'ù' || c == 'û')
                    {
                        s[pos] = 'u';
                    }
                    else if (c == 'î')
                    {
                        s[pos] = 'i';
                    }
                    else if (c == 'ç')
                    {
                        s[pos] = 'c';
                    }
                }

                // Collapse consecutive repeats of the same letter. After a deletion the
                // index is not advanced, so the character that slid into this slot is
                // compared against the same predecessor.
                char previous = s[0];
                int index = 1;
                while (index < len)
                {
                    if (s[index] == previous && char.IsLetter(previous))
                    {
                        len = StemmerUtil.Delete(s, index, len);
                    }
                    else
                    {
                        previous = s[index];
                        index++;
                    }
                }
            }

            if (len > 4 && StemmerUtil.EndsWith(s, len, "ie"))
            {
                len -= 2;
            }

            if (len <= 4)
            {
                return len;
            }

            // The four trims below are evaluated in sequence, each on the already-shortened word.
            if (s[len - 1] == 'r')
            {
                len -= 1;
            }

            if (s[len - 1] == 'e')
            {
                len -= 1;
            }

            if (s[len - 1] == 'e')
            {
                len -= 1;
            }

            bool doubledFinalLetter = s[len - 1] == s[len - 2] && char.IsLetter(s[len - 1]);
            if (doubledFinalLetter)
            {
                len -= 1;
            }

            return len;
        }
Beispiel #22
0
        // Suffixes removed by Rule21, longest first: entry k holds the suffixes that are
        // (9 - k) characters long.
        private static readonly string[][] Rule21SuffixesByLength =
        {
            new string[] { "ιοντουσαν" },
            new string[] { "ιομασταν", "ιοσασταν", "ιουμαστε", "οντουσαν" },
            new string[]
            {
                "ιεμαστε", "ιεσαστε", "ιομουνα", "ιοσαστε", "ιοσουνα", "ιουνται",
                "ιουνταν", "ηθηκατε", "ομασταν", "οσασταν", "ουμαστε"
            },
            new string[]
            {
                "ιομουν", "ιονταν", "ιοσουν", "ηθειτε", "ηθηκαν", "ομουνα",
                "οσαστε", "οσουνα", "ουνται", "ουνταν", "ουσατε"
            },
            new string[]
            {
                "αγατε", "ιεμαι", "ιεται", "ιεσαι", "ιοταν", "ιουμα", "ηθεισ", "ηθουν", "ηκατε",
                "ησατε", "ησουν", "ομουν", "ονται", "ονταν", "οσουν", "ουμαι", "ουσαν"
            },
            new string[]
            {
                "αγαν", "αμαι", "ασαι", "αται", "ειτε", "εσαι", "εται", "ηδεσ",
                "ηδων", "ηθει", "ηκαν", "ησαν", "ησει", "ησεσ", "ομαι", "οταν"
            },
            new string[] { "αει", "εισ", "ηθω", "ησω", "ουν", "ουσ" },
            new string[] { "αν", "ασ", "αω", "ει", "εσ", "ησ", "οι", "οσ", "ου", "υσ", "ων" }
        };

        /// <summary>
        /// Strips the longest matching suffix from a fixed list (nine characters down to
        /// two). A suffix is only removed when the word is strictly longer than the suffix.
        /// If none matches, a single trailing vowel (as judged by <c>EndsWithVowel</c>) is removed.
        /// </summary>
        /// <param name="s"> buffer holding the word </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> the new length, or <paramref name="len"/> when nothing was removed </returns>
        private int Rule21(char[] s, int len)
        {
            int suffixLength = 9;
            foreach (string[] group in Rule21SuffixesByLength)
            {
                if (len > suffixLength)
                {
                    foreach (string suffix in group)
                    {
                        if (StemmerUtil.EndsWith(s, len, suffix))
                        {
                            return len - suffixLength;
                        }
                    }
                }

                suffixLength--;
            }

            if (len > 1 && EndsWithVowel(s, len))
            {
                return len - 1;
            }

            return len;
        }
Beispiel #23
0
        /// <summary>
        /// Stems a Bulgarian word in place: strips the "ища" ending, otherwise removes the
        /// article and plural endings via the helper methods and then trims or rewrites
        /// the remaining final letters.
        /// </summary>
        /// <param name="s"> input buffer; may be modified </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> length of the stemmed word </returns>
        public virtual int Stem(char[] s, int len)
        {
            // Words shorter than four characters are left untouched.
            if (len < 4)
            {
                return len;
            }

            bool endsWithIshta = len > 5 && StemmerUtil.EndsWith(s, len, "ища");
            if (endsWithIshta)
            {
                return len - 3;
            }

            // Article first, then plural, each working on the already-shortened word.
            len = RemovePlural(s, RemoveArticle(s, len));

            if (len > 3)
            {
                if (StemmerUtil.EndsWith(s, len, "я"))
                {
                    len -= 1;
                }

                bool endsWithVowel = StemmerUtil.EndsWith(s, len, "а")
                                     || StemmerUtil.EndsWith(s, len, "о")
                                     || StemmerUtil.EndsWith(s, len, "е");
                if (endsWithVowel)
                {
                    len -= 1;
                }
            }

            // The rule rewriting ен -> н is duplicated in the paper; the Perl
            // implementation referenced by the paper fixes this, and so does this code.
            if (len > 4 && StemmerUtil.EndsWith(s, len, "ен"))
            {
                s[len - 2] = 'н';
                len -= 1;
            }

            // ъN -> N: the final character overwrites the ъ before it.
            if (len > 5 && s[len - 2] == 'ъ')
            {
                s[len - 2] = s[len - 1];
                len -= 1;
            }

            return len;
        }
Beispiel #24
0
        /// <summary>
        /// Removes or rewrites a plural ending. Rules are tried in a fixed order, longest
        /// words and endings first; the first rule that matches decides the result.
        /// </summary>
        /// <param name="s"> input buffer; a character may be overwritten </param>
        /// <param name="len"> length of input buffer </param>
        /// <returns> new length, or <paramref name="len"/> when no rule matched </returns>
        private int RemovePlural(char[] s, int len)
        {
            // Rules for words longer than six characters.
            if (len > 6 && StemmerUtil.EndsWith(s, len, "овци"))
            {
                return len - 3; // only the leading о of the ending is kept
            }

            if (len > 6 && StemmerUtil.EndsWith(s, len, "ове"))
            {
                return len - 3;
            }

            if (len > 6 && StemmerUtil.EndsWith(s, len, "еве"))
            {
                s[len - 3] = 'й'; // the ending becomes й
                return len - 2;
            }

            // Rules for words longer than five characters.
            if (len > 5 && StemmerUtil.EndsWith(s, len, "ища"))
            {
                return len - 3;
            }

            if (len > 5 && StemmerUtil.EndsWith(s, len, "та"))
            {
                return len - 2;
            }

            if (len > 5 && StemmerUtil.EndsWith(s, len, "ци"))
            {
                s[len - 2] = 'к'; // ци -> к
                return len - 1;
            }

            if (len > 5 && StemmerUtil.EndsWith(s, len, "зи"))
            {
                s[len - 2] = 'г'; // зи -> г
                return len - 1;
            }

            if (len > 5 && s[len - 3] == 'е' && s[len - 1] == 'и')
            {
                s[len - 3] = 'я'; // е becomes я and the final и is dropped
                return len - 1;
            }

            // Rules for words longer than four characters.
            if (len > 4 && StemmerUtil.EndsWith(s, len, "си"))
            {
                s[len - 2] = 'х'; // си -> х
                return len - 1;
            }

            if (len > 4 && StemmerUtil.EndsWith(s, len, "и"))
            {
                return len - 1;
            }

            return len;
        }
Beispiel #25
0
        // Suffixes removed by Stem, longest first: entry k holds the suffixes that are
        // (5 - k) characters long.
        private static readonly string[][] StemSuffixesByLength =
        {
            new string[] { "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां" },
            new string[]
            {
                "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी",
                "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"
            },
            new string[]
            {
                "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना",
                "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"
            },
            new string[]
            {
                "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना",
                "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"
            },
            new string[] { "ो", "े", "ू", "ु", "ी", "ि", "ा" }
        };

        /// <summary>
        /// Strips the longest matching suffix from a fixed list (five characters down to
        /// one). A suffix of length n is only removed when the word is longer than n + 1
        /// characters. The buffer itself is not modified.
        /// </summary>
        /// <param name="buffer"> buffer holding the word </param>
        /// <param name="len"> number of valid characters in <paramref name="buffer"/> </param>
        /// <returns> the new length, or <paramref name="len"/> when no suffix matched </returns>
        public virtual int Stem(char[] buffer, int len)
        {
            for (int group = 0; group < StemSuffixesByLength.Length; group++)
            {
                int suffixLength = 5 - group;
                if (len <= suffixLength + 1)
                {
                    continue; // word too short for this suffix length
                }

                foreach (string suffix in StemSuffixesByLength[group])
                {
                    if (StemmerUtil.EndsWith(buffer, len, suffix))
                    {
                        return len - suffixLength;
                    }
                }
            }

            return len;
        }
Beispiel #26
0
        /// <summary>
        /// Removes a trailing "kah", "lah" or "pun" and, when one was found, decrements
        /// <c>numSyllables</c>.
        /// </summary>
        /// <param name="text"> buffer holding the word </param>
        /// <param name="length"> number of valid characters in <paramref name="text"/> </param>
        /// <returns> the new length, or <paramref name="length"/> when no particle was found </returns>
        private int RemoveParticle(char[] text, int length)
        {
            bool hasParticle = StemmerUtil.EndsWith(text, length, "kah")
                               || StemmerUtil.EndsWith(text, length, "lah")
                               || StemmerUtil.EndsWith(text, length, "pun");
            if (!hasParticle)
            {
                return length;
            }

            numSyllables--;
            return length - 3;
        }
Beispiel #27
0
        /// <summary>
        /// Removes one leading prefix ("meng", "meny", "men", "mem", "me", "peng", "peny",
        /// "pen", "pem", "di", "ter" or "ke"), testing them in that priority order. When a
        /// prefix is found, the matching REMOVED_* bit is set in <c>flags</c>,
        /// <c>numSyllables</c> is decremented and the prefix characters are deleted from
        /// the front of the buffer. In three cases a prefix followed by a vowel keeps its
        /// last character, rewritten to 's' or 't'.
        /// </summary>
        /// <param name="text"> buffer holding the word; modified in place </param>
        /// <param name="length"> number of valid characters in <paramref name="text"/> </param>
        /// <returns> the new length, or <paramref name="length"/> when no prefix matched </returns>
        private int RemoveFirstOrderPrefix(char[] text, int length)
        {
            int charsToDelete;

            if (StemmerUtil.StartsWith(text, length, "meng"))
            {
                flags |= REMOVED_MENG;
                charsToDelete = 4;
            }
            else if (StemmerUtil.StartsWith(text, length, "meny") && length > 4 && IsVowel(text[4]))
            {
                // "meny" + vowel: the 'y' is rewritten to 's' and kept.
                flags |= REMOVED_MENG;
                text[3] = 's';
                charsToDelete = 3;
            }
            else if (StemmerUtil.StartsWith(text, length, "men") || StemmerUtil.StartsWith(text, length, "mem"))
            {
                flags |= REMOVED_MENG;
                charsToDelete = 3;
            }
            else if (StemmerUtil.StartsWith(text, length, "me"))
            {
                flags |= REMOVED_MENG;
                charsToDelete = 2;
            }
            else if (StemmerUtil.StartsWith(text, length, "peng"))
            {
                flags |= REMOVED_PENG;
                charsToDelete = 4;
            }
            else if (StemmerUtil.StartsWith(text, length, "peny") && length > 4 && IsVowel(text[4]))
            {
                // "peny" + vowel: the 'y' is rewritten to 's' and kept.
                flags |= REMOVED_PENG;
                text[3] = 's';
                charsToDelete = 3;
            }
            else if (StemmerUtil.StartsWith(text, length, "peny"))
            {
                flags |= REMOVED_PENG;
                charsToDelete = 4;
            }
            else if (StemmerUtil.StartsWith(text, length, "pen") && length > 3 && IsVowel(text[3]))
            {
                // "pen" + vowel: the 'n' is rewritten to 't' and kept.
                flags |= REMOVED_PENG;
                text[2] = 't';
                charsToDelete = 2;
            }
            else if (StemmerUtil.StartsWith(text, length, "pen") || StemmerUtil.StartsWith(text, length, "pem"))
            {
                flags |= REMOVED_PENG;
                charsToDelete = 3;
            }
            else if (StemmerUtil.StartsWith(text, length, "di"))
            {
                flags |= REMOVED_DI;
                charsToDelete = 2;
            }
            else if (StemmerUtil.StartsWith(text, length, "ter"))
            {
                flags |= REMOVED_TER;
                charsToDelete = 3;
            }
            else if (StemmerUtil.StartsWith(text, length, "ke"))
            {
                flags |= REMOVED_KE;
                charsToDelete = 2;
            }
            else
            {
                // No first-order prefix present: nothing to record or delete.
                return length;
            }

            numSyllables--;
            return StemmerUtil.DeleteN(text, 0, length, charsToDelete);
        }
Beispiel #28
0
        /// <summary>
        /// Removes a trailing "ov", "in" or "ův" from words longer than five characters.
        /// </summary>
        /// <param name="s"> buffer holding the word </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> the new length, or <paramref name="len"/> when nothing was removed </returns>
        private int RemovePossessives(char[] s, int len)
        {
            if (len <= 5)
            {
                return len;
            }

            bool hasPossessiveEnding = StemmerUtil.EndsWith(s, len, "ov")
                                       || StemmerUtil.EndsWith(s, len, "in")
                                       || StemmerUtil.EndsWith(s, len, "ův");
            return hasPossessiveEnding ? len - 2 : len;
        }
Beispiel #29
0
        /// <summary>
        /// Removes a trailing genitive 's' and then at most one further ending:
        /// "ene" (or "ane" when <c>useNynorsk</c> is set), then "er"/"en"/"et" (or "ar"
        /// when <c>useNynorsk</c> is set), then a final 'a' or 'e'. The buffer itself is
        /// not modified.
        /// </summary>
        /// <param name="s"> buffer holding the word </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> length of the stemmed word </returns>
        public virtual int Stem(char[] s, int len)
        {
            // Remove genitive s
            if (len > 4 && s[len - 1] == 's')
            {
                len -= 1;
            }

            if (len > 5)
            {
                // "ene": masc/fem/neutr pl definite (hus-ene)
                // "ane": masc pl definite (gut-ane)
                bool pluralDefinite = StemmerUtil.EndsWith(s, len, "ene")
                                      || (StemmerUtil.EndsWith(s, len, "ane") && useNynorsk);
                if (pluralDefinite)
                {
                    return len - 3;
                }
            }

            if (len > 4)
            {
                // Original notes for these endings: masc pl indefinite, neutr definite,
                // masc/fem definite, masc/fem indefinite.
                bool twoLetterEnding = StemmerUtil.EndsWith(s, len, "er")
                                       || StemmerUtil.EndsWith(s, len, "en")
                                       || StemmerUtil.EndsWith(s, len, "et")
                                       || (StemmerUtil.EndsWith(s, len, "ar") && useNynorsk);
                if (twoLetterEnding)
                {
                    return len - 2;
                }
            }

            if (len > 3)
            {
                // 'a': fem definite
                // 'e': to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
                char last = s[len - 1];
                if (last == 'a' || last == 'e')
                {
                    return len - 1;
                }
            }

            return len;
        }
        /// <summary>
        /// Advances the wrapped input and, if a token is available, rewrites its term
        /// buffer in place: 'ä', 'ö', 'ü' become 'a', 'o', 'u'; 'ß' becomes "ss" (the
        /// buffer is grown by one character); and an 'e' is deleted when the running
        /// state is <c>U</c>, which is set by 'a', 'o', or by a 'u' seen in state <c>N</c>.
        /// </summary>
        /// <returns> true when a token was produced, false when the input is exhausted </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.Buffer();
            int    length = termAtt.Length;
            int    state  = N;
            int    i      = 0;

            while (i < length)
            {
                switch (buffer[i])
                {
                    case 'a':
                    case 'o':
                        state = U;
                        break;

                    case 'u':
                        if (state == N)
                        {
                            state = U;
                        }
                        else
                        {
                            state = V;
                        }
                        break;

                    case 'e':
                        if (state == U)
                        {
                            // Drop this 'e'; step back so the shared increment below
                            // lands on the character that moved into this slot.
                            length = StemmerUtil.Delete(buffer, i, length);
                            i--;
                        }
                        state = V;
                        break;

                    case 'i':
                    case 'q':
                    case 'y':
                        state = V;
                        break;

                    case 'ä':
                        buffer[i] = 'a';
                        state = V;
                        break;

                    case 'ö':
                        buffer[i] = 'o';
                        state = V;
                        break;

                    case 'ü':
                        buffer[i] = 'u';
                        state = V;
                        break;

                    case 'ß':
                        // First 's' replaces the 'ß'; then room is made for a second one.
                        buffer[i] = 's';
                        i++;
                        buffer = termAtt.ResizeBuffer(length + 1);
                        if (i < length)
                        {
                            // Shift the rest of the term one slot to the right.
                            Array.Copy(buffer, i, buffer, i + 1, length - i);
                        }
                        buffer[i] = 's';
                        length++;
                        state = N;
                        break;

                    default:
                        state = N;
                        break;
                }

                i++;
            }

            termAtt.Length = length;
            return true;
        }