Пример #1
0
        /// <summary>
        /// Normalizes a buffer of Persian text in place.
        /// <para>
        /// FARSI_YEH and YEH_BARREE become YEH, KEHEH becomes KAF, HEH_YEH and
        /// HEH_GOAL become HEH, and every HAMZA_ABOVE is removed from the buffer.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the text; rewritten in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> number of valid characters left after normalization </returns>
        public virtual int Normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                char current = s[pos];

                if (current == HAMZA_ABOVE)
                {
                    // necessary for HEH + HAMZA: drop the mark and re-examine
                    // whatever character now occupies this index.
                    len = StemmerUtil.Delete(s, pos, len);
                    continue;
                }

                if (current == FARSI_YEH || current == YEH_BARREE)
                {
                    s[pos] = YEH;
                }
                else if (current == KEHEH)
                {
                    s[pos] = KAF;
                }
                else if (current == HEH_YEH || current == HEH_GOAL)
                {
                    s[pos] = HEH;
                }

                pos++;
            }

            return len;
        }
Пример #2
0
 /// <summary>
 /// Advances the wrapped stream by one token and folds width variants in its
 /// term buffer: characters in U+FF01..U+FF5E are shifted down by 0xFEE0, and
 /// characters in U+FF65..U+FF9F are replaced through the KANA_NORM table.
 /// A U+FF9E / U+FF9F that Combine() manages to merge into the preceding
 /// character is removed instead of being mapped.
 /// </summary>
 /// <returns> true if a token was produced, false once the input is exhausted </returns>
 public override bool IncrementToken()
 {
     if (!m_input.IncrementToken())
     {
         return false;
     }

     char[] chars = termAtt.Buffer;
     int    count = termAtt.Length;
     int    pos   = 0;

     while (pos < count)
     {
         char c = chars[pos];

         if (c >= 0xFF01 && c <= 0xFF5E)
         {
             // Fullwidth ASCII variants
             chars[pos] = (char)(c - 0xFEE0);
         }
         else if (c >= 0xFF65 && c <= 0xFF9F)
         {
             // Halfwidth Katakana variants
             bool isSoundMark = c == 0xFF9E || c == 0xFF9F;

             // Combine() is only consulted for a sound mark that has a predecessor.
             if (isSoundMark && pos > 0 && Combine(chars, pos, c))
             {
                 // The mark was merged into the previous character; remove it
                 // and look at the same index again on the next pass.
                 count = StemmerUtil.Delete(chars, pos, count);
                 continue;
             }

             chars[pos] = KANA_NORM[c - 0xFF65];
         }

         pos++;
     }

     termAtt.Length = count;
     return true;
 }
Пример #3
0
        /// <summary>
        /// Compose into standard form any compositions in the decompositions table.
        /// <para>
        /// Looks at the one or two characters following <paramref name="pos"/> and,
        /// when a table row matches, overwrites <c>text[pos]</c> with the composed
        /// character and deletes the characters that were consumed.
        /// </para>
        /// <para>
        /// Table row layout as used here: [0] first character, [1] second
        /// character, [2] third character (negative means "no third character"),
        /// [3] composed result, [4] flag mask tested against <c>sd.flag</c>.
        /// Character columns are offsets relative to <c>sd.@base</c>.
        /// </para>
        /// </summary>
        /// <param name="ch0"> value matched against column 0 of the table
        /// (presumably <c>text[pos] - sd.@base</c>; confirm against the caller) </param>
        /// <param name="block0"> block the following character must also belong to </param>
        /// <param name="sd"> script data supplying the base offset and the flag </param>
        /// <param name="text"> buffer being composed; modified in place </param>
        /// <param name="pos"> index of the first character of the candidate sequence </param>
        /// <param name="len"> number of valid characters in <paramref name="text"/> </param>
        /// <returns> number of valid characters after any composition </returns>
        private int Compose(int ch0, Regex block0, ScriptData sd, char[] text, int pos, int len)
        {
            // need at least 2 chars!
            if (pos + 1 >= len)
            {
                return len;
            }

            char second = text[pos + 1];
            int  ch1    = second - sd.@base;
            var  block1 = GetBlockForChar(second);

            // needs to be the same writing system
            if (block1 != block0)
            {
                return len;
            }

            int ch2 = -1;
            if (pos + 2 < len)
            {
                char third  = text[pos + 2];
                var  block2 = GetBlockForChar(third);

                // ZWJ is represented as 0xFF; a third character from another
                // block is ignored so that a 2-char match is still allowed.
                ch2 = third == '\u200D'
                    ? 0xFF
                    : (block2 != block1 ? -1 : third - sd.@base);
            }

            foreach (var row in decompositions)
            {
                if (row[0] != ch0 || (row[4] & (int)sd.flag) == 0)
                {
                    continue;
                }

                bool thirdMatches = row[2] < 0 || row[2] == ch2;
                if (row[1] != ch1 || !thirdMatches)
                {
                    continue;
                }

                text[pos] = (char)(sd.@base + row[3]);

                // Always consume the second character ...
                len = StemmerUtil.Delete(text, pos + 1, len);

                // ... and the third one as well when the row required it.
                if (row[2] >= 0)
                {
                    len = StemmerUtil.Delete(text, pos + 1, len);
                }

                return len;
            }

            return len;
        }
Пример #4
0
        // 'ö' (U+00F6); IncrementToken below folds this character to 'o'.
        private const char oe_se = '\u00F6'; //ö


        /// <summary>
        /// Advances the wrapped stream by one token and folds its term buffer in place.
        /// <para>
        /// Characters equal to aa, ae_se or ae become 'a'; AA, AE_se or AE become
        /// 'A'; oe or oe_se become 'o'; OE or OE_se become 'O'. For any other
        /// character that is not the last one, an 'a'/'A' followed by one of
        /// a, A, e, E, o, O, or an 'o'/'O' followed by one of e, E, o, O, has
        /// that following character removed.
        /// </para>
        /// </summary>
        /// <returns> true if a token was produced, false once the input is exhausted </returns>
        public override bool IncrementToken()
        {
            if (!m_input.IncrementToken())
            {
                return false;
            }

            char[] buffer = charTermAttribute.Buffer;
            int    length = charTermAttribute.Length;

            for (int pos = 0; pos < length; pos++)
            {
                char current = buffer[pos];

                if (current == aa || current == ae_se || current == ae)
                {
                    buffer[pos] = 'a';
                    continue;
                }

                if (current == AA || current == AE_se || current == AE)
                {
                    buffer[pos] = 'A';
                    continue;
                }

                if (current == oe || current == oe_se)
                {
                    buffer[pos] = 'o';
                    continue;
                }

                if (current == OE || current == OE_se)
                {
                    buffer[pos] = 'O';
                    continue;
                }

                // The digraph rules need a following character.
                if (pos >= length - 1)
                {
                    continue;
                }

                char next = buffer[pos + 1];

                bool nextIsEOrO = next == 'e' || next == 'E' || next == 'o' || next == 'O';
                bool nextIsA    = next == 'a' || next == 'A';

                bool afterA = (current == 'a' || current == 'A') && (nextIsA || nextIsEOrO);
                bool afterO = (current == 'o' || current == 'O') && nextIsEOrO;

                if (afterA || afterO)
                {
                    // Drop the second letter of the pair; the scan then moves on
                    // to whatever character slides into its place.
                    length = StemmerUtil.Delete(buffer, pos + 1, length);
                }
            }

            charTermAttribute.Length = length;

            return true;
        }
Пример #5
0
        /// <summary>
        /// Normalizes a buffer of Arabic text in place.
        /// <para>
        /// ALEF_MADDA, ALEF_HAMZA_ABOVE and ALEF_HAMZA_BELOW become ALEF,
        /// DOTLESS_YEH becomes YEH, TEH_MARBUTA becomes HEH, and TATWEEL,
        /// KASRATAN, DAMMATAN, FATHATAN, FATHA, DAMMA, KASRA, SHADDA and SUKUN
        /// are removed from the buffer.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the text; rewritten in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> number of valid characters left after normalization </returns>
        public virtual int Normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                switch (s[pos])
                {
                    case ALEF_MADDA:
                    case ALEF_HAMZA_ABOVE:
                    case ALEF_HAMZA_BELOW:
                        s[pos] = ALEF;
                        break;

                    case DOTLESS_YEH:
                        s[pos] = YEH;
                        break;

                    case TEH_MARBUTA:
                        s[pos] = HEH;
                        break;

                    case TATWEEL:
                    case KASRATAN:
                    case DAMMATAN:
                    case FATHATAN:
                    case FATHA:
                    case DAMMA:
                    case KASRA:
                    case SHADDA:
                    case SUKUN:
                        // Remove the character and stay on this index so the
                        // character that moved into it is examined next.
                        len = StemmerUtil.Delete(s, pos, len);
                        continue;
                }

                pos++;
            }

            return len;
        }
Пример #6
0
        /// <summary>
        /// Second normalization pass over a word held in <paramref name="s"/>.
        /// <para>
        /// Words longer than 8 characters lose a final 'e', 'o' or 'u'. Words
        /// longer than 4 characters then lose a final 'i', and, if still longer
        /// than 4, have runs of a repeated 'k', 'p' or 't' collapsed to one.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the word; may be modified in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> number of valid characters after normalization </returns>
        private int Norm2(char[] s, int len)
        {
            if (len > 8)
            {
                char last = s[len - 1];
                if (last == 'e' || last == 'o' || last == 'u')
                {
                    len--;
                }
            }

            if (len <= 4)
            {
                return len;
            }

            if (s[len - 1] == 'i')
            {
                len--;
            }

            if (len <= 4)
            {
                return len;
            }

            char previous = s[0];
            int  pos      = 1;
            while (pos < len)
            {
                char current = s[pos];
                bool doubledStop = current == previous
                                   && (previous == 'k' || previous == 'p' || previous == 't');

                if (doubledStop)
                {
                    // Remove the duplicate and compare the next character
                    // against the same predecessor.
                    len = StemmerUtil.Delete(s, pos, len);
                }
                else
                {
                    previous = current;
                    pos++;
                }
            }

            return len;
        }
Пример #7
0
        /// <summary>
        /// Normalizes a word held in <paramref name="s"/> in place.
        /// <para>
        /// For words longer than 4 characters, accented vowels and 'ç' are
        /// replaced by their unaccented letters and runs of a repeated letter
        /// are collapsed to one. A trailing "ie" is then removed if the word is
        /// still longer than 4. Finally, if still longer than 4, a trailing 'r',
        /// up to two trailing 'e' and one doubled final letter are stripped, in
        /// that order.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the word; modified in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> number of valid characters after normalization </returns>
        private int Norm(char[] s, int len)
        {
            if (len > 4)
            {
                // Pass 1: strip diacritics.
                for (int k = 0; k < len; k++)
                {
                    char c = s[k];
                    if (c == 'à' || c == 'á' || c == 'â')
                    {
                        s[k] = 'a';
                    }
                    else if (c == 'ô')
                    {
                        s[k] = 'o';
                    }
                    else if (c == 'è' || c == 'é' || c == 'ê')
                    {
                        s[k] = 'e';
                    }
                    else if (c == 'ù' || c == 'û')
                    {
                        s[k] = 'u';
                    }
                    else if (c == 'î')
                    {
                        s[k] = 'i';
                    }
                    else if (c == 'ç')
                    {
                        s[k] = 'c';
                    }
                }

                // Pass 2: collapse repeated letters.
                char previous = s[0];
                int  pos      = 1;
                while (pos < len)
                {
                    if (s[pos] == previous && char.IsLetter(previous))
                    {
                        // Duplicate removed; stay on this index.
                        len = StemmerUtil.Delete(s, pos, len);
                    }
                    else
                    {
                        previous = s[pos];
                        pos++;
                    }
                }
            }

            if (len > 4 && StemmerUtil.EndsWith(s, len, "ie"))
            {
                len -= 2;
            }

            if (len > 4)
            {
                if (s[len - 1] == 'r')
                {
                    len--;
                }

                // A trailing 'e' is stripped at most twice.
                for (int pass = 0; pass < 2; pass++)
                {
                    if (s[len - 1] == 'e')
                    {
                        len--;
                    }
                }

                char last = s[len - 1];
                if (last == s[len - 2] && char.IsLetter(last))
                {
                    len--;
                }
            }

            return len;
        }
        /// <summary>
        /// Advances the wrapped stream by one token and normalizes its term
        /// buffer in place using a three-state scan (N, U, V).
        /// <para>
        /// 'ä', 'ö' and 'ü' become 'a', 'o' and 'u'; 'ß' is expanded to "ss".
        /// An 'e' is deleted only while the scan is in state U, which is entered
        /// after 'a' or 'o', or after a 'u' that was seen in state N. A 'u' seen
        /// in any other state, and 'i', 'q', 'y', 'e' and the umlauts, leave the
        /// scan in state V; every other character resets it to N.
        /// </para>
        /// </summary>
        /// <returns> true if a token was produced, false once the input is exhausted </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.Buffer();
            int    count = termAtt.Length;
            int    state = N;
            int    pos   = 0;

            while (pos < count)
            {
                switch (chars[pos])
                {
                    case 'a':
                    case 'o':
                        state = U;
                        break;

                    case 'u':
                        if (state == N)
                        {
                            state = U;
                        }
                        else
                        {
                            state = V;
                        }
                        break;

                    case 'e':
                        if (state == U)
                        {
                            // Drop the 'e'; step back so the increment at the
                            // bottom of the loop lands on the same index again.
                            count = StemmerUtil.Delete(chars, pos, count);
                            pos--;
                        }
                        state = V;
                        break;

                    case 'i':
                    case 'q':
                    case 'y':
                        state = V;
                        break;

                    case 'ä':
                        chars[pos] = 'a';
                        state      = V;
                        break;

                    case 'ö':
                        chars[pos] = 'o';
                        state      = V;
                        break;

                    case 'ü':
                        chars[pos] = 'u';
                        state      = V;
                        break;

                    case 'ß':
                        // First 's' overwrites the 'ß' ...
                        chars[pos] = 's';
                        pos++;

                        // ... then make room for one more character, shift the
                        // tail right by one and write the second 's'.
                        chars = termAtt.ResizeBuffer(count + 1);
                        if (pos < count)
                        {
                            Array.Copy(chars, pos, chars, pos + 1, count - pos);
                        }
                        chars[pos] = 's';
                        count++;
                        state = N;
                        break;

                    default:
                        state = N;
                        break;
                }

                pos++;
            }

            termAtt.Length = count;
            return true;
        }
Пример #9
0
        /// <summary>
        /// Normalizes a buffer of Hindi text in place: a dead NA (NA + virama)
        /// becomes a bindu, candrabindu becomes bindu, nukta forms are reduced
        /// to their base letters, chandra/short and long vowels are mapped to a
        /// single vowel form, and nukta, virama, ZWJ and ZWNJ are removed.
        /// </summary>
        /// <param name="s"> buffer holding the text; rewritten in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> number of valid characters left after normalization </returns>
        public virtual int normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                switch (s[pos])
                {
                    // dead n -> bindu
                    case '\u0928':
                        if (pos + 1 < len && s[pos + 1] == '\u094D')
                        {
                            s[pos] = '\u0902';
                            len    = StemmerUtil.Delete(s, pos + 1, len);
                        }
                        break;

                    // candrabindu -> bindu
                    case '\u0901':
                        s[pos] = '\u0902';
                        break;

                    // nukta, zwj/zwnj and virama -> delete, then re-examine
                    // the character that moved into this index
                    case '\u093C':
                    case '\u200D':
                    case '\u200C':
                    case '\u094D':
                        len = StemmerUtil.Delete(s, pos, len);
                        continue;

                    // precomposed nukta letters -> base letters
                    case '\u0929':
                        s[pos] = '\u0928';
                        break;

                    case '\u0931':
                        s[pos] = '\u0930';
                        break;

                    case '\u0934':
                        s[pos] = '\u0933';
                        break;

                    case '\u0958':
                        s[pos] = '\u0915';
                        break;

                    case '\u0959':
                        s[pos] = '\u0916';
                        break;

                    case '\u095A':
                        s[pos] = '\u0917';
                        break;

                    case '\u095B':
                        s[pos] = '\u091C';
                        break;

                    case '\u095C':
                        s[pos] = '\u0921';
                        break;

                    case '\u095D':
                        s[pos] = '\u0922';
                        break;

                    case '\u095E':
                        s[pos] = '\u092B';
                        break;

                    case '\u095F':
                        s[pos] = '\u092F';
                        break;

                    // dependent vowels: chandra/short e and long ai -> e
                    case '\u0945':
                    case '\u0946':
                    case '\u0948':
                        s[pos] = '\u0947';
                        break;

                    // dependent vowels: chandra/short o and long au -> o
                    case '\u0949':
                    case '\u094A':
                    case '\u094C':
                        s[pos] = '\u094B';
                        break;

                    // independent vowels: chandra/short e and long ai -> e
                    case '\u090D':
                    case '\u090E':
                    case '\u0910':
                        s[pos] = '\u090F';
                        break;

                    // independent vowels: chandra/short o and long au -> o
                    case '\u0911':
                    case '\u0912':
                    case '\u0914':
                        s[pos] = '\u0913';
                        break;

                    // chandra a and long aa -> a
                    case '\u0972':
                    case '\u0906':
                        s[pos] = '\u0905';
                        break;

                    // remaining long -> short ind. vowels
                    case '\u0908':
                        s[pos] = '\u0907';
                        break;

                    case '\u090A':
                        s[pos] = '\u0909';
                        break;

                    case '\u0960':
                        s[pos] = '\u090B';
                        break;

                    case '\u0961':
                        s[pos] = '\u090C';
                        break;

                    // remaining long -> short dep. vowels
                    case '\u0940':
                        s[pos] = '\u093F';
                        break;

                    case '\u0942':
                        s[pos] = '\u0941';
                        break;

                    case '\u0944':
                        s[pos] = '\u0943';
                        break;

                    case '\u0963':
                        s[pos] = '\u0962';
                        break;
                }

                pos++;
            }

            return len;
        }
Пример #10
0
        /// <summary>
        /// Normalizes a buffer of Sorani text in place.
        /// <para>
        /// YEH and DOTLESS_YEH become FARSI_YEH; KAF becomes KEHEH; TEH_MARBUTA
        /// becomes AE; HEH_DOACHASHMEE becomes HEH; RREH_ABOVE becomes RREH.
        /// A HEH in final position, or one directly followed by ZWNJ, becomes AE,
        /// and a REH in first position becomes RREH. ZWNJ, TATWEEL, the listed
        /// harakat (KASRATAN, DAMMATAN, FATHATAN, FATHA, DAMMA, KASRA, SHADDA,
        /// SUKUN) and any other character in the Unicode Format category are
        /// removed.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the text; rewritten in place </param>
        /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
        /// <returns> number of valid characters left after normalization </returns>
        public virtual int Normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                switch (s[pos])
                {
                    case YEH:
                    case DOTLESS_YEH:
                        s[pos] = FARSI_YEH;
                        break;

                    case KAF:
                        s[pos] = KEHEH;
                        break;

                    case ZWNJ:
                        // A HEH immediately before the joiner is rewritten
                        // before the joiner itself is dropped.
                        if (pos > 0 && s[pos - 1] == HEH)
                        {
                            s[pos - 1] = AE;
                        }
                        len = StemmerUtil.Delete(s, pos, len);
                        continue;

                    case HEH:
                        // Only a HEH in the last valid position is rewritten.
                        if (pos == len - 1)
                        {
                            s[pos] = AE;
                        }
                        break;

                    case TEH_MARBUTA:
                        s[pos] = AE;
                        break;

                    case HEH_DOACHASHMEE:
                        s[pos] = HEH;
                        break;

                    case REH:
                        // Only a REH at the very start is rewritten.
                        if (pos == 0)
                        {
                            s[pos] = RREH;
                        }
                        break;

                    case RREH_ABOVE:
                        s[pos] = RREH;
                        break;

                    case TATWEEL:
                    case KASRATAN:
                    case DAMMATAN:
                    case FATHATAN:
                    case FATHA:
                    case DAMMA:
                    case KASRA:
                    case SHADDA:
                    case SUKUN:
                        // Remove and re-examine the same index.
                        len = StemmerUtil.Delete(s, pos, len);
                        continue;

                    default:
                        if (CharUnicodeInfo.GetUnicodeCategory(s[pos]) == UnicodeCategory.Format)
                        {
                            len = StemmerUtil.Delete(s, pos, len);
                            continue;
                        }
                        break;
                }

                pos++;
            }

            return len;
        }