Пример #1
0
        /// <summary>
        /// Normalizes Persian text in place inside a character buffer.
        /// <para>
        /// FARSI_YEH and YEH_BARREE are rewritten to YEH, KEHEH to KAF, HEH_YEH and
        /// HEH_GOAL to HEH, and every HAMZA_ABOVE is removed through
        /// <c>StemmerUtil.delete</c>.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the text; it is modified in place </param>
        /// <param name="len"> number of characters of <paramref name="s"/> to process </param>
        /// <returns> the length to use for <paramref name="s"/> after normalization </returns>
        public virtual int normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                char current = s[pos];

                if (current == HAMZA_ABOVE)
                {
                    // Needed for HEH + HAMZA. Do not advance: the same index is
                    // inspected again on the next pass, using the updated length.
                    len = StemmerUtil.delete(s, pos, len);
                    continue;
                }

                if (current == FARSI_YEH || current == YEH_BARREE)
                {
                    s[pos] = YEH;
                }
                else if (current == KEHEH)
                {
                    s[pos] = KAF;
                }
                else if (current == HEH_YEH || current == HEH_GOAL)
                {
                    s[pos] = HEH;
                }

                pos++;
            }

            return len;
        }
Пример #2
0
        // Lower-case 'ö' (U+00F6). incrementToken() replaces it, like 'oe', with a plain 'o'.
        private const char oe_se = '\u00F6'; //ö


//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped input and folds the current term in place.
        /// <para>
        /// Characters equal to aa / ae_se / ae become 'a', AA / AE_se / AE become 'A',
        /// oe / oe_se become 'o' and OE / OE_se become 'O'. Otherwise, when a following
        /// character exists, an 'a'/'A' followed by one of a, A, e, E, o, O — or an
        /// 'o'/'O' followed by one of e, E, o, O — has that following character removed.
        /// </para>
        /// </summary>
        /// <returns> false when the input has no more tokens, true otherwise </returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            char[] chars = charTermAttribute.buffer();
            int count = charTermAttribute.length();

            for (int pos = 0; pos < count; pos++)
            {
                char current = chars[pos];

                if (current == aa || current == ae_se || current == ae)
                {
                    chars[pos] = 'a';
                    continue;
                }

                if (current == AA || current == AE_se || current == AE)
                {
                    chars[pos] = 'A';
                    continue;
                }

                if (current == oe || current == oe_se)
                {
                    chars[pos] = 'o';
                    continue;
                }

                if (current == OE || current == OE_se)
                {
                    chars[pos] = 'O';
                    continue;
                }

                // The remaining rules look one character ahead, so the last
                // character of the term has nothing left to do.
                if (pos >= count - 1)
                {
                    continue;
                }

                bool currentIsA = current == 'a' || current == 'A';
                bool currentIsO = current == 'o' || current == 'O';
                if (!currentIsA && !currentIsO)
                {
                    continue;
                }

                char next = chars[pos + 1];
                bool nextIsA = next == 'a' || next == 'A';
                bool nextIsE = next == 'e' || next == 'E';
                bool nextIsO = next == 'o' || next == 'O';

                bool dropNext = currentIsA
                    ? (nextIsA || nextIsE || nextIsO)
                    : (nextIsE || nextIsO);

                if (dropNext)
                {
                    // Only the following character is removed; pos still advances.
                    count = StemmerUtil.delete(chars, pos + 1, count);
                }
            }

            charTermAttribute.Length = count;

            return true;
        }
Пример #3
0
        /// <summary>
        /// Normalizes Arabic text in place inside a character buffer.
        /// <para>
        /// ALEF_MADDA, ALEF_HAMZA_ABOVE and ALEF_HAMZA_BELOW are rewritten to ALEF,
        /// DOTLESS_YEH to YEH and TEH_MARBUTA to HEH. TATWEEL, KASRATAN, DAMMATAN,
        /// FATHATAN, FATHA, DAMMA, KASRA, SHADDA and SUKUN are removed through
        /// <c>StemmerUtil.delete</c>.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the text; it is modified in place </param>
        /// <param name="len"> number of characters of <paramref name="s"/> to process </param>
        /// <returns> the length to use for <paramref name="s"/> after normalization </returns>
        public virtual int normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                char current = s[pos];

                bool strip = current == TATWEEL
                    || current == KASRATAN
                    || current == DAMMATAN
                    || current == FATHATAN
                    || current == FATHA
                    || current == DAMMA
                    || current == KASRA
                    || current == SHADDA
                    || current == SUKUN;

                if (strip)
                {
                    // Do not advance: the same index is inspected again on the
                    // next pass, using the updated length.
                    len = StemmerUtil.delete(s, pos, len);
                    continue;
                }

                if (current == ALEF_MADDA || current == ALEF_HAMZA_ABOVE || current == ALEF_HAMZA_BELOW)
                {
                    s[pos] = ALEF;
                }
                else if (current == DOTLESS_YEH)
                {
                    s[pos] = YEH;
                }
                else if (current == TEH_MARBUTA)
                {
                    s[pos] = HEH;
                }

                pos++;
            }

            return len;
        }
Пример #4
0
        /// <summary>
        /// Second normalization pass over a word held in a character buffer.
        /// <para>
        /// A word longer than 8 characters loses one trailing 'e', 'o' or 'u'. A word
        /// longer than 4 characters then loses one trailing 'i', and, if it is still
        /// longer than 4, every 'k', 'p' or 't' that repeats the character kept just
        /// before it is removed through <c>StemmerUtil.delete</c>.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the word; it is modified in place </param>
        /// <param name="len"> number of characters of <paramref name="s"/> to process </param>
        /// <returns> the length to use for <paramref name="s"/> afterwards </returns>
        private int norm2(char[] s, int len)
        {
            if (len > 8)
            {
                char tail = s[len - 1];
                if (tail == 'e' || tail == 'o' || tail == 'u')
                {
                    len--;
                }
            }

            if (len <= 4)
            {
                return len;
            }

            if (s[len - 1] == 'i')
            {
                len--;
            }

            if (len <= 4)
            {
                return len;
            }

            char previous = s[0];
            int pos = 1;
            while (pos < len)
            {
                bool doubledStop = s[pos] == previous
                    && (previous == 'k' || previous == 'p' || previous == 't');

                if (doubledStop)
                {
                    // Stay on the same index so a run of three or more collapses too.
                    len = StemmerUtil.delete(s, pos, len);
                }
                else
                {
                    previous = s[pos];
                    pos++;
                }
            }

            return len;
        }
Пример #5
0
        /// <summary>
        /// Normalizes a word held in a character buffer.
        /// <para>
        /// For words longer than 4 characters, the accented letters à á â, ô, è é ê,
        /// ù û, î and ç are replaced by a, o, e, u, i and c, and any letter that repeats
        /// the character kept just before it is removed. A trailing "ie" is then cut
        /// (if the word is still longer than 4), followed by one trailing 'r', up to
        /// two trailing 'e' and finally one character of a doubled trailing letter.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the word; it is modified in place </param>
        /// <param name="len"> number of characters of <paramref name="s"/> to process </param>
        /// <returns> the length to use for <paramref name="s"/> afterwards </returns>
        private int norm(char[] s, int len)
        {
            if (len > 4)
            {
                // Pass 1: strip the accents listed below; every other character is kept.
                for (int idx = 0; idx < len; idx++)
                {
                    char folded;
                    switch (s[idx])
                    {
                        case 'à':
                        case 'á':
                        case 'â':
                            folded = 'a';
                            break;

                        case 'è':
                        case 'é':
                        case 'ê':
                            folded = 'e';
                            break;

                        case 'î':
                            folded = 'i';
                            break;

                        case 'ô':
                            folded = 'o';
                            break;

                        case 'ù':
                        case 'û':
                            folded = 'u';
                            break;

                        case 'ç':
                            folded = 'c';
                            break;

                        default:
                            continue;
                    }

                    s[idx] = folded;
                }

                // Pass 2: collapse runs of the same letter down to a single one.
                char previous = s[0];
                int pos = 1;
                while (pos < len)
                {
                    if (s[pos] == previous && char.IsLetter(previous))
                    {
                        // Stay on the same index so longer runs collapse completely.
                        len = StemmerUtil.delete(s, pos, len);
                    }
                    else
                    {
                        previous = s[pos];
                        pos++;
                    }
                }
            }

            if (len > 4 && StemmerUtil.EndsWith(s, len, "ie"))
            {
                len -= 2;
            }

            if (len <= 4)
            {
                return len;
            }

            if (s[len - 1] == 'r')
            {
                len--;
            }

            // At most two trailing 'e' are dropped.
            for (int pass = 0; pass < 2 && s[len - 1] == 'e'; pass++)
            {
                len--;
            }

            char last = s[len - 1];
            if (last == s[len - 2] && char.IsLetter(last))
            {
                len--;
            }

            return len;
        }
Пример #6
0
        /// <summary>
        /// Normalizes Hindi text in place inside a character buffer.
        /// <para>
        /// Each character is examined once: some are replaced by another character,
        /// some are removed through <c>StemmerUtil.delete</c>, and U+0928 directly
        /// followed by U+094D is collapsed into a single U+0902.
        /// </para>
        /// </summary>
        /// <param name="s"> buffer holding the text; it is modified in place </param>
        /// <param name="len"> number of characters of <paramref name="s"/> to process </param>
        /// <returns> the length to use for <paramref name="s"/> after normalization </returns>
        public virtual int normalize(char[] s, int len)
        {
            int pos = 0;
            while (pos < len)
            {
                switch (s[pos])
                {
                    // nukta, virama and zwnj/zwj are deleted
                    case '\u093C':
                    case '\u094D':
                    case '\u200C':
                    case '\u200D':
                        len = StemmerUtil.delete(s, pos, len);
                        // Do not advance: the same index is inspected again.
                        continue;

                    // dead n -> bindu (only when the next character is U+094D)
                    case '\u0928':
                        if (pos + 1 < len && s[pos + 1] == '\u094D')
                        {
                            s[pos] = '\u0902';
                            len = StemmerUtil.delete(s, pos + 1, len);
                        }
                        break;

                    // candrabindu -> bindu
                    case '\u0901':
                        s[pos] = '\u0902';
                        break;

                    // nukta forms -> base letters
                    case '\u0929':
                        s[pos] = '\u0928';
                        break;

                    case '\u0931':
                        s[pos] = '\u0930';
                        break;

                    case '\u0934':
                        s[pos] = '\u0933';
                        break;

                    case '\u0958':
                        s[pos] = '\u0915';
                        break;

                    case '\u0959':
                        s[pos] = '\u0916';
                        break;

                    case '\u095A':
                        s[pos] = '\u0917';
                        break;

                    case '\u095B':
                        s[pos] = '\u091C';
                        break;

                    case '\u095C':
                        s[pos] = '\u0921';
                        break;

                    case '\u095D':
                        s[pos] = '\u0922';
                        break;

                    case '\u095E':
                        s[pos] = '\u092B';
                        break;

                    case '\u095F':
                        s[pos] = '\u092F';
                        break;

                    // dependent vowels: chandra/short and long forms share one target
                    case '\u0945':
                    case '\u0946':
                    case '\u0948':
                        s[pos] = '\u0947';
                        break;

                    case '\u0949':
                    case '\u094A':
                    case '\u094C':
                        s[pos] = '\u094B';
                        break;

                    case '\u0940':
                        s[pos] = '\u093F';
                        break;

                    case '\u0942':
                        s[pos] = '\u0941';
                        break;

                    case '\u0944':
                        s[pos] = '\u0943';
                        break;

                    case '\u0963':
                        s[pos] = '\u0962';
                        break;

                    // independent vowels: chandra/short and long forms share one target
                    case '\u090D':
                    case '\u090E':
                    case '\u0910':
                        s[pos] = '\u090F';
                        break;

                    case '\u0911':
                    case '\u0912':
                    case '\u0914':
                        s[pos] = '\u0913';
                        break;

                    case '\u0972':
                    case '\u0906':
                        s[pos] = '\u0905';
                        break;

                    case '\u0908':
                        s[pos] = '\u0907';
                        break;

                    case '\u090A':
                        s[pos] = '\u0909';
                        break;

                    case '\u0960':
                        s[pos] = '\u090B';
                        break;

                    case '\u0961':
                        s[pos] = '\u090C';
                        break;

                    default:
                        break;
                }

                pos++;
            }

            return len;
        }
Пример #7
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped input and normalizes the current term in place.
        /// <para>
        /// 'ä', 'ö' and 'ü' are replaced by 'a', 'o' and 'u'; 'ß' is expanded to "ss"
        /// (growing the term buffer by one character); and an 'e' seen while the state
        /// is U is removed. The state is tracked per character with the N, U and V
        /// constants: 'a' and 'o' set U; 'u' sets U when the state was N and V
        /// otherwise; 'e', 'i', 'q', 'y' and the umlauts set V; everything else,
        /// including 'ß', sets N.
        /// </para>
        /// </summary>
        /// <returns> false when the input has no more tokens, true otherwise </returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            int state = N;
            char[] buffer = termAtt.buffer();
            int length = termAtt.length();

            int pos = 0;
            while (pos < length)
            {
                switch (buffer[pos])
                {
                    case 'a':
                    case 'o':
                        state = U;
                        break;

                    case 'u':
                        if (state == N)
                        {
                            state = U;
                        }
                        else
                        {
                            state = V;
                        }
                        break;

                    case 'e':
                    {
                        bool dropE = state == U;
                        state = V;
                        if (dropE)
                        {
                            // Do not advance: the same index is inspected again.
                            length = StemmerUtil.delete(buffer, pos, length);
                            continue;
                        }
                        break;
                    }

                    case 'i':
                    case 'q':
                    case 'y':
                        state = V;
                        break;

                    case 'ä':
                        buffer[pos] = 'a';
                        state = V;
                        break;

                    case 'ö':
                        buffer[pos] = 'o';
                        state = V;
                        break;

                    case 'ü':
                        buffer[pos] = 'u';
                        state = V;
                        break;

                    case 'ß':
                    {
                        // First 's' overwrites the 'ß'; the second is inserted right after it.
                        buffer[pos] = 's';
                        pos++;
                        buffer = termAtt.resizeBuffer(length + 1);

                        int tail = length - pos;
                        if (tail > 0)
                        {
                            // Shift the rest of the term right by one to open a slot.
                            Array.Copy(buffer, pos, buffer, pos + 1, tail);
                        }

                        buffer[pos] = 's';
                        length++;
                        state = N;
                        break;
                    }

                    default:
                        state = N;
                        break;
                }

                pos++;
            }

            termAtt.Length = length;
            return true;
        }