Example #1
0
        /// <summary>
        /// Walks through the body of a quoted literal until a quote character whose byte type
        /// equals <paramref name="open"/> is found, then checks the character that follows it.
        /// </summary>
        /// <param name="open">Byte type that closes the literal; it is compared against the
        /// <c>BT_QUOT</c>/<c>BT_APOS</c> characters met while scanning.</param>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset in <paramref name="buf"/> at which scanning starts.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">Receives <c>TokenEnd</c>, set to the offset just past the closing quote.</param>
        /// <returns><c>TOK.LITERAL</c> when the closing quote is followed by an accepted character.</returns>
        /// <exception cref="PartialTokenException">The data stops before the closing quote, or in the
        /// middle of a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A <c>BT_NONXML</c> or <c>BT_MALFORM</c> character is
        /// met, or the closing quote is followed by a character that may not follow a literal.</exception>
        /// <exception cref="ExtensibleTokenException">The closing quote is the last character of the
        /// data, so its follower cannot be inspected yet.</exception>
        private TOK scanLit(int open, byte[] buf, int off, int end, Token token)
        {
            while (off != end)
            {
                int kind = byteType(buf, off);

                if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialTokenException();
                    }

                    // check2/check3/check4 are defined elsewhere; presumably they validate the sequence.
                    check2(buf, off);
                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialTokenException();
                    }

                    check3(buf, off);
                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialTokenException();
                    }

                    check4(buf, off);
                    off += 4;
                }
                else if (kind == BT_NONXML || kind == BT_MALFORM)
                {
                    throw new InvalidTokenException(off);
                }
                else if (kind != BT_QUOT && kind != BT_APOS)
                {
                    // Ordinary character: step over it.
                    off += minBPC;
                }
                else
                {
                    // A quote character: always consumed, but it only ends the literal
                    // when it is of the same type as the one named by 'open'.
                    off += minBPC;
                    if (kind == open)
                    {
                        if (off == end)
                        {
                            throw new ExtensibleTokenException(TOK.LITERAL);
                        }

                        int follower = byteType(buf, off);
                        bool accepted = follower == BT_S
                                        || follower == BT_CR
                                        || follower == BT_LF
                                        || follower == BT_GT
                                        || follower == BT_PERCNT
                                        || follower == BT_LSQB;
                        if (!accepted)
                        {
                            throw new InvalidTokenException(off);
                        }

                        token.TokenEnd = off;
                        return TOK.LITERAL;
                    }
                }
            }

            throw new PartialTokenException();
        }
Example #2
0
        /* off points to character following "%" */

        /// <summary>
        /// Scans what follows a '%': either nothing name-like (the percent sign stands alone) or a
        /// name terminated by ';' (a parameter entity reference).
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset of the character immediately after the '%'.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">Receives <c>TokenEnd</c>; for a reference also <c>NameEnd</c>
        /// (the offset of the ';').</param>
        /// <returns><c>TOK.PERCENT</c> when the next character has type <c>BT_S</c>, <c>BT_LF</c>,
        /// <c>BT_CR</c> or <c>BT_PERCNT</c> (that character is not consumed);
        /// <c>TOK.PARAM_ENTITY_REF</c> when a name followed by ';' was read.</returns>
        /// <exception cref="PartialTokenException">No data is available, or the data stops before the ';'.</exception>
        /// <exception cref="PartialCharException">The data stops inside a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A character that cannot start or continue a name is met.</exception>
        private TOK scanPercent(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            int first = byteType(buf, off);

            // A '%' that is not followed by a name is a token on its own.
            if (first == BT_S || first == BT_LF || first == BT_CR || first == BT_PERCNT)
            {
                token.TokenEnd = off;
                return TOK.PERCENT;
            }

            // Otherwise the first character has to be a name-start character.
            if (first == BT_NMSTRT)
            {
                off += minBPC;
            }
            else if (first == BT_LEAD2)
            {
                if (end - off < 2)
                {
                    throw new PartialCharException(off);
                }

                if (byteType2(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 2;
            }
            else if (first == BT_LEAD3)
            {
                if (end - off < 3)
                {
                    throw new PartialCharException(off);
                }

                if (byteType3(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 3;
            }
            else if (first == BT_LEAD4)
            {
                if (end - off < 4)
                {
                    throw new PartialCharException(off);
                }

                if (byteType4(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 4;
            }
            else
            {
                throw new InvalidTokenException(off);
            }

            // Remaining name characters, up to the terminating ';'.
            while (off != end)
            {
                int kind = byteType(buf, off);

                if (kind == BT_SEMI)
                {
                    token.NameEnd = off;
                    token.TokenEnd = off + minBPC;
                    return TOK.PARAM_ENTITY_REF;
                }

                if (kind == BT_NMSTRT || kind == BT_NAME || kind == BT_MINUS)
                {
                    off += minBPC;
                }
                else if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar2(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar3(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar4(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 4;
                }
                else
                {
                    throw new InvalidTokenException(off);
                }
            }

            throw new PartialTokenException();
        }
Example #3
0
        /// <summary>
        /// Scans a name and stops at the first character that is allowed to terminate it.
        /// NOTE(review): judging by the method name, <paramref name="off"/> presumably points just
        /// past a '#' - confirm against the caller.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset of the first character of the name.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">Receives <c>TokenEnd</c>, set to the offset of the terminating
        /// character (which is not consumed).</param>
        /// <returns><c>TOK.POUND_NAME</c> when the name is followed by a character of type
        /// <c>BT_CR</c>, <c>BT_LF</c>, <c>BT_S</c>, <c>BT_RPAR</c>, <c>BT_GT</c>, <c>BT_PERCNT</c>
        /// or <c>BT_VERBAR</c>.</returns>
        /// <exception cref="PartialTokenException">No data is available at all.</exception>
        /// <exception cref="PartialCharException">The data stops inside a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A character that cannot start, continue or
        /// terminate the name is met.</exception>
        /// <exception cref="ExtensibleTokenException">The data stops after at least one name character
        /// but before a terminator, so the name might still continue.</exception>
        private TOK scanPoundName(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            // The first character has to be a name-start character.
            int first = byteType(buf, off);
            if (first == BT_NMSTRT)
            {
                off += minBPC;
            }
            else if (first == BT_LEAD2)
            {
                if (end - off < 2)
                {
                    throw new PartialCharException(off);
                }

                if (byteType2(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 2;
            }
            else if (first == BT_LEAD3)
            {
                if (end - off < 3)
                {
                    throw new PartialCharException(off);
                }

                if (byteType3(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 3;
            }
            else if (first == BT_LEAD4)
            {
                if (end - off < 4)
                {
                    throw new PartialCharException(off);
                }

                if (byteType4(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 4;
            }
            else
            {
                throw new InvalidTokenException(off);
            }

            // Remaining name characters, until a terminator shows up.
            while (off != end)
            {
                int kind = byteType(buf, off);

                bool terminator = kind == BT_CR
                                  || kind == BT_LF
                                  || kind == BT_S
                                  || kind == BT_RPAR
                                  || kind == BT_GT
                                  || kind == BT_PERCNT
                                  || kind == BT_VERBAR;
                if (terminator)
                {
                    token.TokenEnd = off;
                    return TOK.POUND_NAME;
                }

                if (kind == BT_NMSTRT || kind == BT_NAME || kind == BT_MINUS)
                {
                    off += minBPC;
                }
                else if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar2(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar3(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar4(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 4;
                }
                else
                {
                    throw new InvalidTokenException(off);
                }
            }

            throw new ExtensibleTokenException(TOK.POUND_NAME);
        }
Example #4
0
        /// <summary>
        /// Tests whether the characters at <paramref name="off"/> spell one of the names
        /// <c>amp</c>, <c>apos</c>, <c>lt</c>, <c>gt</c> or <c>quot</c> immediately followed by ';'.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset of the first character of the candidate name. The character at
        /// this offset is read unconditionally, so the caller must make sure it is available.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">On a match, receives <c>TokenEnd</c> (offset just past the ';') and
        /// <c>RefChar1</c> (the character the name stands for). Left untouched otherwise.</param>
        /// <returns><c>true</c> when one of the five names matched; otherwise <c>false</c>.</returns>
        private bool isMagicEntityRef(byte[] buf, int off, int end, Token token)
        {
            int room = end - off;
            int units = 0;            // length of the match in code units, ';' included; 0 = no match
            char replacement = '\0';

            int first = byteToAscii(buf, off);
            if (first == 'a')
            {
                // Both "amp;" and "apos;" need at least four code units.
                if (room >= minBPC*4)
                {
                    int second = byteToAscii(buf, off + minBPC);
                    if (second == 'm')
                    {
                        if (charMatches(buf, off + minBPC*2, 'p') && charMatches(buf, off + minBPC*3, ';'))
                        {
                            units = 4;
                            replacement = '&';
                        }
                    }
                    else if (second == 'p')
                    {
                        if (room >= minBPC*5 && charMatches(buf, off + minBPC*2, 'o') &&
                            charMatches(buf, off + minBPC*3, 's') && charMatches(buf, off + minBPC*4, ';'))
                        {
                            units = 5;
                            replacement = '\'';
                        }
                    }
                }
            }
            else if (first == 'l' || first == 'g')
            {
                // "lt;" and "gt;" differ only in their first letter.
                if (room >= minBPC*3 && charMatches(buf, off + minBPC, 't') &&
                    charMatches(buf, off + minBPC*2, ';'))
                {
                    units = 3;
                    replacement = first == 'l' ? '<' : '>';
                }
            }
            else if (first == 'q')
            {
                if (room >= minBPC*5 && charMatches(buf, off + minBPC, 'u') &&
                    charMatches(buf, off + minBPC*2, 'o') && charMatches(buf, off + minBPC*3, 't') &&
                    charMatches(buf, off + minBPC*4, ';'))
                {
                    units = 5;
                    replacement = '"';
                }
            }

            if (units == 0)
            {
                return false;
            }

            token.TokenEnd = off + minBPC*units;
            token.RefChar1 = replacement;
            return true;
        }
Example #5
0
        /* off points to character following "&" */

        /// <summary>
        /// Scans a reference whose leading '&amp;' has already been consumed: one of the five names
        /// recognised by <c>isMagicEntityRef</c>, a character reference (handed to
        /// <c>scanCharRef</c>), or a general entity reference terminated by ';'.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset of the character immediately after the '&amp;'.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">Receives the token boundaries; for a general entity reference
        /// <c>NameEnd</c> is the offset of the ';' and <c>TokenEnd</c> the offset just past it.</param>
        /// <returns><c>TOK.MAGIC_ENTITY_REF</c>, <c>TOK.ENTITY_REF</c>, or whatever
        /// <c>scanCharRef</c> returns when the next character has type <c>BT_NUM</c>.</returns>
        /// <exception cref="PartialTokenException">No data is available, or the data stops before the ';'.</exception>
        /// <exception cref="PartialCharException">The data stops inside a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A character that cannot start or continue a name is met.</exception>
        private TOK scanRef(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            // The five built-in names are recognised before anything else.
            if (isMagicEntityRef(buf, off, end, token))
            {
                return TOK.MAGIC_ENTITY_REF;
            }

            int first = byteType(buf, off);

            // BT_NUM right after the '&' is passed on to the character-reference scanner.
            if (first == BT_NUM)
            {
                return scanCharRef(buf, off + minBPC, end, token);
            }

            // Otherwise the first character has to be a name-start character.
            if (first == BT_NMSTRT)
            {
                off += minBPC;
            }
            else if (first == BT_LEAD2)
            {
                if (end - off < 2)
                {
                    throw new PartialCharException(off);
                }

                if (byteType2(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 2;
            }
            else if (first == BT_LEAD3)
            {
                if (end - off < 3)
                {
                    throw new PartialCharException(off);
                }

                if (byteType3(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 3;
            }
            else if (first == BT_LEAD4)
            {
                if (end - off < 4)
                {
                    throw new PartialCharException(off);
                }

                if (byteType4(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 4;
            }
            else
            {
                throw new InvalidTokenException(off);
            }

            // Remaining name characters, up to the terminating ';'.
            while (off != end)
            {
                int kind = byteType(buf, off);

                if (kind == BT_SEMI)
                {
                    token.NameEnd = off;
                    token.TokenEnd = off + minBPC;
                    return TOK.ENTITY_REF;
                }

                if (kind == BT_NMSTRT || kind == BT_NAME || kind == BT_MINUS)
                {
                    off += minBPC;
                }
                else if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar2(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar3(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar4(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 4;
                }
                else
                {
                    throw new InvalidTokenException(off);
                }
            }

            throw new PartialTokenException();
        }
Example #6
0
        /**
         * Scans the first token of a byte subarray that contains part of
         * a literal attribute value.  The opening and closing delimiters
         * are not included in the subarray.
         * Returns one of the following integers according to the type of
         * token that the subarray starts with:
         * <ul>
         * <li><code>TOK.DATA_CHARS</code></li>
         * <li><code>TOK.DATA_NEWLINE</code></li>
         * <li><code>TOK.ATTRIBUTE_VALUE_S</code></li>
         * <li><code>TOK.MAGIC_ENTITY_REF</code></li>
         * <li><code>TOK.ENTITY_REF</code></li>
         * <li><code>TOK.CHAR_REF</code></li>
         * <li><code>TOK.CHAR_PAIR_REF</code></li>
         * </ul>
         * @exception EmptyTokenException if the subarray is empty
         * @exception PartialTokenException if the subarray contains only part of
         * a legal token
         * @exception InvalidTokenException if the subarray does not start
         * with a legal token or part of one
         * @exception ExtensibleTokenException if the subarray encodes just a carriage
         * return ('\r')
         * @see #TOK.DATA_CHARS
         * @see #TOK.DATA_NEWLINE
         * @see #TOK.ATTRIBUTE_VALUE_S
         * @see #TOK.MAGIC_ENTITY_REF
         * @see #TOK.ENTITY_REF
         * @see #TOK.CHAR_REF
         * @see #TOK.CHAR_PAIR_REF
         * @see Token
         * @see EmptyTokenException
         * @see PartialTokenException
         * @see InvalidTokenException
         * @see ExtensibleTokenException
         */

        /// <summary>
        /// Scans the first token in a slice of an attribute value. A reference, a space or a line
        /// break at the very start of the slice is returned as its own token; anything else is
        /// collected into a run of data characters that stops just before the next such character.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset in <paramref name="buf"/> at which the slice starts.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the slice stops. When
        /// <c>minBPC</c> is greater than one it is first passed through <c>adjustEnd</c>
        /// (presumably to trim it to a whole number of code units - confirm).</param>
        /// <param name="token">Receives <c>TokenEnd</c>, plus whatever <c>scanRef</c> stores for a reference.</param>
        /// <returns><c>TOK.DATA_CHARS</c>, <c>TOK.DATA_NEWLINE</c>, <c>TOK.ATTRIBUTE_VALUE_S</c>, or the
        /// token type returned by <c>scanRef</c> when the slice starts with '&amp;'.</returns>
        /// <exception cref="EmptyTokenException">The slice is empty.</exception>
        /// <exception cref="PartialCharException">The slice stops inside a multi-byte character.</exception>
        /// <exception cref="PartialTokenException">Thrown by <c>scanRef</c> when a reference at the
        /// start of the slice is incomplete.</exception>
        /// <exception cref="InvalidTokenException">A character of type <c>BT_LT</c> is met, or
        /// <c>scanRef</c> rejects the reference.</exception>
        /// <exception cref="ExtensibleTokenException">The slice consists of a lone carriage return,
        /// which might still be followed by a line feed.</exception>
        public TOK tokenizeAttributeValue(byte[] buf, int off, int end, Token token)
        {
            if (minBPC > 1)
            {
                end = adjustEnd(off, end);
            }

            if (off == end)
            {
                throw new EmptyTokenException();
            }

            int start = off;
            while (off != end)
            {
                int kind = byteType(buf, off);

                if (kind == BT_LT)
                {
                    /* this is for inside entity references */
                    throw new InvalidTokenException(off);
                }

                if (kind == BT_AMP || kind == BT_S || kind == BT_LF || kind == BT_CR)
                {
                    // Data collected so far is reported first; the boundary character
                    // itself is left for the next call.
                    if (off != start)
                    {
                        break;
                    }

                    if (kind == BT_AMP)
                    {
                        return scanRef(buf, off + minBPC, end, token);
                    }

                    if (kind == BT_S)
                    {
                        token.TokenEnd = off + minBPC;
                        return TOK.ATTRIBUTE_VALUE_S;
                    }

                    if (kind == BT_LF)
                    {
                        token.TokenEnd = off + minBPC;
                        return TOK.DATA_NEWLINE;
                    }

                    // Carriage return: a directly following line feed belongs to the same newline.
                    int afterCr = off + minBPC;
                    if (afterCr == end)
                    {
                        throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                    }

                    token.TokenEnd = byteType(buf, afterCr) == BT_LF ? afterCr + minBPC : afterCr;
                    return TOK.DATA_NEWLINE;
                }

                if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    off += 4;
                }
                else
                {
                    off += minBPC;
                }
            }

            token.TokenEnd = off;
            return TOK.DATA_CHARS;
        }
Example #7
0
        /* num is known to be < 0x110000; return the token code */

        /// <summary>
        /// Stores the character designated by a numeric character reference in
        /// <paramref name="token"/> and reports whether it took one or two UTF-16 code units.
        /// </summary>
        /// <param name="num">Code point of the referenced character; callers guarantee it is below 0x110000.</param>
        /// <param name="token">Receives <c>RefChar1</c> and, for code points of 0x10000 and above,
        /// <c>RefChar2</c>. Its <c>TokenEnd</c> is read to compute the error offset.</param>
        /// <returns><c>TOK.CHAR_REF</c> for code points below 0x10000, otherwise <c>TOK.CHAR_PAIR_REF</c>.</returns>
        /// <exception cref="InvalidTokenException"><c>charTypeTable</c> classifies the code point as
        /// <c>BT_NONXML</c>, <c>BT_LEAD4</c> or <c>BT_MALFORM</c>; the reported offset is
        /// <c>token.TokenEnd - minBPC</c>.</exception>
        private TOK setRefChar(int num, Token token)
        {
            if (num >= 0x10000)
            {
                // Above the BMP: split into a high/low surrogate pair (10 bits each).
                int scalar = num - 0x10000;
                token.RefChar1 = (char) (0xD800 + (scalar >> 10));
                token.RefChar2 = (char) (0xDC00 + (scalar & 0x3FF));
                return TOK.CHAR_PAIR_REF;
            }

            // The table is indexed by the high byte, then the low byte, of the code point.
            var kind = charTypeTable[num >> 8][num & 0xFF];
            bool rejected = kind == BT_NONXML || kind == BT_LEAD4 || kind == BT_MALFORM;
            if (rejected)
            {
                throw new InvalidTokenException(token.TokenEnd - minBPC);
            }

            token.RefChar1 = (char) num;
            return TOK.CHAR_REF;
        }
Example #8
0
        /* off points to character following "</" */

        /// <summary>
        /// Scans an end tag whose leading "&lt;/" has already been consumed: a name, optional
        /// trailing white space, and the closing '&gt;'.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset of the character immediately after "&lt;/".</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">Receives <c>NameEnd</c> (offset just past the name) and
        /// <c>TokenEnd</c> (offset just past the '&gt;').</param>
        /// <returns><c>TOK.END_TAG</c>.</returns>
        /// <exception cref="PartialTokenException">No data is available, or the data stops before the '&gt;'.</exception>
        /// <exception cref="PartialCharException">The data stops inside a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A character that cannot start or continue the name
        /// is met, or something other than white space precedes the '&gt;' after the name.</exception>
        private TOK scanEndTag(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            // The first character has to be a name-start character.
            int first = byteType(buf, off);
            if (first == BT_NMSTRT)
            {
                off += minBPC;
            }
            else if (first == BT_LEAD2)
            {
                if (end - off < 2)
                {
                    throw new PartialCharException(off);
                }

                if (byteType2(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 2;
            }
            else if (first == BT_LEAD3)
            {
                if (end - off < 3)
                {
                    throw new PartialCharException(off);
                }

                if (byteType3(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 3;
            }
            else if (first == BT_LEAD4)
            {
                if (end - off < 4)
                {
                    throw new PartialCharException(off);
                }

                if (byteType4(buf, off) != BT_NMSTRT)
                {
                    throw new InvalidTokenException(off);
                }

                off += 4;
            }
            else
            {
                throw new InvalidTokenException(off);
            }

            // Remaining name characters, until white space or '>' ends the name.
            while (off != end)
            {
                int kind = byteType(buf, off);

                if (kind == BT_GT)
                {
                    token.NameEnd = off;
                    token.TokenEnd = off + minBPC;
                    return TOK.END_TAG;
                }

                if (kind == BT_S || kind == BT_CR || kind == BT_LF)
                {
                    // Name is complete; only white space may come before the '>'.
                    token.NameEnd = off;
                    off += minBPC;
                    while (off != end)
                    {
                        int trailing = byteType(buf, off);
                        if (trailing == BT_GT)
                        {
                            token.TokenEnd = off + minBPC;
                            return TOK.END_TAG;
                        }

                        if (trailing != BT_S && trailing != BT_CR && trailing != BT_LF)
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += minBPC;
                    }

                    throw new PartialTokenException();
                }

                if (kind == BT_NMSTRT || kind == BT_NAME || kind == BT_MINUS)
                {
                    off += minBPC;
                }
                else if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar2(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar3(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    if (!isNameChar4(buf, off))
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 4;
                }
                else
                {
                    throw new InvalidTokenException(off);
                }
            }

            throw new PartialTokenException();
        }
Example #9
0
        /* off points to character following "&#X" */

        /// <summary>
        /// Scans the hexadecimal digits of a character reference up to the terminating ';' and
        /// stores the referenced character through <c>setRefChar</c>.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded characters.</param>
        /// <param name="off">Offset of the first hexadecimal digit.</param>
        /// <param name="end">Offset in <paramref name="buf"/> at which the available data stops.</param>
        /// <param name="token">Receives <c>TokenEnd</c> (offset just past the ';') and the
        /// character(s) stored by <c>setRefChar</c>.</param>
        /// <returns>The token type returned by <c>setRefChar</c>.</returns>
        /// <exception cref="InvalidTokenException">A character that is neither a hexadecimal digit nor
        /// (after the first digit) ';' is met, the accumulated value reaches 0x110000, or
        /// <c>setRefChar</c> rejects the value.</exception>
        /// <exception cref="PartialTokenException">The data stops before the ';'.</exception>
        private TOK scanHexCharRef(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            // At least one digit is required, so the first character may not be ';'.
            int c = byteToAscii(buf, off);
            int num;
            if (c >= '0' && c <= '9')
            {
                num = c - '0';
            }
            else if (c >= 'A' && c <= 'F')
            {
                num = c - ('A' - 10);
            }
            else if (c >= 'a' && c <= 'f')
            {
                num = c - ('a' - 10);
            }
            else
            {
                throw new InvalidTokenException(off);
            }

            off += minBPC;
            while (off != end)
            {
                c = byteToAscii(buf, off);
                if (c == ';')
                {
                    token.TokenEnd = off + minBPC;
                    return setRefChar(num, token);
                }

                int digit;
                if (c >= '0' && c <= '9')
                {
                    digit = c - '0';
                }
                else if (c >= 'A' && c <= 'F')
                {
                    digit = c - ('A' - 10);
                }
                else if (c >= 'a' && c <= 'f')
                {
                    digit = c - ('a' - 10);
                }
                else
                {
                    throw new InvalidTokenException(off);
                }

                // Checking the bound after every digit also keeps 'num' from overflowing.
                num = (num << 4) + digit;
                if (num >= 0x110000)
                {
                    throw new InvalidTokenException(off);
                }

                off += minBPC;
            }

            throw new PartialTokenException();
        }
Example #10
0
        /* off points to character following "<?" */

        /// <summary>
        /// Scans a processing instruction whose opening <c>&lt;?</c> has already
        /// been consumed: first the target name, then any content up to the
        /// closing <c>?&gt;</c>.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset of the first character after <c>&lt;?</c>.</param>
        /// <param name="end">Offset at which the available input ends (exclusive).</param>
        /// <param name="token">Receives <c>NameEnd</c> (offset just past the target name) and <c>TokenEnd</c> (offset just past <c>?&gt;</c>).</param>
        /// <returns><c>TOK.XML_DECL</c> when <c>targetIsXml</c> reports true for the target name, otherwise <c>TOK.PI</c>.</returns>
        /// <exception cref="PartialTokenException">The input ended before the closing <c>?&gt;</c> was seen.</exception>
        /// <exception cref="PartialCharException">The input ended in the middle of a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A character that is not allowed at its position was found in the target name or in the content.</exception>
        private TOK scanPi(byte[] buf, int off, int end, Token token)
        {
            // Remember where the target name starts; targetIsXml needs the range.
            int target = off;
            if (off == end)
            {
                throw new PartialTokenException();
            }

            // First character of the target: must classify as a name-start
            // character, either directly or via the multi-byte lookups.
            switch (byteType(buf, off))
            {
                case BT_NMSTRT:
                    off += minBPC;
                    break;
                case BT_LEAD2:
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    if (byteType2(buf, off) != BT_NMSTRT)
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 2;
                    break;
                case BT_LEAD3:
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    if (byteType3(buf, off) != BT_NMSTRT)
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 3;
                    break;
                case BT_LEAD4:
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    if (byteType4(buf, off) != BT_NMSTRT)
                    {
                        throw new InvalidTokenException(off);
                    }

                    off += 4;
                    break;
                default:
                    throw new InvalidTokenException(off);
            }

            // Remaining characters of the target name, until whitespace or '?'.
            while (off != end)
            {
                switch (byteType(buf, off))
                {
                    case BT_NMSTRT:
                    case BT_NAME:
                    case BT_MINUS:
                        off += minBPC;
                        break;
                    case BT_LEAD2:
                        if (end - off < 2)
                        {
                            throw new PartialCharException(off);
                        }

                        if (!isNameChar2(buf, off))
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += 2;
                        break;
                    case BT_LEAD3:
                        if (end - off < 3)
                        {
                            throw new PartialCharException(off);
                        }

                        if (!isNameChar3(buf, off))
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += 3;
                        break;
                    case BT_LEAD4:
                        if (end - off < 4)
                        {
                            throw new PartialCharException(off);
                        }

                        if (!isNameChar4(buf, off))
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += 4;
                        break;
                    case BT_S:
                    case BT_CR:
                    case BT_LF:
                        // Whitespace ends the target name. The XML-declaration
                        // decision is made now, before the content is scanned.
                        bool isXml = targetIsXml(buf, target, off);
                        token.NameEnd = off;
                        off += minBPC;

                        // Content: skip everything until "?>" is found.
                        while (off != end)
                        {
                            switch (byteType(buf, off))
                            {
                                // Multi-byte characters: require the full sequence,
                                // then pass it to check2/check3/check4 (defined
                                // elsewhere; presumably they validate it - confirm).
                                case BT_LEAD2:
                                    if (end - off < 2)
                                    {
                                        throw new PartialCharException(off);
                                    }

                                    check2(buf, off);
                                    off += 2;
                                    break;
                                case BT_LEAD3:
                                    if (end - off < 3)
                                    {
                                        throw new PartialCharException(off);
                                    }

                                    check3(buf, off);
                                    off += 3;
                                    break;
                                case BT_LEAD4:
                                    if (end - off < 4)
                                    {
                                        throw new PartialCharException(off);
                                    }

                                    check4(buf, off);
                                    off += 4;
                                    break;
                                case BT_NONXML:
                                case BT_MALFORM:
                                    throw new InvalidTokenException(off);
                                case BT_QUEST:
                                    off += minBPC;
                                    if (off == end)
                                    {
                                        throw new PartialTokenException();
                                    }

                                    if (charMatches(buf, off, '>'))
                                    {
                                        token.TokenEnd = off + minBPC;
                                        if (isXml)
                                        {
                                            return TOK.XML_DECL;
                                        }
                                        else
                                        {
                                            return TOK.PI;
                                        }
                                    }

                                    // A '?' not followed by '>' is ordinary content;
                                    // scanning resumes at the character after it.
                                    break;
                                default:
                                    off += minBPC;
                                    break;
                            }
                        }

                        throw new PartialTokenException();
                    case BT_QUEST:
                        // '?' directly after the target name: no content, so the
                        // next character has to be '>'. checkCharMatches is defined
                        // elsewhere; presumably it throws on a mismatch - confirm.
                        token.NameEnd = off;
                        off += minBPC;
                        if (off == end)
                        {
                            throw new PartialTokenException();
                        }

                        checkCharMatches(buf, off, '>');
                        token.TokenEnd = off + minBPC;
                        return targetIsXml(buf, target, token.NameEnd) ? TOK.XML_DECL : TOK.PI;
                    default:
                        throw new InvalidTokenException(off);
                }
            }

            // Ran out of input while still inside the target name.
            throw new PartialTokenException();
        }
Example #11
0
        /* off points to character following "<![" */

        /// <summary>
        /// Verifies that the input continues with the characters held in
        /// <c>CDATA</c>, completing a CDATA section opener whose <c>&lt;![</c>
        /// has already been consumed.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset of the first character after <c>&lt;![</c>.</param>
        /// <param name="end">Offset at which the available input ends (exclusive).</param>
        /// <param name="token">Receives <c>TokenEnd</c>, the offset just past the matched keyword.</param>
        /// <returns><c>TOK.CDATA_SECT_OPEN</c>.</returns>
        /// <exception cref="PartialTokenException">Fewer than six characters of input remain.</exception>
        private TOK scanCdataSection(byte[] buf, int off, int end, Token token)
        {
            /* "CDATA[" is 6 characters long; all of them must be available */
            int remaining = end - off;
            if (remaining < 6*minBPC)
            {
                throw new PartialTokenException();
            }

            // Compare one expected character at a time. checkCharMatches is
            // defined elsewhere; presumably it throws on a mismatch - confirm.
            int index = 0;
            while (index < CDATA.Length)
            {
                checkCharMatches(buf, off, CDATA[index]);
                off += minBPC;
                index++;
            }

            token.TokenEnd = off;
            return TOK.CDATA_SECT_OPEN;
        }
Example #12
0
        /* off points to character following "<!" */

        /// <summary>
        /// Scans what follows <c>&lt;!</c>: a comment, a conditional section
        /// opener, or the keyword of a declaration.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset of the first character after <c>&lt;!</c>.</param>
        /// <param name="end">Offset at which the available input ends (exclusive).</param>
        /// <param name="token">Receives <c>TokenEnd</c>; also passed to <c>scanComment</c>.</param>
        /// <returns>
        /// The result of <c>scanComment</c> when a <c>-</c> follows,
        /// <c>TOK.COND_SECT_OPEN</c> when a <c>[</c> follows, otherwise
        /// <c>TOK.DECL_OPEN</c> with <c>TokenEnd</c> at the character that ended
        /// the keyword.
        /// </returns>
        /// <exception cref="PartialTokenException">The input ended before the token could be completed.</exception>
        /// <exception cref="InvalidTokenException">A character that is not allowed at its position was found.</exception>
        private TOK scanDecl(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            int first = byteType(buf, off);
            if (first == BT_MINUS)
            {
                return scanComment(buf, off + minBPC, end, token);
            }

            if (first == BT_LSQB)
            {
                token.TokenEnd = off + minBPC;
                return TOK.COND_SECT_OPEN;
            }

            if (first != BT_NMSTRT)
            {
                throw new InvalidTokenException(off);
            }

            // Walk over the keyword; it ends at whitespace or at a '%'.
            for (off += minBPC; off != end; off += minBPC)
            {
                int kind = byteType(buf, off);
                if (kind == BT_NMSTRT)
                {
                    continue;
                }

                if (kind == BT_PERCNT)
                {
                    if (off + minBPC == end)
                    {
                        throw new PartialTokenException();
                    }

                    /* don't allow <!ENTITY% foo "whatever"> */
                    int following = byteType(buf, off + minBPC);
                    if (following == BT_S || following == BT_CR || following == BT_LF || following == BT_PERCNT)
                    {
                        throw new InvalidTokenException(off);
                    }
                }
                else if (kind != BT_S && kind != BT_CR && kind != BT_LF)
                {
                    throw new InvalidTokenException(off);
                }

                // The terminator itself is not part of the token.
                token.TokenEnd = off;
                return TOK.DECL_OPEN;
            }

            throw new PartialTokenException();
        }
Example #13
0
        /* off points to character following "<!-" */

        /// <summary>
        /// Scans a comment whose opening <c>&lt;!-</c> has already been consumed:
        /// checks the second <c>-</c>, then skips the body up to the closing
        /// <c>--&gt;</c>.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset of the first character after <c>&lt;!-</c>.</param>
        /// <param name="end">Offset at which the available input ends (exclusive).</param>
        /// <param name="token">Receives <c>TokenEnd</c>, the offset just past the closing <c>&gt;</c>.</param>
        /// <returns><c>TOK.COMMENT</c>.</returns>
        /// <exception cref="PartialCharException">The input ended in the middle of a multi-byte character.</exception>
        /// <exception cref="InvalidTokenException">A byte classified as BT_NONXML or BT_MALFORM was found in the comment body.</exception>
        /// <exception cref="PartialTokenException">The input ended before the comment was closed.</exception>
        private TOK scanComment(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            // Second '-' of the opener. checkCharMatches is defined elsewhere;
            // presumably it throws on a mismatch - confirm.
            checkCharMatches(buf, off, '-');
            off += minBPC;

            while (off != end)
            {
                int kind = byteType(buf, off);
                if (kind == BT_MINUS)
                {
                    off += minBPC;
                    if (off == end)
                    {
                        throw new PartialTokenException();
                    }

                    if (!charMatches(buf, off, '-'))
                    {
                        // A lone '-' is ordinary content; carry on from the
                        // character that followed it.
                        continue;
                    }

                    off += minBPC;
                    if (off == end)
                    {
                        throw new PartialTokenException();
                    }

                    // After "--" only '>' is acceptable.
                    checkCharMatches(buf, off, '>');
                    token.TokenEnd = off + minBPC;
                    return TOK.COMMENT;
                }
                else if (kind == BT_NONXML || kind == BT_MALFORM)
                {
                    throw new InvalidTokenException(off);
                }
                else if (kind == BT_LEAD2)
                {
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    check2(buf, off);
                    off += 2;
                }
                else if (kind == BT_LEAD3)
                {
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    check3(buf, off);
                    off += 3;
                }
                else if (kind == BT_LEAD4)
                {
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    check4(buf, off);
                    off += 4;
                }
                else
                {
                    off += minBPC;
                }
            }

            throw new PartialTokenException();
        }
Example #14
0
        /**
         * Scans the first token of a byte subarray that contains part of
         * a literal entity value.  The opening and closing delimiters
         * are not included in the subarray.
         * Returns one of the following integers according to the type of
         * token that the subarray starts with:
         * <ul>
         * <li><code>TOK.DATA_CHARS</code></li>
         * <li><code>TOK.DATA_NEWLINE</code></li>
         * <li><code>TOK.PARAM_ENTITY_REF</code></li>
         * <li><code>TOK.MAGIC_ENTITY_REF</code></li>
         * <li><code>TOK.ENTITY_REF</code></li>
         * <li><code>TOK.CHAR_REF</code></li>
         * <li><code>TOK.CHAR_PAIR_REF</code></li>
         * </ul>
         * @exception EmptyTokenException if the subarray is empty
         * @exception PartialTokenException if the subarray contains only part of
         * a legal token
         * @exception InvalidTokenException if the subarray does not start
         * with a legal token or part of one
         * @exception ExtensibleTokenException if the subarray encodes just a carriage
         * return ('\r')
         * @see #TOK.DATA_CHARS
         * @see #TOK.DATA_NEWLINE
         * @see #TOK.MAGIC_ENTITY_REF
         * @see #TOK.ENTITY_REF
         * @see #TOK.PARAM_ENTITY_REF
         * @see #TOK.CHAR_REF
         * @see #TOK.CHAR_PAIR_REF
         * @see Token
         * @see EmptyTokenException
         * @see PartialTokenException
         * @see InvalidTokenException
         * @see ExtensibleTokenException
         */

        /// <summary>
        /// Scans the first token of a range of bytes taken from inside a literal
        /// entity value. A run of ordinary characters yields
        /// <c>TOK.DATA_CHARS</c> and stops before the next <c>&amp;</c>,
        /// <c>%</c>, CR or LF; when one of those is the very first character it
        /// is tokenized on its own.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset at which scanning starts.</param>
        /// <param name="end">Offset at which the available input ends (exclusive); passed through <c>adjustEnd</c> first when <c>minBPC</c> is greater than 1.</param>
        /// <param name="token">Receives <c>TokenEnd</c>; also passed to <c>scanRef</c> and <c>scanPercent</c>.</param>
        /// <returns>
        /// <c>TOK.DATA_CHARS</c>, <c>TOK.DATA_NEWLINE</c>, or whatever
        /// <c>scanRef</c> / <c>scanPercent</c> return for a leading <c>&amp;</c> / <c>%</c>.
        /// </returns>
        /// <exception cref="EmptyTokenException">The range is empty.</exception>
        /// <exception cref="PartialCharException">The input ended in the middle of a multi-byte character.</exception>
        /// <exception cref="ExtensibleTokenException">The range consists of a single carriage return, which might be followed by a line feed.</exception>
        public TOK tokenizeEntityValue(byte[] buf, int off, int end, Token token)
        {
            if (minBPC > 1)
            {
                end = adjustEnd(off, end);
            }

            if (off == end)
            {
                throw new EmptyTokenException();
            }

            int start = off;
            while (off != end)
            {
                int kind = byteType(buf, off);
                switch (kind)
                {
                    case BT_AMP:
                    case BT_PERCNT:
                    case BT_LF:
                    case BT_CR:
                        if (off != start)
                        {
                            // A delimiter after some data ends the data run; the
                            // delimiter is left for the next call.
                            token.TokenEnd = off;
                            return TOK.DATA_CHARS;
                        }

                        if (kind == BT_AMP)
                        {
                            return scanRef(buf, off + minBPC, end, token);
                        }

                        if (kind == BT_PERCNT)
                        {
                            return scanPercent(buf, off + minBPC, end, token);
                        }

                        if (kind == BT_LF)
                        {
                            token.TokenEnd = off + minBPC;
                            return TOK.DATA_NEWLINE;
                        }

                        // Leading CR: a following LF belongs to the same newline
                        // token, so the next character is needed to decide.
                        off += minBPC;
                        if (off == end)
                        {
                            throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                        }

                        if (byteType(buf, off) == BT_LF)
                        {
                            off += minBPC;
                        }

                        token.TokenEnd = off;
                        return TOK.DATA_NEWLINE;
                    case BT_LEAD2:
                    case BT_LEAD3:
                    case BT_LEAD4:
                        // Multi-byte character: only its completeness is checked here.
                        int width = kind == BT_LEAD2 ? 2 : (kind == BT_LEAD3 ? 3 : 4);
                        if (end - off < width)
                        {
                            throw new PartialCharException(off);
                        }

                        off += width;
                        break;
                    default:
                        off += minBPC;
                        break;
                }
            }

            // The whole range was ordinary data.
            token.TokenEnd = off;
            return TOK.DATA_CHARS;
        }
Example #15
0
        /// <summary>
        /// Chooses the encoding to start with by looking at the first one or two
        /// bytes of the input, and sets <c>token.TokenEnd</c> past a byte order
        /// mark when one is present.
        /// </summary>
        /// <param name="buf">Buffer holding the raw input.</param>
        /// <param name="off">Offset of the first byte.</param>
        /// <param name="end">Offset at which the available input ends (exclusive).</param>
        /// <param name="token">Receives <c>TokenEnd</c>: <c>off + 2</c> after a byte order mark, otherwise <c>off</c>.</param>
        /// <returns>
        /// The UTF-16 big- or little-endian encoding when the first two bytes
        /// indicate it, <c>null</c> when the only available byte is greater than
        /// 127, and the UTF-8 encoding in every other case.
        /// </returns>
        public static Encoding getInitialEncoding(byte[] buf, int off, int end, Token token)
        {
            token.TokenEnd = off;

            int available = end - off;
            if (available == 1)
            {
                if (buf[off] > 127)
                {
                    return null;
                }
            }
            else if (available != 0)
            {
                // Combine the first two bytes into one 16-bit value.
                int b0 = buf[off] & 0xFF;
                int b1 = buf[off + 1] & 0xFF;
                int prefix = (b0 << 8) | b1;

                if (prefix == 0xFEFF)
                {
                    // Big-endian byte order mark: skip it.
                    token.TokenEnd = off + 2;
                    return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
                }

                if (prefix == '<')
                {
                    /* 0x00 '<' without a mark: not legal; but not a fatal error */
                    return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
                }

                if (prefix == 0xFFFE)
                {
                    // Little-endian byte order mark: skip it.
                    token.TokenEnd = off + 2;
                    return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
                }

                if (prefix == ('<' << 8))
                {
                    /* '<' 0x00 without a mark: not legal; but not a fatal error */
                    return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
                }
            }

            return getEncoding(UTF8_ENCODING);
        }
Example #16
0
        /* off points to character following "&#" */

        /// <summary>
        /// Scans a numeric character reference whose <c>&amp;#</c> prefix has
        /// already been consumed. A leading <c>x</c> hands over to
        /// <c>scanHexCharRef</c>; otherwise decimal digits are read up to the
        /// terminating <c>;</c>.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset of the first character after <c>&amp;#</c>.</param>
        /// <param name="end">Offset at which the available input ends (exclusive).</param>
        /// <param name="token">Receives <c>TokenEnd</c> (just past the <c>;</c>) and is handed to <c>setRefChar</c>.</param>
        /// <returns>The value returned by <c>setRefChar</c> for the decoded number, or by <c>scanHexCharRef</c> for the hexadecimal form.</returns>
        /// <exception cref="InvalidTokenException">
        /// A character that is neither a decimal digit nor (after the first digit)
        /// <c>;</c> was found, or the accumulated value reached 0x110000 or more.
        /// </exception>
        /// <exception cref="PartialTokenException">The input ended before the terminating <c>;</c>.</exception>
        private TOK scanCharRef(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
            {
                throw new PartialTokenException();
            }

            int ch = byteToAscii(buf, off);
            if (ch == 'x')
            {
                return scanHexCharRef(buf, off + minBPC, end, token);
            }

            // At least one decimal digit is required.
            if (ch < '0' || ch > '9')
            {
                throw new InvalidTokenException(off);
            }

            int value = ch - '0';
            off += minBPC;
            while (off != end)
            {
                ch = byteToAscii(buf, off);
                if (ch == ';')
                {
                    token.TokenEnd = off + minBPC;
                    return setRefChar(value, token);
                }

                if (ch < '0' || ch > '9')
                {
                    throw new InvalidTokenException(off);
                }

                // Checking the limit after every digit keeps the value small
                // enough that the next multiplication cannot overflow.
                value = value*10 + (ch - '0');
                if (value >= 0x110000)
                {
                    throw new InvalidTokenException(off);
                }

                off += minBPC;
            }

            throw new PartialTokenException();
        }
Example #17
0
        /// <summary>
        /// Scans the first token inside a CDATA section: the closing
        /// <c>]]&gt;</c>, a newline, or a run of data characters whose end is
        /// determined by <c>extendCdata</c>.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded input.</param>
        /// <param name="off">Offset at which scanning starts.</param>
        /// <param name="end">Offset at which the available input ends (exclusive); passed through <c>adjustEnd</c> first when <c>minBPC</c> is greater than 1.</param>
        /// <param name="token">Receives <c>TokenEnd</c>.</param>
        /// <returns><c>TOK.CDATA_SECT_CLOSE</c>, <c>TOK.DATA_NEWLINE</c> or <c>TOK.DATA_CHARS</c>.</returns>
        /// <exception cref="EmptyTokenException">The range is empty.</exception>
        /// <exception cref="PartialTokenException">The input ended after <c>]</c> or <c>]]</c>, so it cannot yet be told whether the section closes.</exception>
        /// <exception cref="ExtensibleTokenException">The range consists of a single carriage return, which might be followed by a line feed.</exception>
        /// <exception cref="InvalidTokenException">The first byte is classified as BT_NONXML or BT_MALFORM.</exception>
        /// <exception cref="PartialCharException">The input ended in the middle of a multi-byte character.</exception>
        public TOK tokenizeCdataSection(byte[] buf, int off, int end, Token token)
        {
            if (minBPC > 1)
            {
                end = adjustEnd(off, end);
            }

            if (off == end)
            {
                throw new EmptyTokenException();
            }

            int kind = byteType(buf, off);

            if (kind == BT_LF)
            {
                token.TokenEnd = off + minBPC;
                return TOK.DATA_NEWLINE;
            }

            if (kind == BT_CR)
            {
                // A following LF belongs to the same newline token.
                off += minBPC;
                if (off == end)
                {
                    throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                }

                if (byteType(buf, off) == BT_LF)
                {
                    off += minBPC;
                }

                token.TokenEnd = off;
                return TOK.DATA_NEWLINE;
            }

            if (kind == BT_NONXML || kind == BT_MALFORM)
            {
                throw new InvalidTokenException(off);
            }

            // Everything below consumes the first character(s) of a data run and
            // then lets extendCdata find where the run stops.
            if (kind == BT_RSQB)
            {
                off += minBPC;
                if (off == end)
                {
                    throw new PartialTokenException();
                }

                if (charMatches(buf, off, ']'))
                {
                    off += minBPC;
                    if (off == end)
                    {
                        throw new PartialTokenException();
                    }

                    if (charMatches(buf, off, '>'))
                    {
                        token.TokenEnd = off + minBPC;
                        return TOK.CDATA_SECT_CLOSE;
                    }

                    // "]]" without '>': only the first ']' is consumed as data;
                    // step back so the second ']' is examined by extendCdata.
                    off -= minBPC;
                }
            }
            else if (kind == BT_LEAD2)
            {
                if (end - off < 2)
                {
                    throw new PartialCharException(off);
                }

                check2(buf, off);
                off += 2;
            }
            else if (kind == BT_LEAD3)
            {
                if (end - off < 3)
                {
                    throw new PartialCharException(off);
                }

                check3(buf, off);
                off += 3;
            }
            else if (kind == BT_LEAD4)
            {
                if (end - off < 4)
                {
                    throw new PartialCharException(off);
                }

                check4(buf, off);
                off += 4;
            }
            else
            {
                off += minBPC;
            }

            token.TokenEnd = extendCdata(buf, off, end);
            return TOK.DATA_CHARS;
        }
Example #18
0
        /// <summary>
        /// </summary>
        /// <param name="buf"> </param>
        /// <param name="off"> </param>
        /// <param name="end"> </param>
        /// <param name="token"> </param>
        /// <returns> </returns>
        /// <exception cref="EmptyTokenException"></exception>
        /// <exception cref="PartialTokenException"></exception>
        /// <exception cref="EndOfPrologException"></exception>
        /// <exception cref="InvalidTokenException"></exception>
        /// <exception cref="ExtensibleTokenException"></exception>
        /// <exception cref="PartialCharException"></exception>
        public TOK tokenizeProlog(byte[] buf, int off, int end, Token token)
        {
            TOK tok;
            if (minBPC > 1)
            {
                end = adjustEnd(off, end);
            }

            if (off == end)
            {
                throw new EmptyTokenException();
            }

            switch (byteType(buf, off))
            {
                case BT_QUOT:
                    return scanLit(BT_QUOT, buf, off + minBPC, end, token);
                case BT_APOS:
                    return scanLit(BT_APOS, buf, off + minBPC, end, token);
                case BT_LT:
                    {
                        off += minBPC;
                        if (off == end)
                        {
                            throw new PartialTokenException();
                        }

                        switch (byteType(buf, off))
                        {
                            case BT_EXCL:
                                return scanDecl(buf, off + minBPC, end, token);
                            case BT_QUEST:
                                return scanPi(buf, off + minBPC, end, token);
                            case BT_NMSTRT:
                            case BT_LEAD2:
                            case BT_LEAD3:
                            case BT_LEAD4:
                                token.TokenEnd = off - minBPC;
                                throw new EndOfPrologException();
                        }

                        throw new InvalidTokenException(off);
                    }

                case BT_CR:
                    if (off + minBPC == end)
                    {
                        throw new ExtensibleTokenException(TOK.PROLOG_S);
                    }

                    /* fall through */
                    goto case BT_S;
                case BT_S:
                case BT_LF:
                    for (;;)
                    {
                        off += minBPC;
                        if (off == end)
                        {
                            break;
                        }

                        switch (byteType(buf, off))
                        {
                            case BT_S:
                            case BT_LF:
                                break;
                            case BT_CR:

                                /* don't split CR/LF pair */
                                if (off + minBPC != end)
                                {
                                    break;
                                }

                                /* fall through */
                                goto default;
                            default:
                                token.TokenEnd = off;
                                return TOK.PROLOG_S;
                        }
                    }

                    token.TokenEnd = off;
                    return TOK.PROLOG_S;
                case BT_PERCNT:
                    return scanPercent(buf, off + minBPC, end, token);
                case BT_COMMA:
                    token.TokenEnd = off + minBPC;
                    return TOK.COMMA;
                case BT_LSQB:
                    token.TokenEnd = off + minBPC;
                    return TOK.OPEN_BRACKET;
                case BT_RSQB:
                    off += minBPC;
                    if (off == end)
                    {
                        throw new ExtensibleTokenException(TOK.CLOSE_BRACKET);
                    }

                    if (charMatches(buf, off, ']'))
                    {
                        if (off + minBPC == end)
                        {
                            throw new PartialTokenException();
                        }

                        if (charMatches(buf, off + minBPC, '>'))
                        {
                            token.TokenEnd = off + 2*minBPC;
                            return TOK.COND_SECT_CLOSE;
                        }
                    }

                    token.TokenEnd = off;
                    return TOK.CLOSE_BRACKET;
                case BT_LPAR:
                    token.TokenEnd = off + minBPC;
                    return TOK.OPEN_PAREN;
                case BT_RPAR:
                    off += minBPC;
                    if (off == end)
                    {
                        throw new ExtensibleTokenException(TOK.CLOSE_PAREN);
                    }

                    switch (byteType(buf, off))
                    {
                        case BT_AST:
                            token.TokenEnd = off + minBPC;
                            return TOK.CLOSE_PAREN_ASTERISK;
                        case BT_QUEST:
                            token.TokenEnd = off + minBPC;
                            return TOK.CLOSE_PAREN_QUESTION;
                        case BT_PLUS:
                            token.TokenEnd = off + minBPC;
                            return TOK.CLOSE_PAREN_PLUS;
                        case BT_CR:
                        case BT_LF:
                        case BT_S:
                        case BT_GT:
                        case BT_COMMA:
                        case BT_VERBAR:
                        case BT_RPAR:
                            token.TokenEnd = off;
                            return TOK.CLOSE_PAREN;
                    }

                    throw new InvalidTokenException(off);
                case BT_VERBAR:
                    token.TokenEnd = off + minBPC;
                    return TOK.OR;
                case BT_GT:
                    token.TokenEnd = off + minBPC;
                    return TOK.DECL_CLOSE;
                case BT_NUM:
                    return scanPoundName(buf, off + minBPC, end, token);
                case BT_LEAD2:
                    if (end - off < 2)
                    {
                        throw new PartialCharException(off);
                    }

                    switch (byteType2(buf, off))
                    {
                        case BT_NMSTRT:
                            off += 2;
                            tok = TOK.NAME;
                            break;
                        case BT_NAME:
                            off += 2;
                            tok = TOK.NMTOKEN;
                            break;
                        default:
                            throw new InvalidTokenException(off);
                    }

                    break;
                case BT_LEAD3:
                    if (end - off < 3)
                    {
                        throw new PartialCharException(off);
                    }

                    switch (byteType3(buf, off))
                    {
                        case BT_NMSTRT:
                            off += 3;
                            tok = TOK.NAME;
                            break;
                        case BT_NAME:
                            off += 3;
                            tok = TOK.NMTOKEN;
                            break;
                        default:
                            throw new InvalidTokenException(off);
                    }

                    break;
                case BT_LEAD4:
                    if (end - off < 4)
                    {
                        throw new PartialCharException(off);
                    }

                    switch (byteType4(buf, off))
                    {
                        case BT_NMSTRT:
                            off += 4;
                            tok = TOK.NAME;
                            break;
                        case BT_NAME:
                            off += 4;
                            tok = TOK.NMTOKEN;
                            break;
                        default:
                            throw new InvalidTokenException(off);
                    }

                    break;
                case BT_NMSTRT:
                    tok = TOK.NAME;
                    off += minBPC;
                    break;
                case BT_NAME:
                case BT_MINUS:
                    tok = TOK.NMTOKEN;
                    off += minBPC;
                    break;
                default:
                    throw new InvalidTokenException(off);
            }

            while (off != end)
            {
                switch (byteType(buf, off))
                {
                    case BT_NMSTRT:
                    case BT_NAME:
                    case BT_MINUS:
                        off += minBPC;
                        break;
                    case BT_LEAD2:
                        if (end - off < 2)
                        {
                            throw new PartialCharException(off);
                        }

                        if (!isNameChar2(buf, off))
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += 2;
                        break;
                    case BT_LEAD3:
                        if (end - off < 3)
                        {
                            throw new PartialCharException(off);
                        }

                        if (!isNameChar3(buf, off))
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += 3;
                        break;
                    case BT_LEAD4:
                        if (end - off < 4)
                        {
                            throw new PartialCharException(off);
                        }

                        if (!isNameChar4(buf, off))
                        {
                            throw new InvalidTokenException(off);
                        }

                        off += 4;
                        break;
                    case BT_GT:
                    case BT_RPAR:
                    case BT_COMMA:
                    case BT_VERBAR:
                    case BT_LSQB:
                    case BT_PERCNT:
                    case BT_S:
                    case BT_CR:
                    case BT_LF:
                        token.TokenEnd = off;
                        return tok;
                    case BT_PLUS:
                        if (tok != TOK.NAME)
                        {
                            throw new InvalidTokenException(off);
                        }

                        token.TokenEnd = off + minBPC;
                        return TOK.NAME_PLUS;
                    case BT_AST:
                        if (tok != TOK.NAME)
                        {
                            throw new InvalidTokenException(off);
                        }

                        token.TokenEnd = off + minBPC;
                        return TOK.NAME_ASTERISK;
                    case BT_QUEST:
                        if (tok != TOK.NAME)
                        {
                            throw new InvalidTokenException(off);
                        }

                        token.TokenEnd = off + minBPC;
                        return TOK.NAME_QUESTION;
                    default:
                        throw new InvalidTokenException(off);
                }
            }

            throw new ExtensibleTokenException(tok);
        }