Esempio n. 1
0
 /*
  * Stores the character(s) for a numeric character reference in token and
  * returns the matching token code.  The caller guarantees num < 0x110000.
  *
  * Values below 0x10000 are stored as a single char in token.RefChar1
  * (TOK.CHAR_REF) unless charTypeTable classifies them as BT_NONXML,
  * BT_LEAD4 or BT_MALFORM, in which case InvalidTokenException is thrown at
  * token.TokenEnd - minBPC.  Larger values are split into a UTF-16
  * surrogate pair in token.RefChar1 / token.RefChar2 (TOK.CHAR_PAIR_REF).
  */
 private TOK setRefChar(int num, Token token)
 {
     if (num >= 0x10000)
     {
         // Supplementary code point: high surrogate carries the upper
         // bits of the offset, low surrogate the lower ten bits.
         int offset = num - 0x10000;
         token.RefChar1 = (char)((offset >> 10) + 0xD800);
         token.RefChar2 = (char)((offset & ((1 << 10) - 1)) + 0xDC00);
         return TOK.CHAR_PAIR_REF;
     }

     // Look the code point up by page (high byte) and index (low byte).
     switch (charTypeTable[num >> 8][num & 0xFF])
     {
         case BT_MALFORM:
         case BT_LEAD4:
         case BT_NONXML:
             throw new InvalidTokenException(token.TokenEnd - minBPC);
     }

     token.RefChar1 = (char)num;
     return TOK.CHAR_REF;
 }
Esempio n. 2
0
 /*
  * Scans a TOK.POUND_NAME token.  off addresses the first character of the
  * name (presumably the one following "#" -- confirm against the caller).
  *
  * The first character must be a name-start character and every following
  * one a name character.  The token stops in front of the first BT_CR,
  * BT_LF, BT_S, BT_RPAR, BT_GT, BT_PERCNT or BT_VERBAR byte, which is where
  * token.TokenEnd is set.
  *
  * Throws PartialTokenException on an empty range, PartialCharException on
  * a truncated multi-byte character, InvalidTokenException on any other
  * character, and ExtensibleTokenException(TOK.POUND_NAME) when the range
  * ends while still inside the name.
  */
 private TOK scanPoundName(byte[] buf, int off, int end, Token token)
 {
     if (off == end)
         throw new PartialTokenException();

     // Bytes left in the range, and the width of the character that was
     // just accepted (added to off after each switch).
     int avail = end - off;
     int width;

     // First character: must be a name-start character.
     switch (byteType(buf, off))
     {
         case BT_LEAD4:
             if (avail < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 4;
             break;
         case BT_LEAD3:
             if (avail < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 3;
             break;
         case BT_LEAD2:
             if (avail < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 2;
             break;
         case BT_NMSTRT:
             width = minBPC;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     off += width;

     // Remaining characters: name characters up to a terminator.
     while (off != end)
     {
         avail = end - off;
         switch (byteType(buf, off))
         {
             case BT_VERBAR:
             case BT_PERCNT:
             case BT_GT:
             case BT_RPAR:
             case BT_S:
             case BT_LF:
             case BT_CR:
                 // Terminator is not consumed.
                 token.TokenEnd = off;
                 return TOK.POUND_NAME;
             case BT_LEAD4:
                 if (avail < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 width = 4;
                 break;
             case BT_LEAD3:
                 if (avail < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 width = 3;
                 break;
             case BT_LEAD2:
                 if (avail < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 width = 2;
                 break;
             case BT_MINUS:
             case BT_NAME:
             case BT_NMSTRT:
                 width = minBPC;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
         off += width;
     }

     // Ran out of input inside the name: more name characters may follow.
     throw new ExtensibleTokenException(TOK.POUND_NAME);
 }
Esempio n. 3
0
 /*
  * Scans a reference; off points to the character following "&".
  *
  * Returns TOK.MAGIC_ENTITY_REF when isMagicEntityRef recognises the text,
  * delegates to scanCharRef when the next byte type is BT_NUM, and
  * otherwise scans a name terminated by BT_SEMI and returns TOK.ENTITY_REF
  * with token.NameEnd at the terminator and token.TokenEnd just past it.
  *
  * Throws PartialTokenException when the range ends before the reference
  * is complete, PartialCharException on a truncated multi-byte character
  * and InvalidTokenException on any character that cannot appear here.
  */
 private TOK scanRef(byte[] buf, int off, int end, Token token)
 {
     if (off == end)
         throw new PartialTokenException();
     if (isMagicEntityRef(buf, off, end, token))
         return TOK.MAGIC_ENTITY_REF;

     // Bytes left in the range, and the width of the character that was
     // just accepted (added to off after each switch).
     int avail = end - off;
     int width;

     // First character: "#" starts a character reference, anything else
     // must be a name-start character.
     switch (byteType(buf, off))
     {
         case BT_NUM:
             return scanCharRef(buf, off + minBPC, end, token);
         case BT_LEAD4:
             if (avail < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 4;
             break;
         case BT_LEAD3:
             if (avail < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 3;
             break;
         case BT_LEAD2:
             if (avail < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 2;
             break;
         case BT_NMSTRT:
             width = minBPC;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     off += width;

     // Remaining characters: name characters up to the terminating BT_SEMI.
     while (off != end)
     {
         avail = end - off;
         switch (byteType(buf, off))
         {
             case BT_SEMI:
                 token.NameEnd = off;
                 token.TokenEnd = off + minBPC;
                 return TOK.ENTITY_REF;
             case BT_LEAD4:
                 if (avail < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 width = 4;
                 break;
             case BT_LEAD3:
                 if (avail < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 width = 3;
                 break;
             case BT_LEAD2:
                 if (avail < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 width = 2;
                 break;
             case BT_MINUS:
             case BT_NAME:
             case BT_NMSTRT:
                 width = minBPC;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
         off += width;
     }

     // No terminator seen before the end of the range.
     throw new PartialTokenException();
 }
Esempio n. 4
0
 /*
  * Scans the token that starts with "%"; off points to the character
  * following the "%".
  *
  * If that character has byte type BT_S, BT_LF, BT_CR or BT_PERCNT the
  * token is a bare TOK.PERCENT ending at off.  Otherwise a name terminated
  * by BT_SEMI is required and TOK.PARAM_ENTITY_REF is returned with
  * token.NameEnd at the terminator and token.TokenEnd just past it.
  *
  * Throws PartialTokenException when the range ends too early,
  * PartialCharException on a truncated multi-byte character and
  * InvalidTokenException on any character that cannot appear here.
  */
 private TOK scanPercent(byte[] buf, int off, int end, Token token)
 {
     if (off == end)
         throw new PartialTokenException();

     // Bytes left in the range, and the width of the character that was
     // just accepted (added to off after each switch).
     int avail = end - off;
     int width;

     switch (byteType(buf, off))
     {
         case BT_PERCNT:
         case BT_CR:
         case BT_LF:
         case BT_S:
             // A "%" standing on its own; the following character is not
             // consumed.
             token.TokenEnd = off;
             return TOK.PERCENT;
         case BT_LEAD4:
             if (avail < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 4;
             break;
         case BT_LEAD3:
             if (avail < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 3;
             break;
         case BT_LEAD2:
             if (avail < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 2;
             break;
         case BT_NMSTRT:
             width = minBPC;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     off += width;

     // Remaining characters: name characters up to the terminating BT_SEMI.
     while (off != end)
     {
         avail = end - off;
         switch (byteType(buf, off))
         {
             case BT_SEMI:
                 token.NameEnd = off;
                 token.TokenEnd = off + minBPC;
                 return TOK.PARAM_ENTITY_REF;
             case BT_LEAD4:
                 if (avail < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 width = 4;
                 break;
             case BT_LEAD3:
                 if (avail < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 width = 3;
                 break;
             case BT_LEAD2:
                 if (avail < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 width = 2;
                 break;
             case BT_MINUS:
             case BT_NAME:
             case BT_NMSTRT:
                 width = minBPC;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
         off += width;
     }

     // No terminator seen before the end of the range.
     throw new PartialTokenException();
 }
Esempio n. 5
0
 /*
  * Scans a processing instruction; off points to the character following
  * "<?".
  *
  * The target name is scanned first.  It is followed either directly by
  * "?>" or by a BT_S / BT_CR / BT_LF byte and arbitrary content up to the
  * first "?>".  Returns TOK.XML_DECL when targetIsXml reports true for the
  * target and TOK.PI otherwise; token.NameEnd is set to the end of the
  * target and token.TokenEnd just past the closing ">".
  *
  * Throws PartialTokenException when the range ends before "?>",
  * PartialCharException on a truncated multi-byte character and
  * InvalidTokenException on a character that is not allowed.
  */
 private TOK scanPi(byte[] buf, int off, int end, Token token)
 {
     // Remember where the target name starts for the targetIsXml check.
     int target = off;
     if (off == end)
         throw new PartialTokenException();
     // First character of the target must be a name-start character.
     switch (byteType(buf, off))
     {
         case BT_NMSTRT:
             off += minBPC;
             break;
         case BT_LEAD2:
             if (end - off < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 2;
             break;
         case BT_LEAD3:
             if (end - off < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 3;
             break;
         case BT_LEAD4:
             if (end - off < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 4;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     // Rest of the target name, up to whitespace or "?".
     while (off != end)
     {
         switch (byteType(buf, off))
         {
             case BT_NMSTRT:
             case BT_NAME:
             case BT_MINUS:
                 off += minBPC;
                 break;
             case BT_LEAD2:
                 if (end - off < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 off += 2;
                 break;
             case BT_LEAD3:
                 if (end - off < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 off += 3;
                 break;
             case BT_LEAD4:
                 if (end - off < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 off += 4;
                 break;
             case BT_S:
             case BT_CR:
             case BT_LF:
                 // Target is complete; classify it before scanning the
                 // instruction content.
                 bool isXml = targetIsXml(buf, target, off);
                 token.NameEnd = off;
                 off += minBPC;
                 // Skip content until the first "?>".
                 while (off != end)
                 {
                     switch (byteType(buf, off))
                     {
                         case BT_LEAD2:
                             if (end - off < 2)
                                 throw new PartialCharException(off);
                             // check2/3/4 are defined elsewhere; presumably
                             // they validate the multi-byte character.
                             check2(buf, off);
                             off += 2;
                             break;
                         case BT_LEAD3:
                             if (end - off < 3)
                                 throw new PartialCharException(off);
                             check3(buf, off);
                             off += 3;
                             break;
                         case BT_LEAD4:
                             if (end - off < 4)
                                 throw new PartialCharException(off);
                             check4(buf, off);
                             off += 4;
                             break;
                         case BT_NONXML:
                         case BT_MALFORM:
                             throw new InvalidTokenException(off);
                         case BT_QUEST:
                             off += minBPC;
                             if (off == end)
                                 throw new PartialTokenException();
                             if (charMatches(buf, off, '>'))
                             {
                                 token.TokenEnd = off + minBPC;
                                 if (isXml)
                                     return TOK.XML_DECL;
                                 else
                                     return TOK.PI;
                             }
                             // Not "?>": the character after "?" is looked
                             // at again on the next iteration.
                             break;
                         default:
                             off += minBPC;
                             break;
                     }
                 }
                 throw new PartialTokenException();
             case BT_QUEST:
                 // Target immediately followed by "?": the next character
                 // has to be ">".
                 token.NameEnd = off;
                 off += minBPC;
                 if (off == end)
                     throw new PartialTokenException();
                 checkCharMatches(buf, off, '>');
                 token.TokenEnd = off + minBPC;
                 return (targetIsXml(buf, target, token.NameEnd)
                     ? TOK.XML_DECL
                     : TOK.PI);
             default:
                 throw new InvalidTokenException(off);
         }
     }
     throw new PartialTokenException();
 }
Esempio n. 6
0
 /**
  * Scans the first token of a byte subarray that contains part of a
  * literal entity value.  The opening and closing delimiters are not
  * included in the subarray.
  * Returns one of the following token codes according to the type of
  * token that the subarray starts with:
  * <ul>
  * <li><code>TOK.DATA_CHARS</code></li>
  * <li><code>TOK.DATA_NEWLINE</code></li>
  * <li><code>TOK.PARAM_ENTITY_REF</code></li>
  * <li><code>TOK.MAGIC_ENTITY_REF</code></li>
  * <li><code>TOK.ENTITY_REF</code></li>
  * <li><code>TOK.CHAR_REF</code></li>
  * <li><code>TOK.CHAR_PAIR_REF</code></li>
  * </ul>
  * A run of ordinary data ends in front of the first "&amp;", "%", LF or
  * CR; those characters start a token of their own only when they are the
  * first character of the subarray.
  * @exception EmptyTokenException if the subarray is empty
  * @exception PartialTokenException if the subarray contains only part of
  * a legal token
  * @exception InvalidTokenException if the subarray does not start
  * with a legal token or part of one
  * @exception ExtensibleTokenException if the subarray encodes just a carriage
  * return ('\r')
  * @see #TOK.DATA_CHARS
  * @see #TOK.DATA_NEWLINE
  * @see #TOK.MAGIC_ENTITY_REF
  * @see #TOK.ENTITY_REF
  * @see #TOK.PARAM_ENTITY_REF
  * @see #TOK.CHAR_REF
  * @see #TOK.CHAR_PAIR_REF
  * @see Token
  * @see EmptyTokenException
  * @see PartialTokenException
  * @see InvalidTokenException
  * @see ExtensibleTokenException
  */
 public TOK tokenizeEntityValue(byte[] buf, int off, int end,
     Token token)
 {
     if (minBPC > 1)
         end = adjustEnd(off, end);
     if (off == end)
         throw new EmptyTokenException();

     // off stays at the start of the token; pos is the scanning cursor.
     int pos = off;
     while (pos != end)
     {
         bool atStart = (pos == off);
         switch (byteType(buf, pos))
         {
             case BT_AMP:
                 if (!atStart)
                 {
                     token.TokenEnd = pos;
                     return TOK.DATA_CHARS;
                 }
                 return scanRef(buf, pos + minBPC, end, token);
             case BT_PERCNT:
                 if (!atStart)
                 {
                     token.TokenEnd = pos;
                     return TOK.DATA_CHARS;
                 }
                 return scanPercent(buf, pos + minBPC, end, token);
             case BT_LF:
                 if (!atStart)
                 {
                     token.TokenEnd = pos;
                     return TOK.DATA_CHARS;
                 }
                 token.TokenEnd = pos + minBPC;
                 return TOK.DATA_NEWLINE;
             case BT_CR:
                 if (!atStart)
                 {
                     token.TokenEnd = pos;
                     return TOK.DATA_CHARS;
                 }
                 pos += minBPC;
                 // A lone CR at the end of the range might still be
                 // followed by LF in later input.
                 if (pos == end)
                     throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                 // CR LF counts as a single newline token.
                 if (byteType(buf, pos) == BT_LF)
                     pos += minBPC;
                 token.TokenEnd = pos;
                 return TOK.DATA_NEWLINE;
             case BT_LEAD4:
                 if (end - pos < 4)
                     throw new PartialCharException(pos);
                 pos += 4;
                 break;
             case BT_LEAD3:
                 if (end - pos < 3)
                     throw new PartialCharException(pos);
                 pos += 3;
                 break;
             case BT_LEAD2:
                 if (end - pos < 2)
                     throw new PartialCharException(pos);
                 pos += 2;
                 break;
             default:
                 pos += minBPC;
                 break;
         }
     }

     // The whole range was ordinary data.
     token.TokenEnd = pos;
     return TOK.DATA_CHARS;
 }
Esempio n. 7
0
 /*
  * Scans a quoted literal.  open is the byte type of the opening quote
  * (BT_QUOT or BT_APOS); off addresses the first byte to examine (callers
  * in this file pass the position just after the opening quote).
  *
  * Returns TOK.LITERAL with token.TokenEnd just past the closing quote,
  * provided the byte after that quote has type BT_S, BT_CR, BT_LF, BT_GT,
  * BT_PERCNT or BT_LSQB.
  *
  * Throws PartialTokenException when the range ends inside the literal or
  * inside a multi-byte character, InvalidTokenException on a BT_NONXML or
  * BT_MALFORM byte or on an unexpected byte after the closing quote, and
  * ExtensibleTokenException(TOK.LITERAL) when the closing quote is the last
  * thing in the range.
  */
 private TOK scanLit(int open, byte[] buf, int off, int end, Token token)
 {
     while (off != end)
     {
         int t = byteType(buf, off);
         switch (t)
         {
             case BT_LEAD2:
                 if (end - off < 2)
                     throw new PartialTokenException();
                 // check2/3/4 are defined elsewhere; presumably they
                 // validate the multi-byte character.
                 check2(buf, off);
                 off += 2;
                 break;
             case BT_LEAD3:
                 if (end - off < 3)
                     throw new PartialTokenException();
                 check3(buf, off);
                 off += 3;
                 break;
             case BT_LEAD4:
                 if (end - off < 4)
                     throw new PartialTokenException();
                 check4(buf, off);
                 off += 4;
                 break;
             case BT_NONXML:
             case BT_MALFORM:
                 throw new InvalidTokenException(off);
             case BT_QUOT:
             case BT_APOS:
                 off += minBPC;
                 // The other kind of quote is ordinary literal content.
                 if (t != open)
                     break;
                 // Closing quote at the very end: the following byte cannot
                 // be checked yet.
                 if (off == end)
                     throw new ExtensibleTokenException(TOK.LITERAL);
             // Every branch of this switch returns or throws, so control
             // never runs on into the default section below.
             switch (byteType(buf, off))
             {
                 case BT_S:
                 case BT_CR:
                 case BT_LF:
                 case BT_GT:
                 case BT_PERCNT:
                 case BT_LSQB:
                     token.TokenEnd = off;
                     return TOK.LITERAL;
                 default:
                     throw new InvalidTokenException(off);
             }
             default:
                 off += minBPC;
                 break;
         }
     }
     throw new PartialTokenException();
 }
Esempio n. 8
0
 /*
  * Scans the start of a markup declaration; off points to the character
  * following "<!".
  *
  * BT_MINUS hands over to scanComment, BT_LSQB yields TOK.COND_SECT_OPEN
  * (token.TokenEnd just past the bracket), and a BT_NMSTRT byte starts a
  * keyword made of BT_NMSTRT bytes only.  The keyword ends in front of a
  * BT_S, BT_CR, BT_LF or BT_PERCNT byte and TOK.DECL_OPEN is returned with
  * token.TokenEnd at that byte.
  *
  * Throws PartialTokenException when the range ends too early and
  * InvalidTokenException on any other byte.
  */
 private TOK scanDecl(byte[] buf, int off, int end, Token token)
 {
     if (off == end)
         throw new PartialTokenException();
     switch (byteType(buf, off))
     {
         case BT_MINUS:
             return scanComment(buf, off + minBPC, end, token);
         case BT_LSQB:
             token.TokenEnd = off + minBPC;
             return TOK.COND_SECT_OPEN;
         case BT_NMSTRT:
             off += minBPC;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     // Consume the rest of the declaration keyword.
     while (off != end)
     {
         switch (byteType(buf, off))
         {
             case BT_PERCNT:
                 // Need one more character to decide whether this "%" may
                 // end the keyword.
                 if (off + minBPC == end)
                     throw new PartialTokenException();
                 /* don't allow <!ENTITY% foo "whatever"> */
             switch (byteType(buf, off + minBPC))
             {
                 case BT_S:
                 case BT_CR:
                 case BT_LF:
                 case BT_PERCNT:
                     throw new InvalidTokenException(off);
             }
                 /* fall through */
                 // C# has no implicit fall-through, hence the explicit jump.
                 goto case BT_S;
             case BT_S:
             case BT_CR:
             case BT_LF:
                 // The terminating character is not part of the token.
                 token.TokenEnd = off;
                 return TOK.DECL_OPEN;
             case BT_NMSTRT:
                 off += minBPC;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
     }
     throw new PartialTokenException();
 }
Esempio n. 9
0
 /*
  * Scans an end tag; off points to the character following "</".
  *
  * The element name is scanned and token.NameEnd set to its end.  The name
  * may be followed by any number of BT_S / BT_CR / BT_LF bytes before the
  * closing BT_GT.  Returns TOK.END_TAG with token.TokenEnd just past the
  * BT_GT byte.
  *
  * Throws PartialTokenException when the range ends before the closing
  * byte, PartialCharException on a truncated multi-byte character and
  * InvalidTokenException on any character that cannot appear here.
  */
 private TOK scanEndTag(byte[] buf, int off, int end, Token token)
 {
     if (off == end)
         throw new PartialTokenException();

     // Bytes left in the range, and the width of the character that was
     // just accepted (added to off after each switch).
     int avail = end - off;
     int width;

     // First character: must be a name-start character.
     switch (byteType(buf, off))
     {
         case BT_LEAD4:
             if (avail < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 4;
             break;
         case BT_LEAD3:
             if (avail < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 3;
             break;
         case BT_LEAD2:
             if (avail < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             width = 2;
             break;
         case BT_NMSTRT:
             width = minBPC;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     off += width;

     // Remaining name characters, then optional whitespace and the
     // closing byte.
     while (off != end)
     {
         avail = end - off;
         switch (byteType(buf, off))
         {
             case BT_GT:
                 token.NameEnd = off;
                 token.TokenEnd = off + minBPC;
                 return TOK.END_TAG;
             case BT_S:
             case BT_CR:
             case BT_LF:
                 token.NameEnd = off;
                 off += minBPC;
                 // Only whitespace may sit between the name and the
                 // closing byte.
                 while (off != end)
                 {
                     switch (byteType(buf, off))
                     {
                         case BT_GT:
                             token.TokenEnd = off + minBPC;
                             return TOK.END_TAG;
                         case BT_LF:
                         case BT_CR:
                         case BT_S:
                             off += minBPC;
                             break;
                         default:
                             throw new InvalidTokenException(off);
                     }
                 }
                 throw new PartialTokenException();
             case BT_LEAD4:
                 if (avail < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 width = 4;
                 break;
             case BT_LEAD3:
                 if (avail < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 width = 3;
                 break;
             case BT_LEAD2:
                 if (avail < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 width = 2;
                 break;
             case BT_MINUS:
             case BT_NAME:
             case BT_NMSTRT:
                 width = minBPC;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
         off += width;
     }

     // Range ended inside the name.
     throw new PartialTokenException();
 }
Esempio n. 10
0
        /*
         * Scans a numeric character reference; off points to the character
         * following "&#".
         *
         * An 'x' hands over to scanHexCharRef.  Otherwise one or more decimal
         * digits followed by ';' are required; the accumulated value is
         * passed to setRefChar, whose token code is returned, and
         * token.TokenEnd is set just past the ';'.
         *
         * Throws InvalidTokenException on any other character or as soon as
         * the value reaches 0x110000, and PartialTokenException when the
         * range ends before the ';'.
         */
        private TOK scanCharRef(byte[] buf, int off, int end, Token token)
        {
            if (off == end)
                throw new PartialTokenException();

            int c = byteToAscii(buf, off);
            if (c == 'x')
                return scanHexCharRef(buf, off + minBPC, end, token);
            if (c < '0' || c > '9')
                throw new InvalidTokenException(off);

            // Value accumulated from the digits seen so far.
            int num = c - '0';
            for (off += minBPC; off != end; off += minBPC)
            {
                c = byteToAscii(buf, off);
                if (c == ';')
                {
                    token.TokenEnd = off + minBPC;
                    return setRefChar(num, token);
                }
                if (c < '0' || c > '9')
                    throw new InvalidTokenException(off);

                num = num * 10 + (c - '0');
                // Checking after every digit keeps num small enough that
                // the multiplication above cannot overflow.
                if (num >= 0x110000)
                    throw new InvalidTokenException(off);
            }

            throw new PartialTokenException();
        }
Esempio n. 11
0
 /*
  * Scans a comment; off points to the character following "<!-".
  *
  * The character at off must be the second '-' of the opening delimiter.
  * Content is then consumed up to the first "--", which must be followed
  * by '>'.  Returns TOK.COMMENT with token.TokenEnd just past that '>'.
  *
  * Throws PartialTokenException when the range ends before the comment is
  * closed, PartialCharException on a truncated multi-byte character and
  * InvalidTokenException on a BT_NONXML or BT_MALFORM byte.  The
  * checkCharMatches helper is defined elsewhere; presumably it throws when
  * the expected character is absent.
  */
 private TOK scanComment(byte[] buf, int off, int end, Token token)
 {
     if (off == end)
         throw new PartialTokenException();

     checkCharMatches(buf, off, '-');
     off += minBPC;

     while (off != end)
     {
         switch (byteType(buf, off))
         {
             case BT_MINUS:
                 off += minBPC;
                 if (off == end)
                     throw new PartialTokenException();
                 // A single '-' is ordinary content; the character after
                 // it is examined again on the next iteration.
                 if (!charMatches(buf, off, '-'))
                     break;
                 off += minBPC;
                 if (off == end)
                     throw new PartialTokenException();
                 // "--" inside a comment has to be the closing delimiter.
                 checkCharMatches(buf, off, '>');
                 token.TokenEnd = off + minBPC;
                 return TOK.COMMENT;
             case BT_MALFORM:
             case BT_NONXML:
                 throw new InvalidTokenException(off);
             case BT_LEAD4:
                 if (end - off < 4)
                     throw new PartialCharException(off);
                 check4(buf, off);
                 off += 4;
                 break;
             case BT_LEAD3:
                 if (end - off < 3)
                     throw new PartialCharException(off);
                 check3(buf, off);
                 off += 3;
                 break;
             case BT_LEAD2:
                 if (end - off < 2)
                     throw new PartialCharException(off);
                 check2(buf, off);
                 off += 2;
                 break;
             default:
                 off += minBPC;
                 break;
         }
     }

     throw new PartialTokenException();
 }
Esempio n. 12
0
 /*
  * Matches the CDATA keyword (the characters held in CDATA, i.e. "CDATA[")
  * starting at off and returns TOK.CDATA_SECT_OPEN with token.TokenEnd just
  * past the keyword.
  *
  * Throws PartialTokenException when fewer than CDATA.Length characters
  * remain in the range.  Each character is compared with checkCharMatches,
  * which is defined elsewhere; presumably it throws on a mismatch.
  */
 private TOK scanCdataSection(byte[] buf, int off, int end, Token token)
 {
     // Derive the required byte count from CDATA itself so this guard and
     // the loop below can never disagree about the keyword length.
     if (end - off < CDATA.Length * minBPC)
         throw new PartialTokenException();
     for (int i = 0; i < CDATA.Length; i++, off += minBPC)
         checkCharMatches(buf, off, CDATA[i]);
     token.TokenEnd = off;
     return TOK.CDATA_SECT_OPEN;
 }
Esempio n. 13
0
 /*
  * Tests whether the text at off is one of the five predefined entity
  * references with the leading "&" already consumed: "amp;", "apos;",
  * "lt;", "gt;" or "quot;".
  *
  * On a match token.TokenEnd is set just past the ';', token.RefChar1
  * receives the character the entity stands for, and true is returned.
  * Otherwise false is returned and token is left untouched.  A reference
  * that does not fit completely between off and end is not a match.
  */
 private bool isMagicEntityRef(byte[] buf, int off, int end, Token token)
 {
     int avail = end - off;
     int first = byteToAscii(buf, off);

     if (first == 'a')
     {
         // Both "amp;" and "apos;" need at least four characters.
         if (avail < minBPC*4)
             return false;

         int second = byteToAscii(buf, off + minBPC);
         if (second == 'm')
         {
             if (charMatches(buf, off + minBPC*2, 'p')
                 && charMatches(buf, off + minBPC*3, ';'))
             {
                 token.TokenEnd = off + minBPC*4;
                 token.RefChar1 = '&';
                 return true;
             }
         }
         else if (second == 'p')
         {
             if (avail >= minBPC*5
                 && charMatches(buf, off + minBPC*2, 'o')
                 && charMatches(buf, off + minBPC*3, 's')
                 && charMatches(buf, off + minBPC*4, ';'))
             {
                 token.TokenEnd = off + minBPC*5;
                 token.RefChar1 = '\'';
                 return true;
             }
         }
         return false;
     }

     if (first == 'l' || first == 'g')
     {
         // "lt;" and "gt;" differ only in the first letter.
         if (avail >= minBPC*3
             && charMatches(buf, off + minBPC, 't')
             && charMatches(buf, off + minBPC*2, ';'))
         {
             token.TokenEnd = off + minBPC*3;
             token.RefChar1 = (first == 'l') ? '<' : '>';
             return true;
         }
         return false;
     }

     if (first == 'q')
     {
         if (avail >= minBPC*5
             && charMatches(buf, off + minBPC, 'u')
             && charMatches(buf, off + minBPC*2, 'o')
             && charMatches(buf, off + minBPC*3, 't')
             && charMatches(buf, off + minBPC*4, ';'))
         {
             token.TokenEnd = off + minBPC*5;
             token.RefChar1 = '"';
             return true;
         }
     }

     return false;
 }
Esempio n. 14
0
 /**
  * Scans the first token of a byte subarray that contains part of a
  * prolog.
  * Returns one of the following integers according to the type of token
  * that the subarray starts with:
  * <ul>
  * <li><code>TOK.PI</code></li>
  * <li><code>TOK.XML_DECL</code></li>
  * <li><code>TOK.COMMENT</code></li>
  * <li><code>TOK.PARAM_ENTITY_REF</code></li>
  * <li><code>TOK.PROLOG_S</code></li>
  * <li><code>TOK.DECL_OPEN</code></li>
  * <li><code>TOK.DECL_CLOSE</code></li>
  * <li><code>TOK.NAME</code></li>
  * <li><code>TOK.NMTOKEN</code></li>
  * <li><code>TOK.POUND_NAME</code></li>
  * <li><code>TOK.OR</code></li>
  * <li><code>TOK.PERCENT</code></li>
  * <li><code>TOK.OPEN_PAREN</code></li>
  * <li><code>TOK.CLOSE_PAREN</code></li>
  * <li><code>TOK.OPEN_BRACKET</code></li>
  * <li><code>TOK.CLOSE_BRACKET</code></li>
  * <li><code>TOK.LITERAL</code></li>
  * <li><code>TOK.NAME_QUESTION</code></li>
  * <li><code>TOK.NAME_ASTERISK</code></li>
  * <li><code>TOK.NAME_PLUS</code></li>
  * <li><code>TOK.COND_SECT_OPEN</code></li>
  * <li><code>TOK.COND_SECT_CLOSE</code></li>
  * <li><code>TOK.CLOSE_PAREN_QUESTION</code></li>
  * <li><code>TOK.CLOSE_PAREN_ASTERISK</code></li>
  * <li><code>TOK.CLOSE_PAREN_PLUS</code></li>
  * <li><code>TOK.COMMA</code></li>
  * </ul>
  * @exception EmptyTokenException if the subarray is empty
  * @exception PartialTokenException if the subarray contains only part of
  * a legal token
  * @exception InvalidTokenException if the subarrary does not start
  * with a legal token or part of one
  * @exception EndOfPrologException if the subarray starts with the document
  * element; <code>tokenizeContent</code> should be used on the remainder
  * of the entity
  * @exception ExtensibleTokenException if the subarray is a legal token
  * but subsequent bytes in the same entity could be part of the token
  * @see #TOK.PI
  * @see #TOK.XML_DECL
  * @see #TOK.COMMENT
  * @see #TOK.PARAM_ENTITY_REF
  * @see #TOK.PROLOG_S
  * @see #TOK.DECL_OPEN
  * @see #TOK.DECL_CLOSE
  * @see #TOK.NAME
  * @see #TOK.NMTOKEN
  * @see #TOK.POUND_NAME
  * @see #TOK.OR
  * @see #TOK.PERCENT
  * @see #TOK.OPEN_PAREN
  * @see #TOK.CLOSE_PAREN
  * @see #TOK.OPEN_BRACKET
  * @see #TOK.CLOSE_BRACKET
  * @see #TOK.LITERAL
  * @see #TOK.NAME_QUESTION
  * @see #TOK.NAME_ASTERISK
  * @see #TOK.NAME_PLUS
  * @see #TOK.COND_SECT_OPEN
  * @see #TOK.COND_SECT_CLOSE
  * @see #TOK.CLOSE_PAREN_QUESTION
  * @see #TOK.CLOSE_PAREN_ASTERISK
  * @see #TOK.CLOSE_PAREN_PLUS
  * @see #TOK.COMMA
  * @see ContentToken
  * @see EmptyTokenException
  * @see PartialTokenException
  * @see InvalidTokenException
  * @see ExtensibleTokenException
  * @see EndOfPrologException
  */
 public TOK tokenizeProlog(byte[] buf, int off, int end, Token token)
 {
     /* Provisional kind (NAME or NMTOKEN) of a name-like token; chosen from
        the first character and returned by the scanning loop at the bottom. */
     TOK tok;
     /* NOTE(review): presumably trims end so that only whole minBPC-sized
        units are scanned -- confirm against adjustEnd. */
     if (minBPC > 1)
         end = adjustEnd(off, end);
     if (off == end)
         throw new EmptyTokenException();
     /* Dispatch on the class of the first character.  Most cases return or
        throw directly; only the name-like cases break out of the switch and
        continue in the while loop below. */
     switch (byteType(buf, off))
     {
         /* Quoted literal: scanning starts after the opening delimiter. */
         case BT_QUOT:
             return scanLit(BT_QUOT, buf, off + minBPC, end, token);
         case BT_APOS:
             return scanLit(BT_APOS, buf, off + minBPC, end, token);
         case BT_LT:
         {
             /* The character after '<' decides what kind of markup this is. */
             off += minBPC;
             if (off == end)
                 throw new PartialTokenException();
             switch (byteType(buf, off))
             {
                 case BT_EXCL:
                     return scanDecl(buf, off + minBPC, end, token);
                 case BT_QUEST:
                     return scanPi(buf, off + minBPC, end, token);
                 case BT_NMSTRT:
                 case BT_LEAD2:
                 case BT_LEAD3:
                 case BT_LEAD4:
                     /* '<' followed by a possible name start: the prolog is
                        over.  TokenEnd is moved back onto the '<' itself. */
                     token.TokenEnd = off - minBPC;
                     throw new EndOfPrologException();
             }
             throw new InvalidTokenException(off);
         }
         case BT_CR:
             /* A CR that is the last available character might be the first
                half of a CR/LF pair, so the token is reported as extensible. */
             if (off + minBPC == end)
                 throw new ExtensibleTokenException(TOK.PROLOG_S);
             /* fall through */
             goto case BT_S;
         case BT_S:
         case BT_LF:
             /* Consume the whole run of white space as one PROLOG_S token. */
             for (;;)
             {
                 off += minBPC;
                 if (off == end)
                     break;
                 switch (byteType(buf, off))
                 {
                     case BT_S:
                     case BT_LF:
                         break;
                     case BT_CR:
                         /* don't split CR/LF pair */
                         if (off + minBPC != end)
                             break;
                         /* fall through */
                         goto default;
                     default:
                         /* First character that is not part of the run (or a
                            trailing CR): the token ends in front of it. */
                         token.TokenEnd = off;
                         return TOK.PROLOG_S;
                 }
             }
             token.TokenEnd = off;
             return TOK.PROLOG_S;
         case BT_PERCNT:
             return scanPercent(buf, off + minBPC, end, token);
         /* Single-character tokens end right after the character. */
         case BT_COMMA:
             token.TokenEnd = off + minBPC;
             return TOK.COMMA;
         case BT_LSQB:
             token.TokenEnd = off + minBPC;
             return TOK.OPEN_BRACKET;
         case BT_RSQB:
             /* ']' may stand alone or begin "]]>"; off now points at the
                character after the first ']'. */
             off += minBPC;
             if (off == end)
                 throw new ExtensibleTokenException(TOK.CLOSE_BRACKET);
             if (charMatches(buf, off, ']'))
             {
                 if (off + minBPC == end)
                     throw new PartialTokenException();
                 if (charMatches(buf, off + minBPC, '>'))
                 {
                     token.TokenEnd = off + 2*minBPC;
                     return TOK.COND_SECT_CLOSE;
                 }
             }
             /* Not "]]>": only the first ']' belongs to this token. */
             token.TokenEnd = off;
             return TOK.CLOSE_BRACKET;
         case BT_LPAR:
             token.TokenEnd = off + minBPC;
             return TOK.OPEN_PAREN;
         case BT_RPAR:
             /* ')' may carry a '*', '?' or '+' suffix, so the next character
                is needed before the token can be classified. */
             off += minBPC;
             if (off == end)
                 throw new ExtensibleTokenException(TOK.CLOSE_PAREN);
         switch (byteType(buf, off))
         {
             case BT_AST:
                 token.TokenEnd = off + minBPC;
                 return TOK.CLOSE_PAREN_ASTERISK;
             case BT_QUEST:
                 token.TokenEnd = off + minBPC;
                 return TOK.CLOSE_PAREN_QUESTION;
             case BT_PLUS:
                 token.TokenEnd = off + minBPC;
                 return TOK.CLOSE_PAREN_PLUS;
             /* These followers end a plain ')'; they are not consumed. */
             case BT_CR:
             case BT_LF:
             case BT_S:
             case BT_GT:
             case BT_COMMA:
             case BT_VERBAR:
             case BT_RPAR:
                 token.TokenEnd = off;
                 return TOK.CLOSE_PAREN;
         }
             /* Any other character directly after ')' is rejected. */
             throw new InvalidTokenException(off);
         case BT_VERBAR:
             token.TokenEnd = off + minBPC;
             return TOK.OR;
         case BT_GT:
             token.TokenEnd = off + minBPC;
             return TOK.DECL_CLOSE;
         case BT_NUM:
             return scanPoundName(buf, off + minBPC, end, token);
         /* Multi-byte lead bytes: all bytes of the character must be present
            before its class can be looked up.  A name-start character begins
            a NAME, any other name character begins an NMTOKEN. */
         case BT_LEAD2:
             if (end - off < 2)
                 throw new PartialCharException(off);
         switch (byteType2(buf, off))
         {
             case BT_NMSTRT:
                 off += 2;
                 tok = TOK.NAME;
                 break;
             case BT_NAME:
                 off += 2;
                 tok = TOK.NMTOKEN;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
             break;
         case BT_LEAD3:
             if (end - off < 3)
                 throw new PartialCharException(off);
         switch (byteType3(buf, off))
         {
             case BT_NMSTRT:
                 off += 3;
                 tok = TOK.NAME;
                 break;
             case BT_NAME:
                 off += 3;
                 tok = TOK.NMTOKEN;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
             break;
         case BT_LEAD4:
             if (end - off < 4)
                 throw new PartialCharException(off);
         switch (byteType4(buf, off))
         {
             case BT_NMSTRT:
                 off += 4;
                 tok = TOK.NAME;
                 break;
             case BT_NAME:
                 off += 4;
                 tok = TOK.NMTOKEN;
                 break;
             default:
                 throw new InvalidTokenException(off);
         }
             break;
         case BT_NMSTRT:
             tok = TOK.NAME;
             off += minBPC;
             break;
         case BT_NAME:
         case BT_MINUS:
             tok = TOK.NMTOKEN;
             off += minBPC;
             break;
         default:
             throw new InvalidTokenException(off);
     }
     /* Only name-like tokens reach this point: off is just past the first
        character and tok holds NAME or NMTOKEN.  Consume further name
        characters until a delimiter or an occurrence suffix is found. */
     while (off != end)
     {
         switch (byteType(buf, off))
         {
             case BT_NMSTRT:
             case BT_NAME:
             case BT_MINUS:
                 off += minBPC;
                 break;
             case BT_LEAD2:
                 if (end - off < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 off += 2;
                 break;
             case BT_LEAD3:
                 if (end - off < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 off += 3;
                 break;
             case BT_LEAD4:
                 if (end - off < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 off += 4;
                 break;
             /* Delimiters end the token without being consumed. */
             case BT_GT:
             case BT_RPAR:
             case BT_COMMA:
             case BT_VERBAR:
             case BT_LSQB:
             case BT_PERCNT:
             case BT_S:
             case BT_CR:
             case BT_LF:
                 token.TokenEnd = off;
                 return tok;
             /* Occurrence suffixes are consumed as part of the token and are
                accepted only after a NAME, not after an NMTOKEN. */
             case BT_PLUS:
                 if (tok != TOK.NAME)
                     throw new InvalidTokenException(off);
                 token.TokenEnd = off + minBPC;
                 return TOK.NAME_PLUS;
             case BT_AST:
                 if (tok != TOK.NAME)
                     throw new InvalidTokenException(off);
                 token.TokenEnd = off + minBPC;
                 return TOK.NAME_ASTERISK;
             case BT_QUEST:
                 if (tok != TOK.NAME)
                     throw new InvalidTokenException(off);
                 token.TokenEnd = off + minBPC;
                 return TOK.NAME_QUESTION;
             default:
                 throw new InvalidTokenException(off);
         }
     }
     /* Input ran out while still inside the name: what was seen is a legal
        token, but later bytes could extend it. */
     throw new ExtensibleTokenException(tok);
 }
Esempio n. 15
0
 /**
  * Returns an encoding object to be used to start parsing an
  * external entity.  The encoding is chosen based on the
  * initial 4 bytes of the entity.
  *
  * @param buf the byte array containing the initial bytes of
  * the entity
  * @param off the index in <code>buf</code> of the first byte
  * of the entity
  * @param end the index in <code>buf</code> following the last
  * available byte of the entity; <code>end - off</code> must be
  * greater than or equal to 4 unless the entity has fewer than
  * 4 bytes, in which case it must be equal to the length of the
  * entity
  * @param token receives information about the presence of a
  * byte order mark; if the entity starts with a byte order
  * mark then <code>token.TokenEnd</code> is set to
  * <code>off + 2</code>, otherwise it is set to
  * <code>off</code>
  *
  * @see TextDecl
  * @see XmlDecl
  * @see #TOK.XML_DECL
  * @see #getEncoding
  * @see #getInternalEncoding
  */
 public static Encoding getInitialEncoding(byte[] buf, int off, int end,
     Token token)
 {
     /* Unless a byte order mark is recognised below, the token is empty. */
     token.TokenEnd = off;

     int available = end - off;
     if (available == 0)
         return getEncoding(UTF8_ENCODING);

     if (available == 1)
     {
         /* A single byte above 127: no encoding is returned. */
         if (buf[off] > 127)
             return null;
         return getEncoding(UTF8_ENCODING);
     }

     /* Combine the first two bytes, first byte in the high position. */
     int signature = ((buf[off] & 0xFF) << 8) | (buf[off + 1] & 0xFF);

     bool bigEndianMark = (signature == 0xFEFF);
     bool littleEndianMark = (signature == 0xFFFE);

     /* Only a real byte order mark is counted as part of the token. */
     if (bigEndianMark || littleEndianMark)
         token.TokenEnd = off + 2;

     /* A leading '<' encoded in two bytes without a byte order mark is
        not legal, but it is not treated as a fatal error either. */
     if (bigEndianMark || signature == '<')
         return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
     if (littleEndianMark || signature == ('<' << 8))
         return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);

     return getEncoding(UTF8_ENCODING);
 }
Esempio n. 16
0
 /* off points to character following "&#X" */
 private TOK scanHexCharRef(byte[] buf, int off, int end, Token token)
 {
     int num = 0;
     /* The terminating ';' is only accepted after at least one digit. */
     bool sawDigit = false;

     for (; off != end; off += minBPC)
     {
         int c = byteToAscii(buf, off);

         if (sawDigit && c == ';')
         {
             token.TokenEnd = off + minBPC;
             return setRefChar(num, token);
         }

         /* Decode one hexadecimal digit; anything else is invalid here. */
         int digit;
         if (c >= '0' && c <= '9')
             digit = c - '0';
         else if (c >= 'A' && c <= 'F')
             digit = c - 'A' + 10;
         else if (c >= 'a' && c <= 'f')
             digit = c - 'a' + 10;
         else
             throw new InvalidTokenException(off);

         num = (num << 4) + digit;
         /* Checking after every digit keeps num below 0x110000, so it
            cannot overflow however many digits follow. */
         if (num >= 0x110000)
             throw new InvalidTokenException(off);
         sawDigit = true;
     }

     /* Ran out of input before the closing ';'. */
     throw new PartialTokenException();
 }
Esempio n. 17
0
 /**
  * Scans the first token of a byte subarray that contains part of
  * a literal attribute value.  The opening and closing delimiters
  * are not included in the subarray.
  * Returns one of the following integers according to the type of
  * token that the subarray starts with:
  * <ul>
  * <li><code>TOK.DATA_CHARS</code></li>
  * <li><code>TOK.DATA_NEWLINE</code></li>
  * <li><code>TOK.ATTRIBUTE_VALUE_S</code></li>
  * <li><code>TOK.MAGIC_ENTITY_REF</code></li>
  * <li><code>TOK.ENTITY_REF</code></li>
  * <li><code>TOK.CHAR_REF</code></li>
  * <li><code>TOK.CHAR_PAIR_REF</code></li>
  * </ul>
  * @exception EmptyTokenException if the subarray is empty
  * @exception PartialTokenException if the subarray contains only part of
  * a legal token
  * @exception InvalidTokenException if the subarray does not start
  * with a legal token or part of one
  * @exception ExtensibleTokenException if the subarray encodes just a carriage
  * return ('\r')
  * @see #TOK.DATA_CHARS
  * @see #TOK.DATA_NEWLINE
  * @see #TOK.ATTRIBUTE_VALUE_S
  * @see #TOK.MAGIC_ENTITY_REF
  * @see #TOK.ENTITY_REF
  * @see #TOK.CHAR_REF
  * @see #TOK.CHAR_PAIR_REF
  * @see Token
  * @see EmptyTokenException
  * @see PartialTokenException
  * @see InvalidTokenException
  * @see ExtensibleTokenException
  */
 public TOK tokenizeAttributeValue(byte[] buf, int off, int end, Token token)
 {
     if (minBPC > 1)
         end = adjustEnd(off, end);
     if (off == end)
         throw new EmptyTokenException();

     /* off stays on the token start; pos walks forward over the data. */
     int pos = off;
     while (pos != end)
     {
         bool atStart = (pos == off);
         switch (byteType(buf, pos))
         {
             /* Multi-byte characters are skipped whole once all of their
                bytes are available. */
             case BT_LEAD2:
                 if (end - pos < 2)
                     throw new PartialCharException(pos);
                 pos += 2;
                 continue;
             case BT_LEAD3:
                 if (end - pos < 3)
                     throw new PartialCharException(pos);
                 pos += 3;
                 continue;
             case BT_LEAD4:
                 if (end - pos < 4)
                     throw new PartialCharException(pos);
                 pos += 4;
                 continue;
             case BT_LT:
                 /* this is for inside entity references */
                 throw new InvalidTokenException(pos);
             case BT_AMP:
                 if (atStart)
                     return scanRef(buf, pos + minBPC, end, token);
                 break;
             case BT_S:
                 if (atStart)
                 {
                     token.TokenEnd = pos + minBPC;
                     return TOK.ATTRIBUTE_VALUE_S;
                 }
                 break;
             case BT_LF:
                 if (atStart)
                 {
                     token.TokenEnd = pos + minBPC;
                     return TOK.DATA_NEWLINE;
                 }
                 break;
             case BT_CR:
                 if (atStart)
                 {
                     /* A CR with nothing after it may still become CR/LF. */
                     int next = pos + minBPC;
                     if (next == end)
                         throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                     if (byteType(buf, next) == BT_LF)
                         next += minBPC;
                     token.TokenEnd = next;
                     return TOK.DATA_NEWLINE;
                 }
                 break;
             default:
                 pos += minBPC;
                 continue;
         }
         /* A special character after some data: the data run stops in
            front of it and the character is left for the next call. */
         break;
     }
     token.TokenEnd = pos;
     return TOK.DATA_CHARS;
 }
Esempio n. 18
0
 /**
  * Scans the first token of a byte subarray that starts with the
  * content of a CDATA section.
  * Returns one of the following integers according to the type of token
  * that the subarray starts with:
  * <ul>
  * <li><code>TOK.DATA_CHARS</code></li>
  * <li><code>TOK.DATA_NEWLINE</code></li>
  * <li><code>TOK.CDATA_SECT_CLOSE</code></li>
  * </ul>
  * <p>
  * Information about the token is stored in <code>token</code>.
  * </p>
  * After <code>TOK.CDATA_SECT_CLOSE</code> is returned, the application
  * should use <code>tokenizeContent</code>.
  *
  * @exception EmptyTokenException if the subarray is empty
  * @exception PartialTokenException if the subarray contains only part of
  * a legal token
  * @exception InvalidTokenException if the subarray does not start
  * with a legal token or part of one
  * @exception ExtensibleTokenException if the subarray encodes just a carriage
  * return ('\r')
  *
  * @see #TOK.DATA_CHARS
  * @see #TOK.DATA_NEWLINE
  * @see #TOK.CDATA_SECT_CLOSE
  * @see Token
  * @see EmptyTokenException
  * @see PartialTokenException
  * @see InvalidTokenException
  * @see ExtensibleTokenException
  * @see #tokenizeContent
  */
 public TOK tokenizeCdataSection(byte[] buf, int off, int end,
     Token token)
 {
     if (minBPC > 1)
         end = adjustEnd(off, end);
     if (off == end)
         throw new EmptyTokenException();

     switch (byteType(buf, off))
     {
         case BT_RSQB:
             /* Look ahead for "]]>" without moving past the second ']'. */
             off += minBPC;
             if (off == end)
                 throw new PartialTokenException();
             if (charMatches(buf, off, ']'))
             {
                 int afterSecond = off + minBPC;
                 if (afterSecond == end)
                     throw new PartialTokenException();
                 if (charMatches(buf, afterSecond, '>'))
                 {
                     token.TokenEnd = afterSecond + minBPC;
                     return TOK.CDATA_SECT_CLOSE;
                 }
             }
             /* Not a section close: the first ']' is ordinary data. */
             break;

         case BT_CR:
             off += minBPC;
             if (off == end)
                 throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
             /* A directly following LF belongs to the same newline token. */
             token.TokenEnd = (byteType(buf, off) == BT_LF) ? off + minBPC : off;
             return TOK.DATA_NEWLINE;

         case BT_LF:
             token.TokenEnd = off + minBPC;
             return TOK.DATA_NEWLINE;

         case BT_NONXML:
         case BT_MALFORM:
             throw new InvalidTokenException(off);

         /* Multi-byte characters need all of their bytes before they can be
            checked and skipped. */
         case BT_LEAD2:
             if (end - off < 2)
                 throw new PartialCharException(off);
             check2(buf, off);
             off += 2;
             break;
         case BT_LEAD3:
             if (end - off < 3)
                 throw new PartialCharException(off);
             check3(buf, off);
             off += 3;
             break;
         case BT_LEAD4:
             if (end - off < 4)
                 throw new PartialCharException(off);
             check4(buf, off);
             off += 4;
             break;

         default:
             off += minBPC;
             break;
     }

     /* The first character is plain data; extendCdata supplies the end of
        the DATA_CHARS token, scanning from the current offset. */
     token.TokenEnd = extendCdata(buf, off, end);
     return TOK.DATA_CHARS;
 }