/**
 * Checks that the bytes starting at <code>off</code> spell out the
 * <code>CDATA</code> keyword (expected to be "CDATA[" per the original
 * note), comparing one character at a time with
 * <code>checkCharMatches</code>.
 *
 * On success <code>token.TokenEnd</code> is set to the offset just past
 * the keyword and <code>TOK.CDATA_SECT_OPEN</code> is returned.
 *
 * @exception PartialTokenException if fewer bytes than the whole keyword
 * are available between <code>off</code> and <code>end</code>
 */
private TOK scanCdataSection(byte[] buf, int off, int end, Token token)
{
    // Size the guard from the keyword itself instead of a hard-coded 6,
    // so it can never drift out of step with the loop below.
    if (end - off < CDATA.Length * minBPC)
        throw new PartialTokenException();

    for (int i = 0; i < CDATA.Length; i++, off += minBPC)
        checkCharMatches(buf, off, CDATA[i]);

    token.TokenEnd = off;
    return TOK.CDATA_SECT_OPEN;
}
/**
 * Scans the opening of a markup declaration.  On entry <code>off</code>
 * points to the character following "<!".
 *
 * A '-' hands off to <code>scanComment</code>; a '[' yields
 * <code>TOK.COND_SECT_OPEN</code>; a name-start character begins a
 * keyword that is followed until whitespace or an acceptable '%', giving
 * <code>TOK.DECL_OPEN</code> with <code>token.TokenEnd</code> at that
 * terminating character.
 *
 * @exception PartialTokenException if the bytes run out first
 * @exception InvalidTokenException if an unexpected character is met
 */
private TOK scanDecl(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    switch (byteType(buf, off))
    {
        case BT_MINUS:
            return scanComment(buf, off + minBPC, end, token);
        case BT_LSQB:
            token.TokenEnd = off + minBPC;
            return TOK.COND_SECT_OPEN;
        case BT_NMSTRT:
            break;
        default:
            throw new InvalidTokenException(off);
    }

    // The first keyword character was accepted above; step over it and
    // keep consuming keyword characters one at a time.
    for (off += minBPC; off != end; off += minBPC)
    {
        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
                break;

            case BT_PERCNT:
            {
                int following = off + minBPC;
                if (following == end)
                    throw new PartialTokenException();
                /* don't allow <!ENTITY% foo "whatever"> */
                switch (byteType(buf, following))
                {
                    case BT_S:
                    case BT_CR:
                    case BT_LF:
                    case BT_PERCNT:
                        throw new InvalidTokenException(off);
                }
                token.TokenEnd = off;
                return TOK.DECL_OPEN;
            }

            case BT_S:
            case BT_CR:
            case BT_LF:
                token.TokenEnd = off;
                return TOK.DECL_OPEN;

            default:
                throw new InvalidTokenException(off);
        }
    }

    throw new PartialTokenException();
}
/**
 * Scans a processing instruction.  On entry <code>off</code> points to
 * the character following "<?".
 *
 * The target name is read first; <code>token.NameEnd</code> receives the
 * offset where it stops.  The instruction ends at "?>", at which point
 * <code>token.TokenEnd</code> is set just past the '>' and the result is
 * <code>TOK.XML_DECL</code> when <code>targetIsXml</code> says so,
 * otherwise <code>TOK.PI</code>.
 *
 * @exception PartialTokenException if the bytes run out before "?>"
 * @exception PartialCharException if a multi-byte character is cut short
 * @exception InvalidTokenException if an illegal character is met
 */
private TOK scanPi(byte[] buf, int off, int end, Token token)
{
    int targetStart = off;
    if (off == end)
        throw new PartialTokenException();

    // The target must begin with a name-start character.
    switch (byteType(buf, off))
    {
        case BT_NMSTRT:
            off += minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            off += 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            off += 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            off += 4;
            break;
        default:
            throw new InvalidTokenException(off);
    }

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                continue;
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                if (!isNameChar2(buf, off))
                    throw new InvalidTokenException(off);
                off += 2;
                continue;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                if (!isNameChar3(buf, off))
                    throw new InvalidTokenException(off);
                off += 3;
                continue;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                if (!isNameChar4(buf, off))
                    throw new InvalidTokenException(off);
                off += 4;
                continue;

            case BT_QUEST:
                // Target immediately followed by "?>": no instruction body.
                token.NameEnd = off;
                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                checkCharMatches(buf, off, '>');
                token.TokenEnd = off + minBPC;
                if (targetIsXml(buf, targetStart, token.NameEnd))
                    return TOK.XML_DECL;
                return TOK.PI;

            case BT_S:
            case BT_CR:
            case BT_LF:
                // Whitespace ends the target; the body is handled below.
                break;

            default:
                throw new InvalidTokenException(off);
        }

        // Only the whitespace case reaches this point.  The target is
        // classified before NameEnd is recorded, as in the original order.
        bool declaresXml = targetIsXml(buf, targetStart, off);
        token.NameEnd = off;
        off += minBPC;

        while (off != end)
        {
            switch (byteType(buf, off))
            {
                case BT_LEAD2:
                    if (end - off < 2)
                        throw new PartialCharException(off);
                    check2(buf, off);
                    off += 2;
                    break;
                case BT_LEAD3:
                    if (end - off < 3)
                        throw new PartialCharException(off);
                    check3(buf, off);
                    off += 3;
                    break;
                case BT_LEAD4:
                    if (end - off < 4)
                        throw new PartialCharException(off);
                    check4(buf, off);
                    off += 4;
                    break;
                case BT_NONXML:
                case BT_MALFORM:
                    throw new InvalidTokenException(off);
                case BT_QUEST:
                    off += minBPC;
                    if (off == end)
                        throw new PartialTokenException();
                    if (charMatches(buf, off, '>'))
                    {
                        token.TokenEnd = off + minBPC;
                        return declaresXml ? TOK.XML_DECL : TOK.PI;
                    }
                    // A lone '?': re-examine the current character next pass.
                    break;
                default:
                    off += minBPC;
                    break;
            }
        }
        throw new PartialTokenException();
    }

    throw new PartialTokenException();
}
/**
 * Scans the first token of a byte subarray that contains part of a
 * literal entity value.  The opening and closing delimiters are not
 * included in the subarray.
 * Returns one of the following according to the type of token that the
 * subarray starts with:
 * <ul>
 * <li><code>TOK.DATA_CHARS</code></li>
 * <li><code>TOK.DATA_NEWLINE</code></li>
 * <li><code>TOK.PARAM_ENTITY_REF</code></li>
 * <li><code>TOK.MAGIC_ENTITY_REF</code></li>
 * <li><code>TOK.ENTITY_REF</code></li>
 * <li><code>TOK.CHAR_REF</code></li>
 * <li><code>TOK.CHAR_PAIR_REF</code></li>
 * </ul>
 * A run of ordinary data stops just before the next '&', '%', line feed
 * or carriage return; <code>token.TokenEnd</code> marks where it stops.
 *
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception ExtensibleTokenException if the subarray encodes just a carriage
 * return ('\r')
 * @see Token
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 */
public TOK tokenizeEntityValue(byte[] buf, int off, int end, Token token)
{
    if (minBPC > 1)
        end = adjustEnd(off, end);
    if (off == end)
        throw new EmptyTokenException();

    int start = off;
    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                off += 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                off += 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                off += 4;
                break;

            case BT_AMP:
                // A delimiter after some data just closes the data run.
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                return scanRef(buf, off + minBPC, end, token);

            case BT_PERCNT:
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                return scanPercent(buf, off + minBPC, end, token);

            case BT_LF:
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                token.TokenEnd = off + minBPC;
                return TOK.DATA_NEWLINE;

            case BT_CR:
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                off += minBPC;
                // A lone CR might still be followed by LF in later bytes.
                if (off == end)
                    throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                if (byteType(buf, off) == BT_LF)
                    off += minBPC;
                token.TokenEnd = off;
                return TOK.DATA_NEWLINE;

            default:
                off += minBPC;
                break;
        }
    }

    token.TokenEnd = off;
    return TOK.DATA_CHARS;
}
/**
 * Scans a comment.  On entry <code>off</code> points to the character
 * following "<!-", so the next character must be the second '-'.
 *
 * The comment body is consumed until "--", which must be followed by
 * '>'.  On success <code>token.TokenEnd</code> is set just past the '>'
 * and <code>TOK.COMMENT</code> is returned.
 *
 * @exception PartialTokenException if the bytes run out before "-->"
 * @exception PartialCharException if a multi-byte character is cut short
 * @exception InvalidTokenException if an illegal byte type is met
 */
private TOK scanComment(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    checkCharMatches(buf, off, '-');
    off += minBPC;

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                check2(buf, off);
                off += 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                check3(buf, off);
                off += 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                check4(buf, off);
                off += 4;
                break;
            case BT_NONXML:
            case BT_MALFORM:
                throw new InvalidTokenException(off);

            case BT_MINUS:
                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                // A single '-' is ordinary content; the character after
                // it is re-examined on the next pass.
                if (!charMatches(buf, off, '-'))
                    break;
                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                checkCharMatches(buf, off, '>');
                token.TokenEnd = off + minBPC;
                return TOK.COMMENT;

            default:
                off += minBPC;
                break;
        }
    }

    throw new PartialTokenException();
}
/**
 * Scans a character reference.  On entry <code>off</code> points to the
 * character following "&#".
 *
 * An 'x' delegates to <code>scanHexCharRef</code>; otherwise decimal
 * digits are accumulated until ';'.  On ';' <code>token.TokenEnd</code>
 * is set just past it and the result of <code>setRefChar</code> is
 * returned.
 *
 * @exception PartialTokenException if the bytes run out before ';'
 * @exception InvalidTokenException on a non-digit, or once the value
 * reaches 0x110000
 */
private TOK scanCharRef(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    int ch = byteToAscii(buf, off);
    if (ch == 'x')
        return scanHexCharRef(buf, off + minBPC, end, token);
    if (ch < '0' || ch > '9')
        throw new InvalidTokenException(off);

    int value = ch - '0';
    off += minBPC;

    while (off != end)
    {
        ch = byteToAscii(buf, off);
        if (ch == ';')
        {
            token.TokenEnd = off + minBPC;
            return setRefChar(value, token);
        }
        if (ch < '0' || ch > '9')
            throw new InvalidTokenException(off);

        value = value * 10 + (ch - '0');
        // Checking on every digit also keeps the accumulator far from overflow.
        if (value >= 0x110000)
            throw new InvalidTokenException(off);

        off += minBPC;
    }

    throw new PartialTokenException();
}
/**
 * Scans the first token of a byte subarray that contains part of a
 * literal attribute value.  The opening and closing delimiters are not
 * included in the subarray.
 * Returns one of the following according to the type of token that the
 * subarray starts with:
 * <ul>
 * <li><code>TOK.DATA_CHARS</code></li>
 * <li><code>TOK.DATA_NEWLINE</code></li>
 * <li><code>TOK.ATTRIBUTE_VALUE_S</code></li>
 * <li><code>TOK.MAGIC_ENTITY_REF</code></li>
 * <li><code>TOK.ENTITY_REF</code></li>
 * <li><code>TOK.CHAR_REF</code></li>
 * <li><code>TOK.CHAR_PAIR_REF</code></li>
 * </ul>
 * A run of ordinary data stops just before the next '&', space-type
 * character, line feed or carriage return; <code>token.TokenEnd</code>
 * marks where it stops.
 *
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one, or a '<' is encountered
 * @exception ExtensibleTokenException if the subarray encodes just a carriage
 * return ('\r')
 * @see Token
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 */
public TOK tokenizeAttributeValue(byte[] buf, int off, int end, Token token)
{
    if (minBPC > 1)
        end = adjustEnd(off, end);
    if (off == end)
        throw new EmptyTokenException();

    int start = off;
    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                off += 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                off += 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                off += 4;
                break;

            case BT_AMP:
                // A delimiter after some data just closes the data run.
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                return scanRef(buf, off + minBPC, end, token);

            case BT_LT:
                /* this is for inside entity references */
                throw new InvalidTokenException(off);

            case BT_S:
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                token.TokenEnd = off + minBPC;
                return TOK.ATTRIBUTE_VALUE_S;

            case BT_LF:
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                token.TokenEnd = off + minBPC;
                return TOK.DATA_NEWLINE;

            case BT_CR:
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                off += minBPC;
                // A lone CR might still be followed by LF in later bytes.
                if (off == end)
                    throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                if (byteType(buf, off) == BT_LF)
                    off += minBPC;
                token.TokenEnd = off;
                return TOK.DATA_NEWLINE;

            default:
                off += minBPC;
                break;
        }
    }

    token.TokenEnd = off;
    return TOK.DATA_CHARS;
}
/**
 * Scans a quoted literal.  <code>open</code> is the byte type of the
 * opening quote (<code>BT_QUOT</code> or <code>BT_APOS</code>) and
 * <code>off</code> points to the first character after it.
 *
 * The literal ends at the first quote of the same type.  The character
 * after the closing quote must be whitespace, '>', '%' or '['; then
 * <code>token.TokenEnd</code> is set to the offset just past the closing
 * quote and <code>TOK.LITERAL</code> is returned.
 *
 * @exception PartialTokenException if the closing quote is not reached
 * @exception PartialCharException if a multi-byte character is cut short
 * by <code>end</code>
 * @exception InvalidTokenException on an illegal byte type, or an
 * unexpected character after the closing quote
 * @exception ExtensibleTokenException if the closing quote is the last
 * available character, so what follows cannot be checked yet
 */
private TOK scanLit(int open, byte[] buf, int off, int end, Token token)
{
    while (off != end)
    {
        int t = byteType(buf, off);
        switch (t)
        {
            // A truncated multi-byte character is reported with
            // PartialCharException(off), as every other scanner in this
            // class does, so the caller learns where the incomplete
            // character starts.  NOTE(review): assumes PartialCharException
            // derives from PartialTokenException; confirm against its
            // declaration.
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                check2(buf, off);
                off += 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                check3(buf, off);
                off += 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                check4(buf, off);
                off += 4;
                break;

            case BT_NONXML:
            case BT_MALFORM:
                throw new InvalidTokenException(off);

            case BT_QUOT:
            case BT_APOS:
                off += minBPC;
                // The other kind of quote is ordinary literal content.
                if (t != open)
                    break;
                if (off == end)
                    throw new ExtensibleTokenException(TOK.LITERAL);
                switch (byteType(buf, off))
                {
                    case BT_S:
                    case BT_CR:
                    case BT_LF:
                    case BT_GT:
                    case BT_PERCNT:
                    case BT_LSQB:
                        token.TokenEnd = off;
                        return TOK.LITERAL;
                    default:
                        throw new InvalidTokenException(off);
                }

            default:
                off += minBPC;
                break;
        }
    }

    throw new PartialTokenException();
}
/**
 * Returns an encoding object to be used to start parsing an external
 * entity.  The choice is made from the first two available bytes:
 * <ul>
 * <li>0xFE 0xFF (byte order mark) or 0x00 '<' selects
 * <code>UTF16_BIG_ENDIAN_ENCODING</code>;</li>
 * <li>0xFF 0xFE (byte order mark) or '<' 0x00 selects
 * <code>UTF16_LITTLE_ENDIAN_ENCODING</code>;</li>
 * <li>anything else, including an empty entity, selects
 * <code>UTF8_ENCODING</code>.</li>
 * </ul>
 * When exactly one byte is available and its value is above 127,
 * <code>null</code> is returned instead.
 *
 * @param buf the byte array containing the initial bytes of the entity
 * @param off the index in <code>buf</code> of the first byte of the
 * entity
 * @param end the index in <code>buf</code> following the last available
 * byte of the entity; <code>end - off</code> must be greater than or
 * equal to 4 unless the entity has fewer than 4 bytes, in which case it
 * must be equal to the length of the entity
 * @param token receives information about the presence of a byte order
 * mark; if the entity starts with a byte order mark then
 * <code>token.TokenEnd</code> is set to <code>off + 2</code>, otherwise
 * to <code>off</code>
 *
 * @see TextDecl
 * @see XmlDecl
 * @see #TOK.XML_DECL
 * @see #getEncoding
 * @see #getInternalEncoding
 */
public static Encoding getInitialEncoding(byte[] buf, int off, int end, Token token)
{
    token.TokenEnd = off;

    int available = end - off;
    if (available == 0)
        return getEncoding(UTF8_ENCODING);

    if (available == 1)
    {
        if (buf[off] > 127)
            return null;
        return getEncoding(UTF8_ENCODING);
    }

    int b0 = buf[off] & 0xFF;
    int b1 = buf[off + 1] & 0xFF;
    int signature = (b0 << 8) | b1;

    if (signature == 0xFEFF)
    {
        // Big-endian byte order mark: skip over it.
        token.TokenEnd = off + 2;
        return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
    }
    if (signature == '<')
    {
        /* not legal; but not a fatal error */
        return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
    }
    if (signature == 0xFFFE)
    {
        // Little-endian byte order mark: skip over it.
        token.TokenEnd = off + 2;
        return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
    }
    if (signature == ('<' << 8))
    {
        /* not legal; but not a fatal error */
        return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
    }

    return getEncoding(UTF8_ENCODING);
}
/**
 * Scans what follows a '%'.  On entry <code>off</code> points to the
 * character following "%".
 *
 * If that character is whitespace or another '%', the '%' stands alone:
 * <code>token.TokenEnd</code> is set to <code>off</code> and
 * <code>TOK.PERCENT</code> is returned.  Otherwise a name terminated by
 * ';' is required; <code>token.NameEnd</code> is set at the ';',
 * <code>token.TokenEnd</code> just past it, and
 * <code>TOK.PARAM_ENTITY_REF</code> is returned.
 *
 * @exception PartialTokenException if the bytes run out first
 * @exception PartialCharException if a multi-byte character is cut short
 * @exception InvalidTokenException if an illegal character is met
 */
private TOK scanPercent(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    int width;  // bytes occupied by the character just accepted

    switch (byteType(buf, off))
    {
        case BT_NMSTRT:
            width = minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 4;
            break;
        case BT_S:
        case BT_LF:
        case BT_CR:
        case BT_PERCNT:
            token.TokenEnd = off;
            return TOK.PERCENT;
        default:
            throw new InvalidTokenException(off);
    }
    off += width;

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                width = minBPC;
                break;
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                if (!isNameChar2(buf, off))
                    throw new InvalidTokenException(off);
                width = 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                if (!isNameChar3(buf, off))
                    throw new InvalidTokenException(off);
                width = 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                if (!isNameChar4(buf, off))
                    throw new InvalidTokenException(off);
                width = 4;
                break;
            case BT_SEMI:
                token.NameEnd = off;
                token.TokenEnd = off + minBPC;
                return TOK.PARAM_ENTITY_REF;
            default:
                throw new InvalidTokenException(off);
        }
        off += width;
    }

    throw new PartialTokenException();
}
/**
 * Scans the name of a '#'-prefixed keyword.  <code>tokenizeProlog</code>
 * calls this with <code>off</code> pointing just past the '#'.
 *
 * A name is read until whitespace, ')', '>', '%' or '|';
 * <code>token.TokenEnd</code> is set at that terminator and
 * <code>TOK.POUND_NAME</code> is returned.
 *
 * @exception PartialTokenException if nothing follows the '#'
 * @exception PartialCharException if a multi-byte character is cut short
 * @exception InvalidTokenException if an illegal character is met
 * @exception ExtensibleTokenException if the bytes run out while still
 * inside the name
 */
private TOK scanPoundName(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    int width;  // bytes occupied by the character just accepted

    switch (byteType(buf, off))
    {
        case BT_NMSTRT:
            width = minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 4;
            break;
        default:
            throw new InvalidTokenException(off);
    }
    off += width;

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                width = minBPC;
                break;
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                if (!isNameChar2(buf, off))
                    throw new InvalidTokenException(off);
                width = 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                if (!isNameChar3(buf, off))
                    throw new InvalidTokenException(off);
                width = 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                if (!isNameChar4(buf, off))
                    throw new InvalidTokenException(off);
                width = 4;
                break;
            case BT_CR:
            case BT_LF:
            case BT_S:
            case BT_RPAR:
            case BT_GT:
            case BT_PERCNT:
            case BT_VERBAR:
                token.TokenEnd = off;
                return TOK.POUND_NAME;
            default:
                throw new InvalidTokenException(off);
        }
        off += width;
    }

    // The name may continue in bytes that have not arrived yet.
    throw new ExtensibleTokenException(TOK.POUND_NAME);
}
/**
 * Scans a reference.  On entry <code>off</code> points to the character
 * following "&".
 *
 * <code>isMagicEntityRef</code> is tried first and, when it succeeds,
 * <code>TOK.MAGIC_ENTITY_REF</code> is returned.  A '#' delegates to
 * <code>scanCharRef</code>.  Otherwise a name terminated by ';' is
 * required; <code>token.NameEnd</code> is set at the ';',
 * <code>token.TokenEnd</code> just past it, and
 * <code>TOK.ENTITY_REF</code> is returned.
 *
 * @exception PartialTokenException if the bytes run out first
 * @exception PartialCharException if a multi-byte character is cut short
 * @exception InvalidTokenException if an illegal character is met
 */
private TOK scanRef(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();
    if (isMagicEntityRef(buf, off, end, token))
        return TOK.MAGIC_ENTITY_REF;

    int width;  // bytes occupied by the character just accepted

    switch (byteType(buf, off))
    {
        case BT_NUM:
            return scanCharRef(buf, off + minBPC, end, token);
        case BT_NMSTRT:
            width = minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 4;
            break;
        default:
            throw new InvalidTokenException(off);
    }
    off += width;

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                width = minBPC;
                break;
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                if (!isNameChar2(buf, off))
                    throw new InvalidTokenException(off);
                width = 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                if (!isNameChar3(buf, off))
                    throw new InvalidTokenException(off);
                width = 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                if (!isNameChar4(buf, off))
                    throw new InvalidTokenException(off);
                width = 4;
                break;
            case BT_SEMI:
                token.NameEnd = off;
                token.TokenEnd = off + minBPC;
                return TOK.ENTITY_REF;
            default:
                throw new InvalidTokenException(off);
        }
        off += width;
    }

    throw new PartialTokenException();
}
/**
 * Tests whether the bytes at <code>off</code> (just after an '&') spell
 * one of the five built-in entity names followed by ';': amp, apos, lt,
 * gt or quot.
 *
 * On a match <code>token.TokenEnd</code> is set just past the ';',
 * <code>token.RefChar1</code> receives the character the entity stands
 * for, and <code>true</code> is returned.  Otherwise <code>token</code>
 * is left untouched and <code>false</code> is returned.
 *
 * The caller must guarantee <code>off != end</code>; each further
 * character is read only after the available length has been checked.
 */
private bool isMagicEntityRef(byte[] buf, int off, int end, Token token)
{
    int available = end - off;
    int second = off + minBPC;
    int third = off + minBPC * 2;
    int fourth = off + minBPC * 3;
    int fifth = off + minBPC * 4;

    int first = byteToAscii(buf, off);
    switch (first)
    {
        case 'a':
            // Both "amp;" and "apos;" need at least four characters.
            if (available < minBPC * 4)
                return false;
            switch (byteToAscii(buf, second))
            {
                case 'm':
                    if (charMatches(buf, third, 'p')
                        && charMatches(buf, fourth, ';'))
                    {
                        token.TokenEnd = off + minBPC * 4;
                        token.RefChar1 = '&';
                        return true;
                    }
                    break;
                case 'p':
                    if (available >= minBPC * 5
                        && charMatches(buf, third, 'o')
                        && charMatches(buf, fourth, 's')
                        && charMatches(buf, fifth, ';'))
                    {
                        token.TokenEnd = off + minBPC * 5;
                        token.RefChar1 = '\'';
                        return true;
                    }
                    break;
            }
            return false;

        case 'l':
        case 'g':
            // "lt;" and "gt;" differ only in their first letter.
            if (available >= minBPC * 3
                && charMatches(buf, second, 't')
                && charMatches(buf, third, ';'))
            {
                token.TokenEnd = off + minBPC * 3;
                token.RefChar1 = (first == 'l') ? '<' : '>';
                return true;
            }
            return false;

        case 'q':
            if (available >= minBPC * 5
                && charMatches(buf, second, 'u')
                && charMatches(buf, third, 'o')
                && charMatches(buf, fourth, 't')
                && charMatches(buf, fifth, ';'))
            {
                token.TokenEnd = off + minBPC * 5;
                token.RefChar1 = '"';
                return true;
            }
            return false;
    }

    return false;
}
/**
 * Stores the character denoted by a character reference into
 * <code>token</code> and returns the matching token code.
 * <code>num</code> is known to be below 0x110000.
 *
 * Values below 0x10000 are looked up in <code>charTypeTable</code> and
 * rejected when classified as <code>BT_NONXML</code>,
 * <code>BT_LEAD4</code> or <code>BT_MALFORM</code>; otherwise they go
 * into <code>token.RefChar1</code> and <code>TOK.CHAR_REF</code> is
 * returned.  Larger values are split into a UTF-16 surrogate pair in
 * <code>token.RefChar1</code> / <code>token.RefChar2</code> and
 * <code>TOK.CHAR_PAIR_REF</code> is returned.
 *
 * @exception InvalidTokenException if the character is rejected; the
 * reported offset is <code>token.TokenEnd - minBPC</code>
 */
private TOK setRefChar(int num, Token token)
{
    if (num >= 0x10000)
    {
        int supplementary = num - 0x10000;
        // High surrogate takes the upper 10 bits, low surrogate the lower 10.
        token.RefChar1 = (char)((supplementary >> 10) + 0xD800);
        token.RefChar2 = (char)((supplementary & 0x3FF) + 0xDC00);
        return TOK.CHAR_PAIR_REF;
    }

    switch (charTypeTable[num >> 8][num & 0xFF])
    {
        case BT_NONXML:
        case BT_LEAD4:
        case BT_MALFORM:
            throw new InvalidTokenException(token.TokenEnd - minBPC);
    }

    token.RefChar1 = (char)num;
    return TOK.CHAR_REF;
}
/**
 * Scans the first token of a byte subarray that starts with the
 * content of a CDATA section.
 * Returns one of the following according to the type of token that the
 * subarray starts with:
 * <ul>
 * <li><code>TOK.DATA_CHARS</code></li>
 * <li><code>TOK.DATA_NEWLINE</code></li>
 * <li><code>TOK.CDATA_SECT_CLOSE</code></li>
 * </ul>
 * <p>
 * Information about the token is stored in <code>token</code>.
 * </p>
 * After <code>TOK.CDATA_SECT_CLOSE</code> is returned, the application
 * should use <code>tokenizeContent</code>.
 *
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception ExtensibleTokenException if the subarray encodes just a carriage
 * return ('\r')
 *
 * @see Token
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 * @see #tokenizeContent
 */
public TOK tokenizeCdataSection(byte[] buf, int off, int end, Token token)
{
    if (minBPC > 1)
        end = adjustEnd(off, end);
    if (off == end)
        throw new EmptyTokenException();

    switch (byteType(buf, off))
    {
        case BT_RSQB:
            off += minBPC;
            if (off == end)
                throw new PartialTokenException();
            if (charMatches(buf, off, ']'))
            {
                int third = off + minBPC;
                if (third == end)
                    throw new PartialTokenException();
                if (charMatches(buf, third, '>'))
                {
                    token.TokenEnd = third + minBPC;
                    return TOK.CDATA_SECT_CLOSE;
                }
            }
            // Not "]]>": only the first ']' is consumed as data, and the
            // data run continues from the character after it.
            break;

        case BT_CR:
            off += minBPC;
            // A lone CR might still be followed by LF in later bytes.
            if (off == end)
                throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
            if (byteType(buf, off) == BT_LF)
                off += minBPC;
            token.TokenEnd = off;
            return TOK.DATA_NEWLINE;

        case BT_LF:
            token.TokenEnd = off + minBPC;
            return TOK.DATA_NEWLINE;

        case BT_NONXML:
        case BT_MALFORM:
            throw new InvalidTokenException(off);

        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            check2(buf, off);
            off += 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            check3(buf, off);
            off += 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            check4(buf, off);
            off += 4;
            break;

        default:
            off += minBPC;
            break;
    }

    token.TokenEnd = extendCdata(buf, off, end);
    return TOK.DATA_CHARS;
}
/**
 * Scans the first token of a byte subarray that contains part of a
 * prolog.
 * Returns one of the following according to the type of token that the
 * subarray starts with:
 * <ul>
 * <li><code>TOK.PI</code></li>
 * <li><code>TOK.XML_DECL</code></li>
 * <li><code>TOK.COMMENT</code></li>
 * <li><code>TOK.PARAM_ENTITY_REF</code></li>
 * <li><code>TOK.PROLOG_S</code></li>
 * <li><code>TOK.DECL_OPEN</code></li>
 * <li><code>TOK.DECL_CLOSE</code></li>
 * <li><code>TOK.NAME</code></li>
 * <li><code>TOK.NMTOKEN</code></li>
 * <li><code>TOK.POUND_NAME</code></li>
 * <li><code>TOK.OR</code></li>
 * <li><code>TOK.PERCENT</code></li>
 * <li><code>TOK.OPEN_PAREN</code></li>
 * <li><code>TOK.CLOSE_PAREN</code></li>
 * <li><code>TOK.OPEN_BRACKET</code></li>
 * <li><code>TOK.CLOSE_BRACKET</code></li>
 * <li><code>TOK.LITERAL</code></li>
 * <li><code>TOK.NAME_QUESTION</code></li>
 * <li><code>TOK.NAME_ASTERISK</code></li>
 * <li><code>TOK.NAME_PLUS</code></li>
 * <li><code>TOK.COND_SECT_OPEN</code></li>
 * <li><code>TOK.COND_SECT_CLOSE</code></li>
 * <li><code>TOK.CLOSE_PAREN_QUESTION</code></li>
 * <li><code>TOK.CLOSE_PAREN_ASTERISK</code></li>
 * <li><code>TOK.CLOSE_PAREN_PLUS</code></li>
 * <li><code>TOK.COMMA</code></li>
 * </ul>
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception EndOfPrologException if the subarray starts with the document
 * element; <code>tokenizeContent</code> should be used on the remainder
 * of the entity
 * @exception ExtensibleTokenException if the subarray is a legal token
 * but subsequent bytes in the same entity could be part of the token
 * @see ContentToken
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 * @see EndOfPrologException
 */
public TOK tokenizeProlog(byte[] buf, int off, int end, Token token)
{
    TOK tok;
    if (minBPC > 1)
        end = adjustEnd(off, end);
    if (off == end)
        throw new EmptyTokenException();

    int first = byteType(buf, off);
    switch (first)
    {
        case BT_QUOT:
        case BT_APOS:
            // The opening quote's own byte type tells scanLit which
            // quote closes the literal.
            return scanLit(first, buf, off + minBPC, end, token);

        case BT_LT:
        {
            int next = off + minBPC;
            if (next == end)
                throw new PartialTokenException();
            switch (byteType(buf, next))
            {
                case BT_EXCL:
                    return scanDecl(buf, next + minBPC, end, token);
                case BT_QUEST:
                    return scanPi(buf, next + minBPC, end, token);
                case BT_NMSTRT:
                case BT_LEAD2:
                case BT_LEAD3:
                case BT_LEAD4:
                    // A start tag: the prolog ends at the '<' itself.
                    token.TokenEnd = off;
                    throw new EndOfPrologException();
            }
            throw new InvalidTokenException(next);
        }

        case BT_CR:
        case BT_S:
        case BT_LF:
            // A CR that is the last available byte may be half of a CR/LF pair.
            if (first == BT_CR && off + minBPC == end)
                throw new ExtensibleTokenException(TOK.PROLOG_S);
            off += minBPC;
            while (off != end)
            {
                int wsType = byteType(buf, off);
                bool plainSpace = wsType == BT_S || wsType == BT_LF;
                /* don't split CR/LF pair */
                bool interiorCr = wsType == BT_CR && off + minBPC != end;
                if (!plainSpace && !interiorCr)
                    break;
                off += minBPC;
            }
            token.TokenEnd = off;
            return TOK.PROLOG_S;

        case BT_PERCNT:
            return scanPercent(buf, off + minBPC, end, token);

        case BT_COMMA:
            token.TokenEnd = off + minBPC;
            return TOK.COMMA;

        case BT_LSQB:
            token.TokenEnd = off + minBPC;
            return TOK.OPEN_BRACKET;

        case BT_RSQB:
            off += minBPC;
            if (off == end)
                throw new ExtensibleTokenException(TOK.CLOSE_BRACKET);
            if (charMatches(buf, off, ']'))
            {
                if (off + minBPC == end)
                    throw new PartialTokenException();
                if (charMatches(buf, off + minBPC, '>'))
                {
                    token.TokenEnd = off + 2 * minBPC;
                    return TOK.COND_SECT_CLOSE;
                }
            }
            token.TokenEnd = off;
            return TOK.CLOSE_BRACKET;

        case BT_LPAR:
            token.TokenEnd = off + minBPC;
            return TOK.OPEN_PAREN;

        case BT_RPAR:
            off += minBPC;
            if (off == end)
                throw new ExtensibleTokenException(TOK.CLOSE_PAREN);
            switch (byteType(buf, off))
            {
                case BT_AST:
                    token.TokenEnd = off + minBPC;
                    return TOK.CLOSE_PAREN_ASTERISK;
                case BT_QUEST:
                    token.TokenEnd = off + minBPC;
                    return TOK.CLOSE_PAREN_QUESTION;
                case BT_PLUS:
                    token.TokenEnd = off + minBPC;
                    return TOK.CLOSE_PAREN_PLUS;
                case BT_CR:
                case BT_LF:
                case BT_S:
                case BT_GT:
                case BT_COMMA:
                case BT_VERBAR:
                case BT_RPAR:
                    token.TokenEnd = off;
                    return TOK.CLOSE_PAREN;
            }
            throw new InvalidTokenException(off);

        case BT_VERBAR:
            token.TokenEnd = off + minBPC;
            return TOK.OR;

        case BT_GT:
            token.TokenEnd = off + minBPC;
            return TOK.DECL_CLOSE;

        case BT_NUM:
            return scanPoundName(buf, off + minBPC, end, token);

        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            switch (byteType2(buf, off))
            {
                case BT_NMSTRT:
                    tok = TOK.NAME;
                    break;
                case BT_NAME:
                    tok = TOK.NMTOKEN;
                    break;
                default:
                    throw new InvalidTokenException(off);
            }
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            switch (byteType3(buf, off))
            {
                case BT_NMSTRT:
                    tok = TOK.NAME;
                    break;
                case BT_NAME:
                    tok = TOK.NMTOKEN;
                    break;
                default:
                    throw new InvalidTokenException(off);
            }
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            switch (byteType4(buf, off))
            {
                case BT_NMSTRT:
                    tok = TOK.NAME;
                    break;
                case BT_NAME:
                    tok = TOK.NMTOKEN;
                    break;
                default:
                    throw new InvalidTokenException(off);
            }
            off += 4;
            break;

        case BT_NMSTRT:
            tok = TOK.NAME;
            off += minBPC;
            break;

        case BT_NAME:
        case BT_MINUS:
            tok = TOK.NMTOKEN;
            off += minBPC;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    // Only the name / name-token cases fall out of the switch above;
    // consume the remaining name characters.
    while (off != end)
    {
        int t = byteType(buf, off);
        switch (t)
        {
            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                if (!isNameChar2(buf, off))
                    throw new InvalidTokenException(off);
                off += 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                if (!isNameChar3(buf, off))
                    throw new InvalidTokenException(off);
                off += 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                if (!isNameChar4(buf, off))
                    throw new InvalidTokenException(off);
                off += 4;
                break;

            case BT_GT:
            case BT_RPAR:
            case BT_COMMA:
            case BT_VERBAR:
            case BT_LSQB:
            case BT_PERCNT:
            case BT_S:
            case BT_CR:
            case BT_LF:
                token.TokenEnd = off;
                return tok;

            case BT_PLUS:
            case BT_AST:
            case BT_QUEST:
                // Occurrence indicators may only follow a proper name.
                if (tok != TOK.NAME)
                    throw new InvalidTokenException(off);
                token.TokenEnd = off + minBPC;
                if (t == BT_PLUS)
                    return TOK.NAME_PLUS;
                if (t == BT_AST)
                    return TOK.NAME_ASTERISK;
                return TOK.NAME_QUESTION;

            default:
                throw new InvalidTokenException(off);
        }
    }

    throw new ExtensibleTokenException(tok);
}
/**
 * Scans an end tag.  On entry <code>off</code> points to the character
 * following "</".
 *
 * A name is read, optionally followed by whitespace, and then '>' is
 * required.  <code>token.NameEnd</code> is set where the name stops,
 * <code>token.TokenEnd</code> just past the '>', and
 * <code>TOK.END_TAG</code> is returned.
 *
 * @exception PartialTokenException if the bytes run out before '>'
 * @exception PartialCharException if a multi-byte character is cut short
 * @exception InvalidTokenException if an illegal character is met
 */
private TOK scanEndTag(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    int width;  // bytes occupied by the character just accepted

    switch (byteType(buf, off))
    {
        case BT_NMSTRT:
            width = minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT)
                throw new InvalidTokenException(off);
            width = 4;
            break;
        default:
            throw new InvalidTokenException(off);
    }
    off += width;

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                width = minBPC;
                break;
            case BT_LEAD2:
                if (end - off < 2)
                    throw new PartialCharException(off);
                if (!isNameChar2(buf, off))
                    throw new InvalidTokenException(off);
                width = 2;
                break;
            case BT_LEAD3:
                if (end - off < 3)
                    throw new PartialCharException(off);
                if (!isNameChar3(buf, off))
                    throw new InvalidTokenException(off);
                width = 3;
                break;
            case BT_LEAD4:
                if (end - off < 4)
                    throw new PartialCharException(off);
                if (!isNameChar4(buf, off))
                    throw new InvalidTokenException(off);
                width = 4;
                break;

            case BT_S:
            case BT_CR:
            case BT_LF:
                // Name finished; only whitespace may precede the '>'.
                token.NameEnd = off;
                off += minBPC;
                while (off != end)
                {
                    int trailing = byteType(buf, off);
                    if (trailing == BT_GT)
                    {
                        token.TokenEnd = off + minBPC;
                        return TOK.END_TAG;
                    }
                    if (trailing != BT_S && trailing != BT_CR && trailing != BT_LF)
                        throw new InvalidTokenException(off);
                    off += minBPC;
                }
                throw new PartialTokenException();

            case BT_GT:
                token.NameEnd = off;
                token.TokenEnd = off + minBPC;
                return TOK.END_TAG;

            default:
                throw new InvalidTokenException(off);
        }
        off += width;
    }

    throw new PartialTokenException();
}
/**
 * Scans a hexadecimal character reference.  On entry <code>off</code>
 * points to the character following "&#x" (<code>scanCharRef</code>
 * dispatches here only on a lower-case 'x').
 *
 * At least one hex digit (0-9, A-F, a-f) is required; digits are
 * accumulated until ';'.  On ';' <code>token.TokenEnd</code> is set
 * just past it and the result of <code>setRefChar</code> is returned.
 *
 * @exception PartialTokenException if the bytes run out before ';'
 * @exception InvalidTokenException on a non-hex character, or once the
 * value reaches 0x110000
 */
private TOK scanHexCharRef(byte[] buf, int off, int end, Token token)
{
    if (off == end)
        throw new PartialTokenException();

    int ch = byteToAscii(buf, off);
    int value;
    if (ch >= '0' && ch <= '9')
        value = ch - '0';
    else if (ch >= 'A' && ch <= 'F')
        value = ch - ('A' - 10);
    else if (ch >= 'a' && ch <= 'f')
        value = ch - ('a' - 10);
    else
        throw new InvalidTokenException(off);

    off += minBPC;
    while (off != end)
    {
        ch = byteToAscii(buf, off);
        if (ch == ';')
        {
            token.TokenEnd = off + minBPC;
            return setRefChar(value, token);
        }

        int digit;
        if (ch >= '0' && ch <= '9')
            digit = ch - '0';
        else if (ch >= 'A' && ch <= 'F')
            digit = ch - ('A' - 10);
        else if (ch >= 'a' && ch <= 'f')
            digit = ch - ('a' - 10);
        else
            throw new InvalidTokenException(off);

        value = (value << 4) + digit;
        // Checking on every digit also keeps the accumulator far from overflow.
        if (value >= 0x110000)
            throw new InvalidTokenException(off);

        off += minBPC;
    }

    throw new PartialTokenException();
}