/// <summary>
/// Scans the remainder of a quoted literal, starting just after its opening delimiter.
/// </summary>
/// <param name="open">Byte type of the opening delimiter (callers in this file pass <c>BT_QUOT</c> or <c>BT_APOS</c>); only a delimiter of the same type ends the literal.</param>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first byte after the opening delimiter.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">On success its <c>TokenEnd</c> is set to the offset just past the closing delimiter.</param>
/// <returns><c>TOK.LITERAL</c>.</returns>
/// <exception cref="PartialTokenException">The input ends before the closing delimiter, or inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A byte is classified as <c>BT_NONXML</c> or <c>BT_MALFORM</c>, or the closing delimiter is followed by a byte type other than S, CR, LF, GT, PERCNT or LSQB.</exception>
/// <exception cref="ExtensibleTokenException">The closing delimiter is the last thing in the input, so the character after it cannot be inspected yet.</exception>
private TOK scanLit(int open, byte[] buf, int off, int end, Token token)
{
    for (; off != end; )
    {
        int kind = byteType(buf, off);
        switch (kind)
        {
            case BT_NONXML:
            case BT_MALFORM:
                throw new InvalidTokenException(off);

            case BT_QUOT:
            case BT_APOS:
                off += minBPC;
                if (kind == open)
                {
                    // Matching delimiter: the literal is only accepted if an
                    // allowed character follows it.
                    if (off == end) throw new ExtensibleTokenException(TOK.LITERAL);
                    switch (byteType(buf, off))
                    {
                        case BT_S:
                        case BT_CR:
                        case BT_LF:
                        case BT_GT:
                        case BT_PERCNT:
                        case BT_LSQB:
                            token.TokenEnd = off;
                            return TOK.LITERAL;
                        default:
                            throw new InvalidTokenException(off);
                    }
                }
                // The other kind of quote is ordinary literal content.
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialTokenException();
                check2(buf, off); // NOTE(review): presumably validates the 2-byte sequence and throws on failure - confirm
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialTokenException();
                check3(buf, off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialTokenException();
                check4(buf, off);
                off += 4;
                break;

            default:
                off += minBPC;
                break;
        }
    }

    // Ran out of input without seeing the closing delimiter.
    throw new PartialTokenException();
}
/// <summary>
/// Scans what follows a "%": either a stand-alone percent sign or a
/// parameter entity reference (a name terminated by ';').
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following the "%".</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c>, and for a reference also <c>NameEnd</c> (offset of the ';').</param>
/// <returns><c>TOK.PERCENT</c> when the next byte type is S, LF, CR or PERCNT; otherwise <c>TOK.PARAM_ENTITY_REF</c>.</returns>
/// <exception cref="PartialTokenException">The input ends before the token is complete.</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A character that is not permitted at its position in the name is found.</exception>
private TOK scanPercent(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    // First character: decides between a bare "%" and the start of a name.
    switch (byteType(buf, off))
    {
        case BT_S:
        case BT_LF:
        case BT_CR:
        case BT_PERCNT:
            token.TokenEnd = off;
            return TOK.PERCENT;

        case BT_NMSTRT:
            off += minBPC;
            break;

        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 4;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    // Remaining name characters up to the terminating ';'.
    for (;;)
    {
        if (off == end) throw new PartialTokenException();

        switch (byteType(buf, off))
        {
            case BT_SEMI:
                token.NameEnd = off;
                token.TokenEnd = off + minBPC;
                return TOK.PARAM_ENTITY_REF;

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                if (!isNameChar2(buf, off)) throw new InvalidTokenException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                if (!isNameChar3(buf, off)) throw new InvalidTokenException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                if (!isNameChar4(buf, off)) throw new InvalidTokenException(off);
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }
}
/// <summary>
/// Scans the name that follows a '#' in the prolog (the caller,
/// <c>tokenizeProlog</c>, passes the offset just past a <c>BT_NUM</c> byte).
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first character of the name.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c>, the offset of the character that terminates the name.</param>
/// <returns><c>TOK.POUND_NAME</c>.</returns>
/// <exception cref="PartialTokenException">There is no input at all after the '#'.</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A character that is not permitted at its position is found.</exception>
/// <exception cref="ExtensibleTokenException">The input ends while still inside the name, so more name characters could follow.</exception>
private TOK scanPoundName(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    // The first character must be classified as a name start.
    switch (byteType(buf, off))
    {
        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 4;
            break;

        case BT_NMSTRT:
            off += minBPC;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    for (;;)
    {
        // Input exhausted mid-name: the token may continue in the next chunk.
        if (off == end) throw new ExtensibleTokenException(TOK.POUND_NAME);

        switch (byteType(buf, off))
        {
            case BT_CR:
            case BT_LF:
            case BT_S:
            case BT_RPAR:
            case BT_GT:
            case BT_PERCNT:
            case BT_VERBAR:
                // Terminator is not consumed.
                token.TokenEnd = off;
                return TOK.POUND_NAME;

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                if (!isNameChar2(buf, off)) throw new InvalidTokenException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                if (!isNameChar3(buf, off)) throw new InvalidTokenException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                if (!isNameChar4(buf, off)) throw new InvalidTokenException(off);
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }
}
/// <summary>
/// Tests whether the input at <paramref name="off"/> is one of the five
/// built-in entity reference bodies: "amp;", "apos;", "lt;", "gt;" or "quot;".
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first character to test; the caller must ensure <c>off != end</c>.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">On a match, <c>TokenEnd</c> is set just past the ';' and <c>RefChar1</c> to the character the entity stands for.</param>
/// <returns><c>true</c> on a match; <c>false</c> otherwise (in which case <paramref name="token"/> is not modified).</returns>
private bool isMagicEntityRef(byte[] buf, int off, int end, Token token)
{
    int avail = end - off;
    bool found = false;
    int width = 0;          // matched length in characters, including the ';'
    char refChar = '\0';    // character the matched entity stands for

    switch (byteToAscii(buf, off))
    {
        case 'l':
            if (avail >= minBPC*3
                && charMatches(buf, off + minBPC, 't')
                && charMatches(buf, off + minBPC*2, ';'))
            {
                found = true;
                width = 3;
                refChar = '<';
            }
            break;

        case 'g':
            if (avail >= minBPC*3
                && charMatches(buf, off + minBPC, 't')
                && charMatches(buf, off + minBPC*2, ';'))
            {
                found = true;
                width = 3;
                refChar = '>';
            }
            break;

        case 'q':
            if (avail >= minBPC*5
                && charMatches(buf, off + minBPC, 'u')
                && charMatches(buf, off + minBPC*2, 'o')
                && charMatches(buf, off + minBPC*3, 't')
                && charMatches(buf, off + minBPC*4, ';'))
            {
                found = true;
                width = 5;
                refChar = '"';
            }
            break;

        case 'a':
            // Both "amp;" and "apos;" need at least four characters.
            if (avail < minBPC*4) break;
            switch (byteToAscii(buf, off + minBPC))
            {
                case 'm':
                    if (charMatches(buf, off + minBPC*2, 'p')
                        && charMatches(buf, off + minBPC*3, ';'))
                    {
                        found = true;
                        width = 4;
                        refChar = '&';
                    }
                    break;

                case 'p':
                    if (avail >= minBPC*5
                        && charMatches(buf, off + minBPC*2, 'o')
                        && charMatches(buf, off + minBPC*3, 's')
                        && charMatches(buf, off + minBPC*4, ';'))
                    {
                        found = true;
                        width = 5;
                        refChar = '\'';
                    }
                    break;
            }
            break;
    }

    if (!found) return false;

    token.TokenEnd = off + minBPC*width;
    token.RefChar1 = refChar;
    return true;
}
/// <summary>
/// Scans a reference whose leading "&amp;" has already been consumed:
/// a built-in entity, a general entity reference, or (after '#') a
/// character reference.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following the "&amp;".</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives the token bounds (and <c>NameEnd</c> for a general entity reference).</param>
/// <returns>
/// <c>TOK.MAGIC_ENTITY_REF</c>, <c>TOK.ENTITY_REF</c>, or whatever
/// <c>scanCharRef</c> returns for a character reference.
/// </returns>
/// <exception cref="PartialTokenException">The input ends before the reference is complete.</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A character that is not permitted at its position is found.</exception>
private TOK scanRef(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    // The built-in entities are recognised before any general name scanning.
    if (isMagicEntityRef(buf, off, end, token)) return TOK.MAGIC_ENTITY_REF;

    switch (byteType(buf, off))
    {
        case BT_NUM:
            // "&#..." - numeric character reference.
            return scanCharRef(buf, off + minBPC, end, token);

        case BT_NMSTRT:
            off += minBPC;
            break;

        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 4;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    // Rest of the entity name, terminated by ';'.
    for (;;)
    {
        if (off == end) throw new PartialTokenException();

        switch (byteType(buf, off))
        {
            case BT_SEMI:
                token.NameEnd = off;
                token.TokenEnd = off + minBPC;
                return TOK.ENTITY_REF;

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                if (!isNameChar2(buf, off)) throw new InvalidTokenException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                if (!isNameChar3(buf, off)) throw new InvalidTokenException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                if (!isNameChar4(buf, off)) throw new InvalidTokenException(off);
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }
}
/// <summary>
/// Scans the first token of a byte subarray that contains part of a literal
/// attribute value. The opening and closing delimiters are not part of the
/// subarray.
/// </summary>
/// <remarks>
/// A run of ordinary characters is returned as one <c>TOK.DATA_CHARS</c>
/// token that stops in front of the next "&amp;", space, LF or CR. When one of
/// those is the very first character it becomes its own token instead.
/// </remarks>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first byte of the subarray.</param>
/// <param name="end">Offset just past the subarray; when <c>minBPC &gt; 1</c> it is first passed through <c>adjustEnd</c>.</param>
/// <param name="token">Receives <c>TokenEnd</c> (and reference details when a reference is scanned).</param>
/// <returns>
/// <c>TOK.DATA_CHARS</c>, <c>TOK.DATA_NEWLINE</c>, <c>TOK.ATTRIBUTE_VALUE_S</c>,
/// or the result of <c>scanRef</c> (<c>TOK.MAGIC_ENTITY_REF</c>,
/// <c>TOK.ENTITY_REF</c>, or a character reference token).
/// </returns>
/// <exception cref="EmptyTokenException">The subarray is empty.</exception>
/// <exception cref="PartialTokenException">Propagated from <c>scanRef</c> when a reference is incomplete.</exception>
/// <exception cref="PartialCharException">The subarray ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A <c>BT_LT</c> byte is found, or <c>scanRef</c> rejects a reference.</exception>
/// <exception cref="ExtensibleTokenException">The subarray consists of just a carriage return, which might be the first half of a CR/LF pair.</exception>
public TOK tokenizeAttributeValue(byte[] buf, int off, int end, Token token)
{
    if (minBPC > 1) end = adjustEnd(off, end);
    if (off == end) throw new EmptyTokenException();

    int start = off;
    for (; off != end; )
    {
        int kind = byteType(buf, off);
        switch (kind)
        {
            case BT_LT:
                /* this is for inside entity references */
                throw new InvalidTokenException(off);

            case BT_AMP:
            case BT_S:
            case BT_LF:
            case BT_CR:
                // Anywhere but at the start, these simply end the data run.
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                if (kind == BT_AMP)
                {
                    return scanRef(buf, off + minBPC, end, token);
                }
                if (kind == BT_S)
                {
                    token.TokenEnd = off + minBPC;
                    return TOK.ATTRIBUTE_VALUE_S;
                }
                if (kind == BT_LF)
                {
                    token.TokenEnd = off + minBPC;
                    return TOK.DATA_NEWLINE;
                }
                // CR: absorb an immediately following LF into the same newline token.
                off += minBPC;
                if (off == end) throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                if (byteType(buf, off) == BT_LF) off += minBPC;
                token.TokenEnd = off;
                return TOK.DATA_NEWLINE;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                off += 4;
                break;

            default:
                off += minBPC;
                break;
        }
    }

    // The whole subarray is character data.
    token.TokenEnd = off;
    return TOK.DATA_CHARS;
}
/// <summary>
/// Stores the character(s) for a numeric character reference in
/// <paramref name="token"/> and returns the matching token code.
/// </summary>
/// <param name="num">Code point value; callers guarantee it is below 0x110000.</param>
/// <param name="token">
/// Receives <c>RefChar1</c> (and <c>RefChar2</c> for values of 0x10000 and above,
/// which are split into a high/low surrogate pair). Its <c>TokenEnd</c> must
/// already be set, because it is used to compute the error offset.
/// </param>
/// <returns><c>TOK.CHAR_REF</c> for values below 0x10000, otherwise <c>TOK.CHAR_PAIR_REF</c>.</returns>
/// <exception cref="InvalidTokenException">
/// The value is below 0x10000 and <c>charTypeTable</c> classifies it as
/// <c>BT_NONXML</c>, <c>BT_LEAD4</c> or <c>BT_MALFORM</c>.
/// </exception>
private TOK setRefChar(int num, Token token)
{
    if (num >= 0x10000)
    {
        // Encode as a UTF-16 surrogate pair: top 10 bits, then bottom 10 bits.
        int above = num - 0x10000;
        token.RefChar1 = (char) (0xD800 + (above >> 10));
        token.RefChar2 = (char) (0xDC00 + (above & 0x3FF));
        return TOK.CHAR_PAIR_REF;
    }

    switch (charTypeTable[num >> 8][num & 0xFF])
    {
        case BT_NONXML:
        case BT_LEAD4:
        case BT_MALFORM:
            // Report the position of the terminating ';'.
            throw new InvalidTokenException(token.TokenEnd - minBPC);

        default:
            token.RefChar1 = (char) num;
            return TOK.CHAR_REF;
    }
}
/// <summary>
/// Scans an end tag whose leading "&lt;/" has already been consumed: a name,
/// optional trailing white space, then "&gt;".
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following "&lt;/".</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>NameEnd</c> (offset just past the name) and <c>TokenEnd</c> (offset just past the "&gt;").</param>
/// <returns><c>TOK.END_TAG</c>.</returns>
/// <exception cref="PartialTokenException">The input ends before the closing "&gt;".</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A character that is not permitted at its position is found.</exception>
private TOK scanEndTag(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    // First character of the element name.
    switch (byteType(buf, off))
    {
        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 4;
            break;

        case BT_NMSTRT:
            off += minBPC;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    for (;;)
    {
        if (off == end) throw new PartialTokenException();

        switch (byteType(buf, off))
        {
            case BT_GT:
                token.NameEnd = off;
                token.TokenEnd = off + minBPC;
                return TOK.END_TAG;

            case BT_S:
            case BT_CR:
            case BT_LF:
                // Name is finished; only white space may precede the ">".
                token.NameEnd = off;
                off += minBPC;
                while (off != end)
                {
                    int trailing = byteType(buf, off);
                    if (trailing == BT_GT)
                    {
                        token.TokenEnd = off + minBPC;
                        return TOK.END_TAG;
                    }
                    if (trailing != BT_S && trailing != BT_CR && trailing != BT_LF)
                    {
                        throw new InvalidTokenException(off);
                    }
                    off += minBPC;
                }
                throw new PartialTokenException();

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                if (!isNameChar2(buf, off)) throw new InvalidTokenException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                if (!isNameChar3(buf, off)) throw new InvalidTokenException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                if (!isNameChar4(buf, off)) throw new InvalidTokenException(off);
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }
}
/// <summary>
/// Scans the hexadecimal digits and terminating ';' of a character reference.
/// <paramref name="off"/> points just past "&amp;#x" (<c>scanCharRef</c> only
/// dispatches here on a lowercase 'x').
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first hexadecimal digit.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c> (just past the ';') and the referenced character(s) via <c>setRefChar</c>.</param>
/// <returns>The token code produced by <c>setRefChar</c>.</returns>
/// <exception cref="InvalidTokenException">
/// A character other than a hex digit (or the final ';') is found, no digit
/// precedes the ';', the value reaches 0x110000 or more, or <c>setRefChar</c>
/// rejects the value.
/// </exception>
/// <exception cref="PartialTokenException">The input ends before the ';'.</exception>
private TOK scanHexCharRef(byte[] buf, int off, int end, Token token)
{
    int value = 0;
    bool seenDigit = false;

    for (; off != end; off += minBPC)
    {
        int c = byteToAscii(buf, off);
        int digit;

        if (c >= '0' && c <= '9')
        {
            digit = c - '0';
        }
        else if (c >= 'A' && c <= 'F')
        {
            digit = c - ('A' - 10);
        }
        else if (c >= 'a' && c <= 'f')
        {
            digit = c - ('a' - 10);
        }
        else if (c == ';' && seenDigit)
        {
            token.TokenEnd = off + minBPC;
            return setRefChar(value, token);
        }
        else
        {
            // Includes a ';' that appears before any digit.
            throw new InvalidTokenException(off);
        }

        value = (value << 4) + digit;
        // Checked after every digit, so the accumulator can never overflow.
        if (value >= 0x110000) throw new InvalidTokenException(off);
        seenDigit = true;
    }

    throw new PartialTokenException();
}
/// <summary>
/// Scans a processing instruction (or XML declaration) whose leading "&lt;?"
/// has already been consumed.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following "&lt;?", i.e. the start of the target name.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>NameEnd</c> (offset just past the target name) and <c>TokenEnd</c> (offset just past "?&gt;").</param>
/// <returns><c>TOK.XML_DECL</c> when <c>targetIsXml</c> reports true for the target name, otherwise <c>TOK.PI</c>.</returns>
/// <exception cref="PartialTokenException">The input ends before the closing "?&gt;".</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">
/// A character that is not permitted at its position is found, or a byte in
/// the instruction body is classified as <c>BT_NONXML</c> or <c>BT_MALFORM</c>.
/// </exception>
private TOK scanPi(byte[] buf, int off, int end, Token token)
{
    int nameStart = off;
    if (off == end) throw new PartialTokenException();

    // First character of the target name.
    switch (byteType(buf, off))
    {
        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off);
            off += 4;
            break;

        case BT_NMSTRT:
            off += minBPC;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    for (;;)
    {
        if (off == end) throw new PartialTokenException();

        switch (byteType(buf, off))
        {
            case BT_QUEST:
                // Target immediately followed by "?": the next character must be ">".
                token.NameEnd = off;
                off += minBPC;
                if (off == end) throw new PartialTokenException();
                checkCharMatches(buf, off, '>'); // NOTE(review): presumably throws on mismatch - confirm
                token.TokenEnd = off + minBPC;
                if (targetIsXml(buf, nameStart, token.NameEnd))
                {
                    return TOK.XML_DECL;
                }
                return TOK.PI;

            case BT_S:
            case BT_CR:
            case BT_LF:
                // Target followed by white space: scan the body up to "?>".
                bool xmlTarget = targetIsXml(buf, nameStart, off);
                token.NameEnd = off;
                off += minBPC;
                while (off != end)
                {
                    switch (byteType(buf, off))
                    {
                        case BT_QUEST:
                            off += minBPC;
                            if (off == end) throw new PartialTokenException();
                            // A '?' not followed by '>' is ordinary content; the
                            // following character is re-examined by the loop.
                            if (!charMatches(buf, off, '>')) break;
                            token.TokenEnd = off + minBPC;
                            return xmlTarget ? TOK.XML_DECL : TOK.PI;

                        case BT_NONXML:
                        case BT_MALFORM:
                            throw new InvalidTokenException(off);

                        case BT_LEAD2:
                            if (end - off < 2) throw new PartialCharException(off);
                            check2(buf, off);
                            off += 2;
                            break;

                        case BT_LEAD3:
                            if (end - off < 3) throw new PartialCharException(off);
                            check3(buf, off);
                            off += 3;
                            break;

                        case BT_LEAD4:
                            if (end - off < 4) throw new PartialCharException(off);
                            check4(buf, off);
                            off += 4;
                            break;

                        default:
                            off += minBPC;
                            break;
                    }
                }
                throw new PartialTokenException();

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                if (!isNameChar2(buf, off)) throw new InvalidTokenException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                if (!isNameChar3(buf, off)) throw new InvalidTokenException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                if (!isNameChar4(buf, off)) throw new InvalidTokenException(off);
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }
}
/// <summary>
/// Matches the keyword held in <c>CDATA</c> (per the original comment,
/// "CDATA[") that must follow "&lt;![" and reports a CDATA section opener.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following "&lt;![".</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c>, the offset just past the matched keyword.</param>
/// <returns><c>TOK.CDATA_SECT_OPEN</c>.</returns>
/// <exception cref="PartialTokenException">Fewer than <c>CDATA.Length</c> characters are available.</exception>
/// <remarks>
/// NOTE(review): <c>checkCharMatches</c> presumably throws (likely
/// <c>InvalidTokenException</c>) when a character differs - confirm.
/// </remarks>
private TOK scanCdataSection(byte[] buf, int off, int end, Token token)
{
    // The required length is derived from CDATA itself rather than a
    // hard-coded 6, so this bounds check always covers exactly the
    // characters the loop below reads.
    if (end - off < CDATA.Length*minBPC)
    {
        throw new PartialTokenException();
    }

    for (int i = 0; i < CDATA.Length; i++, off += minBPC)
    {
        checkCharMatches(buf, off, CDATA[i]);
    }

    token.TokenEnd = off;
    return TOK.CDATA_SECT_OPEN;
}
/// <summary>
/// Scans what follows "&lt;!": a comment, a conditional section opener, or
/// the keyword of a markup declaration.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following "&lt;!".</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c>.</param>
/// <returns>
/// The result of <c>scanComment</c> after a '-', <c>TOK.COND_SECT_OPEN</c>
/// after a '[', or <c>TOK.DECL_OPEN</c> once a keyword made of
/// <c>BT_NMSTRT</c> characters has been scanned (the terminator is not consumed).
/// </returns>
/// <exception cref="PartialTokenException">The input ends before the token is complete.</exception>
/// <exception cref="InvalidTokenException">A character that is not permitted at its position is found.</exception>
private TOK scanDecl(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    switch (byteType(buf, off))
    {
        case BT_NMSTRT:
            off += minBPC;
            break;

        case BT_MINUS:
            return scanComment(buf, off + minBPC, end, token);

        case BT_LSQB:
            token.TokenEnd = off + minBPC;
            return TOK.COND_SECT_OPEN;

        default:
            throw new InvalidTokenException(off);
    }

    for (;;)
    {
        if (off == end) throw new PartialTokenException();

        switch (byteType(buf, off))
        {
            case BT_NMSTRT:
                off += minBPC;
                break;

            case BT_S:
            case BT_CR:
            case BT_LF:
                token.TokenEnd = off;
                return TOK.DECL_OPEN;

            case BT_PERCNT:
                // A '%' may end the keyword only when it starts a parameter
                // entity reference, so the character after it must be known.
                if (off + minBPC == end) throw new PartialTokenException();

                /* don't allow <!ENTITY% foo "whatever"> */
                int following = byteType(buf, off + minBPC);
                if (following == BT_S || following == BT_CR
                    || following == BT_LF || following == BT_PERCNT)
                {
                    throw new InvalidTokenException(off);
                }
                token.TokenEnd = off;
                return TOK.DECL_OPEN;

            default:
                throw new InvalidTokenException(off);
        }
    }
}
/// <summary>
/// Scans a comment whose leading "&lt;!-" has already been consumed.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following "&lt;!-"; it is checked to be a second '-'.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c>, the offset just past the closing "--&gt;".</param>
/// <returns><c>TOK.COMMENT</c>.</returns>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">A byte in the comment is classified as <c>BT_NONXML</c> or <c>BT_MALFORM</c>.</exception>
/// <exception cref="PartialTokenException">The input ends before the comment is closed.</exception>
/// <remarks>
/// NOTE(review): <c>checkCharMatches</c> presumably throws when the expected
/// character is absent (e.g. "--" not followed by "&gt;") - confirm.
/// </remarks>
private TOK scanComment(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    checkCharMatches(buf, off, '-');
    off += minBPC;

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_MINUS:
                off += minBPC;
                if (off == end) throw new PartialTokenException();

                // A single '-' is ordinary content; the character after it is
                // re-examined by the loop.
                if (!charMatches(buf, off, '-')) break;

                // "--" must be followed by ">".
                off += minBPC;
                if (off == end) throw new PartialTokenException();
                checkCharMatches(buf, off, '>');
                token.TokenEnd = off + minBPC;
                return TOK.COMMENT;

            case BT_NONXML:
            case BT_MALFORM:
                throw new InvalidTokenException(off);

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                check2(buf, off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                check3(buf, off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                check4(buf, off);
                off += 4;
                break;

            default:
                off += minBPC;
                break;
        }
    }

    throw new PartialTokenException();
}
/// <summary>
/// Scans the first token of a byte subarray that contains part of a literal
/// entity value. The opening and closing delimiters are not part of the
/// subarray.
/// </summary>
/// <remarks>
/// A run of ordinary characters is returned as one <c>TOK.DATA_CHARS</c>
/// token that stops in front of the next "&amp;", "%", LF or CR. When one of
/// those is the very first character it becomes its own token instead.
/// </remarks>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first byte of the subarray.</param>
/// <param name="end">Offset just past the subarray; when <c>minBPC &gt; 1</c> it is first passed through <c>adjustEnd</c>.</param>
/// <param name="token">Receives <c>TokenEnd</c> (and reference details when a reference is scanned).</param>
/// <returns>
/// <c>TOK.DATA_CHARS</c>, <c>TOK.DATA_NEWLINE</c>, the result of
/// <c>scanRef</c> for a leading "&amp;", or the result of <c>scanPercent</c>
/// for a leading "%".
/// </returns>
/// <exception cref="EmptyTokenException">The subarray is empty.</exception>
/// <exception cref="PartialTokenException">Propagated from <c>scanRef</c>/<c>scanPercent</c> when a reference is incomplete.</exception>
/// <exception cref="PartialCharException">The subarray ends inside a multi-byte sequence.</exception>
/// <exception cref="InvalidTokenException">Propagated from <c>scanRef</c>/<c>scanPercent</c> when a reference is malformed.</exception>
/// <exception cref="ExtensibleTokenException">The subarray consists of just a carriage return, which might be the first half of a CR/LF pair.</exception>
public TOK tokenizeEntityValue(byte[] buf, int off, int end, Token token)
{
    if (minBPC > 1) end = adjustEnd(off, end);
    if (off == end) throw new EmptyTokenException();

    int start = off;
    for (; off != end; )
    {
        int kind = byteType(buf, off);
        switch (kind)
        {
            case BT_AMP:
            case BT_PERCNT:
            case BT_LF:
            case BT_CR:
                // Anywhere but at the start, these simply end the data run.
                if (off != start)
                {
                    token.TokenEnd = off;
                    return TOK.DATA_CHARS;
                }
                if (kind == BT_AMP)
                {
                    return scanRef(buf, off + minBPC, end, token);
                }
                if (kind == BT_PERCNT)
                {
                    return scanPercent(buf, off + minBPC, end, token);
                }
                if (kind == BT_LF)
                {
                    token.TokenEnd = off + minBPC;
                    return TOK.DATA_NEWLINE;
                }
                // CR: absorb an immediately following LF into the same newline token.
                off += minBPC;
                if (off == end) throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
                if (byteType(buf, off) == BT_LF) off += minBPC;
                token.TokenEnd = off;
                return TOK.DATA_NEWLINE;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                off += 4;
                break;

            default:
                off += minBPC;
                break;
        }
    }

    // The whole subarray is character data.
    token.TokenEnd = off;
    return TOK.DATA_CHARS;
}
/// <summary>
/// Picks the encoding to start with by looking at the first one or two
/// bytes of the input.
/// </summary>
/// <param name="buf">Buffer containing the raw input bytes.</param>
/// <param name="off">Offset of the first input byte.</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">
/// <c>TokenEnd</c> is set to <paramref name="off"/>, or to <c>off + 2</c>
/// when the input starts with the byte pair FE FF or FF FE (which is thereby
/// skipped).
/// </param>
/// <returns>
/// The UTF-16 big-endian encoding for a leading FE FF or 00 3C; the UTF-16
/// little-endian encoding for a leading FF FE or 3C 00; <c>null</c> when
/// exactly one byte is available and it is above 127; the UTF-8 encoding in
/// every other case (including empty input).
/// </returns>
public static Encoding getInitialEncoding(byte[] buf, int off, int end, Token token)
{
    token.TokenEnd = off;

    int available = end - off;
    if (available == 1)
    {
        // A lone byte above 127 is not enough to decide anything.
        if (buf[off] > 127) return null;
    }
    else if (available != 0)
    {
        int firstPair = ((buf[off] & 0xFF) << 8) | (buf[off + 1] & 0xFF);

        if (firstPair == 0xFEFF)
        {
            token.TokenEnd = off + 2;
            return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
        }
        if (firstPair == '<')
        {
            /* not legal; but not a fatal error */
            return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
        }
        if (firstPair == 0xFFFE)
        {
            token.TokenEnd = off + 2;
            return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
        }
        if (firstPair == ('<' << 8))
        {
            /* not legal; but not a fatal error */
            return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
        }
    }

    return getEncoding(UTF8_ENCODING);
}
/// <summary>
/// Scans a numeric character reference whose leading "&amp;#" has already
/// been consumed. A leading 'x' hands over to <c>scanHexCharRef</c>;
/// otherwise decimal digits are read up to the terminating ';'.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the character following "&amp;#".</param>
/// <param name="end">Offset just past the last available byte.</param>
/// <param name="token">Receives <c>TokenEnd</c> (just past the ';') and the referenced character(s) via <c>setRefChar</c>.</param>
/// <returns>The token code produced by <c>setRefChar</c> (or by <c>scanHexCharRef</c>).</returns>
/// <exception cref="InvalidTokenException">
/// A character other than a decimal digit (or the final ';') is found, the
/// value reaches 0x110000 or more, or <c>setRefChar</c> rejects the value.
/// </exception>
/// <exception cref="PartialTokenException">The input ends before the ';'.</exception>
private TOK scanCharRef(byte[] buf, int off, int end, Token token)
{
    if (off == end) throw new PartialTokenException();

    int c = byteToAscii(buf, off);
    if (c == 'x')
    {
        return scanHexCharRef(buf, off + minBPC, end, token);
    }
    if (c < '0' || c > '9')
    {
        throw new InvalidTokenException(off);
    }

    int value = c - '0';
    off += minBPC;
    while (off != end)
    {
        c = byteToAscii(buf, off);
        if (c == ';')
        {
            token.TokenEnd = off + minBPC;
            return setRefChar(value, token);
        }
        if (c < '0' || c > '9')
        {
            throw new InvalidTokenException(off);
        }

        value = value*10 + (c - '0');
        // Checked after every digit, so the accumulator can never overflow.
        if (value >= 0x110000)
        {
            throw new InvalidTokenException(off);
        }
        off += minBPC;
    }

    throw new PartialTokenException();
}
/// <summary>
/// Scans the first token inside a CDATA section: the closing "]]&gt;", a
/// newline, or a run of character data.
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first byte to scan.</param>
/// <param name="end">Offset just past the last available byte; when <c>minBPC &gt; 1</c> it is first passed through <c>adjustEnd</c>.</param>
/// <param name="token">Receives <c>TokenEnd</c>.</param>
/// <returns>
/// <c>TOK.CDATA_SECT_CLOSE</c>, <c>TOK.DATA_NEWLINE</c>, or
/// <c>TOK.DATA_CHARS</c> (whose end is determined by <c>extendCdata</c>).
/// </returns>
/// <exception cref="EmptyTokenException">There is no input.</exception>
/// <exception cref="PartialTokenException">The input ends after "]" or "]]", so a closing delimiter cannot be ruled out.</exception>
/// <exception cref="ExtensibleTokenException">The input is just a carriage return, which might be the first half of a CR/LF pair.</exception>
/// <exception cref="InvalidTokenException">The first byte is classified as <c>BT_NONXML</c> or <c>BT_MALFORM</c>.</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
public TOK tokenizeCdataSection(byte[] buf, int off, int end, Token token)
{
    if (minBPC > 1) end = adjustEnd(off, end);
    if (off == end) throw new EmptyTokenException();

    switch (byteType(buf, off))
    {
        case BT_LF:
            token.TokenEnd = off + minBPC;
            return TOK.DATA_NEWLINE;

        case BT_CR:
            off += minBPC;
            if (off == end) throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
            // CR LF counts as a single newline.
            if (byteType(buf, off) == BT_LF) off += minBPC;
            token.TokenEnd = off;
            return TOK.DATA_NEWLINE;

        case BT_RSQB:
            off += minBPC;
            if (off == end) throw new PartialTokenException();
            if (charMatches(buf, off, ']'))
            {
                int afterSecond = off + minBPC;
                if (afterSecond == end) throw new PartialTokenException();
                if (charMatches(buf, afterSecond, '>'))
                {
                    token.TokenEnd = afterSecond + minBPC;
                    return TOK.CDATA_SECT_CLOSE;
                }
                // "]]" without ">": only the first "]" is taken as data; off
                // stays on the second "]" so it can still begin a "]]>".
            }
            break;

        case BT_NONXML:
        case BT_MALFORM:
            throw new InvalidTokenException(off);

        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            check2(buf, off);
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            check3(buf, off);
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            check4(buf, off);
            off += 4;
            break;

        default:
            off += minBPC;
            break;
    }

    // NOTE(review): extendCdata presumably extends the data run as far as it
    // safely can and returns its end offset - confirm against its definition.
    token.TokenEnd = extendCdata(buf, off, end);
    return TOK.DATA_CHARS;
}
/// <summary>
/// Scans the first token of the prolog (the markup that precedes the
/// document element, including DTD content).
/// </summary>
/// <param name="buf">Buffer containing the encoded input.</param>
/// <param name="off">Offset of the first byte to scan.</param>
/// <param name="end">Offset just past the last available byte; when <c>minBPC &gt; 1</c> it is first passed through <c>adjustEnd</c>.</param>
/// <param name="token">Receives <c>TokenEnd</c> (and whatever the delegated scanners record).</param>
/// <returns>
/// One of: the result of <c>scanLit</c>, <c>scanDecl</c>, <c>scanPi</c>,
/// <c>scanPercent</c> or <c>scanPoundName</c>; <c>TOK.PROLOG_S</c>;
/// <c>TOK.COMMA</c>; <c>TOK.OPEN_BRACKET</c>; <c>TOK.CLOSE_BRACKET</c>;
/// <c>TOK.COND_SECT_CLOSE</c>; <c>TOK.OPEN_PAREN</c>; <c>TOK.CLOSE_PAREN</c>
/// (optionally the <c>_ASTERISK</c>, <c>_QUESTION</c> or <c>_PLUS</c> variant);
/// <c>TOK.OR</c>; <c>TOK.DECL_CLOSE</c>; <c>TOK.NAME</c> (optionally the
/// <c>NAME_PLUS</c>, <c>NAME_ASTERISK</c> or <c>NAME_QUESTION</c> variant);
/// or <c>TOK.NMTOKEN</c>.
/// </returns>
/// <exception cref="EmptyTokenException">There is no input.</exception>
/// <exception cref="PartialTokenException">The input ends before the token is complete.</exception>
/// <exception cref="EndOfPrologException">A "&lt;" followed by a name-start or multi-byte lead byte is found; <c>TokenEnd</c> is set to the offset of the "&lt;".</exception>
/// <exception cref="InvalidTokenException">A character that is not permitted at its position is found.</exception>
/// <exception cref="ExtensibleTokenException">The input ends where the token could still be extended by further input.</exception>
/// <exception cref="PartialCharException">The input ends inside a multi-byte sequence.</exception>
public TOK tokenizeProlog(byte[] buf, int off, int end, Token token)
{
    TOK tok;

    if (minBPC > 1) end = adjustEnd(off, end);
    if (off == end) throw new EmptyTokenException();

    int lead = byteType(buf, off);
    switch (lead)
    {
        // ---- single-character tokens ----
        case BT_COMMA:
            token.TokenEnd = off + minBPC;
            return TOK.COMMA;

        case BT_LSQB:
            token.TokenEnd = off + minBPC;
            return TOK.OPEN_BRACKET;

        case BT_LPAR:
            token.TokenEnd = off + minBPC;
            return TOK.OPEN_PAREN;

        case BT_VERBAR:
            token.TokenEnd = off + minBPC;
            return TOK.OR;

        case BT_GT:
            token.TokenEnd = off + minBPC;
            return TOK.DECL_CLOSE;

        // ---- tokens handled by a dedicated scanner ----
        case BT_QUOT:
            return scanLit(BT_QUOT, buf, off + minBPC, end, token);

        case BT_APOS:
            return scanLit(BT_APOS, buf, off + minBPC, end, token);

        case BT_PERCNT:
            return scanPercent(buf, off + minBPC, end, token);

        case BT_NUM:
            return scanPoundName(buf, off + minBPC, end, token);

        case BT_LT:
            off += minBPC;
            if (off == end) throw new PartialTokenException();
            switch (byteType(buf, off))
            {
                case BT_EXCL:
                    return scanDecl(buf, off + minBPC, end, token);
                case BT_QUEST:
                    return scanPi(buf, off + minBPC, end, token);
                case BT_NMSTRT:
                case BT_LEAD2:
                case BT_LEAD3:
                case BT_LEAD4:
                    // Start of an element: the prolog is over. Point the
                    // token end back at the "<".
                    token.TokenEnd = off - minBPC;
                    throw new EndOfPrologException();
                default:
                    throw new InvalidTokenException(off);
            }

        // ---- white space ----
        case BT_CR:
        case BT_S:
        case BT_LF:
            // A lone CR at the very end might be the first half of CR/LF.
            if (lead == BT_CR && off + minBPC == end)
            {
                throw new ExtensibleTokenException(TOK.PROLOG_S);
            }
            off += minBPC;
            while (off != end)
            {
                int ws = byteType(buf, off);
                /* don't split CR/LF pair: a CR that is the last character is left for the next call */
                bool continues = ws == BT_S
                                 || ws == BT_LF
                                 || (ws == BT_CR && off + minBPC != end);
                if (!continues) break;
                off += minBPC;
            }
            token.TokenEnd = off;
            return TOK.PROLOG_S;

        // ---- "]" or "]]>" ----
        case BT_RSQB:
            off += minBPC;
            if (off == end) throw new ExtensibleTokenException(TOK.CLOSE_BRACKET);
            if (charMatches(buf, off, ']'))
            {
                if (off + minBPC == end) throw new PartialTokenException();
                if (charMatches(buf, off + minBPC, '>'))
                {
                    token.TokenEnd = off + 2*minBPC;
                    return TOK.COND_SECT_CLOSE;
                }
            }
            token.TokenEnd = off;
            return TOK.CLOSE_BRACKET;

        // ---- ")" with optional occurrence indicator ----
        case BT_RPAR:
            off += minBPC;
            if (off == end) throw new ExtensibleTokenException(TOK.CLOSE_PAREN);
            switch (byteType(buf, off))
            {
                case BT_AST:
                    token.TokenEnd = off + minBPC;
                    return TOK.CLOSE_PAREN_ASTERISK;
                case BT_QUEST:
                    token.TokenEnd = off + minBPC;
                    return TOK.CLOSE_PAREN_QUESTION;
                case BT_PLUS:
                    token.TokenEnd = off + minBPC;
                    return TOK.CLOSE_PAREN_PLUS;
                case BT_CR:
                case BT_LF:
                case BT_S:
                case BT_GT:
                case BT_COMMA:
                case BT_VERBAR:
                case BT_RPAR:
                    token.TokenEnd = off;
                    return TOK.CLOSE_PAREN;
                default:
                    throw new InvalidTokenException(off);
            }

        // ---- first character of a name or name token ----
        case BT_NMSTRT:
            tok = TOK.NAME;
            off += minBPC;
            break;

        case BT_NAME:
        case BT_MINUS:
            tok = TOK.NMTOKEN;
            off += minBPC;
            break;

        case BT_LEAD2:
            if (end - off < 2) throw new PartialCharException(off);
            switch (byteType2(buf, off))
            {
                case BT_NMSTRT: tok = TOK.NAME; break;
                case BT_NAME: tok = TOK.NMTOKEN; break;
                default: throw new InvalidTokenException(off);
            }
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3) throw new PartialCharException(off);
            switch (byteType3(buf, off))
            {
                case BT_NMSTRT: tok = TOK.NAME; break;
                case BT_NAME: tok = TOK.NMTOKEN; break;
                default: throw new InvalidTokenException(off);
            }
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4) throw new PartialCharException(off);
            switch (byteType4(buf, off))
            {
                case BT_NMSTRT: tok = TOK.NAME; break;
                case BT_NAME: tok = TOK.NMTOKEN; break;
                default: throw new InvalidTokenException(off);
            }
            off += 4;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    // Remainder of the name / name token.
    for (;;)
    {
        // Input exhausted mid-name: more name characters could still follow.
        if (off == end) throw new ExtensibleTokenException(tok);

        int kind = byteType(buf, off);
        switch (kind)
        {
            case BT_GT:
            case BT_RPAR:
            case BT_COMMA:
            case BT_VERBAR:
            case BT_LSQB:
            case BT_PERCNT:
            case BT_S:
            case BT_CR:
            case BT_LF:
                // Terminator is not consumed.
                token.TokenEnd = off;
                return tok;

            case BT_PLUS:
            case BT_AST:
            case BT_QUEST:
                // An occurrence indicator is only accepted directly after a NAME.
                if (tok != TOK.NAME) throw new InvalidTokenException(off);
                token.TokenEnd = off + minBPC;
                if (kind == BT_PLUS) return TOK.NAME_PLUS;
                if (kind == BT_AST) return TOK.NAME_ASTERISK;
                return TOK.NAME_QUESTION;

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2) throw new PartialCharException(off);
                if (!isNameChar2(buf, off)) throw new InvalidTokenException(off);
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3) throw new PartialCharException(off);
                if (!isNameChar3(buf, off)) throw new InvalidTokenException(off);
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4) throw new PartialCharException(off);
                if (!isNameChar4(buf, off)) throw new InvalidTokenException(off);
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }
}