/// <summary>
/// Scans markup that was introduced by "&lt;", dispatching on the character
/// that follows it.
/// </summary>
/// <param name="buf">Buffer being tokenized.</param>
/// <param name="off">Offset of the character immediately following the "&lt;".</param>
/// <param name="end">Exclusive upper bound of the data available in <paramref name="buf"/>.</param>
/// <param name="token">
/// Token being filled in. For a start-tag its attributes are cleared and
/// NameEnd / TokenEnd are assigned here.
/// </param>
/// <returns>
/// TOK.START_TAG_NO_ATTS or TOK.EMPTY_ELEMENT_NO_ATTS when the tag is closed
/// without attributes; otherwise whatever scanComment, scanCdataSection,
/// scanPi, scanEndTag or scanAtts returns.
/// </returns>
/// <exception cref="PartialTokenException">The data ran out before the token was complete.</exception>
/// <exception cref="PartialCharException">The data ran out in the middle of a multi-byte character.</exception>
/// <exception cref="InvalidTokenException">A character that is not allowed at this position was found.</exception>
private TOK scanLt(byte[] buf, int off, int end, ContentToken token)
{
    if (off == end)
    {
        throw new PartialTokenException();
    }

    // The first character after "<" decides what kind of markup this is.
    switch (byteType(buf, off))
    {
        case BT_QUEST:
            return scanPi(buf, off + minBPC, end, token);

        case BT_SOL:
            return scanEndTag(buf, off + minBPC, end, token);

        case BT_EXCL:
            // "<!" must be followed by "-" (comment) or "[" (CDATA section).
            off += minBPC;
            if (off == end)
            {
                throw new PartialTokenException();
            }
            switch (byteType(buf, off))
            {
                case BT_LSQB:
                    return scanCdataSection(buf, off + minBPC, end, token);
                case BT_MINUS:
                    return scanComment(buf, off + minBPC, end, token);
            }
            throw new InvalidTokenException(off);

        case BT_NMSTRT:
            off += minBPC;
            break;

        case BT_LEAD2:
            if (end - off < 2)
            {
                throw new PartialCharException(off);
            }
            if (byteType2(buf, off) != BT_NMSTRT)
            {
                throw new InvalidTokenException(off);
            }
            off += 2;
            break;

        case BT_LEAD3:
            if (end - off < 3)
            {
                throw new PartialCharException(off);
            }
            if (byteType3(buf, off) != BT_NMSTRT)
            {
                throw new InvalidTokenException(off);
            }
            off += 3;
            break;

        case BT_LEAD4:
            if (end - off < 4)
            {
                throw new PartialCharException(off);
            }
            if (byteType4(buf, off) != BT_NMSTRT)
            {
                throw new InvalidTokenException(off);
            }
            off += 4;
            break;

        default:
            throw new InvalidTokenException(off);
    }

    // A name-start character has been consumed, so this is a start-tag.
    // NameEnd stays negative until the end of the element name is known.
    token.NameEnd = -1;
    token.clearAttributes();

    while (off != end)
    {
        switch (byteType(buf, off))
        {
            case BT_GT:
                // ">" closes a start-tag that has no attributes.
                if (token.NameEnd < 0)
                {
                    token.NameEnd = off;
                }
                token.TokenEnd = off + minBPC;
                return TOK.START_TAG_NO_ATTS;

            case BT_SOL:
                // "/" must be immediately followed by ">" (empty element).
                if (token.NameEnd < 0)
                {
                    token.NameEnd = off;
                }
                off += minBPC;
                if (off == end)
                {
                    throw new PartialTokenException();
                }
                checkCharMatches(buf, off, '>');
                token.TokenEnd = off + minBPC;
                return TOK.EMPTY_ELEMENT_NO_ATTS;

            case BT_S:
            case BT_CR:
            case BT_LF:
                // Whitespace ends the element name. Skip the rest of the
                // whitespace, then either hand over to scanAtts at the first
                // attribute name or fall back to the outer loop, which
                // handles ">" and "/".
                token.NameEnd = off;
                off += minBPC;
                bool reachedTagClose = false;
                while (!reachedTagClose)
                {
                    if (off == end)
                    {
                        throw new PartialTokenException();
                    }
                    switch (byteType(buf, off))
                    {
                        case BT_GT:
                        case BT_SOL:
                            // Leave off pointing at the ">" or "/".
                            reachedTagClose = true;
                            break;

                        case BT_S:
                        case BT_CR:
                        case BT_LF:
                            off += minBPC;
                            break;

                        case BT_NMSTRT:
                            return scanAtts(off, buf, off + minBPC, end, token);

                        case BT_LEAD2:
                            if (end - off < 2)
                            {
                                throw new PartialCharException(off);
                            }
                            if (byteType2(buf, off) != BT_NMSTRT)
                            {
                                throw new InvalidTokenException(off);
                            }
                            return scanAtts(off, buf, off + 2, end, token);

                        case BT_LEAD3:
                            if (end - off < 3)
                            {
                                throw new PartialCharException(off);
                            }
                            if (byteType3(buf, off) != BT_NMSTRT)
                            {
                                throw new InvalidTokenException(off);
                            }
                            return scanAtts(off, buf, off + 3, end, token);

                        case BT_LEAD4:
                            if (end - off < 4)
                            {
                                throw new PartialCharException(off);
                            }
                            if (byteType4(buf, off) != BT_NMSTRT)
                            {
                                throw new InvalidTokenException(off);
                            }
                            return scanAtts(off, buf, off + 4, end, token);

                        default:
                            throw new InvalidTokenException(off);
                    }
                }
                break;

            case BT_NMSTRT:
            case BT_NAME:
            case BT_MINUS:
                // Still inside the element name.
                off += minBPC;
                break;

            case BT_LEAD2:
                if (end - off < 2)
                {
                    throw new PartialCharException(off);
                }
                if (!isNameChar2(buf, off))
                {
                    throw new InvalidTokenException(off);
                }
                off += 2;
                break;

            case BT_LEAD3:
                if (end - off < 3)
                {
                    throw new PartialCharException(off);
                }
                if (!isNameChar3(buf, off))
                {
                    throw new InvalidTokenException(off);
                }
                off += 3;
                break;

            case BT_LEAD4:
                if (end - off < 4)
                {
                    throw new PartialCharException(off);
                }
                if (!isNameChar4(buf, off))
                {
                    throw new InvalidTokenException(off);
                }
                off += 4;
                break;

            default:
                throw new InvalidTokenException(off);
        }
    }

    // Ran out of data before the tag was closed.
    throw new PartialTokenException();
}
/// <summary>
/// Put bytes into the parser.
/// </summary>
/// <remarks>
/// The bytes are appended to the internal buffer, then the whole buffer is
/// tokenized from its start. Each complete token is dispatched (start/end
/// tags, text, comments, CDATA markers); tokenizing stops at the first
/// partial or extensible token.
/// </remarks>
/// <param name="buf">The bytes to put into the parse stream</param>
/// <param name="offset">Offset into buf to start at</param>
/// <param name="length">Number of bytes to write</param>
/// <exception cref="XMLParseException">The tokenizer reported an invalid token.</exception>
/// <exception cref="System.Exception">
/// Any other failure during tokenizing or dispatch, wrapped with the message
/// "Unexpected exception". This includes the NotImplementedException raised
/// for entity references and processing instructions.
/// </exception>
public void Push(byte[] buf, int offset, int length)
{
    // Could be an assert, really, but returning quietly is a little nicer.
    if (length == 0)
    {
        return;
    }

    // No locking is required: Read() won't get called again until this
    // method returns. Keep in mind that we're already on a ThreadPool
    // thread, which is created and managed by System.IO.

    // TODO: only do this copy if we have a partial token at the
    // end of parsing.
    byte[] chunk = new byte[length];
    System.Buffer.BlockCopy(buf, offset, chunk, 0, length);
    m_buf.Write(chunk);

    byte[] data = m_buf.GetBuffer();
    ContentToken token = new ContentToken();
    int pos = 0;

    try
    {
        while (pos < data.Length)
        {
            // Inside a CDATA section a different tokenizer entry point applies.
            TOK kind = m_cdata
                ? m_enc.tokenizeCdataSection(data, pos, data.Length, token)
                : m_enc.tokenizeContent(data, pos, data.Length, token);

            switch (kind)
            {
                case TOK.START_TAG_NO_ATTS:
                case TOK.START_TAG_WITH_ATTS:
                {
                    StartTag(data, pos, token, kind);
                    break;
                }

                case TOK.EMPTY_ELEMENT_NO_ATTS:
                case TOK.EMPTY_ELEMENT_WITH_ATTS:
                {
                    // An empty element is a start-tag immediately followed
                    // by its end-tag.
                    StartTag(data, pos, token, kind);
                    EndTag(data, pos, token, kind);
                    break;
                }

                case TOK.END_TAG:
                {
                    EndTag(data, pos, token, kind);
                    break;
                }

                case TOK.DATA_CHARS:
                case TOK.DATA_NEWLINE:
                {
                    int textLength = token.TokenEnd - pos;
                    AddText(utf.GetString(data, pos, textLength));
                    break;
                }

                case TOK.CHAR_REF:
                case TOK.MAGIC_ENTITY_REF:
                {
                    char[] single = new char[] { token.RefChar1 };
                    AddText(new string(single));
                    break;
                }

                case TOK.CHAR_PAIR_REF:
                {
                    char[] pair = new char[] { token.RefChar1, token.RefChar2 };
                    AddText(new string(pair));
                    break;
                }

                case TOK.COMMENT:
                {
                    // Comments are only kept while an element is open.
                    if (m_elem != null)
                    {
                        // Strip the delimiters: "<!--" is 4 characters and
                        // "-->" is 3, so 7 characters come off the length.
                        int bytesPerChar = m_enc.MinBytesPerChar;
                        int textStart = pos + 4 * bytesPerChar;
                        int textLength = token.TokenEnd - pos - 7 * bytesPerChar;
                        string text = utf.GetString(data, textStart, textLength);
                        m_elem.AppendChild(m_doc.CreateComment(text));
                    }
                    break;
                }

                case TOK.CDATA_SECT_OPEN:
                {
                    m_cdata = true;
                    break;
                }

                case TOK.CDATA_SECT_CLOSE:
                {
                    m_cdata = false;
                    break;
                }

                case TOK.XML_DECL:
                {
                    // The declaration is ignored: UTF-8 and XML version 1
                    // are assumed regardless of what it says.
                    // TODO: Throw an exception if these assumptions are
                    // wrong.
                    break;
                }

                case TOK.ENTITY_REF:
                case TOK.PI:
                {
                    throw new System.NotImplementedException("Token type not implemented: " + kind);
                }
            }

            // Advance past the token just handled and reset per-token state.
            pos = token.TokenEnd;
            token.clearAttributes();
        }
    }
    catch (PartialTokenException)
    {
        // Ignored: the buffer ends in the middle of a token. pos still
        // marks where that token starts when the finally block runs.
    }
    catch (ExtensibleTokenException)
    {
        // Ignored, same as a partial token.
    }
    catch (Kixeye.Xpnet.InvalidTokenException invalid)
    {
        throw new XMLParseException(invalid, this, buf, offset, length);
    }
    catch (Exception unexpected)
    {
        throw new Exception("Unexpected exception", unexpected);
    }
    finally
    {
        // Runs on every exit path. Presumably Clear(pos) discards the bytes
        // already consumed and keeps the rest for the next Push. Confirm
        // against m_buf's implementation.
        m_buf.Clear(pos);
        token.clearAttributes();
    }
}