Example #1
0
 /// <summary>
 /// Scans the markup that starts right after a "&lt;". A name start character
 /// begins a start-tag, which is scanned here; the BT_EXCL, BT_QUEST and BT_SOL
 /// byte types are delegated to scanComment/scanCdataSection, scanPi and
 /// scanEndTag respectively.
 /// </summary>
 /// <param name="buf">Buffer holding the bytes being tokenized.</param>
 /// <param name="off">Offset of the character following "&lt;".</param>
 /// <param name="end">Offset one past the last available byte; reaching it means the data ran out.</param>
 /// <param name="token">Token that receives NameEnd and TokenEnd; its attributes are cleared before the tag name is scanned.</param>
 /// <returns>
 /// TOK.START_TAG_NO_ATTS or TOK.EMPTY_ELEMENT_NO_ATTS for a tag without
 /// attributes; otherwise whatever scanAtts, scanComment, scanCdataSection,
 /// scanPi or scanEndTag returns.
 /// </returns>
 /// <exception cref="PartialTokenException">The buffer ends before the token is complete.</exception>
 /// <exception cref="PartialCharException">The buffer ends in the middle of a multi-byte character.</exception>
 /// <exception cref="InvalidTokenException">A byte that is not allowed at the current position was found.</exception>
 private TOK scanLt(byte[] buf, int off, int end, ContentToken token)
 {
     // Nothing after the "<" yet: the caller has to supply more data.
     if (off == end)
         throw new PartialTokenException();
     // Classify the first character after "<".
     switch (byteType(buf, off))
     {
         case BT_NMSTRT:
             // Name start character; minBPC is presumably the encoding's
             // minimum bytes per character -- TODO confirm.
             off += minBPC;
             break;
         case BT_LEAD2:
             // Lead byte of a 2-byte character: need both bytes, and the
             // character must classify as a name start character.
             if (end - off < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 2;
             break;
         case BT_LEAD3:
             // Same check for a 3-byte character.
             if (end - off < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 3;
             break;
         case BT_LEAD4:
             // Same check for a 4-byte character.
             if (end - off < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 4;
             break;
         case BT_EXCL:
             // Step over this character; the next one decides between a
             // comment and a CDATA section.
             if ((off += minBPC) == end)
                 throw new PartialTokenException();
         switch (byteType(buf, off))
         {
             case BT_MINUS:
                 return scanComment(buf, off + minBPC, end, token);
             case BT_LSQB:
                 return scanCdataSection(buf, off + minBPC, end, token);
         }
             // Anything else after BT_EXCL is rejected.
             throw new InvalidTokenException(off);
         case BT_QUEST:
             return scanPi(buf, off + minBPC, end, token);
         case BT_SOL:
             return scanEndTag(buf, off + minBPC, end, token);
         default:
             throw new InvalidTokenException(off);
     }
     /* we have a start-tag */
     // -1 marks "name end not recorded yet"; the BT_GT and BT_SOL cases below
     // only set NameEnd while it is still negative.
     token.NameEnd = -1;
     token.clearAttributes();
     while (off != end)
     {
         switch (byteType(buf, off))
         {
             case BT_NMSTRT:
             case BT_NAME:
             case BT_MINUS:
                 // Character that continues the tag name.
                 off += minBPC;
                 break;
             case BT_LEAD2:
                 // Multi-byte characters must be complete and pass the
                 // matching isNameCharN check to continue the name.
                 if (end - off < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 off += 2;
                 break;
             case BT_LEAD3:
                 if (end - off < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 off += 3;
                 break;
             case BT_LEAD4:
                 if (end - off < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 off += 4;
                 break;
             case BT_S:
             case BT_CR:
             case BT_LF:
                 // Whitespace ends the tag name. Record where the name stops,
                 // then look at what follows the whitespace run.
                 token.NameEnd = off;
                 off += minBPC;
                 for (;;)
                 {
                     if (off == end)
                         throw new PartialTokenException();
                     switch (byteType(buf, off))
                     {
                         case BT_NMSTRT:
                             // A name start character here hands the rest of
                             // the tag to scanAtts, passing the offset of that
                             // character and the offset just past it.
                             return scanAtts(off, buf, off + minBPC, end, token);
                         case BT_LEAD2:
                             if (end - off < 2)
                                 throw new PartialCharException(off);
                             if (byteType2(buf, off) != BT_NMSTRT)
                                 throw new InvalidTokenException(off);
                             return scanAtts(off, buf, off + 2, end, token);
                         case BT_LEAD3:
                             if (end - off < 3)
                                 throw new PartialCharException(off);
                             if (byteType3(buf, off) != BT_NMSTRT)
                                 throw new InvalidTokenException(off);
                             return scanAtts(off, buf, off + 3, end, token);
                         case BT_LEAD4:
                             if (end - off < 4)
                                 throw new PartialCharException(off);
                             if (byteType4(buf, off) != BT_NMSTRT)
                                 throw new InvalidTokenException(off);
                             return scanAtts(off, buf, off + 4, end, token);
                         case BT_GT:
                         case BT_SOL:
                             // Leave the whitespace loop without advancing, so
                             // the enclosing while loop handles this byte in
                             // its own BT_GT / BT_SOL case.
                             goto loop;
                         case BT_S:
                         case BT_CR:
                         case BT_LF:
                             // More whitespace: skip it and keep looking.
                             off += minBPC;
                             break;
                         default:
                             throw new InvalidTokenException(off);
                     }
                 }
                 // Target of the goto above: exits this switch section so the
                 // while loop re-examines the byte at the unchanged offset.
                 loop:
                     break;
             case BT_GT:
                 // End of a start-tag that has no attributes. NameEnd is only
                 // set here when no whitespace already recorded it.
                 if (token.NameEnd < 0)
                     token.NameEnd = off;
                 token.TokenEnd = off + minBPC;
                 return TOK.START_TAG_NO_ATTS;
             case BT_SOL:
                 // Possible end of an empty-element tag: the next character is
                 // checked against '>'.
                 if (token.NameEnd < 0)
                     token.NameEnd = off;
                 off += minBPC;
                 if (off == end)
                     throw new PartialTokenException();
                 // NOTE(review): checkCharMatches presumably throws when the
                 // character is not '>' -- its body is not visible here.
                 checkCharMatches(buf, off, '>');
                 token.TokenEnd = off + minBPC;
                 return TOK.EMPTY_ELEMENT_NO_ATTS;
             default:
                 throw new InvalidTokenException(off);
         }
     }
     // Ran out of data before the tag was closed.
     throw new PartialTokenException();
 }
        /// <summary>
        /// Feed a chunk of bytes to the parser and handle every complete token
        /// that can be read from the accumulated buffer.
        /// </summary>
        /// <param name="buf">Array containing the bytes to add to the parse stream</param>
        /// <param name="offset">Index in buf of the first byte to take</param>
        /// <param name="length">Number of bytes to take; zero makes the call a no-op</param>
        public void Push(byte[] buf, int offset, int length)
        {
            // An assert would also do, but quietly returning is a little nicer.
            if (length == 0)
                return;

            // Original author's note: no locking is required, because Read()
            // won't be called again until this method returns, and we are
            // already running on a ThreadPool thread managed by System.IO.

            // TODO: only make this copy when a partial token is left over at
            // the end of parsing.
            byte[] chunk = new byte[length];
            System.Buffer.BlockCopy(buf, offset, chunk, 0, length);
            m_buf.Write(chunk);

            byte[] data = m_buf.GetBuffer();
            ContentToken token = new ContentToken();
            int pos = 0;

            try
            {
                while (pos < data.Length)
                {
                    // Inside a CDATA section a different tokenizer entry point is used.
                    TOK kind = m_cdata
                        ? m_enc.tokenizeCdataSection(data, pos, data.Length, token)
                        : m_enc.tokenizeContent(data, pos, data.Length, token);

                    switch (kind)
                    {
                    case TOK.START_TAG_NO_ATTS:
                    case TOK.START_TAG_WITH_ATTS:
                        StartTag(data, pos, token, kind);
                        break;
                    case TOK.EMPTY_ELEMENT_NO_ATTS:
                    case TOK.EMPTY_ELEMENT_WITH_ATTS:
                        // An empty element is reported as a start immediately
                        // followed by an end.
                        StartTag(data, pos, token, kind);
                        EndTag(data, pos, token, kind);
                        break;
                    case TOK.END_TAG:
                        EndTag(data, pos, token, kind);
                        break;
                    case TOK.DATA_CHARS:
                    case TOK.DATA_NEWLINE:
                        AddText(utf.GetString(data, pos, token.TokenEnd - pos));
                        break;
                    case TOK.CHAR_PAIR_REF:
                        AddText(new string(new char[] { token.RefChar1, token.RefChar2 }));
                        break;
                    case TOK.CHAR_REF:
                    case TOK.MAGIC_ENTITY_REF:
                        AddText(new string(new char[] { token.RefChar1 }));
                        break;
                    case TOK.CDATA_SECT_OPEN:
                        m_cdata = true;
                        break;
                    case TOK.CDATA_SECT_CLOSE:
                        m_cdata = false;
                        break;
                    case TOK.COMMENT:
                        if (m_elem != null)
                        {
                            // "<!--" is 4 characters and "-->" is 3, so the
                            // comment text starts 4 characters into the token
                            // and is 7 characters shorter than the token.
                            int bytesPerChar = m_enc.MinBytesPerChar;
                            int textStart = pos + 4 * bytesPerChar;
                            int textLength = token.TokenEnd - pos - 7 * bytesPerChar;
                            string text = utf.GetString(data, textStart, textLength);
                            m_elem.AppendChild(m_doc.CreateComment(text));
                        }
                        break;
                    case TOK.XML_DECL:
                        // Thou shalt use UTF8 and XML version 1; evidence to
                        // the contrary is ignored.
                        // TODO: throw an exception if these assumptions are wrong.
                        break;
                    case TOK.ENTITY_REF:
                    case TOK.PI:
                        throw new System.NotImplementedException("Token type not implemented: " + kind);
                    }

                    // Move on to the byte after the token just handled.
                    pos = token.TokenEnd;
                    token.clearAttributes();
                }
            }
            catch (PartialTokenException)
            {
                // Deliberately ignored: parsing simply stops at this point.
            }
            catch (ExtensibleTokenException)
            {
                // Deliberately ignored as well.
            }
            catch (xpnet.InvalidTokenException e)
            {
                throw new XMLParseException(e, this, buf, offset, length);
            }
            catch (Exception e)
            {
                throw new Exception("Unexpected exception", e);
            }
            finally
            {
                // Runs on every exit path; pos is the offset just past the
                // last token that was fully handled.
                m_buf.Clear(pos);
                token.clearAttributes();
            }
        }