Example #1
0
 /*
  * Scans markup that starts with "<"; off points to the character
  * following the "<".
  *
  * Dispatches on the byte type of that character:
  *   - BT_NMSTRT (or a BT_LEAD2/3/4 sequence classified as BT_NMSTRT)
  *     begins a start-tag, which is scanned in this method;
  *   - BT_EXCL must be followed by BT_MINUS (handed to scanComment) or
  *     BT_LSQB (handed to scanCdataSection);
  *   - BT_QUEST is handed to scanPi and BT_SOL to scanEndTag;
  *   - any other type throws InvalidTokenException.
  *
  * For a tag without attributes this sets token.NameEnd and
  * token.TokenEnd and returns TOK.START_TAG_NO_ATTS or
  * TOK.EMPTY_ELEMENT_NO_ATTS.  If an attribute name follows the element
  * name, the result of scanAtts is returned instead.
  *
  * Throws PartialTokenException when off reaches end before the token is
  * complete, PartialCharException when fewer bytes remain than a
  * BT_LEAD2/3/4 byte requires, and InvalidTokenException for a byte type
  * that is not accepted at the current position.
  */
 private TOK scanLt(byte[] buf, int off, int end, ContentToken token)
 {
     if (off == end)
         throw new PartialTokenException();
     /* classify the first character after "<" */
     switch (byteType(buf, off))
     {
         case BT_NMSTRT:
             off += minBPC;
             break;
         case BT_LEAD2:
             if (end - off < 2)
                 throw new PartialCharException(off);
             if (byteType2(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 2;
             break;
         case BT_LEAD3:
             if (end - off < 3)
                 throw new PartialCharException(off);
             if (byteType3(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 3;
             break;
         case BT_LEAD4:
             if (end - off < 4)
                 throw new PartialCharException(off);
             if (byteType4(buf, off) != BT_NMSTRT)
                 throw new InvalidTokenException(off);
             off += 4;
             break;
         case BT_EXCL:
             /* one more character is needed to tell a comment from a
                CDATA section */
             if ((off += minBPC) == end)
                 throw new PartialTokenException();
         switch (byteType(buf, off))
         {
             case BT_MINUS:
                 return scanComment(buf, off + minBPC, end, token);
             case BT_LSQB:
                 return scanCdataSection(buf, off + minBPC, end, token);
         }
             throw new InvalidTokenException(off);
         case BT_QUEST:
             return scanPi(buf, off + minBPC, end, token);
         case BT_SOL:
             return scanEndTag(buf, off + minBPC, end, token);
         default:
             throw new InvalidTokenException(off);
     }
     /* we have a start-tag */
     /* NameEnd == -1 means the end of the element name has not been
        recorded yet */
     token.NameEnd = -1;
     token.clearAttributes();
     while (off != end)
     {
         switch (byteType(buf, off))
         {
             /* characters that continue the element name */
             case BT_NMSTRT:
             case BT_NAME:
             case BT_MINUS:
                 off += minBPC;
                 break;
             case BT_LEAD2:
                 if (end - off < 2)
                     throw new PartialCharException(off);
                 if (!isNameChar2(buf, off))
                     throw new InvalidTokenException(off);
                 off += 2;
                 break;
             case BT_LEAD3:
                 if (end - off < 3)
                     throw new PartialCharException(off);
                 if (!isNameChar3(buf, off))
                     throw new InvalidTokenException(off);
                 off += 3;
                 break;
             case BT_LEAD4:
                 if (end - off < 4)
                     throw new PartialCharException(off);
                 if (!isNameChar4(buf, off))
                     throw new InvalidTokenException(off);
                 off += 4;
                 break;
             /* whitespace ends the element name */
             case BT_S:
             case BT_CR:
             case BT_LF:
                 token.NameEnd = off;
                 off += minBPC;
                 /* skip further whitespace; the next non-whitespace
                    character decides between attributes and tag end */
                 for (;;)
                 {
                     if (off == end)
                         throw new PartialTokenException();
                     switch (byteType(buf, off))
                     {
                         /* start of an attribute name: scanAtts gets the
                            name's start offset and the offset just past
                            its first character */
                         case BT_NMSTRT:
                             return scanAtts(off, buf, off + minBPC, end, token);
                         case BT_LEAD2:
                             if (end - off < 2)
                                 throw new PartialCharException(off);
                             if (byteType2(buf, off) != BT_NMSTRT)
                                 throw new InvalidTokenException(off);
                             return scanAtts(off, buf, off + 2, end, token);
                         case BT_LEAD3:
                             if (end - off < 3)
                                 throw new PartialCharException(off);
                             if (byteType3(buf, off) != BT_NMSTRT)
                                 throw new InvalidTokenException(off);
                             return scanAtts(off, buf, off + 3, end, token);
                         case BT_LEAD4:
                             if (end - off < 4)
                                 throw new PartialCharException(off);
                             if (byteType4(buf, off) != BT_NMSTRT)
                                 throw new InvalidTokenException(off);
                             return scanAtts(off, buf, off + 4, end, token);
                         /* no attributes: leave the whitespace loop without
                            advancing, so the outer switch handles the
                            BT_GT / BT_SOL byte at off on its next pass */
                         case BT_GT:
                         case BT_SOL:
                             goto loop;
                         case BT_S:
                         case BT_CR:
                         case BT_LF:
                             off += minBPC;
                             break;
                         default:
                             throw new InvalidTokenException(off);
                     }
                 }
                 loop:
                     break;
             case BT_GT:
                 /* NameEnd is still negative when no whitespace preceded
                    this byte, so the name ends here */
                 if (token.NameEnd < 0)
                     token.NameEnd = off;
                 token.TokenEnd = off + minBPC;
                 return TOK.START_TAG_NO_ATTS;
             case BT_SOL:
                 if (token.NameEnd < 0)
                     token.NameEnd = off;
                 off += minBPC;
                 if (off == end)
                     throw new PartialTokenException();
                 /* presumably throws unless the next character is '>' --
                    TODO confirm against checkCharMatches */
                 checkCharMatches(buf, off, '>');
                 token.TokenEnd = off + minBPC;
                 return TOK.EMPTY_ELEMENT_NO_ATTS;
             default:
                 throw new InvalidTokenException(off);
         }
     }
     /* input ended before the tag was closed */
     throw new PartialTokenException();
 }
Example #2
0
        /*
         * Scans the attributes of a start-tag or empty-element tag, up to
         * and including the closing ">" or "/>".
         *
         * nameStart is the offset of the first character of the first
         * attribute name; off points to the character following that first
         * character.
         *
         * Each attribute is recorded with token.appendAttribute(nameStart,
         * nameEnd, valueStart, valueEnd, normalized).  On success
         * token.checkAttributeUniqueness(buf) is called, token.TokenEnd is
         * set just past the closing ">" and TOK.START_TAG_WITH_ATTS or
         * TOK.EMPTY_ELEMENT_WITH_ATTS is returned.
         *
         * Throws PartialTokenException when off reaches end before the tag
         * is complete, PartialCharException when fewer bytes remain than a
         * BT_LEAD2/3/4 byte requires, and InvalidTokenException for a byte
         * type that is not accepted at the current position.
         */
        private TOK scanAtts(int nameStart, byte[] buf, int off, int end,
            ContentToken token)
        {
            /* -1 means the end of the current attribute name has not been
               seen yet */
            int NameEnd = -1;
            while (off != end)
            {
                switch (byteType(buf, off))
                {
                    /* characters that continue the attribute name */
                    case BT_NMSTRT:
                    case BT_NAME:
                    case BT_MINUS:
                        off += minBPC;
                        break;
                    case BT_LEAD2:
                        if (end - off < 2)
                            throw new PartialCharException(off);
                        if (!isNameChar2(buf, off))
                            throw new InvalidTokenException(off);
                        off += 2;
                        break;
                    case BT_LEAD3:
                        if (end - off < 3)
                            throw new PartialCharException(off);
                        if (!isNameChar3(buf, off))
                            throw new InvalidTokenException(off);
                        off += 3;
                        break;
                    case BT_LEAD4:
                        if (end - off < 4)
                            throw new PartialCharException(off);
                        if (!isNameChar4(buf, off))
                            throw new InvalidTokenException(off);
                        off += 4;
                        break;
                    /* whitespace after the name: the name ends here, and
                       only more whitespace or BT_EQUALS may follow */
                    case BT_S:
                    case BT_CR:
                    case BT_LF:
                        NameEnd = off;
                        for (;;)
                        {
                            off += minBPC;
                            if (off == end)
                                throw new PartialTokenException();
                            switch (byteType(buf, off))
                            {
                                case BT_EQUALS:
                                    goto loop;
                                case BT_S:
                                case BT_LF:
                                case BT_CR:
                                    break;
                                default:
                                    throw new InvalidTokenException(off);
                            }
                        }
                        loop: ;
                        /* C# has no implicit fall through, so jump to the
                           BT_EQUALS case explicitly; off is at the
                           BT_EQUALS byte */
                        goto case BT_EQUALS;
                    case BT_EQUALS:
                    {
                        /* no whitespace preceded the BT_EQUALS byte, so the
                           name ends at it */
                        if (NameEnd < 0)
                            NameEnd = off;
                        /* byte type of the opening quote (BT_QUOT or
                           BT_APOS); the value ends at the next byte of the
                           same type */
                        int open;
                        /* skip whitespace between BT_EQUALS and the opening
                           quote */
                        for (;;)
                        {

                            off += minBPC;
                            if (off == end)
                                throw new PartialTokenException();
                            open = byteType(buf, off);
                            if (open == BT_QUOT || open == BT_APOS)
                                break;
                            switch (open)
                            {
                                case BT_S:
                                case BT_LF:
                                case BT_CR:
                                    break;
                                default:
                                    throw new InvalidTokenException(off);
                            }
                        }
                        /* step past the opening quote */
                        off += minBPC;
                        int valueStart = off;
                        /* stays true only while nothing seen in the value
                           clears it below (references, BT_LF / BT_CR, and
                           certain BT_S positions); passed on to
                           appendAttribute */
                        bool normalized = true;
                        int t;
                        /* in attribute value */
                        for (;;)
                        {
                            if (off == end)
                                throw new PartialTokenException();
                            t = byteType(buf, off);
                            /* matching closing quote ends the value */
                            if (t == open)
                                break;
                            switch (t)
                            {
                                case BT_NONXML:
                                case BT_MALFORM:
                                    throw new InvalidTokenException(off);
                                case BT_LEAD2:
                                    if (end - off < 2)
                                        throw new PartialCharException(off);
                                    check2(buf, off);
                                    off += 2;
                                    break;
                                case BT_LEAD3:
                                    if (end - off < 3)
                                        throw new PartialCharException(off);
                                    check3(buf, off);
                                    off += 3;
                                    break;
                                case BT_LEAD4:
                                    if (end - off < 4)
                                        throw new PartialCharException(off);
                                    check4(buf, off);
                                    off += 4;
                                    break;
                                case BT_AMP:
                                {
                                    normalized = false;
                                    /* token.NameEnd is saved and restored
                                       around scanRef (presumably because
                                       scanRef may overwrite it -- TODO
                                       confirm); scanning resumes at the
                                       TokenEnd that scanRef leaves behind */
                                    int saveNameEnd = token.NameEnd;
                                    scanRef(buf, off + minBPC, end, token);
                                    token.NameEnd = saveNameEnd;
                                    off = token.TokenEnd;
                                    break;
                                }
                                case BT_S:
                                    /* a BT_S byte clears normalized when it
                                       is the first byte of the value, is
                                       not a literal ' ', or is followed
                                       (within the buffer) by another ' ' or
                                       by the closing quote */
                                    if (normalized
                                        && (off == valueStart
                                        || byteToAscii(buf, off) != ' '
                                        || (off + minBPC != end
                                        && (byteToAscii(buf, off + minBPC) == ' '
                                        || byteType(buf, off + minBPC) == open))))
                                        normalized = false;
                                    off += minBPC;
                                    break;
                                case BT_LT:
                                    throw new InvalidTokenException(off);
                                case BT_LF:
                                case BT_CR:
                                    normalized = false;
                                    /* explicit jump: then advance like any
                                       other ordinary character */
                                    goto default;
                                default:
                                    off += minBPC;
                                    break;
                            }
                        }
                        /* off is at the closing quote, so the value spans
                           valueStart..off */
                        token.appendAttribute(nameStart, NameEnd, valueStart,
                            off,
                            normalized);
                        /* step past the closing quote; only whitespace,
                           BT_GT or BT_SOL may follow directly */
                        off += minBPC;
                        if (off == end)
                            throw new PartialTokenException();
                        t = byteType(buf, off);
                        switch (t)
                        {
                            case BT_S:
                            case BT_CR:
                            case BT_LF:
                                off += minBPC;
                                if (off == end)
                                    throw new PartialTokenException();
                                t = byteType(buf, off);
                                break;
                            case BT_GT:
                            case BT_SOL:
                                break;
                            default:
                                throw new InvalidTokenException(off);
                        }
                        /* off is now past the closing quote (and past one
                           whitespace character if one followed it); t is
                           the type of the byte at off */
                        for (;;)
                        {
                            switch (t)
                            {
                                /* start of the next attribute name */
                                case BT_NMSTRT:
                                    nameStart = off;
                                    off += minBPC;
                                    goto skipToName;
                                case BT_LEAD2:
                                    if (end - off < 2)
                                        throw new PartialCharException(off);
                                    if (byteType2(buf, off) != BT_NMSTRT)
                                        throw new InvalidTokenException(off);
                                    nameStart = off;
                                    off += 2;
                                    goto skipToName;
                                case BT_LEAD3:
                                    if (end - off < 3)
                                        throw new PartialCharException(off);
                                    if (byteType3(buf, off) != BT_NMSTRT)
                                        throw new InvalidTokenException(off);
                                    nameStart = off;
                                    off += 3;
                                    goto skipToName;
                                case BT_LEAD4:
                                    if (end - off < 4)
                                        throw new PartialCharException(off);
                                    if (byteType4(buf, off) != BT_NMSTRT)
                                        throw new InvalidTokenException(off);
                                    nameStart = off;
                                    off += 4;
                                    goto skipToName;
                                /* more whitespace: advance below and look
                                   again */
                                case BT_S:
                                case BT_CR:
                                case BT_LF:
                                    break;
                                case BT_GT:
                                    token.checkAttributeUniqueness(buf);
                                    token.TokenEnd = off + minBPC;
                                    return TOK.START_TAG_WITH_ATTS;
                                case BT_SOL:
                                    off += minBPC;
                                    if (off == end)
                                        throw new PartialTokenException();
                                    /* presumably throws unless the next
                                       character is '>' -- TODO confirm
                                       against checkCharMatches */
                                    checkCharMatches(buf, off, '>');
                                    token.checkAttributeUniqueness(buf);
                                    token.TokenEnd = off + minBPC;
                                    return TOK.EMPTY_ELEMENT_WITH_ATTS;
                                default:
                                    throw new InvalidTokenException(off);
                            }
                            off += minBPC;
                            if (off == end)
                                throw new PartialTokenException();
                            t = byteType(buf, off);
                        }

                        /* a new attribute name has begun: forget the old
                           name end and return to the outer loop to scan the
                           rest of the name */
                        skipToName:
                            NameEnd = -1;
                        break;
                    }
                    default:
                        throw new InvalidTokenException(off);
                }
            }
            /* input ended before the tag was closed */
            throw new PartialTokenException();
        }
Example #3
0
 /**
  * Tokenizes the beginning of a byte range holding element content and
  * reports which kind of token the range starts with.
  *
  * The result is one of:
  * <ul>
  * <li><code>TOK.START_TAG_NO_ATTS</code>, <code>TOK.START_TAG_WITH_ATTS</code></li>
  * <li><code>TOK.EMPTY_ELEMENT_NO_ATTS</code>, <code>TOK.EMPTY_ELEMENT_WITH_ATTS</code></li>
  * <li><code>TOK.END_TAG</code></li>
  * <li><code>TOK.DATA_CHARS</code>, <code>TOK.DATA_NEWLINE</code></li>
  * <li><code>TOK.CDATA_SECT_OPEN</code></li>
  * <li><code>TOK.ENTITY_REF</code>, <code>TOK.MAGIC_ENTITY_REF</code></li>
  * <li><code>TOK.CHAR_REF</code>, <code>TOK.CHAR_PAIR_REF</code></li>
  * <li><code>TOK.PI</code>, <code>TOK.XML_DECL</code>, <code>TOK.COMMENT</code></li>
  * </ul>
  * <p>
  * Details of the token (for example where it ends) are written to
  * <code>token</code>.
  * </p>
  * After <code>TOK.CDATA_SECT_OPEN</code> the caller is expected to switch
  * to <code>tokenizeCdataSection</code> until that method reports the end
  * of the section.  NOTE(review): the earlier documentation named that
  * result <code>TOK.CDATA_SECT</code> -- confirm the exact member name.
  *
  * @param buf   bytes to scan
  * @param off   offset of the first byte of the range
  * @param end   offset just past the last byte of the range
  * @param token receives information about the scanned token
  *
  * @exception EmptyTokenException if the range is empty
  * @exception PartialTokenException if the range holds only part of a
  * legal token
  * @exception InvalidTokenException if the range does not start with a
  * legal token or part of one
  * @exception ExtensibleTokenException if the range ends at a point where
  * more input could still extend the token (a lone carriage return, or
  * one or two closing square brackets at the very end)
  *
  * @see ContentToken
  * @see #tokenizeCdataSection
  */
 public TOK tokenizeContent(byte[] buf, int off, int end,
     ContentToken token)
 {
     /* With multi-byte code units the usable limit comes from adjustEnd. */
     int limit = (minBPC > 1) ? adjustEnd(off, end) : end;
     if (off == limit)
         throw new EmptyTokenException();

     /* Cursor; advanced past the first character when that character is
        plain data. */
     int pos = off;
     switch (byteType(buf, pos))
     {
         case BT_LT:
             return scanLt(buf, pos + minBPC, limit, token);

         case BT_AMP:
             return scanRef(buf, pos + minBPC, limit, token);

         case BT_LF:
             token.TokenEnd = pos + minBPC;
             return TOK.DATA_NEWLINE;

         case BT_CR:
         {
             int afterCr = pos + minBPC;
             /* A CR at the very end may still be followed by an LF. */
             if (afterCr == limit)
                 throw new ExtensibleTokenException(TOK.DATA_NEWLINE);
             /* CR LF counts as a single newline token. */
             token.TokenEnd = (byteType(buf, afterCr) == BT_LF)
                 ? afterCr + minBPC
                 : afterCr;
             return TOK.DATA_NEWLINE;
         }

         case BT_RSQB:
         {
             int second = pos + minBPC;
             if (second == limit)
                 throw new ExtensibleTokenException(TOK.DATA_CHARS);
             /* Whatever follows, data scanning resumes right after the
                first bracket. */
             pos = second;
             if (charMatches(buf, second, ']'))
             {
                 int third = second + minBPC;
                 if (third == limit)
                     throw new ExtensibleTokenException(TOK.DATA_CHARS);
                 /* The sequence "]]>" is rejected in content. */
                 if (charMatches(buf, third, '>'))
                     throw new InvalidTokenException(third);
             }
             break;
         }

         case BT_NONXML:
         case BT_MALFORM:
             throw new InvalidTokenException(pos);

         case BT_LEAD2:
             if (limit - pos < 2)
                 throw new PartialCharException(pos);
             check2(buf, pos);
             pos += 2;
             break;

         case BT_LEAD3:
             if (limit - pos < 3)
                 throw new PartialCharException(pos);
             check3(buf, pos);
             pos += 3;
             break;

         case BT_LEAD4:
             if (limit - pos < 4)
                 throw new PartialCharException(pos);
             check4(buf, pos);
             pos += 4;
             break;

         default:
             pos += minBPC;
             break;
     }

     /* Plain character data: the token end is whatever extendData
        returns for the rest of the range. */
     token.TokenEnd = extendData(buf, pos, limit);
     return TOK.DATA_CHARS;
 }
        /// <summary>
        /// Feeds a chunk of bytes to the parser and handles every complete
        /// token that is available afterwards.
        /// </summary>
        /// <param name="buf">Array holding the incoming bytes</param>
        /// <param name="offset">Index in buf of the first byte to consume</param>
        /// <param name="length">Number of bytes to consume, starting at offset</param>
        public void Push(byte[] buf, int offset, int length)
        {
            // An empty push is treated as a no-op rather than an error.
            if (length == 0)
                return;

            // Original author's note: no locking is needed here because
            // Read() is not called again until this method returns, and the
            // caller is already running on a thread-pool thread.

            // TODO: the copy is only really needed when a partial token is
            // left over at the end of parsing.
            byte[] chunk = new byte[length];
            System.Buffer.BlockCopy(buf, offset, chunk, 0, length);
            m_buf.Write(chunk);

            byte[] data = m_buf.GetBuffer();
            ContentToken token = new ContentToken();
            int pos = 0;

            try
            {
                while (pos < data.Length)
                {
                    // Inside a CDATA section a different tokenizer entry
                    // point is used.
                    TOK kind = m_cdata
                        ? m_enc.tokenizeCdataSection(data, pos, data.Length, token)
                        : m_enc.tokenizeContent(data, pos, data.Length, token);

                    switch (kind)
                    {
                    case TOK.START_TAG_NO_ATTS:
                    case TOK.START_TAG_WITH_ATTS:
                        StartTag(data, pos, token, kind);
                        break;

                    case TOK.EMPTY_ELEMENT_NO_ATTS:
                    case TOK.EMPTY_ELEMENT_WITH_ATTS:
                        // An empty element opens and closes in one token.
                        StartTag(data, pos, token, kind);
                        EndTag(data, pos, token, kind);
                        break;

                    case TOK.END_TAG:
                        EndTag(data, pos, token, kind);
                        break;

                    case TOK.DATA_CHARS:
                    case TOK.DATA_NEWLINE:
                        AddText(utf.GetString(data, pos, token.TokenEnd - pos));
                        break;

                    case TOK.CHAR_REF:
                    case TOK.MAGIC_ENTITY_REF:
                    {
                        char[] single = { token.RefChar1 };
                        AddText(new string(single));
                        break;
                    }

                    case TOK.CHAR_PAIR_REF:
                    {
                        char[] pair = { token.RefChar1, token.RefChar2 };
                        AddText(new string(pair));
                        break;
                    }

                    case TOK.COMMENT:
                        if (m_elem != null)
                        {
                            // "<!--" is 4 characters and "-->" is 3, so 7
                            // characters of markup surround the text.
                            int bytesPerChar = m_enc.MinBytesPerChar;
                            int textStart = pos + 4 * bytesPerChar;
                            int textLength = token.TokenEnd - pos - 7 * bytesPerChar;
                            m_elem.AppendChild(
                                m_doc.CreateComment(
                                    utf.GetString(data, textStart, textLength)));
                        }
                        break;

                    case TOK.CDATA_SECT_OPEN:
                        m_cdata = true;
                        break;

                    case TOK.CDATA_SECT_CLOSE:
                        m_cdata = false;
                        break;

                    case TOK.XML_DECL:
                        // The declaration is ignored: UTF-8 and XML 1.0
                        // are assumed regardless of what it says.
                        // TODO: throw if those assumptions are wrong.
                        break;

                    case TOK.ENTITY_REF:
                    case TOK.PI:
                        throw new System.NotImplementedException("Token type not implemented: " + kind);
                    }

                    // Move on to the next token and reset per-token state.
                    pos = token.TokenEnd;
                    token.clearAttributes();
                }
            }
            catch (PartialTokenException)
            {
                // Deliberately ignored: parsing stops at the incomplete
                // token and the finally block still runs.
            }
            catch (ExtensibleTokenException)
            {
                // Deliberately ignored, same as a partial token.
            }
            catch (xpnet.InvalidTokenException e)
            {
                throw new XMLParseException(e, this, buf, offset, length);
            }
            catch (Exception e)
            {
                throw new Exception("Unexpected exception", e);
            }
            finally
            {
                // Hand the position reached to the buffer's Clear, whether
                // or not parsing finished cleanly.
                m_buf.Clear(pos);
                token.clearAttributes();
            }
        }
        /// <summary>
        /// Handles a start tag (or the opening half of an empty-element
        /// tag): pushes a namespace scope, registers any xmlns
        /// declarations, creates the element with its attributes and makes
        /// it either the document root or the current element.
        /// </summary>
        /// <param name="buf">Buffer containing the tag</param>
        /// <param name="offset">Offset in buf where the tag token starts;
        /// the element name is read starting one character after it</param>
        /// <param name="ct">Token describing the tag (name end and
        /// attribute positions)</param>
        /// <param name="tok">Token type; attributes are read only for
        /// START_TAG_WITH_ATTS and EMPTY_ELEMENT_WITH_ATTS</param>
        private void StartTag(byte[] buf, int offset,
            ContentToken ct, TOK tok)
        {
            int colon;
            string name;
            string prefix;
            // Attribute name -> raw attribute value text.
            Hashtable ht = new Hashtable();

            m_ns.PushScope();

            // if i have attributes
            if ((tok == TOK.START_TAG_WITH_ATTS) ||
                (tok == TOK.EMPTY_ELEMENT_WITH_ATTS))
            {
                int start;
                int end;
                string val;
                for (int i=0; i<ct.getAttributeSpecifiedCount(); i++)
                {
                    start = ct.getAttributeNameStart(i);
                    end = ct.getAttributeNameEnd(i);
                    name = utf.GetString(buf, start, end - start);

                    start = ct.getAttributeValueStart(i);
                    end =  ct.getAttributeValueEnd(i);
                    val = utf.GetString(buf, start, end - start);

                    // <foo b='&amp;'/>
                    // <foo b='&amp;amp;'
                    // TODO: if val includes &amp;, it gets double-escaped

                    // "xmlns:" is a protocol keyword, so it is matched
                    // ordinally; the default culture-sensitive StartsWith
                    // can give culture-dependent results.
                    if (name.StartsWith("xmlns:", System.StringComparison.Ordinal))
                    {
                        colon = name.IndexOf(':');
                        prefix = name.Substring(colon+1);
                        m_ns.AddNamespace(prefix, val);
                    }
                    else if (name == "xmlns")
                    {
                        m_ns.AddNamespace(string.Empty, val);
                    }
                    // Namespace declarations are also kept as ordinary
                    // attributes on the element.
                    ht.Add(name, val);
                }
            }

            // The element name runs from just after the token start up to
            // ct.NameEnd.
            name = utf.GetString(buf,
                                 offset + m_enc.MinBytesPerChar,
                                 ct.NameEnd - offset - m_enc.MinBytesPerChar);
            colon = name.IndexOf(':');
            string ns = "";
            prefix = "";
            if (colon > 0)
            {
                // Prefixed name: resolve the prefix in the current scope,
                // which already includes this tag's own declarations.
                prefix = name.Substring(0, colon);
                name = name.Substring(colon + 1);
                ns = m_ns.LookupNamespace(prefix);
            }
            else
            {
                ns = m_ns.DefaultNamespace;
            }

            XmlQualifiedName q = new XmlQualifiedName(name, ns);
            XmlElement elem = m_factory.GetElement(prefix, q, m_doc);

            foreach (string attrname in ht.Keys)
            {
                colon = attrname.IndexOf(':');
                if (colon > 0)
                {
                    prefix = attrname.Substring(0, colon);
                    name = attrname.Substring(colon+1);

                    XmlAttribute attr = m_doc.CreateAttribute(prefix,
                                                              name,
                                                              m_ns.LookupNamespace(prefix));
                    // The value is raw markup text taken from the buffer,
                    // so it is assigned through InnerXml.
                    attr.InnerXml = (string)ht[attrname];
                    elem.SetAttributeNode(attr);
                }
                else
                {
                    XmlAttribute attr = m_doc.CreateAttribute(attrname);
                    attr.InnerXml = (string)ht[attrname];
                    elem.SetAttributeNode(attr);
                }
            }

            if (m_root == null)
            {
                // The first element seen becomes the root and is announced;
                // m_elem is left unchanged in this branch.
                m_root = elem;
                FireOnDocumentStart(m_root);
            }
            else
            {
                // Attach to the current element if there is one, then
                // descend into the new element.
                if (m_elem != null)
                    m_elem.AppendChild(elem);
                m_elem = elem;
            }
        }
        /// <summary>
        /// Handles an end tag (or the closing half of an empty-element
        /// tag): pops the namespace scope, checks that the tag name matches
        /// the current element, and moves back up to its parent.
        /// </summary>
        /// <param name="buf">Buffer containing the tag</param>
        /// <param name="offset">Offset in buf where the tag token starts</param>
        /// <param name="ct">Token describing the tag; NameEnd marks the end
        /// of the tag name</param>
        /// <param name="tok">Token type, used to tell an empty-element tag
        /// from a real end tag</param>
        private void EndTag(byte[] buf, int offset,
            ContentToken ct, TOK tok)
        {
            m_ns.PopScope();

            // With no current element, this closes the document itself.
            if (m_elem == null)
            {
                FireOnDocumentEnd();
                return;
            }

            // The name is preceded by one character of markup in an
            // empty-element tag and by two in an end tag.
            bool emptyElement = (tok == TOK.EMPTY_ELEMENT_WITH_ATTS) ||
                                (tok == TOK.EMPTY_ELEMENT_NO_ATTS);
            int markupBytes = m_enc.MinBytesPerChar * (emptyElement ? 1 : 2);
            string name = utf.GetString(buf,
                                        offset + markupBytes,
                                        ct.NameEnd - offset - markupBytes);

            string expected = m_elem.Name;
            if (expected != name)
                throw new XmlException("Invalid end tag: " + name +
                                       " != " + expected);

            // An element without a parent is complete at top level and is
            // reported; either way the parent becomes the current element.
            XmlElement closed = m_elem;
            XmlElement parent = (XmlElement)closed.ParentNode;
            if (parent == null)
            {
                FireOnElement(closed);
            }
            m_elem = parent;
        }