Пример #1
0
        /// <summary>
        /// Reads all text from <paramref name="streamReader"/> and walks it one character at a
        /// time through a state machine, raising this parser's events (StartDocument,
        /// StartElement, EndElement, Characters, IgnorableWhitespace, Comment,
        /// ProcessingInstruction, EndDocument) as each construct is recognised.
        /// </summary>
        /// <remarks>
        /// Malformed input is reported through RaiseError; its return value is stored in the
        /// loop's cancel flag, so a <c>true</c> result stops parsing. StartDocument is raised at
        /// most once, on the first '&lt;' found, and EndDocument is raised only if StartDocument
        /// was. The contents of DOCTYPE and XML declarations are skipped up to the next '&gt;'.
        /// CDATA sections are reported through the Characters event. Text outside the root
        /// element is ignored. Input that ends in the middle of a construct is not reported.
        /// </remarks>
        /// <param name="streamReader">Source of the XML text. It is read to the end; it is not
        /// closed or disposed by this method.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
        public void Parse(StreamReader streamReader)
        {
            if (streamReader == null)
            {
                throw new System.ArgumentNullException(nameof(streamReader));
            }

            ParseState state          = ParseState.Outside;
            bool       parseCancelled = false;
            int        depth          = 0;     // number of currently open (non-empty) elements

            string buffer = streamReader.ReadToEnd();

            char attrQuoteChar = '\0';         // quote char that opened the current attribute value
            int  i             = -1;

            string elementName = "";
            string attrName    = "";
            Dictionary <string, string> attributes = new Dictionary <string, string>();

            bool          documentStarted = false;
            bool          tokenComplete   = false;   // end-tag name finished, only whitespace/'>' may follow
            StringBuilder token           = new StringBuilder();

            while (++i < buffer.Length && !parseCancelled)
            {
                char cc = buffer[i]; // current char

                switch (state)
                {
                case ParseState.Outside:
                    if (cc == '<')
                    {
                        // Outside is re-entered after every top-level declaration, comment or
                        // processing instruction, so only announce the document the first time.
                        if (!documentStarted)
                        {
                            documentStarted = true;
                            StartDocument?.Invoke();
                        }
                        state = ParseState.LessThan;
                    }
                    // Any other char outside the root element is ignored; every state reached
                    // from LessThan starts with a fresh token.
                    break;

                case ParseState.LessThan:
                    if (char.IsWhiteSpace(cc))
                    {
                        // do nothing
                    }
                    else if (MayStartXmlName(cc))
                    {
                        elementName = "";
                        attributes  = new Dictionary <string, string>();
                        token       = new StringBuilder(cc.ToString());
                        state       = ParseState.NewElementName;
                    }
                    else if (cc == '/')
                    {
                        tokenComplete = false;
                        token         = new StringBuilder();
                        state         = ParseState.EndElementName;
                    }
                    else if (cc == '!')     // !DOCTYPE or ![CDATA[ or !--
                    {
                        // In each branch i is advanced to the last char of the marker; the
                        // loop's ++i then lands on the first char of the content.
                        if (BufferMatchesAt(buffer, i, "!DOCTYPE "))
                        {
                            i    += 8;
                            token = new StringBuilder();
                            state = ParseState.DocType;
                        }
                        else if (BufferMatchesAt(buffer, i, "![CDATA["))
                        {
                            i    += 7;
                            token = new StringBuilder();
                            state = ParseState.CData;
                        }
                        else if (BufferMatchesAt(buffer, i, "!--"))
                        {
                            i    += 2;
                            token = new StringBuilder();
                            state = ParseState.Comment;
                        }
                        else
                        {
                            // Show up to five chars of the offending text, clamped to the buffer end.
                            int    remaining = buffer.Length - i;
                            string excerpt   = buffer.Substring(i, remaining < 5 ? remaining : 5);
                            parseCancelled = RaiseError("Unexpected chars after <! : <" + excerpt, i);
                        }
                    }
                    else if (cc == '?')     // ?xml declaration ?
                    {
                        if (BufferMatchesAt(buffer, i, "?xml "))
                        {
                            i    += 4;
                            state = ParseState.XmlDeclaration;
                        }
                        else
                        {
                            // Start from an empty token so the instruction text is not
                            // prefixed with whatever was collected before the '<'.
                            token = new StringBuilder();
                            state = ParseState.ProcessingInstruction;
                        }
                    }
                    else
                    {
                        parseCancelled = RaiseError("Unexpected char '" + cc.ToString() + "' after <", i);
                    }
                    break;

                case ParseState.DocType:
                case ParseState.XmlDeclaration:
                    if (cc == '>')
                    {
                        state = ParseState.Outside;
                    }
                    // Ignore anything else, until we get to '>'
                    break;

                case ParseState.NewElementName:
                    if (ValidInXmlName(cc))
                    {
                        token.Append(cc);
                    }
                    else if (cc == '>')
                    {
                        elementName = token.ToString();
                        StartElement?.Invoke(elementName, attributes);
                        depth++;
                        token = new StringBuilder();
                        state = ParseState.InsideElement;
                    }
                    else if (cc == '/')
                    {
                        elementName = token.ToString();
                        state       = ParseState.EmptyElement;
                    }
                    else if (char.IsWhiteSpace(cc))     // space, tab or line break ends the name
                    {
                        elementName = token.ToString();
                        state       = ParseState.SeekingAttrName;
                        attributes  = new Dictionary <string, string>();
                    }
                    else
                    {
                        parseCancelled = RaiseError("Invalid char '" + cc.ToString() + "' in Element Name", i);
                    }
                    break;

                case ParseState.SeekingAttrName:
                    if (char.IsWhiteSpace(cc))
                    {
                        // do nothing
                    }
                    else if (MayStartXmlName(cc))
                    {
                        token = new StringBuilder(cc.ToString());
                        state = ParseState.AttrName;
                    }
                    else if (cc == '/')
                    {
                        state = ParseState.EmptyElement;
                    }
                    else if (cc == '>')
                    {
                        StartElement?.Invoke(elementName, attributes);
                        depth++;
                        token = new StringBuilder();
                        state = ParseState.InsideElement;
                    }
                    else
                    {
                        parseCancelled = RaiseError("Invalid char '" + cc.ToString() + "' at start of Attribute Name", i);
                    }
                    break;

                case ParseState.AttrName:
                    if (ValidInXmlName(cc))
                    {
                        token.Append(cc);
                    }
                    else if (char.IsWhiteSpace(cc))
                    {
                        attrName = token.ToString();
                        token    = new StringBuilder();
                        state    = ParseState.SeekingEquals;
                    }
                    else if (cc == '=')
                    {
                        attrName = token.ToString();
                        token    = new StringBuilder();
                        state    = ParseState.SeekingAttrValue;
                    }
                    else
                    {
                        parseCancelled = RaiseError("Invalid char '" + cc.ToString() + "' in Attribute Name", i);
                    }
                    break;

                case ParseState.SeekingEquals:
                    if (char.IsWhiteSpace(cc))
                    {
                        // do nothing: whitespace may separate the name from '='
                    }
                    else if (cc == '=')
                    {
                        state = ParseState.SeekingAttrValue;
                    }
                    else if (cc == '/')
                    {
                        parseCancelled = AddParsedAttribute(attributes, attrName, "", i);
                        state          = ParseState.EmptyElement;
                    }
                    else if (cc == '>')
                    {
                        parseCancelled = AddParsedAttribute(attributes, attrName, "", i);
                        if (!parseCancelled)
                        {
                            // The tag is complete: announce it exactly as the other '>' branches do.
                            StartElement?.Invoke(elementName, attributes);
                            depth++;
                            token = new StringBuilder();
                            state = ParseState.InsideElement;
                        }
                    }
                    else if (MayStartXmlName(cc))
                    {
                        // Assume empty attribute (with no value)
                        parseCancelled = AddParsedAttribute(attributes, attrName, "", i);
                        token          = new StringBuilder(cc.ToString());
                        state          = ParseState.AttrName;
                    }
                    else
                    {
                        parseCancelled = RaiseError("Found '" + cc.ToString() + "' when expecting '=' after Attribute Name", i);
                    }
                    break;

                case ParseState.SeekingAttrValue:
                    if (char.IsWhiteSpace(cc))
                    {
                        // do nothing: whitespace may separate '=' from the quoted value
                    }
                    else if (cc == '\'' || cc == '\"')
                    {
                        attrQuoteChar = cc;
                        token         = new StringBuilder();
                        state         = ParseState.AttrValue;
                    }
                    else
                    {
                        parseCancelled = RaiseError("Found '" + cc.ToString() + "' when expecting quoted Attribute Value", i);
                    }
                    break;

                case ParseState.AttrValue:
                    if (cc == attrQuoteChar)
                    {
                        parseCancelled = AddParsedAttribute(attributes, attrName, token.ToString(), i);
                        token          = new StringBuilder();
                        state          = ParseState.SeekingAttrName;
                    }
                    else
                    {
                        token.Append(cc);
                    }
                    break;

                case ParseState.EmptyElement:
                    if (cc == '>')
                    {
                        StartElement?.Invoke(elementName, attributes);
                        // no change to depth, because...
                        EndElement?.Invoke(elementName);
                        token = new StringBuilder();
                        // An empty element at the top level leaves us outside any element.
                        state = depth > 0 ? ParseState.InsideElement : ParseState.Outside;
                    }
                    else if (char.IsWhiteSpace(cc))
                    {
                        // do nothing
                    }
                    else
                    {
                        parseCancelled = RaiseError("Found '" + cc.ToString() + "' after '/' in empty Element Tag", i);
                    }
                    break;

                case ParseState.InsideElement:
                    if (cc == '<')
                    {
                        // Flush the text collected since the previous tag before handling this one.
                        string data = token.ToString();
                        if (data.Length > 0)
                        {
                            if (string.IsNullOrWhiteSpace(data))
                            {
                                IgnorableWhitespace?.Invoke(data, 0, data.Length);
                            }
                            else
                            {
                                Characters?.Invoke(data, 0, data.Length);
                            }
                        }
                        token = new StringBuilder();
                        state = ParseState.LessThan;
                    }
                    else
                    {
                        token.Append(cc);
                    }
                    break;

                case ParseState.EndElementName:
                    if (MayStartXmlName(cc) && token.Length == 0)
                    {
                        token.Append(cc);
                        tokenComplete = false;
                    }
                    else if (ValidInXmlName(cc) && !tokenComplete)
                    {
                        token.Append(cc);
                    }
                    else if (cc == '>')
                    {
                        elementName = token.ToString();
                        EndElement?.Invoke(elementName);
                        token = new StringBuilder();
                        state = --depth > 0 ? ParseState.InsideElement : ParseState.Outside;
                    }
                    else if (char.IsWhiteSpace(cc))
                    {
                        // Trailing whitespace closes the name; further name chars are then an error.
                        tokenComplete = true;
                    }
                    else
                    {
                        parseCancelled = RaiseError("Found '" + cc.ToString() + "' after Name in End-of-Element tag.", i);
                    }
                    break;

                case ParseState.Comment:
                    if (cc == '-' && BufferMatchesAt(buffer, i, "-->"))
                    {
                        i += 2;     // skip the rest of the terminator
                        Comment?.Invoke(token.ToString());
                        token = new StringBuilder();
                        state = depth > 0 ? ParseState.InsideElement : ParseState.Outside;
                    }
                    else
                    {
                        token.Append(cc);
                    }
                    break;

                case ParseState.CData:
                    if (cc == ']' && BufferMatchesAt(buffer, i, "]]>"))
                    {
                        i += 2;     // skip the rest of the terminator
                        string cdata = token.ToString();
                        if (cdata.Length > 0)
                        {
                            // CDATA content is literal text, so it is never treated as ignorable.
                            Characters?.Invoke(cdata, 0, cdata.Length);
                        }
                        token = new StringBuilder();
                        state = depth > 0 ? ParseState.InsideElement : ParseState.Outside;
                    }
                    else
                    {
                        token.Append(cc);
                    }
                    break;

                case ParseState.ProcessingInstruction:
                    if (cc == '?' && BufferMatchesAt(buffer, i, "?>"))
                    {
                        i          += 1;    // skip the '>' of the terminator
                        elementName = token.ToString();
                        ProcessingInstruction?.Invoke(elementName.Head(), elementName.Tail());
                        token = new StringBuilder();
                        state = depth > 0 ? ParseState.InsideElement : ParseState.Outside;
                    }
                    else
                    {
                        token.Append(cc);
                    }
                    break;

                default:
                    parseCancelled = RaiseError("Unhandled ParseState: " + state.ToString(), i);
                    break;
                }
            }
            if (documentStarted)
            {
                EndDocument?.Invoke();
            }
        }

        // Returns true when 'expected' occurs in 'text' starting at 'index' (ordinal comparison).
        // Returns false, without throwing, when fewer than expected.Length chars remain.
        private static bool BufferMatchesAt(string text, int index, string expected)
        {
            return string.CompareOrdinal(text, index, expected, 0, expected.Length) == 0;
        }

        // Adds an attribute to the element being parsed. A repeated name is reported through
        // RaiseError instead of letting Dictionary.Add throw; the first value is kept.
        // Returns true when parsing should be cancelled.
        private bool AddParsedAttribute(Dictionary <string, string> attributes, string name, string value, int position)
        {
            if (attributes.ContainsKey(name))
            {
                return RaiseError("Duplicate Attribute Name '" + name + "'", position);
            }
            attributes.Add(name, value);
            return false;
        }