Beispiel #1
0
        /// <summary>
        /// Parses the given HTML string into a collection of root nodes and their
        /// children.
        /// </summary>
        /// <param name="html">The HTML text to parse.</param>
        public IEnumerable <HtmlNode> ParseChildren(string?html, bool ignoreHtmlRules = false)
        {
            HtmlElementNode rootNode   = new("[TempContainer]");
            HtmlElementNode parentNode = rootNode;

            Parser.Reset(html);
            bool   selfClosing;
            string?tag;

            // Loop until end of input
            while (!Parser.EndOfText)
            {
                if (Parser.Peek() == HtmlRules.TagStart)
                {
                    // CDATA segments (blocks we store but don't parse--includes comments)
                    CDataDefinition?definition = HtmlRules.CDataDefinitions.FirstOrDefault(dd => Parser.MatchesCurrentPosition(dd.StartText, dd.StartComparison));
                    if (definition != null)
                    {
                        parentNode.Children.Add(ParseCDataNode(definition));
                        continue;
                    }

                    // Closing tag
                    if (Parser.Peek(1) == HtmlRules.ForwardSlash)
                    {
                        Parser.Index += 2;
                        tag           = Parser.ParseWhile(HtmlRules.IsTagCharacter);
                        if (tag.Length > 0)
                        {
                            if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                            {
                                // Should never have matched parent if the top-level node
                                if (!parentNode.IsTopLevelNode)
                                {
                                    parentNode = parentNode.ParentNode;
                                }
                            }
                            else
                            {
                                // Handle mismatched closing tag
                                int tagPriority = HtmlRules.GetTagNestLevel(tag);

                                while (!parentNode.IsTopLevelNode && tagPriority > HtmlRules.GetTagNestLevel(parentNode.TagName))
                                {
                                    parentNode = parentNode.ParentNode;
                                }

                                if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                                {
                                    if (!parentNode.IsTopLevelNode)
                                    {
                                        parentNode = parentNode.ParentNode;
                                    }
                                }
                            }
                        }
                        Parser.SkipTo(HtmlRules.TagEnd);
                        Parser.Next();
                        continue;
                    }

                    // Open tag
                    if (ParseTag(out tag))
                    {
                        HtmlTagFlag flags = ignoreHtmlRules ? HtmlTagFlag.None : HtmlRules.GetTagFlags(tag);
                        if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                        {
                            parentNode.Children.Add(ParseHtmlHeader());
                        }
                        else if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                        {
                            parentNode.Children.Add(ParseXmlHeader());
                        }
                        else
                        {
                            // Parse attributes
                            HtmlAttributeCollection attributes = ParseAttributes();

                            // Parse rest of tag
                            if (Parser.Peek() == HtmlRules.ForwardSlash)
                            {
                                Parser.Next();
                                Parser.SkipWhiteSpace();
                                selfClosing = true;
                            }
                            else
                            {
                                selfClosing = false;
                            }
                            Parser.SkipTo(HtmlRules.TagEnd);
                            Parser.Next();

                            // Add node
                            HtmlElementNode node = new(tag, attributes);
                            while (!HtmlRules.TagMayContain(parentNode.TagName, tag) && !parentNode.IsTopLevelNode)
                            {
                                parentNode = parentNode.ParentNode;
                            }
                            parentNode.Children.Add(node);

                            if (flags.HasFlag(HtmlTagFlag.CData))
                            {
                                // CDATA tags are treated as elements but we store and do not parse the inner content
                                if (!selfClosing)
                                {
                                    if (ParseToClosingTag(tag, out string?content) && content.Length > 0)
                                    {
                                        node.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                                    }
                                }
                            }
                            else
                            {
                                if (selfClosing && flags.HasFlag(HtmlTagFlag.NoSelfClosing))
                                {
                                    selfClosing = false;
                                }
                                if (!selfClosing && !flags.HasFlag(HtmlTagFlag.NoChildren))
                                {
                                    parentNode = node;  // Node becomes new parent
                                }
                            }
                        }
                        continue;
                    }
                }

                // Text node: must be at least 1 character (handles '<' that was not a tag)
                string text = Parser.ParseCharacter();
                text += Parser.ParseTo(HtmlRules.TagStart);
                parentNode.Children.Add(new HtmlTextNode(text));
            }

            // Return top-level nodes from nodes just parsed
            return(rootNode.Children);
        }