/// <summary>
/// Parses HTML markup into a tree of nodes and returns the top-level nodes,
/// each with its descendants attached.
/// </summary>
/// <param name="html">The HTML text to parse.</param>
/// <param name="ignoreHtmlRules">
/// When <c>true</c>, tag flags are not looked up through
/// <c>HtmlRules.GetTagFlags</c>; every opening tag is handled as
/// <c>HtmlTagFlag.None</c>. Containment and nest-level checks are still made.
/// </param>
/// <returns>
/// The children of the temporary container node that the parsed nodes were
/// added to.
/// </returns>
public IEnumerable<HtmlNode> ParseChildren(string? html, bool ignoreHtmlRules = false)
{
    // Temporary container: gives every parsed node a parent to be attached to
    HtmlElementNode container = new("[TempContainer]");
    // Element that newly parsed nodes are currently being appended to
    HtmlElementNode current = container;
    string? tagName;

    // Consumes at least one character (so a '<' that did not start a tag is
    // kept as text), then everything up to the next tag start.
    void AppendTextNode()
    {
        string text = Parser.ParseCharacter() + Parser.ParseTo(HtmlRules.TagStart);
        current.Children.Add(new HtmlTextNode(text));
    }

    Parser.Reset(html);

    // Keep going until the input is exhausted
    while (!Parser.EndOfText)
    {
        // Plain text
        if (Parser.Peek() != HtmlRules.TagStart)
        {
            AppendTextNode();
            continue;
        }

        // CDATA segments (includes comments): stored as-is, never parsed
        CDataDefinition? cdata = HtmlRules.CDataDefinitions.FirstOrDefault(
            d => Parser.MatchesCurrentPosition(d.StartText, d.StartComparison));
        if (cdata != null)
        {
            current.Children.Add(ParseCDataNode(cdata));
            continue;
        }

        // Closing tag
        if (Parser.Peek(1) == HtmlRules.ForwardSlash)
        {
            Parser.Index += 2;
            tagName = Parser.ParseWhile(HtmlRules.IsTagCharacter);
            if (tagName.Length > 0)
            {
                bool closesCurrent = current.TagName.Equals(tagName, HtmlRules.TagStringComparison);
                if (!closesCurrent)
                {
                    // Mismatched closing tag: climb while the closing tag's nest
                    // level is greater than that of the current element
                    int closingLevel = HtmlRules.GetTagNestLevel(tagName);
                    while (!current.IsTopLevelNode &&
                           closingLevel > HtmlRules.GetTagNestLevel(current.TagName))
                    {
                        current = current.ParentNode;
                    }
                    closesCurrent = current.TagName.Equals(tagName, HtmlRules.TagStringComparison);
                }

                // The top-level node should never be the match; it is never left
                if (closesCurrent && !current.IsTopLevelNode)
                {
                    current = current.ParentNode;
                }
            }

            // Discard whatever remains of the closing tag
            Parser.SkipTo(HtmlRules.TagEnd);
            Parser.Next();
            continue;
        }

        // Not an opening tag either: treat the '<' as text
        if (!ParseTag(out tagName))
        {
            AppendTextNode();
            continue;
        }

        // Opening tag
        HtmlTagFlag flags = ignoreHtmlRules
            ? HtmlTagFlag.None
            : HtmlRules.GetTagFlags(tagName);

        if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
        {
            current.Children.Add(ParseHtmlHeader());
            continue;
        }

        if (flags.HasFlag(HtmlTagFlag.XmlHeader))
        {
            current.Children.Add(ParseXmlHeader());
            continue;
        }

        HtmlAttributeCollection attributes = ParseAttributes();

        // A '/' after the attributes marks the tag as self-closing
        bool selfClosing = Parser.Peek() == HtmlRules.ForwardSlash;
        if (selfClosing)
        {
            Parser.Next();
            Parser.SkipWhiteSpace();
        }
        Parser.SkipTo(HtmlRules.TagEnd);
        Parser.Next();

        // Climb until an ancestor may contain this tag (or the top is reached),
        // then attach the new element there
        HtmlElementNode element = new(tagName, attributes);
        while (!HtmlRules.TagMayContain(current.TagName, tagName) && !current.IsTopLevelNode)
        {
            current = current.ParentNode;
        }
        current.Children.Add(element);

        if (flags.HasFlag(HtmlTagFlag.CData))
        {
            // CDATA tags are elements whose inner content is stored unparsed
            if (!selfClosing &&
                ParseToClosingTag(tagName, out string? content) &&
                content.Length > 0)
            {
                element.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
            }
        }
        else
        {
            // Self-closing syntax is disregarded for tags flagged NoSelfClosing
            bool closed = selfClosing && !flags.HasFlag(HtmlTagFlag.NoSelfClosing);
            if (!closed && !flags.HasFlag(HtmlTagFlag.NoChildren))
            {
                // Subsequent nodes become children of this element
                current = element;
            }
        }
    }

    // Hand back the top-level nodes collected under the temporary container
    return container.Children;
}