/// <summary>
/// Reads a CDATA segment whose start text is at the parser's current position
/// and returns it as a node. The text between the delimiters is stored as-is
/// and is not parsed.
/// </summary>
/// <param name="parser">Text parser positioned on <c>definition.StartText</c>.</param>
/// <param name="definition">Definition that supplies the start and end delimiters of the segment.</param>
/// <returns>
/// A <see cref="HtmlCDataNode"/> built from the definition's start text, its end text,
/// and the text extracted between them.
/// </returns>
private HtmlCDataNode ParseCDataNode(TextParser parser, CDataDefinition definition)
{
    // ParseChildren picks the definition using its IgnoreCase setting, so the sanity
    // check must honor the same setting; otherwise a case-insensitive match (start text
    // in a different case than the definition) would trip this assertion in Debug builds.
    Debug.Assert(parser.MatchesCurrentPosition(definition.StartText, definition.IgnoreCase));

    // Skip the opening delimiter.
    parser.MoveAhead(definition.StartText.Length);

    // Everything from here up to the position MoveTo stops at is the raw content.
    int start = parser.Position;
    parser.MoveTo(definition.EndText);
    string content = parser.Extract(start, parser.Position);

    // Skip the closing delimiter.
    // NOTE(review): behavior when EndText is never found depends on TextParser.MoveTo/MoveAhead — confirm.
    parser.MoveAhead(definition.EndText.Length);

    return new HtmlCDataNode(definition.StartText, definition.EndText, content);
}
/// <summary>
/// Builds a node tree from the given HTML text and returns its top-level nodes.
/// </summary>
/// <param name="html">The HTML text to parse.</param>
/// <returns>The children of an internal root element, i.e. the top-level nodes that were parsed.</returns>
public IEnumerable<HtmlNode> ParseChildren(string html)
{
    var parser = new TextParser(html);
    var root = new HtmlElementNode("[Root]");

    // Element that currently receives newly parsed nodes.
    HtmlElementNode current = root;

    // Consume the input one construct at a time until nothing is left.
    while (!parser.EndOfText)
    {
        if (parser.Peek() == HtmlRules.TagStart)
        {
            // CDATA segments (comments included) are stored but never parsed.
            CDataDefinition cdata = HtmlRules.CDataDefinitions.FirstOrDefault(
                candidate => parser.MatchesCurrentPosition(candidate.StartText, candidate.IgnoreCase));
            if (cdata != null)
            {
                current.Children.Add(ParseCDataNode(parser, cdata));
                continue;
            }

            // Closing tag: "</name ... >"
            if (parser.Peek(1) == HtmlRules.ForwardSlash)
            {
                parser.MoveAhead(2);
                string closingName = parser.ParseWhile(ch => HtmlRules.IsTagCharacter(ch));
                if (closingName.Length > 0)
                {
                    current = CloseElement(current, closingName);
                }

                // Skip whatever remains of the closing tag.
                parser.MoveTo(HtmlRules.TagEnd);
                parser.MoveAhead();
                continue;
            }

            // Opening tag
            if (ParseTag(parser, out string tagName))
            {
                HtmlTagFlag flags = HtmlRules.GetTagFlags(tagName);

                // Header tags are handled by their own parsers.
                if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                {
                    current.Children.Add(ParseHtmlHeader(parser));
                    continue;
                }
                if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                {
                    current.Children.Add(ParseXmlHeader(parser));
                    continue;
                }

                HtmlAttributeCollection attributes = ParseAttributes(parser);

                // A '/' right after the attributes marks the tag as self-closing.
                bool selfClosing = parser.Peek() == HtmlRules.ForwardSlash;
                if (selfClosing)
                {
                    parser.MoveAhead();
                    parser.MovePastWhitespace();
                }

                // Skip to just past the end of the tag.
                parser.MoveTo(HtmlRules.TagEnd);
                parser.MoveAhead();

                var element = new HtmlElementNode(tagName, attributes);

                // Climb until an ancestor may contain this tag, or the top-level node is reached.
                while (!(HtmlRules.TagMayContain(current.TagName, tagName) || current.IsTopLevelNode))
                {
                    Debug.Assert(current.ParentNode != null);
                    current = current.ParentNode;
                }
                current.Children.Add(element);

                if (flags.HasFlag(HtmlTagFlag.CData))
                {
                    // CDATA tags are elements whose inner content is stored and not parsed.
                    if (!selfClosing
                        && ParseToClosingTag(parser, tagName, out string content)
                        && content.Length > 0)
                    {
                        element.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                    }
                }
                else
                {
                    // Tags flagged NoSelfClosing are treated as open even when written "<x/>".
                    bool actsAsOpenTag = !selfClosing || flags.HasFlag(HtmlTagFlag.NoSelfClosing);
                    if (actsAsOpenTag && !flags.HasFlag(HtmlTagFlag.NoChildren))
                    {
                        // The new element becomes the parent of what follows.
                        current = element;
                    }
                }
                continue;
            }
        }

        // Text node. Always take at least one character so that a '<' that is not
        // part of a tag is consumed as text.
        int textStart = parser.Position;
        parser.MoveAhead();
        parser.MoveTo(HtmlRules.TagStart);
        Debug.Assert(parser.Position > textStart);
        string text = parser.Extract(textStart, parser.Position);
        current.Children.Add(new HtmlTextNode(text));
    }

    return root.Children;

    // Returns the element that should receive nodes after a closing tag named
    // "name" is encountered while "open" is the current parent.
    HtmlElementNode CloseElement(HtmlElementNode open, string name)
    {
        if (!open.TagName.Equals(name, HtmlRules.TagStringComparison))
        {
            // Mismatched closing tag: leave every open element whose priority is
            // lower than that of the closing tag.
            int priority = HtmlRules.GetTagPriority(name);
            while (!open.IsTopLevelNode && priority > HtmlRules.GetTagPriority(open.TagName))
            {
                open = open.ParentNode;
            }

            if (!open.TagName.Equals(name, HtmlRules.TagStringComparison))
            {
                // Still no match: the closing tag closes nothing further.
                return open;
            }
        }

        // The top-level node should never match a closing tag.
        Debug.Assert(!open.IsTopLevelNode);
        return open.ParentNode;
    }
}