/// <summary>
/// Reads the parameters of an HTML DOCTYPE header tag and then skips past the end of the tag.
/// The parser is expected to be positioned immediately after the tag name.
/// </summary>
/// <param name="parser">Parser object.</param>
/// <returns>A header node holding the parameters that were read, in order.</returns>
private HtmlHeaderNode ParseHtmlHeader(TextParser parser)
{
    HtmlHeaderNode header = new HtmlHeaderNode();

    for (;;)
    {
        parser.MovePastWhitespace();
        char next = parser.Peek();

        if (HtmlRules.IsQuoteChar(next))
        {
            // Quoted parameter: stored re-wrapped in double quotes
            string quoted = parser.ParseQuotedText();
            header.Parameters.Add($"\"{quoted}\"");
            continue;
        }

        if (HtmlRules.IsAttributeNameCharacter(next))
        {
            // Bare-word parameter
            string word = parser.ParseWhile(ch => HtmlRules.IsAttributeNameCharacter(ch));
            header.Parameters.Add(word);
            continue;
        }

        // Anything else ends the parameter list
        break;
    }

    // Skip whatever remains of the tag, including the tag-end character
    parser.MoveTo(HtmlRules.TagEnd);
    parser.MoveAhead();
    return header;
}
/// <summary>
/// Tries to read an element tag name starting at the current position, which must be
/// the tag-start character. On success the parser is left just after the tag name and
/// true is returned; on failure the parser position is untouched and false is returned.
/// </summary>
/// <param name="parser">Parser object, positioned on the tag-start character.</param>
/// <param name="tag">
/// Receives the tag name on success, including a leading '!' or '?' when one directly
/// follows the tag-start character; set to null on failure.
/// </param>
/// <returns>True if a tag name was parsed; otherwise false.</returns>
private bool ParseTag(TextParser parser, out string tag)
{
    Debug.Assert(parser.Peek() == HtmlRules.TagStart);

    // Offset, relative to the tag-start character, of the first candidate name character
    int offset = 1;
    char first = parser.Peek(offset);
    if (first == '!' || first == '?')
    {
        // A single '!' or '?' prefix is allowed before the name proper
        offset++;
        first = parser.Peek(offset);
    }

    if (!HtmlRules.IsTagCharacter(first))
    {
        // No tag found at this position
        tag = null;
        return false;
    }

    // Advance to the first character that cannot belong to the tag name
    do
    {
        offset++;
    }
    while (HtmlRules.IsTagCharacter(parser.Peek(offset)));

    // Move past '<'
    parser.MoveAhead();

    // Everything between '<' and the stopping offset is the tag name
    int nameLength = offset - 1;
    tag = parser.Text.Substring(parser.Position, nameLength);
    parser.MoveAhead(nameLength);
    return true;
}
/// <summary>
/// Reads the attributes of an element tag. On return the parser is positioned at the
/// first non-whitespace character that is not the start of an attribute name.
/// </summary>
/// <param name="parser">Parser object, positioned after the tag name.</param>
/// <returns>
/// The collection of parsed attributes. An attribute written without an equals sign
/// has a null value.
/// </returns>
private HtmlAttributeCollection ParseAttributes(TextParser parser)
{
    HtmlAttributeCollection result = new HtmlAttributeCollection();

    // Each pass consumes one attribute plus any whitespace that follows it
    for (parser.MovePastWhitespace();
         HtmlRules.IsAttributeNameCharacter(parser.Peek());
         parser.MovePastWhitespace())
    {
        // Attribute name
        HtmlAttribute attribute = new HtmlAttribute()
        {
            Name = parser.ParseWhile(ch => HtmlRules.IsAttributeNameCharacter(ch))
        };
        Debug.Assert(attribute.Name.Length > 0);

        // Attribute value (optional)
        parser.MovePastWhitespace();
        if (parser.Peek() != '=')
        {
            // Null attribute value indicates no equals sign
            attribute.Value = null;
        }
        else
        {
            parser.MoveAhead(); // Skip '='
            parser.MovePastWhitespace();

            if (!HtmlRules.IsQuoteChar(parser.Peek()))
            {
                // Unquoted attribute value
                attribute.Value = parser.ParseWhile(ch => HtmlRules.IsAttributeValueCharacter(ch));
                Debug.Assert(attribute.Value.Length > 0);
            }
            else
            {
                // Quoted attribute value
                attribute.Value = parser.ParseQuotedText();
            }
        }

        result.Add(attribute.Name, attribute);
    }

    return result;
}
/// <summary>
/// Advances the parser past the closing tag for the given tag name and returns the
/// text found between the starting position and that closing tag.
/// </summary>
/// <remarks>
/// A candidate match is rejected when the character following it is a tag character,
/// i.e. when only the beginning of a longer tag name was matched. False is returned
/// only when the loop exits because the parser reached the end of the text.
/// NOTE(review): if <c>MoveTo</c> leaves the parser at the end of the text when nothing
/// matches, and <c>Peek</c> past the end yields a non-tag character, the "match" branch
/// is taken and this method returns true with the remaining text as content. Confirm
/// against TextParser before relying on the false return for unclosed tags.
/// </remarks>
/// <param name="parser">Parser object, positioned just after the open tag.</param>
/// <param name="tag">Tag name for which the closing tag is being searched.</param>
/// <param name="content">
/// Receives the text between the starting position and the closing tag, or null when
/// false is returned.
/// </param>
/// <returns>True if content was extracted; otherwise false.</returns>
private bool ParseToClosingTag(TextParser parser, string tag, out string content)
{
    // Position assumed to be just after the open tag
    Debug.Assert(parser.Position > 0 && parser.Peek(-1) == HtmlRules.TagEnd);

    int contentStart = parser.Position;
    string closingPrefix = $"</{tag}";

    while (!parser.EndOfText)
    {
        parser.MoveTo(closingPrefix, true);

        if (HtmlRules.IsTagCharacter(parser.Peek(closingPrefix.Length)))
        {
            // Only the first part of a longer tag name matched; step forward and retry
            parser.MoveAhead();
            continue;
        }

        content = parser.Extract(contentStart, parser.Position);

        // Skip the closing tag itself, up to and including the tag-end character
        parser.MoveAhead(closingPrefix.Length);
        parser.MoveTo(HtmlRules.TagEnd);
        parser.MoveAhead();
        return true;
    }

    content = null;
    return false;
}
/// <summary>
/// Parses the given selector text and returns the corresponding data structures.
/// </summary>
/// <param name="selectorText">The selector text to be parsed.</param>
/// <remarks>
/// Returns multiple <see cref="Selector"/>s when the selector contains commas.
/// A null, empty or all-whitespace <paramref name="selectorText"/> is not parsed at all.
/// Unrecognized characters are skipped (and trip an assertion in debug builds).
/// </remarks>
/// <returns>The parsed selector data structures.</returns>
public static SelectorCollection ParseSelector(string selectorText)
{
    SelectorCollection selectors = new SelectorCollection();

    if (!string.IsNullOrWhiteSpace(selectorText))
    {
        TextParser parser = new TextParser(selectorText);
        parser.MovePastWhitespace();

        while (!parser.EndOfText)
        {
            // Test next character
            char ch = parser.Peek();
            if (IsNameCharacter(ch) || ch == '*')
            {
                // Parse tag name
                Selector selector = selectors.GetLast(true);
                if (ch == '*')
                {
                    // Consume the wildcard. Without this the parser never advances
                    // and the enclosing loop spins forever on the same '*'.
                    parser.MoveAhead();
                    selector.Tag = null;    // Match all tags
                }
                else
                {
                    selector.Tag = parser.ParseWhile(c => IsNameCharacter(c));
                }
            }
            else if (SpecialCharacters.TryGetValue(ch, out string specialName))
            {
                // Parse special attributes: the prefix character maps to an attribute
                // name and the text that follows is matched with Contains mode
                parser.MoveAhead();
                string value = parser.ParseWhile(c => IsValueCharacter(c));
                if (value.Length > 0)
                {
                    SelectorAttribute attribute = new SelectorAttribute
                    {
                        Name = specialName,
                        Value = value,
                        Mode = SelectorAttributeMode.Contains
                    };

                    Selector selector = selectors.GetLast(true);
                    selector.Attributes.Add(attribute);
                }
            }
            else if (ch == '[')
            {
                // Parse attribute selector
                parser.MoveAhead();
                parser.MovePastWhitespace();
                string attributeName = parser.ParseWhile(c => IsNameCharacter(c));
                if (attributeName.Length > 0)
                {
                    SelectorAttribute attribute = new SelectorAttribute
                    {
                        Name = attributeName
                    };

                    // Parse attribute assignment operator
                    parser.MovePastWhitespace();
                    if (parser.Peek() == '=')
                    {
                        attribute.Mode = SelectorAttributeMode.Match;
                        parser.MoveAhead();
                    }
                    else if (parser.Peek() == ':' && parser.Peek(1) == '=')
                    {
                        attribute.Mode = SelectorAttributeMode.RegEx;
                        parser.MoveAhead(2);
                    }
                    else
                    {
                        attribute.Mode = SelectorAttributeMode.ExistsOnly;
                    }

                    // Parse attribute value
                    if (attribute.Mode != SelectorAttributeMode.ExistsOnly)
                    {
                        parser.MovePastWhitespace();
                        if (HtmlRules.IsQuoteChar(parser.Peek()))
                        {
                            attribute.Value = parser.ParseQuotedText();
                        }
                        else
                        {
                            attribute.Value = parser.ParseWhile(c => IsValueCharacter(c));
                        }
                    }

                    Selector selector = selectors.GetLast(true);
                    selector.Attributes.Add(attribute);
                }

                // Close out attribute selector
                parser.MovePastWhitespace();
                Debug.Assert(parser.Peek() == ']');
                if (parser.Peek() == ']')
                {
                    parser.MoveAhead();
                }
            }
            else if (ch == ',')
            {
                // Multiple selectors
                parser.MoveAhead();
                parser.MovePastWhitespace();
                selectors.Add(new Selector());
            }
            else if (ch == '>')
            {
                // '>' indicates an immediate-child selector
                parser.MoveAhead();
                parser.MovePastWhitespace();
                Debug.Assert(selectors.Any());
                Selector selector = selectors.AddChildSelector();
                selector.ImmediateChildOnly = true;
            }
            else if (char.IsWhiteSpace(ch))
            {
                // Handle whitespace
                parser.MovePastWhitespace();

                // ',' and '>' change meaning of whitespace
                if (parser.Peek() != ',' && parser.Peek() != '>')
                {
                    selectors.AddChildSelector();
                }
            }
            else
            {
                // Unknown syntax: skip the character so parsing can continue
                Debug.Assert(false);
                parser.MoveAhead();
            }
        }
    }

    selectors.RemoveEmpty();
    return selectors;
}
/// <summary>
/// Parses the given HTML string into a number of root nodes.
/// </summary>
/// <remarks>
/// Nodes are collected under a temporary "[Root]" element whose children are returned.
/// Closing tags that do not match the current parent unwind the parent chain according
/// to tag priority; a tag-start character that does not begin a recognizable tag is
/// kept as part of a text node.
/// </remarks>
/// <param name="html">The HTML text to parse.</param>
/// <returns>The top-level nodes parsed from <paramref name="html"/>.</returns>
public IEnumerable<HtmlNode> ParseChildren(string html)
{
    TextParser parser = new TextParser(html);
    HtmlElementNode root = new HtmlElementNode("[Root]");
    HtmlElementNode current = root;     // Node currently receiving children

    // Loop until end of input
    while (!parser.EndOfText)
    {
        if (parser.Peek() == HtmlRules.TagStart)
        {
            // CDATA segments (including comments) are stored but not parsed
            CDataDefinition cdata = HtmlRules.CDataDefinitions
                .FirstOrDefault(d => parser.MatchesCurrentPosition(d.StartText, d.IgnoreCase));
            if (cdata != null)
            {
                current.Children.Add(ParseCDataNode(parser, cdata));
                continue;
            }

            // Closing tag
            if (parser.Peek(1) == HtmlRules.ForwardSlash)
            {
                parser.MoveAhead(2);
                string closingName = parser.ParseWhile(ch => HtmlRules.IsTagCharacter(ch));
                if (closingName.Length > 0)
                {
                    if (!current.TagName.Equals(closingName, HtmlRules.TagStringComparison))
                    {
                        // Mismatched closing tag: unwind past parents of lower priority
                        int closingPriority = HtmlRules.GetTagPriority(closingName);
                        while (!current.IsTopLevelNode &&
                               closingPriority > HtmlRules.GetTagPriority(current.TagName))
                        {
                            current = current.ParentNode;
                        }
                    }

                    if (current.TagName.Equals(closingName, HtmlRules.TagStringComparison))
                    {
                        // Should never have matched parent if the top-level node
                        Debug.Assert(!current.IsTopLevelNode);
                        current = current.ParentNode;
                    }
                }

                // Discard the rest of the closing tag
                parser.MoveTo(HtmlRules.TagEnd);
                parser.MoveAhead();
                continue;
            }

            // Open tag
            if (ParseTag(parser, out string openName))
            {
                HtmlTagFlag flags = HtmlRules.GetTagFlags(openName);

                if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                {
                    current.Children.Add(ParseHtmlHeader(parser));
                    continue;
                }

                if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                {
                    current.Children.Add(ParseXmlHeader(parser));
                    continue;
                }

                // Parse attributes
                HtmlAttributeCollection attributes = ParseAttributes(parser);

                // Parse rest of tag
                bool selfClosing = parser.Peek() == HtmlRules.ForwardSlash;
                if (selfClosing)
                {
                    parser.MoveAhead();
                    parser.MovePastWhitespace();
                }
                parser.MoveTo(HtmlRules.TagEnd);
                parser.MoveAhead();

                // Climb to the nearest ancestor allowed to contain this tag, then add the node
                HtmlElementNode element = new HtmlElementNode(openName, attributes);
                while (!HtmlRules.TagMayContain(current.TagName, openName) && !current.IsTopLevelNode)
                {
                    Debug.Assert(current.ParentNode != null);
                    current = current.ParentNode;
                }
                current.Children.Add(element);

                if (flags.HasFlag(HtmlTagFlag.CData))
                {
                    // CDATA tags are treated as elements but the inner content is stored unparsed
                    if (!selfClosing &&
                        ParseToClosingTag(parser, openName, out string content) &&
                        content.Length > 0)
                    {
                        element.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                    }
                }
                else if (!flags.HasFlag(HtmlTagFlag.NoChildren) &&
                         (!selfClosing || flags.HasFlag(HtmlTagFlag.NoSelfClosing)))
                {
                    // A self-closing marker is ignored on tags flagged NoSelfClosing
                    current = element;  // Node becomes new parent
                }

                continue;
            }
        }

        // Text node
        int textStart = parser.Position;

        // Text must be at least 1 character (handle '<' that is not part of a tag)
        parser.MoveAhead();
        parser.MoveTo(HtmlRules.TagStart);
        Debug.Assert(parser.Position > textStart);

        string text = parser.Extract(textStart, parser.Position);
        current.Children.Add(new HtmlTextNode(text));
    }

    return root.Children;
}