Пример #1
0
        /// <summary>
        /// Tries to read an element tag name starting at the current parser position,
        /// which is expected to be on the tag-start character. The name may begin with
        /// a single '!' or '?', which is kept as part of the returned name.
        /// </summary>
        /// <param name="parser">Parser positioned on the tag-start character.</param>
        /// <param name="tag">Receives the tag name on success; otherwise null.</param>
        /// <returns>
        /// True if a tag name was read, in which case the parser is left just past the
        /// name. False if no tag name starts here; the parser position is then unchanged.
        /// </returns>
        private bool ParseTag(TextParser parser, out string tag)
        {
            Debug.Assert(parser.Peek() == HtmlRules.TagStart);

            // Offset, relative to the tag-start character, of the character being examined
            int offset = 1;
            char lead = parser.Peek(offset);

            // Allow one '!' or '?' prefix in front of the name
            if (lead == '!' || lead == '?')
            {
                lead = parser.Peek(++offset);
            }

            // Nothing that looks like a tag name: report failure without moving the parser
            if (!HtmlRules.IsTagCharacter(lead))
            {
                tag = null;
                return false;
            }

            // Advance to the first character that is not part of the name
            do
            {
                offset++;
            }
            while (HtmlRules.IsTagCharacter(parser.Peek(offset)));

            // Step over the tag-start character, then take everything up to the stop offset
            parser.MoveAhead();
            int nameLength = offset - 1;
            tag = parser.Text.Substring(parser.Index, nameLength);
            parser.MoveAhead(nameLength);
            return true;
        }
Пример #2
0
        private bool ParseToClosingTag(string tag, out string?content)
#endif
        {
            // Searches forward for the closing tag of the given tag name. On success,
            // content receives the text between the starting position and the closing
            // tag, the parser is moved past the closing tag's end character, and true
            // is returned. If the loop ends without an accepted match, content is null
            // and false is returned.
            //
            // NOTE(review): if SkipTo leaves the parser at end of text when nothing
            // matches and Peek then yields a non-tag character, the success branch is
            // taken with the remaining text as content - confirm against TextParser.
            string closingPrefix = $"</{tag}";
            int    contentStart  = Parser.Index;

            // The caller is expected to have just consumed the end of the open tag
            Debug.Assert(Parser.Index > 0 && Parser.Peek(-1) == HtmlRules.TagEnd);

            while (!Parser.EndOfText)
            {
                Parser.SkipTo(closingPrefix, StringComparison.OrdinalIgnoreCase);

                // A tag character right after the match means we only matched the
                // beginning of a longer tag name: step forward and search again
                if (HtmlRules.IsTagCharacter(Parser.Peek(closingPrefix.Length)))
                {
                    Parser.Next();
                    continue;
                }

                // Capture the content, then consume the closing tag through its end character
                content       = Parser.Extract(contentStart, Parser.Index);
                Parser.Index += closingPrefix.Length;
                Parser.SkipTo(HtmlRules.TagEnd);
                Parser.Next();
                return true;
            }

            content = null;
            return false;
        }
Пример #3
0
        private bool ParseTag(out string tag)
#endif
        {
            // Tries to read an element tag name at the current position, which is
            // expected to be on the tag-start character. A single leading '!' or '?'
            // is accepted and kept as part of the name. On success the parser is left
            // just past the name and true is returned; otherwise tag is null, the
            // parser position is unchanged and false is returned.
            Debug.Assert(Parser.Peek() == HtmlRules.TagStart);

            // Offset, relative to the tag-start character, of the character being examined
            int offset = 1;
            char lead = Parser.Peek(offset);

            // Allow one '!' or '?' prefix in front of the name
            if (lead == '!' || lead == '?')
            {
                lead = Parser.Peek(++offset);
            }

            // Nothing that looks like a tag name here
            if (!HtmlRules.IsTagCharacter(lead))
            {
                tag = null;
                return false;
            }

            // Advance to the first character that is not part of the name
            do
            {
                offset++;
            }
            while (HtmlRules.IsTagCharacter(Parser.Peek(offset)));

            // Step over the tag-start character, then take everything up to the stop offset
            Parser.Next();
            int nameLength = offset - 1;
            tag = Parser.Text.Substring(Parser.Index, nameLength);
            Parser.Index += nameLength;
            return true;
        }
Пример #4
0
        /// <summary>
        /// Reads the attributes of an element tag, starting at the current parser
        /// position. Parsing stops at the first non-whitespace character that cannot
        /// start an attribute name, and the parser is left on that character.
        /// </summary>
        /// <param name="parser">Parser positioned just after the tag name.</param>
        /// <returns>
        /// The attributes that were read. An attribute that has no equals sign gets a
        /// null value. A quoted attribute name is stored wrapped in double quotes.
        /// </returns>
        private HtmlAttributeCollection ParseAttributes(TextParser parser)
        {
            HtmlAttributeCollection result = new HtmlAttributeCollection();

            parser.MovePastWhitespace();

            // Each pass reads one attribute; the loop ends when the next character
            // can start neither a plain nor a quoted attribute name
            for (char next = parser.Peek();
                 HtmlRules.IsAttributeNameCharacter(next) || HtmlRules.IsQuoteChar(next);
                 next = parser.Peek())
            {
                HtmlAttribute attribute = new HtmlAttribute();

                // Name: either quoted text (kept inside double quotes) or a run of name characters
                attribute.Name = HtmlRules.IsQuoteChar(next)
                    ? $"\"{parser.ParseQuotedText()}\""
                    : parser.ParseWhile(x => HtmlRules.IsAttributeNameCharacter(x));
                Debug.Assert(attribute.Name.Length > 0);

                // Value: only present when the name is followed by '='
                parser.MovePastWhitespace();
                if (parser.Peek() != '=')
                {
                    // A null value records that there was no equals sign
                    attribute.Value = null;
                }
                else
                {
                    parser.MoveAhead(); // Step over '='
                    parser.MovePastWhitespace();
                    if (HtmlRules.IsQuoteChar(parser.Peek()))
                    {
                        attribute.Value = parser.ParseQuotedText();
                    }
                    else
                    {
                        // Unquoted value: a run of value characters
                        attribute.Value = parser.ParseWhile(x => HtmlRules.IsAttributeValueCharacter(x));
                        Debug.Assert(attribute.Value.Length > 0);
                    }
                }

                result.Add(attribute.Name, attribute);

                // Position on the start of the next attribute, if there is one
                parser.MovePastWhitespace();
            }

            return result;
        }
Пример #5
0
        /// <summary>
        /// Reads the attributes of an element tag, starting at the current parser
        /// position. Parsing stops at the first non-whitespace character that cannot
        /// start an attribute name, and the parser is left on that character.
        /// </summary>
        /// <returns>
        /// The attributes that were read. An attribute that has no equals sign gets a
        /// null value. A quoted attribute name is stored wrapped in double quotes.
        /// </returns>
        private HtmlAttributeCollection ParseAttributes()
        {
            var result = new HtmlAttributeCollection();

            Parser.SkipWhiteSpace();

            // Each pass reads one attribute; the loop ends when the next character
            // can start neither a plain nor a quoted attribute name
            for (char next = Parser.Peek();
                 HtmlRules.IsAttributeNameCharacter(next) || HtmlRules.IsQuoteChar(next);
                 next = Parser.Peek())
            {
                var attribute = new HtmlAttribute();

                // Name: either quoted text (kept inside double quotes) or a run of name characters
                attribute.Name = HtmlRules.IsQuoteChar(next)
                    ? $"\"{Parser.ParseQuotedText()}\""
                    : Parser.ParseWhile(HtmlRules.IsAttributeNameCharacter);
                Debug.Assert(attribute.Name.Length > 0);

                // Value: only present when the name is followed by '='
                Parser.SkipWhiteSpace();
                if (Parser.Peek() != '=')
                {
                    // A null value records that there was no equals sign
                    attribute.Value = null;
                }
                else
                {
                    Parser.Next(); // Step over '='
                    Parser.SkipWhiteSpace();
                    if (HtmlRules.IsQuoteChar(Parser.Peek()))
                    {
                        attribute.Value = Parser.ParseQuotedText();
                    }
                    else
                    {
                        // Unquoted value: a run of value characters
                        attribute.Value = Parser.ParseWhile(HtmlRules.IsAttributeValueCharacter);
                        Debug.Assert(attribute.Value.Length > 0);
                    }
                }

                result.Add(attribute);

                // Position on the start of the next attribute, if there is one
                Parser.SkipWhiteSpace();
            }

            return result;
        }
Пример #6
0
        /// <summary>
        /// Searches forward from the current position for the closing tag of the
        /// given tag name and captures the text that precedes it.
        /// </summary>
        /// <remarks>
        /// NOTE(review): if <c>MoveTo</c> leaves the parser at end of text when nothing
        /// matches and <c>Peek</c> then yields a non-tag character, the success branch
        /// is taken with the remaining text as content - confirm against TextParser.
        /// </remarks>
        /// <param name="parser">Parser positioned just after the end of the open tag.</param>
        /// <param name="tag">Tag name for which the closing tag is being searched.</param>
        /// <param name="content">
        /// On success, the text between the starting position and the closing tag;
        /// otherwise null.
        /// </param>
        /// <returns>
        /// True if a closing tag was accepted, in which case the parser is left just
        /// past that tag's end character. False if the loop ends without one.
        /// </returns>
        private bool ParseToClosingTag(TextParser parser, string tag, out string content)
        {
            string closingPrefix = $"</{tag}";
            int    contentStart  = parser.Index;

            // The caller is expected to have just consumed the end of the open tag
            Debug.Assert(parser.Index > 0 && parser.Peek(-1) == HtmlRules.TagEnd);

            while (!parser.EndOfText)
            {
                parser.MoveTo(closingPrefix, true);

                // A tag character right after the match means we only matched the
                // beginning of a longer tag name: step forward and search again
                if (HtmlRules.IsTagCharacter(parser.Peek(closingPrefix.Length)))
                {
                    parser.MoveAhead();
                    continue;
                }

                // Capture the content, then consume the closing tag through its end character
                content = parser.Extract(contentStart, parser.Index);
                parser.MoveAhead(closingPrefix.Length);
                parser.MoveTo(HtmlRules.TagEnd);
                parser.MoveAhead();
                return true;
            }

            content = null;
            return false;
        }
Пример #7
0
        /// <summary>
        /// Parses the given HTML string into a number of root nodes.
        /// </summary>
        /// <remarks>
        /// Nodes are collected under a temporary "[Root]" container element. While
        /// parsing, a "current parent" element is tracked: open tags may make their
        /// node the new parent, and closing tags move the parent back up the tree.
        /// Any input that is not recognized as a CDATA segment, closing tag or open
        /// tag is stored as a text node.
        /// </remarks>
        /// <param name="html">The HTML text to parse.</param>
        /// <returns>The children of the temporary root container.</returns>
        public IEnumerable <HtmlNode> ParseChildren(string html)
        {
            TextParser      parser     = new TextParser(html);
            HtmlElementNode rootNode   = new HtmlElementNode("[Root]");
            // Element that newly parsed nodes are appended to
            HtmlElementNode parentNode = rootNode;
            string          tag;
            bool            selfClosing;

            // Loop until end of input
            while (!parser.EndOfText)
            {
                if (parser.Peek() == HtmlRules.TagStart)
                {
                    // Test for CDATA segments, which we store but do not parse. This includes comments.
                    // The first definition whose start text matches the current position wins.
                    CDataDefinition definition = HtmlRules.CDataDefinitions.FirstOrDefault(dd => parser.MatchesCurrentPosition(dd.StartText, dd.IgnoreCase));
                    if (definition != null)
                    {
                        parentNode.Children.Add(ParseCDataNode(parser, definition));
                        continue;
                    }

                    // Closing tag
                    if (parser.Peek(1) == HtmlRules.ForwardSlash)
                    {
                        // Skip the tag-start character and the slash, then read the tag name
                        parser.MoveAhead(2);
                        tag = parser.ParseWhile(c => HtmlRules.IsTagCharacter(c));
                        if (tag.Length > 0)
                        {
                            if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                            {
                                // Closes the current parent: continue in the parent's parent.
                                // Should never have matched parent if the top-level node
                                Debug.Assert(!parentNode.IsTopLevelNode);
                                parentNode = parentNode.ParentNode;
                            }
                            else
                            {
                                // Handle mismatched closing tag: walk up while the closing tag's
                                // priority is greater than the current parent's priority
                                int tagPriority = HtmlRules.GetTagPriority(tag);
                                while (!parentNode.IsTopLevelNode && tagPriority > HtmlRules.GetTagPriority(parentNode.TagName))
                                {
                                    parentNode = parentNode.ParentNode;
                                }
                                // If the walk stopped on the element being closed, close it too
                                if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                                {
                                    Debug.Assert(!parentNode.IsTopLevelNode);
                                    parentNode = parentNode.ParentNode;
                                }
                            }
                        }
                        // Discard the rest of the closing tag, whether or not a name was read
                        parser.MoveTo(HtmlRules.TagEnd);
                        parser.MoveAhead();
                        continue;
                    }

                    // Open tag
                    if (ParseTag(parser, out tag))
                    {
                        HtmlTagFlag flags = HtmlRules.GetTagFlags(tag);
                        if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                        {
                            parentNode.Children.Add(ParseHtmlHeader(parser));
                        }
                        else if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                        {
                            parentNode.Children.Add(ParseXmlHeader(parser));
                        }
                        else
                        {
                            // Parse attributes
                            HtmlAttributeCollection attributes = ParseAttributes(parser);

                            // Parse rest of tag: a forward slash here marks the tag as self-closing
                            if (parser.Peek() == HtmlRules.ForwardSlash)
                            {
                                parser.MoveAhead();
                                parser.MovePastWhitespace();
                                selfClosing = true;
                            }
                            else
                            {
                                selfClosing = false;
                            }
                            // Skip anything left in the tag, then step past the tag-end character
                            parser.MoveTo(HtmlRules.TagEnd);
                            parser.MoveAhead();

                            // Add node: walk up until an ancestor may contain this tag,
                            // stopping at the top-level node at the latest
                            HtmlElementNode node = new HtmlElementNode(tag, attributes);
                            while (!HtmlRules.TagMayContain(parentNode.TagName, tag) && !parentNode.IsTopLevelNode)
                            {
                                Debug.Assert(parentNode.ParentNode != null);
                                parentNode = parentNode.ParentNode;
                            }
                            parentNode.Children.Add(node);

                            if (flags.HasFlag(HtmlTagFlag.CData))
                            {
                                // CDATA tags are treated as elements but we store and do not parse the inner content
                                if (!selfClosing)
                                {
                                    // Only non-empty content is stored; the node never becomes the parent
                                    if (ParseToClosingTag(parser, tag, out string content) && content.Length > 0)
                                    {
                                        node.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                                    }
                                }
                            }
                            else
                            {
                                // Tags flagged NoSelfClosing ignore a self-closing slash
                                if (selfClosing && flags.HasFlag(HtmlTagFlag.NoSelfClosing))
                                {
                                    selfClosing = false;
                                }
                                if (!selfClosing && !flags.HasFlag(HtmlTagFlag.NoChildren))
                                {
                                    parentNode = node;  // Node becomes new parent
                                }
                            }
                        }
                        continue;
                    }
                }

                // Text node: reached when the current character is not a tag start, or when
                // a tag start was not followed by a CDATA segment, closing tag or tag name
                int start = parser.Index;
                // Text must be at least 1 character (handle '<' that is not part of a tag)
                parser.MoveAhead();
                parser.MoveTo(HtmlRules.TagStart);
                Debug.Assert(parser.Index > start);
                parentNode.Children.Add(new HtmlTextNode(parser.Extract(start, parser.Index)));
            }

            // Return the nodes collected under the temporary root container
            return(rootNode.Children);
        }
Пример #8
0
        /// <summary>
        /// Parses the given selector text and returns the corresponding data structures.
        /// </summary>
        /// <param name="selectorText">
        /// The selector text to be parsed. If it is null, empty or only whitespace,
        /// no selectors are added to the returned collection.
        /// </param>
        /// <remarks>
        /// Returns multiple <see cref="Selector"/>s when the selector contains commas.
        /// The parser recognizes tag names, '*' (match all tags), the prefix characters
        /// registered in <c>SpecialCharacters</c>, bracketed attribute selectors with
        /// '=' (match) or ':=' (regular expression), ',' between selectors, '>' for
        /// immediate children and whitespace for descendants. Any other character is
        /// skipped.
        /// </remarks>
        /// <returns>The parsed selector data structures.</returns>
        public static SelectorCollection ParseSelector(string selectorText)
        {
            SelectorCollection selectors = new SelectorCollection();

            if (!string.IsNullOrWhiteSpace(selectorText))
            {
                TextParser parser = new TextParser(selectorText);
                parser.MovePastWhitespace();

                // Every branch below must consume at least one character,
                // otherwise this loop would never terminate
                while (!parser.EndOfText)
                {
                    // Test next character
                    char ch = parser.Peek();
                    if (IsNameCharacter(ch) || ch == '*')
                    {
                        // Parse tag name
                        Selector selector = selectors.GetLast(true);
                        if (ch == '*')
                        {
                            selector.Tag = null;    // Match all tags
                            // Consume the '*'; without this the same character would be
                            // read again on every iteration and the loop would never end
                            parser.MoveAhead();
                        }
                        else
                        {
                            selector.Tag = parser.ParseWhile(c => IsNameCharacter(c));
                        }
                    }
                    else if (SpecialCharacters.TryGetValue(ch, out string name))
                    {
                        // Parse special attributes: the prefix character maps to an
                        // attribute name, and the text that follows is its value
                        parser.MoveAhead();
                        string value = parser.ParseWhile(c => IsValueCharacter(c));
                        if (value.Length > 0)
                        {
                            SelectorAttribute attribute = new SelectorAttribute
                            {
                                Name  = name,
                                Value = value,
                                Mode  = SelectorAttributeMode.Contains
                            };

                            Selector selector = selectors.GetLast(true);
                            selector.Attributes.Add(attribute);
                        }
                    }
                    else if (ch == '[')
                    {
                        // Parse attribute selector
                        parser.MoveAhead();
                        parser.MovePastWhitespace();
                        name = parser.ParseWhile(c => IsNameCharacter(c));
                        if (name.Length > 0)
                        {
                            SelectorAttribute attribute = new SelectorAttribute
                            {
                                Name = name
                            };

                            // Parse attribute assignment operator
                            parser.MovePastWhitespace();
                            if (parser.Peek() == '=')
                            {
                                attribute.Mode = SelectorAttributeMode.Match;
                                parser.MoveAhead();
                            }
                            else if (parser.Peek() == ':' && parser.Peek(1) == '=')
                            {
                                attribute.Mode = SelectorAttributeMode.RegEx;
                                parser.MoveAhead(2);
                            }
                            else
                            {
                                attribute.Mode = SelectorAttributeMode.ExistsOnly;
                            }

                            // Parse attribute value (only when an operator was present)
                            if (attribute.Mode != SelectorAttributeMode.ExistsOnly)
                            {
                                parser.MovePastWhitespace();
                                if (HtmlRules.IsQuoteChar(parser.Peek()))
                                {
                                    attribute.Value = parser.ParseQuotedText();
                                }
                                else
                                {
                                    attribute.Value = parser.ParseWhile(c => IsValueCharacter(c));
                                }
                            }

                            Selector selector = selectors.GetLast(true);
                            selector.Attributes.Add(attribute);
                        }

                        // Close out attribute selector. A missing ']' is tolerated in
                        // release builds: the offending character is handled by the
                        // next loop iteration.
                        parser.MovePastWhitespace();
                        Debug.Assert(parser.Peek() == ']');
                        if (parser.Peek() == ']')
                        {
                            parser.MoveAhead();
                        }
                    }
                    else if (ch == ',')
                    {
                        // Multiple selectors
                        parser.MoveAhead();
                        parser.MovePastWhitespace();
                        selectors.Add(new Selector());
                    }
                    else if (ch == '>')
                    {
                        // '>' indicates an immediate-child selector
                        parser.MoveAhead();
                        parser.MovePastWhitespace();
                        Debug.Assert(selectors.Any());
                        Selector selector = selectors.AddChildSelector();
                        selector.ImmediateChildOnly = true;
                    }
                    else if (char.IsWhiteSpace(ch))
                    {
                        // Handle whitespace: on its own it indicates a child selector
                        parser.MovePastWhitespace();
                        // ',' and '>' change meaning of whitespace
                        if (parser.Peek() != ',' && parser.Peek() != '>')
                        {
                            selectors.AddChildSelector();
                        }
                    }
                    else
                    {
                        // Unknown syntax: skip the character so parsing can continue
                        Debug.Assert(false);
                        parser.MoveAhead();
                    }
                }
            }
            selectors.RemoveEmpty();
            return(selectors);
        }
Пример #9
0
        /// <summary>
        /// Parses the given HTML string into a collection of root nodes and their
        /// children.
        /// </summary>
        /// <remarks>
        /// Nodes are collected under a temporary "[TempContainer]" element. While
        /// parsing, a "current parent" element is tracked: open tags may make their
        /// node the new parent, and closing tags move the parent back up the tree.
        /// Any input that is not recognized as a CDATA segment, closing tag or open
        /// tag is stored as a text node.
        /// </remarks>
        /// <param name="html">The HTML text to parse.</param>
        /// <param name="ignoreHtmlRules">
        /// When true, every open tag is processed with <c>HtmlTagFlag.None</c> instead
        /// of the flags returned by <c>HtmlRules.GetTagFlags</c>, so none of the
        /// flag-driven handling in this method (headers, CDATA content, NoSelfClosing,
        /// NoChildren) applies. The <c>HtmlRules.TagMayContain</c> check and the
        /// closing-tag handling are not affected by this parameter.
        /// </param>
        /// <returns>The children of the temporary container element.</returns>
        public IEnumerable <HtmlNode> ParseChildren(string?html, bool ignoreHtmlRules = false)
        {
            HtmlElementNode rootNode   = new("[TempContainer]");
            // Element that newly parsed nodes are appended to
            HtmlElementNode parentNode = rootNode;

            Parser.Reset(html);
            bool   selfClosing;
            string?tag;

            // Loop until end of input
            while (!Parser.EndOfText)
            {
                if (Parser.Peek() == HtmlRules.TagStart)
                {
                    // CDATA segments (blocks we store but don't parse--includes comments)
                    // The first definition whose start text matches the current position wins.
                    CDataDefinition?definition = HtmlRules.CDataDefinitions.FirstOrDefault(dd => Parser.MatchesCurrentPosition(dd.StartText, dd.StartComparison));
                    if (definition != null)
                    {
                        parentNode.Children.Add(ParseCDataNode(definition));
                        continue;
                    }

                    // Closing tag
                    if (Parser.Peek(1) == HtmlRules.ForwardSlash)
                    {
                        // Skip the tag-start character and the slash, then read the tag name
                        Parser.Index += 2;
                        tag           = Parser.ParseWhile(HtmlRules.IsTagCharacter);
                        if (tag.Length > 0)
                        {
                            if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                            {
                                // Closes the current parent: continue in the parent's parent.
                                // Should never have matched parent if the top-level node
                                if (!parentNode.IsTopLevelNode)
                                {
                                    parentNode = parentNode.ParentNode;
                                }
                            }
                            else
                            {
                                // Handle mismatched closing tag: walk up while the closing tag's
                                // nest level is greater than the current parent's nest level
                                int tagPriority = HtmlRules.GetTagNestLevel(tag);

                                while (!parentNode.IsTopLevelNode && tagPriority > HtmlRules.GetTagNestLevel(parentNode.TagName))
                                {
                                    parentNode = parentNode.ParentNode;
                                }

                                // If the walk stopped on the element being closed, close it too
                                if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                                {
                                    if (!parentNode.IsTopLevelNode)
                                    {
                                        parentNode = parentNode.ParentNode;
                                    }
                                }
                            }
                        }
                        // Discard the rest of the closing tag, whether or not a name was read
                        Parser.SkipTo(HtmlRules.TagEnd);
                        Parser.Next();
                        continue;
                    }

                    // Open tag
                    if (ParseTag(out tag))
                    {
                        // With ignoreHtmlRules set, no tag-specific flags are applied
                        HtmlTagFlag flags = ignoreHtmlRules ? HtmlTagFlag.None : HtmlRules.GetTagFlags(tag);
                        if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                        {
                            parentNode.Children.Add(ParseHtmlHeader());
                        }
                        else if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                        {
                            parentNode.Children.Add(ParseXmlHeader());
                        }
                        else
                        {
                            // Parse attributes
                            HtmlAttributeCollection attributes = ParseAttributes();

                            // Parse rest of tag: a forward slash here marks the tag as self-closing
                            if (Parser.Peek() == HtmlRules.ForwardSlash)
                            {
                                Parser.Next();
                                Parser.SkipWhiteSpace();
                                selfClosing = true;
                            }
                            else
                            {
                                selfClosing = false;
                            }
                            // Skip anything left in the tag, then step past the tag-end character
                            Parser.SkipTo(HtmlRules.TagEnd);
                            Parser.Next();

                            // Add node: walk up until an ancestor may contain this tag,
                            // stopping at the top-level node at the latest
                            HtmlElementNode node = new(tag, attributes);
                            while (!HtmlRules.TagMayContain(parentNode.TagName, tag) && !parentNode.IsTopLevelNode)
                            {
                                parentNode = parentNode.ParentNode;
                            }
                            parentNode.Children.Add(node);

                            if (flags.HasFlag(HtmlTagFlag.CData))
                            {
                                // CDATA tags are treated as elements but we store and do not parse the inner content
                                if (!selfClosing)
                                {
                                    // Only non-empty content is stored; the node never becomes the parent
                                    if (ParseToClosingTag(tag, out string?content) && content.Length > 0)
                                    {
                                        node.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                                    }
                                }
                            }
                            else
                            {
                                // Tags flagged NoSelfClosing ignore a self-closing slash
                                if (selfClosing && flags.HasFlag(HtmlTagFlag.NoSelfClosing))
                                {
                                    selfClosing = false;
                                }
                                if (!selfClosing && !flags.HasFlag(HtmlTagFlag.NoChildren))
                                {
                                    parentNode = node;  // Node becomes new parent
                                }
                            }
                        }
                        continue;
                    }
                }

                // Text node: must be at least 1 character (handles '<' that was not a tag)
                // Reached when the current character is not a tag start, or when a tag start
                // was not followed by a CDATA segment, closing tag or tag name
                string text = Parser.ParseCharacter();
                text += Parser.ParseTo(HtmlRules.TagStart);
                parentNode.Children.Add(new HtmlTextNode(text));
            }

            // Return top-level nodes from nodes just parsed
            return(rootNode.Children);
        }