コード例 #1
0
        // Verifies that an element appended to a document reports that document as its parent.
        public void ParentNodeOfNotRootElementIsNotNull()
        {
            // Arrange
            var document = new HtmlDocumentNode();
            var anchor = new HtmlElementNode("a");

            // Act
            document.AppendChild(anchor);

            // Assert
            Assert.Equal(document, anchor.ParentNode);
        }
コード例 #2
0
 // Resets the list view via InitializeListView(AttributeColumns, ...) and then adds one
 // item per attribute of the node: the item is labelled with the attribute key and gets a
 // sub-item holding the attribute value, or the text "(null)" when the attribute object is null.
 private static void PopulatePropertiesElement(HtmlElementNode node, ListView listView)
 {
     InitializeListView(AttributeColumns, listView);

     foreach (var attribute in node.Attributes)
     {
         var row = listView.Items.Add(attribute.Key);

         if (attribute.Value != null)
         {
             row.SubItems.Add(attribute.Value.Value);
         }
         else
         {
             row.SubItems.Add("(null)");
         }
     }
 }
コード例 #3
0
        // Verifies that the only child of a document has no previous sibling.
        public void PreviousSiblingIsNull()
        {
            // Arrange
            var document = new HtmlDocumentNode();
            var anchor = new HtmlElementNode("a");
            document.AppendChild(anchor);

            // Act
            var sibling = anchor.PreviousSibling;

            // Assert
            Assert.Null(sibling);
        }
コード例 #4
0
        // Verifies that the only child of a document has no next sibling.
        public void NextSiblingIsNull()
        {
            // Arrange
            var document = new HtmlDocumentNode();
            var anchor = new HtmlElementNode("a");
            document.AppendChild(anchor);

            // Act
            var sibling = anchor.NextSibling;

            // Assert
            Assert.Null(sibling);
        }
コード例 #5
0
        // Verifies that an element appended after a "br" element sees that "br" as its previous sibling.
        public void PreviousSiblingIsBr()
        {
            // Arrange: document children are [br, a]
            var document = new HtmlDocumentNode();
            var anchor = new HtmlElementNode("a");
            document.AppendChild(new HtmlElementNode("br"));
            document.AppendChild(anchor);

            // Act
            var sibling = anchor.PreviousSibling;

            // Assert
            Assert.Equal("br", sibling.Name);
        }
コード例 #6
0
        // Verifies that an element appended before a "br" element sees that "br" as its next sibling.
        public void NextSiblingIsBr()
        {
            // Arrange: document children are [a, br]
            var document = new HtmlDocumentNode();
            var anchor = new HtmlElementNode("a");
            document.AppendChild(anchor);
            document.AppendChild(new HtmlElementNode("br"));

            // Act
            var sibling = anchor.NextSibling;

            // Assert
            Assert.Equal("br", sibling.Name);
        }
コード例 #7
0
ファイル: Selector.cs プロジェクト: piaoye2019/HtmlMonkey
        /// <summary>
        /// Tests whether the specified <see cref="HtmlElementNode"/> satisfies this selector.
        /// </summary>
        /// <param name="node">The element to test.</param>
        /// <returns>
        /// <c>false</c> if <c>Tag</c> is non-blank and differs from the node's tag name
        /// (compared with <c>HtmlRules.TagStringComparison</c>), or if any attribute
        /// selector rejects the node; otherwise <c>true</c>.
        /// </returns>
        public bool IsMatch(HtmlElementNode node)
        {
            // A blank tag places no constraint on the element's tag name
            bool tagSpecified = !string.IsNullOrWhiteSpace(Tag);
            if (tagSpecified && !string.Equals(Tag, node.TagName, HtmlRules.TagStringComparison))
            {
                return false;
            }

            // Every attribute selector must accept the node
            foreach (AttributeSelector attributeSelector in Attributes)
            {
                if (attributeSelector.IsMatch(node))
                {
                    continue;
                }
                return false;
            }

            return true;
        }
コード例 #8
0
ファイル: Field.cs プロジェクト: SoftCircuits/WebScraper
 /// <summary>
 /// Finds the elements that match this field's selectors, searching from the given node.
 /// </summary>
 /// <param name="node">Root node to search.</param>
 /// <returns>
 /// The result of <c>Selectors.Find(node)</c>, or an empty sequence when
 /// <c>Selectors</c> is null.
 /// </returns>
 internal IEnumerable <HtmlElementNode> FindValue(HtmlElementNode node)
 {
     // No selectors configured: nothing can match
     if (Selectors == null)
     {
         return Enumerable.Empty <HtmlElementNode>();
     }

     return Selectors.Find(node);
 }
コード例 #9
0
ファイル: Field.cs プロジェクト: SoftCircuits/WebScraper
 /// <summary>
 /// Returns the value of the node attribute named by <c>AttributeName</c>, or an empty
 /// string when the attribute lookup or its value yields null.
 /// </summary>
 internal override string GetValueFromNode(HtmlElementNode node)
 {
     var attribute = node.Attributes[AttributeName];
     return attribute?.Value ?? string.Empty;
 }
コード例 #10
0
ファイル: ParserTests.cs プロジェクト: piaoye2019/HtmlMonkey
        /// <summary>
        /// Builds, by hand, an <see cref="HtmlDocument"/> containing an XML header node and a
        /// <c>catalog</c> element with five <c>plant</c> child elements. Whitespace between
        /// elements is represented by explicit text nodes.
        /// </summary>
        /// <returns>The constructed document.</returns>
        public static HtmlDocument BuildXmlDocument()
        {
            // Data for each <plant> element, in document order
            (string Common, string Botanical, string Zone, string Light, string Price)[] plants =
            {
                ("Bloodroot", "Sanguinaria canadensis", "4", "Mostly Shady", "$2.44"),
                ("Columbine", "Aquilegia canadensis", "3", "Mostly Shady", "$9.37"),
                ("Marsh Marigold", "Caltha palustris", "4", "Mostly Sunny", "$6.81"),
                ("Dutchman's-Breeches", "Dicentra cucullaria", "3", "Mostly Shady", "$6.44"),
                ("Ginger, Wild", "Asarum canadense", "3", "Mostly Shady", "$9.03")
            };

            // Creates an element with no attributes whose only child is a text node
            HtmlElementNode BuildField(string tag, string text)
            {
                return new HtmlElementNode(tag, null, new HtmlNodeCollection {
                    new HtmlTextNode(text)
                });
            }

            // Creates one <plant> element: each field is preceded by an indentation text node,
            // and the element ends with the indentation that precedes its closing tag
            HtmlElementNode BuildPlant((string Common, string Botanical, string Zone, string Light, string Price) plant)
            {
                (string Tag, string Text)[] fields =
                {
                    ("common", plant.Common),
                    ("botanical", plant.Botanical),
                    ("zone", plant.Zone),
                    ("light", plant.Light),
                    ("price", plant.Price)
                };

                HtmlElementNode plantNode = new("plant");
                foreach ((string Tag, string Text) field in fields)
                {
                    plantNode.Children.Add(new HtmlTextNode("\r\n    "));
                    plantNode.Children.Add(BuildField(field.Tag, field.Text));
                }
                plantNode.Children.Add(new HtmlTextNode("\r\n  "));
                return plantNode;
            }

            HtmlDocument document = new();

            // XML header
            document.RootNodes.Add(new XmlHeaderNode(new HtmlAttributeCollection
            {
                new HtmlAttribute("xml"),
                new HtmlAttribute("version", "1.0"),
                new HtmlAttribute("encoding", "UTF-8")
            }));
            document.RootNodes.Add(new HtmlTextNode("\r\n"));

            // Catalog element
            HtmlElementNode catalogNode = document.RootNodes.Add(new HtmlElementNode("catalog"));
            catalogNode.Children.Add(new HtmlTextNode("\r\n  "));

            // Plant elements
            for (int i = 0; i < plants.Length; i++)
            {
                catalogNode.Children.Add(BuildPlant(plants[i]));

                // The last plant is followed by a bare line break (no indentation)
                bool isLast = i == plants.Length - 1;
                catalogNode.Children.Add(new HtmlTextNode(isLast ? "\r\n" : "\r\n  "));
            }

            document.RootNodes.Add(new HtmlTextNode("\r\n"));

            return document;
        }
コード例 #11
0
        /// <summary>
        /// Parses the given HTML string into a number of root nodes.
        /// </summary>
        /// <param name="html">The HTML text to parse.</param>
        /// <returns>The child nodes collected under a temporary "[Root]" element.</returns>
        public IEnumerable <HtmlNode> ParseChildren(string html)
        {
            // rootNode is a temporary container; parentNode tracks the element that
            // newly parsed nodes are currently being added to.
            TextParser      parser     = new TextParser(html);
            HtmlElementNode rootNode   = new HtmlElementNode("[Root]");
            HtmlElementNode parentNode = rootNode;
            string          tag;
            bool            selfClosing;

            // Loop until end of input
            while (!parser.EndOfText)
            {
                if (parser.Peek() == HtmlRules.TagStart)
                {
                    // Test for CDATA segments, which we store but do not parse. This includes comments.
                    CDataDefinition definition = HtmlRules.CDataDefinitions.FirstOrDefault(dd => parser.MatchesCurrentPosition(dd.StartText, dd.IgnoreCase));
                    if (definition != null)
                    {
                        parentNode.Children.Add(ParseCDataNode(parser, definition));
                        continue;
                    }

                    // Closing tag
                    if (parser.Peek(1) == HtmlRules.ForwardSlash)
                    {
                        // Skip the tag-start and forward-slash characters just peeked
                        parser.MoveAhead(2);
                        tag = parser.ParseWhile(c => HtmlRules.IsTagCharacter(c));
                        if (tag.Length > 0)
                        {
                            if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                            {
                                // Should never have matched parent if the top-level node
                                Debug.Assert(!parentNode.IsTopLevelNode);
                                parentNode = parentNode.ParentNode;
                            }
                            else
                            {
                                // Handle mismatched closing tag
                                // Walk up while the closing tag's priority exceeds the current parent's,
                                // stopping at the top-level node
                                int tagPriority = HtmlRules.GetTagPriority(tag);
                                while (!parentNode.IsTopLevelNode && tagPriority > HtmlRules.GetTagPriority(parentNode.TagName))
                                {
                                    parentNode = parentNode.ParentNode;
                                }
                                // If the walk ended on the element being closed, close it too
                                if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                                {
                                    Debug.Assert(!parentNode.IsTopLevelNode);
                                    parentNode = parentNode.ParentNode;
                                }
                            }
                        }
                        // Move past the remainder of the closing tag, including the tag-end character
                        parser.MoveTo(HtmlRules.TagEnd);
                        parser.MoveAhead();
                        continue;
                    }

                    // Open tag
                    if (ParseTag(parser, out tag))
                    {
                        HtmlTagFlag flags = HtmlRules.GetTagFlags(tag);
                        if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                        {
                            parentNode.Children.Add(ParseHtmlHeader(parser));
                        }
                        else if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                        {
                            parentNode.Children.Add(ParseXmlHeader(parser));
                        }
                        else
                        {
                            // Parse attributes
                            HtmlAttributeCollection attributes = ParseAttributes(parser);

                            // Parse rest of tag
                            if (parser.Peek() == HtmlRules.ForwardSlash)
                            {
                                parser.MoveAhead();
                                parser.MovePastWhitespace();
                                selfClosing = true;
                            }
                            else
                            {
                                selfClosing = false;
                            }
                            parser.MoveTo(HtmlRules.TagEnd);
                            parser.MoveAhead();

                            // Add node
                            // Walk up until reaching a parent that may contain this tag, or the top-level node
                            HtmlElementNode node = new HtmlElementNode(tag, attributes);
                            while (!HtmlRules.TagMayContain(parentNode.TagName, tag) && !parentNode.IsTopLevelNode)
                            {
                                Debug.Assert(parentNode.ParentNode != null);
                                parentNode = parentNode.ParentNode;
                            }
                            parentNode.Children.Add(node);

                            if (flags.HasFlag(HtmlTagFlag.CData))
                            {
                                // CDATA tags are treated as elements but we store and do not parse the inner content
                                if (!selfClosing)
                                {
                                    if (ParseToClosingTag(parser, tag, out string content) && content.Length > 0)
                                    {
                                        node.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                                    }
                                }
                            }
                            else
                            {
                                // Tags flagged NoSelfClosing ignore a self-closing slash
                                if (selfClosing && flags.HasFlag(HtmlTagFlag.NoSelfClosing))
                                {
                                    selfClosing = false;
                                }
                                if (!selfClosing && !flags.HasFlag(HtmlTagFlag.NoChildren))
                                {
                                    parentNode = node;  // Node becomes new parent
                                }
                            }
                        }
                        continue;
                    }
                }

                // Text node
                int start = parser.Index;
                // Text must be at least 1 character (handle '<' that is not part of a tag)
                parser.MoveAhead();
                parser.MoveTo(HtmlRules.TagStart);
                Debug.Assert(parser.Index > start);
                parentNode.Children.Add(new HtmlTextNode(parser.Extract(start, parser.Index)));
            }

            // Return the nodes collected under the temporary root element
            return(rootNode.Children);
        }
コード例 #12
0
 // Returns the node's tag name wrapped in angle brackets.
 private static string ShortDescriptionElement(HtmlElementNode node)
 {
     return "<" + node.TagName + ">";
 }
コード例 #13
0
 // Always returns an empty string; the node argument is not used.
 private static string LongDescriptionElement(HtmlElementNode node)
 {
     return string.Empty;
 }
コード例 #14
0
 /// <summary>
 /// Creates a collection and stores the given element in <c>ParentNode</c>.
 /// </summary>
 /// <param name="parentNode">The element assigned to <c>ParentNode</c>.</param>
 public HtmlNodeCollection(HtmlElementNode parentNode) => ParentNode = parentNode;
コード例 #15
0
ファイル: HtmlParser.cs プロジェクト: piaoye2019/HtmlMonkey
        /// <summary>
        /// Parses the given HTML string into a collection of root nodes and their
        /// children.
        /// </summary>
        /// <param name="html">The HTML text to parse.</param>
        /// <param name="ignoreHtmlRules">
        /// When true, every open tag is processed with <c>HtmlTagFlag.None</c> instead of
        /// the flags returned by <c>HtmlRules.GetTagFlags</c>.
        /// </param>
        /// <returns>The child nodes collected under a temporary "[TempContainer]" element.</returns>
        public IEnumerable <HtmlNode> ParseChildren(string?html, bool ignoreHtmlRules = false)
        {
            // rootNode is a temporary container; parentNode tracks the element that
            // newly parsed nodes are currently being added to.
            HtmlElementNode rootNode   = new("[TempContainer]");
            HtmlElementNode parentNode = rootNode;

            Parser.Reset(html);
            bool   selfClosing;
            string?tag;

            // Loop until end of input
            while (!Parser.EndOfText)
            {
                if (Parser.Peek() == HtmlRules.TagStart)
                {
                    // CDATA segments (blocks we store but don't parse--includes comments)
                    CDataDefinition?definition = HtmlRules.CDataDefinitions.FirstOrDefault(dd => Parser.MatchesCurrentPosition(dd.StartText, dd.StartComparison));
                    if (definition != null)
                    {
                        parentNode.Children.Add(ParseCDataNode(definition));
                        continue;
                    }

                    // Closing tag
                    if (Parser.Peek(1) == HtmlRules.ForwardSlash)
                    {
                        // Skip the tag-start and forward-slash characters just peeked
                        Parser.Index += 2;
                        tag           = Parser.ParseWhile(HtmlRules.IsTagCharacter);
                        if (tag.Length > 0)
                        {
                            if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                            {
                                // Should never have matched parent if the top-level node
                                if (!parentNode.IsTopLevelNode)
                                {
                                    parentNode = parentNode.ParentNode;
                                }
                            }
                            else
                            {
                                // Handle mismatched closing tag
                                // Walk up while the closing tag's nest level exceeds the current parent's,
                                // stopping at the top-level node
                                int tagPriority = HtmlRules.GetTagNestLevel(tag);

                                while (!parentNode.IsTopLevelNode && tagPriority > HtmlRules.GetTagNestLevel(parentNode.TagName))
                                {
                                    parentNode = parentNode.ParentNode;
                                }

                                // If the walk ended on the element being closed, close it too
                                if (parentNode.TagName.Equals(tag, HtmlRules.TagStringComparison))
                                {
                                    if (!parentNode.IsTopLevelNode)
                                    {
                                        parentNode = parentNode.ParentNode;
                                    }
                                }
                            }
                        }
                        // Move past the remainder of the closing tag, including the tag-end character
                        Parser.SkipTo(HtmlRules.TagEnd);
                        Parser.Next();
                        continue;
                    }

                    // Open tag
                    if (ParseTag(out tag))
                    {
                        HtmlTagFlag flags = ignoreHtmlRules ? HtmlTagFlag.None : HtmlRules.GetTagFlags(tag);
                        if (flags.HasFlag(HtmlTagFlag.HtmlHeader))
                        {
                            parentNode.Children.Add(ParseHtmlHeader());
                        }
                        else if (flags.HasFlag(HtmlTagFlag.XmlHeader))
                        {
                            parentNode.Children.Add(ParseXmlHeader());
                        }
                        else
                        {
                            // Parse attributes
                            HtmlAttributeCollection attributes = ParseAttributes();

                            // Parse rest of tag
                            if (Parser.Peek() == HtmlRules.ForwardSlash)
                            {
                                Parser.Next();
                                Parser.SkipWhiteSpace();
                                selfClosing = true;
                            }
                            else
                            {
                                selfClosing = false;
                            }
                            Parser.SkipTo(HtmlRules.TagEnd);
                            Parser.Next();

                            // Add node
                            // Walk up until reaching a parent that may contain this tag, or the top-level node
                            HtmlElementNode node = new(tag, attributes);
                            while (!HtmlRules.TagMayContain(parentNode.TagName, tag) && !parentNode.IsTopLevelNode)
                            {
                                parentNode = parentNode.ParentNode;
                            }
                            parentNode.Children.Add(node);

                            if (flags.HasFlag(HtmlTagFlag.CData))
                            {
                                // CDATA tags are treated as elements but we store and do not parse the inner content
                                if (!selfClosing)
                                {
                                    if (ParseToClosingTag(tag, out string?content) && content.Length > 0)
                                    {
                                        node.Children.Add(new HtmlCDataNode(string.Empty, string.Empty, content));
                                    }
                                }
                            }
                            else
                            {
                                // Tags flagged NoSelfClosing ignore a self-closing slash
                                if (selfClosing && flags.HasFlag(HtmlTagFlag.NoSelfClosing))
                                {
                                    selfClosing = false;
                                }
                                if (!selfClosing && !flags.HasFlag(HtmlTagFlag.NoChildren))
                                {
                                    parentNode = node;  // Node becomes new parent
                                }
                            }
                        }
                        continue;
                    }
                }

                // Text node: must be at least 1 character (handles '<' that was not a tag)
                string text = Parser.ParseCharacter();
                text += Parser.ParseTo(HtmlRules.TagStart);
                parentNode.Children.Add(new HtmlTextNode(text));
            }

            // Return top-level nodes from nodes just parsed
            return(rootNode.Children);
        }
コード例 #16
0
ファイル: Field.cs プロジェクト: SoftCircuits/WebScraper
 /// <summary>
 /// Extracts this field value from the given node.
 /// </summary>
 /// <param name="node">The element node to extract the value from.</param>
 /// <returns>The extracted value as a string.</returns>
 internal abstract string GetValueFromNode(HtmlElementNode node);
コード例 #17
0
ファイル: Field.cs プロジェクト: SoftCircuits/WebScraper
 /// <summary>
 /// Returns the <c>Text</c> property of the given node as this field's value.
 /// </summary>
 internal override string GetValueFromNode(HtmlElementNode node)
 {
     return node.Text;
 }
コード例 #18
0
ファイル: ParserTests.cs プロジェクト: piaoye2019/HtmlMonkey
        /// <summary>
        /// Builds, by hand, an <see cref="HtmlDocument"/> consisting of an HTML header node
        /// followed by an <c>html</c> element that contains a <c>head</c> (title and meta
        /// description) and a <c>body</c> (one comment and two paragraphs). Whitespace between
        /// elements is represented by explicit text nodes.
        /// </summary>
        /// <returns>The constructed document.</returns>
        public static HtmlDocument BuildHtmlDocument()
        {
            // Appends an indentation text node followed by a <p id="..."> element to the
            // given parent; the paragraph holds its text surrounded by whitespace nodes.
            void AppendParagraph(HtmlElementNode parent, string id, string text)
            {
                parent.Children.Add(new HtmlTextNode("\r\n    "));
                HtmlElementNode paragraph = parent.Children.Add(new HtmlElementNode("p", new HtmlAttributeCollection
                {
                    new HtmlAttribute("id", id)
                }));
                paragraph.Children.Add(new HtmlTextNode("\r\n      "));
                paragraph.Children.Add(new HtmlTextNode(text));
                paragraph.Children.Add(new HtmlTextNode("\r\n    "));
            }

            HtmlDocument result = new();

            // Header node, then a line break
            result.RootNodes.Add(new HtmlHeaderNode(new HtmlAttributeCollection {
                new HtmlAttribute("html")
            }));
            result.RootNodes.Add(new HtmlTextNode("\r\n"));

            // <html>
            HtmlElementNode html = result.RootNodes.Add(new HtmlElementNode("html"));
            html.Children.Add(new HtmlTextNode("\r\n  "));

            // <head>
            HtmlElementNode head = html.Children.Add(new HtmlElementNode("head"));
            head.Children.Add(new HtmlTextNode("\r\n    "));

            // <title>
            HtmlElementNode title = head.Children.Add(new HtmlElementNode("title"));
            title.Children.Add(new HtmlTextNode("Title"));

            // <meta>
            head.Children.Add(new HtmlTextNode("\r\n    "));
            head.Children.Add(new HtmlElementNode("meta", new HtmlAttributeCollection
            {
                new HtmlAttribute("name", "description"),
                new HtmlAttribute("content", "This is my test meta description node!")
            }));
            head.Children.Add(new HtmlTextNode("\r\n  "));

            // <body>
            html.Children.Add(new HtmlTextNode("\r\n  "));
            HtmlElementNode body = html.Children.Add(new HtmlElementNode("body"));

            // Comment
            body.Children.Add(new HtmlTextNode("\r\n    "));
            body.Children.Add(new HtmlCDataNode("<!--", "-->", " Here's a comment! "));

            // Paragraphs
            AppendParagraph(body, "par1", "This is my first paragraph");
            AppendParagraph(body, "par2", "This is my second paragraph");

            // Trailing whitespace before the closing body and html tags
            body.Children.Add(new HtmlTextNode("\r\n  "));
            html.Children.Add(new HtmlTextNode("\r\n"));

            return result;
        }