XHtmlLoader C# (CSharp) Code-Beispiele

Beispiel #1

0

Datei anzeigen

        /// <summary>
        /// Loads anchors whose <c>class</c> attribute is written with stray slashes, colons
        /// and irregular spacing, prints the parsed document, and runs the shared
        /// <c>LinqCompare</c> check on the same input.
        /// </summary>
        public void AttributeParsing()
        {
            string markup = @"            
            <html>
	            <body>
		            <a id=01 /class/red >class red</a>
		            <a id=02 /class/= /red >class = red</a>
		            <a id=03 class// / = /red >class = red</a>
		            <a id=04    class / = /red >class = red</a>
		            <a id=05 class   = /red >class='/red'</a>
		            <a id=06 class= /red >class='/red'</a>
		            <a id=07 clas:s= red >clas:s='red'</a>
		            <a id=08 class:= red >class:='red'</a>
		            <a id=09 class= 'red'/ >class='red'</a>
		            <a id=10 class=red/ >class='red/'</a>
		            <a id=11 class=red/>class='red/'</a>
	            </body>
            </html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);

            // Dump the parse result so the attribute handling can be inspected by eye
            string formatted = ToFormattedString(parsed);
            Console.WriteLine(formatted);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #2

0

Datei anzeigen

        /// <summary>
        /// Verifies that img, br and hr come out of the loader without children, while a
        /// <c>&lt;p/&gt;</c> written in self-closing form still receives the text that follows it.
        /// </summary>
        public void SelfClosingTags()
        {
            string markup = @"           
            <html><body>
            <h1>Hello World</h1>
            Some text <br> Some more text <img src='foobar.jpg'> more text <hr><a>foo</a>
            <p/> non self-closing </p>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // img: keeps its src attribute and has no child nodes
            XmlNode img = parsed.SelectSingleNode("//img");
            Assert.AreEqual("foobar.jpg", img.Attributes["src"].Value);
            Assert.IsNull(img.FirstChild);

            // br: no attributes, no child nodes
            XmlNode lineBreak = parsed.SelectSingleNode("//br");
            Assert.AreEqual(0, lineBreak.Attributes.Count);
            Assert.IsNull(lineBreak.FirstChild);

            // hr: no attributes, no child nodes
            XmlNode rule = parsed.SelectSingleNode("//hr");
            Assert.AreEqual(0, rule.Attributes.Count);
            Assert.IsNull(rule.FirstChild);

            // p: is not a self-closing element, so the trailing text becomes its child
            XmlNode paragraph = parsed.SelectSingleNode("//p");
            Assert.AreEqual(" non self-closing ", paragraph.FirstChild.InnerText);
        }

Beispiel #3

0

Datei anzeigen

        /// <summary>
        /// Parses six anchors with relative, rooted, protocol-relative and absolute hrefs
        /// against a base URL and checks the href value the loader produces for each.
        /// </summary>
        public void FullyQualifyUrls()
        {
            string markup = @"            
            <html><body>
            <a id='1' href='//foobar.com'>hello</a>
            <a id='2' href='/helloworld.html'>hello</a>
            <a id='3' href='helloworld.html'>hello</a>
            <a id='4' href='http://blah.com/helloworld.html'>hello</a>
            <a id='5' href='../helloworld.html'>hello</a>
            <a id='6' href='/wiki/Wikipedia:Introduction'>hello</a>

            </body></html>";

            ParserOptions parserOptions = new ParserOptions();
            parserOptions.BaseUrl = "http://www.foobar.com/products/cat1/someprod.html";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup, parserOptions);
            Console.WriteLine(parsed.OuterXml);

            // Expected href for anchor id 1..6, in that order
            string[] expectedHrefs =
            {
                "http://foobar.com/",
                "http://www.foobar.com/helloworld.html",
                "http://www.foobar.com/products/cat1/helloworld.html",
                "http://blah.com/helloworld.html",
                "http://www.foobar.com/products/helloworld.html",
                "http://www.foobar.com/wiki/Wikipedia:Introduction"
            };

            // Ensure the urls are fully qualified...
            for (int i = 0; i < expectedHrefs.Length; i++)
            {
                string hrefPath = "//a[@id='" + (i + 1) + "']/@href";
                Assert.AreEqual(expectedHrefs[i], parsed.SelectSingleNode(hrefPath).Value);
            }

            // Linq check
            LinqCompare(parsed, markup, parserOptions);
        }

Beispiel #4

0

Datei anzeigen

        /// <summary>
        /// Round-trips the multi-byte sample document through a file written in
        /// <paramref name="encoding"/> and checks that the re-loaded body matches the original.
        /// </summary>
        /// <param name="fileName">Name of the file to create inside the SampleData directory.</param>
        /// <param name="encoding">Encoding used when writing the file.</param>
        /// <param name="defaultEncoding">Value assigned to <c>LoaderOptions.DefaultEncoding</c> for the re-load.</param>
        public void TestReadingEncodedFile(string fileName, System.Text.Encoding encoding, System.Text.Encoding defaultEncoding)
        {
            // Set some options, so that we can know if things are working...
            LoaderOptions loaderOptions = new LoaderOptions();

            loaderOptions.DetectEncoding  = true;
            loaderOptions.DefaultEncoding = defaultEncoding;
            loaderOptions.ParserOptions.IncludeMetaData = true;

            // Load multi-byte html into memory
            XmlDocument doc = XHtmlLoader.LoadHtml(_sampleMultiByteHtml);

            // SampleData lives three levels above the assembly directory. Path.Combine picks
            // the platform's separator instead of a hard-coded backslash.
            string projectRoot = (new DirectoryInfo(AssemblyDirectory)).Parent.Parent.Parent.FullName;
            string sampleDir   = Path.Combine(projectRoot, "SampleData");

            // CreateDirectory is a no-op when the directory already exists
            Directory.CreateDirectory(sampleDir);

            // Create the encoded file; disposing the writer also closes the underlying stream
            string fullName = Path.Combine(sampleDir, fileName);

            using (TextWriter sw = new StreamWriter(File.Create(fullName), encoding))
            {
                doc.Save(sw);
            }

            // Re-load into memory (blocking on the task, since this method is synchronous)
            XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("file://" + fullName, loaderOptions).Result;

            Console.WriteLine("Reading file: " + fileName);
            Console.WriteLine(doc2.OuterXml);
            Assert.AreEqual(doc.SelectSingleNode("//body").OuterXml, doc2.SelectSingleNode("//body").OuterXml);
        }

Beispiel #5

0

Datei anzeigen

Datei: XHtmlQueryEngine.Tests.cs Projekt: jrsell/XHtmlKit

        /// <summary>
        /// Selects the class attribute of the body's div through <c>QueryEngine</c> and
        /// expects the ampersand in the returned row to be XML-escaped.
        /// </summary>
        public void XML_Characters_InResults()
        {
            // Print the parsed fixture for reference
            XmlDocument parsed = XHtmlLoader.LoadHtml(_testHTML);
            Console.WriteLine(ToFormattedString(parsed));

            string rowQuery = "<row xpath='//body/div/@class'></row>";
            string actual   = QueryEngine.SelectOnHtml(_testHTML, rowQuery).InnerXml;

            Assert.AreEqual("<row>This &amp; is a test</row>", actual);
        }

Beispiel #6

0

Datei anzeigen

    /// <summary>
    /// Demo entry point: parses an HTML string, then downloads a web page, and prints
    /// the serialized document and the page title respectively.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Load Html string into an XmlDocument
        XmlDocument doc1 = XHtmlLoader.LoadHtml("<html><head><title>Hello World!</title><body><h1>Hello World</h1><p>This is a test</body>");

        Console.WriteLine("OuterXml is: " + doc1.OuterXml);

        // Load web page into an XmlDocument (blocks until the download completes)
        XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("http://wikipedia.org").Result;

        // A page is not guaranteed to carry a <title>; SelectSingleNode returns null then,
        // so fall back to an empty title instead of throwing NullReferenceException.
        XmlNode titleNode = doc2.SelectSingleNode("//title");
        string  title     = titleNode != null ? titleNode.InnerText : string.Empty;

        Console.WriteLine("Title is: " + title);
    }

Beispiel #7

0

Datei anzeigen

        /// <summary>
        /// Loads a single self-closed element carrying three unquoted attributes as a
        /// fragment and checks that each attribute value is read correctly.
        /// </summary>
        public void AttributeParsingMultiple()
        {
            XmlDocument target   = new XmlDocument();
            string      fragment = @"<settings id=01 class=red foo=bar />";

            XHtmlLoader.LoadHtmlFragment(target, fragment);
            Console.WriteLine(ToFormattedString(target));

            // All three unquoted attributes must be present with their literal values
            XmlAttributeCollection attrs = target.SelectSingleNode("//settings").Attributes;

            Assert.AreEqual("01", attrs["id"].Value);
            Assert.AreEqual("red", attrs["class"].Value);
            Assert.AreEqual("bar", attrs["foo"].Value);
        }

Beispiel #8

0

Datei anzeigen

        /// <summary>
        /// Smoke test: parses a minimal page, prints its serialized form, and runs the
        /// shared <c>LinqCompare</c> check on the same input.
        /// </summary>
        public void HelloWorldBasicTest()
        {
            string markup = @"
            <html><body>
            <h1>Hello World</h1>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #9

0

Datei anzeigen

        /// <summary>
        /// Verifies that a <c>&lt;head&gt;</c> element appearing inside the body does not
        /// produce a <c>head</c> child directly under <c>body</c>.
        /// </summary>
        public void HeadTagInsideBodyTag()
        {
            string markup = @"
            <html><body>
            <h1>Hello World</h1><p> para <head>somehead</head> end para </p>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // Ensure the <head> inside the body tag is ignored
            XmlNode strayHead = parsed.SelectSingleNode("//body/head");
            Assert.IsNull(strayHead);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #10

0

Datei anzeigen

Datei: Program.cs Projekt: jrsell/XHtmlKit

        /// <summary>
        /// Parses <paramref name="html"/> with XHtmlKit and, when an XPath is supplied,
        /// returns the inner text of every node it selects under the document element.
        /// </summary>
        /// <param name="html">Markup to parse; it is parsed even when no XPath is given.</param>
        /// <param name="xpath">Optional XPath evaluated against the document element.</param>
        /// <returns>Inner text of each matched node, or an empty array when <paramref name="xpath"/> is null.</returns>
        static string[] XHtmlKit_ParseAndSearch(string html, string xpath = null)
        {
            XmlDocument   parsed = XHtmlLoader.LoadHtml(html);
            List <string> hits   = new List <string>();

            // Parse-only mode: nothing to search for
            if (xpath == null)
            {
                return hits.ToArray();
            }

            XmlNodeList matches = parsed.DocumentElement.SelectNodes(xpath);
            foreach (XmlNode match in matches)
            {
                hits.Add(match.InnerText);
            }

            return hits.ToArray();
        }

Beispiel #11

0

Datei anzeigen

        /// <summary>
        /// Sample scraper: downloads one page of the CodeProject latest-articles listing
        /// and maps every matched table row to an <see cref="Article"/>.
        /// </summary>
        /// <param name="pageNum">Value appended to the <c>pgnum</c> query parameter.</param>
        /// <returns>
        /// One <see cref="Article"/> per matched row. A field whose node is not found is
        /// set to an empty string; tags are joined with commas.
        /// </returns>
        public static async Task <Article[]> GetCodeProjectArticlesAsync(int pageNum = 1)
        {
            // Fetch the listing page as an XHtml document using XHtmlKit
            string      listingUrl = "https://www.codeproject.com/script/Articles/Latest.aspx?pgnum=" + pageNum;
            XmlDocument listing    = await XHtmlLoader.LoadWebPageAsync(listingUrl);

            List <Article> collected = new List <Article>();

            // Each article is a table row; the table is located by a fragment of its @class
            XmlNodeList rows = listing.SelectNodes("//table[contains(@class,'article-list')]/tr[@valign]");

            foreach (XmlNode row in rows)
            {
                // Any of these lookups may come back null when the row lacks that field
                XmlNode categoryNode = row.SelectSingleNode("./td[1]//a/text()");
                XmlNode titleNode    = row.SelectSingleNode(".//div[@class='title']/a/text()");
                XmlNode dateNode     = row.SelectSingleNode(".//div[contains(@class,'modified')]/text()");
                XmlNode ratingNode   = row.SelectSingleNode(".//div[contains(@class,'rating-stars')]/@title");
                XmlNode descNode     = row.SelectSingleNode(".//div[@class='description']/text()");
                XmlNode authorNode   = row.SelectSingleNode(".//div[contains(@class,'author')]/text()");

                // Comma-separate the tag texts; a comma is only written once something precedes it
                StringBuilder tagList = new StringBuilder();
                foreach (XmlNode tagNode in row.SelectNodes(".//div[@class='t']/a/text()"))
                {
                    if (tagList.Length > 0)
                    {
                        tagList.Append(",");
                    }
                    tagList.Append(tagNode.Value);
                }

                // Missing nodes map to empty strings
                collected.Add(new Article
                {
                    Category    = categoryNode == null ? string.Empty : categoryNode.Value,
                    Title       = titleNode == null ? string.Empty : titleNode.Value,
                    Author      = authorNode == null ? string.Empty : authorNode.Value,
                    Description = descNode == null ? string.Empty : descNode.Value,
                    Rating      = ratingNode == null ? string.Empty : ratingNode.Value,
                    Date        = dateNode == null ? string.Empty : dateNode.Value,
                    Tags        = tagList.ToString()
                });
            }

            return collected.ToArray();
        }

Beispiel #12

0

Datei anzeigen

        /// <summary>
        /// Verifies that mis-nested formatting tags (<c>&lt;b&gt;&lt;i&gt;…&lt;/b&gt;&lt;/i&gt;</c>)
        /// are emitted as properly nested elements.
        /// </summary>
        public void FormattingTags()
        {
            string markup = @"            
            <html><body>
            <h1>Hello World</h1>
            Some text <b><i>italics</b></i>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The <b> tag should contain the <i> tag
            XmlNode bold = parsed.SelectSingleNode("//b");
            Assert.AreEqual("<b><i>italics</i></b>", bold.OuterXml);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #13

0

Datei anzeigen

        /// <summary>
        /// Loads markup containing html and body tags as a fragment into an empty
        /// document and checks that the div becomes the document element.
        /// </summary>
        public void LoadHtmlFragment()
        {
            XmlDocument target   = new XmlDocument();
            string      fragment = @"
                <html>
                <div>
                    <h1>Hello World</h1>
                    <body foo='bar'>
                </div></html>";

            XHtmlLoader.LoadHtmlFragment(target, fragment);
            Console.WriteLine(target.OuterXml);

            // Ensure we are not inserting html, head or body nodes...
            string rootName = target.DocumentElement.Name;
            Assert.AreEqual("div", rootName);
        }

Beispiel #14

0

Datei anzeigen

        /// <summary>
        /// Parses an anchor whose tag name runs into its href (no space after <c>a</c>) and
        /// checks the element name the loader produces for it.
        /// </summary>
        public void TagWithInvalidXmlChar()
        {
            string markup = @"
            <html>
                <body>
                    <ahref='http://yyk.familydoctor.com.cn/1389/' target='_blank'>南方医科大学南方医院</a>
                </body>
            </html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(ToFormattedString(parsed));

            // Ensure the characters that are not valid in an XML name come out in _xHHHH_ escaped form
            XmlNode firstInBody = parsed.SelectSingleNode("//body").FirstChild;
            Assert.AreEqual("ahref_x003D__x0027_http_x003A_", firstInBody.Name);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #15

0

Datei anzeigen

        /// <summary>
        /// Parses markup that has no opening html or head tag and checks that the result
        /// still has an html root whose first child is a head containing the title.
        /// </summary>
        public void NoOuterHtmlTag()
        {
            string markup = @"
            <title>a title</title> <body>
            <h1>Hello World</h1>
            Some_&nbsp;_text > hello &gt; &copy; &#169; <b><i>italics</b></i>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(ToFormattedString(parsed));

            // Ensure the document gets constructed properly: html > head > title
            XmlElement root = parsed.DocumentElement;
            Assert.IsTrue(root.Name == "html");
            Assert.IsTrue(root.FirstChild.Name == "head");
            Assert.IsTrue(root.FirstChild.FirstChild.Name == "title");

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #16

0

Datei anzeigen

        /// <summary>
        /// Verifies that a title opened in lower case and closed with an upper-case
        /// <c>&lt;/TITLE&gt;</c> yields exactly the title text.
        /// </summary>
        public void TitleRCDataParsingCaps()
        {
            string markup = @"            
            <html><head>
                <title>This is a Title!</TITLE>
                </head>
            <body>
            <h1>Hello World</h1>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // Ensure the </TITLE> match is case insensitive...
            XmlNode titleText = parsed.SelectSingleNode("//title/text()");
            Assert.AreEqual("This is a Title!", titleText.Value);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #17

0

Datei anzeigen

        /// <summary>
        /// Verifies how a second, nested html tag contributes attributes to the root html
        /// element: existing ones keep their value, new ones are added.
        /// </summary>
        public void HtmlTagWithAttributes()
        {
            string markup = @"
            <html lang='en'><body>
            <h1>Hello World</h1><html lang='fr' style='green'>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(ToFormattedString(parsed));

            // The nested <html> tag only acts as a source of attributes for the main <html>
            // tag: 'lang' is already set and must keep 'en', while 'style' is new and gets added.
            XmlElement root = parsed.DocumentElement;
            Assert.IsTrue(root.Name == "html");
            Assert.AreEqual("en", root.Attributes["lang"].Value);
            Assert.AreEqual("green", root.Attributes["style"].Value);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #18

0

Datei anzeigen

        /// <summary>
        /// Verifies that an attribute on the body tag is preserved when the head contains
        /// a script element.
        /// </summary>
        public void BodyTagwithAttributes()
        {
            string markup = @"
            <html>
                <head> 
                    <script src='/js/mobileredirect.js'></script>
                </head>
                <body class='foo'>
                    <h1>Hello World</h1>
                </body>
            </html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The class attribute written on <body> must be present on the parsed body node
            XmlNode body = parsed.SelectSingleNode("//body");
            Assert.AreEqual("foo", body.Attributes["class"].Value);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #19

0

Datei anzeigen

        /// <summary>
        /// Verifies that a rooted href is left untouched when a base URL is supplied but
        /// <c>FullyQualifyUrls</c> is switched off.
        /// </summary>
        public void FullyQualifyUrls2()
        {
            string markup = @"
            <html>
            <body>
                <a href='/bakery/bread.html'>Bread</a>
            </body>
            </html>";

            ParserOptions parserOptions = new ParserOptions();
            parserOptions.BaseUrl          = "http://foobar.com";
            parserOptions.FullyQualifyUrls = false;

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup, parserOptions);
            Console.WriteLine(parsed.OuterXml);

            // The href must come back exactly as written, despite the base url being set
            XmlNode href = parsed.SelectSingleNode("//a/@href");
            Assert.AreEqual("/bakery/bread.html", href.Value, "Ensure we don't fully qualify. We set flag to false");

            // Linq check
            LinqCompare(parsed, markup, parserOptions);
        }

Beispiel #20

0

Datei anzeigen

        /// <summary>
        /// Verifies that content written between <c>&lt;meta&gt;</c> and <c>&lt;/meta&gt;</c>
        /// in the head is not nested under meta, and that the anchor ends up in the body.
        /// </summary>
        public void HeadTagWithNestedTags()
        {
            string markup = @"
            <html>
                <head>
                    <meta> <a> foobar </a> blah </meta>
                </head>
                <body>
            <h1>Hello World</h1>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // meta is a self-closing tag, so nothing may be nested under it
            XmlNode anchorUnderMeta = parsed.SelectSingleNode("//head/meta/a");
            Assert.IsNull(anchorUnderMeta);

            // The <a> tag should go under the body
            XmlNode anchorUnderBody = parsed.SelectSingleNode("//body/a");
            Assert.IsNotNull(anchorUnderBody);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #21

0

Datei anzeigen

        /// <summary>
        /// Verifies that markup-looking text inside a script element is kept as the
        /// script's text rather than being parsed as a tag.
        /// </summary>
        public void ScriptRCDataParsing()
        {
            string markup = @"            
            <html><body>
            <h1>Hello World</h1><script>
                ga('create', 'UA-40765809-1', {
                  'allowLinker': true,
                  'cookiePath': '/finance'
                });
                ga('send', 'pageview<table>');
            </script>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // Ensure the <table> inside the script is treated as RCData...
            string scriptText = parsed.SelectSingleNode("//script/text()").Value;
            Assert.IsTrue(scriptText.Contains("<table>"));

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #22

0

Datei anzeigen

        /// <summary>
        /// Verifies the placement of content outside the html element: a leading comment
        /// stays first in the document, and text after <c>&lt;/html&gt;</c> lands at the end of the body.
        /// </summary>
        public void BeforeAndAfterFragments()
        {
            string markup = @"
            <!----- some comment ----->
            <html>
            <body>
                <h1>Hello World</h1>
                <p>Some_&nbsp;_text > hello &gt; &copy; &#169; <b><i>italics</b></i> 
                qrs
                </p>
            </body>
            </html>some after text";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // Ensure the comment shows up at the beginning...
            Assert.IsTrue(parsed.FirstChild.NodeType == XmlNodeType.Comment);

            // ...and the trailing text at the end of the body
            XmlNode lastInBody = parsed.SelectSingleNode("//body").LastChild;
            Assert.AreEqual("some after text", lastInBody.InnerText);

            // Linq check
            LinqCompare(parsed, markup);
        }

Beispiel #23

0

Datei anzeigen

        /// <summary>
        /// Loads a fragment into an existing <c>foo</c> element and checks that the element
        /// keeps its name and ends up with exactly three child nodes.
        /// </summary>
        public void LoadHtmlFragmentInSubElem()
        {
            string fragment = @"
                <div>
                    <h1>Hello World</h1>
                    <body foo='bar'>
                </div>
                <p>hello</p>
                </foo>abc";

            // Host document: a single <foo> root that receives the fragment
            XmlDocument host = new XmlDocument();
            XmlElement  parent = host.CreateElement("foo");
            host.AppendChild(parent);

            XHtmlLoader.LoadHtmlFragment(parent, fragment);
            Console.WriteLine(ToFormattedString(host));

            // Ensure we are not inserting html, head or body nodes...
            // And ensure we can load the fragment
            // And ensure that the </foo> tag does not match our root node...
            Assert.AreEqual("foo", parent.Name);
            Assert.AreEqual(3, parent.ChildNodes.Count);
        }

Beispiel #24

0

Datei anzeigen

 /// <summary>
 /// Extension shortcut: parses <paramref name="html"/> into <paramref name="doc"/>
 /// using a newly constructed <c>ParserOptions</c>.
 /// </summary>
 /// <param name="doc">Document that receives the parsed content.</param>
 /// <param name="html">Markup to parse.</param>
 public static void LoadHtml(this XmlDocument doc, string html)
 {
     // StringReader is IDisposable; scope it to the parse call so it is always released
     using (StringReader reader = new StringReader(html))
     {
         XHtmlLoader.LoadHtml(doc, reader, new ParserOptions());
     }
 }

Beispiel #25

0

Datei anzeigen

 /// <summary>
 /// Extension shortcut: loads the page at <paramref name="url"/> into
 /// <paramref name="doc"/> using a newly constructed <c>LoaderOptions</c>.
 /// </summary>
 /// <param name="doc">Document that receives the loaded page.</param>
 /// <param name="url">Address of the page to load.</param>
 /// <returns>A task that completes when the underlying load has finished.</returns>
 public static async Task LoadWebPageAsync(this XmlDocument doc, string url)
 {
     LoaderOptions defaults = new LoaderOptions();

     await XHtmlLoader.LoadWebPageAsync(doc, url, defaults);
 }

Beispiel #26

0

Datei anzeigen

 /// <summary>
 /// Extension shortcut: parses <paramref name="html"/> as a fragment into
 /// <paramref name="node"/> using a newly constructed <c>ParserOptions</c>.
 /// </summary>
 /// <param name="node">Node that receives the parsed fragment.</param>
 /// <param name="html">Fragment markup to parse.</param>
 public static void LoadHtmlFragment(this XmlNode node, string html)
 {
     // StringReader is IDisposable; scope it to the parse call so it is always released
     using (StringReader reader = new StringReader(html))
     {
         XHtmlLoader.LoadHtmlFragment(node, reader, new ParserOptions());
     }
 }

Beispiel #27

0

Datei anzeigen

Datei: Program.cs Projekt: jrsell/XHtmlKit

        /// <summary>
        /// Crawls a site starting from the configured root URL, processing pages in
        /// first-in-first-out order and following anchors up to the configured depth.
        /// Each page is loaded with XHtmlKit and, when an output directory is set, saved
        /// as an .xml file under a per-host sub-directory.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments written as HTML attributes (url=, depth=, urlfilter=,
        /// outputdir=, encoding=, includemetadata). They are lower-cased before parsing.
        /// </param>
        static void Crawl(string[] args)
        {
            // Get crawler settings from the command-line
            Settings crawlerSettings = new Settings();

            // Get command-line settings - use XHtmlKit parser. Why not.
            if (args.Length > 0)
            {
                string      settingsHtml = "<settings " + string.Join(" ", args) + " />";
                XmlDocument settingsDoc  = new XmlDocument();

                // Invariant lower-casing keeps the attribute names stable regardless of the
                // machine's culture (e.g. Turkish dotless i).
                XHtmlLoader.LoadHtmlFragment(settingsDoc, settingsHtml.ToLowerInvariant());
                XmlElement settings = settingsDoc.DocumentElement;

                crawlerSettings.Url   = ReadCrawlSetting(settings, "url", crawlerSettings.Url);
                crawlerSettings.Depth = settings.Attributes["depth"] != null ? Convert.ToInt32(settings.Attributes["depth"].Value) : crawlerSettings.Depth;

                crawlerSettings.UrlFilter       = ReadCrawlSetting(settings, "urlfilter", crawlerSettings.UrlFilter);
                crawlerSettings.OutputDir       = ReadCrawlSetting(settings, "outputdir", crawlerSettings.OutputDir);
                crawlerSettings.Encoding        = ReadCrawlSetting(settings, "encoding", crawlerSettings.Encoding);
                crawlerSettings.IncludeMetaData = (settings.Attributes["includemetadata"] != null);
            }

            // Gather HtmlParser and HtmlClient options
            LoaderOptions xhtmlLoaderOptions = new LoaderOptions();

            xhtmlLoaderOptions.ParserOptions.IncludeMetaData = crawlerSettings.IncludeMetaData;
            xhtmlLoaderOptions.DetectEncoding  = (crawlerSettings.Encoding == null);
            xhtmlLoaderOptions.DefaultEncoding = crawlerSettings.Encoding != null ? System.Text.Encoding.GetEncoding(crawlerSettings.Encoding) : xhtmlLoaderOptions.DefaultEncoding;

            // 'todo' queue plus the set of every URL that has been queued so far.
            // Marking a URL when it is enqueued (not when it is dequeued) prevents the same
            // link, found on several pages, from being queued and crawled more than once.
            Queue <Link>     urlsToCrawl = new Queue <Link>();
            HashSet <string> urlsSeen    = new HashSet <string>();

            // Constructing the Uri is kept as an up-front check: it fails fast when the
            // root URL is missing or is not a valid absolute URI.
            Uri baseUri = new Uri(crawlerSettings.Url.ToLowerInvariant());

            // Add the root url to the todo list
            urlsToCrawl.Enqueue(new Link {
                Url = crawlerSettings.Url
            });
            urlsSeen.Add(crawlerSettings.Url.ToLowerInvariant());

            Console.WriteLine("Crawling Url = " + crawlerSettings.Url + " Depth = " + crawlerSettings.Depth + " OutputDir = '" + crawlerSettings.OutputDir + "' UrlFilter = '" + crawlerSettings.UrlFilter + "'");

            // Crawl all urls on the 'todo' list
            while (urlsToCrawl.Count > 0)
            {
                Link currentUrl = urlsToCrawl.Dequeue();

                Console.Write(currentUrl.Url);

                // Crawl the Url using XHtmlKit
                XmlDocument xhtmlDoc;
                try
                {
                    xhtmlDoc = XHtmlLoader.LoadWebPageAsync(currentUrl.Url, xhtmlLoaderOptions).Result;
                    Console.WriteLine(", [OK]");
                }
                catch (Exception ex)
                {
                    // .Result wraps failures in an AggregateException; report the inner
                    // message when there is one, otherwise the exception's own message.
                    string reason = ex.InnerException != null ? ex.InnerException.Message : ex.Message;
                    Console.WriteLine(", [Error], " + reason);
                    continue;
                }

                // Get title from the XHtml document
                XmlNode title = xhtmlDoc.SelectSingleNode("//title");
                if (title != null)
                {
                    currentUrl.PageTitle = title.InnerText.Trim();
                }

                // Save the XHtml file to disk; a failed save is reported but does not stop the crawl
                try
                {
                    if (!string.IsNullOrWhiteSpace(crawlerSettings.OutputDir))
                    {
                        Uri    currentUri = new Uri(currentUrl.Url.ToLowerInvariant());
                        string fileName   = BuildCrawlFileName(currentUri);

                        // One sub-directory per host; CreateDirectory is a no-op if it exists.
                        // Path.Combine uses the platform's separator instead of a hard-coded backslash.
                        string outputDir = System.IO.Path.Combine(crawlerSettings.OutputDir, currentUri.Host);
                        System.IO.Directory.CreateDirectory(outputDir);

                        // Save file
                        xhtmlDoc.Save(System.IO.Path.Combine(outputDir, fileName));
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Error saving document: " + ex.Message);
                }

                // If we are at the Max Depth, we won't crawl deeper - so skip the link query entirely
                if (currentUrl.Depth >= crawlerSettings.Depth)
                {
                    continue;
                }

                // Add sub-links to the 'todo' list
                int numSubLinks = 0;
                foreach (XmlNode subLinkElem in xhtmlDoc.SelectNodes("//a"))
                {
                    // Don't add empty links
                    XmlAttribute href = subLinkElem.Attributes["href"];
                    if (href == null || string.IsNullOrWhiteSpace(href.InnerText))
                    {
                        continue;
                    }

                    // Get the sub-link and its case-insensitive key
                    string sublink    = href.InnerText;
                    string sublinkKey = sublink.ToLowerInvariant();

                    // Only absolute links can be loaded later. TryCreate rejects relative
                    // links without throwing (the Uri constructor would throw on them).
                    Uri subUri;
                    if (!Uri.IsWellFormedUriString(sublinkKey, UriKind.RelativeOrAbsolute) ||
                        !Uri.TryCreate(sublinkKey, UriKind.Absolute, out subUri))
                    {
                        continue;
                    }

                    // Don't add links that don't match the UrlFilter. The filter arrives
                    // lower-cased from the command line, so match it case-insensitively.
                    if (!string.IsNullOrWhiteSpace(crawlerSettings.UrlFilter) &&
                        sublink.IndexOf(crawlerSettings.UrlFilter, StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        continue;
                    }

                    // Don't add links that are already queued or crawled; Add returns false for those
                    if (!urlsSeen.Add(sublinkKey))
                    {
                        continue;
                    }

                    // Add the sub-link
                    urlsToCrawl.Enqueue(new Link {
                        Url = sublink, LinkText = subLinkElem.InnerText.Trim(), Depth = (currentUrl.Depth + 1)
                    });
                    numSubLinks++;
                }

                currentUrl.SubLinks = numSubLinks;

                // Todo - put the currentUrl metadata somewhere...
            }
        }

        /// <summary>
        /// Returns the trimmed value of the named attribute, or <paramref name="fallback"/>
        /// when the attribute is absent or blank.
        /// </summary>
        private static string ReadCrawlSetting(XmlElement settings, string name, string fallback)
        {
            XmlAttribute attribute = settings.Attributes[name];
            if (attribute == null || string.IsNullOrWhiteSpace(attribute.Value))
            {
                return fallback;
            }
            return attribute.Value.Trim();
        }

        /// <summary>
        /// Derives an .xml file name from a page's path and query: invalid file-name
        /// characters become dots, outer dots are trimmed, and "default" is used when nothing remains.
        /// </summary>
        private static string BuildCrawlFileName(Uri pageUri)
        {
            string fileName = pageUri.PathAndQuery.Trim();

            // Replace invalid characters
            foreach (char c in System.IO.Path.GetInvalidFileNameChars())
            {
                fileName = fileName.Replace(c.ToString(), ".");
            }
            fileName = fileName.Trim('.');

            // Set default file name
            if (string.IsNullOrWhiteSpace(fileName))
            {
                fileName = "default";
            }

            // Add xml extension
            return fileName + ".xml";
        }

C# (CSharp) XHtmlLoader Beispiele