예제 #1
0
        /// <summary>
        /// Parses the loaded HTML document for anchor links and appends each one to
        /// <c>parentURLData.childrenURLs</c>, classified as internal or external.
        /// </summary>
        /// <remarks>
        /// Every <c>&lt;a href="..."&gt;</c> value is resolved against
        /// <c>parentURLData.uriURL</c>, so relative links receive the parent's scheme and host.
        /// A link is internal when its host equals the parent's host, otherwise external.
        /// Hrefs that cannot be resolved into a URI are skipped instead of aborting the parse.
        /// </remarks>
        private void ParseHTML()
        {
            // SelectNodes yields null (not an empty collection) when no node matches,
            // so a page without any <a href> must be handled before iterating.
            var links = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
            if (links == null)
            {
                return;
            }

            foreach (HtmlNode link in links)
            {
                // Obtain the raw href value
                HtmlAttribute att = link.Attributes["href"];

                // Resolve against the parent URL (adds the host part for relative links).
                // A malformed href is skipped so one bad link does not stop the whole page.
                Uri crawledUrl;
                if (!Uri.TryCreate(parentURLData.uriURL, att.Value, out crawledUrl))
                {
                    continue;
                }
                // TODO: check item type (png, zip, etc.)

                // Decide whether the URL is internal or external relative to the parent.
                // Host names are not linguistic text, so compare ordinally and ignore case.
                bool sameHost = Uri.Compare(
                    parentURLData.uriURL,
                    crawledUrl,
                    UriComponents.Host,
                    UriFormat.SafeUnescaped,
                    StringComparison.OrdinalIgnoreCase) == 0;
                CrawledType type = sameHost ? CrawledType.internalURL : CrawledType.externalURL;

                // Add the child URL to the parent's list
                URLChildData child = new URLChildData(crawledUrl.AbsoluteUri, type, parentURLData.rootURL);
                parentURLData.childrenURLs.Add(child);
            }
        }
예제 #2
0
 /// <summary>
 /// Creates a child URL entry with the given classification.
 /// </summary>
 /// <param name="url">URL of the child; forwarded to the base class constructor.</param>
 /// <param name="type">Classification stored on this instance's <c>type</c> member.</param>
 /// <param name="rootURL">Root URL; forwarded to the base class constructor.</param>
 public URLChildData(string url, CrawledType type, string rootURL) // 2019103012
     : base(url, rootURL)
 {
     // The base constructor receives url/rootURL; only the classification is kept here.
     this.type = type; // 2019103006
 }