/// <summary>
/// Parses the HTML document held in <c>htmlDoc</c> for anchor links and records each
/// one as a child of <c>parentURLData</c>, classified as internal or external.
/// </summary>
/// <remarks>
/// A link is internal when its host matches the host of <c>parentURLData.uriURL</c>.
/// Relative links are resolved against <c>parentURLData.uriURL</c>. Links whose
/// <c>href</c> cannot be turned into a valid URI are skipped rather than aborting
/// the whole parse.
/// </remarks>
private void ParseHTML()
{
    // SelectNodes returns null (not an empty collection) when nothing matches,
    // so a page without any <a href="..."> must be handled explicitly.
    var anchors = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return;
    }

    // extracting all links
    foreach (HtmlNode link in anchors)
    {
        // Obtain the URL
        HtmlAttribute att = link.Attributes["href"];

        // Adds host part (for the internal links). TryCreate is used instead of the
        // Uri constructor so that a single malformed href does not throw and stop
        // the processing of the remaining links.
        Uri crawled_url;
        if (!Uri.TryCreate(parentURLData.uriURL, att.Value, out crawled_url))
        {
            continue;
        }

        // TODO: check item type (png, zip, vs vs.)

        // Decide if the url is internal or external for the baseURL.
        // Host names are case-insensitive, non-linguistic identifiers, so compare
        // them ordinally instead of with culture-sensitive rules.
        CrawledType type = CrawledType.externalURL;
        if (Uri.Compare(parentURLData.uriURL, crawled_url, UriComponents.Host,
                        UriFormat.SafeUnescaped, StringComparison.OrdinalIgnoreCase) == 0)
        {
            type = CrawledType.internalURL;
        }

        // add the child url to the parent's list
        URLChildData child = new URLChildData(crawled_url.AbsoluteUri, type, parentURLData.rootURL);
        parentURLData.childrenURLs.Add(child);
    }
}
/// <summary>
/// Creates a child URL entry: forwards <paramref name="url"/> and
/// <paramref name="rootURL"/> to the base constructor and stores the link's
/// <paramref name="type"/>.
/// </summary>
/// <param name="url">URL string passed through to the base constructor.</param>
/// <param name="type">Classification of the link stored on this instance.</param>
/// <param name="rootURL">Root URL string passed through to the base constructor.</param>
public URLChildData(string url, CrawledType type, string rootURL)
    : base(url, rootURL) // 2019103012
    => this.type = type; // 2019103006