// Registers a new root URL to start crawling from. If the URL is already
// known anywhere in the crawl pipeline, the call is a no-op.
//   url                  - the root URL to register
//   max_thread_number    - maximum number of crawler threads for this root
//   externalURLactivated - whether links leaving the root's host are followed
public void newRootURL(string url, int max_thread_number, bool externalURLactivated)
{
    if (URLexist(url))
    {
        return; // skip the root url since it already exists
    }

    lRootURLs.Add(new URLRootData(url, max_thread_number, externalURLactivated));

    URLChildData newRootData = new URLChildData(url, CrawledType.internalURL, url);

    // The URLexist check above runs outside this lock, so a concurrent
    // registration of the same url could otherwise make Add throw
    // ArgumentException; re-check the key while holding the lock. // 2019103027
    lock (dToBeCrawledURLs)
    {
        if (!dToBeCrawledURLs.ContainsKey(newRootData.url))
        {
            dToBeCrawledURLs.Add(newRootData.url, newRootData);
        }
    }
}
// Function that parses the loaded HTML document, resolving every <a href>
// link against the parent URL and recording it as an internal or external
// child of parentURLData.
private void ParseHTML()
{
    // SelectNodes returns null (not an empty collection) when no node
    // matches; guard it so link-free pages don't throw NullReferenceException.
    var anchors = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return;
    }

    foreach (HtmlNode link in anchors)
    {
        // Obtain the raw href value of the anchor
        HtmlAttribute att = link.Attributes["href"];

        // Adds host part (for the internal links). Use TryCreate instead of
        // the Uri constructor so malformed hrefs ("javascript:void(0)",
        // stray fragments, garbage) are skipped rather than throwing
        // UriFormatException and aborting the whole parse.
        if (!Uri.TryCreate(parentURLData.uriURL, att.Value, out Uri crawled_url))
        {
            continue;
        }

        // TODO: check item type (png, zip, vs vs.)

        // Decide if the url is internal or external for the baseURL.
        // Host names are case-insensitive, non-linguistic identifiers
        // (RFC 3986), so compare them ordinally and case-insensitively
        // rather than with the current culture.
        CrawledType type = CrawledType.externalURL;
        if (Uri.Compare(parentURLData.uriURL, crawled_url,
                        UriComponents.Host, UriFormat.SafeUnescaped,
                        StringComparison.OrdinalIgnoreCase) == 0)
        {
            type = CrawledType.internalURL;
        }

        // add the child url to the parent's list
        URLChildData child = new URLChildData(crawled_url.AbsoluteUri, type, parentURLData.rootURL);
        parentURLData.childrenURLs.Add(child);
    }
}
// Constructor: promotes a crawled child entry into a parent record.
// Carries over the child's accumulated error count and starts with an
// empty list of discovered children.
public URLParentData(URLChildData child)
    : base(child.url, child.rootURL) // 2019103012
{
    this.errorCounter = child.errorCounter;
    this.childrenURLs = new List<URLChildData>();
}
// Moves a pending URL from the to-be-crawled dictionary into the crawling
// dictionary and launches an asynchronous crawler task for it. The task's
// continuation routes the URL onward based on the crawl outcome:
//   Crawled -> dCrawledURLs, and its discovered children are queued;
//   otherwise -> retried via dToBeCrawledURLs, or parked in disabledURLs
//   after 3 failures.
// NOTE(review): on failure the url is re-added to dToBeCrawledURLs BEFORE
// being removed from dCrawlingURLs, so it briefly exists in both — confirm
// URLexist and the scheduler tolerate that window.
private void startSingleCrawler(URLChildData childToBecomeParent)
{
    // transfer url from toBeCrawled to crawling
    lock (dToBeCrawledURLs) // 2019103027
        dToBeCrawledURLs.Remove(childToBecomeParent.url);
    lock (dCrawlingURLs) // 2019103027
        dCrawlingURLs.Add(childToBecomeParent.url, childToBecomeParent);

    // Promote the child entry to a parent record and build its crawler.
    URLParentData parent = new URLParentData(childToBecomeParent);
    URLCrawler crawler = new URLCrawler(parent);

    // create a new task for crawler operation; the continuation below runs
    // after crawler.start() finishes and inspects the resulting state.
    var urlCrawlerTask = Task.Factory.StartNew(() =>
    {
        crawler.start();
    }).ContinueWith(taskInfo =>
    {
        if (parent.state == CrawlerState.Crawled)
        {
            // transfer url from crawling hs to crawled hs
            lock (dCrawlingURLs) // 2019103027
                dCrawlingURLs.Remove(childToBecomeParent.url);
            lock (dCrawledURLs) // 2019103027
                dCrawledURLs.Add(parent.url, parent);

            // loops over each child URL, checks if exists and adds it to tobecrawled hs
            foreach (URLChildData child in parent.childrenURLs)
            {
                // Look up whether this child's root has external-link
                // following enabled. NOTE(review): First() throws
                // InvalidOperationException if the root entry is absent —
                // verify roots are never removed while tasks run. // 2019103026
                bool externalURLactivated = lRootURLs.Where(rootData => rootData.rootURL == child.rootURL)
                                                     .Select(rootData => rootData.externalActivated)
                                                     .First();

                // discards external URLs if externalURL checkbox is not activated
                if (child.type == CrawledType.externalURL && !externalURLactivated)
                {
                    continue;
                }

                // Only queue URLs not yet known in any crawl state.
                if (!URLexist(child.url))
                {
                    lock (dToBeCrawledURLs)
                        dToBeCrawledURLs.Add(child.url, child);
                }
            }
        }
        else
        {
            // handle CrawlerState.Failed: copy the error count back onto the
            // reusable child record.
            childToBecomeParent.errorCounter = parent.errorCounter;

            if (childToBecomeParent.errorCounter >= 3)
            {
                // Three failures: park the url in the disabled list with a
                // timestamp and reset the counter for a later retry cycle.
                childToBecomeParent.urlDisabledTime = DateTime.Now;
                childToBecomeParent.errorCounter = 0;
                lock (disabledURLs)
                    disabledURLs.Add(childToBecomeParent);
            }
            else
            {
                // put the url back into toBeCrawled so it is retried
                lock (dToBeCrawledURLs)
                    dToBeCrawledURLs.Add(childToBecomeParent.url, childToBecomeParent);
            }

            // remove url from crawling list
            lock (dCrawlingURLs)
                dCrawlingURLs.Remove(childToBecomeParent.url);
        }
    });

    // Track the running task alongside its root url and own url so it can
    // be monitored elsewhere.
    Tuple<string, Task, string> newTask = new Tuple<string, Task, string>(childToBecomeParent.rootURL, urlCrawlerTask,
        childToBecomeParent.url);
    tl_activelyRunningTasks.Add(newTask);
}