/// <summary>
/// Builds the window and fills its controls with the details of one crawled URL.
/// </summary>
/// <param name="parent">Crawl record whose title, URLs, timestamps and child URLs are shown.</param>
public CrawledWindow(URLParentData parent)
{
    InitializeComponent();

    // Identification fields.
    tb_title.Text = parent.title;
    tb_url.Text = parent.url;
    tb_rootURL.Text = parent.rootURL;

    // Timing fields: the crawl timestamp, then the span between the registered
    // and crawled timestamps expressed in seconds.
    tb_crawledTime.Text = parent.urlCrawledTime.ToString();
    long elapsedTicks = parent.urlCrawledTime.Ticks - parent.urlRegisteredTime.Ticks;
    TimeSpan elapsed = TimeSpan.FromTicks(elapsedTicks);
    tb_duration.Text = elapsed.TotalSeconds + " s";

    // One list entry per child URL found on the page.
    foreach (URLChildData childEntry in parent.childrenURLs)
    {
        lb_children.Items.Add(childEntry.url);
    }
}
// Creates a crawler bound to the given parent URL record and instantiates the
// HtmlWeb / HtmlDocument objects it keeps in its fields.
// 2019103002
public URLCrawler(URLParentData parentURLData)
{
    htmlDoc = new HtmlDocument();
    htmlWeb = new HtmlWeb();
    this.parentURLData = parentURLData;
}
/// <summary>
/// Starts a background crawl for one queued URL and registers the resulting task.
/// </summary>
/// <remarks>
/// The URL is moved from <c>dToBeCrawledURLs</c> to <c>dCrawlingURLs</c>, wrapped in a
/// <c>URLParentData</c> and crawled on a task. A continuation then does the bookkeeping:
/// when the parent's state is <c>Crawled</c> it is filed under <c>dCrawledURLs</c> and its
/// children are queued; otherwise the URL is re-queued, or added to <c>disabledURLs</c>
/// once its error counter reaches 3.
/// </remarks>
/// <param name="childToBecomeParent">Queued URL entry that is promoted to a parent and crawled.</param>
private void startSingleCrawler(URLChildData childToBecomeParent)
{
    // transfer url from toBeCrawled to crawling
    lock (dToBeCrawledURLs) // 2019103027
    {
        dToBeCrawledURLs.Remove(childToBecomeParent.url);
    }

    lock (dCrawlingURLs) // 2019103027
    {
        dCrawlingURLs.Add(childToBecomeParent.url, childToBecomeParent);
    }

    URLParentData parent = new URLParentData(childToBecomeParent);
    URLCrawler crawler = new URLCrawler(parent);

    // create a new task for crawler operation; the continuation runs afterwards
    // and moves the URL to its next collection depending on the outcome
    var urlCrawlerTask = Task.Factory.StartNew(() =>
    {
        crawler.start();
    }).ContinueWith(taskInfo =>
    {
        if (parent.state == CrawlerState.Crawled)
        {
            // transfer url from crawling to crawled
            lock (dCrawlingURLs) // 2019103027
            {
                dCrawlingURLs.Remove(childToBecomeParent.url);
            }

            lock (dCrawledURLs) // 2019103027
            {
                dCrawledURLs.Add(parent.url, parent);
            }

            // loops over each child URL, checks if it exists and adds it to toBeCrawled
            foreach (URLChildData child in parent.childrenURLs)
            {
                // 2019103026
                // NOTE(review): First() throws InvalidOperationException when no entry in
                // lRootURLs matches child.rootURL — confirm roots cannot disappear mid-crawl.
                bool externalURLactivated = lRootURLs
                    .Where(rootData => rootData.rootURL == child.rootURL)
                    .Select(rootData => rootData.externalActivated)
                    .First();

                // discards external URLs if externalURL checkbox is not activated
                if (child.type == CrawledType.externalURL && !externalURLactivated)
                {
                    continue;
                }

                if (!URLexist(child.url))
                {
                    lock (dToBeCrawledURLs)
                    {
                        // Re-check under the lock: URLexist ran before the lock was taken, so
                        // another continuation may have queued the same URL in between, and
                        // adding a duplicate key to a dictionary throws.
                        if (!dToBeCrawledURLs.ContainsKey(child.url))
                        {
                            dToBeCrawledURLs.Add(child.url, child);
                        }
                    }
                }
            }
        }
        else
        {
            // handle any state other than Crawled (e.g. CrawlerState.Failed):
            // carry the error count back to the queue entry
            childToBecomeParent.errorCounter = parent.errorCounter;

            if (childToBecomeParent.errorCounter >= 3)
            {
                // too many errors: record the time, reset the counter and park the URL
                childToBecomeParent.urlDisabledTime = DateTime.Now;
                childToBecomeParent.errorCounter = 0;

                lock (disabledURLs)
                {
                    disabledURLs.Add(childToBecomeParent);
                }
            }
            else
            {
                // put the url back into toBeCrawled so it is retried later
                lock (dToBeCrawledURLs)
                {
                    dToBeCrawledURLs.Add(childToBecomeParent.url, childToBecomeParent);
                }
            }

            // remove url from crawling list
            lock (dCrawlingURLs)
            {
                dCrawlingURLs.Remove(childToBecomeParent.url);
            }
        }
    });

    // register (root URL, task, URL) so the running task can be found later
    Tuple<string, Task, string> newTask = new Tuple<string, Task, string>(
        childToBecomeParent.rootURL,
        urlCrawlerTask,
        childToBecomeParent.url);
    tl_activelyRunningTasks.Add(newTask);
}