/// <summary>
/// Enqueues <paramref name="url"/> on <c>UrlsToCrawl</c> when <c>UrlToBeAdded</c>
/// accepts it; otherwise does nothing.
/// </summary>
/// <param name="url">Candidate page to schedule for crawling.</param>
public void AddUrl(Web.PageCrawlDetail url)
{
    // Guard clause: rejected URLs are dropped silently.
    if (!UrlToBeAdded(url))
    {
        return;
    }

    UrlsToCrawl.Enqueue(url);
}
/// <summary>
/// Decides whether a URL is eligible to be queued for crawling.
/// </summary>
/// <param name="url">
/// Candidate page; its <c>PageUri</c> is compared against the robots rules,
/// the pending queue and the completed list.
/// </param>
/// <returns>
/// <c>true</c> when the URL's authority equals that of <c>Robots.BastUri</c>, its
/// absolute path does not start with any entry of <c>Robots.DisallowedList</c>, and
/// its absolute URI is present in neither <c>UrlsToCrawl</c> nor <c>UrlsCompleted</c>;
/// otherwise <c>false</c>.
/// </returns>
public bool UrlToBeAdded(Web.PageCrawlDetail url)
{
    string absoluteUri = url.PageUri.AbsoluteUri;
    string absolutePath = url.PageUri.AbsolutePath;

    // Constant-time check first so off-domain links never trigger the collection scans.
    if (Robots.BastUri.Authority != url.PageUri.Authority)
    {
        return false;
    }

    // URL paths are not linguistic text, so compare ordinally rather than with the
    // current culture. Null/empty entries are skipped: an empty prefix matches every
    // path and would otherwise block the whole site.
    // NOTE(review): assumes DisallowedList holds plain path-prefix strings — confirm against the robots parser.
    bool isDisallowed = Robots.DisallowedList.Any(
        prefix => !string.IsNullOrEmpty(prefix) && absolutePath.StartsWith(prefix, StringComparison.Ordinal));
    if (isDisallowed)
    {
        return false;
    }

    // Any() stops at the first match instead of counting every element.
    if (UrlsToCrawl.Any(x => x.PageUri.AbsoluteUri == absoluteUri))
    {
        return false;
    }

    return !UrlsCompleted.Any(x => x.PageUri.AbsoluteUri == absoluteUri);
}
/// <summary>
/// Downloads one page and records the result on <paramref name="url"/>.
/// </summary>
/// <remarks>
/// After the download the method calls <c>url.LoadContent</c> with the body and status
/// code, calls <c>url.LoadUris</c>, passes <c>url.AllLinks</c> to <c>CrawlList.AddUrls</c>,
/// resets <c>url.AllLinks</c> to an empty list and appends <paramref name="url"/> to
/// <c>CrawlList.UrlsCompleted</c>.
/// </remarks>
/// <param name="url">Page to download; mutated in place.</param>
/// <returns>The response body read as a string.</returns>
private async Task<string> ProcessUrl(Web.PageCrawlDetail url)
{
    Console.WriteLine("url " + url);

    string content;

    // HttpResponseMessage is IDisposable; dispose it as soon as the body and the
    // status code have been copied out so the response is not left undisposed per page.
    using (var response = await client.GetAsync(url.PageUri.AbsoluteUri))
    {
        content = await response.Content.ReadAsStringAsync();
        url.LoadContent(content, response.StatusCode);
    }

    url.LoadUris();
    CrawlList.AddUrls(url.AllLinks);

    // The links have been handed to the crawl list; drop them from the page object.
    url.AllLinks = new List<Web.PageCrawlDetail>();

    CrawlList.UrlsCompleted.Add(url);
    return content;
}
/// <summary>
/// Collects the distinct link targets of every <c>&lt;a href&gt;</c> element in a document.
/// </summary>
/// <param name="doc">Parsed HTML document to scan.</param>
/// <param name="baseUri">Base URI passed to <c>GetUrifromHREF</c> together with each href value.</param>
/// <returns>
/// One <c>Web.PageCrawlDetail</c> per distinct absolute URI, in document order.
/// Empty (never null) when the document contains no matching anchors.
/// </returns>
public static List<Web.PageCrawlDetail> GetUris(HtmlAgilityPack.HtmlDocument doc, Uri baseUri)
{
    List<Web.PageCrawlDetail> uris = new List<Web.PageCrawlDetail>();

    // HtmlAgilityPack returns null (not an empty collection) when the XPath matches nothing.
    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return uris;
    }

    // Tracks absolute URIs already emitted so de-duplication is O(1) per link
    // instead of rescanning the result list for every anchor.
    HashSet<string> seen = new HashSet<string>();

    foreach (HtmlAgilityPack.HtmlNode link in anchors)
    {
        string hrefValue = link.GetAttributeValue("href", string.Empty);
        Uri uri = GetUrifromHREF(baseUri, hrefValue);

        // A null result means the href yielded no URI; Add returns false for duplicates.
        if (uri != null && seen.Add(uri.AbsoluteUri))
        {
            uris.Add(new Web.PageCrawlDetail(uri));
        }
    }

    return uris;
}
/// <summary>
/// Crawls starting from <paramref name="startUrl"/>, keeping fewer than
/// <c>maxConcurrentDownload</c> downloads in flight while <c>CrawlList</c> has more URLs.
/// </summary>
/// <param name="startUrl">First page to download.</param>
/// <returns><c>true</c> once no downloads are running and none were left to start.</returns>
/// <remarks>
/// A faulted download is re-thrown from this method when its task is awaited,
/// which ends the crawl.
/// </remarks>
public async Task<bool> Crawl(Web.PageCrawlDetail startUrl)
{
    runningTasks.Add(ProcessUrl(startUrl));

    while (runningTasks.Any())
    {
        var completedTask = await Task.WhenAny(runningTasks);
        runningTasks.Remove(completedTask);

        // Await the finished task so any exception it holds is observed here.
        await completedTask;

        // Throttle between scheduling rounds. Task.Delay pauses without blocking
        // the calling thread, unlike Thread.Sleep inside an async method.
        await Task.Delay(SleepTime);

        // Top up the in-flight set from the crawl queue.
        while (CrawlList.HasNext() && runningTasks.Count < maxConcurrentDownload)
        {
            var url = CrawlList.GetNext();
            runningTasks.Add(ProcessUrl(url));
        }
    }

    return true;
}