/// <summary>
/// Requests the given page over http, merges the scheduled page's data into the
/// crawled result, and logs the outcome.
/// </summary>
/// <param name="pageToCrawl">The page that is about to be requested.</param>
/// <returns>The crawled page containing the response details.</returns>
protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
{
    _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);

    // Both the async and sync variants are raised so subscribers of either
    // event style are notified.
    FirePageCrawlStartingEventAsync(pageToCrawl);
    FirePageCrawlStartingEvent(pageToCrawl);

    // Retried pages must honor the configured minimum delay before being re-requested.
    if (pageToCrawl.IsRetry)
    {
        WaitMinimumRetryDelay(pageToCrawl);
    }

    pageToCrawl.LastRequest = DateTime.Now;

    CrawledPage crawledPage = _pageRequester.MakeRequest(pageToCrawl.Uri, (x) => ShouldDownloadPageContentWrapper(x));

    // Capture the merged bag first: Mapper.Map below copies pageToCrawl's
    // properties onto crawledPage and would otherwise clobber its PageBag.
    dynamic combinedPageBag = this.CombinePageBags(pageToCrawl.PageBag, crawledPage.PageBag);

    // NOTE(review): Mapper.CreateMap runs on every crawl; AutoMapper maps
    // should be configured once at startup — consider hoisting this.
    Mapper.CreateMap<PageToCrawl, CrawledPage>();
    Mapper.Map(pageToCrawl, crawledPage);
    crawledPage.PageBag = combinedPageBag;

    if (crawledPage.HttpWebResponse == null)
    {
        _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}] Retry:[{2}]", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri, crawledPage.RetryCount);
    }
    else
    {
        _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}] Retry:[{3}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri, crawledPage.RetryCount);
    }

    return crawledPage;
}
/// <summary>
/// Requests the given page, copies the scheduled page's data onto the crawled
/// result, and logs the outcome of the request.
/// </summary>
/// <param name="pageToCrawl">The page that is about to be requested.</param>
/// <returns>The crawled page containing the response details.</returns>
protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
{
    FirePageCrawlStartingEventAsync(pageToCrawl);

    pageToCrawl.LastRequest = DateTime.Now;

    var result = _pageRequester.MakeRequest(pageToCrawl.Uri, ShouldDownloadPageContent);
    Map(pageToCrawl, result);

    _logger.LogInformation(
        "抓取完毕, 状态:[{0}] 地址:[{1}] 花费:[{2}] 重试:[{3}]",
        result.StatusCode,
        result.Uri.AbsoluteUri,
        result.Elapsed,
        result.RetryCount);

    return result;
}
/// <summary>
/// Attempts to download and parse the robots.txt file at the root of the given site.
/// </summary>
/// <param name="rootUri">The root uri of the site whose robots.txt should be fetched.</param>
/// <returns>
/// The parsed robots.txt content, or null when the file is missing, the request
/// failed, or a non-200 status was returned.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="rootUri"/> is null.</exception>
public IRobotsDotText Find(Uri rootUri)
{
    if (rootUri == null)
    {
        // nameof is refactor-safe; a string literal would silently drift on rename.
        throw new ArgumentNullException(nameof(rootUri));
    }

    Uri robotsUri = new Uri(rootUri, "/robots.txt");
    CrawledPage page = _pageRequester.MakeRequest(robotsUri);

    // Treat any transport failure or non-OK status as "site has no robots.txt".
    if (page == null || page.WebException != null || page.HttpWebResponse == null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        _logger.DebugFormat("Did not find robots.txt file at [{0}]", robotsUri);
        return null;
    }

    _logger.DebugFormat("Found robots.txt file at [{0}]", robotsUri);
    return new RobotsDotText(rootUri, page.Content.Text);
}
/// <summary>
/// Requests the given page over http, maps the scheduled page's data onto the
/// crawled result, and logs the outcome.
/// </summary>
/// <param name="pageToCrawl">The page that is about to be requested.</param>
/// <returns>The crawled page containing the response details.</returns>
protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
{
    _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);

    // Raise both event styles so sync and async subscribers are notified.
    FirePageCrawlStartingEventAsync(pageToCrawl);
    FirePageCrawlStartingEvent(pageToCrawl);

    var result = _httpRequester.MakeRequest(pageToCrawl.Uri, x => ShouldDownloadPageContentWrapper(x));

    // NOTE(review): CreateMap runs per call; AutoMapper configuration is
    // normally done once at startup.
    AutoMapper.Mapper.CreateMap<PageToCrawl, CrawledPage>();
    AutoMapper.Mapper.Map(pageToCrawl, result);

    var hasResponse = result.HttpWebResponse != null;
    if (hasResponse)
    {
        _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}]", Convert.ToInt32(result.HttpWebResponse.StatusCode), result.Uri.AbsoluteUri, result.ParentUri);
    }
    else
    {
        _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}]", result.Uri.AbsoluteUri, result.ParentUri);
    }

    return result;
}
/// <summary>
/// Crawls pages starting from <paramref name="startingUrl"/> until the scheduler
/// runs dry or <paramref name="maxUrlsToCrawl"/> pages have been processed.
/// </summary>
/// <param name="startingUrl">The first url to seed the scheduler with.</param>
/// <param name="maxUrlsToCrawl">Upper bound on the number of pages to crawl.</param>
public async Task Crawl(Uri startingUrl, int maxUrlsToCrawl)
{
    _scheduler.Add(startingUrl);

    var processed = 0;
    while (processed < maxUrlsToCrawl)
    {
        // An empty scheduler means the crawl is finished.
        if (_scheduler.Count == 0)
        {
            break;
        }

        var nextLink = _scheduler.GetNext();
        ProcessPage(await _pageRequester.MakeRequest(nextLink));
        processed++;
    }
}
// Verifies a real request to google.co.uk returns a non-null crawled page.
// NOTE(review): hits the live network — presumably intended as an integration
// test; confirm it is excluded from fast unit-test runs.
public void DownloadPage_Google_Downloads()
{
    var uri = new Uri("http://google.co.uk");

    var page = _requester.MakeRequest(uri);

    Assert.IsNotNull(page);
}