/// <summary>
/// Event args for the PageCrawlStarting event; carries the page that is about to be crawled.
/// </summary>
/// <param name="crawlContext">The shared crawl context, forwarded to the base event args.</param>
/// <param name="pageToCrawl">The page about to be crawled; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when pageToCrawl is null.</exception>
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
    : base(crawlContext)
{
    if (pageToCrawl == null)
        throw new ArgumentNullException("pageToCrawl");

    PageToCrawl = pageToCrawl;
}
/// <summary>
/// Event args for the PageCrawlDisallowed event; carries the page and the reason it was rejected.
/// </summary>
/// <param name="crawlContext">The shared crawl context, forwarded to the base event args.</param>
/// <param name="pageToCrawl">The page that was disallowed, forwarded to the base event args.</param>
/// <param name="disallowedReason">Human-readable reason; must contain non-whitespace text.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when disallowedReason is null, empty, or whitespace — the existing contract uses
/// ArgumentNullException for all three cases, so that exception type is preserved.
/// </exception>
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
    : base(crawlContext, pageToCrawl)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
/// <summary>
/// Decides whether the given page should be crawled based on the crawl configuration and
/// the state accumulated so far in the crawl context (depth, page counts, per-domain counts).
/// </summary>
/// <param name="pageToCrawl">The candidate page; a null value yields a disallow decision rather than an exception.</param>
/// <param name="crawlContext">The shared crawl state and configuration; null likewise yields a disallow decision.</param>
/// <returns>A CrawlDecision whose Allow flag and Reason describe the outcome.</returns>
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    //Ordinal comparison: Uri.Scheme is normalized lower-case ASCII, and the previous
    //culture-sensitive StartsWith could misbehave under certain locales (CA1310).
    if (!pageToCrawl.Uri.Scheme.StartsWith("http", StringComparison.Ordinal))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    if (!crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled && crawlContext.CrawledUrls.ContainsKey(pageToCrawl.Uri.AbsoluteUri))
        return new CrawlDecision { Allow = false, Reason = "Link already crawled" };

    if (crawlContext.CrawledUrls.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        //Signal the crawler to stop scheduling new work once the global page budget is spent
        crawlContext.IsCrawlStopRequested = true;
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    //Per-domain limit. The original extra "count > 0" guard was redundant: when the count
    //has reached a positive MaxPagesToCrawlPerDomain it is necessarily greater than zero.
    int pagesCrawledInThisDomain = 0;
    if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0
        && crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain)
        && pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
/// <summary>
/// Determines whether the page should be crawled: the built-in decision maker is consulted
/// first, and only if it allows the page is the optional user-supplied delegate asked.
/// An allowed page is registered in the crawl context; a disallowed page is logged and the
/// disallowed events (async then sync) are fired.
/// </summary>
/// <returns>True when the page will be crawled.</returns>
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    CrawlDecision decision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);

    //The user delegate is a second gate and only runs when the built-in decision allowed the page
    if (decision.Allow && _shouldCrawlPageDecisionMaker != null)
        decision = _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext);

    if (decision.Allow)
    {
        AddPageToContext(pageToCrawl);
    }
    else
    {
        _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, decision.Reason);
        FirePageCrawlDisallowedEventAsync(pageToCrawl, decision.Reason);
        FirePageCrawlDisallowedEvent(pageToCrawl, decision.Reason);
    }

    return decision.Allow;
}
/// <summary>
/// Runs the full processing pipeline for a single page: crawl decision, http request,
/// page-size check, completion events, and link scheduling. Any exception thrown along
/// the way is recorded on the crawl result and hard-stops the crawl.
/// </summary>
protected virtual void ProcessPage(PageToCrawl pageToCrawl)
{
    //A null page cannot throw, so this guard is safe outside the try
    if (pageToCrawl == null)
        return;

    try
    {
        if (!ShouldCrawlPage(pageToCrawl))
            return;

        CrawledPage crawledPage = CrawlThePage(pageToCrawl);
        if (PageSizeIsAboveMax(crawledPage))
            return;

        //Completion events fire before link scheduling, async variant first
        FirePageCrawlCompletedEventAsync(crawledPage);
        FirePageCrawlCompletedEvent(crawledPage);

        if (ShouldCrawlPageLinks(crawledPage))
            SchedulePageLinks(crawledPage);
    }
    catch (Exception e)
    {
        //Any failure while processing a page is treated as fatal for the whole crawl
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
        _logger.Fatal(e);
        _crawlContext.IsCrawlHardStopRequested = true;
    }
}
/// <summary>
/// Raises the PageCrawlStartingAsync event, invoking each subscriber's delegate
/// asynchronously so a slow handler cannot block the crawl thread.
/// </summary>
protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
{
    //Snapshot the event field so a concurrent unsubscribe cannot null it between check and call
    EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
    if (threadSafeEvent == null)
        return;

    //Fire each subscriber's delegate async.
    //NOTE(review): Delegate.BeginInvoke is only supported on .NET Framework (it throws
    //PlatformNotSupportedException on .NET Core) — confirm the project's target framework.
    foreach (EventHandler<PageCrawlStartingArgs> subscriber in threadSafeEvent.GetInvocationList())
        subscriber.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
}
/// <summary>
/// Raises the PageCrawlStarting event synchronously. A subscriber exception is logged and
/// swallowed so that one faulty handler cannot abort the crawl.
/// </summary>
protected virtual void FirePageCrawlStartingEvent(PageToCrawl pageToCrawl)
{
    try
    {
        //Snapshot the event field so a concurrent unsubscribe cannot null it between check and call
        EventHandler<PageCrawlStartingArgs> handlerSnapshot = PageCrawlStarting;
        if (handlerSnapshot != null)
            handlerSnapshot(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlStarting event for url:" + pageToCrawl.Uri.AbsoluteUri);
        _logger.Error(e);
    }
}
/// <summary>
/// Crawls a single page: fires the "starting" events (async then sync), makes the HTTP
/// request, copies the PageToCrawl fields onto the resulting CrawledPage via AutoMapper,
/// logs the outcome (status code when a response exists, [NA] otherwise), and returns it.
/// </summary>
//NOTE(review): AutoMapper.Mapper.CreateMap is executed on EVERY page crawl. AutoMapper map
//configuration is intended to be performed once at startup; repeating it here is wasted work
//on a hot path — consider moving the CreateMap call to the crawler's constructor or a static
//initializer (behavior-preserving, but outside this method's boundary, so not changed here).
protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl) { _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri); FirePageCrawlStartingEventAsync(pageToCrawl); FirePageCrawlStartingEvent(pageToCrawl); CrawledPage crawledPage = _httpRequester.MakeRequest(pageToCrawl.Uri, (x) => ShouldDownloadPageContentWrapper(x)); AutoMapper.Mapper.CreateMap<PageToCrawl, CrawledPage>(); AutoMapper.Mapper.Map(pageToCrawl, crawledPage); if (crawledPage.HttpWebResponse == null) _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}]", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri); else _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri); return crawledPage; }
/// <summary>
/// Records the page in the crawl context: marks its absolute uri as crawled and increments
/// the per-domain (Uri.Authority) crawl counter used by the per-domain page limit.
/// </summary>
//NOTE(review): the TryAdd calls suggest CrawledUrls and CrawlCountByDomain are
//ConcurrentDictionary instances — TODO confirm. If so, the read-then-write below could be a
//single atomic AddOrUpdate(authority, 1, (k, v) => v + 1) with the lock removed — but only
//after verifying no other code synchronizes on the CrawlCountByDomain instance. Locking a
//publicly reachable collection object is fragile in itself (any other locker can contend).
protected virtual void AddPageToContext(PageToCrawl pageToCrawl) { _crawlContext.CrawledUrls.TryAdd(pageToCrawl.Uri.AbsoluteUri, 0); int domainCount = 0; lock (_crawlContext.CrawlCountByDomain) { if (_crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out domainCount)) _crawlContext.CrawlCountByDomain[pageToCrawl.Uri.Authority] = domainCount + 1; else _crawlContext.CrawlCountByDomain.TryAdd(pageToCrawl.Uri.Authority, 1); } }
/// <summary>
/// Schedules the param to be crawled in a FIFO fashion
/// </summary>
/// <param name="page">The page to enqueue; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when page is null.</exception>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    //When uri recrawling is allowed every page is queued unconditionally; otherwise the page
    //is queued only the first time its absolute uri is seen (TryAdd returns false for a
    //duplicate, and short-circuiting skips the TryAdd entirely in the recrawl case — exactly
    //as the original two-branch form behaved).
    if (_allowUriRecrawling || _scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null))
        _pagesToCrawl.Enqueue(page);
}
/// <summary>
/// Applies robots.txt rules on top of the base crawl decision: a url disallowed by the
/// site's robots.txt file is rejected (logging the reason and firing the disallowed events)
/// before the base class is even consulted.
/// </summary>
/// <returns>True when the page is allowed by robots.txt (or none was loaded) AND by the base decision.</returns>
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    //No robots.txt loaded means everything is allowed at this layer
    if (_robotsDotText != null
        && !_robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString))
    {
        string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);

        //Fix: use Debug(message), not DebugFormat(message). DebugFormat re-runs string.Format
        //on the already-formatted message, which throws FormatException for any url containing
        //a literal '{' or '}'.
        _logger.Debug(message);

        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);
        return false;
    }

    //At this point the robots check passed, so the original "allowedByRobots && base(...)"
    //reduces to the base call alone
    return base.ShouldCrawlPage(pageToCrawl);
}